Internal change

PiperOrigin-RevId: 329754787

Internal change
PiperOrigin-RevId: 329754787
cc748b2a · Abdullah Rashwan · A. Unique TensorFlower · 2f788e1d · cc748b2a · cc748b2a
Commit cc748b2a authored Sep 02, 2020 by Abdullah Rashwan Committed by A. Unique TensorFlower Sep 02, 2020
20 changed files
--- a/official/vision/beta/modeling/layers/box_matcher.py
+++ b/official/vision/beta/modeling/layers/box_matcher.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Box matcher."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.beta.ops import box_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BoxMatcher(tf.keras.layers.Layer):
+  """Match boxes with groundtruth boxes."""
+
+  def __init__(self,
+               foreground_iou_threshold=0.5,
+               background_iou_high_threshold=0.5,
+               background_iou_low_threshold=0,
+               **kwargs):
+    """Initializes a box matcher.
+
+    Args:
+      foreground_iou_threshold: float, represent the IoU threshold for a box to
+        be considered as positive (if >= `foreground_iou_threshold`).
+      background_iou_high_threshold: float, represent the IoU threshold for a
+        box to be considered as negative (if overlap in
+        [`background_iou_low_threshold`, `background_iou_high_threshold`]).
+      background_iou_low_threshold: float, represent the IoU threshold for a box
+        to be considered as negative (if overlap in
+        [`background_iou_low_threshold`, `background_iou_high_threshold`])
+      **kwargs: other key word arguments passed to Layer.
+    """
+    self._config_dict = {
+        'foreground_iou_threshold': foreground_iou_threshold,
+        'background_iou_high_threshold': background_iou_high_threshold,
+        'background_iou_low_threshold': background_iou_low_threshold,
+    }
+    super(BoxMatcher, self).__init__(**kwargs)
+
+  def call(self, boxes, gt_boxes, gt_classes):
+    """Match boxes to groundtruth boxes.
+
+    Given the proposal boxes and the groundtruth boxes and classes, perform the
+    groundtruth matching by taking the argmax of the IoU between boxes and
+    groundtruth boxes.
+
+    Args:
+      boxes: a tensor of shape of [batch_size, N, 4] representing the box
+        coordianates to be matched to groundtruth boxes.
+      gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
+        the groundtruth box coordinates. It is padded with -1s to indicate the
+        invalid boxes.
+      gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
+        classes. It is padded with -1s to indicate the invalid classes.
+
+    Returns:
+      matched_gt_boxes: a tensor of shape of [batch, N, 4], representing
+        the matched groundtruth box coordinates for each input box. The box is
+        considered to match to a groundtruth box only if the IoU overlap is
+        greater than `foreground_iou_threshold`. If the box is a negative match,
+        or does not overlap with any groundtruth boxes, the matched boxes will
+        be set to all 0s.
+      matched_gt_classes: a tensor of shape of [batch, N], representing
+        the matched groundtruth classes for each input box. If the box is a
+        negative match or does not overlap with any groundtruth boxes, the
+        matched classes of it will be set to 0, which corresponds to the
+        background class.
+      matched_gt_indices: a tensor of shape of [batch, N], representing the
+        indices of the matched groundtruth boxes in the original gt_boxes
+        tensor. If the box is a negative match or does not overlap with any
+        groundtruth boxes, the index of the matched groundtruth will be set to
+        -1.
+      positive_matches: a bool tensor of shape of [batch, N], representing
+        whether each box is a positive matches or not. A positive match is the
+        case where IoU of a box with any groundtruth box is greater than
+        `foreground_iou_threshold`.
+      negative_matches: a bool tensor of shape of [batch, N], representing
+        whether each box is a negative matches or not. A negative match is the
+        case where IoU of a box with any groundtruth box is greater than
+        `background_iou_low_threshold` and less than
+        `background_iou_low_threshold`.
+      ignored_matches: a bool tensor of shape of [batch, N], representing
+        whether each box is an ignored matches or not. An ignored matches is the
+        match that is neither positive or negative.
+    """
+    matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou, _ = (
+        box_ops.box_matching(boxes, gt_boxes, gt_classes))
+
+    positive_matches = tf.greater(
+        matched_iou, self._config_dict['foreground_iou_threshold'])
+    negative_matches = tf.logical_and(
+        tf.greater_equal(
+            matched_iou, self._config_dict['background_iou_low_threshold']),
+        tf.less(
+            matched_iou, self._config_dict['background_iou_high_threshold']))
+    ignored_matches = tf.logical_and(
+        tf.less(matched_iou, 0.0),
+        tf.greater_equal(
+            matched_iou, self._config_dict['background_iou_high_threshold']))
+    ignored_matches = tf.logical_and(
+        ignored_matches,
+        tf.less(
+            matched_iou, self._config_dict['foreground_iou_threshold']))
+
+    background_indicator = tf.logical_or(negative_matches, ignored_matches)
+
+    # re-assign negatively matched boxes to the background class.
+    matched_gt_boxes = tf.where(
+        tf.tile(tf.expand_dims(background_indicator, -1), [1, 1, 4]),
+        tf.zeros_like(matched_gt_boxes),
+        matched_gt_boxes)
+    matched_gt_classes = tf.where(
+        background_indicator,
+        tf.zeros_like(matched_gt_classes),
+        matched_gt_classes)
+    matched_gt_indices = tf.where(
+        background_indicator,
+        -tf.ones_like(matched_gt_indices),
+        matched_gt_indices)
+
+    return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
+            positive_matches, negative_matches, ignored_matches)
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/beta/modeling/layers/box_matcher_test.py
+++ b/official/vision/beta/modeling/layers/box_matcher_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for box_matcher.py."""
+
+# Import libraries
+import numpy as np
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import box_matcher
+
+
+class BoxMatcherTest(tf.test.TestCase):
+
+  def test_box_matcher(self):
+    boxes_np = np.array(
+        [[
+            [0, 0, 1, 1],
+            [5, 0, 10, 5],
+        ]])
+    boxes = tf.constant(boxes_np, dtype=tf.float32)
+
+    gt_boxes_np = np.array(
+        [[
+            [0, 0, 5, 5],
+            [0, 5, 5, 10],
+            [5, 0, 10, 5],
+            [5, 5, 10, 10],
+        ]])
+    gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
+    gt_classes_np = np.array([[2, 10, 3, -1]])
+    gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
+
+    fg_threshold = 0.5
+    bg_thresh_hi = 0.2
+    bg_thresh_lo = 0.0
+
+    matcher = box_matcher.BoxMatcher(fg_threshold, bg_thresh_hi, bg_thresh_lo)
+
+    # Runs on TPU.
+    strategy = tf.distribute.experimental.TPUStrategy()
+    with strategy.scope():
+      (matched_gt_boxes_tpu, matched_gt_classes_tpu, matched_gt_indices_tpu,
+       positive_matches_tpu, negative_matches_tpu, ignored_matches_tpu) = (
+           matcher(boxes, gt_boxes, gt_classes))
+
+    # Runs on CPU.
+    (matched_gt_boxes_cpu, matched_gt_classes_cpu, matched_gt_indices_cpu,
+     positive_matches_cpu, negative_matches_cpu, ignored_matches_cpu) = (
+         matcher(boxes, gt_boxes, gt_classes))
+
+    # correctness
+    self.assertNDArrayNear(
+        matched_gt_boxes_cpu.numpy(),
+        [[[0, 0, 0, 0], [5, 0, 10, 5]]], 1e-4)
+    self.assertAllEqual(
+        matched_gt_classes_cpu.numpy(), [[0, 3]])
+    self.assertAllEqual(
+        matched_gt_indices_cpu.numpy(), [[-1, 2]])
+    self.assertAllEqual(
+        positive_matches_cpu.numpy(), [[False, True]])
+    self.assertAllEqual(
+        negative_matches_cpu.numpy(), [[True, False]])
+    self.assertAllEqual(
+        ignored_matches_cpu.numpy(), [[False, False]])
+
+    # consistency.
+    self.assertNDArrayNear(
+        matched_gt_boxes_cpu.numpy(), matched_gt_boxes_tpu.numpy(), 1e-4)
+    self.assertAllEqual(
+        matched_gt_classes_cpu.numpy(), matched_gt_classes_tpu.numpy())
+    self.assertAllEqual(
+        matched_gt_indices_cpu.numpy(), matched_gt_indices_tpu.numpy())
+    self.assertAllEqual(
+        positive_matches_cpu.numpy(), positive_matches_tpu.numpy())
+    self.assertAllEqual(
+        negative_matches_cpu.numpy(), negative_matches_tpu.numpy())
+    self.assertAllEqual(
+        ignored_matches_cpu.numpy(), ignored_matches_tpu.numpy())
+
+  def test_serialize_deserialize(self):
+    kwargs = dict(
+        foreground_iou_threshold=0.5,
+        background_iou_high_threshold=0.5,
+        background_iou_low_threshold=0.5,
+    )
+    matcher = box_matcher.BoxMatcher(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(matcher.get_config(), expected_config)
+
+    new_matcher = box_matcher.BoxMatcher.from_config(matcher.get_config())
+
+    self.assertAllEqual(matcher.get_config(), new_matcher.get_config())
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/modeling/layers/box_sampler.py
+++ b/official/vision/beta/modeling/layers/box_sampler.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Box sampler."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.beta.ops import sampling_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BoxSampler(tf.keras.layers.Layer):
+  """Sample positive and negative boxes."""
+
+  def __init__(self,
+               num_samples=512,
+               foreground_fraction=0.25,
+               **kwargs):
+    """Initializes a ROI sampler.
+
+    Args:
+      num_samples: int, the number of sampled boxes per image.
+      foreground_fraction: float in [0, 1], what percentage of boxes should be
+        sampled from the positive examples.
+      **kwargs: other key word arguments passed to Layer.
+    """
+    self._config_dict = {
+        'num_samples': num_samples,
+        'foreground_fraction': foreground_fraction,
+    }
+    super(BoxSampler, self).__init__(**kwargs)
+
+  def call(self, positive_matches, negative_matches, ignored_matches):
+    """Sample and select positive and negative instances.
+
+    Args:
+      positive_matches: a `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance
+        corresponds to a positive example.
+      negative_matches: a `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance
+        corresponds to a negative example.
+      ignored_matches: a `bool` tensor of shape of [batch, N] where N is the
+        number of instances. For each element, `True` means the instance
+        should be ignored.
+
+    Returns:
+      selected_indices: a tensor of shape of [batch_size, K], storing the
+        indices of the sampled examples, where K is `num_samples`.
+    """
+    sample_candidates = tf.logical_and(
+        tf.logical_or(positive_matches, negative_matches),
+        tf.logical_not(ignored_matches))
+
+    sampler = sampling_ops.BalancedPositiveNegativeSampler(
+        positive_fraction=self._config_dict['foreground_fraction'],
+        is_static=True)
+
+    batch_size = sample_candidates.shape[0]
+    sampled_indicators = []
+    for i in range(batch_size):
+      sampled_indicator = sampler.subsample(
+          sample_candidates[i],
+          self._config_dict['num_samples'],
+          positive_matches[i])
+      sampled_indicators.append(sampled_indicator)
+    sampled_indicators = tf.stack(sampled_indicators)
+    _, selected_indices = tf.nn.top_k(
+        tf.cast(sampled_indicators, dtype=tf.int32),
+        k=self._config_dict['num_samples'],
+        sorted=True)
+
+    return selected_indices
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/beta/modeling/layers/box_sampler_test.py
+++ b/official/vision/beta/modeling/layers/box_sampler_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for box_sampler.py."""
+
+# Import libraries
+import numpy as np
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import box_sampler
+
+
+class BoxSamplerTest(tf.test.TestCase):
+
+  def test_box_sampler(self):
+    positive_matches = np.array(
+        [[True, False, False, False, True, True, False],
+         [False, False, False, False, False, True, True]])
+    negative_matches = np.array(
+        [[False, True, True, True, False, False, False],
+         [True, True, True, True, False, False, False]])
+    ignored_matches = np.array(
+        [[False, False, False, False, False, False, True],
+         [False, False, False, False, True, False, False]])
+
+    sampler = box_sampler.BoxSampler(num_samples=2, foreground_fraction=0.5)
+
+    # Runs on TPU.
+    strategy = tf.distribute.experimental.TPUStrategy()
+    with strategy.scope():
+      selected_indices_tpu = sampler(
+          positive_matches, negative_matches, ignored_matches)
+
+    self.assertEqual(2, tf.shape(selected_indices_tpu)[1])
+
+    # Runs on CPU.
+    selected_indices_cpu = sampler(
+        positive_matches, negative_matches, ignored_matches)
+    self.assertEqual(2, tf.shape(selected_indices_cpu)[1])
+
+  def test_serialize_deserialize(self):
+    kwargs = dict(
+        num_samples=512,
+        foreground_fraction=0.25,
+    )
+    sampler = box_sampler.BoxSampler(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(sampler.get_config(), expected_config)
+
+    new_sampler = box_sampler.BoxSampler.from_config(
+        sampler.get_config())
+
+    self.assertAllEqual(sampler.get_config(), new_sampler.get_config())
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/modeling/layers/detection_generator.py
+++ b/official/vision/beta/modeling/layers/detection_generator.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generators to generate the final detections."""
+
+# Import libraries
+
+import tensorflow as tf
+
+from official.vision.beta.ops import box_ops
+from official.vision.beta.ops import nms
+
+
+def _generate_detections_v1(boxes,
+                            scores,
+                            pre_nms_top_k=5000,
+                            pre_nms_score_threshold=0.05,
+                            nms_iou_threshold=0.5,
+                            max_num_detections=100):
+  """Generate the final detections given the model outputs.
+
+  The implementation unrolls the batch dimension and process images one by one.
+  It required the batch dimension to be statically known and it is TPU
+  compatible.
+
+  Args:
+    boxes: a tensor with shape [batch_size, N, num_classes, 4] or
+      [batch_size, N, 1, 4], which box predictions on all feature levels. The N
+      is the number of total anchors on all levels.
+    scores: a tensor with shape [batch_size, N, num_classes], which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: an int number of top candidate detections per class
+      before NMS.
+    pre_nms_score_threshold: a float representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: a float representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: a scalar representing maximum number of boxes retained
+      over all classes.
+
+  Returns:
+    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
+      representing top detected boxes in [y1, x1, y2, x2].
+    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
+      representing sorted confidence scores for detected boxes. The values are
+      between [0, 1].
+    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
+      representing classes for detected boxes.
+    valid_detections: `int` Tensor of shape [batch_size] only the top
+      `valid_detections` boxes are valid detections.
+  """
+  with tf.name_scope('generate_detections'):
+    batch_size = scores.get_shape().as_list()[0]
+    nmsed_boxes = []
+    nmsed_classes = []
+    nmsed_scores = []
+    valid_detections = []
+    for i in range(batch_size):
+      (nmsed_boxes_i, nmsed_scores_i, nmsed_classes_i,
+       valid_detections_i) = _generate_detections_per_image(
+           boxes[i],
+           scores[i],
+           max_num_detections,
+           nms_iou_threshold,
+           pre_nms_score_threshold,
+           pre_nms_top_k)
+      nmsed_boxes.append(nmsed_boxes_i)
+      nmsed_scores.append(nmsed_scores_i)
+      nmsed_classes.append(nmsed_classes_i)
+      valid_detections.append(valid_detections_i)
+  nmsed_boxes = tf.stack(nmsed_boxes, axis=0)
+  nmsed_scores = tf.stack(nmsed_scores, axis=0)
+  nmsed_classes = tf.stack(nmsed_classes, axis=0)
+  valid_detections = tf.stack(valid_detections, axis=0)
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
+
+
+def _generate_detections_per_image(boxes,
+                                   scores,
+                                   pre_nms_top_k=5000,
+                                   pre_nms_score_threshold=0.05,
+                                   nms_iou_threshold=0.5,
+                                   max_num_detections=100):
+  """Generate the final detections per image given the model outputs.
+
+  Args:
+    boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which box
+      predictions on all feature levels. The N is the number of total anchors on
+      all levels.
+    scores: a tensor with shape [N, num_classes], which stacks class probability
+      on all feature levels. The N is the number of total anchors on all levels.
+      The num_classes is the number of classes predicted by the model. Note that
+      the class_outputs here is the raw score.
+    pre_nms_top_k: an int number of top candidate detections per class
+      before NMS.
+    pre_nms_score_threshold: a float representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: a float representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: a scalar representing maximum number of boxes retained
+      over all classes.
+
+  Returns:
+    nms_boxes: `float` Tensor of shape [max_num_detections, 4] representing top
+      detected boxes in [y1, x1, y2, x2].
+    nms_scores: `float` Tensor of shape [max_num_detections] representing sorted
+      confidence scores for detected boxes. The values are between [0, 1].
+    nms_classes: `int` Tensor of shape [max_num_detections] representing classes
+      for detected boxes.
+    valid_detections: `int` Tensor of shape [1] only the top `valid_detections`
+      boxes are valid detections.
+  """
+  nmsed_boxes = []
+  nmsed_scores = []
+  nmsed_classes = []
+  num_classes_for_box = boxes.get_shape().as_list()[1]
+  num_classes = scores.get_shape().as_list()[1]
+  for i in range(num_classes):
+    boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
+    scores_i = scores[:, i]
+
+    # Obtains pre_nms_top_k before running NMS.
+    scores_i, indices = tf.nn.top_k(
+        scores_i, k=tf.minimum(tf.shape(scores_i)[-1], pre_nms_top_k))
+    boxes_i = tf.gather(boxes_i, indices)
+
+    (nmsed_indices_i,
+     nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
+         tf.cast(boxes_i, tf.float32),
+         tf.cast(scores_i, tf.float32),
+         max_num_detections,
+         iou_threshold=nms_iou_threshold,
+         score_threshold=pre_nms_score_threshold,
+         pad_to_max_output_size=True,
+         name='nms_detections_' + str(i))
+    nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
+    nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
+    # Sets scores of invalid boxes to -1.
+    nmsed_scores_i = tf.where(
+        tf.less(tf.range(max_num_detections), [nmsed_num_valid_i]),
+        nmsed_scores_i, -tf.ones_like(nmsed_scores_i))
+    nmsed_classes_i = tf.fill([max_num_detections], i)
+    nmsed_boxes.append(nmsed_boxes_i)
+    nmsed_scores.append(nmsed_scores_i)
+    nmsed_classes.append(nmsed_classes_i)
+
+  # Concats results from all classes and sort them.
+  nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
+  nmsed_scores = tf.concat(nmsed_scores, axis=0)
+  nmsed_classes = tf.concat(nmsed_classes, axis=0)
+  nmsed_scores, indices = tf.nn.top_k(
+      nmsed_scores, k=max_num_detections, sorted=True)
+  nmsed_boxes = tf.gather(nmsed_boxes, indices)
+  nmsed_classes = tf.gather(nmsed_classes, indices)
+  valid_detections = tf.reduce_sum(
+      tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
+
+
+def _select_top_k_scores(scores_in, pre_nms_num_detections):
+  """Select top_k scores and indices for each class.
+
+  Args:
+    scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
+      class logit outputs on all feature levels. The N is the number of total
+      anchors on all levels. The num_classes is the number of classes predicted
+      by the model.
+    pre_nms_num_detections: Number of candidates before NMS.
+
+  Returns:
+    scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
+      num_classes].
+  """
+  batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
+  scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
+  scores_trans = tf.reshape(scores_trans, [-1, num_anchors])
+
+  top_k_scores, top_k_indices = tf.nn.top_k(
+      scores_trans, k=pre_nms_num_detections, sorted=True)
+
+  top_k_scores = tf.reshape(top_k_scores,
+                            [batch_size, num_class, pre_nms_num_detections])
+  top_k_indices = tf.reshape(top_k_indices,
+                             [batch_size, num_class, pre_nms_num_detections])
+
+  return tf.transpose(top_k_scores,
+                      [0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1])
+
+
+def _generate_detections_v2(boxes,
+                            scores,
+                            pre_nms_top_k=5000,
+                            pre_nms_score_threshold=0.05,
+                            nms_iou_threshold=0.5,
+                            max_num_detections=100):
+  """Generate the final detections given the model outputs.
+
+  This implementation unrolls classes dimension while using the tf.while_loop
+  to implement the batched NMS, so that it can be parallelized at the batch
+  dimension. It should give better performance comparing to v1 implementation.
+  It is TPU compatible.
+
+  Args:
+    boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
+      N, 1, 4], which box predictions on all feature levels. The N is the number
+      of total anchors on all levels.
+    scores: a tensor with shape [batch_size, N, num_classes], which stacks class
+      probability on all feature levels. The N is the number of total anchors on
+      all levels. The num_classes is the number of classes predicted by the
+      model. Note that the class_outputs here is the raw score.
+    pre_nms_top_k: an int number of top candidate detections per class
+      before NMS.
+    pre_nms_score_threshold: a float representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: a float representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: a scalar representing maximum number of boxes retained
+      over all classes.
+
+  Returns:
+    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
+      representing top detected boxes in [y1, x1, y2, x2].
+    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
+      representing sorted confidence scores for detected boxes. The values are
+      between [0, 1].
+    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
+      representing classes for detected boxes.
+    valid_detections: `int` Tensor of shape [batch_size] only the top
+      `valid_detections` boxes are valid detections.
+  """
+  with tf.name_scope('generate_detections'):
+    nmsed_boxes = []
+    nmsed_classes = []
+    nmsed_scores = []
+    valid_detections = []
+    batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
+    _, total_anchors, num_classes = scores.get_shape().as_list()
+    # Selects top pre_nms_num scores and indices before NMS.
+    scores, indices = _select_top_k_scores(
+        scores, min(total_anchors, pre_nms_top_k))
+    for i in range(num_classes):
+      boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
+      scores_i = scores[:, :, i]
+      # Obtains pre_nms_top_k before running NMS.
+      boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
+
+      # Filter out scores.
+      boxes_i, scores_i = box_ops.filter_boxes_by_scores(
+          boxes_i, scores_i, min_score_threshold=pre_nms_score_threshold)
+
+      (nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
+          tf.cast(scores_i, tf.float32),
+          tf.cast(boxes_i, tf.float32),
+          max_num_detections,
+          iou_threshold=nms_iou_threshold)
+      nmsed_classes_i = tf.fill([batch_size, max_num_detections], i)
+      nmsed_boxes.append(nmsed_boxes_i)
+      nmsed_scores.append(nmsed_scores_i)
+      nmsed_classes.append(nmsed_classes_i)
+  nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
+  nmsed_scores = tf.concat(nmsed_scores, axis=1)
+  nmsed_classes = tf.concat(nmsed_classes, axis=1)
+  nmsed_scores, indices = tf.nn.top_k(
+      nmsed_scores, k=max_num_detections, sorted=True)
+  nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
+  nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
+  valid_detections = tf.reduce_sum(
+      input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32), axis=1)
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
+
+
+def _generate_detections_batched(boxes,
+                                 scores,
+                                 pre_nms_score_threshold,
+                                 nms_iou_threshold,
+                                 max_num_detections):
+  """Generates detected boxes with scores and classes for one-stage detector.
+
+  The function takes output of multi-level ConvNets and anchor boxes and
+  generates detected boxes. Note that this used batched nms, which is not
+  supported on TPU currently.
+
+  Args:
+    boxes: a tensor with shape [batch_size, N, num_classes, 4] or
+      [batch_size, N, 1, 4], which box predictions on all feature levels. The N
+      is the number of total anchors on all levels.
+    scores: a tensor with shape [batch_size, N, num_classes], which
+      stacks class probability on all feature levels. The N is the number of
+      total anchors on all levels. The num_classes is the number of classes
+      predicted by the model. Note that the class_outputs here is the raw score.
+    pre_nms_score_threshold: a float representing the threshold for deciding
+      when to remove boxes based on score.
+    nms_iou_threshold: a float representing the threshold for deciding whether
+      boxes overlap too much with respect to IOU.
+    max_num_detections: a scalar representing maximum number of boxes retained
+      over all classes.
+
+  Returns:
+    nms_boxes: `float` Tensor of shape [batch_size, max_num_detections, 4]
+      representing top detected boxes in [y1, x1, y2, x2].
+    nms_scores: `float` Tensor of shape [batch_size, max_num_detections]
+      representing sorted confidence scores for detected boxes. The values are
+      between [0, 1].
+    nms_classes: `int` Tensor of shape [batch_size, max_num_detections]
+      representing classes for detected boxes.
+    valid_detections: `int` Tensor of shape [batch_size] only the top
+      `valid_detections` boxes are valid detections.
+  """
+  with tf.name_scope('generate_detections'):
+    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+        tf.image.combined_non_max_suppression(
+            boxes,
+            scores,
+            max_output_size_per_class=max_num_detections,
+            max_total_size=max_num_detections,
+            iou_threshold=nms_iou_threshold,
+            score_threshold=pre_nms_score_threshold,
+            pad_per_class=False,
+            clip_boxes=False))
+  return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class DetectionGenerator(tf.keras.layers.Layer):
+  """Generates the final detected boxes with scores and classes."""
+
+  def __init__(self,
+               apply_nms=True,
+               pre_nms_top_k=5000,
+               pre_nms_score_threshold=0.05,
+               nms_iou_threshold=0.5,
+               max_num_detections=100,
+               use_batched_nms=False,
+               **kwargs):
+    """Initializes a detection generator.
+
+    Args:
+      apply_nms: bool, whether or not apply non maximum suppression. If False,
+        the decoded boxes and their scores are returned.
+      pre_nms_top_k: int, the number of top scores proposals to be kept before
+        applying NMS.
+      pre_nms_score_threshold: float, the score threshold to apply before
+        applying  NMS. Proposals whose scores are below this threshold are
+        thrown away.
+      nms_iou_threshold: float in [0, 1], the NMS IoU threshold.
+      max_num_detections: int, the final number of total detections to generate.
+      use_batched_nms: bool, whether or not use
+        `tf.image.combined_non_max_suppression`.
+      **kwargs: other key word arguments passed to Layer.
+    """
+    self._config_dict = {
+        'apply_nms': apply_nms,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': nms_iou_threshold,
+        'max_num_detections': max_num_detections,
+        'use_batched_nms': use_batched_nms,
+    }
+    super(DetectionGenerator, self).__init__(**kwargs)
+
+  def __call__(self,
+               raw_boxes,
+               raw_scores,
+               anchor_boxes,
+               image_shape):
+    """Generate final detections.
+
+    Args:
+      raw_boxes: a tensor of shape of [batch_size, K, num_classes * 4]
+        representing the class-specific box coordinates relative to anchors.
+      raw_scores: a tensor of shape of [batch_size, K, num_classes]
+        representing the class logits before applying score activiation.
+      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
+        corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: a tensor of shape of [batch_size, 2] storing the image height
+        and width w.r.t. the scaled image, i.e. the same image space as
+        `box_outputs` and `anchor_boxes`.
+
+    Returns:
+      If `apply_nms` = True, the return is a dictionary with keys:
+        `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
+          representing top detected boxes in [y1, x1, y2, x2].
+        `detection_scores`: float Tensor of shape [batch, max_num_detections]
+          representing sorted confidence scores for detected boxes. The values
+          are between [0, 1].
+        `detection_classes`: int Tensor of shape [batch, max_num_detections]
+          representing classes for detected boxes.
+        `num_detections`: int Tensor of shape [batch] only the first
+          `num_detections` boxes are valid detections
+      If `apply_nms` = False, the return is a dictionary with keys:
+        `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
+          representing all the decoded boxes.
+        `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
+          representing socres of all the decoded boxes.
+    """
+    box_scores = tf.nn.softmax(raw_scores, axis=-1)
+
+    # Removes the background class.
+    box_scores_shape = tf.shape(box_scores)
+    batch_size = box_scores_shape[0]
+    num_locations = box_scores_shape[1]
+    num_classes = box_scores_shape[-1]
+    num_detections = num_locations * (num_classes - 1)
+
+    box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])
+    raw_boxes = tf.reshape(
+        raw_boxes,
+        tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
+    raw_boxes = tf.slice(
+        raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
+    anchor_boxes = tf.tile(
+        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
+    raw_boxes = tf.reshape(
+        raw_boxes,
+        tf.stack([batch_size, num_detections, 4], axis=-1))
+    anchor_boxes = tf.reshape(
+        anchor_boxes,
+        tf.stack([batch_size, num_detections, 4], axis=-1))
+
+    # Box decoding.
+    decoded_boxes = box_ops.decode_boxes(
+        raw_boxes, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
+
+    # Box clipping
+    decoded_boxes = box_ops.clip_boxes(
+        decoded_boxes, tf.expand_dims(image_shape, axis=1))
+
+    decoded_boxes = tf.reshape(
+        decoded_boxes,
+        tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
+
+    if not self._config_dict['apply_nms']:
+      return {
+          'decoded_boxes': decoded_boxes,
+          'decoded_box_scores': box_scores,
+      }
+
+    if self._config_dict['use_batched_nms']:
+      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+          _generate_detections_batched(
+              decoded_boxes,
+              box_scores,
+              self._config_dict['pre_nms_score_threshold'],
+              self._config_dict['nms_iou_threshold'],
+              self._config_dict['max_num_detections']))
+    else:
+      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+          _generate_detections_v2(
+              decoded_boxes,
+              box_scores,
+              self._config_dict['pre_nms_top_k'],
+              self._config_dict['pre_nms_score_threshold'],
+              self._config_dict['nms_iou_threshold'],
+              self._config_dict['max_num_detections']))
+
+    # Adds 1 to offset the background class which has index 0.
+    nmsed_classes += 1
+
+    return {
+        'num_detections': valid_detections,
+        'detection_boxes': nmsed_boxes,
+        'detection_classes': nmsed_classes,
+        'detection_scores': nmsed_scores,
+    }
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MultilevelDetectionGenerator(tf.keras.layers.Layer):
+  """Generates detected boxes with scores and classes for one-stage detector."""
+
+  def __init__(self,
+               apply_nms=True,
+               pre_nms_top_k=5000,
+               pre_nms_score_threshold=0.05,
+               nms_iou_threshold=0.5,
+               max_num_detections=100,
+               use_batched_nms=False,
+               **kwargs):
+    """Initializes a detection generator.
+
+    Args:
+      apply_nms: bool, whether or not apply non maximum suppression. If False,
+        the decoded boxes and their scores are returned.
+      pre_nms_top_k: int, the number of top scores proposals to be kept before
+        applying NMS.
+      pre_nms_score_threshold: float, the score threshold to apply before
+        applying  NMS. Proposals whose scores are below this threshold are
+        thrown away.
+      nms_iou_threshold: float in [0, 1], the NMS IoU threshold.
+      max_num_detections: int, the final number of total detections to generate.
+      use_batched_nms: bool, whether or not use
+        `tf.image.combined_non_max_suppression`.
+      **kwargs: other key word arguments passed to Layer.
+    """
+    self._config_dict = {
+        'apply_nms': apply_nms,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': nms_iou_threshold,
+        'max_num_detections': max_num_detections,
+        'use_batched_nms': use_batched_nms,
+    }
+    super(MultilevelDetectionGenerator, self).__init__(**kwargs)
+
+  def __call__(self,
+               raw_boxes,
+               raw_scores,
+               anchor_boxes,
+               image_shape):
+    """Generate final detections.
+
+    Args:
+      raw_boxes: a dict with keys representing FPN levels and values
+        representing box tenors of shape
+        [batch, feature_h, feature_w, num_anchors * 4].
+      raw_scores: a dict with keys representing FPN levels and values
+        representing logit tensors of shape
+        [batch, feature_h, feature_w, num_anchors].
+      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
+        corresponding anchor boxes w.r.t `box_outputs`.
+      image_shape: a tensor of shape of [batch_size, 2] storing the image height
+        and width w.r.t. the scaled image, i.e. the same image space as
+        `box_outputs` and `anchor_boxes`.
+
+    Returns:
+      If `apply_nms` = True, the return is a dictionary with keys:
+        `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
+          representing top detected boxes in [y1, x1, y2, x2].
+        `detection_scores`: float Tensor of shape [batch, max_num_detections]
+          representing sorted confidence scores for detected boxes. The values
+          are between [0, 1].
+        `detection_classes`: int Tensor of shape [batch, max_num_detections]
+          representing classes for detected boxes.
+        `num_detections`: int Tensor of shape [batch] only the first
+          `num_detections` boxes are valid detections
+      If `apply_nms` = False, the return is a dictionary with keys:
+        `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
+          representing all the decoded boxes.
+        `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
+          representing socres of all the decoded boxes.
+    """
+    # Collects outputs from all levels into a list.
+    boxes = []
+    scores = []
+    levels = list(raw_boxes.keys())
+    min_level = min(levels)
+    max_level = max(levels)
+    for i in range(min_level, max_level + 1):
+      raw_boxes_i_shape = tf.shape(raw_boxes[i])
+      batch_size = raw_boxes_i_shape[0]
+      num_anchors_per_locations = raw_boxes_i_shape[-1] // 4
+      num_classes = tf.shape(raw_scores[i])[-1] // num_anchors_per_locations
+
+      # Applies score transformation and remove the implicit background class.
+      scores_i = tf.sigmoid(
+          tf.reshape(raw_scores[i], [batch_size, -1, num_classes]))
+      scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])
+
+      # Box decoding.
+      # The anchor boxes are shared for all data in a batch.
+      # One stage detector only supports class agnostic box regression.
+      anchor_boxes_i = tf.reshape(anchor_boxes[i], [batch_size, -1, 4])
+      raw_boxes_i = tf.reshape(raw_boxes[i], [batch_size, -1, 4])
+      boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)
+
+      # Box clipping.
+      boxes_i = box_ops.clip_boxes(
+          boxes_i, tf.expand_dims(image_shape, axis=1))
+
+      boxes.append(boxes_i)
+      scores.append(scores_i)
+    boxes = tf.concat(boxes, axis=1)
+    boxes = tf.expand_dims(boxes, axis=2)
+    scores = tf.concat(scores, axis=1)
+
+    if not self._config_dict['apply_nms']:
+      return {
+          'decoded_boxes': boxes,
+          'decoded_box_scores': scores,
+      }
+
+    if self._config_dict['use_batched_nms']:
+      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+          _generate_detections_batched(
+              boxes,
+              scores,
+              self._config_dict['pre_nms_score_threshold'],
+              self._config_dict['nms_iou_threshold'],
+              self._config_dict['max_num_detections']))
+    else:
+      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+          _generate_detections_v2(
+              boxes,
+              scores,
+              self._config_dict['pre_nms_top_k'],
+              self._config_dict['pre_nms_score_threshold'],
+              self._config_dict['nms_iou_threshold'],
+              self._config_dict['max_num_detections']))
+
+    # Adds 1 to offset the background class which has index 0.
+    nmsed_classes += 1
+
+    return {
+        'num_detections': valid_detections,
+        'detection_boxes': nmsed_boxes,
+        'detection_classes': nmsed_classes,
+        'detection_scores': nmsed_scores,
+    }
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/beta/modeling/layers/detection_generator_test.py
+++ b/official/vision/beta/modeling/layers/detection_generator_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for detection_generator.py."""
+# Import libraries
+
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import detection_generator
+from official.vision.beta.ops import anchor
+
+
+class SelectTopKScoresTest(tf.test.TestCase):
+
+  def testSelectTopKScores(self):
+    pre_nms_num_boxes = 2
+    scores_data = [[[0.2, 0.2], [0.1, 0.9], [0.5, 0.1], [0.3, 0.5]]]
+    scores_in = tf.constant(scores_data, dtype=tf.float32)
+    top_k_scores, top_k_indices = detection_generator._select_top_k_scores(
+        scores_in, pre_nms_num_detections=pre_nms_num_boxes)
+    expected_top_k_scores = np.array([[[0.5, 0.9], [0.3, 0.5]]],
+                                     dtype=np.float32)
+
+    expected_top_k_indices = [[[2, 1], [3, 3]]]
+
+    self.assertAllEqual(top_k_scores.numpy(), expected_top_k_scores)
+    self.assertAllEqual(top_k_indices.numpy(), expected_top_k_indices)
+
+
+class DetectionGeneratorTest(
+    parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (True),
+      (False),
+  )
+  def testDetectionsOutputShape(self, use_batched_nms):
+    max_num_detections = 100
+    num_classes = 4
+    pre_nms_top_k = 5000
+    pre_nms_score_threshold = 0.01
+    batch_size = 1
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': max_num_detections,
+        'use_batched_nms': use_batched_nms,
+    }
+    generator = detection_generator.DetectionGenerator(**kwargs)
+
+    cls_outputs_all = (
+        np.random.rand(84, num_classes) - 0.5) * 3  # random 84x3 outputs.
+    box_outputs_all = np.random.rand(84, 4 * num_classes)  # random 84 boxes.
+    anchor_boxes_all = np.random.rand(84, 4)  # random 84 boxes.
+    class_outputs = tf.reshape(
+        tf.convert_to_tensor(cls_outputs_all, dtype=tf.float32),
+        [1, 84, num_classes])
+    box_outputs = tf.reshape(
+        tf.convert_to_tensor(box_outputs_all, dtype=tf.float32),
+        [1, 84, 4 * num_classes])
+    anchor_boxes = tf.reshape(
+        tf.convert_to_tensor(anchor_boxes_all, dtype=tf.float32),
+        [1, 84, 4])
+    image_info = tf.constant(
+        [[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
+        dtype=tf.float32)
+    results = generator(
+        box_outputs, class_outputs, anchor_boxes, image_info[:, 1, :])
+    boxes = results['detection_boxes']
+    classes = results['detection_classes']
+    scores = results['detection_scores']
+    valid_detections = results['num_detections']
+
+    self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
+    self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,))
+    self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,))
+    self.assertEqual(valid_detections.numpy().shape, (batch_size,))
+
+  def test_serialize_deserialize(self):
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 1000,
+        'pre_nms_score_threshold': 0.1,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': 10,
+        'use_batched_nms': False,
+    }
+    generator = detection_generator.DetectionGenerator(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(generator.get_config(), expected_config)
+
+    new_generator = (
+        detection_generator.DetectionGenerator.from_config(
+            generator.get_config()))
+
+    self.assertAllEqual(generator.get_config(), new_generator.get_config())
+
+
+class MultilevelDetectionGeneratorTest(
+    parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (True),
+      (False),
+  )
+  def testDetectionsOutputShape(self, use_batched_nms):
+    min_level = 4
+    max_level = 6
+    num_scales = 2
+    max_num_detections = 100
+    aspect_ratios = [1.0, 2.0,]
+    anchor_scale = 2.0
+    output_size = [64, 64]
+    num_classes = 4
+    pre_nms_top_k = 5000
+    pre_nms_score_threshold = 0.01
+    batch_size = 1
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': max_num_detections,
+        'use_batched_nms': use_batched_nms,
+    }
+
+    input_anchor = anchor.build_anchor_generator(min_level, max_level,
+                                                 num_scales, aspect_ratios,
+                                                 anchor_scale)
+    anchor_boxes = input_anchor(output_size)
+    cls_outputs_all = (
+        np.random.rand(84, num_classes) - 0.5) * 3  # random 84x3 outputs.
+    box_outputs_all = np.random.rand(84, 4)  # random 84 boxes.
+    class_outputs = {
+        4: tf.reshape(tf.convert_to_tensor(
+            cls_outputs_all[0:64], dtype=tf.float32),
+                      [1, 8, 8, num_classes]),
+        5: tf.reshape(tf.convert_to_tensor(
+            cls_outputs_all[64:80], dtype=tf.float32),
+                      [1, 4, 4, num_classes]),
+        6: tf.reshape(tf.convert_to_tensor(
+            cls_outputs_all[80:84], dtype=tf.float32),
+                      [1, 2, 2, num_classes]),
+    }
+    box_outputs = {
+        4: tf.reshape(tf.convert_to_tensor(
+            box_outputs_all[0:64], dtype=tf.float32), [1, 8, 8, 4]),
+        5: tf.reshape(tf.convert_to_tensor(
+            box_outputs_all[64:80], dtype=tf.float32), [1, 4, 4, 4]),
+        6: tf.reshape(tf.convert_to_tensor(
+            box_outputs_all[80:84], dtype=tf.float32), [1, 2, 2, 4]),
+    }
+    image_info = tf.constant([[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]],
+                             dtype=tf.float32)
+    generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
+    results = generator(box_outputs, class_outputs, anchor_boxes,
+                        image_info[:, 1, :])
+    boxes = results['detection_boxes']
+    classes = results['detection_classes']
+    scores = results['detection_scores']
+    valid_detections = results['num_detections']
+
+    self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4))
+    self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,))
+    self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,))
+    self.assertEqual(valid_detections.numpy().shape, (batch_size,))
+
+  def test_serialize_deserialize(self):
+    kwargs = {
+        'apply_nms': True,
+        'pre_nms_top_k': 1000,
+        'pre_nms_score_threshold': 0.1,
+        'nms_iou_threshold': 0.5,
+        'max_num_detections': 10,
+        'use_batched_nms': False,
+    }
+    generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(generator.get_config(), expected_config)
+
+    new_generator = (
+        detection_generator.MultilevelDetectionGenerator.from_config(
+            generator.get_config()))
+
+    self.assertAllEqual(generator.get_config(), new_generator.get_config())
+
+
+if __name__ == '__main__':  # Run the tests only when executed as a script.
+  tf.test.main()
--- a/official/vision/beta/modeling/layers/mask_sampler.py
+++ b/official/vision/beta/modeling/layers/mask_sampler.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Mask sampler."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.beta.ops import spatial_transform_ops
+
+
+def _sample_and_crop_foreground_masks(candidate_rois,
+                                      candidate_gt_boxes,
+                                      candidate_gt_classes,
+                                      candidate_gt_indices,
+                                      gt_masks,
+                                      num_sampled_masks=128,
+                                      mask_target_size=28):
+  """Samples and creates cropped foreground masks for training.
+
+  Args:
+    candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
+      number of candidate RoIs to be considered for mask sampling. It includes
+      both positive and negative RoIs. The `num_mask_samples_per_image` positive
+      RoIs will be sampled to create mask training targets.
+    candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
+      corresponding groundtruth boxes to the `candidate_rois`.
+    candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
+      corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
+      corresponds to the background class, i.e. negative RoIs.
+    candidate_gt_indices: a tensor of shape [batch_size, N], storing the
+      corresponding groundtruth instance indices to the `candidate_gt_boxes`,
+      i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i] and
+      gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is the
+      superset of candidate_gt_boxes.
+    gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
+      containing all the groundtruth masks which sample masks are drawn from.
+    num_sampled_masks: an integer which specifies the number of masks
+      to sample.
+    mask_target_size: an integer which specifies the final cropped mask size
+      after sampling. The output masks are resized w.r.t the sampled RoIs.
+
+  Returns:
+    foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
+      that corresponds to the sampled foreground masks, where
+      K = num_mask_samples_per_image.
+    foreground_classes: a tensor of shape of [batch_size, K] storing the classes
+      corresponding to the sampled foreground masks.
+    cropoped_foreground_masks: a tensor of shape of
+      [batch_size, K, mask_target_size, mask_target_size] storing the cropped
+      foreground masks used for training.
+  """
+  _, fg_instance_indices = tf.nn.top_k(
+      tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32),
+      k=num_sampled_masks)
+
+  fg_instance_indices_shape = tf.shape(fg_instance_indices)
+  batch_indices = (
+      tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
+      tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))
+
+  gather_nd_instance_indices = tf.stack(
+      [batch_indices, fg_instance_indices], axis=-1)
+  foreground_rois = tf.gather_nd(
+      candidate_rois, gather_nd_instance_indices)
+  foreground_boxes = tf.gather_nd(
+      candidate_gt_boxes, gather_nd_instance_indices)
+  foreground_classes = tf.gather_nd(
+      candidate_gt_classes, gather_nd_instance_indices)
+  foreground_gt_indices = tf.gather_nd(
+      candidate_gt_indices, gather_nd_instance_indices)
+  foreground_gt_indices = tf.where(
+      tf.equal(foreground_gt_indices, -1),
+      tf.zeros_like(foreground_gt_indices),
+      foreground_gt_indices)
+
+  foreground_gt_indices_shape = tf.shape(foreground_gt_indices)
+  batch_indices = (
+      tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) *
+      tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32))
+  gather_nd_gt_indices = tf.stack(
+      [batch_indices, foreground_gt_indices], axis=-1)
+  foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)
+
+  cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
+      foreground_masks, foreground_boxes, foreground_rois, mask_target_size,
+      sample_offset=0.5)
+
+  return foreground_rois, foreground_classes, cropped_foreground_masks
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MaskSampler(tf.keras.layers.Layer):
+  """Samples and creates mask training targets."""
+
+  def __init__(self,
+               mask_target_size,
+               num_sampled_masks,
+               **kwargs):
+    self._config_dict = {
+        'mask_target_size': mask_target_size,
+        'num_sampled_masks': num_sampled_masks,
+    }
+    super(MaskSampler, self).__init__(**kwargs)
+
+  def call(self,
+           candidate_rois,
+           candidate_gt_boxes,
+           candidate_gt_classes,
+           candidate_gt_indices,
+           gt_masks):
+    """Sample and create mask targets for training.
+
+    Args:
+      candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
+        number of candidate RoIs to be considered for mask sampling. It includes
+        both positive and negative RoIs. The `num_mask_samples_per_image`
+        positive RoIs will be sampled to create mask training targets.
+      candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
+        corresponding groundtruth boxes to the `candidate_rois`.
+      candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
+        corresponding groundtruth classes to the `candidate_rois`. 0 in the
+        tensor corresponds to the background class, i.e. negative RoIs.
+      candidate_gt_indices: a tensor of shape [batch_size, N], storing the
+        corresponding groundtruth instance indices to the `candidate_gt_boxes`,
+        i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
+        where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N,
+        is the superset of candidate_gt_boxes.
+      gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
+        containing all the groundtruth masks which sample masks are drawn from.
+        after sampling. The output masks are resized w.r.t the sampled RoIs.
+
+    Returns:
+      foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
+        that corresponds to the sampled foreground masks, where
+        K = num_mask_samples_per_image.
+      foreground_classes: a tensor of shape of [batch_size, K] storing the
+        classes corresponding to the sampled foreground masks.
+      cropoped_foreground_masks: a tensor of shape of
+        [batch_size, K, mask_target_size, mask_target_size] storing the
+        cropped foreground masks used for training.
+    """
+    foreground_rois, foreground_classes, cropped_foreground_masks = (
+        _sample_and_crop_foreground_masks(
+            candidate_rois,
+            candidate_gt_boxes,
+            candidate_gt_classes,
+            candidate_gt_indices,
+            gt_masks,
+            self._config_dict['num_sampled_masks'],
+            self._config_dict['mask_target_size']))
+    return foreground_rois, foreground_classes, cropped_foreground_masks
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
--- a/official/vision/beta/modeling/layers/mask_sampler_test.py
+++ b/official/vision/beta/modeling/layers/mask_sampler_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for mask_sampler.py."""
+
+# Import libraries
+import numpy as np
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import mask_sampler
+
+
+class SampleAndCropForegroundMasksTest(tf.test.TestCase):
+  """Shape checks for `mask_sampler._sample_and_crop_foreground_masks`."""
+
+  def test_sample_and_crop_foreground_masks(self):
+    """Calls the op inside and outside a TPUStrategy scope; compares shapes."""
+    rois = tf.constant(
+        np.array([[[0, 0, 0.5, 0.5], [0.5, 0.5, 1, 1],
+                   [2, 2, 4, 4], [1, 1, 5, 5]]]),
+        dtype=tf.float32)
+    gt_boxes = tf.constant(
+        np.array([[[0, 0, 0.6, 0.6], [0, 0, 0, 0],
+                   [1, 1, 3, 3], [1, 1, 3, 3]]]),
+        dtype=tf.float32)
+    gt_classes = tf.constant(np.array([[4, 0, 0, 2]]), dtype=tf.float32)
+    gt_indices = tf.constant(np.array([[10, -1, -1, 20]]), dtype=tf.int32)
+    masks = tf.constant(np.random.rand(1, 100, 32, 32), dtype=tf.float32)
+
+    # Positional arguments shared by both invocations; the trailing two are the
+    # number of mask samples per image (2) and the mask target size (28).
+    op_args = (rois, gt_boxes, gt_classes, gt_indices, masks, 2, 28)
+
+    # Runs on TPU.
+    with tf.distribute.experimental.TPUStrategy().scope():
+      tpu_rois, tpu_classes, tpu_masks = (
+          mask_sampler._sample_and_crop_foreground_masks(*op_args))
+    tpu_rois_shape = tpu_rois.numpy().shape
+    tpu_classes_shape = tpu_classes.numpy().shape
+    tpu_masks_shape = tpu_masks.numpy().shape
+
+    # Same call again, this time outside of any strategy scope.
+    cpu_rois, cpu_classes, cpu_masks = (
+        mask_sampler._sample_and_crop_foreground_masks(*op_args))
+    cpu_rois_shape = cpu_rois.numpy().shape
+    cpu_classes_shape = cpu_classes.numpy().shape
+    cpu_masks_shape = cpu_masks.numpy().shape
+
+    # Consistency: both invocations agree on every output shape.
+    self.assertAllEqual(tpu_rois_shape, cpu_rois_shape)
+    self.assertAllEqual(tpu_classes_shape, cpu_classes_shape)
+    self.assertAllEqual(tpu_masks_shape, cpu_masks_shape)
+
+    # Correctness: output shapes match the expected static values.
+    self.assertAllEqual(tpu_rois_shape, [1, 2, 4])
+    self.assertAllEqual(tpu_classes_shape, [1, 2])
+    self.assertAllEqual(tpu_masks_shape, [1, 2, 28, 28])
+
+
+class MaskSamplerTest(tf.test.TestCase):
+  """Tests for the `mask_sampler.MaskSampler` layer."""
+
+  def test_mask_sampler(self):
+    """Checks the output shapes produced by a `MaskSampler(28, 2)` call."""
+    rois = tf.constant(
+        np.array([[[0, 0, 0.5, 0.5], [0.5, 0.5, 1, 1],
+                   [2, 2, 4, 4], [1, 1, 5, 5]]]),
+        dtype=tf.float32)
+    gt_boxes = tf.constant(
+        np.array([[[0, 0, 0.6, 0.6], [0, 0, 0, 0],
+                   [1, 1, 3, 3], [1, 1, 3, 3]]]),
+        dtype=tf.float32)
+    gt_classes = tf.constant(np.array([[4, 0, 0, 2]]), dtype=tf.float32)
+    gt_indices = tf.constant(np.array([[10, -1, -1, 20]]), dtype=tf.int32)
+    masks = tf.constant(np.random.rand(1, 100, 32, 32), dtype=tf.float32)
+
+    layer = mask_sampler.MaskSampler(28, 2)
+    sampled_rois, sampled_classes, sampled_masks = layer(
+        rois, gt_boxes, gt_classes, gt_indices, masks)
+
+    # Correctness: output shapes match the expected static values.
+    self.assertAllEqual(sampled_rois.numpy().shape, [1, 2, 4])
+    self.assertAllEqual(sampled_classes.numpy().shape, [1, 2])
+    self.assertAllEqual(sampled_masks.numpy().shape, [1, 2, 28, 28])
+
+  def test_serialize_deserialize(self):
+    """Checks that `get_config` round-trips through `from_config`."""
+    init_args = {'mask_target_size': 7, 'num_sampled_masks': 10}
+    original = mask_sampler.MaskSampler(**init_args)
+
+    # The config is expected to contain exactly the constructor arguments.
+    self.assertEqual(original.get_config(), dict(init_args))
+
+    restored = mask_sampler.MaskSampler.from_config(original.get_config())
+
+    self.assertAllEqual(original.get_config(), restored.get_config())
+
+
+# Run every test case defined in this module when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/modeling/layers/nn_blocks.py
+++ b/official/vision/beta/modeling/layers/nn_blocks.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains common building blocks for neural networks."""
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+# Import libraries
+
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.beta.modeling.layers import nn_layers
+
+
+def _pad_strides(strides: int, axis: int) -> Tuple[int, int, int, int]:
+  """Expands an int stride to the length-4 form that `tf.nn.avg_pool` uses.
+
+  Args:
+    strides: `int` stride value placed in two adjacent positions of the result.
+    axis: `int` channel axis. A value of 1 puts the strides in the last two
+      positions; any other value puts them in the middle two.
+
+  Returns:
+    `(1, 1, strides, strides)` if `axis == 1`, else `(1, strides, strides, 1)`.
+  """
+  pair = (strides, strides)
+  return (1, 1) + pair if axis == 1 else (1,) + pair + (1,)
+
+
+def _maybe_downsample(x: tf.Tensor,
+                      out_filter: int,
+                      strides: int,
+                      axis: int) -> tf.Tensor:
+  """Downsamples feature map and 0-pads channels if in_filter < out_filter.
+
+  Args:
+    x: `tf.Tensor` feature map to average-pool.
+    out_filter: `int` desired size of the channel dimension after padding.
+    strides: `int` used as both the pooling window size and the pooling stride.
+    axis: `int` channel axis; 1 selects 'NCHW', anything else selects 'NHWC'.
+
+  Returns:
+    The average-pooled tensor. When it has fewer than `out_filter` channels,
+    the channel dimension is zero-padded up to exactly `out_filter`; otherwise
+    the channel dimension is left unchanged.
+  """
+  data_format = 'NCHW' if axis == 1 else 'NHWC'
+  pool_strides = _pad_strides(strides, axis=axis)
+
+  # Window size equals the stride, so pooling windows do not overlap.
+  x = tf.nn.avg_pool(
+      x, pool_strides, pool_strides, 'VALID', data_format=data_format)
+
+  in_filter = x.shape[axis]
+  if in_filter < out_filter:
+    # Pad on channel dimension with 0s: half before, the remainder after.
+    # Giving the leftover channel of an odd difference to the trailing side
+    # makes the result have exactly `out_filter` channels; padding `diff // 2`
+    # on both sides would come up one channel short in that case.
+    total_pad = out_filter - in_filter
+    pad_before = total_pad // 2
+    pad_size = [pad_before, total_pad - pad_before]
+    if axis == 1:
+      x = tf.pad(x, [[0, 0], pad_size, [0, 0], [0, 0]])
+    else:
+      x = tf.pad(x, [[0, 0], [0, 0], [0, 0], pad_size])
+
+  # NOTE(review): `+ 0.` yields a new tensor even when no pooling/padding
+  # changed the values; presumably intentional -- confirm before removing.
+  return x + 0.
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ResidualBlock(tf.keras.layers.Layer):
+  """A residual block.
+
+  Two 3x3 convolutions, each followed by a normalization layer, whose output is
+  added to an identity (or 1x1 projection) shortcut and then activated.
+  """
+
+  def __init__(self,
+               filters,
+               strides,
+               use_projection=False,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """A residual block with BN after convolutions.
+
+    Args:
+      filters: `int` number of filters for both 3x3 convolutions, and for the
+        1x1 projection shortcut when `use_projection` is True.
+      strides: `int` block stride. If greater than 1, this block will ultimately
+        downsample the input.
+      use_projection: `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      stochastic_depth_drop_rate: `float` or None. if not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+                          Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+                        Default to None.
+      activation: `str` name of the activation function.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(ResidualBlock, self).__init__(**kwargs)
+
+    # Constructor arguments are kept verbatim so `get_config` can return them.
+    self._filters = filters
+    self._strides = strides
+    self._use_projection = use_projection
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    # `self._norm` holds the normalization layer *class*; it is instantiated
+    # once per normalization layer in `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the current Keras image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    """Creates the sub-layers: optional shortcut, two conv/norm pairs."""
+    if self._use_projection:
+      # 1x1 strided convolution so the shortcut matches the main branch.
+      self._shortcut = tf.keras.layers.Conv2D(
+          filters=self._filters,
+          kernel_size=1,
+          strides=self._strides,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    # First 3x3 convolution carries the block stride.
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Second 3x3 convolution always uses stride 1.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # A falsy drop rate (None or 0) disables stochastic depth entirely.
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+
+    super(ResidualBlock, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the base layer config updated with the constructor arguments."""
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'use_projection': self._use_projection,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(ResidualBlock, self).get_config()
+    # Entries in `config` take precedence over same-named base entries.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Applies the block: conv-norm-act, conv-norm, add shortcut, act.
+
+    Args:
+      inputs: input tensor.
+      training: forwarded to the stochastic depth layer when one is present.
+
+    Returns:
+      The activated sum of the main branch and the shortcut.
+    """
+    # NOTE(review): `training` is passed explicitly only to the stochastic
+    # depth layer; the norm layers are called without it -- confirm intended.
+    shortcut = inputs
+    if self._use_projection:
+      shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    # No activation after the second norm; it is applied after the addition.
+    x = self._conv2(x)
+    x = self._norm2(x)
+
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    return self._activation_fn(x + shortcut)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckBlock(tf.keras.layers.Layer):
+  """A standard bottleneck block.
+
+  A 1x1, a 3x3 and a 1x1 convolution, each followed by a normalization layer,
+  whose output is added to an identity (or 1x1 projection) shortcut and then
+  activated. The last convolution and the projection use `4 * filters`.
+  """
+
+  def __init__(self,
+               filters,
+               strides,
+               use_projection=False,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """A standard bottleneck block with BN after convolutions.
+
+    Args:
+      filters: `int` number of filters for the first two convolutions. Note that
+        the third and final convolution will use 4 times as many filters.
+      strides: `int` block stride. If greater than 1, this block will ultimately
+        downsample the input.
+      use_projection: `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      stochastic_depth_drop_rate: `float` or None. if not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+                          Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+                        Default to None.
+      activation: `str` name of the activation function.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(BottleneckBlock, self).__init__(**kwargs)
+
+    # Constructor arguments are kept verbatim so `get_config` can return them.
+    self._filters = filters
+    self._strides = strides
+    self._use_projection = use_projection
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    # `self._norm` holds the normalization layer *class*; it is instantiated
+    # once per normalization layer in `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the current Keras image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    """Creates the sub-layers: optional shortcut, three conv/norm pairs."""
+    if self._use_projection:
+      # 1x1 strided convolution producing the same `4 * filters` channels as
+      # the final convolution of the main branch.
+      self._shortcut = tf.keras.layers.Conv2D(
+          filters=self._filters * 4,
+          kernel_size=1,
+          strides=self._strides,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    # 1x1 convolution with stride 1.
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # 3x3 convolution carries the block stride.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Final 1x1 convolution expands to `4 * filters` channels.
+    self._conv3 = tf.keras.layers.Conv2D(
+        filters=self._filters * 4,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm3 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # A falsy drop rate (None or 0) disables stochastic depth entirely.
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+
+    super(BottleneckBlock, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the base layer config updated with the constructor arguments."""
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'use_projection': self._use_projection,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(BottleneckBlock, self).get_config()
+    # Entries in `config` take precedence over same-named base entries.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Applies the block: two conv-norm-act, conv-norm, add shortcut, act.
+
+    Args:
+      inputs: input tensor.
+      training: forwarded to the stochastic depth layer when one is present.
+
+    Returns:
+      The activated sum of the main branch and the shortcut.
+    """
+    # NOTE(review): `training` is passed explicitly only to the stochastic
+    # depth layer; the norm layers are called without it -- confirm intended.
+    shortcut = inputs
+    if self._use_projection:
+      shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+    x = self._activation_fn(x)
+
+    # No activation after the third norm; it is applied after the addition.
+    x = self._conv3(x)
+    x = self._norm3(x)
+
+    if self._stochastic_depth:
+      x = self._stochastic_depth(x, training=training)
+
+    return self._activation_fn(x + shortcut)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class InvertedBottleneckBlock(tf.keras.layers.Layer):
+  """An inverted bottleneck block.
+
+  An optional 1x1 expansion convolution, a depthwise convolution, an optional
+  squeeze-and-excitation layer and a final 1x1 convolution. The input is added
+  back only when `in_filters == out_filters` and `strides == 1`.
+  """
+
+  def __init__(self,
+               in_filters,
+               out_filters,
+               expand_ratio,
+               strides,
+               kernel_size=3,
+               se_ratio=None,
+               stochastic_depth_drop_rate=None,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """An inverted bottleneck block with BN after convolutions.
+
+    Args:
+      in_filters: `int` number of filters of the input tensor.
+      out_filters: `int` number of filters of the output tensor.
+      expand_ratio: `int` expand_ratio for an inverted bottleneck block. A value
+        of 1 skips the expansion convolution.
+      strides: `int` block stride. If greater than 1, this block will ultimately
+        downsample the input.
+      kernel_size: `int` kernel_size of the depthwise conv layer.
+      se_ratio: `float` or None. If not None, se ratio for the squeeze and
+        excitation layer. The layer is only created for values in (0, 1].
+      stochastic_depth_drop_rate: `float` or None. if not None, drop rate for
+        the stochastic depth layer.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      activation: `str` name of the activation function.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(InvertedBottleneckBlock, self).__init__(**kwargs)
+
+    # Constructor arguments are kept verbatim so `get_config` can return them.
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._expand_ratio = expand_ratio
+    self._strides = strides
+    self._kernel_size = kernel_size
+    self._se_ratio = se_ratio
+    self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    # `self._norm` holds the normalization layer *class*; it is instantiated
+    # once per normalization layer in `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    # Normalize over the channel axis of the current Keras image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    """Creates the expansion, depthwise, SE, projection and depth sub-layers."""
+    if self._expand_ratio != 1:
+      # First 1x1 conv for channel expansion.
+      self._conv0 = tf.keras.layers.Conv2D(
+          filters=self._in_filters * self._expand_ratio,
+          kernel_size=1,
+          strides=1,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    # Depthwise conv.
+    self._conv1 = tf.keras.layers.DepthwiseConv2D(
+        kernel_size=(self._kernel_size, self._kernel_size),
+        strides=self._strides,
+        padding='same',
+        use_bias=False,
+        depthwise_initializer=self._kernel_initializer,
+        depthwise_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Squeeze and excitation. Only created when 0 < se_ratio <= 1.
+    if self._se_ratio is not None and self._se_ratio > 0 and self._se_ratio <= 1:
+      self._squeeze_excitation = nn_layers.SqueezeExcitation(
+          in_filters=self._in_filters,
+          se_ratio=self._se_ratio,
+          expand_ratio=self._expand_ratio,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+    else:
+      self._squeeze_excitation = None
+
+    # Last 1x1 conv.
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._out_filters,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # A falsy drop rate (None or 0) disables stochastic depth entirely.
+    if self._stochastic_depth_drop_rate:
+      self._stochastic_depth = nn_layers.StochasticDepth(
+          self._stochastic_depth_drop_rate)
+    else:
+      self._stochastic_depth = None
+
+    super(InvertedBottleneckBlock, self).build(input_shape)
+
+  def get_config(self):
+    """Returns the base layer config updated with the constructor arguments."""
+    config = {
+        'in_filters': self._in_filters,
+        'out_filters': self._out_filters,
+        'expand_ratio': self._expand_ratio,
+        'strides': self._strides,
+        'kernel_size': self._kernel_size,
+        'se_ratio': self._se_ratio,
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(InvertedBottleneckBlock, self).get_config()
+    # Entries in `config` take precedence over same-named base entries.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    """Applies expansion, depthwise conv, optional SE and the final 1x1 conv.
+
+    Args:
+      inputs: input tensor.
+      training: forwarded to the stochastic depth layer when one is present.
+
+    Returns:
+      The block output; the input is added to it only when
+      `in_filters == out_filters` and `strides == 1`.
+    """
+    # NOTE(review): `training` is passed explicitly only to the stochastic
+    # depth layer; the norm layers are called without it -- confirm intended.
+    shortcut = inputs
+    if self._expand_ratio != 1:
+      x = self._conv0(inputs)
+      x = self._norm0(x)
+      x = self._activation_fn(x)
+    else:
+      x = inputs
+
+    x = self._conv1(x)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    if self._squeeze_excitation:
+      x = self._squeeze_excitation(x)
+
+    # No activation follows the final 1x1 convolution and its norm.
+    x = self._conv2(x)
+    x = self._norm2(x)
+
+    # Stochastic depth is applied only on the path that has a residual add.
+    if self._in_filters == self._out_filters and self._strides == 1:
+      if self._stochastic_depth:
+        x = self._stochastic_depth(x, training=training)
+      x = tf.add(x, shortcut)
+
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ResidualInner(tf.keras.layers.Layer):
+  """Single inner block of a residual.
+
+  This corresponds to `F`/`G` functions in the RevNet paper:
+  https://arxiv.org/pdf/1707.04585.pdf
+
+  The block is (optional norm + activation) -> 3x3 conv -> norm -> activation
+  -> 3x3 conv. It does not add a shortcut itself.
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      strides: int,
+      kernel_initializer: Union[
+          str, Callable[..., tf.keras.initializers.Initializer]]
+      = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      batch_norm_first: bool = True,
+      **kwargs):
+    """ResidualInner Initialization.
+
+    Args:
+      filters: `int` output filter size.
+      strides: `int` stride size for convolution for the residual block.
+      kernel_initializer: `str` or `tf.keras.initializers.Initializer` instance
+        for convolutional layers.
+      kernel_regularizer: `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: `str` or `callable` instance of the activation function.
+      use_sync_bn: `bool` if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      batch_norm_first: `bool` whether to apply activation and batch norm
+        before conv.
+      **kwargs: additional keyword arguments to be passed.
+    """
+    super(ResidualInner, self).__init__(**kwargs)
+
+    self.strides = strides
+    self.filters = filters
+    # The initializer and activation are resolved through the Keras `get`
+    # helpers here; these resolved values are what `get_config` returns.
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self._kernel_regularizer = kernel_regularizer
+    self._activation = tf.keras.activations.get(activation)
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._batch_norm_first = batch_norm_first
+
+    # `self._norm` holds the normalization layer *class*; it is instantiated
+    # once per normalization layer in `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+
+    # Normalize over the channel axis of the current Keras image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    # `call` uses this function (built from the raw `activation` argument),
+    # not `self._activation`.
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: tf.TensorShape):
+    """Creates the optional leading norm, two 3x3 convs and the middle norm."""
+    if self._batch_norm_first:
+      self._batch_norm_0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    # First 3x3 convolution carries the block stride.
+    self._conv2d_1 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=self.strides,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    self._batch_norm_1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    # Second 3x3 convolution always uses stride 1.
+    self._conv2d_2 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    super(ResidualInner, self).build(input_shape)
+
+  def get_config(self) -> Dict[str, Any]:
+    """Returns the base layer config updated with this layer's settings."""
+    config = {
+        'filters': self.filters,
+        'strides': self.strides,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'batch_norm_first': self._batch_norm_first,
+    }
+    base_config = super(ResidualInner, self).get_config()
+    # Entries in `config` take precedence over same-named base entries.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(
+      self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
+    """Runs the pre-activation conv stack on `inputs`.
+
+    Args:
+      inputs: input tensor.
+      training: forwarded to every normalization layer.
+
+    Returns:
+      The output of the second convolution; no norm or activation follows it.
+    """
+    x = inputs
+    if self._batch_norm_first:
+      x = self._batch_norm_0(x, training=training)
+      x = self._activation_fn(x)
+    x = self._conv2d_1(x)
+
+    x = self._batch_norm_1(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_2(x)
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckResidualInner(tf.keras.layers.Layer):
+  """Single inner block of a bottleneck residual.
+
+  This corresponds to `F`/`G` functions in the RevNet paper:
+  https://arxiv.org/pdf/1707.04585.pdf
+
+  The block is (optional norm + activation) -> 1x1 conv -> norm -> activation
+  -> 3x3 conv -> norm -> activation -> 1x1 conv. It does not add a shortcut
+  itself.
+  """
+
+  def __init__(
+      self,
+      filters: int,
+      strides: int,
+      kernel_initializer: Union[
+          str, Callable[..., tf.keras.initializers.Initializer]]
+      = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: Union[str, Callable[..., tf.Tensor]] = 'relu',
+      use_sync_bn: bool = False,
+      norm_momentum: float = 0.99,
+      norm_epsilon: float = 0.001,
+      batch_norm_first: bool = True,
+      **kwargs):
+    """BottleneckResidualInner Initialization.
+
+    Args:
+      filters: `int` number of filters for the first 2 convolutions. The last
+        convolution, and thus the number of output channels from the
+        bottleneck block, uses `4*filters`.
+      strides: `int` stride size for convolution for the residual block.
+      kernel_initializer: `str` or `tf.keras.initializers.Initializer` instance
+        for convolutional layers.
+      kernel_regularizer: `tf.keras.regularizers.Regularizer` for Conv2D.
+      activation: `str` or `callable` instance of the activation function.
+      use_sync_bn: `bool` if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      batch_norm_first: `bool` whether to apply activation and batch norm
+        before conv.
+      **kwargs: additional keyword arguments to be passed.
+    """
+    super(BottleneckResidualInner, self).__init__(**kwargs)
+
+    self.strides = strides
+    self.filters = filters
+    # The initializer and activation are resolved through the Keras `get`
+    # helpers here; these resolved values are what `get_config` returns.
+    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+    self._kernel_regularizer = kernel_regularizer
+    self._activation = tf.keras.activations.get(activation)
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._batch_norm_first = batch_norm_first
+
+    # `self._norm` holds the normalization layer *class*; it is instantiated
+    # once per normalization layer in `build`.
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+
+    # Normalize over the channel axis of the current Keras image data format.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    # `call` uses this function (built from the raw `activation` argument),
+    # not `self._activation`.
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape: tf.TensorShape):
+    """Creates the optional leading norm, three convs and two inner norms."""
+    if self._batch_norm_first:
+      self._batch_norm_0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+    # 1x1 convolution carrying the block stride.
+    self._conv2d_1 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=1,
+        strides=self.strides,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._batch_norm_1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    # 3x3 convolution with stride 1.
+    self._conv2d_2 = tf.keras.layers.Conv2D(
+        filters=self.filters,
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+    self._batch_norm_2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    # Final 1x1 convolution expands to `4 * filters` channels.
+    self._conv2d_3 = tf.keras.layers.Conv2D(
+        filters=self.filters * 4,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer)
+
+    super(BottleneckResidualInner, self).build(input_shape)
+
+  def get_config(self) -> Dict[str, Any]:
+    """Returns the base layer config updated with this layer's settings."""
+    config = {
+        'filters': self.filters,
+        'strides': self.strides,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon,
+        'batch_norm_first': self._batch_norm_first,
+    }
+    base_config = super(BottleneckResidualInner, self).get_config()
+    # Entries in `config` take precedence over same-named base entries.
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(
+      self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
+    """Runs the pre-activation bottleneck conv stack on `inputs`.
+
+    Args:
+      inputs: input tensor.
+      training: forwarded to every normalization layer.
+
+    Returns:
+      The output of the third convolution; no norm or activation follows it.
+    """
+    x = inputs
+    if self._batch_norm_first:
+      x = self._batch_norm_0(x, training=training)
+      x = self._activation_fn(x)
+    x = self._conv2d_1(x)
+
+    x = self._batch_norm_1(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_2(x)
+
+    x = self._batch_norm_2(x, training=training)
+    x = self._activation_fn(x)
+    x = self._conv2d_3(x)
+
+    return x
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ReversibleLayer(tf.keras.layers.Layer):
+  """A reversible layer.
+
+  Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary
+  layers that are stateless, which in this case are `ResidualInner` layers.
+
+  The input is split into two halves along the channel axis, and the two
+  outputs are concatenated back along that axis. When the layer is reversible
+  (both inner layers have stride 1 and the output channel count equals the
+  input channel count) and `manual_grads` is set, the backward pass recomputes
+  the inputs of f and g from the outputs instead of using the forward tape.
+  """
+
+  def __init__(self,
+               f: tf.keras.layers.Layer,
+               g: tf.keras.layers.Layer,
+               manual_grads: bool = True,
+               **kwargs):
+    """ReversibleLayer Initialization.
+
+    Args:
+      f: `tf.keras.layers.Layer` f inner block referred to in paper. Each
+        reversible layer consists of two inner functions. For example, in RevNet
+        the reversible residual consists of two f/g inner (bottleneck) residual
+        functions. Where the input to the reversible layer is x, the input gets
+        partitioned in the channel dimension and the forward pass follows (eq8):
+        x = [x1; x2], z1 = x1 + f(x2), y2 = x2 + g(z1), y1 = stop_gradient(z1).
+      g: `tf.keras.layers.Layer` g inner block referred to in paper. Detailed
+        explanation same as above as `f` arg.
+      manual_grads: `bool` [Testing Only] whether to manually take gradients
+        as in Algorithm 1 or defer to autograd.
+      **kwargs: additional keyword arguments to be passed.
+    """
+    super(ReversibleLayer, self).__init__(**kwargs)
+
+    self._f = f
+    self._g = g
+    self._manual_grads = manual_grads
+
+    # Channel axis used for splitting the input and concatenating the output.
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._axis = -1
+    else:
+      self._axis = 1
+
+  def get_config(self) -> Dict[str, Any]:
+    """Returns the base layer config extended with `f`, `g`, `manual_grads`.
+
+    Note that `f` and `g` are stored in the config as the layer objects
+    themselves.
+    """
+    config = {
+        'f': self._f,
+        'g': self._g,
+        'manual_grads': self._manual_grads,
+    }
+    base_config = super(ReversibleLayer, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def _ckpt_non_trainable_vars(self):
+    """Snapshots the current values of f's and g's non-trainable variables."""
+    self._f_non_trainable_vars = [
+        v.read_value() for v in self._f.non_trainable_variables]
+    self._g_non_trainable_vars = [
+        v.read_value() for v in self._g.non_trainable_variables]
+
+  def _load_ckpt_non_trainable_vars(self):
+    """Assigns the values saved by `_ckpt_non_trainable_vars` back to f and g."""
+    for v, v_chkpt in zip(
+        self._f.non_trainable_variables, self._f_non_trainable_vars):
+      v.assign(v_chkpt)
+    for v, v_chkpt in zip(
+        self._g.non_trainable_variables, self._g_non_trainable_vars):
+      v.assign(v_chkpt)
+
+  def call(
+      self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
+    """Runs the forward pass through a `tf.custom_gradient` wrapped function.
+
+    Args:
+      inputs: input tensor; it is split into two halves along `self._axis`.
+      training: forwarded to both inner layers `f` and `g`.
+
+    Returns:
+      The concatenation [y1; y2] along `self._axis`.
+    """
+
+    @tf.custom_gradient
+    def reversible(x: tf.Tensor) -> Tuple[
+        tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor], List[tf.Tensor]]]]:
+      """Implements Algorithm 1 in RevNet paper.
+
+      Paper: https://arxiv.org/pdf/1707.04585.pdf
+
+      Args:
+        x: input tensor.
+
+      Returns:
+        y: the output [y1; y2] in algorithm 1.
+        grad_fn: callable function that computes the gradients.
+      """
+      # The forward computation is recorded on `fwdtape` so that `grad_fn` can
+      # fall back to regular autograd when the manual path is not used.
+      with tf.GradientTape() as fwdtape:
+        fwdtape.watch(x)
+        x1, x2 = tf.split(x, num_or_size_splits=2, axis=self._axis)
+        f_x2 = self._f(x2, training=training)
+        # `_maybe_downsample` is a module-level helper not shown in this
+        # class; presumably it adapts the skip branch to the shape of the
+        # residual branch -- confirm against its definition.
+        x1_down = _maybe_downsample(
+            x1, f_x2.shape[self._axis], self._f.strides, self._axis)
+        z1 = f_x2 + x1_down
+        g_z1 = self._g(z1, training=training)
+        x2_down = _maybe_downsample(
+            x2, g_z1.shape[self._axis], self._f.strides, self._axis)
+        y2 = x2_down + g_z1
+
+        # Equation 8: https://arxiv.org/pdf/1707.04585.pdf
+        # Decouple y1 and z1 so that their derivatives are different.
+        y1 = tf.identity(z1)
+        y = tf.concat([y1, y2], axis=self._axis)
+
+        # The inputs can only be reconstructed from the outputs when neither
+        # inner layer is strided and the channel count is unchanged.
+        irreversible = (
+            (self._f.strides != 1 or self._g.strides != 1)
+            or (y.shape[self._axis] != inputs.shape[self._axis]))
+
+        # Checkpointing moving mean/variance for batch normalization layers
+        # as they shouldn't be updated during the custom gradient pass of f/g.
+        self._ckpt_non_trainable_vars()
+
+      def grad_fn(dy: tf.Tensor,
+                  variables: Optional[List[tf.Variable]] = None,
+                 ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
+        """Given dy calculate (dy/dx)|_{x_{input}} using f/g."""
+        # NOTE(review): both branches below use `variables` as a list; the
+        # `None` default is presumably never hit because f/g own variables --
+        # confirm.
+        if irreversible or not self._manual_grads:
+          # Autograd path: differentiate the recorded forward computation.
+          grads_combined = fwdtape.gradient(
+              y, [x] + variables, output_gradients=dy)
+          dx = grads_combined[0]
+          grad_vars = grads_combined[1:]
+        else:
+          # Manual path: rebuild the inputs of g and f from the outputs.
+          y1_nograd = tf.stop_gradient(y1)
+          y2_nograd = tf.stop_gradient(y2)
+          dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self._axis)
+
+          # Index mapping from self.f/g.trainable_variables to grad_fn
+          # input `variables` kwarg so that we can reorder dwf + dwg
+          # variable gradient list to match `variables` order.
+          f_var_refs = [v.ref() for v in self._f.trainable_variables]
+          g_var_refs = [v.ref() for v in self._g.trainable_variables]
+          fg_var_refs = f_var_refs + g_var_refs
+          self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables]
+
+          # Algorithm 1 in paper (line # documented in-line)
+          z1 = y1_nograd                                         # line 2
+          with tf.GradientTape() as gtape:
+            gtape.watch(z1)
+            g_z1 = self._g(z1, training=training)
+          x2 = y2_nograd - g_z1                                  # line 3
+
+          with tf.GradientTape() as ftape:
+            ftape.watch(x2)
+            f_x2 = self._f(x2, training=training)
+          x1 = z1 - f_x2  # pylint: disable=unused-variable      # line 4
+
+          # Compute gradients
+          g_grads_combined = gtape.gradient(
+              g_z1,
+              [z1] + self._g.trainable_variables,
+              output_gradients=dy2)
+          dz1 = dy1 + g_grads_combined[0]                        # line 5
+          dwg = g_grads_combined[1:]                             # line 9
+
+          f_grads_combined = ftape.gradient(
+              f_x2,
+              [x2] + self._f.trainable_variables,
+              output_gradients=dz1)
+          dx2 = dy2 + f_grads_combined[0]                        # line 6
+          dwf = f_grads_combined[1:]                             # line 8
+          dx1 = dz1                                              # line 7
+
+          # Pack the input and variable gradients.
+          dx = tf.concat([dx1, dx2], axis=self._axis)
+          grad_vars = dwf + dwg
+          # Reorder gradients (trainable_variables to variables kwarg order)
+          grad_vars = [grad_vars[i] for i in self_to_var_index]
+
+          # Restore batch normalization moving mean/variance for correctness.
+          self._load_ckpt_non_trainable_vars()
+
+        return dx, grad_vars  # grad_fn end
+
+      return y, grad_fn  # reversible end
+
+    activations = reversible(inputs)
+    return activations
--- a/official/vision/beta/modeling/layers/nn_blocks_3d.py
+++ b/official/vision/beta/modeling/layers/nn_blocks_3d.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains common building blocks for 3D networks."""
+# Import libraries
+import tensorflow as tf
+
+from official.modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SelfGating(tf.keras.layers.Layer):
+  """Feature gating as used in S3D-G (https://arxiv.org/pdf/1712.04851.pdf)."""
+
+  def __init__(self, filters, **kwargs):
+    """Constructor.
+
+    Args:
+      filters: `int` number of filters for the convolutional layer.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(SelfGating, self).__init__(**kwargs)
+    self._filters = filters
+
+  def build(self, input_shape):
+    self._spatial_temporal_average = tf.keras.layers.GlobalAveragePooling3D()
+
+    # No BN and activation after conv.
+    self._transformer_w = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[1, 1, 1],
+        use_bias=True,
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(
+            mean=0.0, stddev=0.01))
+
+    super(SelfGating, self).build(input_shape)
+
+  def call(self, inputs):
+    x = self._spatial_temporal_average(inputs)
+
+    x = tf.expand_dims(x, 1)
+    x = tf.expand_dims(x, 2)
+    x = tf.expand_dims(x, 3)
+
+    x = self._transformer_w(x)
+    x = tf.nn.sigmoid(x)
+
+    return tf.math.multiply(x, inputs)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckBlock3D(tf.keras.layers.Layer):
+  """A 3D bottleneck block."""
+
+  def __init__(self,
+               filters,
+               temporal_kernel_size,
+               temporal_strides,
+               spatial_strides,
+               use_self_gating=False,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """A 3D bottleneck block with BN after convolutions.
+
+    Args:
+      filters: `int` number of filters for the first two convolutions. Note that
+        the third and final convolution will use 4 times as many filters.
+      temporal_kernel_size: `int` kernel size for the temporal convolutional
+        layer.
+      temporal_strides: `int` temporal stride for the temporal convolutional
+        layer.
+      spatial_strides: `int` spatial stride for the spatial convolutional layer.
+      use_self_gating: `bool` apply self-gating module or not.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      activation: `str` name of the activation function.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization omentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(BottleneckBlock3D, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._temporal_kernel_size = temporal_kernel_size
+    self._spatial_strides = spatial_strides
+    self._temporal_strides = temporal_strides
+    self._use_self_gating = use_self_gating
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    self._shortcut_maxpool = tf.keras.layers.MaxPool3D(
+        pool_size=[1, 1, 1],
+        strides=[
+            self._temporal_strides, self._spatial_strides, self._spatial_strides
+        ])
+
+    self._shortcut_conv = tf.keras.layers.Conv3D(
+        filters=4 * self._filters,
+        kernel_size=1,
+        strides=[
+            self._temporal_strides, self._spatial_strides, self._spatial_strides
+        ],
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm0 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._temporal_conv = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[self._temporal_kernel_size, 1, 1],
+        strides=[self._temporal_strides, 1, 1],
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._spatial_conv = tf.keras.layers.Conv3D(
+        filters=self._filters,
+        kernel_size=[1, 3, 3],
+        strides=[1, self._spatial_strides, self._spatial_strides],
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._expand_conv = tf.keras.layers.Conv3D(
+        filters=4 * self._filters,
+        kernel_size=[1, 1, 1],
+        strides=[1, 1, 1],
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm3 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    if self._use_self_gating:
+      self._self_gating = SelfGating(filters=4 * self._filters)
+    else:
+      self._self_gating = None
+
+    super(BottleneckBlock3D, self).build(input_shape)
+
+  def get_config(self):
+    config = {
+        'filters': self._filters,
+        'temporal_kernel_size': self._temporal_kernel_size,
+        'temporal_strides': self._temporal_strides,
+        'spatial_strides': self._spatial_strides,
+        'use_projection': self._use_projection,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+    base_config = super(BottleneckBlock3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    in_filters = inputs.shape.as_list()[-1]
+    if in_filters == 4 * self._filters:
+      if self._temporal_strides == 1 and self._spatial_strides == 1:
+        shortcut = inputs
+      else:
+        shortcut = self._shortcut_maxpool(inputs)
+    else:
+      shortcut = self._shortcut_conv(inputs)
+      shortcut = self._norm0(shortcut)
+
+    x = self._temporal_conv(inputs)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    x = self._spatial_conv(x)
+    x = self._norm2(x)
+    x = self._activation_fn(x)
+
+    x = self._expand_conv(x)
+    x = self._norm3(x)
+    # Apply activation before additional modules.
+    x = self._activation_fn(x + shortcut)
+
+    if self._self_gating:
+      x = self._self_gating(x)
+
+    return x
--- a/official/vision/beta/modeling/layers/nn_blocks_3d_test.py
+++ b/official/vision/beta/modeling/layers/nn_blocks_3d_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for nn_blocks_3d."""
+
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import nn_blocks_3d
+
+
+class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (nn_blocks_3d.BottleneckBlock3D, 1, 1, 2, True),
+      (nn_blocks_3d.BottleneckBlock3D, 3, 2, 1, False),
+  )
+  def test_bottleneck_block_creation(self, block_fn, temporal_kernel_size,
+                                     temporal_strides, spatial_strides,
+                                     use_self_gating):
+    temporal_size = 16
+    spatial_size = 128
+    filters = 256
+    inputs = tf.keras.Input(
+        shape=(temporal_size, spatial_size, spatial_size, filters * 4),
+        batch_size=1)
+    block = block_fn(
+        filters=filters,
+        temporal_kernel_size=temporal_kernel_size,
+        temporal_strides=temporal_strides,
+        spatial_strides=spatial_strides,
+        use_self_gating=use_self_gating)
+
+    features = block(inputs)
+
+    self.assertAllEqual([
+        1, temporal_size // temporal_strides, spatial_size // spatial_strides,
+        spatial_size // spatial_strides, filters * 4
+    ], features.shape.as_list())
+
+
+# Run the tests of this module through the TensorFlow test runner when the
+# file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/modeling/layers/nn_blocks_test.py
+++ b/official/vision/beta/modeling/layers/nn_blocks_test.py
+# Lint as: python3
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for nn_blocks."""
+
+from typing import Any, Iterable, Tuple
+# Import libraries
+from absl.testing import parameterized
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.vision.beta.modeling.layers import nn_blocks
+
+
+def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
+  """Returns the combinations of end-to-end tests to run."""
+  return combinations.combine(
+      distribution=[
+          strategy_combinations.default_strategy,
+          strategy_combinations.tpu_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+      ],
+  )
+
+
+class NNBlocksTest(parameterized.TestCase, tf.test.TestCase):
+
+  @parameterized.parameters(
+      (nn_blocks.ResidualBlock, 1, False, 0.0),
+      (nn_blocks.ResidualBlock, 2, True, 0.2),
+  )
+  def test_residual_block_creation(
+      self, block_fn, strides, use_projection, stochastic_depth_drop_rate):
+    input_size = 128
+    filter_size = 256
+    inputs = tf.keras.Input(
+        shape=(input_size, input_size, filter_size), batch_size=1)
+    block = block_fn(
+        filter_size,
+        strides,
+        use_projection=use_projection,
+        stochastic_depth_drop_rate=stochastic_depth_drop_rate,
+    )
+
+    features = block(inputs)
+
+    self.assertAllEqual(
+        [1, input_size // strides, input_size // strides, filter_size],
+        features.shape.as_list())
+
+  @parameterized.parameters(
+      (nn_blocks.BottleneckBlock, 1, False, 0.0),
+      (nn_blocks.BottleneckBlock, 2, True, 0.2),
+  )
+  def test_bottleneck_block_creation(
+      self, block_fn, strides, use_projection, stochastic_depth_drop_rate):
+    input_size = 128
+    filter_size = 256
+    inputs = tf.keras.Input(
+        shape=(input_size, input_size, filter_size * 4), batch_size=1)
+    block = block_fn(
+        filter_size,
+        strides,
+        use_projection=use_projection,
+        stochastic_depth_drop_rate=stochastic_depth_drop_rate
+    )
+
+    features = block(inputs)
+
+    self.assertAllEqual(
+        [1, input_size // strides, input_size // strides, filter_size * 4],
+        features.shape.as_list())
+
+  @parameterized.parameters(
+      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, None),
+      (nn_blocks.InvertedBottleneckBlock, 6, 1, None, None),
+      (nn_blocks.InvertedBottleneckBlock, 1, 2, None, None),
+      (nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None),
+      (nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2),
+  )
+  def test_invertedbottleneck_block_creation(
+      self, block_fn, expand_ratio, strides, se_ratio,
+      stochastic_depth_drop_rate):
+    input_size = 128
+    in_filters = 24
+    out_filters = 40
+    inputs = tf.keras.Input(
+        shape=(input_size, input_size, in_filters), batch_size=1)
+    block = block_fn(
+        in_filters=in_filters,
+        out_filters=out_filters,
+        expand_ratio=expand_ratio,
+        strides=strides,
+        se_ratio=se_ratio,
+        stochastic_depth_drop_rate=stochastic_depth_drop_rate)
+
+    features = block(inputs)
+
+    self.assertAllEqual(
+        [1, input_size // strides, input_size // strides, out_filters],
+        features.shape.as_list())
+
+
+class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_shape(self, distribution):
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = 64
+    strides = 2
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
+    with distribution.scope():
+      test_layer = nn_blocks.ResidualInner(filters, strides)
+
+    output = test_layer(input_tensor)
+    expected_output_shape = [bsz, h // strides, w // strides, filters]
+    self.assertEqual(expected_output_shape, output.shape.as_list())
+
+
+class BottleneckResidualInnerTest(parameterized.TestCase, tf.test.TestCase):
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_shape(self, distribution):
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = 64
+    strides = 2
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
+    with distribution.scope():
+      test_layer = nn_blocks.BottleneckResidualInner(filters, strides)
+
+    output = test_layer(input_tensor)
+    expected_output_shape = [bsz, h // strides, w // strides, filters * 4]
+    self.assertEqual(expected_output_shape, output.shape.as_list())
+
+
+class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests `nn_blocks.ReversibleLayer` under each distribution strategy."""
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_downsampling_non_reversible_step(self, distribution):
+    """Runs a training step with a strided `f` and checks the output shape."""
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = 64
+    strides = 2
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
+    with distribution.scope():
+      # Each inner layer produces half of the output channels.
+      f = nn_blocks.ResidualInner(
+          filters=filters // 2,
+          strides=strides,
+          batch_norm_first=True)
+      g = nn_blocks.ResidualInner(
+          filters=filters // 2,
+          strides=1,
+          batch_norm_first=True)
+      test_layer = nn_blocks.ReversibleLayer(f, g)
+      test_layer.build(input_tensor.shape)
+      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
+
+    @tf.function
+    def step_fn():
+      # One forward pass, one gradient computation and one optimizer update.
+      with tf.GradientTape() as tape:
+        output = test_layer(input_tensor, training=True)
+      grads = tape.gradient(output, test_layer.trainable_variables)
+      # Test applying gradients with optimizer works
+      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
+
+      return output
+
+    replica_output = distribution.run(step_fn)
+    outputs = distribution.experimental_local_results(replica_output)
+
+    # Assert forward pass shape
+    expected_output_shape = [bsz, h // strides, w // strides, filters]
+    for output in outputs:
+      self.assertEqual(expected_output_shape, output.shape.as_list())
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_reversible_step(self, distribution):
+    """Runs a training step with stride 1 and checks that variables change."""
+    # Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = c
+    strides = 1
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c])
+    with distribution.scope():
+      f = nn_blocks.ResidualInner(
+          filters=filters // 2,
+          strides=strides,
+          batch_norm_first=False)
+      g = nn_blocks.ResidualInner(
+          filters=filters // 2,
+          strides=1,
+          batch_norm_first=False)
+      test_layer = nn_blocks.ReversibleLayer(f, g)
+      test_layer(input_tensor, training=False)  # init weights
+      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
+
+    @tf.function
+    def step_fn():
+      # One forward pass, one gradient computation and one optimizer update.
+      with tf.GradientTape() as tape:
+        output = test_layer(input_tensor, training=True)
+      grads = tape.gradient(output, test_layer.trainable_variables)
+      # Test applying gradients with optimizer works
+      optimizer.apply_gradients(zip(grads, test_layer.trainable_variables))
+
+      return output
+
+    @tf.function
+    def fwd():
+      # Forward pass only; the output is discarded.
+      test_layer(input_tensor)
+
+    distribution.run(fwd)  # Initialize variables
+    # Copy the variable values before the training step for later comparison.
+    prev_variables = tf.identity_n(test_layer.trainable_variables)
+    replica_output = distribution.run(step_fn)
+    outputs = distribution.experimental_local_results(replica_output)
+
+    # Assert variables values have changed values
+    for v0, v1 in zip(prev_variables, test_layer.trainable_variables):
+      self.assertNotAllEqual(v0, v1)
+
+    # Assert forward pass shape
+    expected_output_shape = [bsz, h // strides, w // strides, filters]
+    for output in outputs:
+      self.assertEqual(expected_output_shape, output.shape.as_list())
+
+  @combinations.generate(distribution_strategy_combinations())
+  def test_manual_gradients_correctness(self, distribution):
+    """Compares manually computed gradients against autograd gradients."""
+    bsz, h, w, c = 8, 32, 32, 32
+    filters = c
+    strides = 1
+
+    input_tensor = tf.random.uniform(shape=[bsz, h, w, c * 4])  # bottleneck
+    with distribution.scope():
+      # Layer that uses the manual gradient path (the default).
+      f_manual = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2,
+          strides=strides,
+          batch_norm_first=False)
+      g_manual = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2,
+          strides=1,
+          batch_norm_first=False)
+      manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual)
+      manual_grad_layer(input_tensor, training=False)  # init weights
+
+      # Identically configured layer that defers to autograd.
+      f_auto = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2,
+          strides=strides,
+          batch_norm_first=False)
+      g_auto = nn_blocks.BottleneckResidualInner(
+          filters=filters // 2,
+          strides=1,
+          batch_norm_first=False)
+      auto_grad_layer = nn_blocks.ReversibleLayer(
+          f_auto, g_auto, manual_grads=False)
+      auto_grad_layer(input_tensor)  # init weights
+      # Clone all weights (tf.keras.layers.Layer has no .clone())
+      auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights())
+      auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights())
+
+    @tf.function
+    def manual_fn():
+      # Gradients of the output w.r.t. the manual layer's trainable variables.
+      with tf.GradientTape() as tape:
+        output = manual_grad_layer(input_tensor, training=True)
+      grads = tape.gradient(output, manual_grad_layer.trainable_variables)
+      return grads
+
+    @tf.function
+    def auto_fn():
+      # Gradients of the output w.r.t. the autograd layer's trainable variables.
+      with tf.GradientTape() as tape:
+        output = auto_grad_layer(input_tensor, training=True)
+      grads = tape.gradient(output, auto_grad_layer.trainable_variables)
+      return grads
+
+    manual_grads = distribution.run(manual_fn)
+    auto_grads = distribution.run(auto_fn)
+
+    # Assert gradients calculated manually are close to that from autograd
+    for manual_grad, auto_grad in zip(manual_grads, auto_grads):
+      self.assertAllClose(
+          distribution.experimental_local_results(manual_grad),
+          distribution.experimental_local_results(auto_grad),
+          atol=5e-3, rtol=5e-3)
+
+    # Verify that BN moving mean and variance is correct.
+    for manual_var, auto_var in zip(
+        manual_grad_layer.non_trainable_variables,
+        auto_grad_layer.non_trainable_variables):
+      self.assertAllClose(manual_var, auto_var)
+
+
+# Run the tests of this module through the TensorFlow test runner when the
+# file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/modeling/layers/nn_layers.py
+++ b/official/vision/beta/modeling/layers/nn_layers.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains common building blocks for neural networks."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SqueezeExcitation(tf.keras.layers.Layer):
+  """Squeeze and excitation layer."""
+
+  def __init__(self,
+               in_filters,
+               se_ratio,
+               expand_ratio,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               **kwargs):
+    """Implementation for squeeze and excitation.
+
+    Args:
+      in_filters: `int` number of filters of the input tensor.
+      se_ratio: `float` or None. If not None, se ratio for the squeeze and
+        excitation layer.
+      expand_ratio: `int` expand_ratio for a MBConv block.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      activation: `str` name of the activation function.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(SqueezeExcitation, self).__init__(**kwargs)
+
+    self._in_filters = in_filters
+    self._se_ratio = se_ratio
+    self._expand_ratio = expand_ratio
+    self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._spatial_axis = [1, 2]
+    else:
+      self._spatial_axis = [2, 3]
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    num_reduced_filters = max(1, int(self._in_filters * self._se_ratio))
+
+    self._se_reduce = tf.keras.layers.Conv2D(
+        filters=num_reduced_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=True,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+
+    self._se_expand = tf.keras.layers.Conv2D(
+        filters=self._in_filters * self._expand_ratio,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=True,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+
+    super(SqueezeExcitation, self).build(input_shape)
+
+  def get_config(self):
+    config = {
+        'in_filters': self._in_filters,
+        'se_ratio': self._se_ratio,
+        'expand_ratio': self._expand_ratio,
+        'strides': self._strides,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+    }
+    base_config = super(SqueezeExcitation, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs):
+    x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
+    x = self._se_expand(self._activation_fn(self._se_reduce(x)))
+
+    return tf.sigmoid(x) * inputs
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class StochasticDepth(tf.keras.layers.Layer):
+  """Stochastic depth layer."""
+
+  def __init__(self, stochastic_depth_drop_rate, **kwargs):
+    """Initialize stochastic depth.
+
+    Args:
+      stochastic_depth_drop_rate: `float` drop rate.
+      **kwargs: keyword arguments to be passed.
+
+    Returns:
+      A output tensor, which should have the same shape as input.
+    """
+    super(StochasticDepth, self).__init__(**kwargs)
+    self._drop_rate = stochastic_depth_drop_rate
+
+  def get_config(self):
+    config = {'drop_rate': self._drop_rate}
+    base_config = super(StochasticDepth, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, training=None):
+    if training is None:
+      is_training = tf.keras.backend.learning_phase()
+    if not is_training or self._drop_rate is None or self._drop_rate == 0:
+      return inputs
+
+    keep_prob = 1.0 - self._drop_rate
+    batch_size = tf.shape(inputs)[0]
+    random_tensor = keep_prob
+    random_tensor += tf.random.uniform(
+        [batch_size, 1, 1, 1], dtype=inputs.dtype)
+    binary_tensor = tf.floor(random_tensor)
+    output = tf.math.divide(inputs, keep_prob) * binary_tensor
+    return output
--- a/official/vision/beta/modeling/layers/roi_aligner.py
+++ b/official/vision/beta/modeling/layers/roi_aligner.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ROI align."""
+
+import tensorflow as tf
+
+from official.vision.beta.ops import spatial_transform_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MultilevelROIAligner(tf.keras.layers.Layer):
+  """Performs ROIAlign for the second stage processing."""
+
+  def __init__(self,
+               crop_size=7,
+               sample_offset=0.5,
+               **kwargs):
+    """Initializes a ROI aligner.
+
+    Args:
+      crop_size: int, the output size of the cropped features.
+      sample_offset: float in [0, 1], the subpixel sample offset.
+      **kwargs: other key word arguments passed to Layer.
+    """
+    self._config_dict = {
+        'crop_size': crop_size,
+        'sample_offset': sample_offset,
+    }
+    super(MultilevelROIAligner, self).__init__(**kwargs)
+
+  def call(self, features, boxes, training=None):
+    """Generates ROIs.
+
+    Args:
+      features: A dictionary with key as pyramid level and value as features.
+        The features are in shape of
+        [batch_size, height_l, width_l, num_filters].
+      boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row
+        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
+        from grid point.
+      training: bool, whether it is in training mode.
+
+    Returns:
+      roi_features: A 5-D tensor representing feature crop of shape
+      [batch_size, num_boxes, crop_size, crop_size, num_filters].
+    """
+    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        features,
+        boxes,
+        output_size=self._config_dict['crop_size'],
+        sample_offset=self._config_dict['sample_offset'])
+    return roi_features
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    return cls(**config)
--- a/official/vision/beta/modeling/layers/roi_aligner_test.py
+++ b/official/vision/beta/modeling/layers/roi_aligner_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for roi_aligner.py."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import roi_aligner
+
+
+class MultilevelROIAlignerTest(tf.test.TestCase):
+
+  def test_serialize_deserialize(self):
+    kwargs = dict(
+        crop_size=7,
+        sample_offset=0.5,
+    )
+    aligner = roi_aligner.MultilevelROIAligner(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(aligner.get_config(), expected_config)
+
+    new_aligner = roi_aligner.MultilevelROIAligner.from_config(
+        aligner.get_config())
+
+    self.assertAllEqual(aligner.get_config(), new_aligner.get_config())
+
+
+if __name__ == '__main__':
+  # Hand control to the TensorFlow test runner when executed as a script.
+  tf.test.main()
--- a/official/vision/beta/modeling/layers/roi_generator.py
+++ b/official/vision/beta/modeling/layers/roi_generator.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ROI generator."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.beta.ops import box_ops
+from official.vision.beta.ops import nms
+
+
+def _multilevel_propose_rois(raw_boxes,
+                             raw_scores,
+                             anchor_boxes,
+                             image_shape,
+                             pre_nms_top_k=2000,
+                             pre_nms_score_threshold=0.0,
+                             pre_nms_min_size_threshold=0.0,
+                             nms_iou_threshold=0.7,
+                             num_proposals=1000,
+                             use_batched_nms=False,
+                             decode_boxes=True,
+                             clip_boxes=True,
+                             apply_sigmoid_to_score=True):
+  """Proposes RoIs given a group of candidates from different FPN levels.
+
+  The following describes the steps:
+    1. For each individual level:
+      a. Apply sigmoid transform if specified.
+      b. Decode boxes if specified.
+      c. Clip boxes if specified.
+      d. Filter small boxes and those that fall outside the image if
+         specified.
+      e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
+      f. Apply NMS.
+    2. Aggregate post-NMS boxes from each level.
+    3. Apply an overall top k to generate the final selected RoIs.
+
+  The per-level feature shapes must be statically known, because the number of
+  boxes per level is computed in Python from `get_shape()`.
+
+  Args:
+    raw_boxes: a dict with keys representing FPN levels and values representing
+      box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
+    raw_scores: a dict with keys representing FPN levels and values representing
+      logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
+    anchor_boxes: a dict with keys representing FPN levels and values
+      representing anchor box tensors of shape
+      [batch_size, feature_h * feature_w * num_anchors, 4].
+    image_shape: a tensor of shape [batch_size, 2] where the last dimension are
+      [height, width] of the scaled image.
+    pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
+      keep before applying NMS. Default: 2000.
+    pre_nms_score_threshold: a float between 0 and 1 representing the minimal
+      box score to keep before applying NMS. This is often used as a
+      pre-filtering step for better performance. It is only applied when NMS
+      is applied, i.e. when `nms_iou_threshold` > 0. Default: 0, no filtering
+      is applied.
+    pre_nms_min_size_threshold: a float representing the minimal box size in
+      each side (w.r.t. the scaled image) to keep before applying NMS. This is
+      often used as a pre-filtering step for better performance. Default: 0, no
+      filtering is applied.
+    nms_iou_threshold: a float between 0 and 1 representing the IoU threshold
+      used for NMS. If 0.0, no NMS is applied. Default: 0.7.
+    num_proposals: an integer of top scoring RPN proposals *in total* to
+      keep after applying NMS. Default: 1000.
+    use_batched_nms: a boolean indicating whether NMS is applied in batch using
+      `tf.image.combined_non_max_suppression`. Currently only available in
+      CPU/GPU. Default: False.
+    decode_boxes: a boolean indicating whether `raw_boxes` needs to be decoded
+      using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
+      `anchor_boxes`. Default: True.
+    clip_boxes: a boolean indicating whether boxes are first clipped to the
+      scaled image size before applying NMS. If False, no clipping is applied.
+      Default: True.
+    apply_sigmoid_to_score: a boolean indicating whether apply sigmoid to
+      `raw_scores` before applying NMS. Default: True.
+
+  Returns:
+    selected_rois: a tensor of shape [batch_size, num_proposals, 4],
+      representing the box coordinates of the selected proposals w.r.t. the
+      scaled image. Fewer than `num_proposals` boxes are returned when the
+      levels produce fewer candidates in total.
+    selected_roi_scores: a tensor of shape [batch_size, num_proposals],
+      representing the scores of the selected proposals.
+  """
+  with tf.name_scope('multilevel_propose_rois'):
+    rois = []
+    roi_scores = []
+    # Insert a box axis: [batch_size, 2] -> [batch_size, 1, 2].
+    image_shape = tf.expand_dims(image_shape, axis=1)
+    for level in sorted(raw_scores.keys()):
+      with tf.name_scope('level_%d' % level):
+        _, feature_h, feature_w, num_anchors_per_location = (
+            raw_scores[level].get_shape().as_list())
+
+        # Flatten the spatial and anchor axes into a single box axis.
+        num_boxes = feature_h * feature_w * num_anchors_per_location
+        this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
+        this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
+        this_level_anchors = tf.cast(
+            tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
+            dtype=this_level_scores.dtype)
+
+        if apply_sigmoid_to_score:
+          this_level_scores = tf.sigmoid(this_level_scores)
+
+        if decode_boxes:
+          this_level_boxes = box_ops.decode_boxes(
+              this_level_boxes, this_level_anchors)
+        if clip_boxes:
+          this_level_boxes = box_ops.clip_boxes(
+              this_level_boxes, image_shape)
+
+        if pre_nms_min_size_threshold > 0.0:
+          this_level_boxes, this_level_scores = box_ops.filter_boxes(
+              this_level_boxes,
+              this_level_scores,
+              image_shape,
+              pre_nms_min_size_threshold)
+
+        # A level cannot contribute more boxes than it contains.
+        this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
+        this_level_post_nms_top_k = min(num_boxes, num_proposals)
+        if nms_iou_threshold > 0.0:
+          if use_batched_nms:
+            # Boxes and scores get a singleton axis (axis 2 and the last axis
+            # respectively) before being passed to the combined NMS op.
+            this_level_rois, this_level_roi_scores, _, _ = (
+                tf.image.combined_non_max_suppression(
+                    tf.expand_dims(this_level_boxes, axis=2),
+                    tf.expand_dims(this_level_scores, axis=-1),
+                    max_output_size_per_class=this_level_pre_nms_top_k,
+                    max_total_size=this_level_post_nms_top_k,
+                    iou_threshold=nms_iou_threshold,
+                    score_threshold=pre_nms_score_threshold,
+                    pad_per_class=False,
+                    clip_boxes=False))
+          else:
+            if pre_nms_score_threshold > 0.0:
+              this_level_boxes, this_level_scores = (
+                  box_ops.filter_boxes_by_scores(
+                      this_level_boxes,
+                      this_level_scores,
+                      pre_nms_score_threshold))
+            this_level_boxes, this_level_scores = box_ops.top_k_boxes(
+                this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
+            # Note the (scores, boxes) order of both arguments and results.
+            this_level_roi_scores, this_level_rois = (
+                nms.sorted_non_max_suppression_padded(
+                    this_level_scores,
+                    this_level_boxes,
+                    max_output_size=this_level_post_nms_top_k,
+                    iou_threshold=nms_iou_threshold))
+        else:
+          # NMS disabled: only keep the top scoring boxes of this level.
+          this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
+              this_level_boxes,
+              this_level_scores,
+              k=this_level_post_nms_top_k)
+
+        rois.append(this_level_rois)
+        roi_scores.append(this_level_roi_scores)
+
+    # Merge the per-level results along the box axis.
+    all_rois = tf.concat(rois, axis=1)
+    all_roi_scores = tf.concat(roi_scores, axis=1)
+
+    with tf.name_scope('top_k_rois'):
+      _, num_valid_rois = all_roi_scores.get_shape().as_list()
+      overall_top_k = min(num_valid_rois, num_proposals)
+
+      selected_rois, selected_roi_scores = box_ops.top_k_boxes(
+          all_rois, all_roi_scores, k=overall_top_k)
+
+    return selected_rois, selected_roi_scores
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MultilevelROIGenerator(tf.keras.layers.Layer):
+  """Proposes RoIs for the second stage processing."""
+
+  def __init__(self,
+               pre_nms_top_k=2000,
+               pre_nms_score_threshold=0.0,
+               pre_nms_min_size_threshold=0.0,
+               nms_iou_threshold=0.7,
+               num_proposals=1000,
+               test_pre_nms_top_k=1000,
+               test_pre_nms_score_threshold=0.0,
+               test_pre_nms_min_size_threshold=0.0,
+               test_nms_iou_threshold=0.7,
+               test_num_proposals=1000,
+               use_batched_nms=False,
+               **kwargs):
+    """Initializes a ROI generator.
+
+    The ROI generator transforms the raw predictions from RPN to ROIs.
+
+    Args:
+      pre_nms_top_k: int, the number of top scores proposals to be kept before
+        applying NMS.
+      pre_nms_score_threshold: float, the score threshold to apply before
+        applying NMS. Proposals whose scores are below this threshold are
+        thrown away.
+      pre_nms_min_size_threshold: float, the threshold of each side of the box
+        (w.r.t. the scaled image). Proposals whose sides are below this
+        threshold are thrown away.
+      nms_iou_threshold: float in [0, 1], the NMS IoU threshold.
+      num_proposals: int, the final number of proposals to generate.
+      test_pre_nms_top_k: int, the number of top scores proposals to be kept
+        before applying NMS in testing.
+      test_pre_nms_score_threshold: float, the score threshold to apply before
+        applying NMS in testing. Proposals whose scores are below this threshold
+        are thrown away.
+      test_pre_nms_min_size_threshold: float, the threshold of each side of the
+        box (w.r.t. the scaled image) in testing. Proposals whose sides are
+        below this threshold are thrown away.
+      test_nms_iou_threshold: float in [0, 1], the NMS IoU threshold in testing.
+      test_num_proposals: int, the final number of proposals to generate in
+        testing.
+      use_batched_nms: bool, whether or not use
+        `tf.image.combined_non_max_suppression`.
+      **kwargs: other key word arguments passed to Layer.
+    """
+    self._config_dict = {
+        'pre_nms_top_k': pre_nms_top_k,
+        'pre_nms_score_threshold': pre_nms_score_threshold,
+        'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
+        'nms_iou_threshold': nms_iou_threshold,
+        'num_proposals': num_proposals,
+        'test_pre_nms_top_k': test_pre_nms_top_k,
+        'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
+        'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
+        'test_nms_iou_threshold': test_nms_iou_threshold,
+        'test_num_proposals': test_num_proposals,
+        'use_batched_nms': use_batched_nms,
+    }
+    super(MultilevelROIGenerator, self).__init__(**kwargs)
+
+  def call(self,
+           raw_boxes,
+           raw_scores,
+           anchor_boxes,
+           image_shape,
+           training=None):
+    """Proposes RoIs given a group of candidates from different FPN levels.
+
+    The following describes the steps:
+      1. For each individual level:
+        a. Apply sigmoid transform if specified.
+        b. Decode boxes if specified.
+        c. Clip boxes if specified.
+        d. Filter small boxes and those fall outside image if specified.
+        e. Apply pre-NMS filtering including pre-NMS top k and score
+           thresholding.
+        f. Apply NMS.
+      2. Aggregate post-NMS boxes from each level.
+      3. Apply an overall top k to generate the final selected RoIs.
+
+    Args:
+      raw_boxes: a dict with keys representing FPN levels and values
+        representing box tenors of shape
+        [batch, feature_h, feature_w, num_anchors * 4].
+      raw_scores: a dict with keys representing FPN levels and values
+        representing logit tensors of shape
+        [batch, feature_h, feature_w, num_anchors].
+      anchor_boxes: a dict with keys representing FPN levels and values
+        representing anchor box tensors of shape
+        [batch, feature_h * feature_w * num_anchors, 4].
+      image_shape: a tensor of shape [batch, 2] where the last dimension are
+        [height, width] of the scaled image.
+      training: a bool indicat whether it is in training mode.
+
+    Returns:
+     roi_boxes: [batch, num_proposals, 4], the proposed ROIs in the scaled
+        image coordinate.
+      roi_scores: [batch, num_proposals], scores of the proposed ROIs.
+    """
+    roi_boxes, roi_scores = _multilevel_propose_rois(
+        raw_boxes,
+        raw_scores,
+        anchor_boxes,
+        image_shape,
+        pre_nms_top_k=(
+            self._config_dict['pre_nms_top_k'] if training
+            else self._config_dict['test_pre_nms_top_k']),
+        pre_nms_score_threshold=(
+            self._config_dict['pre_nms_score_threshold'] if training
+            else self._config_dict['test_pre_nms_score_threshold']),
+        pre_nms_min_size_threshold=(
+            self._config_dict['pre_nms_min_size_threshold'] if training
+            else self._config_dict['test_pre_nms_min_size_threshold']),
+        nms_iou_threshold=(
+            self._config_dict['nms_iou_threshold'] if training
+            else self._config_dict['test_nms_iou_threshold']),
+        num_proposals=(
+            self._config_dict['num_proposals'] if training
+            else self._config_dict['test_num_proposals']),
+        use_batched_nms=self._config_dict['use_batched_nms'],
+        decode_boxes=True,
+        clip_boxes=True,
+        apply_sigmoid_to_score=True)
+    return roi_boxes, roi_scores
+
+  def get_config(self):
+    return self._config_dict
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    return cls(**config)
--- a/official/vision/beta/modeling/layers/roi_generator_test.py
+++ b/official/vision/beta/modeling/layers/roi_generator_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for roi_generator.py."""
+
+# Import libraries
+import numpy as np
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import roi_generator
+
+
+class MultilevelProposeRoisTest(tf.test.TestCase):
+
+  def test_multilevel_propose_rois_single_level(self):
+    rpn_boxes_np = np.array(
+        [[[[0, 0, 10, 10], [0.01, 0.01, 9.9, 9.9]],
+          [[5, 5, 10, 10], [2, 2, 8, 8]]],
+         [[[2, 2, 4, 4], [3, 3, 6, 6]],
+          [[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
+    rpn_boxes = {
+        2: tf.constant(rpn_boxes_np, dtype=tf.float32)
+    }
+    rpn_scores_np = np.array(
+        [[[[0.6], [0.9]], [[0.2], [0.3]]], [[[0.1], [0.8]], [[0.3], [0.5]]]])
+    rpn_scores = {
+        2: tf.constant(rpn_scores_np, dtype=tf.float32)
+    }
+    anchor_boxes_np = np.array(
+        [[[[0, 0, 10, 10], [0.01, 0.01, 9.9, 9.9]],
+          [[5, 5, 10, 10], [2, 2, 8, 8]]],
+         [[[2, 2, 4, 4], [3, 3, 6, 6]],
+          [[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
+    anchor_boxes = {
+        2: tf.constant(anchor_boxes_np, dtype=tf.float32)
+    }
+    image_shape = tf.constant([[20, 20], [20, 20]], dtype=tf.int32)
+
+    selected_rois_np = np.array(
+        [[[0.01, 0.01, 9.9, 9.9], [2, 2, 8, 8], [5, 5, 10, 10], [0, 0, 0, 0]],
+         [[3, 3, 6, 6], [1, 1, 8, 8], [2, 2, 4, 4], [0, 0, 0, 0]]])
+    selected_roi_scores_np = np.array(
+        [[0.9, 0.3, 0.2, 0], [0.8, 0.5, 0.1, 0]])
+
+    # Runs on TPU.
+    strategy = tf.distribute.experimental.TPUStrategy()
+    with strategy.scope():
+      selected_rois_tpu, selected_roi_scores_tpu = (
+          roi_generator._multilevel_propose_rois(
+              rpn_boxes,
+              rpn_scores,
+              anchor_boxes=anchor_boxes,
+              image_shape=image_shape,
+              pre_nms_top_k=4,
+              pre_nms_score_threshold=0.0,
+              pre_nms_min_size_threshold=0.0,
+              nms_iou_threshold=0.5,
+              num_proposals=4,
+              use_batched_nms=False,
+              decode_boxes=False,
+              clip_boxes=False,
+              apply_sigmoid_to_score=False))
+
+    # Runs on CPU.
+    selected_rois_cpu, selected_roi_scores_cpu = (
+        roi_generator._multilevel_propose_rois(
+            rpn_boxes,
+            rpn_scores,
+            anchor_boxes=anchor_boxes,
+            image_shape=image_shape,
+            pre_nms_top_k=4,
+            pre_nms_score_threshold=0.0,
+            pre_nms_min_size_threshold=0.0,
+            nms_iou_threshold=0.5,
+            num_proposals=4,
+            use_batched_nms=False,
+            decode_boxes=False,
+            clip_boxes=False,
+            apply_sigmoid_to_score=False))
+
+    self.assertNDArrayNear(
+        selected_rois_tpu.numpy(), selected_rois_cpu.numpy(), 1e-5)
+    self.assertNDArrayNear(
+        selected_roi_scores_tpu.numpy(), selected_roi_scores_cpu.numpy(), 1e-5)
+
+    self.assertNDArrayNear(
+        selected_rois_tpu.numpy(), selected_rois_np, 1e-5)
+    self.assertNDArrayNear(
+        selected_roi_scores_tpu.numpy(), selected_roi_scores_np, 1e-5)
+
+  def test_multilevel_propose_rois_two_levels(self):
+    rpn_boxes_1_np = np.array(
+        [[[[0, 0, 10, 10], [0.01, 0.01, 9.99, 9.99]],
+          [[5, 5, 10, 10], [2, 2, 8, 8]]],
+         [[[2, 2, 2.5, 2.5], [3, 3, 6, 6]],
+          [[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
+    rpn_boxes_2_np = np.array(
+        [[[[0, 0, 10.01, 10.01]]], [[[2, 2, 4.5, 4.5]]]])
+    rpn_boxes = {
+        2: tf.constant(rpn_boxes_1_np, dtype=tf.float32),
+        3: tf.constant(rpn_boxes_2_np, dtype=tf.float32),
+    }
+    rpn_scores_1_np = np.array(
+        [[[[0.6], [0.9]], [[0.2], [0.3]]], [[[0.1], [0.8]], [[0.3], [0.5]]]])
+    rpn_scores_2_np = np.array([[[[0.95]]], [[[0.99]]]])
+    rpn_scores = {
+        2: tf.constant(rpn_scores_1_np, dtype=tf.float32),
+        3: tf.constant(rpn_scores_2_np, dtype=tf.float32),
+    }
+    anchor_boxes_1_np = np.array(
+        [[[[0, 0, 10, 10], [0.01, 0.01, 9.99, 9.99]],
+          [[5, 5, 10, 10], [2, 2, 8, 8]]],
+         [[[2, 2, 2.5, 2.5], [3, 3, 6, 6]],
+          [[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
+    anchor_boxes_2_np = np.array(
+        [[[[0, 0, 10.01, 10.01]]], [[[2, 2, 4.5, 4.5]]]])
+    anchor_boxes = {
+        2: tf.constant(anchor_boxes_1_np, dtype=tf.float32),
+        3: tf.constant(anchor_boxes_2_np, dtype=tf.float32),
+    }
+    image_shape = tf.constant([[20, 20], [20, 20]], dtype=tf.int32)
+
+    selected_rois_np = np.array(
+        [[[0, 0, 10.01, 10.01], [0.01, 0.01, 9.99, 9.99]],
+         [[2, 2, 4.5, 4.5], [3, 3, 6, 6]]])
+    selected_roi_scores_np = np.array([[0.95, 0.9], [0.99, 0.8]])
+
+    # Runs on TPU.
+    strategy = tf.distribute.experimental.TPUStrategy()
+    with strategy.scope():
+      selected_rois_tpu, selected_roi_scores_tpu = (
+          roi_generator._multilevel_propose_rois(
+              rpn_boxes,
+              rpn_scores,
+              anchor_boxes=anchor_boxes,
+              image_shape=image_shape,
+              pre_nms_top_k=4,
+              pre_nms_score_threshold=0.0,
+              pre_nms_min_size_threshold=0.0,
+              nms_iou_threshold=0.5,
+              num_proposals=2,
+              use_batched_nms=False,
+              decode_boxes=False,
+              clip_boxes=False,
+              apply_sigmoid_to_score=False))
+
+    # Runs on CPU.
+    selected_rois_cpu, selected_roi_scores_cpu = (
+        roi_generator._multilevel_propose_rois(
+            rpn_boxes,
+            rpn_scores,
+            anchor_boxes=anchor_boxes,
+            image_shape=image_shape,
+            pre_nms_top_k=4,
+            pre_nms_score_threshold=0.0,
+            pre_nms_min_size_threshold=0.0,
+            nms_iou_threshold=0.5,
+            num_proposals=2,
+            use_batched_nms=False,
+            decode_boxes=False,
+            clip_boxes=False,
+            apply_sigmoid_to_score=False))
+
+    self.assertNDArrayNear(
+        selected_rois_tpu.numpy(), selected_rois_cpu.numpy(), 1e-5)
+    self.assertNDArrayNear(
+        selected_roi_scores_tpu.numpy(), selected_roi_scores_cpu.numpy(), 1e-5)
+
+    self.assertNDArrayNear(
+        selected_rois_tpu.numpy(), selected_rois_np, 1e-5)
+    self.assertNDArrayNear(
+        selected_roi_scores_tpu.numpy(), selected_roi_scores_np, 1e-5)
+
+
+class MultilevelROIGeneratorTest(tf.test.TestCase):
+
+  def test_serialize_deserialize(self):
+    kwargs = dict(
+        pre_nms_top_k=2000,
+        pre_nms_score_threshold=0.0,
+        pre_nms_min_size_threshold=0.0,
+        nms_iou_threshold=0.7,
+        num_proposals=1000,
+        test_pre_nms_top_k=1000,
+        test_pre_nms_score_threshold=0.0,
+        test_pre_nms_min_size_threshold=0.0,
+        test_nms_iou_threshold=0.7,
+        test_num_proposals=1000,
+        use_batched_nms=False,
+    )
+    generator = roi_generator.MultilevelROIGenerator(**kwargs)
+
+    expected_config = dict(kwargs)
+    self.assertEqual(generator.get_config(), expected_config)
+
+    new_generator = roi_generator.MultilevelROIGenerator.from_config(
+        generator.get_config())
+
+    self.assertAllEqual(generator.get_config(), new_generator.get_config())
--- a/official/vision/beta/modeling/layers/roi_sampler.py
+++ b/official/vision/beta/modeling/layers/roi_sampler.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ROI sampler."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import box_matcher
+from official.vision.beta.modeling.layers import box_sampler
+from official.vision.beta.ops import box_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ROISampler(tf.keras.layers.Layer):
+  """Sample ROIs and assign targets to the sampled ROIs."""
+
+  def __init__(self,
+               mix_gt_boxes=True,
+               num_sampled_rois=512,
+               foreground_fraction=0.25,
+               foreground_iou_threshold=0.5,
+               background_iou_high_threshold=0.5,
+               background_iou_low_threshold=0,
+               **kwargs):
+    """Initializes a ROI sampler.
+
+    Args:
+      mix_gt_boxes: bool, whether to mix the groundtruth boxes with proposed
+        ROIs.
+      num_sampled_rois: int, the number of sampled ROIs per image.
+      foreground_fraction: float in [0, 1], what percentage of proposed ROIs
+        should be sampled from the foreground boxes.
+      foreground_iou_threshold: float, represent the IoU threshold for a box to
+        be considered as positive (if >= `foreground_iou_threshold`).
+      background_iou_high_threshold: float, represent the IoU threshold for a
+        box to be considered as negative (if overlap in
+        [`background_iou_low_threshold`, `background_iou_high_threshold`]).
+      background_iou_low_threshold: float, represent the IoU threshold for a box
+        to be considered as negative (if overlap in
+        [`background_iou_low_threshold`, `background_iou_high_threshold`]).
+      **kwargs: other key word arguments passed to Layer.
+    """
+    # Constructor arguments are recorded verbatim so that `get_config` /
+    # `from_config` can rebuild an equivalent layer.
+    self._config_dict = {
+        'mix_gt_boxes': mix_gt_boxes,
+        'num_sampled_rois': num_sampled_rois,
+        'foreground_fraction': foreground_fraction,
+        'foreground_iou_threshold': foreground_iou_threshold,
+        'background_iou_high_threshold': background_iou_high_threshold,
+        'background_iou_low_threshold': background_iou_low_threshold,
+    }
+    self._matcher = box_matcher.BoxMatcher(
+        foreground_iou_threshold,
+        background_iou_high_threshold,
+        background_iou_low_threshold)
+    self._sampler = box_sampler.BoxSampler(
+        num_sampled_rois, foreground_fraction)
+    super(ROISampler, self).__init__(**kwargs)
+
+  def call(self, boxes, gt_boxes, gt_classes):
+    """Assigns the proposals with groundtruth classes and performs subsampling.
+
+    Given `boxes`, `gt_boxes`, and `gt_classes`, the function uses the
+    following algorithm to generate the final `num_sampled_rois` RoIs.
+      1. Calculates the IoU between each proposal box and each gt_boxes.
+      2. Assigns each proposed box with a groundtruth class and box by choosing
+         the largest IoU overlap.
+      3. Samples `num_sampled_rois` boxes from all proposed boxes, and
+         returns the sampled RoIs with their matched groundtruth.
+
+    When `mix_gt_boxes` is True, `gt_boxes` are cast to the dtype of `boxes`
+    and appended to `boxes` along axis 1 before matching, so the groundtruth
+    boxes themselves are candidates for sampling.
+
+    Args:
+      boxes: a tensor of shape of [batch_size, N, 4]. N is the number of
+        proposals before groundtruth assignment. The last dimension is the
+        box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
+        format.
+      gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4].
+        The coordinates of gt_boxes are in the pixel coordinates of the scaled
+        image. This tensor might have padding of values -1 indicating the
+        invalid box coordinates.
+      gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
+        tensor might have paddings with values of -1 indicating the invalid
+        classes.
+
+    Returns:
+      sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
+        coordinates of the sampled RoIs, where K is the number of the sampled
+        RoIs, i.e. K = num_sampled_rois.
+      sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
+        box coordinates of the matched groundtruth boxes of the sampled RoIs.
+      sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
+        classes of the matched groundtruth boxes of the sampled RoIs.
+      sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
+        indices of the sampled groundtruth boxes in the original `gt_boxes`
+        tensor, i.e.
+        gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
+    """
+    if self._config_dict['mix_gt_boxes']:
+      # Groundtruth boxes are appended after the proposals; the dtype cast is
+      # needed for the concat.
+      gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype)
+      boxes = tf.concat([boxes, gt_boxes], axis=1)
+
+    (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
+     positive_matches, negative_matches, ignored_matches) = (
+         self._matcher(boxes, gt_boxes, gt_classes))
+
+    sampled_indices = self._sampler(
+        positive_matches, negative_matches, ignored_matches)
+
+    # Pick the same sampled positions out of the candidate boxes and out of
+    # each per-box match result.
+    sampled_rois, sampled_gt_boxes, sampled_gt_classes, sampled_gt_indices = (
+        box_ops.gather_instances(
+            sampled_indices,
+            boxes,
+            matched_gt_boxes,
+            matched_gt_classes,
+            matched_gt_indices))
+    return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
+            sampled_gt_indices)
+
+  def get_config(self):
+    """Returns a copy of the constructor arguments of this layer."""
+    # Hand out a copy so callers cannot mutate the layer's stored config.
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a `ROISampler` from a dict produced by `get_config`."""
+    return cls(**config)
--- a/official/vision/beta/modeling/layers/roi_sampler_test.py
+++ b/official/vision/beta/modeling/layers/roi_sampler_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for roi_sampler.py."""
+
+# Import libraries
+import numpy as np
+import tensorflow as tf
+
+from official.vision.beta.modeling.layers import roi_sampler
+
+
+class ROISamplerTest(tf.test.TestCase):
+  """Smoke and serialization tests for `roi_sampler.ROISampler`."""
+
+  def test_roi_sampler(self):
+    """Calls the sampler inside a TPU strategy scope and again outside it."""
+    proposals = tf.constant(
+        np.array([[
+            [0, 0, 5, 5],
+            [2.5, 2.5, 7.5, 7.5],
+            [5, 5, 10, 10],
+            [7.5, 7.5, 12.5, 12.5],
+        ]]),
+        dtype=tf.float32)
+
+    # The last groundtruth box and class are -1 padding entries.
+    groundtruth_boxes = tf.constant(
+        np.array([[
+            [10, 10, 15, 15],
+            [2.5, 2.5, 7.5, 7.5],
+            [-1, -1, -1, -1],
+        ]]),
+        dtype=tf.float32)
+    groundtruth_classes = tf.constant(
+        np.array([[2, 10, -1]]), dtype=tf.int32)
+
+    sampler = roi_sampler.ROISampler(
+        mix_gt_boxes=True,
+        num_sampled_rois=2,
+        foreground_fraction=0.5,
+        foreground_iou_threshold=0.5,
+        background_iou_high_threshold=0.5,
+        background_iou_low_threshold=0.0)
+
+    # Runs on TPU.
+    tpu_strategy = tf.distribute.experimental.TPUStrategy()
+    with tpu_strategy.scope():
+      sampler(proposals, groundtruth_boxes, groundtruth_classes)
+
+    # Runs on CPU.
+    sampler(proposals, groundtruth_boxes, groundtruth_classes)
+
+  def test_serialize_deserialize(self):
+    """Checks that `get_config` and `from_config` round-trip the settings."""
+    init_args = {
+        'mix_gt_boxes': True,
+        'num_sampled_rois': 512,
+        'foreground_fraction': 0.25,
+        'foreground_iou_threshold': 0.5,
+        'background_iou_high_threshold': 0.5,
+        'background_iou_low_threshold': 0.5,
+    }
+    sampler = roi_sampler.ROISampler(**init_args)
+
+    # The config must mirror the constructor arguments exactly.
+    self.assertEqual(sampler.get_config(), dict(init_args))
+
+    restored_sampler = roi_sampler.ROISampler.from_config(
+        sampler.get_config())
+
+    self.assertAllEqual(sampler.get_config(), restored_sampler.get_config())
+
+
+# Run the test cases defined in this module when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/modeling/maskrcnn_model.py
+++ b/official/vision/beta/modeling/maskrcnn_model.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Mask R-CNN model."""
+
+# Import libraries
+import tensorflow as tf
+
+from official.vision.beta.ops import box_ops
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class MaskRCNNModel(tf.keras.Model):
+  """The Mask R-CNN model."""
+
+  def __init__(self,
+               backbone,
+               decoder,
+               rpn_head,
+               detection_head,
+               roi_generator,
+               roi_sampler,
+               roi_aligner,
+               detection_generator,
+               mask_head=None,
+               mask_sampler=None,
+               mask_roi_aligner=None,
+               **kwargs):
+    """Initializes the Mask R-CNN model.
+
+    Args:
+      backbone: `tf.keras.Model`, the backbone network.
+      decoder: `tf.keras.Model`, the decoder network. If falsy (e.g. None), the
+        backbone features are used directly.
+      rpn_head: the RPN head.
+      detection_head: the detection head.
+      roi_generator: the ROI generator.
+      roi_sampler: the ROI sampler.
+      roi_aligner: the ROI aligner.
+      detection_generator: the detection generator.
+      mask_head: the mask head. If None, the model has no mask branch.
+      mask_sampler: the mask sampler. Required when `mask_head` is given.
+      mask_roi_aligner: the ROI aligner for mask prediction. Required when
+        `mask_head` is given.
+      **kwargs: keyword arguments to be passed.
+
+    Raises:
+      ValueError: if `mask_head` is provided without `mask_sampler` or without
+        `mask_roi_aligner`.
+    """
+    super(MaskRCNNModel, self).__init__(**kwargs)
+
+    # The mask branch is enabled solely by the presence of a mask head; its
+    # two companion components then become mandatory.
+    self._include_mask = mask_head is not None
+    if self._include_mask and mask_sampler is None:
+      raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
+    if self._include_mask and mask_roi_aligner is None:
+      raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
+
+    self._config_dict = {
+        'backbone': backbone,
+        'decoder': decoder,
+        'rpn_head': rpn_head,
+        'detection_head': detection_head,
+        'roi_generator': roi_generator,
+        'roi_sampler': roi_sampler,
+        'roi_aligner': roi_aligner,
+        'detection_generator': detection_generator,
+        'mask_head': mask_head,
+        'mask_sampler': mask_sampler,
+        'mask_roi_aligner': mask_roi_aligner,
+    }
+    self.backbone = backbone
+    self.decoder = decoder
+    self.rpn_head = rpn_head
+    self.detection_head = detection_head
+    self.roi_generator = roi_generator
+    self.roi_sampler = roi_sampler
+    self.roi_aligner = roi_aligner
+    self.detection_generator = detection_generator
+    self.mask_head = mask_head
+    self.mask_sampler = mask_sampler
+    self.mask_roi_aligner = mask_roi_aligner
+
+  def call(self,
+           images,
+           image_shape,
+           anchor_boxes=None,
+           gt_boxes=None,
+           gt_classes=None,
+           gt_masks=None,
+           training=None):
+    """Runs the Mask R-CNN forward pass.
+
+    Args:
+      images: the input passed to the backbone.
+      image_shape: the image shape information passed to the ROI generator and
+        the detection generator.
+      anchor_boxes: the anchor boxes passed to the ROI generator.
+      gt_boxes: the groundtruth boxes passed to the ROI sampler. Required when
+        `training` is true.
+      gt_classes: the groundtruth classes passed to the ROI sampler. Required
+        when `training` is true.
+      gt_masks: the groundtruth masks passed to the mask sampler. Required
+        when `training` is true and the model has a mask head.
+      training: whether the model runs in training mode. In training mode the
+        outputs are raw predictions and their targets; otherwise they are
+        post-processed detections.
+
+    Returns:
+      A dict of model outputs. It always contains `rpn_boxes` and
+      `rpn_scores`. In training mode it also contains `class_targets`,
+      `box_targets`, `class_outputs` and `box_outputs`, plus
+      `mask_class_targets`, `mask_targets` and `mask_outputs` when the mask
+      branch is enabled. Otherwise it contains `detection_boxes`,
+      `detection_scores`, `detection_classes` and `num_detections`, plus
+      `detection_masks` when the mask branch is enabled.
+
+    Raises:
+      ValueError: if a groundtruth input needed in training mode is None.
+    """
+    model_outputs = {}
+
+    # Feature extraction.
+    features = self.backbone(images)
+    if self.decoder:
+      features = self.decoder(features)
+
+    # Region proposal network.
+    rpn_scores, rpn_boxes = self.rpn_head(features)
+
+    model_outputs.update({
+        'rpn_boxes': rpn_boxes,
+        'rpn_scores': rpn_scores
+    })
+
+    # Generate RoIs.
+    rois, _ = self.roi_generator(
+        rpn_boxes, rpn_scores, anchor_boxes, image_shape, training)
+
+    if training:
+      # Fail with a clear message instead of an obscure error from inside the
+      # sampler when the groundtruth was not supplied.
+      if gt_boxes is None or gt_classes is None:
+        raise ValueError(
+            '`gt_boxes` and `gt_classes` must be provided in training mode.')
+
+      # The second stage must not back-propagate into the proposals.
+      rois = tf.stop_gradient(rois)
+
+      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
+          self.roi_sampler(rois, gt_boxes, gt_classes))
+      # Assign target for the 2nd stage classification.
+      box_targets = box_ops.encode_boxes(
+          matched_gt_boxes, rois, weights=[10.0, 10.0, 5.0, 5.0])
+      # If the target is background, the box target is set to all 0s.
+      box_targets = tf.where(
+          tf.tile(
+              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
+              [1, 1, 4]),
+          tf.zeros_like(box_targets),
+          box_targets)
+      model_outputs.update({
+          'class_targets': matched_gt_classes,
+          'box_targets': box_targets,
+      })
+
+    # RoI align.
+    roi_features = self.roi_aligner(features, rois)
+
+    # Detection head.
+    raw_scores, raw_boxes = self.detection_head(roi_features)
+
+    if training:
+      model_outputs.update({
+          'class_outputs': raw_scores,
+          'box_outputs': raw_boxes,
+      })
+    else:
+      # Post-processing.
+      detections = self.detection_generator(
+          raw_boxes, raw_scores, rois, image_shape)
+      model_outputs.update({
+          'detection_boxes': detections['detection_boxes'],
+          'detection_scores': detections['detection_scores'],
+          'detection_classes': detections['detection_classes'],
+          'num_detections': detections['num_detections'],
+      })
+
+    if not self._include_mask:
+      return model_outputs
+
+    if training:
+      if gt_masks is None:
+        raise ValueError(
+            '`gt_masks` must be provided in training mode when the model has '
+            'a mask head.')
+
+      rois, roi_classes, roi_masks = self.mask_sampler(
+          rois,
+          matched_gt_boxes,
+          matched_gt_classes,
+          matched_gt_indices,
+          gt_masks)
+      # Mask targets are constants with respect to the optimization.
+      roi_masks = tf.stop_gradient(roi_masks)
+
+      model_outputs.update({
+          'mask_class_targets': roi_classes,
+          'mask_targets': roi_masks,
+      })
+    else:
+      # At inference, masks are predicted for the final detections.
+      rois = model_outputs['detection_boxes']
+      roi_classes = model_outputs['detection_classes']
+
+    # Mask RoI align.
+    mask_roi_features = self.mask_roi_aligner(features, rois)
+
+    # Mask head.
+    raw_masks = self.mask_head([mask_roi_features, roi_classes])
+    if training:
+      model_outputs.update({
+          'mask_outputs': raw_masks,
+      })
+    else:
+      model_outputs.update({
+          'detection_masks': tf.math.sigmoid(raw_masks),
+      })
+    return model_outputs
+
+  def get_config(self):
+    """Returns a copy of the constructor arguments of this model."""
+    # Hand out a copy so callers cannot mutate the model's stored config.
+    return dict(self._config_dict)
+
+  @classmethod
+  def from_config(cls, config):
+    """Builds a `MaskRCNNModel` from a dict produced by `get_config`."""
+    return cls(**config)