Commit cc748b2a authored by Abdullah Rashwan, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 329754787
parent 2f788e1d
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for maskrcnn_model.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.beta.modeling import maskrcnn_model
from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import fpn
from official.vision.beta.modeling.heads import dense_prediction_heads
from official.vision.beta.modeling.heads import instance_heads
from official.vision.beta.modeling.layers import detection_generator
from official.vision.beta.modeling.layers import mask_sampler
from official.vision.beta.modeling.layers import roi_aligner
from official.vision.beta.modeling.layers import roi_generator
from official.vision.beta.modeling.layers import roi_sampler
from official.vision.beta.ops import anchor
class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(3, 3, 7, 3, [1.0], 50, False, False, 41953246),
)
def test_num_params(self,
num_classes,
min_level,
max_level,
num_scales,
aspect_ratios,
resnet_model_id,
use_separable_conv,
include_mask,
expected_num_params):
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location,
num_convs=1)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
_ = model(images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=True)
self.assertEqual(expected_num_params, model.count_params())
@parameterized.parameters(
(False, False,),
(False, True,),
(True, False,),
(True, True,),
)
def test_forward(self, include_mask, training):
num_classes = 3
min_level = 3
max_level = 4
num_scales = 3
aspect_ratios = [1.0]
image_size = (256, 256)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array([[224, 100], [100, 224]])
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=image_size).multilevel_boxes
num_anchors_per_location = len(aspect_ratios) * num_scales
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=min_level,
max_level=max_level,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
results = model(images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=training)
self.assertIn('rpn_boxes', results)
self.assertIn('rpn_scores', results)
if training:
self.assertIn('class_targets', results)
self.assertIn('box_targets', results)
self.assertIn('class_outputs', results)
self.assertIn('box_outputs', results)
if include_mask:
self.assertIn('mask_outputs', results)
else:
self.assertIn('detection_boxes', results)
self.assertIn('detection_scores', results)
self.assertIn('detection_classes', results)
self.assertIn('num_detections', results)
if include_mask:
self.assertIn('detection_masks', results)
@parameterized.parameters(
(False,),
(True,),
)
def test_serialize_deserialize(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3,
max_level=7,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3,
max_level=7,
num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(
num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj)
config = model.get_config()
new_model = maskrcnn_model.MaskRCNNModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""RetinaNet."""
# Import libraries
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
"""The RetinaNet model class."""
def __init__(self,
backbone,
decoder,
head,
detection_generator,
**kwargs):
"""Classification initialization function.
Args:
backbone: `tf.keras.Model` a backbone network.
decoder: `tf.keras.Model` a decoder network.
head: `RetinaNetHead`, the RetinaNet head.
detection_generator: the detection generator.
**kwargs: keyword arguments to be passed.
"""
super(RetinaNetModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'detection_generator': detection_generator,
}
self._backbone = backbone
self._decoder = decoder
self._head = head
self._detection_generator = detection_generator
def call(self,
images,
image_shape=None,
anchor_boxes=None,
training=None):
"""Forward pass of the RetinaNet model.
Args:
images: `Tensor`, the input batched images, whose shape is
[batch, height, width, 3].
image_shape: `Tensor`, the actual shape of the input images, whose shape
is [batch, 2] where the last dimension is [height, width]. Note that
this is the actual image shape excluding paddings. For example, images
in the batch may be resized into different shapes before padding to the
fixed size.
anchor_boxes: a dict of tensors which includes multilevel anchors.
- key: `int`, the level of the multilevel predictions.
- values: `Tensor`, the anchor coordinates of a particular feature
level, whose shape is [height_l, width_l, num_anchors_per_location * 4].
training: `bool`, indicating whether it is in training mode.
Returns:
a dictionary of tensors. In training mode it contains:
cls_outputs: a dict of tensors which includes scores of the predictions.
- key: `int`, the level of the multilevel predictions.
- values: `Tensor`, the box scores predicted from a particular feature
level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
box_outputs: a dict of tensors which includes coordinates of the
predictions.
- key: `int`, the level of the multilevel predictions.
- values: `Tensor`, the box coordinates predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
In eval mode it additionally contains the post-processed
'detection_boxes', 'detection_scores', 'detection_classes' and
'num_detections'.
"""
# Feature extraction.
features = self.backbone(images)
if self.decoder:
features = self.decoder(features)
# Dense prediction.
raw_scores, raw_boxes = self.head(features)
if training:
return {
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
}
else:
# Post-processing.
final_results = self.detection_generator(
raw_boxes, raw_scores, anchor_boxes, image_shape)
return {
'detection_boxes': final_results['detection_boxes'],
'detection_scores': final_results['detection_scores'],
'detection_classes': final_results['detection_classes'],
'num_detections': final_results['num_detections'],
'cls_outputs': raw_scores,
'box_outputs': raw_boxes
}
@property
def checkpoint_items(self):
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
return items
@property
def backbone(self):
return self._backbone
@property
def decoder(self):
return self._decoder
@property
def head(self):
return self._head
@property
def detection_generator(self):
return self._detection_generator
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
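# Usage sketch (illustrative, not part of the library): building and calling a
# small RetinaNetModel in eval mode. The builder arguments below are
# assumptions that mirror the test file that follows; in eval mode the output
# dict adds post-processed detections to the raw 'cls_outputs'/'box_outputs'.
if __name__ == '__main__':
  import numpy as np
  from official.vision.beta.modeling.backbones import resnet
  from official.vision.beta.modeling.decoders import fpn
  from official.vision.beta.modeling.heads import dense_prediction_heads
  from official.vision.beta.modeling.layers import detection_generator
  from official.vision.beta.ops import anchor

  backbone = resnet.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs, min_level=3, max_level=7)
  head = dense_prediction_heads.RetinaNetHead(
      min_level=3, max_level=7, num_classes=3, num_anchors_per_location=3)
  generator = detection_generator.MultilevelDetectionGenerator(
      max_num_detections=10)
  model = RetinaNetModel(
      backbone=backbone, decoder=decoder, head=head,
      detection_generator=generator)

  images = np.random.rand(2, 128, 128, 3)
  image_shape = np.array([[128, 128], [128, 128]])
  anchor_boxes = anchor.build_anchor_generator(
      min_level=3, max_level=7, num_scales=3, aspect_ratios=[1.0],
      anchor_size=3)((128, 128))
  # Tile anchors to the batch dimension, as the test below does.
  anchor_boxes = {
      level: tf.tile(tf.expand_dims(boxes, axis=0), [2, 1, 1, 1])
      for level, boxes in anchor_boxes.items()
  }
  outputs = model(images, image_shape, anchor_boxes, training=False)
  print(sorted(outputs.keys()))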
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for RetinaNet models."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.beta.modeling import retinanet_model
from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import fpn
from official.vision.beta.modeling.heads import dense_prediction_heads
from official.vision.beta.modeling.layers import detection_generator
from official.vision.beta.ops import anchor
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(3, 3, 7, 3, [1.0], 50, False, 256, 4, 256, 32244949),
)
def test_num_params(self,
num_classes,
min_level,
max_level,
num_scales,
aspect_ratios,
resnet_model_id,
use_separable_conv,
fpn_num_filters,
head_num_convs,
head_num_filters,
expected_num_params):
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
num_filters=fpn_num_filters,
use_separable_conv=use_separable_conv)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location,
use_separable_conv=use_separable_conv,
num_convs=head_num_convs,
num_filters=head_num_filters)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
_ = model(images, image_shape, anchor_boxes, training=True)
self.assertEqual(expected_num_params, model.count_params())
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
image_size=[(128, 128),],
training=[True, False],
)
)
def test_forward(self, strategy, image_size, training):
"""Test for creation of a R50-FPN RetinaNet."""
tf.keras.backend.set_image_data_format('channels_last')
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array(
[[image_size[0], image_size[1]], [image_size[0], image_size[1]]])
with strategy.scope():
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
anchor_boxes = anchor_gen(image_size)
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
model_outputs = model(
images,
image_shape,
anchor_boxes,
training=training)
if training:
cls_outputs = model_outputs['cls_outputs']
box_outputs = model_outputs['box_outputs']
for level in range(min_level, max_level + 1):
self.assertIn(level, cls_outputs)
self.assertIn(level, box_outputs)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
num_classes * num_anchors_per_location
], cls_outputs[level].numpy().shape)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
4 * num_anchors_per_location
], box_outputs[level].numpy().shape)
else:
self.assertIn('detection_boxes', model_outputs)
self.assertIn('detection_scores', model_outputs)
self.assertIn('detection_classes', model_outputs)
self.assertIn('num_detections', model_outputs)
self.assertAllEqual(
[2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_scores'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_classes'].numpy().shape)
self.assertAllEqual(
[2,], model_outputs['num_detections'].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
config = model.get_config()
new_model = retinanet_model.RetinaNetModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Build video classification models."""
# Import libraries
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
"""A video classification class builder."""
def __init__(self,
backbone,
num_classes,
input_specs=layers.InputSpec(shape=[None, None, None, None, 3]),
dropout_rate=0.0,
kernel_initializer='random_uniform',
kernel_regularizer=None,
bias_regularizer=None,
add_head_batch_norm=False,
use_sync_bn: bool = False,
norm_momentum: float = 0.99,
norm_epsilon: float = 0.001,
**kwargs):
"""Video Classification initialization function.
Args:
backbone: a 3d backbone network.
num_classes: `int` number of classes in classification task.
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
dropout_rate: `float` rate for dropout regularization.
kernel_initializer: kernel initializer for the dense layer.
kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to
None.
bias_regularizer: tf.keras.regularizers.Regularizer object. Default to
None.
add_head_batch_norm: `bool` whether to add a batch normalization layer
before pooling.
use_sync_bn: `bool` if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
self._self_setattr_tracking = False
self._config_dict = {
'backbone': backbone,
'num_classes': num_classes,
'input_specs': input_specs,
'dropout_rate': dropout_rate,
'kernel_initializer': kernel_initializer,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
'add_head_batch_norm': add_head_batch_norm,
'use_sync_bn': use_sync_bn,
'norm_momentum': norm_momentum,
'norm_epsilon': norm_epsilon,
}
self._input_specs = input_specs
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._backbone = backbone
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
axis = -1 if tf.keras.backend.image_data_format() == 'channels_last' else 1
inputs = tf.keras.Input(shape=input_specs.shape[1:])
endpoints = backbone(inputs)
x = endpoints[max(endpoints.keys())]
if add_head_batch_norm:
x = self._norm(axis=axis, momentum=norm_momentum, epsilon=norm_epsilon)(x)
x = tf.keras.layers.GlobalAveragePooling3D()(x)
x = tf.keras.layers.Dropout(dropout_rate)(x)
x = tf.keras.layers.Dense(
num_classes, kernel_initializer=kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
super(VideoClassificationModel, self).__init__(
inputs=inputs, outputs=x, **kwargs)
@property
def checkpoint_items(self):
"""Returns a dictionary of items to be additionally checkpointed."""
return dict(backbone=self.backbone)
@property
def backbone(self):
return self._backbone
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video classification network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling import video_classification_model
class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(50, 8, 112, 'relu'),
(50, 8, 112, 'swish'),
)
def test_resnet3d_network_creation(self, model_id, temporal_size,
spatial_size, activation):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.VideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs=input_specs,
dropout_rate=0.2,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
def test_serialize_deserialize(self):
"""Validate the classification network can be serialized and deserialized."""
model_id = 50
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes)
model = video_classification_model.VideoClassificationModel(
backbone=backbone, num_classes=1000)
config = model.get_config()
new_model = video_classification_model.VideoClassificationModel.from_config(
config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Anchor box and labeler definition."""
import collections
# Import libraries
import tensorflow as tf
from official.vision.beta.ops.experimental import anchor_generator
from official.vision.detection.utils.object_detection import argmax_matcher
from official.vision.detection.utils.object_detection import balanced_positive_negative_sampler
from official.vision.detection.utils.object_detection import box_list
from official.vision.detection.utils.object_detection import faster_rcnn_box_coder
from official.vision.detection.utils.object_detection import region_similarity_calculator
from official.vision.detection.utils.object_detection import target_assigner
class Anchor(object):
"""Anchor class for anchor-based object detectors."""
def __init__(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
image_size):
"""Constructs multiscale anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scale [2^0, 2^0.5] on each level.
aspect_ratios: list of float numbers representing the aspect ratio anchors
added on each level. The number indicates the ratio of width to height.
For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
scale level.
anchor_size: float number representing the scale of the base anchor size
relative to the feature stride 2^level.
image_size: a list of integer numbers or Tensors representing
[height, width] of the input image size. The image_size should be
divisible by the largest feature stride 2^max_level.
"""
self.min_level = min_level
self.max_level = max_level
self.num_scales = num_scales
self.aspect_ratios = aspect_ratios
self.anchor_size = anchor_size
self.image_size = image_size
self.boxes = self._generate_boxes()
def _generate_boxes(self):
"""Generates multiscale anchor boxes.
Returns:
a Tensor of shape [N, 4], representing anchor boxes of all levels
concatenated together.
"""
boxes_all = []
for level in range(self.min_level, self.max_level + 1):
boxes_l = []
for scale in range(self.num_scales):
for aspect_ratio in self.aspect_ratios:
stride = 2 ** level
intermediate_scale = 2 ** (scale / float(self.num_scales))
base_anchor_size = self.anchor_size * stride * intermediate_scale
aspect_x = aspect_ratio ** 0.5
aspect_y = aspect_ratio ** -0.5
half_anchor_size_x = base_anchor_size * aspect_x / 2.0
half_anchor_size_y = base_anchor_size * aspect_y / 2.0
x = tf.range(stride / 2, self.image_size[1], stride)
y = tf.range(stride / 2, self.image_size[0], stride)
xv, yv = tf.meshgrid(x, y)
xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
# Tensor shape Nx4.
boxes = tf.stack([yv - half_anchor_size_y, xv - half_anchor_size_x,
yv + half_anchor_size_y, xv + half_anchor_size_x],
axis=1)
boxes_l.append(boxes)
# Concat anchors on the same level to tensor shape NxAx4.
boxes_l = tf.stack(boxes_l, axis=1)
boxes_l = tf.reshape(boxes_l, [-1, 4])
boxes_all.append(boxes_l)
return tf.concat(boxes_all, axis=0)
def unpack_labels(self, labels):
"""Unpacks an array of labels into multiscales labels."""
unpacked_labels = collections.OrderedDict()
count = 0
for level in range(self.min_level, self.max_level + 1):
feat_size_y = tf.cast(self.image_size[0] / 2 ** level, tf.int32)
feat_size_x = tf.cast(self.image_size[1] / 2 ** level, tf.int32)
steps = feat_size_y * feat_size_x * self.anchors_per_location
unpacked_labels[level] = tf.reshape(
labels[count:count + steps], [feat_size_y, feat_size_x, -1])
count += steps
return unpacked_labels
@property
def anchors_per_location(self):
return self.num_scales * len(self.aspect_ratios)
@property
def multilevel_boxes(self):
return self.unpack_labels(self.boxes)
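# Sanity-check sketch (illustrative, not part of the library): the number of
# boxes produced by Anchor above is the sum over levels of
# (image_h / 2**l) * (image_w / 2**l) locations, each holding
# num_scales * len(aspect_ratios) anchors, assuming the image size is
# divisible by 2**max_level. The helper name below is hypothetical.
def _expected_total_anchors(min_level, max_level, num_scales,
                            num_aspect_ratios, image_size):
  """Counts anchors the way Anchor._generate_boxes lays them out."""
  anchors_per_location = num_scales * num_aspect_ratios
  total = 0
  for level in range(min_level, max_level + 1):
    feat_h = image_size[0] // 2 ** level
    feat_w = image_size[1] // 2 ** level
    total += feat_h * feat_w * anchors_per_location
  return total
# For min_level=3, max_level=7, num_scales=3, three aspect ratios and a
# 640x640 image this gives 76725, which should equal
# Anchor(3, 7, 3, [1.0, 2.0, 0.5], 4, (640, 640)).boxes.shape[0].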
class AnchorLabeler(object):
"""Labeler for dense object detector."""
def __init__(self,
match_threshold=0.5,
unmatched_threshold=0.5):
"""Constructs anchor labeler to assign labels to anchors.
Args:
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
"""
similarity_calc = region_similarity_calculator.IouSimilarity()
matcher = argmax_matcher.ArgMaxMatcher(
match_threshold,
unmatched_threshold=unmatched_threshold,
negatives_lower_than_unmatched=True,
force_match_for_each_row=True)
box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
self._target_assigner = target_assigner.TargetAssigner(
similarity_calc, matcher, box_coder)
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: an ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are float tensors of
multilevel anchor boxes; each anchor stores [y0, x0, y1, x1] in its last
4 values.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
cls_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors_per_location]. The height_l and
width_l represent the dimension of class logits at the l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
and width_l represent the dimension of bounding box regression output at
the l-th level.
cls_weights: A flattened Tensor with shape [batch_size, num_anchors], that
serves as masking / sample weight for classification loss. Its value
is 1.0 for positive and negative matched anchors, and 0.0 for ignored
anchors.
box_weights: A flattened Tensor with shape [batch_size, num_anchors], that
serves as masking / sample weight for regression loss. Its value is
1.0 for positive matched anchors, and 0.0 for negative and ignored
anchors.
"""
gt_box_list = box_list.BoxList(gt_boxes)
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
# cls_weights and box_weights are returned unchanged; matches is used below.
(cls_targets, cls_weights, box_targets, box_weights,
matches) = self._target_assigner.assign(anchor_box_list, gt_box_list,
gt_labels)
# Labels definition in matches.match_results:
# (1) match_results[i]>=0, meaning that column i is matched with row
# match_results[i].
# (2) match_results[i]=-1, meaning that column i is not matched.
# (3) match_results[i]=-2, meaning that column i is ignored.
match_results = tf.expand_dims(matches.match_results, axis=1)
cls_targets = tf.cast(cls_targets, tf.int32)
cls_targets = tf.where(
tf.equal(match_results, -1), -tf.ones_like(cls_targets), cls_targets)
cls_targets = tf.where(
tf.equal(match_results, -2), -2 * tf.ones_like(cls_targets),
cls_targets)
# Unpacks labels into multi-level representations.
cls_targets_dict = unpack_targets(cls_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
return cls_targets_dict, box_targets_dict, cls_weights, box_weights
class RpnAnchorLabeler(AnchorLabeler):
"""Labeler for Region Proposal Network."""
def __init__(self,
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5):
AnchorLabeler.__init__(self, match_threshold=match_threshold,
unmatched_threshold=unmatched_threshold)
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._rpn_fg_fraction = rpn_fg_fraction
def _get_rpn_samples(self, match_results):
"""Computes anchor labels.
This function performs subsampling for foreground (fg) and background (bg)
anchors.
Args:
match_results: An integer tensor with shape [N] representing the
matching results of anchors. (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
Returns:
score_targets: an integer tensor with shape [N].
(1) score_targets[i]=1, the anchor is a positive sample.
(2) score_targets[i]=0, the anchor is negative. (3) score_targets[i]=-1,
the anchor is ignored (don't care). A small arithmetic sketch of this
convention follows the class definition.
"""
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=self._rpn_fg_fraction, is_static=False))
# indicator includes both positive and negative labels.
# labels includes only positive labels.
# positives = indicator & labels.
# negatives = indicator & !labels.
# ignore = !indicator.
indicator = tf.greater(match_results, -2)
labels = tf.greater(match_results, -1)
samples = sampler.subsample(
indicator, self._rpn_batch_size_per_im, labels)
positive_labels = tf.where(
tf.logical_and(samples, labels),
tf.constant(2, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
negative_labels = tf.where(
tf.logical_and(samples, tf.logical_not(labels)),
tf.constant(1, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
ignore_labels = tf.fill(match_results.shape, -1)
return (ignore_labels + positive_labels + negative_labels,
positive_labels, negative_labels)
def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
anchor_boxes: an ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are float tensors of
multilevel anchor boxes; each anchor stores [y0, x0, y1, x1] in its last
4 values.
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at the l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensors with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
the l-th level.
"""
gt_box_list = box_list.BoxList(gt_boxes)
flattened_anchor_boxes = []
for anchors in anchor_boxes.values():
flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
# cls_targets, cls_weights, box_weights are not used.
_, _, box_targets, _, matches = self._target_assigner.assign(
anchor_box_list, gt_box_list, gt_labels)
# score_targets contains the subsampled positive and negative anchors.
score_targets, _, _ = self._get_rpn_samples(matches.match_results)
# Unpacks labels.
score_targets_dict = unpack_targets(score_targets, anchor_boxes)
box_targets_dict = unpack_targets(box_targets, anchor_boxes)
return score_targets_dict, box_targets_dict
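# Arithmetic sketch (illustrative, not part of the library) of the labeling
# convention used by RpnAnchorLabeler._get_rpn_samples above: every anchor
# starts at -1 (ignore), a sampled positive adds +2 and a sampled negative
# adds +1, so the summed score targets are 1 (positive), 0 (negative) and
# -1 (ignored).
if __name__ == '__main__':
  import numpy as np
  ignore = np.full(4, -1)                # base value for every anchor
  positive = np.array([2, 0, 0, 0])      # first anchor sampled as positive
  negative = np.array([0, 1, 0, 0])      # second anchor sampled as negative
  print(ignore + positive + negative)    # [ 1  0 -1 -1]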
def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios,
anchor_size):
"""Build anchor generator from levels."""
anchor_sizes = collections.OrderedDict()
strides = collections.OrderedDict()
scales = []
for scale in range(num_scales):
scales.append(2**(scale / float(num_scales)))
for level in range(min_level, max_level + 1):
stride = 2**level
strides[level] = stride
anchor_sizes[level] = anchor_size * stride
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=scales,
aspect_ratios=aspect_ratios,
strides=strides)
return anchor_gen
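# Usage sketch (illustrative): building multilevel anchors via
# build_anchor_generator and inspecting the per-level shapes, which are
# expected to be [image_h / 2**l, image_w / 2**l, anchors_per_location * 4].
# The parameter values below are arbitrary examples.
if __name__ == '__main__':
  example_gen = build_anchor_generator(
      min_level=3, max_level=7, num_scales=3, aspect_ratios=[1.0, 2.0, 0.5],
      anchor_size=4)
  example_boxes = example_gen((512, 512))
  for example_level, level_boxes in example_boxes.items():
    print(example_level, level_boxes.shape)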
def unpack_targets(targets, anchor_boxes_dict):
"""Unpacks an array of labels into multiscales labels."""
unpacked_targets = collections.OrderedDict()
count = 0
for level, anchor_boxes in anchor_boxes_dict.items():
feat_size_shape = anchor_boxes.shape.as_list()
feat_size_y = feat_size_shape[0]
feat_size_x = feat_size_shape[1]
anchors_per_location = int(feat_size_shape[2] / 4)
steps = feat_size_y * feat_size_x * anchors_per_location
unpacked_targets[level] = tf.reshape(targets[count:count + steps],
[feat_size_y, feat_size_x, -1])
count += steps
return unpacked_targets
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for anchor.py."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import anchor
class AnchorTest(parameterized.TestCase, tf.test.TestCase):
# The set of parameters is tailored for the MLPerf configuration, where
# the number of anchors is 495132, rpn_batch_size_per_im=256, and
# rpn_fg_fraction=0.5.
@parameterized.parameters(
(512, 25, 25, 25, 25, (512, 512)),
(512, 25, 25, 25, 25, (512, 640)),
(512, 25, 25, 25, 25, (640, 512)),
(495132, 100, 100, 100, 100, (512, 512)),
(495132, 200, 100, 128, 100, (512, 512)),
(495132, 100, 120, 100, 120, (512, 512)),
(495132, 100, 200, 100, 156, (512, 512)),
(495132, 200, 200, 128, 128, (512, 512)),
)
def testAnchorRpnSample(self, num_anchors, num_positives,
num_negatives, expected_positives,
expected_negatives, image_size):
match_results_np = np.empty([num_anchors])
match_results_np.fill(-2)
match_results_np[:num_positives] = 0
match_results_np[num_positives:num_positives + num_negatives] = -1
match_results = tf.convert_to_tensor(value=match_results_np, dtype=tf.int32)
anchor_labeler = anchor.RpnAnchorLabeler(
match_threshold=0.7,
unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5)
rpn_sample_op = anchor_labeler._get_rpn_samples(match_results)
labels = [v.numpy() for v in rpn_sample_op]
self.assertLen(labels[0], num_anchors)
positives = np.sum(np.array(labels[0]) == 1)
negatives = np.sum(np.array(labels[0]) == 0)
self.assertEqual(positives, expected_positives)
self.assertEqual(negatives, expected_negatives)
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGeneration(self, min_level, max_level, num_scales,
aspect_ratios, anchor_size, expected_boxes):
image_size = [64, 64]
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
# Single scale anchor.
(5, 5, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80]]),
# Multi scale anchor.
(5, 6, 1, [1.0], 2.0,
[[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
# Multi aspect ratio anchor.
(6, 6, 1, [1.0, 4.0, 0.25], 2.0,
[[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
)
def testAnchorGenerationWithImageSizeAsTensor(self,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
expected_boxes):
image_size = tf.constant([64, 64], tf.int32)
anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
anchor_size, image_size)
boxes = anchors.boxes.numpy()
self.assertEqual(expected_boxes, boxes.tolist())
@parameterized.parameters(
(3, 6, 2, [1.0], 2.0),
)
def testLabelAnchors(self, min_level, max_level, num_scales,
aspect_ratios, anchor_size):
input_size = [512, 512]
ground_truth_class_id = 2
# The matched anchors are the anchors used as ground truth and the anchors
# at the next octave scale at the same location.
expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
anchor_gen = anchor.build_anchor_generator(min_level, max_level, num_scales,
aspect_ratios, anchor_size)
anchor_boxes = anchor_gen(input_size)
anchor_labeler = anchor.AnchorLabeler()
# Uses the first anchors as ground truth. The ground truth should map to
# two anchors with two intermediate scales at the same location.
gt_boxes = anchor_boxes[3][0:1, 0, 0:4]
gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
(cls_targets, box_targets, _,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, gt_boxes, gt_classes)
for k, v in cls_targets.items():
cls_targets[k] = v.numpy()
for k, v in box_targets.items():
box_targets[k] = v.numpy()
box_weights = box_weights.numpy()
anchor_locations = np.vstack(
np.where(cls_targets[min_level] > -1)).transpose()
self.assertAllClose(expected_anchor_locations, anchor_locations)
# Two anchor boxes on min_level got matched to the gt_boxes.
self.assertAllClose(tf.reduce_sum(box_weights), 2)
@parameterized.parameters(
(3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
(3, 8, [1.], 3, 32, (512, 512)),
(3, 3, [1.], 2, 4, (32, 32)),
)
def testEquivalentResult(self, min_level, max_level, aspect_ratios,
num_scales, anchor_size, image_size):
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
anchors = anchor_gen(image_size)
expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
aspect_ratios, anchor_size, image_size)
expected_anchors = expected_anchor_gen.multilevel_boxes
for k in expected_anchors.keys():
self.assertAllClose(expected_anchors[k], anchors[k])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Box related ops."""
# Import libraries
import numpy as np
import tensorflow as tf
EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)
def yxyx_to_xywh(boxes):
"""Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a numpy array whose shape is the same as `boxes` in new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
boxes_ymin = boxes[..., 0]
boxes_xmin = boxes[..., 1]
boxes_width = boxes[..., 3] - boxes[..., 1]
boxes_height = boxes[..., 2] - boxes[..., 0]
new_boxes = np.stack(
[boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=-1)
return new_boxes
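# Tiny worked example (illustrative): the box (ymin, xmin, ymax, xmax) =
# (10, 20, 50, 100) converts to (xmin, ymin, width, height) = (20, 10, 80, 40).
if __name__ == '__main__':
  print(yxyx_to_xywh(np.array([[10., 20., 50., 100.]])))  # [[20. 10. 80. 40.]]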
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
noise_scale: a python float which specifies the magnitude of noise. A rule
of thumb is to set this within (0, 0.1]. The default value was found
empirically to best mimic noisy detections.
Returns:
jittered_boxes: a tensor whose shape is the same as `boxes` representing
the jittered boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('jitter_boxes'):
bbox_jitters = tf.random.normal(tf.shape(boxes), stddev=noise_scale)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
width = xmax - xmin
height = ymax - ymin
new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
jittered_boxes = tf.concat(
[new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
new_center_y + new_height * 0.5, new_center_x + new_width * 0.5],
axis=-1)
return jittered_boxes
def normalize_boxes(boxes, image_shape):
"""Converts boxes to the normalized coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
normalized_boxes: a tensor whose shape is the same as `boxes` representing
the normalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('normalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0:1]
width = image_shape[..., 1:2]
ymin = boxes[..., 0:1] / height
xmin = boxes[..., 1:2] / width
ymax = boxes[..., 2:3] / height
xmax = boxes[..., 3:4] / width
normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return normalized_boxes
def denormalize_boxes(boxes, image_shape):
"""Converts boxes normalized by [height, width] to pixel coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
the denormalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
with tf.name_scope('denormalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.split(image_shape, 2, axis=-1)
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
ymin = ymin * height
xmin = xmin * width
ymax = ymax * height
xmax = xmax * width
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return denormalized_boxes
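# Round-trip sketch (illustrative): normalize_boxes and denormalize_boxes are
# inverses for a fixed [height, width].
if __name__ == '__main__':
  pixel_boxes = tf.constant([[10., 20., 50., 100.]])
  unit_boxes = normalize_boxes(pixel_boxes, [200, 400])
  # unit_boxes    -> [[0.05 0.05 0.25 0.25]]
  # round-tripped -> [[10. 20. 50. 100.]]
  print(unit_boxes, denormalize_boxes(unit_boxes, [200, 400]))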
def clip_boxes(boxes, image_shape):
"""Clips boxes to image boundaries.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
clipped boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('clip_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height, width, height, width]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack([height, width, height, width], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def compute_outer_boxes(boxes, image_shape, scale=1.0):
"""Compute outer box encloses an object with a margin.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
scale: a float number specifying the scale of output outer boxes to input
`boxes`.
Returns:
outer_boxes: a tensor whose shape is the same as `boxes` representing the
outer boxes.
"""
if scale < 1.0:
raise ValueError(
'scale is {}, but outer box scale must be no less than 1.0.'.format(
scale))
centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
box_height = (boxes[..., 2] - boxes[..., 0]) * scale
box_width = (boxes[..., 3] - boxes[..., 1]) * scale
outer_boxes = tf.stack(
[centers_y - box_height / 2.0, centers_x - box_width / 2.0,
centers_y + box_height / 2.0, centers_x + box_width / 2.0],
axis=1)
outer_boxes = clip_boxes(outer_boxes, image_shape)
return outer_boxes
def encode_boxes(boxes, anchors, weights=None):
"""Encode boxes to targets.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes: a tensor whose shape is the same as `boxes` representing the
encoded box targets.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('encode_boxes'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
box_h = ymax - ymin
box_w = xmax - xmin
box_yc = ymin + 0.5 * box_h
box_xc = xmin + 0.5 * box_w
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
encoded_dw = tf.math.log(box_w / anchor_w)
if weights:
encoded_dy *= weights[0]
encoded_dx *= weights[1]
encoded_dh *= weights[2]
encoded_dw *= weights[3]
encoded_boxes = tf.concat(
[encoded_dy, encoded_dx, encoded_dh, encoded_dw], axis=-1)
return encoded_boxes
def decode_boxes(encoded_boxes, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
representing the decoded boxes.
"""
if encoded_boxes.shape[-1] != 4:
raise ValueError(
'encoded_boxes.shape[-1] is {:d}, but must be 4.'
.format(encoded_boxes.shape[-1]))
with tf.name_scope('decode_boxes'):
encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
dy = encoded_boxes[..., 0:1]
dx = encoded_boxes[..., 1:2]
dh = encoded_boxes[..., 2:3]
dw = encoded_boxes[..., 3:4]
if weights:
dy /= weights[0]
dx /= weights[1]
dh /= weights[2]
dw /= weights[3]
dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
decoded_boxes_yc = dy * anchor_h + anchor_yc
decoded_boxes_xc = dx * anchor_w + anchor_xc
decoded_boxes_h = tf.math.exp(dh) * anchor_h
decoded_boxes_w = tf.math.exp(dw) * anchor_w
decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h
decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w
decoded_boxes = tf.concat(
[decoded_boxes_ymin, decoded_boxes_xmin,
decoded_boxes_ymax, decoded_boxes_xmax],
axis=-1)
return decoded_boxes
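# Consistency sketch (illustrative): encoding boxes against anchors and then
# decoding the result should recover the original boxes up to float error.
if __name__ == '__main__':
  demo_anchors = tf.constant([[0., 0., 32., 32.]])
  demo_boxes = tf.constant([[4., 2., 28., 30.]])
  demo_encoded = encode_boxes(demo_boxes, demo_anchors)
  print(decode_boxes(demo_encoded, demo_anchors))  # ~[[ 4.  2. 28. 30.]]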
def filter_boxes(boxes, scores, image_shape, min_size_threshold):
"""Filter and remove boxes that are too small or fall outside the image.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
image_shape: a tensor whose shape is the same as, or `broadcastable` to
`boxes` except the last dimension, which is 2, representing [height,
width] of the scaled image.
min_size_threshold: a float representing the minimal box size in each side
(w.r.t. the scaled image). Boxes whose sides are smaller than it will be
filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with 0.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('filter_boxes'):
if isinstance(image_shape, (list, tuple)):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0]
width = image_shape[..., 1]
ymin = boxes[..., 0]
xmin = boxes[..., 1]
ymax = boxes[..., 2]
xmax = boxes[..., 3]
h = ymax - ymin
w = xmax - xmin
yc = ymin + 0.5 * h
xc = xmin + 0.5 * w
min_size = tf.cast(
tf.math.maximum(min_size_threshold, 0.0), dtype=boxes.dtype)
filtered_size_mask = tf.math.logical_and(
tf.math.greater(h, min_size), tf.math.greater(w, min_size))
filtered_center_mask = tf.logical_and(
tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
filtered_mask = tf.math.logical_and(
filtered_size_mask, filtered_center_mask)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
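# Example (illustrative sketch; the values below are arbitrary): remove boxes
# that are smaller than 2 pixels per side or whose centers fall outside an
# 8x8 image.
#
#   boxes = tf.constant([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5]]])
#   scores = tf.constant([[0.9, 0.7]])
#   filtered_boxes, filtered_scores = filter_boxes(
#       boxes, scores, [8.0, 8.0], min_size_threshold=2.0)
#   # The first box has 0.5-pixel sides, so it and its score are zeroed out.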
def filter_boxes_by_scores(boxes, scores, min_score_threshold):
"""Filter and remove boxes whose scores are smaller than the threshold.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
min_score_threshold: a float representing the minimal box score threshold.
Boxes whose scores are smaller than it will be filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with -1.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes_by_scores'):
filtered_mask = tf.math.greater(scores, min_score_threshold)
filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
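# Example (illustrative sketch; the values below are arbitrary): keep only
# boxes scoring above 0.5.
#
#   boxes = tf.constant([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5]]])
#   scores = tf.constant([[0.3, 0.7]])
#   kept_boxes, kept_scores = filter_boxes_by_scores(
#       boxes, scores, min_score_threshold=0.5)
#   # The first box is zeroed out and its score is replaced with -1.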
def gather_instances(selected_indices, instances, *aux_instances):
"""Gather instances by indices.
Args:
selected_indices: a Tensor of shape [batch, K] which indicates the selected
indices in instance dimension (2nd dimension).
instances: a Tensor of shape [batch, N, ...] where the 2nd dimension is
the instance dimension to be selected from.
*aux_instances: the additional Tensors whose shapes are in [batch, N, ...]
which are the tensors to be selected from using the `selected_indices`.
Returns:
selected_instances: the tensor of shape [batch, K, ...] which corresponds to
the selected instances of the `instances` tensor.
selected_aux_instances: the additional tensors of shape [batch, K, ...]
which correspond to the selected instances of the `aux_instances`
tensors.
"""
batch_size = instances.shape[0]
if batch_size == 1:
selected_instances = tf.squeeze(
tf.gather(instances, selected_indices, axis=1), axis=1)
if aux_instances:
selected_aux_instances = [
tf.squeeze(
tf.gather(a, selected_indices, axis=1), axis=1)
for a in aux_instances
]
return tuple([selected_instances] + selected_aux_instances)
else:
return selected_instances
else:
indices_shape = tf.shape(selected_indices)
batch_indices = (
tf.expand_dims(tf.range(indices_shape[0]), axis=-1) *
tf.ones([1, indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack(
[batch_indices, selected_indices], axis=-1)
selected_instances = tf.gather_nd(instances, gather_nd_indices)
if aux_instances:
selected_aux_instances = [
tf.gather_nd(a, gather_nd_indices) for a in aux_instances
]
return tuple([selected_instances] + selected_aux_instances)
else:
return selected_instances
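# Example (illustrative sketch; the values below are arbitrary): select
# instances 2 and 0 from a batch of three boxes, together with their classes.
#
#   boxes = tf.constant([[[0., 0., 1., 1.], [1., 1., 2., 2.], [2., 2., 3., 3.]]])
#   classes = tf.constant([[5, 7, 9]])
#   indices = tf.constant([[2, 0]])
#   sel_boxes, sel_classes = gather_instances(indices, boxes, classes)
#   # sel_boxes -> [[[2., 2., 3., 3.], [0., 0., 1., 1.]]], sel_classes -> [[9, 5]]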
def top_k_boxes(boxes, scores, k):
"""Sort and select top k boxes according to the scores.
Args:
boxes: a tensor of shape [batch_size, N, 4] representing the coordinates of
the boxes. N is the number of boxes per image.
scores: a tensor of shape [batch_size, N] representing the score of the
boxes.
k: an integer or a tensor indicating the top k number.
Returns:
selected_boxes: a tensor of shape [batch_size, k, 4] representing the
selected top k box coordinates.
selected_scores: a tensor of shape [batch_size, k] representing the selected
top k box scores.
"""
with tf.name_scope('top_k_boxes'):
selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
selected_boxes = gather_instances(top_k_indices, boxes)
return selected_boxes, selected_scores
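# Example (illustrative sketch; the values below are arbitrary): select the
# two highest scoring boxes.
#
#   boxes = tf.constant([[[0., 0., 1., 1.], [1., 1., 2., 2.], [2., 2., 3., 3.]]])
#   scores = tf.constant([[0.2, 0.9, 0.5]])
#   top_boxes, top_scores = top_k_boxes(boxes, scores, k=2)
#   # top_scores -> [[0.9, 0.5]]; top_boxes holds the corresponding boxes.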
def get_non_empty_box_indices(boxes):
"""Get indices for non-empty boxes."""
# Selects indices where both box height and width are greater than 0.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(tf.logical_and(tf.greater(height, 0),
tf.greater(width, 0)))
return indices[:, 0]
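# Example (illustrative sketch; the values below are arbitrary): only the
# first box has positive height and width, so only index 0 is returned.
#
#   boxes = tf.constant([[0., 0., 2., 2.], [1., 1., 1., 3.]])
#   get_non_empty_box_indices(boxes)  # -> [0]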
def bbox_overlap(boxes, gt_boxes):
"""Calculates the overlap between proposal and ground truth boxes.
Some `boxes` or `gt_boxes` may have been padded. The returned `iou` tensor
for these boxes will be -1.
Args:
boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
tensor might have paddings with a negative value.
Returns:
iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
"""
with tf.name_scope('bbox_overlap'):
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the intersection area.
i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = (
tf.math.maximum((i_xmax - i_xmin), 0) *
tf.math.maximum((i_ymax - i_ymin), 0))
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Fills -1 for IoU entries against padded (invalid) groundtruth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.logical_or(
tf.zeros_like(bb_x_min, dtype=tf.bool),
tf.transpose(gt_invalid_mask, [0, 2, 1]))
iou = tf.where(padding_mask, -tf.ones_like(iou), iou)
# Fills -1 for invalid (-1) boxes.
boxes_invalid_mask = tf.less(
tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
iou = tf.where(boxes_invalid_mask, -tf.ones_like(iou), iou)
return iou
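# Example (illustrative sketch; the values below are arbitrary): two unit
# boxes that share half their area have IoU = 0.5 / 1.5 = 1/3; padded (-1)
# groundtruth boxes map to IoU -1.
#
#   boxes = tf.constant([[[0., 0., 1., 1.]]])
#   gt_boxes = tf.constant([[[0., 0.5, 1., 1.5], [-1., -1., -1., -1.]]])
#   iou = bbox_overlap(boxes, gt_boxes)
#   # iou -> approximately [[[0.3333, -1.]]]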
def box_matching(boxes, gt_boxes, gt_classes):
"""Match boxes to groundtruth boxes.
Given the proposal boxes and the groundtruth boxes and classes, perform the
groundtruth matching by taking the argmax of the IoU between boxes and
groundtruth boxes.
Args:
boxes: a tensor of shape of [batch_size, N, 4] representing the box
coordinates to be matched to groundtruth boxes.
gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
the groundtruth box coordinates. It is padded with -1s to indicate the
invalid boxes.
gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
Returns:
matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
the matched groundtruth box coordinates for each input box. If the box
does not overlap with any groundtruth boxes, the matched boxes of it
will be set to all 0s.
matched_gt_classes: a tensor of shape of [batch_size, N], representing
the matched groundtruth classes for each input box. If the box does not
overlap with any groundtruth boxes, the matched box classes of it will
be set to 0, which corresponds to the background class.
matched_gt_indices: a tensor of shape of [batch_size, N], representing
the indices of the matched groundtruth boxes in the original gt_boxes
tensor. If the box does not overlap with any groundtruth boxes, the
index of the matched groundtruth will be set to -1.
matched_iou: a tensor of shape of [batch_size, N], representing the IoU
between the box and its matched groundtruth box. The matched IoU is the
maximum IoU of the box and all the groundtruth boxes.
iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
between boxes and the groundtruth boxes. The IoU between a box and the
invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
"""
# Compute IoU between boxes and gt_boxes.
# iou <- [batch_size, N, K]
iou = bbox_overlap(boxes, gt_boxes)
# max_iou <- [batch_size, N]
# 0.0 means no overlap with any gt box; -1.0 means only padded (invalid) gt.
matched_iou = tf.reduce_max(iou, axis=-1)
# background_box_mask <- bool, [batch_size, N]
background_box_mask = tf.less_equal(matched_iou, 0.0)
argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)
matched_gt_boxes, matched_gt_classes = gather_instances(
argmax_iou_indices, gt_boxes, gt_classes)
matched_gt_boxes = tf.where(
tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
matched_gt_boxes)
matched_gt_classes = tf.where(
background_box_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(
background_box_mask,
-tf.ones_like(argmax_iou_indices),
argmax_iou_indices)
return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
matched_iou, iou)
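# Example (illustrative sketch; the values below are arbitrary): each proposal
# is matched to the groundtruth box with the highest IoU; proposals with no
# positive overlap get the background class 0 and index -1.
#
#   boxes = tf.constant([[[0., 0., 5., 5.], [20., 20., 25., 25.]]])
#   gt_boxes = tf.constant([[[0., 0., 5., 5.], [-1., -1., -1., -1.]]])
#   gt_classes = tf.constant([[7, -1]])
#   (matched_boxes, matched_classes, matched_indices,
#    matched_iou, iou) = box_matching(boxes, gt_boxes, gt_classes)
#   # matched_classes -> [[7, 0]], matched_indices -> [[0, -1]]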
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for box_ops.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import box_ops
def _transform_boxes_on_tpu_and_cpu(transform_fn, boxes, *args):
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
transformed_op_tpu = transform_fn(boxes, *args)
transformed_boxes_tpu = tf.nest.map_structure(lambda x: x.numpy(),
transformed_op_tpu)
# Runs on CPU.
transformed_op_cpu = transform_fn(boxes, *args)
transformed_boxes_cpu = tf.nest.map_structure(lambda x: x.numpy(),
transformed_op_cpu)
return transformed_boxes_tpu, transformed_boxes_cpu
class ConvertBoxesTest(tf.test.TestCase):
def testConvertBoxes(self):
# y1, x1, y2, x2.
boxes = np.array([[0, 0, 1, 2], [0.2, 0.1, 1.2, 1.1]])
# x1, y1, width, height
target = np.array([[0, 0, 2, 1], [0.1, 0.2, 1, 1]])
outboxes = box_ops.yxyx_to_xywh(boxes)
self.assertNDArrayNear(outboxes, target, 1e-7)
class JitterBoxesTest(tf.test.TestCase):
def testJitterBoxes(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
boxes_np = np.array(boxes_data, dtype=np.float32)
max_size = max(
np.amax(boxes_np[:, 3] - boxes_np[:, 1]),
np.amax(boxes_np[:, 2] - boxes_np[:, 0]))
noise_scale = 0.025
boxes = tf.constant(boxes_np)
def jitter_fn(input_boxes, arg_noise_scale):
return box_ops.jitter_boxes(input_boxes, arg_noise_scale)
jittered_boxes_tpu, jittered_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
jitter_fn, boxes, noise_scale)
# Test that the jittered box is within 10 stds from the inputs.
self.assertNDArrayNear(jittered_boxes_tpu, boxes_np,
noise_scale * max_size * 10)
self.assertNDArrayNear(jittered_boxes_cpu, boxes_np,
noise_scale * max_size * 10)
class NormalizeBoxesTest(tf.test.TestCase):
def testNormalizeBoxes1DWithImageShapeAsList(self):
boxes = tf.constant([10, 30, 40, 90], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [0.2, 0.3, 0.8, 0.9], 1e-5)
def testNormalizeBoxes1DWithImageShapeAsTensor(self):
boxes = tf.constant([10, 30, 40, 90], tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [0.2, 0.3, 0.8, 0.9], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsList(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsVector(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = tf.constant([50, 100], dtype=tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = tf.constant([[50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes2DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[10, 30, 40, 90], [30, 10, 40, 50]], tf.float32)
image_shape = tf.constant([[50, 100], [50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsList(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsVector(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = tf.constant([[[50, 100]], [[500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(
normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]], 1e-5)
def testNormalizeBoxes3DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], tf.float32)
image_shape = tf.constant(
[[[50, 100], [50, 100]], [[500, 1000], [500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.normalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(
normalized_boxes_tpu,
[[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]], 1e-5)
class DenormalizeBoxesTest(tf.test.TestCase):
def testDenormalizeBoxes1DWithImageShapeAsList(self):
boxes = tf.constant([0.2, 0.3, 0.8, 0.9], tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [10, 30, 40, 90], 1e-5)
def testDenormalizeBoxes1DWithImageShapeAsTensor(self):
boxes = tf.constant([0.2, 0.3, 0.8, 0.9], tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu, [10, 30, 40, 90], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsList(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsVector(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = tf.constant([50, 100], dtype=tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = tf.constant([[50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes2DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
tf.float32)
image_shape = tf.constant([[50, 100], [50, 100]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[10, 30, 40, 90], [30, 10, 40, 50]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsList(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]],
tf.float32)
image_shape = [50, 100]
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsVector(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.4, 0.4, 1.0, 0.8], [0.6, 0.5, 0.8, 0.9]]],
tf.float32)
image_shape = tf.constant([50, 100], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsBroadcastableTensor(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]],
tf.float32)
image_shape = tf.constant([[[50, 100]], [[500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
def testDenormalizeBoxes3DWithImageShapeAsSameShapeTensor(self):
boxes = tf.constant([[[0.2, 0.3, 0.8, 0.9], [0.6, 0.1, 0.8, 0.5]],
[[0.04, 0.04, 0.1, 0.08], [0.06, 0.05, 0.08, 0.09]]],
tf.float32)
image_shape = tf.constant(
[[[50, 100], [50, 100]], [[500, 1000], [500, 1000]]], tf.int32)
normalized_boxes_tpu, normalized_boxes_cpu = (
_transform_boxes_on_tpu_and_cpu(
box_ops.denormalize_boxes, boxes, image_shape))
self.assertNDArrayNear(normalized_boxes_tpu, normalized_boxes_cpu, 1e-5)
self.assertNDArrayNear(normalized_boxes_tpu,
[[[10, 30, 40, 90], [30, 10, 40, 50]],
[[20, 40, 50, 80], [30, 50, 40, 90]]], 1e-5)
class ClipBoxesTest(tf.test.TestCase):
def testClipBoxesImageShapeAsList(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
image_shape = [3, 3]
boxes = tf.constant(boxes_data)
clipped_boxes_tpu, clipped_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
box_ops.clip_boxes, boxes, image_shape)
self.assertAllClose(clipped_boxes_tpu, clipped_boxes_cpu)
self.assertAllClose(clipped_boxes_tpu,
[[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]])
def testClipBoxesImageShapeAsVector(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
boxes = tf.constant(boxes_data)
image_shape = np.array([3, 3], dtype=np.float32)
clipped_boxes_tpu, clipped_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
box_ops.clip_boxes, boxes, image_shape)
self.assertAllClose(clipped_boxes_tpu, clipped_boxes_cpu)
self.assertAllClose(clipped_boxes_tpu,
[[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]])
def testClipBoxesImageShapeAsTensor(self):
boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]]
boxes = tf.constant(boxes_data)
image_shape = tf.constant([[3, 3], [3, 3], [3, 3], [3, 3], [3, 3], [3, 3]],
dtype=tf.float32)
clipped_boxes_tpu, clipped_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
box_ops.clip_boxes, boxes, image_shape)
self.assertAllClose(clipped_boxes_tpu, clipped_boxes_cpu)
self.assertAllClose(clipped_boxes_tpu,
[[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 0.3, 1, 1.3],
[0, 0.5, 1, 1.5], [0, 0.7, 1, 1.7], [0, 1.9, 1, 1.9]])
class EncodeDecodeBoxesTest(tf.test.TestCase):
def test_encode_decode_boxes(self):
boxes_np = np.array([[[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0]],
[[4.0, 5.0, 6.0, 7.0], [5.0, 6.0, 7.0, 8.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
anchors = tf.constant([[[1.5, 2.5, 3.5, 4.5], [2.5, 3.5, 4.5, 5.5]],
[[1.5, 2.5, 3.5, 4.5], [2.5, 3.5, 4.5, 5.5]]],
dtype=tf.float32)
weights = [1.0, 1.0, 1.0, 1.0]
def test_fn(boxes, anchors):
encoded_boxes = box_ops.encode_boxes(boxes, anchors, weights)
decoded_boxes = box_ops.decode_boxes(encoded_boxes, anchors, weights)
return decoded_boxes
decoded_boxes_tpu, decoded_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, anchors)
self.assertNDArrayNear(decoded_boxes_tpu, decoded_boxes_cpu, 1e-5)
self.assertNDArrayNear(decoded_boxes_tpu, boxes_np, 1e-5)
def test_encode_decode_boxes_batch_broadcast(self):
boxes_np = np.array([[[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0]],
[[4.0, 5.0, 6.0, 7.0], [5.0, 6.0, 7.0, 8.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
anchors = tf.constant([[[1.5, 2.5, 3.5, 4.5], [2.5, 3.5, 4.5, 5.5]]],
dtype=tf.float32)
weights = [1.0, 1.0, 1.0, 1.0]
def test_fn(boxes, anchors):
encoded_boxes = box_ops.encode_boxes(boxes, anchors, weights)
decoded_boxes = box_ops.decode_boxes(encoded_boxes, anchors, weights)
return decoded_boxes
decoded_boxes_tpu, decoded_boxes_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, anchors)
self.assertNDArrayNear(decoded_boxes_tpu, decoded_boxes_cpu, 1e-5)
self.assertNDArrayNear(decoded_boxes_tpu, boxes_np, 1e-5)
class FilterBoxesTest(tf.test.TestCase):
def test_filter_boxes_batch(self):
# boxes -> [[small, good, outside], [outside, small, good]]
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
filtered_boxes_np = np.array([[[0.0, 0.0, 0.0, 0.0], [2.0, 3.0, 4.5, 5.5],
[0.0, 0.0, 0.0, 0.0]],
[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
[4.0, 1.0, 7.0, 4.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.9, 0.7, 0.5], [0.11, 0.22, 0.33]])
filtered_scores_np = np.array([[0.0, 0.7, 0.0], [0.0, 0.0, 0.33]])
scores = tf.constant(scores_np, dtype=tf.float32)
image_shape = tf.expand_dims(
tf.constant([[8, 8], [8, 8]], dtype=tf.int32), axis=1)
min_size_threshold = 2.0
def test_fn(boxes, scores, image_shape):
filtered_boxes, filtered_scores = box_ops.filter_boxes(
boxes, scores, image_shape, min_size_threshold)
return filtered_boxes, filtered_scores
filtered_results_tpu, filtered_results_cpu = (
_transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores, image_shape))
filtered_boxes_tpu, filtered_scores_tpu = filtered_results_tpu
filtered_boxes_cpu, filtered_scores_cpu = filtered_results_cpu
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_cpu, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_cpu, 1e-5)
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_np, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_np, 1e-5)
class FilterBoxesByScoresTest(tf.test.TestCase):
def test_filter_boxes_by_scores_batch(self):
# boxes -> [[small, good, outside], [outside, small, good]]
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
filtered_boxes_np = np.array([[[0.0, 0.0, 0.0, 0.0], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
[4.0, 1.0, 7.0, 4.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.1, 0.7, 0.6], [0.11, 0.22, 0.53]])
filtered_scores_np = np.array([[-1.0, 0.7, 0.6], [-1.0, -1.0, 0.53]])
scores = tf.constant(scores_np, dtype=tf.float32)
min_score_threshold = 0.5
def test_fn(boxes, scores):
filtered_boxes, filtered_scores = box_ops.filter_boxes_by_scores(
boxes, scores, min_score_threshold)
return filtered_boxes, filtered_scores
filtered_results_tpu, filtered_results_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores)
filtered_boxes_tpu, filtered_scores_tpu = filtered_results_tpu
filtered_boxes_cpu, filtered_scores_cpu = filtered_results_cpu
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_cpu, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_cpu, 1e-5)
self.assertNDArrayNear(filtered_boxes_tpu, filtered_boxes_np, 1e-5)
self.assertNDArrayNear(filtered_scores_tpu, filtered_scores_np, 1e-5)
class GatherInstancesTest(tf.test.TestCase):
def test_gather_instances(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
indices_np = np.array([[2, 0], [0, 1]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
indices = tf.constant(indices_np, dtype=tf.int32)
selected_boxes = box_ops.gather_instances(indices, boxes)
expected_selected_boxes = np.array(
[[[7.0, 4.0, 9.5, 6.5], [1.0, 2.0, 1.5, 2.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0]]])
self.assertNDArrayNear(expected_selected_boxes, selected_boxes, 1e-5)
def test_gather_instances_with_multiple_inputs(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
classes_np = np.array([[1, 2, 3], [20, 30, 40]])
indices_np = np.array([[2, 0], [0, 1]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
classes = tf.constant(classes_np, dtype=tf.int32)
indices = tf.constant(indices_np, dtype=tf.int32)
selected_boxes, selected_classes = box_ops.gather_instances(
indices, boxes, classes)
expected_selected_boxes = np.array(
[[[7.0, 4.0, 9.5, 6.5], [1.0, 2.0, 1.5, 2.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0]]])
expected_selected_classes = np.array(
[[3, 1], [20, 30]])
self.assertNDArrayNear(expected_selected_boxes, selected_boxes, 1e-5)
self.assertAllEqual(expected_selected_classes, selected_classes)
class TopKBoxesTest(tf.test.TestCase):
def test_top_k_boxes_batch1(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.9, 0.5, 0.7]])
scores = tf.constant(scores_np, dtype=tf.float32)
top_k_boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [7.0, 4.0, 9.5, 6.5]]])
top_k_scores_np = np.array([[0.9, 0.7]])
def test_fn(boxes, scores):
top_k_boxes, top_k_scores = box_ops.top_k_boxes(boxes, scores, k=2)
return top_k_boxes, top_k_scores
top_k_results_tpu, top_k_results_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores)
top_k_boxes_tpu, top_k_scores_tpu = top_k_results_tpu
top_k_boxes_cpu, top_k_scores_cpu = top_k_results_cpu
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_cpu, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_cpu, 1e-5)
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_np, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_np, 1e-5)
def test_top_k_boxes_batch2(self):
boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5],
[7.0, 4.0, 9.5, 6.5]],
[[-2.0, 5.0, 0.0, 7.5], [5.0, 6.0, 5.1, 6.0],
[4.0, 1.0, 7.0, 4.0]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
scores_np = np.array([[0.9, 0.7, 0.5], [0.11, 0.22, 0.33]])
scores = tf.constant(scores_np, dtype=tf.float32)
top_k_boxes_np = np.array([[[1.0, 2.0, 1.5, 2.5], [2.0, 3.0, 4.5, 5.5]],
[[4.0, 1.0, 7.0, 4.0], [5.0, 6.0, 5.1, 6.0]]])
top_k_scores_np = np.array([[0.9, 0.7], [0.33, 0.22]])
def test_fn(boxes, scores):
top_k_boxes, top_k_scores = box_ops.top_k_boxes(boxes, scores, k=2)
return top_k_boxes, top_k_scores
top_k_results_tpu, top_k_results_cpu = _transform_boxes_on_tpu_and_cpu(
test_fn, boxes, scores)
top_k_boxes_tpu, top_k_scores_tpu = top_k_results_tpu
top_k_boxes_cpu, top_k_scores_cpu = top_k_results_cpu
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_cpu, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_cpu, 1e-5)
self.assertNDArrayNear(top_k_boxes_tpu, top_k_boxes_np, 1e-5)
self.assertNDArrayNear(top_k_scores_tpu, top_k_scores_np, 1e-5)
class BboxOverlapTest(tf.test.TestCase):
def testBBoxOverlapOpCorrectness(self):
boxes_data = [[[0, 0, 0.1, 1], [0, 0.2, 0.2, 1.2], [0, 0.3, 0.3, 1.3],
[0, 0.5, 0.4, 1.5], [0, 0.7, 0.5, 1.7], [0, 0.9, 0.6, 1.9],
[0, 0.1, 0.1, 1.1], [0, 0.3, 0.7, 1.3], [0, 0.9, 2, 1.9]],
[[0, 0, 1, 0.2], [0, 0.2, 0.5, 1.2], [0, 0.4, 0.9, 1.4],
[0, 0.6, 1.1, 1.6], [0, 0.8, 1.2, 1.8], [0, 1, 1.5, 2],
[0, 0.5, 1, 1], [0.5, 0.8, 1, 1.8], [-1, -1, -1, -1]]]
boxes_np = np.array(boxes_data, dtype=np.float32)
gt_boxes_data = [[[0, 0.1, 0.1, 1.1], [0, 0.3, 0.7, 1.3], [0, 0.9, 2, 1.9]],
[[0, 0.5, 1, 1], [0.5, 0.8, 1, 1.8], [-1, -1, -1, -1]]]
gt_boxes_np = np.array(gt_boxes_data, dtype=np.float32)
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
boxes = tf.constant(boxes_np)
gt_boxes = tf.constant(gt_boxes_np)
iou = box_ops.bbox_overlap(boxes=boxes, gt_boxes=gt_boxes)
iou = iou.numpy()
self.assertEqual(iou.shape, (2, 9, 3))
self.assertAllEqual(
np.argmax(iou, axis=2),
[[0, 0, 1, 1, 1, 2, 0, 1, 2], [0, 0, 0, 0, 1, 1, 0, 1, 0]])
def testBBoxOverlapOpCheckShape(self):
batch_size = 2
rpn_post_nms_topn = 2000
gt_max_instances = 100
boxes_np = np.random.rand(batch_size, rpn_post_nms_topn,
4).astype(np.float32)
gt_boxes_np = np.random.rand(batch_size, gt_max_instances,
4).astype(np.float32)
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
boxes = tf.constant(boxes_np)
gt_boxes = tf.constant(gt_boxes_np)
iou = box_ops.bbox_overlap(boxes=boxes, gt_boxes=gt_boxes)
iou = iou.numpy()
self.assertEqual(iou.shape,
(batch_size, rpn_post_nms_topn, gt_max_instances))
def testBBoxOverlapOpCorrectnessWithNegativeData(self):
boxes_data = [[[0, -0.01, 0.1, 1.1], [0, 0.2, 0.2, 5.0],
[0, -0.01, 0.1, 1.], [-1, -1, -1, -1]]]
boxes_np = np.array(boxes_data, dtype=np.float32)
gt_boxes_np = boxes_np
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
boxes = tf.constant(boxes_np)
gt_boxes = tf.constant(gt_boxes_np)
iou = box_ops.bbox_overlap(boxes=boxes, gt_boxes=gt_boxes)
iou = iou.numpy()
expected = np.array([[[0.99999994, 0.0917431, 0.9099099, -1.],
[0.0917431, 1., 0.08154944, -1.],
[0.9099099, 0.08154944, 1., -1.],
[-1., -1., -1., -1.]]])
self.assertAllClose(expected, iou)
class BoxMatchingTest(tf.test.TestCase):
def test_box_matching_single(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5],
[-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[2, 10, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
matched_gt_boxes_np = np.array(
[[[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[10, 10, 15, 15]]])
matched_gt_classes_np = np.array([[10, 10, 10, 2]])
matched_gt_indices_np = np.array([[1, 1, 1, 0]])
matched_iou_np = np.array(
[[0.142857142857, 1.0, 0.142857142857, 0.142857142857]])
iou_np = np.array(
[[[0, 0.142857142857, -1.0],
[0, 1.0, -1.0],
[0, 0.142857142857, -1.0],
[0.142857142857, 0, -1.0]]])
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
(matched_gt_boxes_tpu, matched_gt_classes_tpu,
matched_gt_indices_tpu, matched_iou_tpu, iou_tpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# Runs on CPU.
(matched_gt_boxes_cpu, matched_gt_classes_cpu,
matched_gt_indices_cpu, matched_iou_cpu, iou_cpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# consistency.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_cpu.numpy(), 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_cpu.numpy())
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_cpu.numpy())
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_cpu.numpy(), 1e-5)
# correctness.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_np, 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_np)
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_np)
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_np, 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_np, 1e-5)
def test_box_matching_single_no_gt(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[-1, -1, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
matched_gt_boxes_np = np.array(
[[[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]]])
matched_gt_classes_np = np.array([[0, 0, 0, 0]])
matched_gt_indices_np = np.array([[-1, -1, -1, -1]])
matched_iou_np = np.array([[-1, -1, -1, -1]])
iou_np = np.array(
[[[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]]])
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
(matched_gt_boxes_tpu, matched_gt_classes_tpu,
matched_gt_indices_tpu, matched_iou_tpu, iou_tpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# Runs on CPU.
(matched_gt_boxes_cpu, matched_gt_classes_cpu,
matched_gt_indices_cpu, matched_iou_cpu, iou_cpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# consistency.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_cpu.numpy(), 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_cpu.numpy())
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_cpu.numpy())
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_cpu.numpy(), 1e-5)
# correctness.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_np, 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_np)
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_np)
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_np, 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_np, 1e-5)
def test_box_matching_batch(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]],
[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[-1, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[2, 10, -1], [-1, -1, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
matched_gt_boxes_np = np.array(
[[[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[2.5, 2.5, 7.5, 7.5],
[10, 10, 15, 15]],
[[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]]])
matched_gt_classes_np = np.array(
[[10, 10, 10, 2],
[0, 0, 0, 0]])
matched_gt_indices_np = np.array(
[[1, 1, 1, 0],
[-1, -1, -1, -1]])
matched_iou_np = np.array(
[[0.142857142857, 1.0, 0.142857142857, 0.142857142857],
[-1, -1, -1, -1]])
iou_np = np.array(
[[[0, 0.142857142857, -1.0],
[0, 1.0, -1.0],
[0, 0.142857142857, -1.0],
[0.142857142857, 0, -1.0]],
[[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]]])
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
(matched_gt_boxes_tpu, matched_gt_classes_tpu,
matched_gt_indices_tpu, matched_iou_tpu, iou_tpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# Runs on CPU.
(matched_gt_boxes_cpu, matched_gt_classes_cpu,
matched_gt_indices_cpu, matched_iou_cpu, iou_cpu) = (
box_ops.box_matching(boxes, gt_boxes, gt_classes))
# consistency.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_cpu.numpy(), 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_cpu.numpy())
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_cpu.numpy())
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_cpu.numpy(), 1e-5)
# correctness.
self.assertNDArrayNear(
matched_gt_boxes_tpu.numpy(), matched_gt_boxes_np, 1e-5)
self.assertAllEqual(
matched_gt_classes_tpu.numpy(), matched_gt_classes_np)
self.assertAllEqual(
matched_gt_indices_tpu.numpy(), matched_gt_indices_np)
self.assertNDArrayNear(
matched_iou_tpu.numpy(), matched_iou_np, 1e-5)
self.assertNDArrayNear(
iou_tpu.numpy(), iou_np, 1e-5)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Multi scale anchor generator definition."""
import tensorflow as tf
# TODO(tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
"""Utility to generate anchors for a single feature map.
Example:
```python
anchor_gen = _SingleAnchorGenerator(32, scales=[1.], aspect_ratios=[.5, 1., 2.], stride=16)
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_size,
scales,
aspect_ratios,
stride,
clip_boxes=False):
"""Constructs single scale anchor.
Args:
anchor_size: A single int representing the base anchor size. The anchor
height will be `anchor_size / sqrt(aspect_ratio)`, anchor width will be
`anchor_size * sqrt(aspect_ratio)`.
scales: A list/tuple of positive floats representing the scale multipliers
applied to the base `anchor_size`.
aspect_ratios: A list/tuple of positive floats representing the ratio of
anchor width to anchor height.
stride: A single int representing the anchor stride, i.e. the distance
between the centers of adjacent anchors.
clip_boxes: Boolean representing whether the anchor coordinates should be
clipped to the image size. Defaults to `False`.
Input shape: the size of the image, `[H, W, C]`
Output shape: the size of anchors,
`[H / stride, W / stride, len(scales) * len(aspect_ratios) * 4]`
"""
self.anchor_size = anchor_size
self.scales = scales
self.aspect_ratios = aspect_ratios
self.stride = stride
self.clip_boxes = clip_boxes
def __call__(self, image_size):
image_height = tf.cast(image_size[0], tf.float32)
image_width = tf.cast(image_size[1], tf.float32)
k = len(self.scales) * len(self.aspect_ratios)
aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32)
anchor_size = tf.cast(self.anchor_size, tf.float32)
# [K]
anchor_heights = []
anchor_widths = []
for scale in self.scales:
anchor_size_t = anchor_size * scale
anchor_height = anchor_size_t / aspect_ratios_sqrt
anchor_width = anchor_size_t * aspect_ratios_sqrt
anchor_heights.append(anchor_height)
anchor_widths.append(anchor_width)
anchor_heights = tf.concat(anchor_heights, axis=0)
anchor_widths = tf.concat(anchor_widths, axis=0)
half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k])
half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k])
stride = tf.cast(self.stride, tf.float32)
# [W]
cx = tf.range(0.5 * stride, image_width, stride)
# [H]
cy = tf.range(0.5 * stride, image_height, stride)
# [H, W]
cx_grid, cy_grid = tf.meshgrid(cx, cy)
# [H, W, 1]
cx_grid = tf.expand_dims(cx_grid, axis=-1)
cy_grid = tf.expand_dims(cy_grid, axis=-1)
# [H, W, K, 1]
y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1)
y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1)
x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1)
x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1)
if self.clip_boxes:
y_min = tf.maximum(tf.minimum(y_min, image_height), 0.)
y_max = tf.maximum(tf.minimum(y_max, image_height), 0.)
x_min = tf.maximum(tf.minimum(x_min, image_width), 0.)
x_max = tf.maximum(tf.minimum(x_max, image_width), 0.)
# [H, W, K, 4]
result = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
shape = result.shape.as_list()
# [H, W, K * 4]
return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])
class AnchorGenerator():
"""Utility to generate anchors for a multiple feature maps.
Example:
```python
anchor_gen = AnchorGenerator([32, 64], scales=[1.],
aspect_ratios=[.5, 1., 2.], strides=[16, 32])
anchors = anchor_gen([512, 512, 3])
```
"""
def __init__(self,
anchor_sizes,
scales,
aspect_ratios,
strides,
clip_boxes=False):
"""Constructs multiscale anchors.
Args:
anchor_sizes: A list of ints representing the anchor size for each scale.
The anchor height will be `anchor_size / sqrt(aspect_ratio)`, anchor
width will be `anchor_size * sqrt(aspect_ratio)` for each scale.
scales: A list/tuple of positive floats, or a list/tuple of such
lists/tuples (one per level), representing the scale multipliers applied
to the base `anchor_size`.
aspect_ratios: A list/tuple of positive floats, or a list/tuple of such
lists/tuples (one per level), representing the ratio of anchor width to
anchor height.
strides: A list/tuple of ints representing the anchor stride at each
scale, i.e. the distance between the centers of adjacent anchors.
clip_boxes: Boolean representing whether the anchor coordinates should be
clipped to the image size. Defaults to `False`.
Input shape: the size of the image, `[H, W, C]`
Output shape: a list or dict of per-level anchors, each of shape
`[H / stride, W / stride, K * 4]`, where `K` is the number of anchors per
location at that level.
"""
# aspect_ratio is a single list that is the same across all levels.
aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes)
scales = maybe_map_structure_for_anchor(scales, anchor_sizes)
if isinstance(anchor_sizes, dict):
self.anchor_generators = {}
for k in anchor_sizes.keys():
self.anchor_generators[k] = _SingleAnchorGenerator(
anchor_sizes[k], scales[k], aspect_ratios[k], strides[k],
clip_boxes)
elif isinstance(anchor_sizes, (list, tuple)):
self.anchor_generators = []
for anchor_size, scale_list, ar_list, stride in zip(
anchor_sizes, scales, aspect_ratios, strides):
self.anchor_generators.append(
_SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride,
clip_boxes))
def __call__(self, image_size):
anchor_generators = tf.nest.flatten(self.anchor_generators)
results = [anchor_gen(image_size) for anchor_gen in anchor_generators]
return tf.nest.pack_sequence_as(self.anchor_generators, results)
def maybe_map_structure_for_anchor(params, anchor_sizes):
"""broadcast the params to match anchor_sizes."""
if all(isinstance(param, (int, float)) for param in params):
if isinstance(anchor_sizes, (tuple, list)):
return [params] * len(anchor_sizes)
elif isinstance(anchor_sizes, dict):
return tf.nest.map_structure(lambda _: params, anchor_sizes)
else:
raise ValueError("the structure of `anchor_sizes` must be a tuple, "
"list, or dict, given {}".format(anchor_sizes))
else:
return params
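# Example (illustrative sketch; the values below are arbitrary): a single flat
# list of aspect ratios is broadcast to every level, while per-level lists are
# passed through unchanged.
#
#   maybe_map_structure_for_anchor([0.5, 1.0, 2.0], anchor_sizes=[32, 64])
#   # -> [[0.5, 1.0, 2.0], [0.5, 1.0, 2.0]]
#   maybe_map_structure_for_anchor([[1.0], [1.0]], anchor_sizes=[32, 64])
#   # -> [[1.0], [1.0]] (unchanged)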
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for anchor_generator.py."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.beta.ops.experimental import anchor_generator
class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]]),
# Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25],
[[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]),
)
def testAnchorGeneration(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=False)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Single scale anchor.
(5, [1.0], [[[0., 0., 48., 48.], [0., 16., 48., 64.]],
[[16., 0., 64., 48.], [16., 16., 64., 64.]]]),
# Multi aspect ratio anchor.
(6, [1.0, 4.0, 0.25],
[[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]),
)
def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes):
image_size = [64, 64]
anchor_size = 2**(level + 1)
stride = 2**level
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=True)
anchors = anchor_gen(image_size).numpy()
self.assertAllClose(expected_boxes, anchors)
@combinations.generate(
combinations.combine(distribution=strategy_combinations.all_strategies))
def testAnchorGenerationDistributed(self, distribution):
image_size = [64, 64]
anchor_size = 64
stride = 32
aspect_ratios = [1.0]
with distribution.scope():
anchor_gen = anchor_generator._SingleAnchorGenerator(
anchor_size=anchor_size,
scales=[1.],
aspect_ratios=aspect_ratios,
stride=stride,
clip_boxes=False)
anchors = anchor_gen(image_size).numpy()
expected_boxes = [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]]
self.assertAllClose(expected_boxes, anchors)
class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80],
[16, -16, 80, 48], [16, 16, 80, 80],
[-32, -32, 96, 96]]),)
def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = [2**(level + 1) for level in levels]
strides = [2**level for level in levels]
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
anchors = tf.concat(anchors, axis=0).numpy()
self.assertAllClose(expected_boxes, anchors)
@parameterized.parameters(
# Multi scale anchor.
(5, 6, [1.0], {
5: [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
[[16., -16., 80., 48.], [16., 16., 80., 80.]]],
6: [[[-32, -32, 96, 96]]]
}),)
def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
expected_boxes):
image_size = [64, 64]
levels = range(min_level, max_level + 1)
anchor_sizes = dict((level, 2**(level + 1)) for level in levels)
strides = dict((level, 2**level) for level in levels)
anchor_gen = anchor_generator.AnchorGenerator(
anchor_sizes=anchor_sizes,
scales=[1.],
aspect_ratios=aspect_ratios,
strides=strides,
clip_boxes=False)
anchors = anchor_gen(image_size)
for k in expected_boxes.keys():
self.assertAllClose(expected_boxes[k], anchors[k].numpy())
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for segmentations."""
import math
# Import libraries
from cvx2 import latest as cv2
import numpy as np
def paste_instance_masks(masks,
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation results.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = min(max(ref_box[0], 0), image_width)
x_1 = min(max(ref_box[2] + 1, 0), image_width)
y_0 = min(max(ref_box[1], 0), image_height)
y_1 = min(max(ref_box[3] + 1, 0), image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(x_0 - ref_box[0]):(x_1 - ref_box[0])
]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
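# Example (illustrative sketch; the mask and box values below are arbitrary):
# paste a single 6x6 mask, predicted for a box given in [x, y, width, height]
# form, onto a 10x10 image canvas.
#
#   masks = np.random.randint(0, 2, (1, 6, 6)).astype(np.float32)
#   detected_boxes = np.array([[2.0, 3.0, 6.0, 6.0]])  # x, y, w, h
#   segms = paste_instance_masks(masks, detected_boxes, 10, 10)
#   # segms has shape (1, 10, 10) with the binarized mask pasted at the box.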
def paste_instance_masks_v2(masks,
detected_boxes,
image_height,
image_width):
"""Paste instance masks to generate the image segmentation (v2).
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
_, mask_height, mask_width = masks.shape
segms = []
for i, mask in enumerate(masks):
box = detected_boxes[i, :]
xmin = box[0]
ymin = box[1]
xmax = xmin + box[2]
ymax = ymin + box[3]
# Sample points of the cropped mask w.r.t. the image grid.
# Note that these coordinates may fall beyond the image.
# Pixel clipping will happen after warping.
xmin_int = int(math.floor(xmin))
xmax_int = int(math.ceil(xmax))
ymin_int = int(math.floor(ymin))
ymax_int = int(math.ceil(ymax))
alpha = box[2] / (1.0 * mask_width)
beta = box[3] / (1.0 * mask_height)
# pylint: disable=invalid-name
# Transformation from mask pixel indices to image coordinate.
M_mask_to_image = np.array(
[[alpha, 0, xmin],
[0, beta, ymin],
[0, 0, 1]],
dtype=np.float32)
# Transformation from image to cropped mask coordinate.
M_image_to_crop = np.array(
[[1, 0, -xmin_int],
[0, 1, -ymin_int],
[0, 0, 1]],
dtype=np.float32)
M = np.dot(M_image_to_crop, M_mask_to_image)
# Compensate the half pixel offset that OpenCV has in the
# warpPerspective implementation: the top-left pixel is sampled
# at (0,0), but we want it to be at (0.5, 0.5).
M = np.dot(
np.dot(
np.array([[1, 0, -0.5],
[0, 1, -0.5],
[0, 0, 1]], np.float32),
M),
np.array([[1, 0, 0.5],
[0, 1, 0.5],
[0, 0, 1]], np.float32))
# pylint: enable=invalid-name
cropped_mask = cv2.warpPerspective(
mask.astype(np.float32), M,
(xmax_int - xmin_int, ymax_int - ymin_int))
cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)
img_mask = np.zeros((image_height, image_width))
x0 = max(min(xmin_int, image_width), 0)
x1 = max(min(xmax_int, image_width), 0)
y0 = max(min(ymin_int, image_height), 0)
y1 = max(min(ymax_int, image_height), 0)
img_mask[y0:y1, x0:x1] = cropped_mask[
(y0 - ymin_int):(y1 - ymin_int),
(x0 - xmin_int):(x1 - xmin_int)]
segms.append(img_mask)
segms = np.array(segms)
return segms
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for mask_ops.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import mask_ops
class MaskUtilsTest(tf.test.TestCase):
def testPasteInstanceMasks(self):
image_height = 10
image_width = 10
mask_height = 6
mask_width = 6
masks = np.random.randint(0, 255, (1, mask_height, mask_width))
detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])
_ = mask_ops.paste_instance_masks(
masks, detected_boxes, image_height, image_width)
def testPasteInstanceMasksV2(self):
image_height = 10
image_width = 10
mask_height = 6
mask_width = 6
masks = np.random.randint(0, 255, (1, mask_height, mask_width))
detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])
image_masks = mask_ops.paste_instance_masks_v2(
masks, detected_boxes, image_height, image_width)
self.assertNDArrayNear(
image_masks[:, 2:8, 0:6],
np.array(masks > 0.5, dtype=np.uint8),
1e-5)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow implementation of non max suppression."""
# Import libraries
import tensorflow as tf
from official.vision.beta.ops import box_ops
NMS_TILE_SIZE = 512
def _self_suppression(iou, _, iou_sum):
batch_size = tf.shape(iou)[0]
can_suppress_others = tf.cast(
tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
iou_suppressed = tf.reshape(
tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
[batch_size, -1, 1]) * iou
iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
return [
iou_suppressed,
tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
]
def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
batch_size = tf.shape(boxes)[0]
new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
iou = box_ops.bbox_overlap(new_slice, box_slice)
ret_slice = tf.expand_dims(
tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
2) * box_slice
return boxes, ret_slice, iou_threshold, inner_idx + 1
def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
"""Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).
Args:
boxes: a tensor with a shape of [batch_size, anchors, 4].
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
    output_size: an int32 tensor of size [batch_size] representing the number
      of selected boxes for each batch.
    idx: an integer scalar representing the induction variable.
Returns:
boxes: updated boxes.
iou_threshold: pass down iou_threshold to the next iteration.
output_size: the updated output_size.
idx: the updated induction variable.
"""
num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE
batch_size = tf.shape(boxes)[0]
# Iterates over tiles that can possibly suppress the current tile.
box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
_, box_slice, _, _ = tf.while_loop(
lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
_cross_suppression, [boxes, box_slice, iou_threshold,
tf.constant(0)])
# Iterates over the current tile to compute self-suppression.
iou = box_ops.bbox_overlap(box_slice, box_slice)
mask = tf.expand_dims(
tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
suppressed_iou, _, _ = tf.while_loop(
lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression,
[iou, tf.constant(True),
tf.reduce_sum(iou, [1, 2])])
suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)
# Uses box_slice to update the input boxes.
mask = tf.reshape(
tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
boxes = tf.tile(tf.expand_dims(
box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape(
boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask)
boxes = tf.reshape(boxes, [batch_size, -1, 4])
# Updates output_size.
output_size += tf.reduce_sum(
tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
return boxes, iou_threshold, output_size, idx + 1
def sorted_non_max_suppression_padded(scores,
boxes,
max_output_size,
iou_threshold):
"""A wrapper that handles non-maximum suppression.
Assumption:
* The boxes are sorted by scores unless the box is a dot (all coordinates
are zero).
* Boxes with higher scores can be used to suppress boxes with lower scores.
  The overall design of the algorithm is to handle boxes tile-by-tile:
  boxes = boxes.pad_to_multiple_of(tile_size)
num_tiles = len(boxes) // tile_size
output_boxes = []
for i in range(num_tiles):
box_tile = boxes[i*tile_size : (i+1)*tile_size]
    for j in range(i):
suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
iou = bbox_overlap(box_tile, suppressing_tile)
# if the box is suppressed in iou, clear it to a dot
box_tile *= _update_boxes(iou)
    # Iteratively handle the diagonal tile.
iou = _box_overlap(box_tile, box_tile)
iou_changed = True
while iou_changed:
# boxes that are not suppressed by anything else
suppressing_boxes = _get_suppressing_boxes(iou)
# boxes that are suppressed by suppressing_boxes
suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
# clear iou to 0 for boxes that are suppressed, as they cannot be used
# to suppress other boxes any more
new_iou = _clear_iou(iou, suppressed_boxes)
iou_changed = (new_iou != iou)
iou = new_iou
    # remaining boxes that can still suppress others are the selected boxes.
output_boxes.append(_get_suppressing_boxes(iou))
if len(output_boxes) >= max_output_size:
break
Args:
scores: a tensor with a shape of [batch_size, anchors].
boxes: a tensor with a shape of [batch_size, anchors, 4].
max_output_size: a scalar integer `Tensor` representing the maximum number
of boxes to be selected by non max suppression.
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
Returns:
    nms_scores: a tensor with a shape of [batch_size, max_output_size]. It has
      the same dtype as the input scores.
    nms_proposals: a tensor with a shape of [batch_size, max_output_size, 4].
      It has the same dtype as the input boxes.
"""
batch_size = tf.shape(boxes)[0]
num_boxes = tf.shape(boxes)[1]
pad = tf.cast(
tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
tf.int32) * NMS_TILE_SIZE - num_boxes
boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
scores = tf.pad(
tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
num_boxes += pad
def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
return tf.logical_and(
tf.reduce_min(output_size) < max_output_size,
idx < num_boxes // NMS_TILE_SIZE)
selected_boxes, _, output_size, _ = tf.while_loop(
_loop_cond, _suppression_loop_body, [
boxes, iou_threshold,
tf.zeros([batch_size], tf.int32),
tf.constant(0)
])
idx = num_boxes - tf.cast(
tf.nn.top_k(
tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
tf.int32)
idx = tf.minimum(idx, num_boxes - 1)
idx = tf.reshape(
idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1])
boxes = tf.reshape(
tf.gather(tf.reshape(boxes, [-1, 4]), idx),
[batch_size, max_output_size, 4])
boxes = boxes * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
output_size, [-1, 1, 1]), boxes.dtype)
scores = tf.reshape(
tf.gather(tf.reshape(scores, [-1, 1]), idx),
[batch_size, max_output_size])
scores = scores * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
output_size, [-1, 1]), scores.dtype)
return scores, boxes
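# Illustrative usage sketch (not part of the original file): the input scores
# must already be sorted in descending order per batch element; the values
# below are hypothetical.
def _example_sorted_non_max_suppression_padded():
  scores = tf.constant([[0.9, 0.8, 0.1]])
  boxes = tf.constant(
      [[[0., 0., 1., 1.], [0., 0., 1., 1.], [0., 2., 1., 3.]]])
  nms_scores, nms_boxes = sorted_non_max_suppression_padded(
      scores, boxes, max_output_size=2, iou_threshold=0.5)
  # nms_scores has shape [1, 2] and nms_boxes has shape [1, 2, 4].
  return nms_scores, nms_boxes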
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for nms.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.beta.ops import nms
class SortedNonMaxSuppressionTest(tf.test.TestCase):
def setUp(self):
super(SortedNonMaxSuppressionTest, self).setUp()
self.boxes_data = [[[0, 0, 1, 1], [0, 0.2, 1, 1.2], [0, 0.4, 1, 1.4],
[0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]],
[[0, 2, 1, 2], [0, 0.8, 1, 1.8], [0, 0.6, 1, 1.6],
[0, 0.4, 1, 1.4], [0, 0.2, 1, 1.2], [0, 0, 1, 1]]]
self.scores_data = [[0.9, 0.7, 0.6, 0.5, 0.4, 0.3],
[0.8, 0.7, 0.6, 0.5, 0.4, 0.3]]
self.max_output_size = 6
self.iou_threshold = 0.5
def testSortedNonMaxSuppressionOnTPU(self):
boxes_np = np.array(self.boxes_data, dtype=np.float32)
scores_np = np.array(self.scores_data, dtype=np.float32)
iou_threshold_np = np.array(self.iou_threshold, dtype=np.float32)
boxes = tf.constant(boxes_np)
scores = tf.constant(scores_np)
iou_threshold = tf.constant(iou_threshold_np)
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
scores_tpu, boxes_tpu = nms.sorted_non_max_suppression_padded(
boxes=boxes,
scores=scores,
max_output_size=self.max_output_size,
iou_threshold=iou_threshold)
self.assertEqual(boxes_tpu.numpy().shape, (2, self.max_output_size, 4))
self.assertAllClose(scores_tpu.numpy(),
[[0.9, 0.6, 0.4, 0.3, 0., 0.],
[0.8, 0.7, 0.5, 0.3, 0., 0.]])
def testSortedNonMaxSuppressionOnCPU(self):
boxes_np = np.array(self.boxes_data, dtype=np.float32)
scores_np = np.array(self.scores_data, dtype=np.float32)
iou_threshold_np = np.array(self.iou_threshold, dtype=np.float32)
boxes = tf.constant(boxes_np)
scores = tf.constant(scores_np)
iou_threshold = tf.constant(iou_threshold_np)
# Runs on CPU.
scores_cpu, boxes_cpu = nms.sorted_non_max_suppression_padded(
boxes=boxes,
scores=scores,
max_output_size=self.max_output_size,
iou_threshold=iou_threshold)
self.assertEqual(boxes_cpu.numpy().shape, (2, self.max_output_size, 4))
self.assertAllClose(scores_cpu.numpy(),
[[0.9, 0.6, 0.4, 0.3, 0., 0.],
[0.8, 0.7, 0.5, 0.3, 0., 0.]])
def testSortedNonMaxSuppressionOnTPUSpeed(self):
boxes_np = np.random.rand(2, 12000, 4).astype(np.float32)
scores_np = np.random.rand(2, 12000).astype(np.float32)
iou_threshold_np = np.array(0.7, dtype=np.float32)
boxes = tf.constant(boxes_np)
scores = tf.constant(scores_np)
iou_threshold = tf.constant(iou_threshold_np)
# Runs on TPU.
strategy = tf.distribute.experimental.TPUStrategy()
with strategy.scope():
scores_tpu, boxes_tpu = nms.sorted_non_max_suppression_padded(
boxes=boxes,
scores=scores,
max_output_size=2000,
iou_threshold=iou_threshold)
self.assertEqual(scores_tpu.numpy().shape, (2, 2000))
self.assertEqual(boxes_tpu.numpy().shape, (2, 2000, 4))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preprocessing ops."""
import math
from six.moves import range
import tensorflow as tf
from official.vision.beta.ops import box_ops
CENTER_CROP_FRACTION = 0.875
def clip_or_pad_to_fixed_size(input_tensor, size, constant_values=0):
"""Pads data to a fixed length at the first dimension.
Args:
input_tensor: `Tensor` with any dimension.
size: `int` number for the first dimension of output Tensor.
constant_values: `int` value assigned to the paddings.
Returns:
`Tensor` with the first dimension padded to `size`.
"""
input_shape = input_tensor.get_shape().as_list()
padding_shape = []
  # Computes the padding length on the first dimension and clips the input
  # tensor if it is longer than `size`.
input_length = tf.shape(input_tensor)[0]
input_length = tf.clip_by_value(input_length, 0, size)
input_tensor = input_tensor[:input_length]
padding_length = tf.maximum(0, size - input_length)
padding_shape.append(padding_length)
# Copies shapes of the rest of input shape dimensions.
for i in range(1, len(input_shape)):
padding_shape.append(tf.shape(input_tensor)[i])
# Pads input tensor to the fixed first dimension.
paddings = tf.cast(constant_values * tf.ones(padding_shape),
input_tensor.dtype)
padded_tensor = tf.concat([input_tensor, paddings], axis=0)
output_shape = input_shape
output_shape[0] = size
padded_tensor.set_shape(output_shape)
return padded_tensor
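# Illustrative usage sketch (not part of the original file): a [2, 3] tensor
# is zero-padded along the first dimension to a fixed size of 4.
def _example_clip_or_pad_to_fixed_size():
  data = tf.ones([2, 3])
  padded = clip_or_pad_to_fixed_size(data, size=4, constant_values=0)
  # padded has shape [4, 3]; the last two rows are all zeros.
  return padded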
def normalize_image(image,
offset=(0.485, 0.456, 0.406),
scale=(0.229, 0.224, 0.225)):
"""Normalizes the image to zero mean and unit variance."""
with tf.name_scope('normalize_image'):
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
offset = tf.constant(offset)
offset = tf.expand_dims(offset, axis=0)
offset = tf.expand_dims(offset, axis=0)
image -= offset
scale = tf.constant(scale)
scale = tf.expand_dims(scale, axis=0)
scale = tf.expand_dims(scale, axis=0)
image /= scale
return image
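# Illustrative usage sketch (not part of the original file): uint8 images are
# first scaled to [0, 1] and then shifted/scaled channel-wise with the
# ImageNet-style defaults above.
def _example_normalize_image():
  image = tf.zeros([4, 4, 3], dtype=tf.uint8)
  normalized = normalize_image(image)
  # Each channel c now holds (0.0 - offset[c]) / scale[c].
  return normalized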
def compute_padded_size(desired_size, stride):
"""Compute the padded size given the desired size and the stride.
  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride which is no less than the desired
  dimension. For example, if desired_size = (100, 200) and stride = 32,
  the output padded_size = (128, 224).
Args:
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the target output image size.
stride: an integer, the stride of the backbone network.
Returns:
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size.
"""
  if isinstance(desired_size, (list, tuple)):
padded_size = [int(math.ceil(d * 1.0 / stride) * stride)
for d in desired_size]
else:
padded_size = tf.cast(
tf.math.ceil(
tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.int32)
return padded_size
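# Illustrative usage sketch (not part of the original file), matching the
# example in the docstring above.
def _example_compute_padded_size():
  padded = compute_padded_size((100, 200), 32)
  assert padded == [128, 224]
  return padded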
def resize_and_crop_image(image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (RetinaNet style).
Resize and pad images given the desired output size of the image and
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and rescale the image to make it
the largest rectangle to be bounded by the rectangle specified by the
`desired_size`.
2. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the desired actual output image size.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
    aug_scale_min: a `float` in range [0, 1.0] representing the minimum random
      scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` in range [1.0, inf) representing the maximum
      random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(
scaled_size[0] / image_size[0], scaled_size[1] / image_size[1])
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[
offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.constant(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32)])
return output_image, image_info
def resize_and_crop_image_v2(image,
short_side,
long_side,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (Faster R-CNN style).
Resize and pad images given the specified short / long side length and the
stride size.
Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and first try to rescale the
     short side of the original image to `short_side`.
  2. If the scaled image after step 1 has a long side that exceeds `long_side`,
     keep the aspect ratio and rescale the long side of the image to
     `long_side`.
  3. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
short_side: a scalar `Tensor` or `int` representing the desired short side
to be rescaled to.
long_side: a scalar `Tensor` or `int` representing the desired long side to
be rescaled to.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
    aug_scale_min: a `float` in range [0, 1.0] representing the minimum random
      scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` in range [1.0, inf) representing the maximum
      random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image_v2'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
scale_using_short_side = (
short_side / tf.math.minimum(image_size[0], image_size[1]))
scale_using_long_side = (
long_side / tf.math.maximum(image_size[0], image_size[1]))
scaled_size = tf.math.round(image_size * scale_using_short_side)
scaled_size = tf.where(
tf.math.greater(
tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
tf.math.round(image_size * scale_using_long_side),
scaled_size)
desired_size = scaled_size
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform(
[], aug_scale_min, aug_scale_max, seed=seed)
scaled_size = tf.math.round(random_scale * scaled_size)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[
offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(
scaled_image, 0, 0, padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32),
image_scale,
tf.cast(offset, tf.float32)])
return output_image, image_info
def center_crop_image(image):
"""Center crop a square shape slice from the input image.
It crops a square shape slice from the image. The side of the actual crop
is 224 / 256 = 0.875 of the short side of the original image. References:
[1] Very Deep Convolutional Networks for Large-Scale Image Recognition
https://arxiv.org/abs/1409.1556
[2] Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
Args:
image: a Tensor of shape [height, width, 3] representing the input image.
Returns:
cropped_image: a Tensor representing the center cropped image.
"""
with tf.name_scope('center_crop_image'):
image_size = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
crop_size = (
CENTER_CROP_FRACTION * tf.math.minimum(image_size[0], image_size[1]))
crop_offset = tf.cast((image_size - crop_size) / 2.0, dtype=tf.int32)
crop_size = tf.cast(crop_size, dtype=tf.int32)
cropped_image = image[
crop_offset[0]:crop_offset[0] + crop_size,
crop_offset[1]:crop_offset[1] + crop_size, :]
return cropped_image
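# Illustrative usage sketch (not part of the original file): for a 400x600
# image the crop side is 0.875 * 400 = 350.
def _example_center_crop_image():
  image = tf.zeros([400, 600, 3])
  cropped = center_crop_image(image)
  # cropped has shape [350, 350, 3].
  return cropped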
def center_crop_image_v2(image_bytes, image_shape):
"""Center crop a square shape slice from the input image.
It crops a square shape slice from the image. The side of the actual crop
is 224 / 256 = 0.875 of the short side of the original image. References:
[1] Very Deep Convolutional Networks for Large-Scale Image Recognition
https://arxiv.org/abs/1409.1556
[2] Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
This is a faster version of `center_crop_image` which takes the original
image bytes and image size as the inputs, and partially decode the JPEG
bytes according to the center crop.
Args:
image_bytes: a Tensor of type string representing the raw image bytes.
image_shape: a Tensor specifying the shape of the raw image.
Returns:
cropped_image: a Tensor representing the center cropped image.
"""
with tf.name_scope('center_image_crop_v2'):
image_shape = tf.cast(image_shape, tf.float32)
crop_size = (
CENTER_CROP_FRACTION * tf.math.minimum(image_shape[0], image_shape[1]))
crop_offset = tf.cast((image_shape - crop_size) / 2.0, dtype=tf.int32)
crop_size = tf.cast(crop_size, dtype=tf.int32)
crop_window = tf.stack(
[crop_offset[0], crop_offset[1], crop_size, crop_size])
cropped_image = tf.image.decode_and_crop_jpeg(
image_bytes, crop_window, channels=3)
return cropped_image
def random_crop_image(image,
aspect_ratio_range=(3. / 4., 4. / 3.),
area_range=(0.08, 1.0),
max_attempts=10,
seed=1):
"""Randomly crop an arbitrary shaped slice from the input image.
Args:
image: a Tensor of shape [height, width, 3] representing the input image.
aspect_ratio_range: a list of floats. The cropped area of the image must
have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image that satisfies the specified constraints. After max_attempts
      failures, the entire image is returned.
seed: the seed of the random generator.
Returns:
cropped_image: a Tensor representing the random cropped image. Can be the
original image if max_attempts is exhausted.
"""
with tf.name_scope('random_crop_image'):
crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
tf.shape(image),
tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
seed=seed,
min_object_covered=area_range[0],
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts)
cropped_image = tf.slice(image, crop_offset, crop_size)
return cropped_image
def random_crop_image_v2(image_bytes,
image_shape,
aspect_ratio_range=(3. / 4., 4. / 3.),
area_range=(0.08, 1.0),
max_attempts=10,
seed=1):
"""Randomly crop an arbitrary shaped slice from the input image.
This is a faster version of `random_crop_image` which takes the original
image bytes and image size as the inputs, and partially decode the JPEG
bytes according to the generated crop.
Args:
image_bytes: a Tensor of type string representing the raw image bytes.
image_shape: a Tensor specifying the shape of the raw image.
aspect_ratio_range: a list of floats. The cropped area of the image must
have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image that satisfies the specified constraints. After max_attempts
      failures, the entire image is returned.
seed: the seed of the random generator.
Returns:
cropped_image: a Tensor representing the random cropped image. Can be the
original image if max_attempts is exhausted.
"""
with tf.name_scope('random_crop_image_v2'):
crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
image_shape,
tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
seed=seed,
min_object_covered=area_range[0],
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts)
offset_y, offset_x, _ = tf.unstack(crop_offset)
crop_height, crop_width, _ = tf.unstack(crop_size)
crop_window = tf.stack([offset_y, offset_x, crop_height, crop_width])
cropped_image = tf.image.decode_and_crop_jpeg(
image_bytes, crop_window, channels=3)
return cropped_image
def resize_and_crop_boxes(boxes,
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset.
Args:
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
"""
with tf.name_scope('resize_and_crop_boxes'):
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = box_ops.clip_boxes(boxes, output_size)
return boxes
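# Illustrative usage sketch (not part of the original file): boxes in
# [ymin, xmin, ymax, xmax] order are scaled, shifted by the crop offset and
# clipped; all values below are hypothetical.
def _example_resize_and_crop_boxes():
  boxes = tf.constant([[10., 20., 30., 40.]])
  scaled_boxes = resize_and_crop_boxes(
      boxes,
      image_scale=tf.constant([0.5, 0.5]),
      output_size=[100, 100],
      offset=tf.constant([0., 0.]))
  # scaled_boxes is [[5., 10., 15., 20.]].
  return scaled_boxes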
def resize_and_crop_masks(masks,
image_scale,
output_size,
offset):
"""Resizes boxes to output size with scale and offset.
Args:
masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
"""
with tf.name_scope('resize_and_crop_masks'):
mask_size = tf.cast(tf.shape(masks)[1:3], tf.float32)
# Pad masks to avoid empty mask annotations.
masks = tf.concat(
[tf.zeros([1, mask_size[0], mask_size[1], 1]), masks], axis=0)
scaled_size = tf.cast(image_scale * mask_size, tf.int32)
scaled_masks = tf.image.resize(
masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
offset = tf.cast(offset, tf.int32)
scaled_masks = scaled_masks[
:,
offset[0]:offset[0] + output_size[0],
offset[1]:offset[1] + output_size[1],
:]
output_masks = tf.image.pad_to_bounding_box(
scaled_masks, 0, 0, output_size[0], output_size[1])
    # Removes the dummy mask that was prepended above.
    output_masks = output_masks[1:]
return output_masks
def horizontal_flip_image(image):
"""Flips image horizontally."""
return tf.image.flip_left_right(image)
def horizontal_flip_boxes(normalized_boxes):
"""Flips normalized boxes horizontally."""
ymin, xmin, ymax, xmax = tf.split(
value=normalized_boxes, num_or_size_splits=4, axis=1)
flipped_xmin = tf.subtract(1.0, xmax)
flipped_xmax = tf.subtract(1.0, xmin)
flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1)
return flipped_boxes
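# Illustrative usage sketch (not part of the original file): boxes are in
# normalized [ymin, xmin, ymax, xmax] order, so only the x coordinates are
# mirrored around the vertical center line.
def _example_horizontal_flip_boxes():
  boxes = tf.constant([[0.1, 0.2, 0.5, 0.6]])
  flipped = horizontal_flip_boxes(boxes)
  # flipped is [[0.1, 0.4, 0.5, 0.8]].
  return flipped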
def horizontal_flip_masks(masks):
"""Flips masks horizontally."""
return masks[:, :, ::-1]
def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1):
"""Randomly flips input image and bounding boxes."""
with tf.name_scope('random_horizontal_flip'):
do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)
image = tf.cond(
do_flip,
lambda: horizontal_flip_image(image),
lambda: image)
if normalized_boxes is not None:
normalized_boxes = tf.cond(
do_flip,
lambda: horizontal_flip_boxes(normalized_boxes),
lambda: normalized_boxes)
if masks is not None:
masks = tf.cond(
do_flip,
lambda: horizontal_flip_masks(masks),
lambda: masks)
return image, normalized_boxes, masks
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils for processing video dataset features."""
from typing import Optional
import tensorflow as tf
def _sample_or_pad_sequence_indices(sequence: tf.Tensor,
num_steps: int,
stride: int,
offset: tf.Tensor) -> tf.Tensor:
"""Returns indices to take for sampling or padding sequences to fixed size."""
sequence_length = tf.shape(sequence)[0]
sel_idx = tf.range(sequence_length)
# Repeats sequence until num_steps are available in total.
max_length = num_steps * stride + offset
num_repeats = tf.math.floordiv(
max_length + sequence_length - 1, sequence_length)
sel_idx = tf.tile(sel_idx, [num_repeats])
steps = tf.range(offset, offset + num_steps * stride, stride)
return tf.gather(sel_idx, steps)
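# Illustrative sketch (not part of the original file): for a sequence of
# length 5 with num_steps=4, stride=2 and offset=1, the sequence is repeated
# so the returned frame indices wrap around: [1, 3, 0, 2].
def _example_sample_or_pad_sequence_indices():
  indices = _sample_or_pad_sequence_indices(
      sequence=tf.range(5), num_steps=4, stride=2, offset=tf.constant(1))
  return indices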
def sample_linspace_sequence(sequence: tf.Tensor,
num_windows: int,
num_steps: int,
stride: int) -> tf.Tensor:
"""Samples `num_windows` segments from sequence with linearly spaced offsets.
The samples are concatenated in a single `tf.Tensor` in order to have the same
format structure per timestep (e.g. a single frame). If `num_steps` * `stride`
is bigger than the number of timesteps, the sequence is repeated. This
function can be used in evaluation in order to extract enough segments to span
the entire sequence.
Args:
sequence: Any tensor where the first dimension is timesteps.
num_windows: Number of windows retrieved from the sequence.
num_steps: Number of steps (e.g. frames) to take.
stride: Distance to sample between timesteps.
Returns:
    A single `tf.Tensor` with first dimension `num_windows` * `num_steps`. The
    tensor contains the concatenated list of `num_windows` tensors whose
    offsets have been linearly spaced over the input sequence.
"""
sequence_length = tf.shape(sequence)[0]
max_offset = tf.maximum(0, sequence_length - num_steps * stride)
offsets = tf.linspace(0.0, tf.cast(max_offset, tf.float32), num_windows)
offsets = tf.cast(offsets, tf.int32)
all_indices = []
for i in range(num_windows):
all_indices.append(_sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offsets[i]))
indices = tf.concat(all_indices, axis=0)
indices.set_shape((num_windows * num_steps,))
return tf.gather(sequence, indices)
def sample_sequence(sequence: tf.Tensor,
num_steps: int,
random: bool,
stride: int,
seed: Optional[int] = None) -> tf.Tensor:
"""Samples a single segment of size `num_steps` from a given sequence.
  If `random` is not `True`, this function will simply sample the central
  window of the sequence. Otherwise, a random offset is chosen such that a
  window of `num_steps` frames can be extracted from the sequence.
Args:
sequence: Any tensor where the first dimension is timesteps.
num_steps: Number of steps (e.g. frames) to take.
random: A boolean indicating whether to random sample the single window. If
`True`, the offset is randomized. If `False`, the middle frame minus half
of `num_steps` is the first frame.
stride: Distance to sample between timesteps.
seed: A deterministic seed to use when sampling.
Returns:
A single `tf.Tensor` with first dimension `num_steps` with the sampled
segment.
"""
sequence_length = tf.shape(sequence)[0]
if random:
sequence_length = tf.cast(sequence_length, tf.float32)
max_offset = tf.cond(
sequence_length > (num_steps - 1) * stride,
lambda: sequence_length - (num_steps - 1) * stride,
lambda: sequence_length)
offset = tf.random.uniform(
(),
maxval=tf.cast(max_offset, dtype=tf.int32),
dtype=tf.int32,
seed=seed)
else:
offset = (sequence_length - num_steps * stride) // 2
offset = tf.maximum(0, offset)
indices = _sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offset)
indices.set_shape((num_steps,))
return tf.gather(sequence, indices)
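# Illustrative sketch (not part of the original file): without random
# sampling, the central window is taken, e.g. frames 3..6 of a 10-frame
# sequence.
def _example_sample_sequence():
  segment = sample_sequence(tf.range(10), num_steps=4, random=False, stride=1)
  # segment is [3, 4, 5, 6].
  return segment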
def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
"""Decodes JPEG raw bytes string into a RGB uint8 Tensor.
Args:
image_string: A `tf.Tensor` of type strings with the raw JPEG bytes where
the first dimension is timesteps.
channels: Number of channels of the JPEG image. Allowed values are 0, 1 and
3. If 0, the number of channels will be calculated at runtime and no
static shape is set.
Returns:
A Tensor of shape [T, H, W, C] of type uint8 with the decoded images.
"""
return tf.map_fn(
lambda x: tf.image.decode_jpeg(x, channels=channels),
image_string, back_prop=False, dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
height: int,
width: int,
random: bool = False,
seed: Optional[int] = None) -> tf.Tensor:
"""Crops the image sequence of images.
If requested size is bigger than image size, image is padded with 0. If not
random cropping, a central crop is performed.
Args:
frames: A Tensor of dimension [timesteps, in_height, in_width, channels].
height: Cropped image height.
width: Cropped image width.
random: A boolean indicating if crop should be randomized.
seed: A deterministic seed to use when random cropping.
Returns:
A Tensor of shape [timesteps, out_height, out_width, channels] of type uint8
with the cropped images.
"""
if random:
# Random spatial crop.
shape = tf.shape(frames)
# If a static_shape is available (e.g. when using this method from add_image
# method), it will be used to have an output tensor with static shape.
static_shape = frames.shape.as_list()
seq_len = shape[0] if static_shape[0] is None else static_shape[0]
channels = shape[3] if static_shape[3] is None else static_shape[3]
frames = tf.image.random_crop(frames, (seq_len, height, width, channels),
seed)
else:
# Central crop or pad.
frames = tf.image.resize_with_crop_or_pad(frames, height, width)
return frames
def resize_smallest(frames: tf.Tensor,
min_resize: int) -> tf.Tensor:
"""Resizes frames so that min(`height`, `width`) is equal to `min_resize`.
  This function is a no-op if min(`height`, `width`) is already equal to
  `min_resize`, which saves compute time.
Args:
frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
min_resize: Minimum size of the final image dimensions.
Returns:
A Tensor of shape [timesteps, output_h, output_w, channels] of type
frames.dtype where min(output_h, output_w) = min_resize.
"""
shape = tf.shape(frames)
input_h = shape[1]
input_w = shape[2]
output_h = tf.maximum(min_resize, (input_h * min_resize) // input_w)
output_w = tf.maximum(min_resize, (input_w * min_resize) // input_h)
def resize_fn():
frames_resized = tf.image.resize(frames, (output_h, output_w))
return tf.cast(frames_resized, frames.dtype)
should_resize = tf.math.logical_or(tf.not_equal(input_w, output_w),
tf.not_equal(input_h, output_h))
frames = tf.cond(should_resize, resize_fn, lambda: frames)
return frames
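# Illustrative sketch (not part of the original file): the short side is
# resized to `min_resize` while the aspect ratio is preserved, e.g.
# (6, 90, 120, 3) frames become (6, 45, 60, 3) with min_resize=45.
def _example_resize_smallest():
  frames = tf.zeros([6, 90, 120, 3], dtype=tf.uint8)
  resized = resize_smallest(frames, min_resize=45)
  return resized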
def random_flip_left_right(
frames: tf.Tensor,
seed: Optional[int] = None) -> tf.Tensor:
"""Flips all the frames with a probability of 50%.
Args:
frames: A Tensor of shape [timesteps, input_h, input_w, channels].
seed: A seed to use for the random sampling.
Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels], possibly
    flipped left-right.
"""
is_flipped = tf.random.uniform(
(), minval=0, maxval=2, dtype=tf.int32, seed=seed)
frames = tf.cond(tf.equal(is_flipped, 1),
true_fn=lambda: tf.image.flip_left_right(frames),
false_fn=lambda: frames)
return frames
def normalize_image(frames: tf.Tensor,
zero_centering_image: bool,
dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
"""Normalizes images.
Args:
frames: A Tensor of numbers.
zero_centering_image: If True, results are in [-1, 1], if False, results are
in [0, 1].
dtype: Type of output Tensor.
Returns:
A Tensor of same shape as the input and of the given type.
"""
frames = tf.cast(frames, dtype)
if zero_centering_image:
return frames * (2.0 / 255.0) - 1.0
else:
return frames / 255.0
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for preprocess_ops_3d.py."""
import io
import itertools
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops_3d
class ParserUtilsTest(tf.test.TestCase):
def setUp(self):
super().setUp()
# [[0, 1, ..., 119], [1, 2, ..., 120], ..., [119, 120, ..., 218]].
self._frames = tf.stack([tf.range(i, i + 120) for i in range(90)])
self._frames = tf.cast(self._frames, tf.uint8)
self._frames = self._frames[tf.newaxis, :, :, tf.newaxis]
self._frames = tf.broadcast_to(self._frames, (6, 90, 120, 3))
# Create an equivalent numpy array for assertions.
self._np_frames = np.array([range(i, i + 120) for i in range(90)])
self._np_frames = self._np_frames[np.newaxis, :, :, np.newaxis]
self._np_frames = np.broadcast_to(self._np_frames, (6, 90, 120, 3))
def test_sample_linspace_sequence(self):
sequence = tf.range(100)
sampled_seq_1 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 10, 10, 1)
sampled_seq_2 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 7, 10, 1)
sampled_seq_3 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 7, 5, 2)
sampled_seq_4 = preprocess_ops_3d.sample_linspace_sequence(
sequence, 101, 1, 1)
self.assertAllEqual(sampled_seq_1, range(100))
# [0, 1, 2, 3, 4, ..., 8, 9, 15, 16, ..., 97, 98, 99]
self.assertAllEqual(
sampled_seq_2,
[15 * i + j for i, j in itertools.product(range(7), range(10))])
# [0, 2, 4, 6, 8, 15, 17, 19, ..., 96, 98]
self.assertAllEqual(
sampled_seq_3,
[15 * i + 2 * j for i, j in itertools.product(range(7), range(5))])
self.assertAllEqual(sampled_seq_4, [0] + list(range(100)))
def test_sample_sequence(self):
sequence = tf.range(100)
sampled_seq_1 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 1)
sampled_seq_2 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 2)
sampled_seq_3 = preprocess_ops_3d.sample_sequence(sequence, 10, True, 1)
self.assertAllEqual(sampled_seq_1, range(45, 55))
self.assertAllEqual(sampled_seq_2, range(40, 60, 2))
offset_3 = sampled_seq_3[0]
self.assertBetween(offset_3, 0, 99)
self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10))
def test_decode_jpeg(self):
# Create a random RGB JPEG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
decoded_image = preprocess_ops_3d.decode_jpeg(raw_image, 3)
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
def test_crop_image(self):
cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70)
cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200)
cropped_image_3 = preprocess_ops_3d.crop_image(self._frames, 50, 70, True)
self.assertAllEqual(cropped_image_1.shape, (6, 50, 70, 3))
self.assertAllEqual(cropped_image_1, self._np_frames[:, 20:70, 25:95, :])
self.assertAllEqual(cropped_image_2.shape, (6, 200, 200, 3))
expected = np.pad(
self._np_frames, ((0, 0), (55, 55), (40, 40), (0, 0)), 'constant')
self.assertAllEqual(cropped_image_2, expected)
self.assertAllEqual(cropped_image_3.shape, (6, 50, 70, 3))
offset = cropped_image_3[0, 0, 0, 0]
expected = np.array([range(i, i + 70) for i in range(offset, offset + 50)])
expected = expected[np.newaxis, :, :, np.newaxis]
expected = np.broadcast_to(expected, (6, 50, 70, 3))
self.assertAllEqual(cropped_image_3, expected)
def test_resize_smallest(self):
resized_frames_1 = preprocess_ops_3d.resize_smallest(self._frames, 180)
resized_frames_2 = preprocess_ops_3d.resize_smallest(self._frames, 45)
resized_frames_3 = preprocess_ops_3d.resize_smallest(self._frames, 90)
resized_frames_4 = preprocess_ops_3d.resize_smallest(
tf.transpose(self._frames, (0, 2, 1, 3)), 45)
self.assertAllEqual(resized_frames_1.shape, (6, 180, 240, 3))
self.assertAllEqual(resized_frames_2.shape, (6, 45, 60, 3))
self.assertAllEqual(resized_frames_3.shape, (6, 90, 120, 3))
self.assertAllEqual(resized_frames_4.shape, (6, 60, 45, 3))
def test_random_flip_left_right(self):
flipped_frames = preprocess_ops_3d.random_flip_left_right(self._frames)
flipped = np.fliplr(self._np_frames[0, :, :, 0])
flipped = flipped[np.newaxis, :, :, np.newaxis]
flipped = np.broadcast_to(flipped, (6, 90, 120, 3))
self.assertTrue((flipped_frames == self._np_frames).numpy().all() or (
flipped_frames == flipped).numpy().all())
def test_normalize_image(self):
normalized_images_1 = preprocess_ops_3d.normalize_image(
self._frames, False, tf.float32)
normalized_images_2 = preprocess_ops_3d.normalize_image(
self._frames, True, tf.float32)
self.assertAllClose(normalized_images_1, self._np_frames / 255)
self.assertAllClose(normalized_images_2, self._np_frames * 2 / 255 - 1.0)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for preprocess_ops.py."""
import io
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class InputUtilsTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
([1], 10),
([1, 2], 10),
([1, 2, 3], 10),
([11], 10),
([12, 2], 10),
([13, 2, 3], 10),
)
def testPadToFixedSize(self, input_shape, output_size):
# Copies input shape to padding shape.
clip_shape = input_shape[:]
clip_shape[0] = min(output_size, clip_shape[0])
padding_shape = input_shape[:]
padding_shape[0] = max(output_size - input_shape[0], 0)
expected_outputs = np.concatenate(
[np.ones(clip_shape), np.zeros(padding_shape)], axis=0)
data = tf.ones(input_shape)
output_data = preprocess_ops.clip_or_pad_to_fixed_size(
data, output_size, constant_values=0)
output_data = output_data.numpy()
self.assertAllClose(output_size, output_data.shape[0])
self.assertAllClose(expected_outputs, output_data)
@parameterized.parameters(
(100, 200, 100, 200, 32, 1.0, 1.0, 128, 224),
(100, 256, 128, 256, 32, 1.0, 1.0, 128, 256),
(200, 512, 200, 128, 32, 0.25, 0.25, 224, 128),
)
  def testResizeAndCropImageRectangularCase(self,
input_height,
input_width,
desired_height,
desired_width,
stride,
scale_y,
scale_x,
output_height,
output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
desired_size = (desired_height, desired_width)
resized_image, image_info = preprocess_ops.resize_and_crop_image(
image,
desired_size=desired_size,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(100, 200, 220, 220, 32, 1.1, 1.1, 224, 224),
(512, 512, 1024, 1024, 32, 2.0, 2.0, 1024, 1024),
)
def testResizeAndCropImageSquareCase(self,
input_height,
input_width,
desired_height,
desired_width,
stride,
scale_y,
scale_x,
output_height,
output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
desired_size = (desired_height, desired_width)
resized_image, image_info = preprocess_ops.resize_and_crop_image(
image,
desired_size=desired_size,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(100, 200, 100, 300, 32, 1.0, 1.0, 100, 200, 128, 320),
(200, 100, 100, 300, 32, 1.0, 1.0, 200, 100, 320, 128),
(100, 200, 80, 100, 32, 0.5, 0.5, 50, 100, 96, 128),
(200, 100, 80, 100, 32, 0.5, 0.5, 100, 50, 128, 96),
)
def testResizeAndCropImageV2(self,
input_height,
input_width,
short_side,
long_side,
stride,
scale_y,
scale_x,
desired_height,
desired_width,
output_height,
output_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
image_shape = tf.shape(image)[0:2]
desired_size = tf.where(
tf.greater(image_shape[0], image_shape[1]),
tf.constant([long_side, short_side], dtype=tf.int32),
tf.constant([short_side, long_side], dtype=tf.int32))
resized_image, image_info = preprocess_ops.resize_and_crop_image_v2(
image,
short_side=short_side,
long_side=long_side,
padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
resized_image_shape = tf.shape(resized_image)
self.assertAllEqual(
[output_height, output_width, 3],
resized_image_shape.numpy())
self.assertNDArrayNear(
[[input_height, input_width],
[desired_height, desired_width],
[scale_y, scale_x],
[0.0, 0.0]],
image_info.numpy(),
1e-5)
@parameterized.parameters(
(400, 600), (600, 400),
)
def testCenterCropImage(self,
input_height,
input_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
cropped_image = preprocess_ops.center_crop_image(image)
cropped_image_shape = tf.shape(cropped_image)
self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())
@parameterized.parameters(
(400, 600), (600, 400),
)
def testCenterCropImageV2(self,
input_height,
input_width):
image_bytes = tf.constant(
_encode_image(
np.uint8(np.random.rand(input_height, input_width, 3) * 255),
fmt='JPEG'),
dtype=tf.string)
cropped_image = preprocess_ops.center_crop_image_v2(
image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
cropped_image_shape = tf.shape(cropped_image)
self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())
@parameterized.parameters(
(400, 600), (600, 400),
)
def testRandomCropImage(self,
input_height,
input_width):
image = tf.convert_to_tensor(
np.random.rand(input_height, input_width, 3))
_ = preprocess_ops.random_crop_image(image)
@parameterized.parameters(
(400, 600), (600, 400),
)
def testRandomCropImageV2(self,
input_height,
input_width):
image_bytes = tf.constant(
_encode_image(
np.uint8(np.random.rand(input_height, input_width, 3) * 255),
fmt='JPEG'),
dtype=tf.string)
_ = preprocess_ops.random_crop_image_v2(
image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class to subsample minibatches by balancing positives and negatives.
Subsamples minibatches based on a pre-specified positive fraction in range
[0,1]. The class presumes there are many more negatives than positive examples:
if the desired batch_size cannot be achieved with the pre-specified positive
fraction, it fills the rest with negative examples. If this is not sufficient
for obtaining the desired batch_size, it returns fewer examples.
The main function to call is Subsample(self, indicator, labels). For convenience
one can also call SubsampleWeights(self, weights, labels) which is defined in
the minibatch_sampler base class.
When is_static is True, it implements a method that guarantees static shapes.
It also ensures that the length of the subsample output is always batch_size,
even when the number of examples set to True in the indicator is less than
batch_size.
This is originally implemented in TensorFlow Object Detection API.
"""
# Import libraries
import tensorflow as tf
def combined_static_and_dynamic_shape(tensor):
"""Returns a list containing static and dynamic values for the dimensions.
Returns a list of static and dynamic values for shape dimensions. This is
useful to preserve static shapes when available in reshape operation.
Args:
tensor: A tensor of any type.
Returns:
A list of size tensor.shape.ndims containing integers or a scalar tensor.
"""
static_tensor_shape = tensor.shape.as_list()
dynamic_tensor_shape = tf.shape(input=tensor)
combined_shape = []
for index, dim in enumerate(static_tensor_shape):
if dim is not None:
combined_shape.append(dim)
else:
combined_shape.append(dynamic_tensor_shape[index])
return combined_shape
def indices_to_dense_vector(indices,
size,
indices_value=1.,
default_value=0,
dtype=tf.float32):
"""Creates dense vector with indices set to specific value and rest to zeros.
This function exists because it is unclear if it is safe to use
tf.sparse_to_dense(indices, [size], 1, validate_indices=False)
with indices which are not ordered.
This function accepts a dynamic size (e.g. tf.shape(tensor)[0])
Args:
indices: 1d Tensor with integer indices which are to be set to
indices_values.
size: scalar with size (integer) of output Tensor.
indices_value: values of elements specified by indices in the output vector
default_value: values of other elements in the output vector.
dtype: data type.
Returns:
dense 1D Tensor of shape [size] with indices set to indices_values and the
rest set to default_value.
"""
size = tf.cast(size, dtype=tf.int32)
zeros = tf.ones([size], dtype=dtype) * default_value
values = tf.ones_like(indices, dtype=dtype) * indices_value
return tf.dynamic_stitch(
[tf.range(size), tf.cast(indices, dtype=tf.int32)], [zeros, values])
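# Illustrative sketch (not part of the original file): scattering the value
# 1.0 at positions 1 and 3 of a length-5 vector.
def _example_indices_to_dense_vector():
  dense = indices_to_dense_vector(tf.constant([1, 3]), size=5)
  # dense is [0., 1., 0., 1., 0.].
  return dense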
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
"""Matrix multiplication based implementation of tf.gather on zeroth axis.
TODO(rathodv, jonathanhuang): enable sparse matmul option.
Args:
params: A float32 Tensor. The tensor from which to gather values.
Must be at least rank 1.
indices: A Tensor. Must be one of the following types: int32, int64.
Must be in range [0, params.shape[0])
scope: A name for the operation (optional).
Returns:
A Tensor. Has the same type as params. Values from params gathered
from indices given by indices, with shape indices.shape + params.shape[1:].
"""
scope = scope or 'MatMulGather'
with tf.name_scope(scope):
params_shape = combined_static_and_dynamic_shape(params)
indices_shape = combined_static_and_dynamic_shape(indices)
params2d = tf.reshape(params, [params_shape[0], -1])
indicator_matrix = tf.one_hot(indices, params_shape[0])
gathered_result_flattened = tf.matmul(indicator_matrix, params2d)
return tf.reshape(gathered_result_flattened,
tf.stack(indices_shape + params_shape[1:]))
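# Illustrative example (assumed values): this is numerically equivalent to
# tf.gather(params, indices, axis=0) but expressed as a dense matmul, which
# preserves static shapes, e.g.
#   params = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
#   matmul_gather_on_zeroth_axis(params, tf.constant([2, 0]))
#   -> [[5., 6.], [1., 2.]]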
class BalancedPositiveNegativeSampler:
"""Subsamples minibatches to a desired balance of positives and negatives."""
def __init__(self, positive_fraction=0.5, is_static=False):
"""Constructs a minibatch sampler.
Args:
positive_fraction: desired fraction of positive examples (scalar in [0,1])
in the batch.
is_static: If True, uses an implementation with static shape guarantees.
Raises:
ValueError: if positive_fraction < 0, or positive_fraction > 1
"""
if positive_fraction < 0 or positive_fraction > 1:
raise ValueError('positive_fraction should be in range [0,1]. '
'Received: %s.' % positive_fraction)
self._positive_fraction = positive_fraction
self._is_static = is_static
@staticmethod
def subsample_indicator(indicator, num_samples):
"""Subsample indicator vector.
Given a boolean indicator vector with M elements set to `True`, the function
assigns all but `num_samples` of these previously `True` elements to
`False`. If `num_samples` is greater than M, the original indicator vector
is returned.
Args:
indicator: a 1-dimensional boolean tensor indicating which elements
are allowed to be sampled and which are not.
num_samples: int32 scalar tensor
Returns:
a boolean tensor with the same shape as input (indicator) tensor
"""
indices = tf.where(indicator)
indices = tf.random.shuffle(indices)
indices = tf.reshape(indices, [-1])
num_samples = tf.minimum(tf.size(input=indices), num_samples)
selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))
selected_indicator = indices_to_dense_vector(
selected_indices,
tf.shape(input=indicator)[0])
return tf.equal(selected_indicator, 1)
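  # Illustrative example (assumed values): with
  #   indicator = tf.constant([True, True, False, True]) and num_samples = 2,
  # subsample_indicator returns a boolean vector in which exactly two of the
  # three True positions are kept (chosen at random) and everything else is
  # False.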
def _get_num_pos_neg_samples(self, sorted_indices_tensor, sample_size):
"""Counts the number of positives and negatives numbers to be sampled.
Args:
sorted_indices_tensor: A sorted int32 tensor of shape [N] which contains
the signed indices of the examples where the sign is based on the label
value. The examples that cannot be sampled are set to 0. It samples
        at most sample_size * positive_fraction positive examples and fills the
        rest with negative examples.
sample_size: Size of subsamples.
Returns:
A tuple containing the number of positive and negative labels in the
subsample.
"""
input_length = tf.shape(input=sorted_indices_tensor)[0]
valid_positive_index = tf.greater(sorted_indices_tensor,
tf.zeros(input_length, tf.int32))
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(valid_positive_index, tf.int32))
max_num_positive_samples = tf.constant(
int(sample_size * self._positive_fraction), tf.int32)
num_positive_samples = tf.minimum(max_num_positive_samples, num_sampled_pos)
num_negative_samples = tf.constant(sample_size,
tf.int32) - num_positive_samples
return num_positive_samples, num_negative_samples
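  # Illustrative example (assumed values): with positive_fraction = 0.5,
  # sample_size = 4 and a sorted signed index tensor containing one positive
  # entry, the helper above returns (1, 3): one positive and three negatives.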
def _get_values_from_start_and_end(self, input_tensor, num_start_samples,
num_end_samples, total_num_samples):
"""slices num_start_samples and last num_end_samples from input_tensor.
Args:
input_tensor: An int32 tensor of shape [N] to be sliced.
num_start_samples: Number of examples to be sliced from the beginning
of the input tensor.
num_end_samples: Number of examples to be sliced from the end of the
input tensor.
      total_num_samples: Sum of num_start_samples and num_end_samples. This
        should be a scalar.
Returns:
A tensor containing the first num_start_samples and last num_end_samples
from input_tensor.
"""
input_length = tf.shape(input=input_tensor)[0]
start_positions = tf.less(tf.range(input_length), num_start_samples)
end_positions = tf.greater_equal(
tf.range(input_length), input_length - num_end_samples)
selected_positions = tf.logical_or(start_positions, end_positions)
selected_positions = tf.cast(selected_positions, tf.float32)
indexed_positions = tf.multiply(tf.cumsum(selected_positions),
selected_positions)
one_hot_selector = tf.one_hot(tf.cast(indexed_positions, tf.int32) - 1,
total_num_samples,
dtype=tf.float32)
return tf.cast(tf.tensordot(tf.cast(input_tensor, tf.float32),
one_hot_selector, axes=[0, 0]), tf.int32)
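  # Illustrative example (assumed values): with
  #   input_tensor = [9, 8, 7, 6, 5], num_start_samples = 2,
  #   num_end_samples = 1, total_num_samples = 3,
  # the helper above returns [9, 8, 5]: the first two and the last entry,
  # gathered through a one-hot selector so the output length stays static.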
def _static_subsample(self, indicator, batch_size, labels):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
        N should be a compile-time constant.
batch_size: desired batch size. This scalar cannot be None.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
        (=False) examples. N should be a compile-time constant.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled. It ensures the length of output of the subsample is always
batch_size, even when number of examples set to True in indicator is
less than batch_size.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
# Check if indicator and labels have a static size.
    if not indicator.shape.is_fully_defined():
      raise ValueError('indicator must be static in shape when is_static is '
                       'True.')
    if not labels.shape.is_fully_defined():
      raise ValueError('labels must be static in shape when is_static is '
                       'True.')
    if not isinstance(batch_size, int):
      raise ValueError('batch_size has to be an integer when is_static is '
                       'True.')
input_length = tf.shape(input=indicator)[0]
# Set the number of examples set True in indicator to be at least
# batch_size.
num_true_sampled = tf.reduce_sum(
input_tensor=tf.cast(indicator, tf.float32))
additional_false_sample = tf.less_equal(
tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
batch_size - num_true_sampled)
indicator = tf.logical_or(indicator, additional_false_sample)
# Shuffle indicator and label. Need to store the permutation to restore the
# order post sampling.
permutation = tf.random.shuffle(tf.range(input_length))
indicator = matmul_gather_on_zeroth_axis(
tf.cast(indicator, tf.float32), permutation)
labels = matmul_gather_on_zeroth_axis(
tf.cast(labels, tf.float32), permutation)
# index (starting from 1) when indicator is True, 0 when False
indicator_idx = tf.where(
tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
tf.zeros(input_length, tf.int32))
# Replace -1 for negative, +1 for positive labels
signed_label = tf.where(
tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
# negative of index for negative label, positive index for positive label,
# 0 when indicator is False.
signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
sorted_signed_indicator_idx = tf.nn.top_k(
signed_indicator_idx, input_length, sorted=True).values
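    # Worked example (assumed values): with indicator = [T, F, T, T] and
    # labels = [T, F, F, T] (ignoring the shuffle), indicator_idx = [1, 0, 3, 4]
    # and signed_label = [1, -1, -1, 1], so signed_indicator_idx = [1, 0, -3, 4].
    # Sorting in descending order gives [4, 1, 0, -3]: positives first, then
    # non-sampleable zeros, then negatives with the most negative index last.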
[num_positive_samples,
num_negative_samples] = self._get_num_pos_neg_samples(
sorted_signed_indicator_idx, batch_size)
sampled_idx = self._get_values_from_start_and_end(
sorted_signed_indicator_idx, num_positive_samples,
num_negative_samples, batch_size)
# Shift the indices to start from 0 and remove any samples that are set as
# False.
sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
sampled_idx = tf.multiply(
tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
sampled_idx)
sampled_idx_indicator = tf.cast(
tf.reduce_sum(
input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0),
tf.bool)
# project back the order based on stored permutations
reprojections = tf.one_hot(permutation, depth=input_length,
dtype=tf.float32)
return tf.cast(tf.tensordot(
tf.cast(sampled_idx_indicator, tf.float32),
reprojections, axes=[0, 0]), tf.bool)
def subsample(self, indicator, batch_size, labels, scope=None):
"""Returns subsampled minibatch.
Args:
indicator: boolean tensor of shape [N] whose True entries can be sampled.
batch_size: desired batch size. If None, keeps all positive samples and
randomly selects negative samples so that the positive sample fraction
        matches self._positive_fraction. It cannot be None if is_static is True.
labels: boolean tensor of shape [N] denoting positive(=True) and negative
(=False) examples.
scope: name scope.
Returns:
sampled_idx_indicator: boolean tensor of shape [N], True for entries which
are sampled.
Raises:
ValueError: if labels and indicator are not 1D boolean tensors.
"""
if len(indicator.get_shape().as_list()) != 1:
raise ValueError('indicator must be 1 dimensional, got a tensor of '
'shape %s' % indicator.get_shape())
if len(labels.get_shape().as_list()) != 1:
raise ValueError('labels must be 1 dimensional, got a tensor of '
'shape %s' % labels.get_shape())
if labels.dtype != tf.bool:
raise ValueError('labels should be of type bool. Received: %s' %
labels.dtype)
if indicator.dtype != tf.bool:
raise ValueError('indicator should be of type bool. Received: %s' %
indicator.dtype)
scope = scope or 'BalancedPositiveNegativeSampler'
with tf.name_scope(scope):
if self._is_static:
return self._static_subsample(indicator, batch_size, labels)
else:
# Only sample from indicated samples
negative_idx = tf.logical_not(labels)
positive_idx = tf.logical_and(labels, indicator)
negative_idx = tf.logical_and(negative_idx, indicator)
# Sample positive and negative samples separately
if batch_size is None:
max_num_pos = tf.reduce_sum(
input_tensor=tf.cast(positive_idx, dtype=tf.int32))
else:
max_num_pos = int(self._positive_fraction * batch_size)
sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos)
num_sampled_pos = tf.reduce_sum(
input_tensor=tf.cast(sampled_pos_idx, tf.int32))
if batch_size is None:
negative_positive_ratio = (
1 - self._positive_fraction) / self._positive_fraction
max_num_neg = tf.cast(
negative_positive_ratio *
tf.cast(num_sampled_pos, dtype=tf.float32),
dtype=tf.int32)
else:
max_num_neg = batch_size - num_sampled_pos
sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg)
return tf.logical_or(sampled_pos_idx, sampled_neg_idx)
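# Illustrative usage (hedged sketch, not part of the original file): sampling
# a small minibatch with at most 50% positives from per-anchor match labels.
# `match_labels` and `valid_anchors` are hypothetical tensors.
#
#   sampler = BalancedPositiveNegativeSampler(positive_fraction=0.5)
#   match_labels = tf.constant([True, False, False, True, False, False])
#   valid_anchors = tf.constant([True, True, True, True, True, False])
#   sampled = sampler.subsample(
#       indicator=valid_anchors, batch_size=4, labels=match_labels)
#   # `sampled` is a boolean [N] tensor marking at most 2 positives and the
#   # rest negatives, all drawn from positions where valid_anchors is True.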