ModelZoo / ResNet50_tensorflow · Commits

Commit c44482ab, authored Mar 01, 2022 by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 431756117
Parent: 10ee28dd

Changes: 235 · Showing 20 changed files with 7433 additions and 0 deletions (+7433 −0)
official/vision/ops/__init__.py                 +14    −0
official/vision/ops/anchor.py                   +378   −0
official/vision/ops/anchor_generator.py         +182   −0
official/vision/ops/anchor_generator_test.py    +137   −0
official/vision/ops/anchor_test.py              +186   −0
official/vision/ops/augment.py                  +2317  −0
official/vision/ops/augment_test.py             +435   −0
official/vision/ops/box_matcher.py              +191   −0
official/vision/ops/box_matcher_test.py         +78    −0
official/vision/ops/box_ops.py                  +763   −0
official/vision/ops/iou_similarity.py           +167   −0
official/vision/ops/iou_similarity_test.py      +76    −0
official/vision/ops/mask_ops.py                 +190   −0
official/vision/ops/mask_ops_test.py            +55    −0
official/vision/ops/nms.py                      +202   −0
official/vision/ops/preprocess_ops.py           +919   −0
official/vision/ops/preprocess_ops_3d.py        +355   −0
official/vision/ops/preprocess_ops_3d_test.py   +159   −0
official/vision/ops/preprocess_ops_test.py      +246   −0
official/vision/ops/sampling_ops.py             +383   −0
official/vision/ops/__init__.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
official/vision/ops/anchor.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Anchor box and labeler definition."""

import collections

# Import libraries
import tensorflow as tf

from official.vision.ops import anchor_generator
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
from official.vision.utils.object_detection import balanced_positive_negative_sampler
from official.vision.utils.object_detection import box_list
from official.vision.utils.object_detection import faster_rcnn_box_coder


class Anchor(object):
  """Anchor class for anchor-based object detectors."""

  def __init__(self, min_level, max_level, num_scales, aspect_ratios,
               anchor_size, image_size):
    """Constructs multiscale anchors.

    Args:
      min_level: integer number of minimum level of the output feature pyramid.
      max_level: integer number of maximum level of the output feature pyramid.
      num_scales: integer number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: list of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: float number representing the scale of size of the base
        anchor to the feature stride 2^level.
      image_size: a list of integer numbers or Tensors representing [height,
        width] of the input image size. The image_size should be divisible by
        the largest feature stride 2^max_level.
    """
    self.min_level = min_level
    self.max_level = max_level
    self.num_scales = num_scales
    self.aspect_ratios = aspect_ratios
    self.anchor_size = anchor_size
    self.image_size = image_size
    self.boxes = self._generate_boxes()

  def _generate_boxes(self):
    """Generates multiscale anchor boxes.

    Returns:
      a Tensor of shape [N, 4], representing anchor boxes of all levels
      concatenated together.
    """
    boxes_all = []
    for level in range(self.min_level, self.max_level + 1):
      boxes_l = []
      for scale in range(self.num_scales):
        for aspect_ratio in self.aspect_ratios:
          stride = 2**level
          intermidate_scale = 2**(scale / float(self.num_scales))
          base_anchor_size = self.anchor_size * stride * intermidate_scale
          aspect_x = aspect_ratio**0.5
          aspect_y = aspect_ratio**-0.5
          half_anchor_size_x = base_anchor_size * aspect_x / 2.0
          half_anchor_size_y = base_anchor_size * aspect_y / 2.0
          x = tf.range(stride / 2, self.image_size[1], stride)
          y = tf.range(stride / 2, self.image_size[0], stride)
          xv, yv = tf.meshgrid(x, y)
          xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
          yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
          # Tensor shape Nx4.
          boxes = tf.stack([
              yv - half_anchor_size_y, xv - half_anchor_size_x,
              yv + half_anchor_size_y, xv + half_anchor_size_x
          ], axis=1)
          boxes_l.append(boxes)
      # Concat anchors on the same level to tensor shape NxAx4.
      boxes_l = tf.stack(boxes_l, axis=1)
      boxes_l = tf.reshape(boxes_l, [-1, 4])
      boxes_all.append(boxes_l)
    return tf.concat(boxes_all, axis=0)

  def unpack_labels(self, labels):
    """Unpacks an array of labels into multiscales labels."""
    unpacked_labels = collections.OrderedDict()
    count = 0
    for level in range(self.min_level, self.max_level + 1):
      feat_size_y = tf.cast(self.image_size[0] / 2**level, tf.int32)
      feat_size_x = tf.cast(self.image_size[1] / 2**level, tf.int32)
      steps = feat_size_y * feat_size_x * self.anchors_per_location
      unpacked_labels[str(level)] = tf.reshape(labels[count:count + steps],
                                               [feat_size_y, feat_size_x, -1])
      count += steps
    return unpacked_labels

  @property
  def anchors_per_location(self):
    return self.num_scales * len(self.aspect_ratios)

  @property
  def multilevel_boxes(self):
    return self.unpack_labels(self.boxes)


class AnchorLabeler(object):
  """Labeler for dense object detector."""

  def __init__(self, match_threshold=0.5, unmatched_threshold=0.5):
    """Constructs anchor labeler to assign labels to anchors.

    Args:
      match_threshold: a float number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: a float number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
    """
    self.similarity_calc = iou_similarity.IouSimilarity()
    self.target_gather = target_gather.TargetGather()
    self.matcher = box_matcher.BoxMatcher(
        thresholds=[unmatched_threshold, match_threshold],
        indicators=[-1, -2, 1],
        force_match_for_each_col=True)
    self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()

  def label_anchors(self,
                    anchor_boxes,
                    gt_boxes,
                    gt_labels,
                    gt_attributes=None,
                    gt_weights=None):
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth
        boxes. For each row, it stores [y0, x0, y1, x1] for four corners of a
        box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
        classes.
      gt_attributes: If not None, a dict of (name, gt_attribute) pairs.
        `gt_attribute` is a float tensor with shape [N, attribute_size]
        representing groundtruth attributes.
      gt_weights: If not None, a float tensor with shape [N] representing
        groundtruth weights.

    Returns:
      cls_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors_per_location]. height_l and
        width_l represent the dimension of class logits at the l-th level.
      box_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors_per_location * 4]. height_l and
        width_l represent the dimension of bounding box regression output at
        the l-th level.
      attribute_targets_dict: A dict with (name, attribute_targets) pairs. Each
        `attribute_targets` is an ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors_per_location * attribute_size].
        height_l and width_l represent the dimension of attribute prediction
        output at the l-th level.
      cls_weights: A flattened Tensor with shape [batch_size, num_anchors] that
        serves as masking / sample weight for classification loss. Its value
        is 1.0 for positive and negative matched anchors, and 0.0 for ignored
        anchors.
      box_weights: A flattened Tensor with shape [batch_size, num_anchors] that
        serves as masking / sample weight for regression loss. Its value is
        1.0 for positive matched anchors, and 0.0 for negative and ignored
        anchors.
    """
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)
    mask = tf.less_equal(match_indicators, 0)
    cls_mask = tf.expand_dims(mask, -1)
    cls_targets = self.target_gather(gt_labels, match_indices, cls_mask, -1)
    box_mask = tf.tile(cls_mask, [1, 4])
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    att_targets = {}
    if gt_attributes:
      for k, v in gt_attributes.items():
        att_size = v.get_shape().as_list()[-1]
        att_mask = tf.tile(cls_mask, [1, att_size])
        att_targets[k] = self.target_gather(v, match_indices, att_mask, 0.0)
    weights = tf.squeeze(tf.ones_like(gt_labels, dtype=tf.float32), -1)
    if gt_weights is not None:
      weights = tf.math.multiply(weights, gt_weights)
    box_weights = self.target_gather(weights, match_indices, mask)
    ignore_mask = tf.equal(match_indicators, -2)
    cls_weights = self.target_gather(weights, match_indices, ignore_mask)
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Unpacks labels into multi-level representations.
    cls_targets_dict = unpack_targets(cls_targets, anchor_boxes)
    box_targets_dict = unpack_targets(box_targets, anchor_boxes)
    attribute_targets_dict = {}
    for k, v in att_targets.items():
      attribute_targets_dict[k] = unpack_targets(v, anchor_boxes)

    return (cls_targets_dict, box_targets_dict, attribute_targets_dict,
            cls_weights, box_weights)


class RpnAnchorLabeler(AnchorLabeler):
  """Labeler for Region Proposal Network."""

  def __init__(self,
               match_threshold=0.7,
               unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5):
    AnchorLabeler.__init__(
        self,
        match_threshold=match_threshold,
        unmatched_threshold=unmatched_threshold)
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

  def _get_rpn_samples(self, match_results):
    """Computes anchor labels.

    This function performs subsampling for foreground (fg) and background (bg)
    anchors.

    Args:
      match_results: An integer tensor with shape [N] representing the matching
        results of anchors. (1) match_results[i]>=0, meaning that column i is
        matched with row match_results[i]. (2) match_results[i]=-1, meaning
        that column i is not matched. (3) match_results[i]=-2, meaning that
        column i is ignored.

    Returns:
      score_targets: An integer tensor with shape [N]. (1) score_targets[i]=1,
        the anchor is a positive sample. (2) score_targets[i]=0, negative.
        (3) score_targets[i]=-1, the anchor is ignored (don't care).
    """
    sampler = (
        balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
            positive_fraction=self._rpn_fg_fraction, is_static=False))
    # indicator includes both positive and negative labels.
    # labels includes only positives labels.
    # positives = indicator & labels.
    # negatives = indicator & !labels.
    # ignore = !indicator.
    indicator = tf.greater(match_results, -2)
    labels = tf.greater(match_results, -1)

    samples = sampler.subsample(indicator, self._rpn_batch_size_per_im, labels)
    positive_labels = tf.where(
        tf.logical_and(samples, labels),
        tf.constant(2, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    negative_labels = tf.where(
        tf.logical_and(samples, tf.logical_not(labels)),
        tf.constant(1, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape))
    ignore_labels = tf.fill(match_results.shape, -1)

    return (ignore_labels + positive_labels + negative_labels,
            positive_labels, negative_labels)

  def label_anchors(self, anchor_boxes, gt_boxes, gt_labels):
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth
        boxes. For each row, it stores [y0, x0, y1, x1] for four corners of a
        box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
        classes.

    Returns:
      score_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors]. height_l and width_l represent
        the dimension of class logits at the l-th level.
      box_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4]. height_l and width_l
        represent the dimension of bounding box regression output at the l-th
        level.
    """
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)
    box_mask = tf.tile(
        tf.expand_dims(tf.less_equal(match_indicators, 0), -1), [1, 4])
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Zero out the unmatched and ignored regression targets.
    num_matches = match_indices.shape.as_list()[0] or tf.shape(match_indices)[0]
    unmatched_ignored_box_targets = tf.zeros([num_matches, 4], dtype=tf.float32)
    matched_anchors_mask = tf.greater_equal(match_indicators, 0)
    # To broadcast matched_anchors_mask to the same shape as
    # matched_reg_targets.
    matched_anchors_mask = tf.tile(
        tf.expand_dims(matched_anchors_mask, 1),
        [1, tf.shape(box_targets)[1]])
    box_targets = tf.where(matched_anchors_mask, box_targets,
                           unmatched_ignored_box_targets)

    # score_targets contains the subsampled positive and negative anchors.
    score_targets, _, _ = self._get_rpn_samples(match_indicators)

    # Unpacks labels.
    score_targets_dict = unpack_targets(score_targets, anchor_boxes)
    box_targets_dict = unpack_targets(box_targets, anchor_boxes)

    return score_targets_dict, box_targets_dict


def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios,
                           anchor_size):
  """Build anchor generator from levels."""
  anchor_sizes = collections.OrderedDict()
  strides = collections.OrderedDict()
  scales = []
  for scale in range(num_scales):
    scales.append(2**(scale / float(num_scales)))
  for level in range(min_level, max_level + 1):
    stride = 2**level
    strides[str(level)] = stride
    anchor_sizes[str(level)] = anchor_size * stride
  anchor_gen = anchor_generator.AnchorGenerator(
      anchor_sizes=anchor_sizes,
      scales=scales,
      aspect_ratios=aspect_ratios,
      strides=strides)
  return anchor_gen


def unpack_targets(targets, anchor_boxes_dict):
  """Unpacks an array of labels into multiscales labels."""
  unpacked_targets = collections.OrderedDict()
  count = 0
  for level, anchor_boxes in anchor_boxes_dict.items():
    feat_size_shape = anchor_boxes.shape.as_list()
    feat_size_y = feat_size_shape[0]
    feat_size_x = feat_size_shape[1]
    anchors_per_location = int(feat_size_shape[2] / 4)
    steps = feat_size_y * feat_size_x * anchors_per_location
    unpacked_targets[level] = tf.reshape(targets[count:count + steps],
                                         [feat_size_y, feat_size_x, -1])
    count += steps
  return unpacked_targets
```
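For orientation, here is a minimal usage sketch of how the pieces above compose. It is not part of the commit, and it assumes the imported `target_gather`, `box_matcher`, and `iou_similarity` modules behave as the code above expects; the shapes, keys, and dtypes follow the docstrings and the tests below.

```python
import tensorflow as tf

from official.vision.ops import anchor

# Generate multilevel anchors for a 512x512 image: an OrderedDict keyed by
# level ('3'..'7') of [H_l, W_l, K*4] tensors.
anchor_boxes = anchor.build_anchor_generator(
    min_level=3, max_level=7, num_scales=3,
    aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0)([512, 512])

# Label the anchors against a single ground-truth box.
labeler = anchor.AnchorLabeler(match_threshold=0.5, unmatched_threshold=0.5)
gt_boxes = tf.constant([[10.0, 10.0, 100.0, 100.0]])   # [N, 4], [y0, x0, y1, x1]
gt_labels = tf.constant([[1]], dtype=tf.float32)       # [N, 1]
(cls_targets, box_targets, _, cls_weights,
 box_weights) = labeler.label_anchors(anchor_boxes, gt_boxes, gt_labels)
# cls_targets / box_targets are OrderedDicts keyed by level ('3'..'7').
```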
official/vision/ops/anchor_generator.py (new file · 0 → 100644)

````python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Multi scale anchor generator definition."""

import tensorflow as tf


# (TODO/tanzheny): consider having customized anchor offset.
class _SingleAnchorGenerator:
  """Utility to generate anchors for a single feature map.

  Example:
  ```python
  anchor_gen = _SingleAnchorGenerator(32, [.5, 1., 2.], stride=16)
  anchors = anchor_gen([512, 512, 3])
  ```
  """

  def __init__(self,
               anchor_size,
               scales,
               aspect_ratios,
               stride,
               clip_boxes=False):
    """Constructs single scale anchor.

    Args:
      anchor_size: A single int represents the base anchor size. The anchor
        height will be `anchor_size / sqrt(aspect_ratio)`, and anchor width
        will be `anchor_size * sqrt(aspect_ratio)`.
      scales: A list/tuple, or a list/tuple of a list/tuple, of positive floats
        representing the actual anchor size to the base `anchor_size`.
      aspect_ratios: a list/tuple of positive floats representing the ratio of
        anchor width to anchor height.
      stride: A single int represents the anchor stride size between centers of
        each anchor.
      clip_boxes: Boolean to represent whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.

    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors, `[(H / stride) * (W / stride), 4]`
    """
    self.anchor_size = anchor_size
    self.scales = scales
    self.aspect_ratios = aspect_ratios
    self.stride = stride
    self.clip_boxes = clip_boxes

  def __call__(self, image_size):
    image_height = tf.cast(image_size[0], tf.float32)
    image_width = tf.cast(image_size[1], tf.float32)

    k = len(self.scales) * len(self.aspect_ratios)
    aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32)
    anchor_size = tf.cast(self.anchor_size, tf.float32)

    # [K]
    anchor_heights = []
    anchor_widths = []
    for scale in self.scales:
      anchor_size_t = anchor_size * scale
      anchor_height = anchor_size_t / aspect_ratios_sqrt
      anchor_width = anchor_size_t * aspect_ratios_sqrt
      anchor_heights.append(anchor_height)
      anchor_widths.append(anchor_width)
    anchor_heights = tf.concat(anchor_heights, axis=0)
    anchor_widths = tf.concat(anchor_widths, axis=0)
    half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k])
    half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k])

    stride = tf.cast(self.stride, tf.float32)
    # [W]
    cx = tf.range(0.5 * stride, image_width, stride)
    # [H]
    cy = tf.range(0.5 * stride, image_height, stride)
    # [H, W]
    cx_grid, cy_grid = tf.meshgrid(cx, cy)
    # [H, W, 1]
    cx_grid = tf.expand_dims(cx_grid, axis=-1)
    cy_grid = tf.expand_dims(cy_grid, axis=-1)
    # [H, W, K, 1]
    y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1)
    y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1)
    x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1)
    x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1)

    if self.clip_boxes:
      y_min = tf.maximum(tf.minimum(y_min, image_height), 0.)
      y_max = tf.maximum(tf.minimum(y_max, image_height), 0.)
      x_min = tf.maximum(tf.minimum(x_min, image_width), 0.)
      x_max = tf.maximum(tf.minimum(x_max, image_width), 0.)

    # [H, W, K, 4]
    result = tf.concat([y_min, x_min, y_max, x_max], axis=-1)
    shape = result.shape.as_list()
    # [H, W, K * 4]
    return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])


class AnchorGenerator():
  """Utility to generate anchors for multiple feature maps.

  Example:
  ```python
  anchor_gen = AnchorGenerator([32, 64], [.5, 1., 2.],
                               strides=[16, 32])
  anchors = anchor_gen([512, 512, 3])
  ```
  """

  def __init__(self,
               anchor_sizes,
               scales,
               aspect_ratios,
               strides,
               clip_boxes=False):
    """Constructs multiscale anchors.

    Args:
      anchor_sizes: A list of ints represents the anchor size for each scale.
        The anchor height will be `anchor_size / sqrt(aspect_ratio)`, and
        anchor width will be `anchor_size * sqrt(aspect_ratio)` for each scale.
      scales: A list/tuple, or a list/tuple of a list/tuple, of positive floats
        representing the actual anchor size to the base `anchor_size`.
      aspect_ratios: A list/tuple, or a list/tuple of a list/tuple, of positive
        floats representing the ratio of anchor width to anchor height.
      strides: A list/tuple of ints represent the anchor stride size between
        centers of anchors at each scale.
      clip_boxes: Boolean to represent whether the anchor coordinates should be
        clipped to the image size. Defaults to `False`.

    Input shape: the size of the image, `[H, W, C]`
    Output shape: the size of anchors concatenated on each level,
      `[(H / strides) * (W / strides), K * 4]`
    """
    # aspect_ratio is a single list that is the same across all levels.
    aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes)
    scales = maybe_map_structure_for_anchor(scales, anchor_sizes)
    if isinstance(anchor_sizes, dict):
      self.anchor_generators = {}
      for k in anchor_sizes.keys():
        self.anchor_generators[k] = _SingleAnchorGenerator(
            anchor_sizes[k], scales[k], aspect_ratios[k], strides[k],
            clip_boxes)
    elif isinstance(anchor_sizes, (list, tuple)):
      self.anchor_generators = []
      for anchor_size, scale_list, ar_list, stride in zip(
          anchor_sizes, scales, aspect_ratios, strides):
        self.anchor_generators.append(
            _SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride,
                                   clip_boxes))

  def __call__(self, image_size):
    anchor_generators = tf.nest.flatten(self.anchor_generators)
    results = [anchor_gen(image_size) for anchor_gen in anchor_generators]
    return tf.nest.pack_sequence_as(self.anchor_generators, results)


def maybe_map_structure_for_anchor(params, anchor_sizes):
  """Broadcasts the params to match the structure of anchor_sizes."""
  if all(isinstance(param, (int, float)) for param in params):
    if isinstance(anchor_sizes, (tuple, list)):
      return [params] * len(anchor_sizes)
    elif isinstance(anchor_sizes, dict):
      return tf.nest.map_structure(lambda _: params, anchor_sizes)
    else:
      raise ValueError('the structure of `anchor_sizes` must be a tuple, '
                       'list, or dict, given {}'.format(anchor_sizes))
  else:
    return params
````
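A small sketch, not part of the commit, of the dict-keyed path: when `anchor_sizes` and `strides` are dicts keyed by pyramid level, a single flat `scales`/`aspect_ratios` list is broadcast to every level by `maybe_map_structure_for_anchor`, and the output is a dict of per-level anchor tensors with the same keys.

```python
from official.vision.ops import anchor_generator

anchor_gen = anchor_generator.AnchorGenerator(
    anchor_sizes={'5': 64, '6': 128},   # per-level base anchor sizes
    scales=[1.0],                       # broadcast to both levels
    aspect_ratios=[1.0],                # broadcast to both levels
    strides={'5': 32, '6': 64})
anchors = anchor_gen([64, 64])
# anchors['5'].shape == [2, 2, 4] (2x2 grid at stride 32, K=1)
# anchors['6'].shape == [1, 1, 4] (1x1 grid at stride 64, K=1)
```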
official/vision/ops/anchor_generator_test.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for anchor_generator.py."""

from absl.testing import parameterized
import tensorflow as tf

from official.vision.ops import anchor_generator


class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      # Single scale anchor.
      (5, [1.0],
       [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
        [[16., -16., 80., 48.], [16., 16., 80., 80.]]]),
      # Multi aspect ratio anchor.
      (6, [1.0, 4.0, 0.25],
       [[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]),
  )
  def testAnchorGeneration(self, level, aspect_ratios, expected_boxes):
    image_size = [64, 64]
    anchor_size = 2**(level + 1)
    stride = 2**level
    anchor_gen = anchor_generator._SingleAnchorGenerator(
        anchor_size=anchor_size,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        stride=stride,
        clip_boxes=False)
    anchors = anchor_gen(image_size).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Single scale anchor.
      (5, [1.0],
       [[[0., 0., 48., 48.], [0., 16., 48., 64.]],
        [[16., 0., 64., 48.], [16., 16., 64., 64.]]]),
      # Multi aspect ratio anchor.
      (6, [1.0, 4.0, 0.25],
       [[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]),
  )
  def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes):
    image_size = [64, 64]
    anchor_size = 2**(level + 1)
    stride = 2**level
    anchor_gen = anchor_generator._SingleAnchorGenerator(
        anchor_size=anchor_size,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        stride=stride,
        clip_boxes=True)
    anchors = anchor_gen(image_size).numpy()
    self.assertAllClose(expected_boxes, anchors)


class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [[1.0], [1.0]],
       [[-16, -16, 48, 48], [-16, 16, 48, 80], [16, -16, 80, 48],
        [16, 16, 80, 80], [-32, -32, 96, 96]]),)
  def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
                           expected_boxes):
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = [2**(level + 1) for level in levels]
    strides = [2**level for level in levels]
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides)
    anchors = anchor_gen(image_size)
    anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
    anchors = tf.concat(anchors, axis=0).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [[1.0], [1.0]],
       [[-16, -16, 48, 48], [-16, 16, 48, 80], [16, -16, 80, 48],
        [16, 16, 80, 80], [-32, -32, 96, 96]]),)
  def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
                                  expected_boxes):
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = [2**(level + 1) for level in levels]
    strides = [2**level for level in levels]
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides,
        clip_boxes=False)
    anchors = anchor_gen(image_size)
    anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors]
    anchors = tf.concat(anchors, axis=0).numpy()
    self.assertAllClose(expected_boxes, anchors)

  @parameterized.parameters(
      # Multi scale anchor.
      (5, 6, [1.0], {
          '5': [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]],
                [[16., -16., 80., 48.], [16., 16., 80., 80.]]],
          '6': [[[-32, -32, 96, 96]]]
      }),)
  def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
                               expected_boxes):
    image_size = [64, 64]
    levels = range(min_level, max_level + 1)
    anchor_sizes = dict((str(level), 2**(level + 1)) for level in levels)
    strides = dict((str(level), 2**level) for level in levels)
    anchor_gen = anchor_generator.AnchorGenerator(
        anchor_sizes=anchor_sizes,
        scales=[1.],
        aspect_ratios=aspect_ratios,
        strides=strides,
        clip_boxes=False)
    anchors = anchor_gen(image_size)
    for k in expected_boxes.keys():
      self.assertAllClose(expected_boxes[k], anchors[k].numpy())


if __name__ == '__main__':
  tf.test.main()
```
official/vision/ops/anchor_test.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for anchor.py."""

# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.ops import anchor


class AnchorTest(parameterized.TestCase, tf.test.TestCase):

  # The set of parameters are tailored for the MLPerf configuration, where
  # the number of anchors is 495132, rpn_batch_size_per_im=256, and
  # rpn_fg_fraction=0.5.
  @parameterized.parameters(
      (512, 25, 25, 25, 25, (512, 512)),
      (512, 25, 25, 25, 25, (512, 640)),
      (512, 25, 25, 25, 25, (640, 512)),
      (495132, 100, 100, 100, 100, (512, 512)),
      (495132, 200, 100, 128, 100, (512, 512)),
      (495132, 100, 120, 100, 120, (512, 512)),
      (495132, 100, 200, 100, 156, (512, 512)),
      (495132, 200, 200, 128, 128, (512, 512)),
  )
  def testAnchorRpnSample(self, num_anchors, num_positives, num_negatives,
                          expected_positives, expected_negatives, image_size):
    match_results_np = np.empty([num_anchors])
    match_results_np.fill(-2)
    match_results_np[:num_positives] = 0
    match_results_np[num_positives:num_positives + num_negatives] = -1
    match_results = tf.convert_to_tensor(
        value=match_results_np, dtype=tf.int32)
    anchor_labeler = anchor.RpnAnchorLabeler(
        match_threshold=0.7,
        unmatched_threshold=0.3,
        rpn_batch_size_per_im=256,
        rpn_fg_fraction=0.5)
    rpn_sample_op = anchor_labeler._get_rpn_samples(match_results)
    labels = [v.numpy() for v in rpn_sample_op]
    self.assertLen(labels[0], num_anchors)
    positives = np.sum(np.array(labels[0]) == 1)
    negatives = np.sum(np.array(labels[0]) == 0)
    self.assertEqual(positives, expected_positives)
    self.assertEqual(negatives, expected_negatives)

  @parameterized.parameters(
      # Single scale anchor.
      (5, 5, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80]]),
      # Multi scale anchor.
      (5, 6, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
      # Multi aspect ratio anchor.
      (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
  )
  def testAnchorGeneration(self, min_level, max_level, num_scales,
                           aspect_ratios, anchor_size, expected_boxes):
    image_size = [64, 64]
    anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                            anchor_size, image_size)
    boxes = anchors.boxes.numpy()
    self.assertEqual(expected_boxes, boxes.tolist())

  @parameterized.parameters(
      # Single scale anchor.
      (5, 5, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80]]),
      # Multi scale anchor.
      (5, 6, 1, [1.0], 2.0,
       [[-16, -16, 48, 48], [-16, 16, 48, 80],
        [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
      # Multi aspect ratio anchor.
      (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
  )
  def testAnchorGenerationWithImageSizeAsTensor(self, min_level, max_level,
                                                num_scales, aspect_ratios,
                                                anchor_size, expected_boxes):
    image_size = tf.constant([64, 64], tf.int32)
    anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                            anchor_size, image_size)
    boxes = anchors.boxes.numpy()
    self.assertEqual(expected_boxes, boxes.tolist())

  @parameterized.parameters(
      (3, 6, 2, [1.0], 2.0, False),
      (3, 6, 2, [1.0], 2.0, True),
  )
  def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios,
                       anchor_size, has_attribute):
    input_size = [512, 512]
    ground_truth_class_id = 2
    attribute_name = 'depth'
    ground_truth_depth = 3.0

    # The matched anchors are the anchors used as ground truth and the anchors
    # at the next octave scale on the same location.
    expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
    anchor_gen = anchor.build_anchor_generator(min_level, max_level,
                                               num_scales, aspect_ratios,
                                               anchor_size)
    anchor_boxes = anchor_gen(input_size)
    anchor_labeler = anchor.AnchorLabeler()

    # Uses the first anchors as ground truth. The ground truth should map to
    # two anchors with two intermediate scales at the same location.
    gt_boxes = anchor_boxes['3'][0:1, 0, 0:4]
    gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
    gt_attributes = {
        attribute_name: tf.constant([[ground_truth_depth]], dtype=tf.float32)
    } if has_attribute else {}

    (cls_targets, box_targets, att_targets, _,
     box_weights) = anchor_labeler.label_anchors(anchor_boxes, gt_boxes,
                                                 gt_classes, gt_attributes)

    for k, v in cls_targets.items():
      cls_targets[k] = v.numpy()
    for k, v in box_targets.items():
      box_targets[k] = v.numpy()
    box_weights = box_weights.numpy()

    anchor_locations = np.vstack(
        np.where(cls_targets[str(min_level)] > -1)).transpose()
    self.assertAllClose(expected_anchor_locations, anchor_locations)
    # Two anchor boxes on min_level got matched to the gt_boxes.
    self.assertAllClose(tf.reduce_sum(box_weights), 2)

    if has_attribute:
      self.assertIn(attribute_name, att_targets)
      for k, v in att_targets[attribute_name].items():
        att_targets[attribute_name][k] = v.numpy()
      anchor_locations = np.vstack(
          np.where(
              att_targets[attribute_name][str(min_level)] > 0.0)).transpose()
      self.assertAllClose(expected_anchor_locations, anchor_locations)
    else:
      self.assertEmpty(att_targets)

  @parameterized.parameters(
      (3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
      (3, 8, [1.], 3, 32, (512, 512)),
      (3, 3, [1.], 2, 4, (32, 32)),
  )
  def testEquivalentResult(self, min_level, max_level, aspect_ratios,
                           num_scales, anchor_size, image_size):
    anchor_gen = anchor.build_anchor_generator(
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)
    anchors = anchor_gen(image_size)
    expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
                                        aspect_ratios, anchor_size, image_size)
    expected_anchors = expected_anchor_gen.multilevel_boxes
    for k in expected_anchors.keys():
      self.assertAllClose(expected_anchors[k], anchors[k])


if __name__ == '__main__':
  tf.test.main()
```
official/vision/ops/augment.py (new file · 0 → 100644)

```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Augmentation policies for enhanced image/video preprocessing.

AutoAugment Reference:
  - AutoAugment Reference: https://arxiv.org/abs/1805.09501
  - AutoAugment for Object Detection Reference: https://arxiv.org/abs/1906.11172
RandAugment Reference: https://arxiv.org/abs/1909.13719
RandomErasing Reference: https://arxiv.org/abs/1708.04896
MixupAndCutmix:
  - Mixup: https://arxiv.org/abs/1710.09412
  - Cutmix: https://arxiv.org/abs/1905.04899

RandomErasing, Mixup and Cutmix are inspired by
https://github.com/rwightman/pytorch-image-models
"""

import inspect
import math
from typing import Any, List, Iterable, Optional, Text, Tuple

from keras.layers.preprocessing import image_preprocessing as image_ops
import numpy as np
import tensorflow as tf


# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.


def to_4d(image: tf.Tensor) -> tf.Tensor:
  """Converts an input Tensor to 4 dimensions.

  4D image => [N, H, W, C] or [N, C, H, W]
  3D image => [1, H, W, C] or [1, C, H, W]
  2D image => [1, H, W, 1]

  Args:
    image: The 2/3/4D input tensor.

  Returns:
    A 4D image tensor.

  Raises:
    `TypeError` if `image` is not a 2/3/4D tensor.
  """
  shape = tf.shape(image)
  original_rank = tf.rank(image)
  left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32)
  right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32)
  new_shape = tf.concat(
      [
          tf.ones(shape=left_pad, dtype=tf.int32),
          shape,
          tf.ones(shape=right_pad, dtype=tf.int32),
      ],
      axis=0,
  )
  return tf.reshape(image, new_shape)


def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor:
  """Converts a 4D image back to `ndims` rank."""
  shape = tf.shape(image)
  begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32)
  end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32)
  new_shape = shape[begin:end]
  return tf.reshape(image, new_shape)
```
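`to_4d` and `from_4d` pair up so that per-image ops can be written once against a batched `[N, H, W, C]` layout. A quick round-trip sketch (not part of the commit), using only the two functions above:

```python
import tensorflow as tf

img = tf.zeros([32, 32, 3], dtype=tf.uint8)
ndims = tf.rank(img)                 # 3
batched = to_4d(img)                 # shape [1, 32, 32, 3]
restored = from_4d(batched, ndims)   # shape [32, 32, 3] again
```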
```python
def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor:
  """Converts translations to a projective transform.

  The translation matrix looks like this:
    [[1 0 -dx]
     [0 1 -dy]
     [0 0 1]]

  Args:
    translations: The 2-element list representing [dx, dy], or a matrix of
      2-element lists representing [dx dy] to translate for each image. The
      shape must be static.

  Returns:
    The transformation matrix of shape (num_images, 8).

  Raises:
    `TypeError` if
      - the shape of `translations` is not known or
      - the shape of `translations` is not rank 1 or 2.
  """
  translations = tf.convert_to_tensor(translations, dtype=tf.float32)
  if translations.get_shape().ndims is None:
    raise TypeError('translations rank must be statically known')
  elif len(translations.get_shape()) == 1:
    translations = translations[None]
  elif len(translations.get_shape()) != 2:
    raise TypeError('translations should have rank 1 or 2.')
  num_translations = tf.shape(translations)[0]

  return tf.concat(
      values=[
          tf.ones((num_translations, 1), tf.dtypes.float32),
          tf.zeros((num_translations, 1), tf.dtypes.float32),
          -translations[:, 0, None],
          tf.zeros((num_translations, 1), tf.dtypes.float32),
          tf.ones((num_translations, 1), tf.dtypes.float32),
          -translations[:, 1, None],
          tf.zeros((num_translations, 2), tf.dtypes.float32),
      ],
      axis=1,
  )
```
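The eight returned values are the first two rows of the 3x3 matrix in the docstring, flattened row-major; the fixed bottom row `[0 0 1]` is implied. A tiny check (not part of the commit):

```python
# For a single translation [dx=2, dy=3] the flattened projective transform is
# [1, 0, -dx, 0, 1, -dy, 0, 0].
print(_convert_translation_to_transform([2.0, 3.0]).numpy())
# [[ 1.  0. -2.  0.  1. -3.  0.  0.]]
```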
```python
def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor,
                                 image_height: tf.Tensor) -> tf.Tensor:
  """Converts an angle or angles to a projective transform.

  Args:
    angles: A scalar angle to rotate all images by, or a vector of angles to
      rotate a batch of images by.
    image_width: The width of the image(s) to be transformed.
    image_height: The height of the image(s) to be transformed.

  Returns:
    A tensor of shape (num_images, 8).

  Raises:
    `TypeError` if `angles` is not rank 0 or 1.
  """
  angles = tf.convert_to_tensor(angles, dtype=tf.float32)
  if len(angles.get_shape()) == 0:  # pylint:disable=g-explicit-length-test
    angles = angles[None]
  elif len(angles.get_shape()) != 1:
    raise TypeError('Angles should have a rank 0 or 1.')
  x_offset = ((image_width - 1) -
              (tf.math.cos(angles) * (image_width - 1) -
               tf.math.sin(angles) * (image_height - 1))) / 2.0
  y_offset = ((image_height - 1) -
              (tf.math.sin(angles) * (image_width - 1) +
               tf.math.cos(angles) * (image_height - 1))) / 2.0
  num_angles = tf.shape(angles)[0]
  return tf.concat(
      values=[
          tf.math.cos(angles)[:, None],
          -tf.math.sin(angles)[:, None],
          x_offset[:, None],
          tf.math.sin(angles)[:, None],
          tf.math.cos(angles)[:, None],
          y_offset[:, None],
          tf.zeros((num_angles, 2), tf.dtypes.float32),
      ],
      axis=1,
  )


def transform(image: tf.Tensor, transforms) -> tf.Tensor:
  """Prepares input data for `image_ops.transform`."""
  original_ndims = tf.rank(image)
  transforms = tf.convert_to_tensor(transforms, dtype=tf.float32)
  if transforms.shape.rank == 1:
    transforms = transforms[None]
  image = to_4d(image)
  image = image_ops.transform(
      images=image, transforms=transforms, interpolation='nearest')
  return from_4d(image, original_ndims)


def translate(image: tf.Tensor, translations) -> tf.Tensor:
  """Translates image(s) by provided vectors.

  Args:
    image: An image Tensor of type uint8.
    translations: A vector or matrix representing [dx dy].

  Returns:
    The translated version of the image.
  """
  transforms = _convert_translation_to_transform(translations)
  return transform(image, transforms=transforms)


def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor:
  """Rotates the image by degrees either clockwise or counterclockwise.

  Args:
    image: An image Tensor of type uint8.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it will
      be rotated counterclockwise.

  Returns:
    The rotated version of image.
  """
  # Convert from degrees to radians.
  degrees_to_radians = math.pi / 180.0
  radians = tf.cast(degrees * degrees_to_radians, tf.float32)

  original_ndims = tf.rank(image)
  image = to_4d(image)

  image_height = tf.cast(tf.shape(image)[1], tf.float32)
  image_width = tf.cast(tf.shape(image)[2], tf.float32)
  transforms = _convert_angles_to_transform(
      angles=radians, image_width=image_width, image_height=image_height)
  # In practice, we should randomize the rotation degrees by flipping
  # it negatively half the time, but that's done on 'degrees' outside
  # of the function.
  image = transform(image, transforms=transforms)
  return from_4d(image, original_ndims)


def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
  """Blend image1 and image2 using 'factor'.

  Factor can be above 0.0. A value of 0.0 means only image1 is used.
  A value of 1.0 means only image2 is used. A value between 0.0 and
  1.0 means we linearly interpolate the pixel values between the two
  images. A value greater than 1.0 "extrapolates" the difference
  between the two pixel values, and we clip the results to values
  between 0 and 255.

  Args:
    image1: An image Tensor of type uint8.
    image2: An image Tensor of type uint8.
    factor: A floating point value above 0.0.

  Returns:
    A blended image Tensor of type uint8.
  """
  if factor == 0.0:
    return tf.convert_to_tensor(image1)
  if factor == 1.0:
    return tf.convert_to_tensor(image2)

  image1 = tf.cast(image1, tf.float32)
  image2 = tf.cast(image2, tf.float32)

  difference = image2 - image1
  scaled = factor * difference

  # Do addition in float.
  temp = tf.cast(image1, tf.float32) + scaled

  # Interpolate
  if factor > 0.0 and factor < 1.0:
    # Interpolation means we always stay within 0 and 255.
    return tf.cast(temp, tf.uint8)

  # Extrapolate:
  #
  # We need to clip and then cast.
  return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8)
```
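A small sketch of the blend semantics (not part of the commit): factor 0.0 returns `image1`, 1.0 returns `image2`, values in between interpolate, and values above 1.0 extrapolate then clip.

```python
a = tf.fill([2, 2, 3], tf.constant(100, tf.uint8))
b = tf.fill([2, 2, 3], tf.constant(200, tf.uint8))
print(blend(a, b, 0.5).numpy()[0, 0, 0])  # 150, the midpoint
print(blend(a, b, 2.0).numpy()[0, 0, 0])  # 255: 100 + 2*100 = 300, clipped
```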
```python
def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor:
  """Apply cutout (https://arxiv.org/abs/1708.04552) to image.

  This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
  a random location within `image`. The pixel values filled in will be of the
  value `replace`. The location where the mask will be applied is randomly
  chosen uniformly over the whole image.

  Args:
    image: An image Tensor of type uint8.
    pad_size: Specifies how big the zero mask that will be generated is that is
      applied to the image. The mask will be of size (2*pad_size x 2*pad_size).
    replace: What pixel value to fill in the image in the area that has the
      cutout mask applied to it.

  Returns:
    An image Tensor that is of type uint8.
  """
  if image.shape.rank not in [3, 4]:
    raise ValueError('Bad image rank: {}'.format(image.shape.rank))

  if image.shape.rank == 4:
    return cutout_video(image, replace=replace)

  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]

  # Sample the center location in the image where the zero mask will be
  # applied.
  cutout_center_height = tf.random.uniform(
      shape=[], minval=0, maxval=image_height, dtype=tf.int32)
  cutout_center_width = tf.random.uniform(
      shape=[], minval=0, maxval=image_width, dtype=tf.int32)

  image = _fill_rectangle(image, cutout_center_width, cutout_center_height,
                          pad_size, pad_size, replace)

  return image
```
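A minimal usage sketch (not part of the commit): cut a `2*pad_size x 2*pad_size` gray square out of a random location of a single uint8 image.

```python
img = tf.zeros([64, 64, 3], dtype=tf.uint8)
cut = cutout(img, pad_size=8, replace=128)  # a random 16x16 patch becomes 128
```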
```python
def _fill_rectangle(image,
                    center_width,
                    center_height,
                    half_width,
                    half_height,
                    replace=None):
  """Fill blank area."""
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]

  lower_pad = tf.maximum(0, center_height - half_height)
  upper_pad = tf.maximum(0, image_height - center_height - half_height)
  left_pad = tf.maximum(0, center_width - half_width)
  right_pad = tf.maximum(0, image_width - center_width - half_width)

  cutout_shape = [
      image_height - (lower_pad + upper_pad),
      image_width - (left_pad + right_pad)
  ]
  padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
  mask = tf.pad(
      tf.zeros(cutout_shape, dtype=image.dtype),
      padding_dims,
      constant_values=1)
  mask = tf.expand_dims(mask, -1)
  mask = tf.tile(mask, [1, 1, 3])

  if replace is None:
    fill = tf.random.normal(tf.shape(image), dtype=image.dtype)
  elif isinstance(replace, tf.Tensor):
    fill = replace
  else:
    fill = tf.ones_like(image, dtype=image.dtype) * replace
  image = tf.where(tf.equal(mask, 0), fill, image)

  return image


def cutout_video(image: tf.Tensor, replace: int = 0) -> tf.Tensor:
  """Apply cutout (https://arxiv.org/abs/1708.04552) to a video.

  This operation applies a random-size 3D mask of zeros to a random location
  within `image`. The pixel values filled in will be of the value `replace`.
  The location where the mask will be applied is randomly chosen uniformly
  over the whole image. The size of the mask is randomly sampled uniformly
  from [0.25*height, 0.5*height], [0.25*width, 0.5*width], and
  [1, 0.25*depth], which represent the height, width, and number of frames of
  the input video tensor respectively.

  Args:
    image: A video Tensor of type uint8.
    replace: What pixel value to fill in the image in the area that has the
      cutout mask applied to it.

  Returns:
    A video Tensor that is of type uint8.
  """
  image_depth = tf.shape(image)[0]
  image_height = tf.shape(image)[1]
  image_width = tf.shape(image)[2]

  # Sample the center location in the image where the zero mask will be
  # applied.
  cutout_center_height = tf.random.uniform(
      shape=[], minval=0, maxval=image_height, dtype=tf.int32)
  cutout_center_width = tf.random.uniform(
      shape=[], minval=0, maxval=image_width, dtype=tf.int32)
  cutout_center_depth = tf.random.uniform(
      shape=[], minval=0, maxval=image_depth, dtype=tf.int32)

  pad_size_height = tf.random.uniform(
      shape=[],
      minval=tf.maximum(1, tf.cast(image_height / 4, tf.int32)),
      maxval=tf.maximum(2, tf.cast(image_height / 2, tf.int32)),
      dtype=tf.int32)
  pad_size_width = tf.random.uniform(
      shape=[],
      minval=tf.maximum(1, tf.cast(image_width / 4, tf.int32)),
      maxval=tf.maximum(2, tf.cast(image_width / 2, tf.int32)),
      dtype=tf.int32)
  pad_size_depth = tf.random.uniform(
      shape=[],
      minval=1,
      maxval=tf.maximum(2, tf.cast(image_depth / 4, tf.int32)),
      dtype=tf.int32)

  lower_pad = tf.maximum(0, cutout_center_height - pad_size_height)
  upper_pad = tf.maximum(
      0, image_height - cutout_center_height - pad_size_height)
  left_pad = tf.maximum(0, cutout_center_width - pad_size_width)
  right_pad = tf.maximum(
      0, image_width - cutout_center_width - pad_size_width)
  back_pad = tf.maximum(0, cutout_center_depth - pad_size_depth)
  forward_pad = tf.maximum(
      0, image_depth - cutout_center_depth - pad_size_depth)

  cutout_shape = [
      image_depth - (back_pad + forward_pad),
      image_height - (lower_pad + upper_pad),
      image_width - (left_pad + right_pad),
  ]
  padding_dims = [[back_pad, forward_pad], [lower_pad, upper_pad],
                  [left_pad, right_pad]]
  mask = tf.pad(
      tf.zeros(cutout_shape, dtype=image.dtype),
      padding_dims,
      constant_values=1)
  mask = tf.expand_dims(mask, -1)
  mask = tf.tile(mask, [1, 1, 1, 3])
  image = tf.where(
      tf.equal(mask, 0),
      tf.ones_like(image, dtype=image.dtype) * replace, image)
  return image


def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor:
  """Solarize the input image(s)."""
  # For each pixel in the image, select the pixel
  # if the value is less than the threshold.
  # Otherwise, subtract 255 from the pixel.
  return tf.where(image < threshold, image, 255 - image)


def solarize_add(image: tf.Tensor,
                 addition: int = 0,
                 threshold: int = 128) -> tf.Tensor:
  """Additive solarize the input image(s)."""
  # For each pixel in the image less than threshold
  # we add 'addition' amount to it and then clip the
  # pixel value to be between 0 and 255. The value
  # of 'addition' is between -128 and 128.
  added_image = tf.cast(image, tf.int64) + addition
  added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8)
  return tf.where(image < threshold, added_image, image)
```
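A quick sketch of the solarize rule (not part of the commit): pixels at or above the threshold are inverted to `255 - v`, pixels below pass through unchanged.

```python
px = tf.constant([[0, 100, 128, 200, 255]], dtype=tf.uint8)
print(solarize(px).numpy())  # [[  0 100 127  55   0]]
```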
```python
def color(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Color."""
  degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
  return blend(degenerate, image, factor)


def contrast(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Contrast."""
  degenerate = tf.image.rgb_to_grayscale(image)
  # Cast before calling tf.histogram.
  degenerate = tf.cast(degenerate, tf.int32)

  # Compute the grayscale histogram, then compute the mean pixel value,
  # and create a constant image size of that value. Use that as the
  # blending degenerate target of the original image.
  hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256)
  mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0
  degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean
  degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
  degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
  return blend(degenerate, image, factor)


def brightness(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Equivalent of PIL Brightness."""
  degenerate = tf.zeros_like(image)
  return blend(degenerate, image, factor)


def posterize(image: tf.Tensor, bits: int) -> tf.Tensor:
  """Equivalent of PIL Posterize."""
  shift = 8 - bits
  return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift)
```
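A worked example of the posterize bit math (not part of the commit): keeping `bits` bits means shifting right then left by `8 - bits`, which zeroes the low-order bits of each uint8 value.

```python
px = tf.constant([[37, 200, 255]], dtype=tf.uint8)
print(posterize(px, bits=2).numpy())  # [[  0 192 192]]
# 37 >> 6 = 0 -> 0 << 6 = 0; 200 >> 6 = 3 -> 3 << 6 = 192; same for 255.
```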
def
wrapped_rotate
(
image
:
tf
.
Tensor
,
degrees
:
float
,
replace
:
int
)
->
tf
.
Tensor
:
"""Applies rotation with wrap/unwrap."""
image
=
rotate
(
wrap
(
image
),
degrees
=
degrees
)
return
unwrap
(
image
,
replace
)
def
translate_x
(
image
:
tf
.
Tensor
,
pixels
:
int
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Translate in X dimension."""
image
=
translate
(
wrap
(
image
),
[
-
pixels
,
0
])
return
unwrap
(
image
,
replace
)
def
translate_y
(
image
:
tf
.
Tensor
,
pixels
:
int
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Translate in Y dimension."""
image
=
translate
(
wrap
(
image
),
[
0
,
-
pixels
])
return
unwrap
(
image
,
replace
)
def
shear_x
(
image
:
tf
.
Tensor
,
level
:
float
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Shearing in X dimension."""
# Shear parallel to x axis is a projective transform
# with a matrix form of:
# [1 level
# 0 1].
image
=
transform
(
image
=
wrap
(
image
),
transforms
=
[
1.
,
level
,
0.
,
0.
,
1.
,
0.
,
0.
,
0.
])
return
unwrap
(
image
,
replace
)
def
shear_y
(
image
:
tf
.
Tensor
,
level
:
float
,
replace
:
int
)
->
tf
.
Tensor
:
"""Equivalent of PIL Shearing in Y dimension."""
# Shear parallel to y axis is a projective transform
# with a matrix form of:
# [1 0
# level 1].
image
=
transform
(
image
=
wrap
(
image
),
transforms
=
[
1.
,
0.
,
0.
,
level
,
1.
,
0.
,
0.
,
0.
])
return
unwrap
(
image
,
replace
)


def autocontrast(image: tf.Tensor) -> tf.Tensor:
  """Implements Autocontrast function from PIL using TF ops.

  Args:
    image: A 3D uint8 tensor.

  Returns:
    The image after it has had autocontrast applied to it and will be of type
    uint8.
  """

  def scale_channel(image: tf.Tensor) -> tf.Tensor:
    """Scale the 2D image using the autocontrast rule."""
    # A possibly cheaper version can be done using cumsum/unique_with_counts
    # over the histogram values, rather than iterating over the entire image,
    # to compute mins and maxes.
    lo = tf.cast(tf.reduce_min(image), tf.float32)
    hi = tf.cast(tf.reduce_max(image), tf.float32)

    # Scale the image, making the lowest value 0 and the highest value 255.
    def scale_values(im):
      scale = 255.0 / (hi - lo)
      offset = -lo * scale
      im = tf.cast(im, tf.float32) * scale + offset
      im = tf.clip_by_value(im, 0.0, 255.0)
      return tf.cast(im, tf.uint8)

    result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image)
    return result

  # Assumes RGB for now. Scales each channel independently
  # and then stacks the result.
  s1 = scale_channel(image[..., 0])
  s2 = scale_channel(image[..., 1])
  s3 = scale_channel(image[..., 2])
  image = tf.stack([s1, s2, s3], -1)
  return image
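

# Illustrative sketch (an assumed example, not part of the upstream file):
# autocontrast linearly stretches each channel so its minimum maps to 0 and
# its maximum to 255. A channel spanning [50, 150] ends up spanning [0, 255].
def _autocontrast_example():
  low = tf.fill([1, 1, 3], tf.constant(50, tf.uint8))
  high = tf.fill([1, 1, 3], tf.constant(150, tf.uint8))
  image = tf.concat([low, high], axis=0)  # Shape [2, 1, 3].
  return autocontrast(image)  # Rows become 0 and 255 in every channel.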


def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor:
  """Implements Sharpness function from PIL using TF ops."""
  orig_image = image
  image = tf.cast(image, tf.float32)
  # Make image 4D for conv operation.
  image = tf.expand_dims(image, 0)
  # SMOOTH PIL Kernel.
  if orig_image.shape.rank == 3:
    kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
                         dtype=tf.float32,
                         shape=[3, 3, 1, 1]) / 13.
    # Tile across channel dimension.
    kernel = tf.tile(kernel, [1, 1, 3, 1])
    strides = [1, 1, 1, 1]
    degenerate = tf.nn.depthwise_conv2d(
        image, kernel, strides, padding='VALID', dilations=[1, 1])
  elif orig_image.shape.rank == 4:
    kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
                         dtype=tf.float32,
                         shape=[1, 3, 3, 1, 1]) / 13.
    strides = [1, 1, 1, 1, 1]
    # Run the kernel across each channel.
    channels = tf.split(image, 3, axis=-1)
    degenerates = [
        tf.nn.conv3d(channel, kernel, strides, padding='VALID',
                     dilations=[1, 1, 1, 1, 1])
        for channel in channels
    ]
    degenerate = tf.concat(degenerates, -1)
  else:
    raise ValueError('Bad image rank: {}'.format(image.shape.rank))
  degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
  degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])

  # For the borders of the resulting image, fill in the values of the
  # original image.
  mask = tf.ones_like(degenerate)
  paddings = [[0, 0]] * (orig_image.shape.rank - 3)
  padded_mask = tf.pad(mask, paddings + [[1, 1], [1, 1], [0, 0]])
  padded_degenerate = tf.pad(degenerate, paddings + [[1, 1], [1, 1], [0, 0]])
  result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)

  # Blend the final result.
  return blend(result, orig_image, factor)


def equalize(image: tf.Tensor) -> tf.Tensor:
  """Implements Equalize function from PIL using TF ops."""

  def scale_channel(im, c):
    """Scale the data in the channel to implement equalize."""
    im = tf.cast(im[..., c], tf.int32)
    # Compute the histogram of the image channel.
    histo = tf.histogram_fixed_width(im, [0, 255], nbins=256)

    # For the purposes of computing the step, filter out the zero values.
    nonzero = tf.where(tf.not_equal(histo, 0))
    nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1])
    step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255

    def build_lut(histo, step):
      # Compute the cumulative sum, shifting by step // 2
      # and then normalization by step.
      lut = (tf.cumsum(histo) + (step // 2)) // step
      # Shift lut, prepending with 0.
      lut = tf.concat([[0], lut[:-1]], 0)
      # Clip the counts to be in range. This is done
      # in the C code for image.point.
      return tf.clip_by_value(lut, 0, 255)

    # If step is zero, return the original image. Otherwise, build
    # lut from the full histogram and step and then index from it.
    result = tf.cond(
        tf.equal(step, 0), lambda: im,
        lambda: tf.gather(build_lut(histo, step), im))

    return tf.cast(result, tf.uint8)

  # Assumes RGB for now. Scales each channel independently
  # and then stacks the result.
  s1 = scale_channel(image, 0)
  s2 = scale_channel(image, 1)
  s3 = scale_channel(image, 2)
  image = tf.stack([s1, s2, s3], -1)
  return image
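

# Illustrative sketch (an assumed example, not part of the upstream file):
# equalize builds a per-channel lookup table from the cumulative histogram.
# A constant image yields step == 0, so it is returned unchanged.
def _equalize_example():
  image = tf.fill([4, 4, 3], tf.constant(7, tf.uint8))
  return equalize(image)  # Degenerate histogram; output equals input.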


def invert(image: tf.Tensor) -> tf.Tensor:
  """Inverts the image pixels."""
  image = tf.convert_to_tensor(image)
  return 255 - image


def wrap(image: tf.Tensor) -> tf.Tensor:
  """Returns 'image' with an extra channel set to all 1s."""
  shape = tf.shape(image)
  extended_channel = tf.expand_dims(tf.ones(shape[:-1], image.dtype), -1)
  extended = tf.concat([image, extended_channel], axis=-1)
  return extended


def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor:
  """Unwraps an image produced by wrap.

  Where there is a 0 in the last channel for every spatial position,
  the rest of the three channels in that spatial dimension are grayed
  (set to 128). Operations like translate and shear on a wrapped
  Tensor will leave 0s in empty locations. Some transformations look
  at the intensity of values to do preprocessing, and we want these
  empty pixels to assume the 'average' value, rather than pure black.

  Args:
    image: A 3D Image Tensor with 4 channels.
    replace: A one or three value 1D tensor to fill empty pixels.

  Returns:
    image: A 3D image Tensor with 3 channels.
  """
  image_shape = tf.shape(image)
  # Flatten the spatial dimensions.
  flattened_image = tf.reshape(image, [-1, image_shape[-1]])

  # Find all pixels where the last channel is zero.
  alpha_channel = tf.expand_dims(flattened_image[..., 3], axis=-1)

  replace = tf.concat([replace, tf.ones([1], image.dtype)], 0)

  # Where they are zero, fill them in with 'replace'.
  flattened_image = tf.where(
      tf.equal(alpha_channel, 0),
      tf.ones_like(flattened_image, dtype=image.dtype) * replace,
      flattened_image)

  image = tf.reshape(flattened_image, image_shape)
  image = tf.slice(
      image, [0] * image.shape.rank,
      tf.concat([image_shape[:-1], [3]], -1))
  return image
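

# Illustrative sketch (an assumed example, not part of the upstream file):
# a wrap/unwrap round trip. `wrap` appends an all-ones marker channel;
# `unwrap` drops it and fills any position whose marker became 0 with
# `replace`. With no intervening transform, nothing is replaced.
def _wrap_unwrap_example():
  image = tf.zeros([2, 2, 3], dtype=tf.uint8)
  wrapped = wrap(image)            # Shape [2, 2, 4], last channel all 1s.
  replace = tf.constant([128, 128, 128], tf.uint8)
  return unwrap(wrapped, replace)  # Shape [2, 2, 3]; unchanged here.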


def _scale_bbox_only_op_probability(prob):
  """Reduce the probability of the bbox-only operation.

  Probability is reduced so that we do not distort the content of too many
  bounding boxes that are close to each other. The value of 3.0 was a
  hyperparameter chosen when designing the autoaugment algorithm that we
  found empirically to work well.

  Args:
    prob: Float that is the probability of applying the bbox-only operation.

  Returns:
    Reduced probability.
  """
  return prob / 3.0


def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
  """Applies augmentation_func to the subsection of image indicated by bbox.

  Args:
    image: 3D uint8 Tensor.
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    augmentation_func: Augmentation function that will be applied to the
      subsection of image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A modified version of image, where the bbox location in the image will
    have augmentation_func applied to it.
  """
  image_height = tf.cast(tf.shape(image)[0], tf.float32)
  image_width = tf.cast(tf.shape(image)[1], tf.float32)
  min_y = tf.cast(image_height * bbox[0], tf.int32)
  min_x = tf.cast(image_width * bbox[1], tf.int32)
  max_y = tf.cast(image_height * bbox[2], tf.int32)
  max_x = tf.cast(image_width * bbox[3], tf.int32)
  image_height = tf.cast(image_height, tf.int32)
  image_width = tf.cast(image_width, tf.int32)

  # Clip to be sure the max values do not fall out of range.
  max_y = tf.minimum(max_y, image_height - 1)
  max_x = tf.minimum(max_x, image_width - 1)

  # Get the sub-tensor that is the image within the bounding box region.
  bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]

  # Apply the augmentation function to the bbox portion of the image.
  augmented_bbox_content = augmentation_func(bbox_content, *args)

  # Pad the augmented_bbox_content and the mask to match the shape of
  # original image.
  augmented_bbox_content = tf.pad(
      augmented_bbox_content,
      [[min_y, (image_height - 1) - max_y],
       [min_x, (image_width - 1) - max_x], [0, 0]])

  # Create a mask that will be used to zero out a part of the original image.
  mask_tensor = tf.zeros_like(bbox_content)

  mask_tensor = tf.pad(
      mask_tensor,
      [[min_y, (image_height - 1) - max_y],
       [min_x, (image_width - 1) - max_x], [0, 0]],
      constant_values=1)

  # Replace the old bbox content with the new augmented content.
  image = image * mask_tensor + augmented_bbox_content
  return image


def _concat_bbox(bbox, bboxes):
  """Helper function that concatenates bbox to bboxes along the first dimension."""

  # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
  # we discard bboxes and start the bboxes Tensor with the current bbox.
  bboxes_sum_check = tf.reduce_sum(bboxes)
  bbox = tf.expand_dims(bbox, 0)
  # This check will be true when it is an _INVALID_BOX.
  bboxes = tf.cond(
      tf.equal(bboxes_sum_check, -4.0), lambda: bbox,
      lambda: tf.concat([bboxes, bbox], 0))
  return bboxes


def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
                                     augmentation_func, func_changes_bbox,
                                     *args):
  """Applies _apply_bbox_augmentation with probability prob.

  Args:
    image: 3D uint8 Tensor.
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    new_bboxes: 2D Tensor that is a list of the bboxes in the image after they
      have been altered by aug_func. These will only be changed when
      func_changes_bbox is set to true. Each bbox has 4 elements
      (min_y, min_x, max_y, max_x) of type float that are the normalized
      bbox coordinates between 0 and 1.
    prob: Float that is the probability of applying _apply_bbox_augmentation.
    augmentation_func: Augmentation function that will be applied to the
      subsection of image.
    func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
      to image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A tuple. First element is a modified version of image, where the bbox
    location in the image will have augmentation_func applied to it if it is
    chosen to be called with probability `prob`. The second element is a
    Tensor of Tensors of length 4 that will contain the altered bbox after
    applying augmentation_func.
  """
  should_apply_op = tf.cast(
      tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
  if func_changes_bbox:
    augmented_image, bbox = tf.cond(
        should_apply_op,
        lambda: augmentation_func(image, bbox, *args),
        lambda: (image, bbox))
  else:
    augmented_image = tf.cond(
        should_apply_op,
        lambda: _apply_bbox_augmentation(image, bbox, augmentation_func,
                                         *args),
        lambda: image)
  new_bboxes = _concat_bbox(bbox, new_bboxes)
  return augmented_image, new_bboxes


def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func,
                                           func_changes_bbox, *args):
  """Checks to be sure num bboxes > 0 before calling inner function."""
  num_bboxes = tf.shape(bboxes)[0]
  image, bboxes = tf.cond(
      tf.equal(num_bboxes, 0),
      lambda: (image, bboxes),
      # pylint:disable=g-long-lambda
      lambda: _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
                                             func_changes_bbox, *args))
  # pylint:enable=g-long-lambda
  return image, bboxes


# Represents an invalid bounding box that is used for checking for padding
# lists of bounding box coordinates for a few augmentation operations.
_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]]


def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
                                   func_changes_bbox, *args):
  """Applies aug_func to the image for each bbox in bboxes.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float.
    prob: Float that is the probability of applying aug_func to a specific
      bounding box within the image.
    aug_func: Augmentation function that will be applied to the
      subsections of image indicated by the bbox values in bboxes.
    func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
      to image.
    *args: Additional parameters that will be passed into augmentation_func
      when it is called.

  Returns:
    A modified version of image, where each bbox location in the image will
    have augmentation_func applied to it if it is chosen to be called with
    probability prob independently across all bboxes. Also the final
    bboxes are returned that will be unchanged if func_changes_bbox is set to
    false and if true, the new altered ones will be returned.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  # Will keep track of the new altered bboxes after aug_func is repeatedly
  # applied. The -1 values are a dummy value and this first Tensor will be
  # removed upon appending the first real bbox.
  new_bboxes = tf.constant(_INVALID_BOX)

  # If the bboxes are empty, then just give it _INVALID_BOX. The result
  # will be thrown away.
  bboxes = tf.cond(
      tf.equal(tf.size(bboxes), 0),
      lambda: tf.constant(_INVALID_BOX),
      lambda: bboxes)

  bboxes = tf.ensure_shape(bboxes, (None, 4))

  # pylint:disable=g-long-lambda
  wrapped_aug_func = (
      lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(
          _image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args))
  # pylint:enable=g-long-lambda

  # Setup the while_loop.
  num_bboxes = tf.shape(bboxes)[0]  # We loop until we go over all bboxes.
  idx = tf.constant(0)  # Counter for the while loop.

  # Conditional function when to end the loop once we go over all bboxes.
  # images_and_bboxes contain (_image, _new_bboxes).
  cond = lambda _idx, _images_and_bboxes: tf.less(_idx, num_bboxes)

  # Shuffle the bboxes so that the augmentation order is not deterministic if
  # we are not changing the bboxes with aug_func.
  if not func_changes_bbox:
    loop_bboxes = tf.random.shuffle(bboxes)
  else:
    loop_bboxes = bboxes

  # Main function of while_loop where we repeatedly apply augmentation on the
  # bboxes in the image.
  # pylint:disable=g-long-lambda
  body = lambda _idx, _images_and_bboxes: [
      _idx + 1,
      wrapped_aug_func(_images_and_bboxes[0], loop_bboxes[_idx],
                       _images_and_bboxes[1])
  ]
  # pylint:enable=g-long-lambda

  _, (image, new_bboxes) = tf.while_loop(
      cond, body, [idx, (image, new_bboxes)],
      shape_invariants=[
          idx.get_shape(),
          (image.get_shape(), tf.TensorShape([None, 4]))
      ])

  # Either return the altered bboxes or the original ones depending on if
  # we altered them in any way.
  if func_changes_bbox:
    final_bboxes = new_bboxes
  else:
    final_bboxes = bboxes

  return image, final_bboxes


def _clip_bbox(min_y, min_x, max_y, max_x):
  """Clip bounding box coordinates between 0 and 1.

  Args:
    min_y: Normalized bbox coordinate of type float between 0 and 1.
    min_x: Normalized bbox coordinate of type float between 0 and 1.
    max_y: Normalized bbox coordinate of type float between 0 and 1.
    max_x: Normalized bbox coordinate of type float between 0 and 1.

  Returns:
    Clipped coordinate values between 0 and 1.
  """
  min_y = tf.clip_by_value(min_y, 0.0, 1.0)
  min_x = tf.clip_by_value(min_x, 0.0, 1.0)
  max_y = tf.clip_by_value(max_y, 0.0, 1.0)
  max_x = tf.clip_by_value(max_x, 0.0, 1.0)
  return min_y, min_x, max_y, max_x


def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05):
  """Adjusts bbox coordinates to make sure the area is > 0.

  Args:
    min_y: Normalized bbox coordinate of type float between 0 and 1.
    min_x: Normalized bbox coordinate of type float between 0 and 1.
    max_y: Normalized bbox coordinate of type float between 0 and 1.
    max_x: Normalized bbox coordinate of type float between 0 and 1.
    delta: Float, this is used to create a gap of size 2 * delta between
      bbox min/max coordinates that are the same on the boundary.
      This prevents the bbox from having an area of zero.

  Returns:
    Tuple of new bbox coordinates between 0 and 1 that will now have a
    guaranteed area > 0.
  """
  height = max_y - min_y
  width = max_x - min_x

  def _adjust_bbox_boundaries(min_coord, max_coord):
    # Make sure max is never 0 and min is never 1.
    max_coord = tf.maximum(max_coord, 0.0 + delta)
    min_coord = tf.minimum(min_coord, 1.0 - delta)
    return min_coord, max_coord

  min_y, max_y = tf.cond(
      tf.equal(height, 0.0),
      lambda: _adjust_bbox_boundaries(min_y, max_y),
      lambda: (min_y, max_y))
  min_x, max_x = tf.cond(
      tf.equal(width, 0.0),
      lambda: _adjust_bbox_boundaries(min_x, max_x),
      lambda: (min_x, max_x))
  return min_y, min_x, max_y, max_x
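

# Illustrative sketch (an assumed example, not part of the upstream file):
# a box collapsed onto the y=0 boundary has zero height, so max_y gets
# pushed out to the default delta of 0.05; the nonzero width is untouched.
def _check_bbox_area_example():
  zero = tf.constant(0.0)
  # Returns approximately (0.0, 0.0, 0.05, 0.5).
  return _check_bbox_area(
      min_y=zero, min_x=zero, max_y=zero, max_x=tf.constant(0.5))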


def _rotate_bbox(bbox, image_height, image_width, degrees):
  """Rotates the bbox coordinates by degrees.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it will
      be rotated counterclockwise.

  Returns:
    A tensor of the same shape as bbox, but now with the rotated coordinates.
  """
  image_height, image_width = (
      tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32))

  # Convert from degrees to radians.
  degrees_to_radians = math.pi / 180.0
  radians = degrees * degrees_to_radians

  # Translate the bbox to the center of the image and turn the normalized 0-1
  # coordinates to absolute pixel locations.
  # Y coordinates are made negative as the y axis of images goes down with
  # increasing pixel values, so we negate to make sure x axis and y axis points
  # are in the traditionally positive direction.
  min_y = -tf.cast(image_height * (bbox[0] - 0.5), tf.int32)
  min_x = tf.cast(image_width * (bbox[1] - 0.5), tf.int32)
  max_y = -tf.cast(image_height * (bbox[2] - 0.5), tf.int32)
  max_x = tf.cast(image_width * (bbox[3] - 0.5), tf.int32)
  coordinates = tf.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x],
                          [max_y, max_x]])
  coordinates = tf.cast(coordinates, tf.float32)
  # Rotate the coordinates according to the rotation matrix clockwise if
  # radians is positive, else negative.
  rotation_matrix = tf.stack([[tf.cos(radians), tf.sin(radians)],
                              [-tf.sin(radians), tf.cos(radians)]])
  new_coords = tf.cast(
      tf.matmul(rotation_matrix, tf.transpose(coordinates)), tf.int32)

  # Find min/max values and convert them back to normalized 0-1 floats.
  min_y = -(tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) /
            image_height - 0.5)
  min_x = tf.cast(tf.reduce_min(new_coords[1, :]),
                  tf.float32) / image_width + 0.5
  max_y = -(tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) /
            image_height - 0.5)
  max_x = tf.cast(tf.reduce_max(new_coords[1, :]),
                  tf.float32) / image_width + 0.5

  # Clip the bboxes to be sure they fall between [0, 1].
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])


def rotate_with_bboxes(image, bboxes, degrees, replace):
  """Equivalent of PIL Rotate that rotates the image and bbox.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive the image will be rotated clockwise otherwise it will
      be rotated counterclockwise.
    replace: A one or three value 1D tensor to fill empty pixels.

  Returns:
    A tuple containing a 3D uint8 Tensor that will be the result of rotating
    image by degrees. The second element of the tuple is bboxes, where now
    the coordinates will be shifted to reflect the rotated image.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  # Rotate the image.
  image = wrapped_rotate(image, degrees, replace)

  # Convert bbox coordinates to pixel values.
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]
  # pylint:disable=g-long-lambda
  wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height,
                                                  image_width, degrees)
  # pylint:enable=g-long-lambda
  bboxes = tf.map_fn(wrapped_rotate_bbox, bboxes)
  return image, bboxes
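

# Illustrative usage sketch (an assumed example, not part of the upstream
# file): rotating an image and its normalized boxes together keeps the boxes
# aligned with the content. Relies on the module-level `rotate` helper
# defined earlier in this file.
def _rotate_with_bboxes_example():
  image = tf.zeros([64, 64, 3], dtype=tf.uint8)
  bboxes = tf.constant([[0.25, 0.25, 0.75, 0.75]], tf.float32)
  replace = tf.constant([128, 128, 128], tf.uint8)
  return rotate_with_bboxes(image, bboxes, degrees=15.0, replace=replace)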


def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
  """Shifts the bbox according to how the image was sheared.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    level: Float. How much to shear the image.
    shear_horizontal: If true then shear in X dimension else shear in
      the Y dimension.

  Returns:
    A tensor of the same shape as bbox, but now with the shifted coordinates.
  """
  image_height, image_width = (
      tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32))

  # Change bbox coordinates to be pixels.
  min_y = tf.cast(image_height * bbox[0], tf.int32)
  min_x = tf.cast(image_width * bbox[1], tf.int32)
  max_y = tf.cast(image_height * bbox[2], tf.int32)
  max_x = tf.cast(image_width * bbox[3], tf.int32)
  coordinates = tf.stack(
      [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])
  coordinates = tf.cast(coordinates, tf.float32)

  # Shear the coordinates according to the translation matrix.
  if shear_horizontal:
    translation_matrix = tf.stack([[1, 0], [-level, 1]])
  else:
    translation_matrix = tf.stack([[1, -level], [0, 1]])
  translation_matrix = tf.cast(translation_matrix, tf.float32)
  new_coords = tf.cast(
      tf.matmul(translation_matrix, tf.transpose(coordinates)), tf.int32)

  # Find min/max values and convert them back to floats.
  min_y = tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height
  min_x = tf.cast(tf.reduce_min(new_coords[1, :]), tf.float32) / image_width
  max_y = tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height
  max_x = tf.cast(tf.reduce_max(new_coords[1, :]), tf.float32) / image_width

  # Clip the bboxes to be sure they fall between [0, 1].
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])


def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
  """Applies Shear Transformation to the image and shifts the bboxes.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
      between [0, 1].
    level: Float. How much to shear the image. This value will be between
      -0.3 and 0.3.
    replace: A one or three value 1D tensor to fill empty pixels.
    shear_horizontal: Boolean. If true then shear in X dimension else shear in
      the Y dimension.

  Returns:
    A tuple containing a 3D uint8 Tensor that will be the result of shearing
    image by level. The second element of the tuple is bboxes, where now
    the coordinates will be shifted to reflect the sheared image.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  if shear_horizontal:
    image = shear_x(image, level, replace)
  else:
    image = shear_y(image, level, replace)

  # Convert bbox coordinates to pixel values.
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]
  # pylint:disable=g-long-lambda
  wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height,
                                                image_width, level,
                                                shear_horizontal)
  # pylint:enable=g-long-lambda
  bboxes = tf.map_fn(wrapped_shear_bbox, bboxes)
  return image, bboxes


def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
  """Shifts the bbox coordinates by pixels.

  Args:
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
      of type float that represents the normalized coordinates between 0 and 1.
    image_height: Int, height of the image.
    image_width: Int, width of the image.
    pixels: An int. How many pixels to shift the bbox.
    shift_horizontal: Boolean. If true then shift in X dimension else shift in
      Y dimension.

  Returns:
    A tensor of the same shape as bbox, but now with the shifted coordinates.
  """
  pixels = tf.cast(pixels, tf.int32)
  # Convert bbox to integer pixel locations.
  min_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[0], tf.int32)
  min_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[1], tf.int32)
  max_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[2], tf.int32)
  max_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[3], tf.int32)

  if shift_horizontal:
    min_x = tf.maximum(0, min_x - pixels)
    max_x = tf.minimum(image_width, max_x - pixels)
  else:
    min_y = tf.maximum(0, min_y - pixels)
    max_y = tf.minimum(image_height, max_y - pixels)

  # Convert bbox back to floats.
  min_y = tf.cast(min_y, tf.float32) / tf.cast(image_height, tf.float32)
  min_x = tf.cast(min_x, tf.float32) / tf.cast(image_width, tf.float32)
  max_y = tf.cast(max_y, tf.float32) / tf.cast(image_height, tf.float32)
  max_x = tf.cast(max_x, tf.float32) / tf.cast(image_width, tf.float32)

  # Clip the bboxes to be sure they fall between [0, 1].
  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
  return tf.stack([min_y, min_x, max_y, max_x])


def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
  """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox.

  Args:
    image: 3D uint8 Tensor.
    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
      between [0, 1].
    pixels: An int. How many pixels to shift the image and bboxes.
    replace: A one or three value 1D tensor to fill empty pixels.
    shift_horizontal: Boolean. If true then shift in X dimension else shift in
      Y dimension.

  Returns:
    A tuple containing a 3D uint8 Tensor that will be the result of translating
    image by pixels. The second element of the tuple is bboxes, where now
    the coordinates will be shifted to reflect the shifted image.

  Raises:
    ValueError: If applied to video.
  """
  if image.shape.rank == 4:
    raise ValueError('Image rank 4 is not supported')

  if shift_horizontal:
    image = translate_x(image, pixels, replace)
  else:
    image = translate_y(image, pixels, replace)

  # Convert bbox coordinates to pixel values.
  image_height = tf.shape(image)[0]
  image_width = tf.shape(image)[1]
  # pylint:disable=g-long-lambda
  wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height,
                                                image_width, pixels,
                                                shift_horizontal)
  # pylint:enable=g-long-lambda
  bboxes = tf.map_fn(wrapped_shift_bbox, bboxes)
  return image, bboxes


def translate_y_only_bboxes(image: tf.Tensor, bboxes: tf.Tensor, prob: float,
                            pixels: int, replace):
  """Apply translate_y to each bbox in the image with probability prob."""
  if bboxes.shape.rank == 4:
    raise ValueError('translate_y_only_bboxes does not support rank 4 boxes')

  func_changes_bbox = False
  prob = _scale_bbox_only_op_probability(prob)
  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob,
                                                translate_y, func_changes_bbox,
                                                pixels, replace)


def _randomly_negate_tensor(tensor):
  """With 50% prob turn the tensor negative."""
  should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool)
  final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor)
  return final_tensor


def _rotate_level_to_arg(level: float):
  level = (level / _MAX_LEVEL) * 30.
  level = _randomly_negate_tensor(level)
  return (level,)


def _shrink_level_to_arg(level: float):
  """Converts level to ratio by which we shrink the image content."""
  if level == 0:
    return (1.0,)  # if level is zero, do not shrink the image
  # Maximum shrinking ratio is 2.9.
  level = 2. / (_MAX_LEVEL / level) + 0.9
  return (level,)


def _enhance_level_to_arg(level: float):
  return ((level / _MAX_LEVEL) * 1.8 + 0.1,)


def _shear_level_to_arg(level: float):
  level = (level / _MAX_LEVEL) * 0.3
  # Flip level to negative with 50% chance.
  level = _randomly_negate_tensor(level)
  return (level,)


def _translate_level_to_arg(level: float, translate_const: float):
  level = (level / _MAX_LEVEL) * float(translate_const)
  # Flip level to negative with 50% chance.
  level = _randomly_negate_tensor(level)
  return (level,)


def _mult_to_arg(level: float, multiplier: float = 1.):
  return (int((level / _MAX_LEVEL) * multiplier),)
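

# Illustrative sketch (an assumed example, not part of the upstream file):
# the *_level_to_arg helpers map a policy magnitude in [0, _MAX_LEVEL]
# (with _MAX_LEVEL = 10 defined earlier in this file) onto each op's native
# argument range, e.g. Solarize uses a multiplier of 256.
def _level_to_arg_example():
  threshold = _mult_to_arg(level=5.0, multiplier=256)  # -> (128,)
  enhance = _enhance_level_to_arg(level=5.0)           # -> (1.0,)
  return threshold, enhance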


def _apply_func_with_prob(func: Any, image: tf.Tensor,
                          bboxes: Optional[tf.Tensor], args: Any, prob: float):
  """Apply `func` to image w/ `args` as input with probability `prob`."""
  assert isinstance(args, tuple)
  assert inspect.getfullargspec(func)[0][1] == 'bboxes'

  # Apply the function with probability `prob`.
  should_apply_op = tf.cast(
      tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
  augmented_image, augmented_bboxes = tf.cond(
      should_apply_op,
      lambda: func(image, bboxes, *args),
      lambda: (image, bboxes))
  return augmented_image, augmented_bboxes


def select_and_apply_random_policy(policies: Any,
                                   image: tf.Tensor,
                                   bboxes: Optional[tf.Tensor] = None):
  """Select a random policy from `policies` and apply it to `image`."""
  policy_to_select = tf.random.uniform([], maxval=len(policies),
                                       dtype=tf.int32)
  # Note that using tf.case instead of tf.conds would result in significantly
  # larger graphs and would even break export for some larger policies.
  for (i, policy) in enumerate(policies):
    image, bboxes = tf.cond(
        tf.equal(i, policy_to_select),
        lambda selected_policy=policy: selected_policy(image, bboxes),
        lambda: (image, bboxes))
  return image, bboxes


NAME_TO_FUNC = {
    'AutoContrast': autocontrast,
    'Equalize': equalize,
    'Invert': invert,
    'Rotate': wrapped_rotate,
    'Posterize': posterize,
    'Solarize': solarize,
    'SolarizeAdd': solarize_add,
    'Color': color,
    'Contrast': contrast,
    'Brightness': brightness,
    'Sharpness': sharpness,
    'ShearX': shear_x,
    'ShearY': shear_y,
    'TranslateX': translate_x,
    'TranslateY': translate_y,
    'Cutout': cutout,
    'Rotate_BBox': rotate_with_bboxes,
    # pylint:disable=g-long-lambda
    'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
        image, bboxes, level, replace, shear_horizontal=True),
    'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
        image, bboxes, level, replace, shear_horizontal=False),
    'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
        image, bboxes, pixels, replace, shift_horizontal=True),
    'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
        image, bboxes, pixels, replace, shift_horizontal=False),
    # pylint:enable=g-long-lambda
    'TranslateY_Only_BBoxes': translate_y_only_bboxes,
}


# Functions that require a `bboxes` parameter.
REQUIRE_BOXES_FUNCS = frozenset({
    'Rotate_BBox',
    'ShearX_BBox',
    'ShearY_BBox',
    'TranslateX_BBox',
    'TranslateY_BBox',
    'TranslateY_Only_BBoxes',
})

# Functions that have a 'prob' parameter.
PROB_FUNCS = frozenset({
    'TranslateY_Only_BBoxes',
})

# Functions that have a 'replace' parameter.
REPLACE_FUNCS = frozenset({
    'Rotate',
    'TranslateX',
    'ShearX',
    'ShearY',
    'TranslateY',
    'Cutout',
    'Rotate_BBox',
    'ShearX_BBox',
    'ShearY_BBox',
    'TranslateX_BBox',
    'TranslateY_BBox',
    'TranslateY_Only_BBoxes',
})


def level_to_arg(cutout_const: float, translate_const: float):
  """Creates a dict mapping image operation names to their arguments."""

  no_arg = lambda level: ()
  posterize_arg = lambda level: _mult_to_arg(level, 4)
  solarize_arg = lambda level: _mult_to_arg(level, 256)
  solarize_add_arg = lambda level: _mult_to_arg(level, 110)
  cutout_arg = lambda level: _mult_to_arg(level, cutout_const)
  translate_arg = lambda level: _translate_level_to_arg(level, translate_const)
  translate_bbox_arg = lambda level: _translate_level_to_arg(level, 120)

  args = {
      'AutoContrast': no_arg,
      'Equalize': no_arg,
      'Invert': no_arg,
      'Rotate': _rotate_level_to_arg,
      'Posterize': posterize_arg,
      'Solarize': solarize_arg,
      'SolarizeAdd': solarize_add_arg,
      'Color': _enhance_level_to_arg,
      'Contrast': _enhance_level_to_arg,
      'Brightness': _enhance_level_to_arg,
      'Sharpness': _enhance_level_to_arg,
      'ShearX': _shear_level_to_arg,
      'ShearY': _shear_level_to_arg,
      'Cutout': cutout_arg,
      'TranslateX': translate_arg,
      'TranslateY': translate_arg,
      'Rotate_BBox': _rotate_level_to_arg,
      'ShearX_BBox': _shear_level_to_arg,
      'ShearY_BBox': _shear_level_to_arg,
      # pylint:disable=g-long-lambda
      'TranslateX_BBox': lambda level: _translate_level_to_arg(
          level, translate_const),
      'TranslateY_BBox': lambda level: _translate_level_to_arg(
          level, translate_const),
      # pylint:enable=g-long-lambda
      'TranslateY_Only_BBoxes': translate_bbox_arg,
  }
  return args


def bbox_wrapper(func):
  """Adds a bboxes function argument to func and returns unchanged bboxes."""

  def wrapper(images, bboxes, *args, **kwargs):
    return (func(images, *args, **kwargs), bboxes)

  return wrapper


def _parse_policy_info(name: Text,
                       prob: float,
                       level: float,
                       replace_value: List[int],
                       cutout_const: float,
                       translate_const: float,
                       level_std: float = 0.) -> Tuple[Any, float, Any]:
  """Return the function that corresponds to `name` and update `level` param."""
  func = NAME_TO_FUNC[name]

  if level_std > 0:
    level += tf.random.normal([], dtype=tf.float32)
    level = tf.clip_by_value(level, 0., _MAX_LEVEL)

  args = level_to_arg(cutout_const, translate_const)[name](level)

  if name in PROB_FUNCS:
    # Add in the prob arg if it is required for the function that is called.
    args = tuple([prob] + list(args))

  if name in REPLACE_FUNCS:
    # Add in replace arg if it is required for the function that is called.
    args = tuple(list(args) + [replace_value])

  # Add bboxes as the second positional argument for the function if it does
  # not already exist.
  if 'bboxes' not in inspect.getfullargspec(func)[0]:
    func = bbox_wrapper(func)

  return func, prob, args
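

# Illustrative sketch (an assumed example, not part of the upstream file):
# parsing a single policy entry yields the op's function, its probability,
# and the fully resolved argument tuple (with the replace/prob arguments
# added where the op requires them).
def _parse_policy_info_example():
  func, prob, args = _parse_policy_info(
      name='Rotate', prob=0.8, level=7.0, replace_value=[128] * 3,
      cutout_const=100., translate_const=250.)
  # `func` is bbox-aware (wrapped if needed); call it as
  # func(image, bboxes, *args).
  return func, prob, args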


class ImageAugment(object):
  """Image augmentation class for applying image distortions."""

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """Given an image tensor, returns a distorted image with the same shape.

    Args:
      image: `Tensor` of shape [height, width, 3] or
        [num_frames, height, width, 3] representing an image or image sequence.

    Returns:
      The augmented version of `image`.
    """
    raise NotImplementedError()

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Distorts the image and bounding boxes.

    Args:
      image: `Tensor` of shape [height, width, 3] or
        [num_frames, height, width, 3] representing an image or image sequence.
      bboxes: `Tensor` of shape [num_boxes, 4] or [num_frames, num_boxes, 4]
        representing bounding boxes for an image or image sequence.

    Returns:
      The augmented version of `image` and `bboxes`.
    """
    raise NotImplementedError


class AutoAugment(ImageAugment):
  """Applies the AutoAugment policy to images.

  AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
  """

  def __init__(self,
               augmentation_name: Text = 'v0',
               policies: Optional[Iterable[Iterable[Tuple[Text, float,
                                                          float]]]] = None,
               cutout_const: float = 100,
               translate_const: float = 250):
    """Applies the AutoAugment policy to images.

    Args:
      augmentation_name: The name of the AutoAugment policy to use. The
        available options are `v0`, `test`, `reduced_cifar10`, `svhn` and
        `reduced_imagenet`. `v0` is the policy used for all
        of the results in the paper and was found to achieve the best results
        on the COCO dataset. `v1`, `v2` and `v3` are additional good policies
        found on the COCO dataset that have slight variation in what
        operations were used during the search procedure along with how many
        operations are applied in parallel to a single image (2 vs 3). Make
        sure to set `policies` to `None` (the default) if you want to set
        options using `augmentation_name`.
      policies: list of lists of tuples in the form `(func, prob, level)`,
        `func` is a string name of the augmentation function, `prob` is the
        probability of applying the `func` operation, `level` (or magnitude)
        is the input argument for `func`. For example:
        ```
        [[('Equalize', 0.9, 3), ('Color', 0.7, 8)],
         [('Invert', 0.6, 5), ('Rotate', 0.2, 9), ('ShearX', 0.1, 2)], ...]
        ```
        The outer-most list must be 3-d. The number of operations in a
        sub-policy can vary from one sub-policy to another.
        If you provide `policies` as input, any option set with
        `augmentation_name` will get overridden as they are mutually
        exclusive.
      cutout_const: multiplier for applying cutout.
      translate_const: multiplier for applying translation.

    Raises:
      ValueError if `augmentation_name` is unsupported.
    """
    super(AutoAugment, self).__init__()

    self.augmentation_name = augmentation_name
    self.cutout_const = float(cutout_const)
    self.translate_const = float(translate_const)
    self.available_policies = {
        'detection_v0': self.detection_policy_v0(),
        'v0': self.policy_v0(),
        'test': self.policy_test(),
        'simple': self.policy_simple(),
        'reduced_cifar10': self.policy_reduced_cifar10(),
        'svhn': self.policy_svhn(),
        'reduced_imagenet': self.policy_reduced_imagenet(),
    }

    if not policies:
      if augmentation_name not in self.available_policies:
        raise ValueError(
            'Invalid augmentation_name: {}'.format(augmentation_name))
      self.policies = self.available_policies[augmentation_name]
    else:
      self._check_policy_shape(policies)
      self.policies = policies

  def _check_policy_shape(self, policies):
    """Checks dimension and shape of the custom policy.

    Args:
      policies: List of list of tuples in the form `(func, prob, level)`. Must
        have shape of `(:, :, 3)`.

    Raises:
      ValueError if the shape of `policies` is unexpected.
    """
    in_shape = np.array(policies).shape
    if len(in_shape) != 3 or in_shape[-1:] != (3,):
      raise ValueError('Wrong shape detected for custom policy. Expected '
                       '(:, :, 3) but got {}.'.format(in_shape))

  def _make_tf_policies(self):
    """Prepares the TF functions for augmentations based on the policies."""
    replace_value = [128] * 3

    # func is the string name of the augmentation function, prob is the
    # probability of applying the operation and level is the parameter
    # associated with the tf op.

    # tf_policies are functions that take in an image and return an augmented
    # image.
    tf_policies = []
    for policy in self.policies:
      tf_policy = []
      assert_ranges = []
      # Link string name to the correct python function and make sure the
      # correct argument is passed into that function.
      for policy_info in policy:
        _, prob, level = policy_info
        assert_ranges.append(tf.Assert(tf.less_equal(prob, 1.), [prob]))
        assert_ranges.append(
            tf.Assert(tf.less_equal(level, int(_MAX_LEVEL)), [level]))

        policy_info = list(policy_info) + [
            replace_value, self.cutout_const, self.translate_const
        ]
        tf_policy.append(_parse_policy_info(*policy_info))

      # Now build the tf policy that will apply the augmentation procedure
      # on image.
      def make_final_policy(tf_policy_):

        def final_policy(image_, bboxes_):
          for func, prob, args in tf_policy_:
            image_, bboxes_ = _apply_func_with_prob(func, image_, bboxes_,
                                                    args, prob)
          return image_, bboxes_

        return final_policy

      with tf.control_dependencies(assert_ranges):
        tf_policies.append(make_final_policy(tf_policy))
    return tf_policies

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """See base class."""
    input_image_type = image.dtype
    if input_image_type != tf.uint8:
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    tf_policies = self._make_tf_policies()
    image, _ = select_and_apply_random_policy(tf_policies, image, bboxes=None)
    return image

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """See base class."""
    input_image_type = image.dtype
    if input_image_type != tf.uint8:
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    tf_policies = self._make_tf_policies()
    image, bboxes = select_and_apply_random_policy(tf_policies, image, bboxes)
    return image, bboxes

  @staticmethod
  def detection_policy_v0():
    """Autoaugment policy that was used in AutoAugment Paper for Detection.

    https://arxiv.org/pdf/1906.11172

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
        [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
        [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
        [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
        [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
    ]
    return policy

  @staticmethod
  def policy_v0():
    """Autoaugment policy that was used in AutoAugment Paper.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
        [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
        [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
        [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
        [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
        [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
        [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
    ]
    return policy

  @staticmethod
  def policy_reduced_cifar10():
    """Autoaugment policy for reduced CIFAR-10 dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Invert', 0.1, 7), ('Contrast', 0.2, 6)],
        [('Rotate', 0.7, 2), ('TranslateX', 0.3, 9)],
        [('Sharpness', 0.8, 1), ('Sharpness', 0.9, 3)],
        [('ShearY', 0.5, 8), ('TranslateY', 0.7, 9)],
        [('AutoContrast', 0.5, 8), ('Equalize', 0.9, 2)],
        [('ShearY', 0.2, 7), ('Posterize', 0.3, 7)],
        [('Color', 0.4, 3), ('Brightness', 0.6, 7)],
        [('Sharpness', 0.3, 9), ('Brightness', 0.7, 9)],
        [('Equalize', 0.6, 5), ('Equalize', 0.5, 1)],
        [('Contrast', 0.6, 7), ('Sharpness', 0.6, 5)],
        [('Color', 0.7, 7), ('TranslateX', 0.5, 8)],
        [('Equalize', 0.3, 7), ('AutoContrast', 0.4, 8)],
        [('TranslateY', 0.4, 3), ('Sharpness', 0.2, 6)],
        [('Brightness', 0.9, 6), ('Color', 0.2, 8)],
        [('Solarize', 0.5, 2), ('Invert', 0.0, 3)],
        [('Equalize', 0.2, 0), ('AutoContrast', 0.6, 0)],
        [('Equalize', 0.2, 8), ('Equalize', 0.6, 4)],
        [('Color', 0.9, 9), ('Equalize', 0.6, 6)],
        [('AutoContrast', 0.8, 4), ('Solarize', 0.2, 8)],
        [('Brightness', 0.1, 3), ('Color', 0.7, 0)],
        [('Solarize', 0.4, 5), ('AutoContrast', 0.9, 3)],
        [('TranslateY', 0.9, 9), ('TranslateY', 0.7, 9)],
        [('AutoContrast', 0.9, 2), ('Solarize', 0.8, 3)],
        [('Equalize', 0.8, 8), ('Invert', 0.1, 3)],
        [('TranslateY', 0.7, 9), ('AutoContrast', 0.9, 1)],
    ]
    return policy

  @staticmethod
  def policy_svhn():
    """Autoaugment policy for SVHN dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('ShearX', 0.9, 4), ('Invert', 0.2, 3)],
        [('ShearY', 0.9, 8), ('Invert', 0.7, 5)],
        [('Equalize', 0.6, 5), ('Solarize', 0.6, 6)],
        [('Invert', 0.9, 3), ('Equalize', 0.6, 3)],
        [('Equalize', 0.6, 1), ('Rotate', 0.9, 3)],
        [('ShearX', 0.9, 4), ('AutoContrast', 0.8, 3)],
        [('ShearY', 0.9, 8), ('Invert', 0.4, 5)],
        [('ShearY', 0.9, 5), ('Solarize', 0.2, 6)],
        [('Invert', 0.9, 6), ('AutoContrast', 0.8, 1)],
        [('Equalize', 0.6, 3), ('Rotate', 0.9, 3)],
        [('ShearX', 0.9, 4), ('Solarize', 0.3, 3)],
        [('ShearY', 0.8, 8), ('Invert', 0.7, 4)],
        [('Equalize', 0.9, 5), ('TranslateY', 0.6, 6)],
        [('Invert', 0.9, 4), ('Equalize', 0.6, 7)],
        [('Contrast', 0.3, 3), ('Rotate', 0.8, 4)],
        [('Invert', 0.8, 5), ('TranslateY', 0.0, 2)],
        [('ShearY', 0.7, 6), ('Solarize', 0.4, 8)],
        [('Invert', 0.6, 4), ('Rotate', 0.8, 4)],
        [('ShearY', 0.3, 7), ('TranslateX', 0.9, 3)],
        [('ShearX', 0.1, 6), ('Invert', 0.6, 5)],
        [('Solarize', 0.7, 2), ('TranslateY', 0.6, 7)],
        [('ShearY', 0.8, 4), ('Invert', 0.8, 8)],
        [('ShearX', 0.7, 9), ('TranslateY', 0.8, 3)],
        [('ShearY', 0.8, 5), ('AutoContrast', 0.7, 3)],
        [('ShearX', 0.7, 2), ('Invert', 0.1, 5)],
    ]
    return policy

  @staticmethod
  def policy_reduced_imagenet():
    """Autoaugment policy for reduced ImageNet dataset.

    Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501.

    Each tuple is an augmentation operation of the form
    (operation, probability, magnitude). Each element in policy is a
    sub-policy that will be applied sequentially on the image.

    Returns:
      the policy.
    """
    policy = [
        [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
        [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
        [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
        [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)],
        [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
        [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)],
        [('Rotate', 0.8, 8), ('Color', 0.4, 0)],
        [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
        [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Rotate', 0.8, 8), ('Color', 1.0, 2)],
        [('Color', 0.8, 8), ('Solarize', 0.8, 7)],
        [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)],
        [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)],
        [('Color', 0.4, 0), ('Equalize', 0.6, 3)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
    ]
    return policy

  @staticmethod
  def policy_simple():
    """Same as `policy_v0`, except with custom ops removed."""
    policy = [
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
    ]
    return policy

  @staticmethod
  def policy_test():
    """Autoaugment test policy for debugging."""
    policy = [
        [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
    ]
    return policy
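

# Illustrative usage sketch (an assumed example, not part of the upstream
# file): build an AutoAugment with a custom two-op sub-policy and distort a
# dummy image. Custom policies must have shape (:, :, 3).
def _autoaugment_example():
  augmenter = AutoAugment(
      policies=[[('Equalize', 0.9, 3), ('Color', 0.7, 8)]])
  image = tf.zeros([224, 224, 3], dtype=tf.uint8)
  return augmenter.distort(image)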


def _maybe_identity(x: Optional[tf.Tensor]) -> Optional[tf.Tensor]:
  return tf.identity(x) if x is not None else None


class RandAugment(ImageAugment):
  """Applies the RandAugment policy to images.

  RandAugment is from the paper https://arxiv.org/abs/1909.13719.
  """

  def __init__(self,
               num_layers: int = 2,
               magnitude: float = 10.,
               cutout_const: float = 40.,
               translate_const: float = 100.,
               magnitude_std: float = 0.0,
               prob_to_apply: Optional[float] = None,
               exclude_ops: Optional[List[str]] = None):
    """Applies the RandAugment policy to images.

    Args:
      num_layers: Integer, the number of augmentation transformations to apply
        sequentially to an image. Represented as (N) in the paper. Usually best
        values will be in the range [1, 3].
      magnitude: Integer, shared magnitude across all augmentation operations.
        Represented as (M) in the paper. Usually best values are in the range
        [5, 10].
      cutout_const: multiplier for applying cutout.
      translate_const: multiplier for applying translation.
      magnitude_std: randomness of the severity as proposed by the authors of
        the timm library.
      prob_to_apply: The probability to apply the selected augmentation at each
        layer.
      exclude_ops: exclude selected operations.
    """
    super(RandAugment, self).__init__()

    self.num_layers = num_layers
    self.magnitude = float(magnitude)
    self.cutout_const = float(cutout_const)
    self.translate_const = float(translate_const)
    self.prob_to_apply = (
        float(prob_to_apply) if prob_to_apply is not None else None)
    self.available_ops = [
        'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize',
        'Solarize', 'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX',
        'ShearY', 'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd'
    ]
    self.magnitude_std = magnitude_std
    if exclude_ops:
      self.available_ops = [
          op for op in self.available_ops if op not in exclude_ops
      ]

  @classmethod
  def build_for_detection(cls,
                          num_layers: int = 2,
                          magnitude: float = 10.,
                          cutout_const: float = 40.,
                          translate_const: float = 100.,
                          magnitude_std: float = 0.0,
                          prob_to_apply: Optional[float] = None,
                          exclude_ops: Optional[List[str]] = None):
    """Builds a RandAugment that modifies bboxes for geometric transforms."""
    augmenter = cls(
        num_layers=num_layers,
        magnitude=magnitude,
        cutout_const=cutout_const,
        translate_const=translate_const,
        magnitude_std=magnitude_std,
        prob_to_apply=prob_to_apply,
        exclude_ops=exclude_ops)
    box_aware_ops_by_base_name = {
        'Rotate': 'Rotate_BBox',
        'ShearX': 'ShearX_BBox',
        'ShearY': 'ShearY_BBox',
        'TranslateX': 'TranslateX_BBox',
        'TranslateY': 'TranslateY_BBox',
    }
    augmenter.available_ops = [
        box_aware_ops_by_base_name.get(op_name) or op_name
        for op_name in augmenter.available_ops
    ]
    return augmenter

  def _distort_common(
      self,
      image: tf.Tensor,
      bboxes: Optional[tf.Tensor] = None
  ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
    """Distorts the image and optionally bounding boxes."""
    input_image_type = image.dtype

    if input_image_type != tf.uint8:
      image = tf.clip_by_value(image, 0.0, 255.0)
      image = tf.cast(image, dtype=tf.uint8)

    replace_value = [128] * 3
    min_prob, max_prob = 0.2, 0.8

    aug_image = image
    aug_bboxes = bboxes

    for _ in range(self.num_layers):
      op_to_select = tf.random.uniform([],
                                       maxval=len(self.available_ops) + 1,
                                       dtype=tf.int32)

      branch_fns = []
      for (i, op_name) in enumerate(self.available_ops):
        prob = tf.random.uniform([],
                                 minval=min_prob,
                                 maxval=max_prob,
                                 dtype=tf.float32)
        func, _, args = _parse_policy_info(op_name, prob, self.magnitude,
                                           replace_value, self.cutout_const,
                                           self.translate_const,
                                           self.magnitude_std)
        branch_fns.append((
            i,
            # pylint:disable=g-long-lambda
            lambda selected_func=func, selected_args=args: selected_func(
                image, bboxes, *selected_args)))
        # pylint:enable=g-long-lambda

      aug_image, aug_bboxes = tf.switch_case(
          branch_index=op_to_select,
          branch_fns=branch_fns,
          default=lambda: (tf.identity(image), _maybe_identity(bboxes)))

      if self.prob_to_apply is not None:
        aug_image, aug_bboxes = tf.cond(
            tf.random.uniform(shape=[], dtype=tf.float32) <
            self.prob_to_apply,
            lambda: (tf.identity(aug_image), _maybe_identity(aug_bboxes)),
            lambda: (tf.identity(image), _maybe_identity(bboxes)))
      image = aug_image
      bboxes = aug_bboxes

    image = tf.cast(image, dtype=input_image_type)
    return image, bboxes

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """See base class."""
    image, _ = self._distort_common(image)
    return image

  def distort_with_boxes(self, image: tf.Tensor,
                         bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """See base class."""
    image, bboxes = self._distort_common(image, bboxes)
    return image, bboxes
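

# Illustrative usage sketch (an assumed example, not part of the upstream
# file): the detection variant swaps geometric ops for their *_BBox
# counterparts so that boxes track the image transforms.
def _randaugment_detection_example():
  augmenter = RandAugment.build_for_detection(num_layers=2, magnitude=10.)
  image = tf.zeros([64, 64, 3], dtype=tf.uint8)
  bboxes = tf.constant([[0.1, 0.1, 0.9, 0.9]], tf.float32)
  return augmenter.distort_with_boxes(image, bboxes)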


class RandomErasing(ImageAugment):
  """Applies RandomErasing to a single image.

  Reference: https://arxiv.org/abs/1708.04896

  Implementation is inspired by https://github.com/rwightman/pytorch-image-models
  """

  def __init__(self,
               probability: float = 0.25,
               min_area: float = 0.02,
               max_area: float = 1 / 3,
               min_aspect: float = 0.3,
               max_aspect=None,
               min_count=1,
               max_count=1,
               trials=10):
    """Applies RandomErasing to a single image.

    Args:
      probability (float, optional): Probability of augmenting the image.
        Defaults to 0.25.
      min_area (float, optional): Minimum area of the random erasing rectangle.
        Defaults to 0.02.
      max_area (float, optional): Maximum area of the random erasing rectangle.
        Defaults to 1/3.
      min_aspect (float, optional): Minimum aspect rate of the random erasing
        rectangle. Defaults to 0.3.
      max_aspect ([type], optional): Maximum aspect rate of the random erasing
        rectangle. Defaults to None.
      min_count (int, optional): Minimum number of erased rectangles. Defaults
        to 1.
      max_count (int, optional): Maximum number of erased rectangles. Defaults
        to 1.
      trials (int, optional): Maximum number of trials to randomly sample a
        rectangle that fulfills constraint. Defaults to 10.
    """
    self._probability = probability
    self._min_area = float(min_area)
    self._max_area = float(max_area)
    self._min_log_aspect = math.log(min_aspect)
    self._max_log_aspect = math.log(max_aspect or 1 / min_aspect)
    self._min_count = min_count
    self._max_count = max_count
    self._trials = trials

  def distort(self, image: tf.Tensor) -> tf.Tensor:
    """Applies RandomErasing to single `image`.

    Args:
      image (tf.Tensor): Of shape [height, width, 3] representing an image.

    Returns:
      tf.Tensor: The augmented version of `image`.
    """
    uniform_random = tf.random.uniform(shape=[], minval=0., maxval=1.0)
    mirror_cond = tf.less(uniform_random, self._probability)
    image = tf.cond(mirror_cond, lambda: self._erase(image), lambda: image)
    return image
@
tf
.
function
def
_erase
(
self
,
image
:
tf
.
Tensor
)
->
tf
.
Tensor
:
"""Erase an area."""
if
self
.
_min_count
==
self
.
_max_count
:
count
=
self
.
_min_count
else
:
count
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
int
(
self
.
_min_count
),
maxval
=
int
(
self
.
_max_count
-
self
.
_min_count
+
1
),
dtype
=
tf
.
int32
)
image_height
=
tf
.
shape
(
image
)[
0
]
image_width
=
tf
.
shape
(
image
)[
1
]
area
=
tf
.
cast
(
image_width
*
image_height
,
tf
.
float32
)
for
_
in
range
(
count
):
# Work around since break is not supported in tf.function
is_trial_successfull
=
False
for
_
in
range
(
self
.
_trials
):
if
not
is_trial_successfull
:
erase_area
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
area
*
self
.
_min_area
,
maxval
=
area
*
self
.
_max_area
)
aspect_ratio
=
tf
.
math
.
exp
(
tf
.
random
.
uniform
(
shape
=
[],
minval
=
self
.
_min_log_aspect
,
maxval
=
self
.
_max_log_aspect
))
half_height
=
tf
.
cast
(
tf
.
math
.
round
(
tf
.
math
.
sqrt
(
erase_area
*
aspect_ratio
)
/
2
),
dtype
=
tf
.
int32
)
half_width
=
tf
.
cast
(
tf
.
math
.
round
(
tf
.
math
.
sqrt
(
erase_area
/
aspect_ratio
)
/
2
),
dtype
=
tf
.
int32
)
if
2
*
half_height
<
image_height
and
2
*
half_width
<
image_width
:
center_height
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
0
,
maxval
=
int
(
image_height
-
2
*
half_height
),
dtype
=
tf
.
int32
)
center_width
=
tf
.
random
.
uniform
(
shape
=
[],
minval
=
0
,
maxval
=
int
(
image_width
-
2
*
half_width
),
dtype
=
tf
.
int32
)
image
=
_fill_rectangle
(
image
,
center_width
,
center_height
,
half_width
,
half_height
,
replace
=
None
)
is_trial_successfull
=
True
return
image
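
# A minimal usage sketch for RandomErasing (illustrative values only):
#
#   eraser = RandomErasing(probability=1.0, max_count=10)
#   image = tf.zeros((224, 224, 3), dtype=tf.float32)
#   erased = eraser.distort(image)  # Same shape; up to 10 rectangles filled
#                                   # with random noise via _fill_rectangle.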

class MixupAndCutmix:
  """Applies Mixup and/or Cutmix to a batch of images.

  - Mixup: https://arxiv.org/abs/1710.09412
  - Cutmix: https://arxiv.org/abs/1905.04899

  Implementation is inspired by
  https://github.com/rwightman/pytorch-image-models.
  """

  def __init__(self,
               mixup_alpha: float = .8,
               cutmix_alpha: float = 1.,
               prob: float = 1.0,
               switch_prob: float = 0.5,
               label_smoothing: float = 0.1,
               num_classes: int = 1001):
    """Applies Mixup and/or Cutmix to a batch of images.

    Args:
      mixup_alpha (float, optional): For drawing a random lambda (`lam`) from a
        beta distribution (for each image). If zero Mixup is deactivated.
        Defaults to .8.
      cutmix_alpha (float, optional): For drawing a random lambda (`lam`) from
        a beta distribution (for each image). If zero Cutmix is deactivated.
        Defaults to 1..
      prob (float, optional): Probability of augmenting the batch. Defaults to
        1.0.
      switch_prob (float, optional): Probability of applying Cutmix for the
        batch. Defaults to 0.5.
      label_smoothing (float, optional): Constant for label smoothing. Defaults
        to 0.1.
      num_classes (int, optional): Number of classes. Defaults to 1001.
    """
    self.mixup_alpha = mixup_alpha
    self.cutmix_alpha = cutmix_alpha
    self.mix_prob = prob
    self.switch_prob = switch_prob
    self.label_smoothing = label_smoothing
    self.num_classes = num_classes
    self.mode = 'batch'
    self.mixup_enabled = True

    if self.mixup_alpha and not self.cutmix_alpha:
      self.switch_prob = -1
    elif not self.mixup_alpha and self.cutmix_alpha:
      self.switch_prob = 1

  def __call__(self, images: tf.Tensor,
               labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    return self.distort(images, labels)

  def distort(self, images: tf.Tensor,
              labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Applies Mixup and/or Cutmix to a batch of images and transforms labels.

    Args:
      images (tf.Tensor): Of shape [batch_size, height, width, 3] representing
        a batch of images.
      labels (tf.Tensor): Of shape [batch_size, ] representing the class id for
        each image of the batch.

    Returns:
      Tuple[tf.Tensor, tf.Tensor]: The augmented version of `images` and
        `labels`.
    """
    augment_cond = tf.less(
        tf.random.uniform(shape=[], minval=0., maxval=1.0), self.mix_prob)
    # pylint: disable=g-long-lambda
    augment_a = lambda: self._update_labels(*tf.cond(
        tf.less(
            tf.random.uniform(shape=[], minval=0., maxval=1.0),
            self.switch_prob),
        lambda: self._cutmix(images, labels),
        lambda: self._mixup(images, labels)))
    augment_b = lambda: (images, self._smooth_labels(labels))
    # pylint: enable=g-long-lambda

    return tf.cond(augment_cond, augment_a, augment_b)

  @staticmethod
  def _sample_from_beta(alpha, beta, shape):
    # Draws Beta(alpha, beta) samples via the gamma-ratio identity:
    # X ~ Gamma(alpha, 1), Y ~ Gamma(beta, 1) => X / (X + Y) ~ Beta(alpha, beta).
    sample_alpha = tf.random.gamma(shape, alpha)
    sample_beta = tf.random.gamma(shape, beta)
    return sample_alpha / (sample_alpha + sample_beta)
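
  # The gamma-ratio identity above is handy because TensorFlow ships
  # tf.random.gamma but no direct beta sampler. A quick numeric sanity sketch
  # (illustrative only, assuming eager mode; 0.8 mirrors the default
  # `mixup_alpha`):
  #
  #   x = tf.random.gamma([100000], 0.8)
  #   y = tf.random.gamma([100000], 0.8)
  #   lam = x / (x + y)
  #   tf.reduce_mean(lam)  # ~0.5, the Beta(a, b) mean a / (a + b) at a == b.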
  def _cutmix(self, images: tf.Tensor,
              labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """Applies cutmix."""
    lam = MixupAndCutmix._sample_from_beta(self.cutmix_alpha,
                                           self.cutmix_alpha, labels.shape)

    ratio = tf.math.sqrt(1 - lam)

    batch_size = tf.shape(images)[0]
    image_height, image_width = tf.shape(images)[1], tf.shape(images)[2]

    cut_height = tf.cast(
        ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32)
    # Cut width scales with image width so non-square inputs cut proportionally.
    cut_width = tf.cast(
        ratio * tf.cast(image_width, dtype=tf.float32), dtype=tf.int32)

    random_center_height = tf.random.uniform(
        shape=[batch_size], minval=0, maxval=image_height, dtype=tf.int32)
    random_center_width = tf.random.uniform(
        shape=[batch_size], minval=0, maxval=image_width, dtype=tf.int32)

    bbox_area = cut_height * cut_width
    lam = 1. - bbox_area / (image_height * image_width)
    lam = tf.cast(lam, dtype=tf.float32)

    images = tf.map_fn(
        lambda x: _fill_rectangle(*x),
        (images, random_center_width, random_center_height, cut_width // 2,
         cut_height // 2, tf.reverse(images, [0])),
        dtype=(tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.float32),
        fn_output_signature=tf.TensorSpec(images.shape[1:], dtype=tf.float32))

    return images, labels, lam

  def _mixup(self, images: tf.Tensor,
             labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    lam = MixupAndCutmix._sample_from_beta(self.mixup_alpha, self.mixup_alpha,
                                           labels.shape)
    lam = tf.reshape(lam, [-1, 1, 1, 1])
    images = lam * images + (1. - lam) * tf.reverse(images, [0])

    return images, labels, tf.squeeze(lam)

  def _smooth_labels(self, labels: tf.Tensor) -> tf.Tensor:
    off_value = self.label_smoothing / self.num_classes
    on_value = 1. - self.label_smoothing + off_value

    smooth_labels = tf.one_hot(
        labels, self.num_classes, on_value=on_value, off_value=off_value)
    return smooth_labels

  def _update_labels(self, images: tf.Tensor, labels: tf.Tensor,
                     lam: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    labels_1 = self._smooth_labels(labels)
    labels_2 = tf.reverse(labels_1, [0])

    lam = tf.reshape(lam, [-1, 1])
    labels = lam * labels_1 + (1. - lam) * labels_2

    return images, labels
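
A minimal usage sketch for MixupAndCutmix (batch-level; sparse integer labels in, smoothed dense targets out; shapes and values here are illustrative):

    images = tf.random.normal((8, 224, 224, 3))
    labels = tf.range(8)  # Sparse class ids.
    mixer = MixupAndCutmix(mixup_alpha=.8, cutmix_alpha=1., num_classes=1001)
    aug_images, aug_labels = mixer(images, labels)
    # aug_images: (8, 224, 224, 3); aug_labels: (8, 1001) mixed soft targets.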
official/vision/ops/augment_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for autoaugment."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

from absl.testing import parameterized
import tensorflow as tf

from official.vision.ops import augment


def get_dtype_test_cases():
  return [
      ('uint8', tf.uint8),
      ('int32', tf.int32),
      ('float16', tf.float16),
      ('float32', tf.float32),
  ]


@parameterized.named_parameters(get_dtype_test_cases())
class TransformsTest(parameterized.TestCase, tf.test.TestCase):
  """Basic tests for fundamental transformations."""

  def test_to_from_4d(self, dtype):
    for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]:
      original_ndims = len(shape)
      image = tf.zeros(shape, dtype=dtype)
      image_4d = augment.to_4d(image)
      self.assertEqual(4, tf.rank(image_4d))
      self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims))

  def test_transform(self, dtype):
    image = tf.constant([[1, 2], [3, 4]], dtype=dtype)
    self.assertAllEqual(
        augment.transform(image, transforms=[1] * 8), [[4, 4], [4, 4]])

  def test_translate(self, dtype):
    image = tf.constant(
        [[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], dtype=dtype)
    translations = [-1, -1]
    translated = augment.translate(image=image, translations=translations)
    expected = [[1, 0, 1, 1], [0, 1, 0, 0], [1, 0, 1, 1], [1, 0, 1, 1]]
    self.assertAllEqual(translated, expected)

  def test_translate_shapes(self, dtype):
    translation = [0, 0]
    for shape in [(3, 3), (5, 5), (224, 224, 3)]:
      image = tf.zeros(shape, dtype=dtype)
      self.assertAllEqual(image, augment.translate(image, translation))

  def test_translate_invalid_translation(self, dtype):
    image = tf.zeros((1, 1), dtype=dtype)
    invalid_translation = [[[1, 1]]]
    with self.assertRaisesRegex(TypeError, 'rank 1 or 2'):
      _ = augment.translate(image, invalid_translation)

  def test_rotate(self, dtype):
    image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3))
    rotation = 90.
    transformed = augment.rotate(image=image, degrees=rotation)
    expected = [[2, 5, 8], [1, 4, 7], [0, 3, 6]]
    self.assertAllEqual(transformed, expected)

  def test_rotate_shapes(self, dtype):
    degrees = 0.
    for shape in [(3, 3), (5, 5), (224, 224, 3)]:
      image = tf.zeros(shape, dtype=dtype)
      self.assertAllEqual(image, augment.rotate(image, degrees))


class AutoaugmentTest(tf.test.TestCase, parameterized.TestCase):

  AVAILABLE_POLICIES = [
      'v0',
      'test',
      'simple',
      'reduced_cifar10',
      'svhn',
      'reduced_imagenet',
      'detection_v0',
  ]

  def test_autoaugment(self):
    """Smoke test to be sure there are no syntax errors."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image = augmenter.distort(image)
      self.assertEqual((224, 224, 3), aug_image.shape)

  def test_autoaugment_with_bboxes(self):
    """Smoke test to be sure there are no syntax errors with bboxes."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
      self.assertEqual((224, 224, 3), aug_image.shape)
      self.assertEqual((2, 4), aug_bboxes.shape)

  def test_randaug(self):
    """Smoke test to be sure there are no syntax errors."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    augmenter = augment.RandAugment()
    aug_image = augmenter.distort(image)
    self.assertEqual((224, 224, 3), aug_image.shape)

  def test_randaug_with_bboxes(self):
    """Smoke test to be sure there are no syntax errors with bboxes."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    augmenter = augment.RandAugment()
    aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
    self.assertEqual((224, 224, 3), aug_image.shape)
    self.assertEqual((2, 4), aug_bboxes.shape)

  def test_randaug_build_for_detection(self):
    """Smoke test to be sure there are no syntax errors built for detection."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    augmenter = augment.RandAugment.build_for_detection()
    self.assertCountEqual(augmenter.available_ops, [
        'AutoContrast', 'Equalize', 'Invert', 'Posterize', 'Solarize', 'Color',
        'Contrast', 'Brightness', 'Sharpness', 'Cutout', 'SolarizeAdd',
        'Rotate_BBox', 'ShearX_BBox', 'ShearY_BBox', 'TranslateX_BBox',
        'TranslateY_BBox'
    ])
    aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
    self.assertEqual((224, 224, 3), aug_image.shape)
    self.assertEqual((2, 4), aug_bboxes.shape)

  def test_all_policy_ops(self):
    """Smoke test to be sure all augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((224, 224, 3), dtype=tf.uint8)
    bboxes = None
    for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      image, bboxes = func(image, bboxes, *args)

    self.assertEqual((224, 224, 3), image.shape)
    self.assertIsNone(bboxes)

  def test_all_policy_ops_with_bboxes(self):
    """Smoke test to be sure all augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 4), dtype=tf.float32)
    for op_name in augment.NAME_TO_FUNC:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      image, bboxes = func(image, bboxes, *args)

    self.assertEqual((224, 224, 3), image.shape)
    self.assertEqual((2, 4), bboxes.shape)

  def test_autoaugment_video(self):
    """Smoke test with video to be sure there are no syntax errors."""
    image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image = augmenter.distort(image)
      self.assertEqual((2, 224, 224, 3), aug_image.shape)

  def test_autoaugment_video_with_boxes(self):
    """Smoke test with video to be sure there are no syntax errors."""
    image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 2, 4), dtype=tf.float32)
    for policy in self.AVAILABLE_POLICIES:
      augmenter = augment.AutoAugment(augmentation_name=policy)
      aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes)
      self.assertEqual((2, 224, 224, 3), aug_image.shape)
      self.assertEqual((2, 2, 4), aug_bboxes.shape)

  def test_randaug_video(self):
    """Smoke test with video to be sure there are no syntax errors."""
    image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8)
    augmenter = augment.RandAugment()
    aug_image = augmenter.distort(image)
    self.assertEqual((2, 224, 224, 3), aug_image.shape)

  def test_all_policy_ops_video(self):
    """Smoke test to be sure all video augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((2, 224, 224, 3), dtype=tf.uint8)
    bboxes = None
    for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      image, bboxes = func(image, bboxes, *args)

    self.assertEqual((2, 224, 224, 3), image.shape)
    self.assertIsNone(bboxes)

  def test_all_policy_ops_video_with_bboxes(self):
    """Smoke test to be sure all video augmentation functions can execute."""
    prob = 1
    magnitude = 10
    replace_value = [128] * 3
    cutout_const = 100
    translate_const = 250

    image = tf.ones((2, 224, 224, 3), dtype=tf.uint8)
    bboxes = tf.ones((2, 2, 4), dtype=tf.float32)
    for op_name in augment.NAME_TO_FUNC:
      func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
                                                 replace_value, cutout_const,
                                                 translate_const)
      if op_name in {
          'Rotate_BBox',
          'ShearX_BBox',
          'ShearY_BBox',
          'TranslateX_BBox',
          'TranslateY_BBox',
          'TranslateY_Only_BBoxes',
      }:
        with self.assertRaises(ValueError):
          func(image, bboxes, *args)
      else:
        image, bboxes = func(image, bboxes, *args)

    self.assertEqual((2, 224, 224, 3), image.shape)
    self.assertEqual((2, 2, 4), bboxes.shape)

  def _generate_test_policy(self):
    """Generates a test policy at random."""
    op_list = list(augment.NAME_TO_FUNC.keys())
    size = 6
    prob = [round(random.uniform(0., 1.), 1) for _ in range(size)]
    mag = [round(random.uniform(0, 10)) for _ in range(size)]
    policy = []
    for i in range(0, size, 2):
      policy.append([(op_list[i], prob[i], mag[i]),
                     (op_list[i + 1], prob[i + 1], mag[i + 1])])
    return policy

  def test_custom_policy(self):
    """Tests autoaugment with a custom policy."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    augmenter = augment.AutoAugment(policies=self._generate_test_policy())
    aug_image = augmenter.distort(image)

    self.assertEqual((224, 224, 3), aug_image.shape)

  @parameterized.named_parameters(
      {
          'testcase_name': '_OutOfRangeProb',
          'sub_policy': ('Equalize', 1.1, 3),
          'value': '1.1'
      },
      {
          'testcase_name': '_OutOfRangeMag',
          'sub_policy': ('Equalize', 0.9, 11),
          'value': '11'
      },
  )
  def test_invalid_custom_sub_policy(self, sub_policy, value):
    """Tests autoaugment with out-of-range values in the custom policy."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    policy = self._generate_test_policy()
    policy[0][0] = sub_policy

    augmenter = augment.AutoAugment(policies=policy)
    with self.assertRaisesRegex(
        tf.errors.InvalidArgumentError,
        r'Expected \'tf.Tensor\(False, shape=\(\), dtype=bool\)\' to be true. '
        r'Summarized data: ({})'.format(value)):
      augmenter.distort(image)

  def test_invalid_custom_policy_ndim(self):
    """Tests autoaugment with wrong dimension in the custom policy."""
    policy = [[('Equalize', 0.8, 1), ('Shear', 0.8, 4)],
              [('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]]
    policy = [[policy]]

    with self.assertRaisesRegex(
        ValueError, r'Expected \(:, :, 3\) but got \(1, 1, 2, 2, 3\).'):
      augment.AutoAugment(policies=policy)

  def test_invalid_custom_policy_shape(self):
    """Tests autoaugment with wrong shape in the custom policy."""
    policy = [[('Equalize', 0.8, 1, 1), ('Shear', 0.8, 4, 1)],
              [('TranslateY', 0.6, 3, 1), ('Rotate', 0.9, 3, 1)]]

    with self.assertRaisesRegex(
        ValueError, r'Expected \(:, :, 3\) but got \(2, 2, 4\)'):
      augment.AutoAugment(policies=policy)

  def test_invalid_custom_policy_key(self):
    """Tests autoaugment with an invalid key in the custom policy."""
    image = tf.zeros((224, 224, 3), dtype=tf.uint8)
    policy = [[('AAAAA', 0.8, 1), ('Shear', 0.8, 4)],
              [('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]]
    augmenter = augment.AutoAugment(policies=policy)

    with self.assertRaisesRegex(KeyError, '\'AAAAA\''):
      augmenter.distort(image)


class RandomErasingTest(tf.test.TestCase, parameterized.TestCase):

  def test_random_erase_replaces_some_pixels(self):
    image = tf.zeros((224, 224, 3), dtype=tf.float32)
    augmenter = augment.RandomErasing(probability=1., max_count=10)

    aug_image = augmenter.distort(image)

    self.assertEqual((224, 224, 3), aug_image.shape)
    self.assertNotEqual(0, tf.reduce_max(aug_image))


class MixupAndCutmixTest(tf.test.TestCase, parameterized.TestCase):

  def test_mixup_and_cutmix_smoothes_labels(self):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        num_classes=num_classes, label_smoothing=label_smoothing)

    aug_images, aug_labels = augmenter.distort(images, labels)

    self.assertEqual(images.shape, aug_images.shape)
    self.assertEqual(images.dtype, aug_images.dtype)
    self.assertEqual([batch_size, num_classes], aug_labels.shape)
    self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
                            2. / num_classes)  # With tolerance
    self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
                               1e-4)  # With tolerance

  def test_mixup_changes_image(self):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        mixup_alpha=1., cutmix_alpha=0., num_classes=num_classes)

    aug_images, aug_labels = augmenter.distort(images, labels)

    self.assertEqual(images.shape, aug_images.shape)
    self.assertEqual(images.dtype, aug_images.dtype)
    self.assertEqual([batch_size, num_classes], aug_labels.shape)
    self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
                            2. / num_classes)  # With tolerance
    self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
                               1e-4)  # With tolerance
    self.assertFalse(tf.math.reduce_all(images == aug_images))

  def test_cutmix_changes_image(self):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=tf.float32)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        mixup_alpha=0., cutmix_alpha=1., num_classes=num_classes)

    aug_images, aug_labels = augmenter.distort(images, labels)

    self.assertEqual(images.shape, aug_images.shape)
    self.assertEqual(images.dtype, aug_images.dtype)
    self.assertEqual([batch_size, num_classes], aug_labels.shape)
    self.assertAllLessEqual(aug_labels, 1. - label_smoothing +
                            2. / num_classes)  # With tolerance
    self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
                               1e-4)  # With tolerance
    self.assertFalse(tf.math.reduce_all(images == aug_images))


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/box_matcher.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Box matcher implementation."""

import tensorflow as tf


class BoxMatcher:
  """Matcher based on highest value.

  This class computes matches from a similarity matrix. Each column is matched
  to a single row.

  To support object detection target assignment this class enables setting both
  positive_threshold (upper threshold) and negative_threshold (lower threshold)
  defining three categories of similarity which define whether examples are
  positive, negative, or ignored, for example:

  (1) thresholds=[negative_threshold, positive_threshold], and
      indicators=[negative_value, ignore_value, positive_value]: The similarity
      metrics below negative_threshold will be assigned negative_value, the
      metrics between negative_threshold and positive_threshold will be
      assigned ignore_value, and the metrics above positive_threshold will be
      assigned positive_value.
  (2) thresholds=[negative_threshold, positive_threshold], and
      indicators=[ignore_value, negative_value, positive_value]: The similarity
      metrics below negative_threshold will be assigned ignore_value, the
      metrics between negative_threshold and positive_threshold will be
      assigned negative_value, and the metrics above positive_threshold will be
      assigned positive_value.
  """

  def __init__(self, thresholds, indicators, force_match_for_each_col=False):
    """Constructs a BoxMatcher.

    Args:
      thresholds: A list of thresholds to classify boxes into different
        buckets. The list needs to be sorted, and will be prepended with -Inf
        and appended with +Inf.
      indicators: A list of values to assign for each bucket. len(`indicators`)
        must equal len(`thresholds`) + 1.
      force_match_for_each_col: If True, ensures that each column is matched to
        at least one row (which is not guaranteed otherwise if the
        positive_threshold is high). Defaults to False. If True, all force
        matched rows will be assigned `indicators[-1]`.

    Raises:
      ValueError: If `thresholds` is not sorted,
        or len(indicators) != len(thresholds) + 1.
    """
    if not all([lo <= hi for (lo, hi) in zip(thresholds[:-1], thresholds[1:])]):
      raise ValueError('`threshold` must be sorted, got {}'.format(thresholds))
    self.indicators = indicators
    if len(indicators) != len(thresholds) + 1:
      raise ValueError('len(`indicators`) must be len(`thresholds`) + 1, got '
                       'indicators {}, thresholds {}'.format(
                           indicators, thresholds))
    thresholds = thresholds[:]
    thresholds.insert(0, -float('inf'))
    thresholds.append(float('inf'))
    self.thresholds = thresholds
    self._force_match_for_each_col = force_match_for_each_col

  def __call__(self, similarity_matrix):
    """Tries to match each column of the similarity matrix to a row.

    Args:
      similarity_matrix: A float tensor of shape [N, M] representing any
        similarity metric.

    Returns:
      An integer tensor of shape [N] with corresponding match indices for each
      of M columns. For a positive match, the match result will be the
      corresponding row index; for a negative match, the match will be
      `negative_value`; for an ignored match, the match result will be
      `ignore_value`.
    """
    squeeze_result = False
    if len(similarity_matrix.shape) == 2:
      squeeze_result = True
      similarity_matrix = tf.expand_dims(similarity_matrix, axis=0)

    static_shape = similarity_matrix.shape.as_list()
    num_rows = static_shape[1] or tf.shape(similarity_matrix)[1]
    batch_size = static_shape[0] or tf.shape(similarity_matrix)[0]

    def _match_when_rows_are_empty():
      """Performs matching when the rows of the similarity matrix are empty.

      When the rows are empty, all detections are false positives. So we return
      a tensor of -1's to indicate that the columns do not match to any rows.

      Returns:
        matches: int32 tensor indicating the row each column matches to.
      """
      with tf.name_scope('empty_gt_boxes'):
        matches = tf.zeros([batch_size, num_rows], dtype=tf.int32)
        match_labels = -tf.ones([batch_size, num_rows], dtype=tf.int32)
        return matches, match_labels

    def _match_when_rows_are_non_empty():
      """Performs matching when the rows of the similarity matrix are non-empty.

      Returns:
        matches: int32 tensor indicating the row each column matches to.
      """
      # Matches for each column.
      with tf.name_scope('non_empty_gt_boxes'):
        matches = tf.argmax(similarity_matrix, axis=-1, output_type=tf.int32)

        # Get logical indices of ignored and unmatched columns as tf.int64.
        matched_vals = tf.reduce_max(similarity_matrix, axis=-1)
        matched_indicators = tf.zeros([batch_size, num_rows], tf.int32)

        match_dtype = matched_vals.dtype
        for (ind, low, high) in zip(self.indicators, self.thresholds[:-1],
                                    self.thresholds[1:]):
          low_threshold = tf.cast(low, match_dtype)
          high_threshold = tf.cast(high, match_dtype)
          mask = tf.logical_and(
              tf.greater_equal(matched_vals, low_threshold),
              tf.less(matched_vals, high_threshold))
          matched_indicators = self._set_values_using_indicator(
              matched_indicators, mask, ind)

        if self._force_match_for_each_col:
          # [batch_size, M], for each col (groundtruth_box), find the best
          # matching row (anchor).
          force_match_column_ids = tf.argmax(
              input=similarity_matrix, axis=1, output_type=tf.int32)
          # [batch_size, M, N]
          force_match_column_indicators = tf.one_hot(
              force_match_column_ids, depth=num_rows)
          # [batch_size, N], for each row (anchor), find the largest column
          # index for groundtruth box.
          force_match_row_ids = tf.argmax(
              input=force_match_column_indicators, axis=1,
              output_type=tf.int32)
          # [batch_size, N]
          force_match_column_mask = tf.cast(
              tf.reduce_max(force_match_column_indicators, axis=1), tf.bool)
          # [batch_size, N]
          final_matches = tf.where(force_match_column_mask,
                                   force_match_row_ids, matches)
          final_matched_indicators = tf.where(
              force_match_column_mask,
              self.indicators[-1] *
              tf.ones([batch_size, num_rows], dtype=tf.int32),
              matched_indicators)
          return final_matches, final_matched_indicators
        else:
          return matches, matched_indicators

    num_gt_boxes = similarity_matrix.shape.as_list()[-1] or tf.shape(
        similarity_matrix)[-1]
    result_match, result_matched_indicators = tf.cond(
        pred=tf.greater(num_gt_boxes, 0),
        true_fn=_match_when_rows_are_non_empty,
        false_fn=_match_when_rows_are_empty)

    if squeeze_result:
      result_match = tf.squeeze(result_match, axis=0)
      result_matched_indicators = tf.squeeze(result_matched_indicators, axis=0)
    return result_match, result_matched_indicators

  def _set_values_using_indicator(self, x, indicator, val):
    """Sets the indicated fields of x to val.

    Args:
      x: tensor.
      indicator: boolean with same shape as x.
      val: scalar with value to set.

    Returns:
      modified tensor.
    """
    indicator = tf.cast(indicator, x.dtype)
    return tf.add(tf.multiply(x, 1 - indicator), val * indicator)
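
A usage sketch for BoxMatcher, using the same bucket convention as the tests in the next file (thresholds carve the similarity range into four buckets; indicators label them, with the last bucket as foreground):

    iou = tf.constant([[0.04, 0., 0., 0.],
                       [0., 0., 1., 0.]], dtype=tf.float32)
    matcher = BoxMatcher(thresholds=[0.0, 0.2, 0.5], indicators=[-3, -2, -1, 1])
    match_indices, match_indicators = matcher(iou)
    # match_indices    -> [0, 2]: argmax column per row.
    # match_indicators -> [-2, 1]: background for row 0, foreground for row 1.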
official/vision/ops/box_matcher_test.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for box_matcher.py."""

import tensorflow as tf

from official.vision.ops import box_matcher


class BoxMatcherTest(tf.test.TestCase):

  def test_box_matcher_unbatched(self):
    sim_matrix = tf.constant(
        [[0.04, 0, 0, 0], [0, 0, 1., 0]], dtype=tf.float32)

    fg_threshold = 0.5
    bg_thresh_hi = 0.2
    bg_thresh_lo = 0.0

    matcher = box_matcher.BoxMatcher(
        thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold],
        indicators=[-3, -2, -1, 1])

    match_indices, match_indicators = matcher(sim_matrix)
    positive_matches = tf.greater_equal(match_indicators, 0)
    negative_matches = tf.equal(match_indicators, -2)

    self.assertAllEqual(positive_matches.numpy(), [False, True])
    self.assertAllEqual(negative_matches.numpy(), [True, False])
    self.assertAllEqual(match_indices.numpy(), [0, 2])
    self.assertAllEqual(match_indicators.numpy(), [-2, 1])

  def test_box_matcher_batched(self):
    sim_matrix = tf.constant(
        [[[0.04, 0, 0, 0], [0, 0, 1., 0]]], dtype=tf.float32)

    fg_threshold = 0.5
    bg_thresh_hi = 0.2
    bg_thresh_lo = 0.0

    matcher = box_matcher.BoxMatcher(
        thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold],
        indicators=[-3, -2, -1, 1])

    match_indices, match_indicators = matcher(sim_matrix)
    positive_matches = tf.greater_equal(match_indicators, 0)
    negative_matches = tf.equal(match_indicators, -2)

    self.assertAllEqual(positive_matches.numpy(), [[False, True]])
    self.assertAllEqual(negative_matches.numpy(), [[True, False]])
    self.assertAllEqual(match_indices.numpy(), [[0, 2]])
    self.assertAllEqual(match_indicators.numpy(), [[-2, 1]])


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/box_ops.py
0 → 100644
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Box related ops."""

# Import libraries
import numpy as np
import tensorflow as tf

EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)


def yxyx_to_xywh(boxes):
  """Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.

  Args:
    boxes: a numpy array whose last dimension is 4 representing the coordinates
      of boxes in ymin, xmin, ymax, xmax order.

  Returns:
    boxes: a numpy array whose shape is the same as `boxes` in new format.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  boxes_ymin = boxes[..., 0]
  boxes_xmin = boxes[..., 1]
  boxes_width = boxes[..., 3] - boxes[..., 1]
  boxes_height = boxes[..., 2] - boxes[..., 0]
  new_boxes = np.stack(
      [boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=-1)

  return new_boxes


def yxyx_to_cycxhw(boxes):
  """Converts box corner coordinates to center plus height and width terms.

  Args:
    boxes: a `Tensor` with last dimension of 4, representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.

  Returns:
    boxes: a `Tensor` with the same shape as the input boxes, in the format of
      cy, cx, height, width.

  Raises:
    ValueError: if the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('Last dimension of boxes must be 4 but is {:d}'.format(
        boxes.shape[-1]))

  boxes_ycenter = (boxes[..., 0] + boxes[..., 2]) / 2
  boxes_xcenter = (boxes[..., 1] + boxes[..., 3]) / 2
  boxes_height = boxes[..., 2] - boxes[..., 0]
  boxes_width = boxes[..., 3] - boxes[..., 1]
  new_boxes = tf.stack(
      [boxes_ycenter, boxes_xcenter, boxes_height, boxes_width], axis=-1)
  return new_boxes


def cycxhw_to_yxyx(boxes):
  """Converts box center coordinates plus height and width terms to corner.

  Args:
    boxes: a numpy array whose last dimension is 4 representing the coordinates
      of boxes in cy, cx, height, width order.

  Returns:
    boxes: a numpy array whose shape is the same as `boxes` in new format.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  boxes_ymin = boxes[..., 0] - boxes[..., 2] / 2
  boxes_xmin = boxes[..., 1] - boxes[..., 3] / 2
  boxes_ymax = boxes[..., 0] + boxes[..., 2] / 2
  boxes_xmax = boxes[..., 1] + boxes[..., 3] / 2
  new_boxes = tf.stack([boxes_ymin, boxes_xmin, boxes_ymax, boxes_xmax],
                       axis=-1)
  return new_boxes
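
# The two converters above invert each other; a quick round-trip sketch
# (illustrative only):
#
#   boxes = tf.constant([[10., 20., 50., 80.]])   # ymin, xmin, ymax, xmax
#   center = yxyx_to_cycxhw(boxes)                # [[30., 50., 40., 60.]]
#   corners = cycxhw_to_yxyx(center)              # recovers the input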

def jitter_boxes(boxes, noise_scale=0.025):
  """Jitters the box coordinates by some noise distribution.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    noise_scale: a python float which specifies the magnitude of noise. The
      rule of thumb is to set this between (0, 0.1]. The default value is found
      to mimic the noisy detections best empirically.

  Returns:
    jittered_boxes: a tensor whose shape is the same as `boxes` representing
      the jittered boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('jitter_boxes'):
    bbox_jitters = tf.random.normal(tf.shape(boxes), stddev=noise_scale)
    ymin = boxes[..., 0:1]
    xmin = boxes[..., 1:2]
    ymax = boxes[..., 2:3]
    xmax = boxes[..., 3:4]
    width = xmax - xmin
    height = ymax - ymin
    new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
    new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
    new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
    new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
    jittered_boxes = tf.concat([
        new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
        new_center_y + new_height * 0.5, new_center_x + new_width * 0.5
    ], axis=-1)

    return jittered_boxes


def normalize_boxes(boxes, image_shape):
  """Converts boxes to the normalized coordinates.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].

  Returns:
    normalized_boxes: a tensor whose shape is the same as `boxes` representing
      the normalized boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('normalize_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height = image_shape[..., 0:1]
      width = image_shape[..., 1:2]

    ymin = boxes[..., 0:1] / height
    xmin = boxes[..., 1:2] / width
    ymax = boxes[..., 2:3] / height
    xmax = boxes[..., 3:4] / width

    normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
    return normalized_boxes


def denormalize_boxes(boxes, image_shape):
  """Converts boxes normalized by [height, width] to pixel coordinates.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].

  Returns:
    denormalized_boxes: a tensor whose shape is the same as `boxes`
      representing the denormalized boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  with tf.name_scope('denormalize_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height, width = tf.split(image_shape, 2, axis=-1)

    ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
    ymin = ymin * height
    xmin = xmin * width
    ymax = ymax * height
    xmax = xmax * width

    denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
    return denormalized_boxes


def clip_boxes(boxes, image_shape):
  """Clips boxes to image boundaries.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].

  Returns:
    clipped_boxes: a tensor whose shape is the same as `boxes` representing the
      clipped boxes.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('clip_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
      max_length = [height, width, height, width]
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height, width = tf.unstack(image_shape, axis=-1)
      max_length = tf.stack([height, width, height, width], axis=-1)

    clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
    return clipped_boxes


def compute_outer_boxes(boxes, image_shape, scale=1.0):
  """Computes the outer box that encloses an object with a margin.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    image_shape: a list of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].
    scale: a float number specifying the scale of output outer boxes to input
      `boxes`.

  Returns:
    outer_boxes: a tensor whose shape is the same as `boxes` representing the
      outer boxes.
  """
  if scale < 1.0:
    raise ValueError(
        'scale is {}, but outer box scale must be greater than 1.0.'.format(
            scale))
  centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
  centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
  box_height = (boxes[..., 2] - boxes[..., 0]) * scale
  box_width = (boxes[..., 3] - boxes[..., 1]) * scale
  outer_boxes = tf.stack(
      [centers_y - box_height / 2.0, centers_x - box_width / 2.0,
       centers_y + box_height / 2.0, centers_x + box_width / 2.0],
      axis=1)
  outer_boxes = clip_boxes(outer_boxes, image_shape)
  return outer_boxes


def encode_boxes(boxes, anchors, weights=None):
  """Encodes boxes to targets.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    anchors: a tensor whose shape is the same as, or `broadcastable` to
      `boxes`, representing the coordinates of anchors in ymin, xmin, ymax,
      xmax order.
    weights: None or a list of four float numbers used to scale coordinates.

  Returns:
    encoded_boxes: a tensor whose shape is the same as `boxes` representing the
      encoded box targets.

  Raises:
    ValueError: If the last dimension of boxes is not 4.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('encode_boxes'):
    boxes = tf.cast(boxes, dtype=anchors.dtype)
    ymin = boxes[..., 0:1]
    xmin = boxes[..., 1:2]
    ymax = boxes[..., 2:3]
    xmax = boxes[..., 3:4]
    box_h = ymax - ymin
    box_w = xmax - xmin
    box_yc = ymin + 0.5 * box_h
    box_xc = xmin + 0.5 * box_w

    anchor_ymin = anchors[..., 0:1]
    anchor_xmin = anchors[..., 1:2]
    anchor_ymax = anchors[..., 2:3]
    anchor_xmax = anchors[..., 3:4]
    anchor_h = anchor_ymax - anchor_ymin
    anchor_w = anchor_xmax - anchor_xmin
    anchor_yc = anchor_ymin + 0.5 * anchor_h
    anchor_xc = anchor_xmin + 0.5 * anchor_w

    encoded_dy = (box_yc - anchor_yc) / anchor_h
    encoded_dx = (box_xc - anchor_xc) / anchor_w
    encoded_dh = tf.math.log(box_h / anchor_h)
    encoded_dw = tf.math.log(box_w / anchor_w)
    if weights:
      encoded_dy *= weights[0]
      encoded_dx *= weights[1]
      encoded_dh *= weights[2]
      encoded_dw *= weights[3]

    encoded_boxes = tf.concat(
        [encoded_dy, encoded_dx, encoded_dh, encoded_dw], axis=-1)
    return encoded_boxes


def decode_boxes(encoded_boxes, anchors, weights=None):
  """Decodes boxes.

  Args:
    encoded_boxes: a tensor whose last dimension is 4 representing the
      coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
    anchors: a tensor whose shape is the same as, or `broadcastable` to
      `boxes`, representing the coordinates of anchors in ymin, xmin, ymax,
      xmax order.
    weights: None or a list of four float numbers used to scale coordinates.

  Returns:
    decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
      representing the decoded box targets.
  """
  if encoded_boxes.shape[-1] != 4:
    raise ValueError('encoded_boxes.shape[-1] is {:d}, but must be 4.'.format(
        encoded_boxes.shape[-1]))

  with tf.name_scope('decode_boxes'):
    encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
    dy = encoded_boxes[..., 0:1]
    dx = encoded_boxes[..., 1:2]
    dh = encoded_boxes[..., 2:3]
    dw = encoded_boxes[..., 3:4]
    if weights:
      dy /= weights[0]
      dx /= weights[1]
      dh /= weights[2]
      dw /= weights[3]
    dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
    dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)

    anchor_ymin = anchors[..., 0:1]
    anchor_xmin = anchors[..., 1:2]
    anchor_ymax = anchors[..., 2:3]
    anchor_xmax = anchors[..., 3:4]
    anchor_h = anchor_ymax - anchor_ymin
    anchor_w = anchor_xmax - anchor_xmin
    anchor_yc = anchor_ymin + 0.5 * anchor_h
    anchor_xc = anchor_xmin + 0.5 * anchor_w

    decoded_boxes_yc = dy * anchor_h + anchor_yc
    decoded_boxes_xc = dx * anchor_w + anchor_xc
    decoded_boxes_h = tf.math.exp(dh) * anchor_h
    decoded_boxes_w = tf.math.exp(dw) * anchor_w

    decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
    decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
    decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h
    decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w

    decoded_boxes = tf.concat(
        [decoded_boxes_ymin, decoded_boxes_xmin, decoded_boxes_ymax,
         decoded_boxes_xmax],
        axis=-1)
    return decoded_boxes
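
# encode_boxes/decode_boxes implement the standard Faster R-CNN box
# parameterization dy = (yc - ya) / ha, dx = (xc - xa) / wa,
# dh = log(h / ha), dw = log(w / wa), so decoding an encoding against the same
# anchor recovers the box (up to the BBOX_XFORM_CLIP cap on dh/dw). A
# round-trip sketch (illustrative only):
#
#   anchors = tf.constant([[0., 0., 10., 10.]])
#   boxes = tf.constant([[1., 2., 9., 8.]])
#   targets = encode_boxes(boxes, anchors)      # [[0., 0., log(.8), log(.6)]]
#   recovered = decode_boxes(targets, anchors)  # ~[[1., 2., 9., 8.]]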

def filter_boxes(boxes, scores, image_shape, min_size_threshold):
  """Filters and removes boxes that are too small or fall outside the image.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
      representing the original scores of the boxes.
    image_shape: a tensor whose shape is the same as, or `broadcastable` to
      `boxes` except the last dimension, which is 2, representing [height,
      width] of the scaled image.
    min_size_threshold: a float representing the minimal box size in each side
      (w.r.t. the scaled image). Boxes whose sides are smaller than it will be
      filtered out.

  Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with
      the positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with
      the positions of the filtered boxes filled with 0.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('filter_boxes'):
    if isinstance(image_shape, (list, tuple)):
      height, width = image_shape
    else:
      image_shape = tf.cast(image_shape, dtype=boxes.dtype)
      height = image_shape[..., 0]
      width = image_shape[..., 1]

    ymin = boxes[..., 0]
    xmin = boxes[..., 1]
    ymax = boxes[..., 2]
    xmax = boxes[..., 3]

    h = ymax - ymin
    w = xmax - xmin
    yc = ymin + 0.5 * h
    xc = xmin + 0.5 * w

    min_size = tf.cast(
        tf.math.maximum(min_size_threshold, 0.0), dtype=boxes.dtype)

    filtered_size_mask = tf.math.logical_and(
        tf.math.greater(h, min_size), tf.math.greater(w, min_size))
    filtered_center_mask = tf.logical_and(
        tf.math.logical_and(
            tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
        tf.math.logical_and(
            tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
    filtered_mask = tf.math.logical_and(filtered_size_mask,
                                        filtered_center_mask)

    filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
    filtered_boxes = tf.cast(
        tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
    return filtered_boxes, filtered_scores


def filter_boxes_by_scores(boxes, scores, min_score_threshold):
  """Filters and removes boxes whose scores are smaller than the threshold.

  Args:
    boxes: a tensor whose last dimension is 4 representing the coordinates of
      boxes in ymin, xmin, ymax, xmax order.
    scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
      representing the original scores of the boxes.
    min_score_threshold: a float representing the minimal box score threshold.
      Boxes whose score are smaller than it will be filtered out.

  Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with
      the positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with
      the positions of the filtered boxes filled with -1.
  """
  if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
        boxes.shape[-1]))

  with tf.name_scope('filter_boxes_by_scores'):
    filtered_mask = tf.math.greater(scores, min_score_threshold)
    filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
    filtered_boxes = tf.cast(
        tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes

    return filtered_boxes, filtered_scores


def gather_instances(selected_indices, instances, *aux_instances):
  """Gathers instances by indices.

  Args:
    selected_indices: a Tensor of shape [batch, K] which indicates the selected
      indices in instance dimension (2nd dimension).
    instances: a Tensor of shape [batch, N, ...] where the 2nd dimension is
      the instance dimension to be selected from.
    *aux_instances: the additional Tensors whose shapes are in [batch, N, ...]
      which are the tensors to be selected from using the `selected_indices`.

  Returns:
    selected_instances: the tensor of shape [batch, K, ...] which corresponds
      to the selected instances of the `instances` tensor.
    selected_aux_instances: the additional tensors of shape [batch, K, ...]
      which correspond to the selected instances of the `aux_instances`
      tensors.
  """
  batch_size = instances.shape[0]
  if batch_size == 1:
    selected_instances = tf.squeeze(
        tf.gather(instances, selected_indices, axis=1), axis=1)
    if aux_instances:
      selected_aux_instances = [
          tf.squeeze(tf.gather(a, selected_indices, axis=1), axis=1)
          for a in aux_instances
      ]
      return tuple([selected_instances] + selected_aux_instances)
    else:
      return selected_instances
  else:
    indices_shape = tf.shape(selected_indices)
    batch_indices = (
        tf.expand_dims(tf.range(indices_shape[0]), axis=-1) *
        tf.ones([1, indices_shape[-1]], dtype=tf.int32))
    gather_nd_indices = tf.stack(
        [batch_indices, selected_indices], axis=-1)
    selected_instances = tf.gather_nd(instances, gather_nd_indices)
    if aux_instances:
      selected_aux_instances = [
          tf.gather_nd(a, gather_nd_indices) for a in aux_instances
      ]
      return tuple([selected_instances] + selected_aux_instances)
    else:
      return selected_instances


def top_k_boxes(boxes, scores, k):
  """Sorts and selects top k boxes according to the scores.

  Args:
    boxes: a tensor of shape [batch_size, N, 4] representing the coordinates of
      the boxes. N is the number of boxes per image.
    scores: a tensor of shape [batch_size, N] representing the score of the
      boxes.
    k: an integer or a tensor indicating the top k number.

  Returns:
    selected_boxes: a tensor of shape [batch_size, k, 4] representing the
      selected top k box coordinates.
    selected_scores: a tensor of shape [batch_size, k] representing the
      selected top k box scores.
  """
  with tf.name_scope('top_k_boxes'):
    selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
    selected_boxes = gather_instances(top_k_indices, boxes)
    return selected_boxes, selected_scores


def get_non_empty_box_indices(boxes):
  """Gets indices for non-empty boxes."""
  # Selects indices where both box height and width are greater than 0.
  height = boxes[:, 2] - boxes[:, 0]
  width = boxes[:, 3] - boxes[:, 1]
  indices = tf.where(
      tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
  return indices[:, 0]


def bbox_overlap(boxes, gt_boxes):
  """Calculates the overlap between proposal and ground truth boxes.

  Some `boxes` or `gt_boxes` may have been padded. The returned `iou` tensor
  for these boxes will be -1.

  Args:
    boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
      proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
      last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
    gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
      tensor might have paddings with a negative value.

  Returns:
    iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
  """
  with tf.name_scope('bbox_overlap'):
    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
        value=boxes, num_or_size_splits=4, axis=2)
    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
        value=gt_boxes, num_or_size_splits=4, axis=2)

    # Calculates the intersection area.
    i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
    i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
    i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
    i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
    i_area = (
        tf.math.maximum((i_xmax - i_xmin), 0) *
        tf.math.maximum((i_ymax - i_ymin), 0))

    # Calculates the union area.
    bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
    gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
    # Adds a small epsilon to avoid divide-by-zero.
    u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8

    # Calculates IoU.
    iou = i_area / u_area

    # Fills -1 for IoU entries between the padded ground truth boxes.
    gt_invalid_mask = tf.less(
        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
    padding_mask = tf.logical_or(
        tf.zeros_like(bb_x_min, dtype=tf.bool),
        tf.transpose(gt_invalid_mask, [0, 2, 1]))
    iou = tf.where(padding_mask, -tf.ones_like(iou), iou)

    # Fills -1 for invalid (-1) boxes.
    boxes_invalid_mask = tf.less(
        tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
    iou = tf.where(boxes_invalid_mask, -tf.ones_like(iou), iou)

    return iou
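
# A small worked example for bbox_overlap; the second ground truth row is
# padding (all -1) and therefore comes back as -1 in the IoU matrix:
#
#   boxes = tf.constant([[[0., 0., 10., 10.]]])                   # [1, 1, 4]
#   gt = tf.constant([[[0., 0., 5., 5.], [-1., -1., -1., -1.]]])  # [1, 2, 4]
#   bbox_overlap(boxes, gt)  # intersection 25 / union 100 -> [[[0.25, -1.]]]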


def bbox_generalized_overlap(boxes, gt_boxes):
  """Calculates the GIoU between proposal and ground truth boxes.

  The generalized intersection over union (GIoU) is an adjustment of the
  traditional IoU metric which provides continuous updates even for
  predictions with no overlap. This metric is defined in
  https://giou.stanford.edu/GIoU.pdf. Note, some `gt_boxes` may have been
  padded. The returned `giou` tensor for these boxes will be -1.

  Args:
    boxes: a `Tensor` with a shape of [batch_size, N, 4]. N is the number of
      proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
      last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
    gt_boxes: a `Tensor` with a shape of [batch_size, max_num_instances, 4].
      This tensor may have paddings with a negative value and will also be in
      the [ymin, xmin, ymax, xmax] format.

  Returns:
    giou: a `Tensor` with a shape of [batch_size, N, max_num_instances].
  """
  with tf.name_scope('bbox_generalized_overlap'):
    assert boxes.shape.as_list()[-1] == 4, (
        'Boxes must be defined by 4 coordinates.')
    assert gt_boxes.shape.as_list()[-1] == 4, (
        'Groundtruth boxes must be defined by 4 coordinates.')

    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
        value=boxes, num_or_size_splits=4, axis=2)
    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
        value=gt_boxes, num_or_size_splits=4, axis=2)

    # Calculates the hull area for each pair of boxes, with one from
    # boxes and the other from gt_boxes.
    # Outputs for coordinates are of shape [batch_size, N, max_num_instances]
    h_xmin = tf.minimum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
    h_xmax = tf.maximum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
    h_ymin = tf.minimum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
    h_ymax = tf.maximum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
    h_area = tf.maximum((h_xmax - h_xmin), 0) * tf.maximum(
        (h_ymax - h_ymin), 0)
    # Adds a small epsilon to avoid divide-by-zero.
    h_area = h_area + 1e-8

    # Calculates the intersection area.
    i_xmin = tf.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
    i_xmax = tf.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
    i_ymin = tf.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
    i_ymax = tf.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
    i_area = tf.maximum((i_xmax - i_xmin), 0) * tf.maximum(
        (i_ymax - i_ymin), 0)

    # Calculates the union area.
    bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
    gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
    # Adds a small epsilon to avoid divide-by-zero.
    u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8

    # Calculates IoU.
    iou = i_area / u_area
    # Calculates GIoU.
    giou = iou - (h_area - u_area) / h_area

    # Fills -1 for GIoU entries between the padded ground truth boxes.
    gt_invalid_mask = tf.less(
        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
    padding_mask = tf.broadcast_to(
        tf.transpose(gt_invalid_mask, [0, 2, 1]), tf.shape(giou))
    giou = tf.where(padding_mask, -tf.ones_like(giou), giou)
    return giou
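

# Worked example (not from the original module; values are illustrative):
# for disjoint boxes GIoU goes negative while plain IoU saturates at 0,
# which is what makes it usable as a regression signal.
def _bbox_generalized_overlap_example():
  boxes = tf.constant([[[0., 0., 1., 1.]]])     # [1, 1, 4]
  gt_boxes = tf.constant([[[0., 2., 1., 3.]]])  # disjoint from `boxes`
  giou = bbox_generalized_overlap(boxes, gt_boxes)
  # iou = 0, hull area = 3, union = 2, so giou = 0 - (3 - 2) / 3 ~ -0.333.
  return giou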


def box_matching(boxes, gt_boxes, gt_classes):
  """Match boxes to groundtruth boxes.

  Given the proposal boxes and the groundtruth boxes and classes, perform the
  groundtruth matching by taking the argmax of the IoU between boxes and
  groundtruth boxes.

  Args:
    boxes: a tensor of shape of [batch_size, N, 4] representing the box
      coordinates to be matched to groundtruth boxes.
    gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
      the groundtruth box coordinates. It is padded with -1s to indicate the
      invalid boxes.
    gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
      classes. It is padded with -1s to indicate the invalid classes.

  Returns:
    matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
      the matched groundtruth box coordinates for each input box. If the box
      does not overlap with any groundtruth boxes, the matched boxes of it
      will be set to all 0s.
    matched_gt_classes: a tensor of shape of [batch_size, N], representing
      the matched groundtruth classes for each input box. If the box does not
      overlap with any groundtruth boxes, the matched box classes of it will
      be set to 0, which corresponds to the background class.
    matched_gt_indices: a tensor of shape of [batch_size, N], representing
      the indices of the matched groundtruth boxes in the original gt_boxes
      tensor. If the box does not overlap with any groundtruth boxes, the
      index of the matched groundtruth will be set to -1.
    matched_iou: a tensor of shape of [batch_size, N], representing the IoU
      between the box and its matched groundtruth box. The matched IoU is the
      maximum IoU of the box and all the groundtruth boxes.
    iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
      between boxes and the groundtruth boxes. The IoU between a box and the
      invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
  """
  # Compute IoU between boxes and gt_boxes.
  # iou <- [batch_size, N, K]
  iou = bbox_overlap(boxes, gt_boxes)

  # max_iou <- [batch_size, N]
  # 0.0 -> no match to gt, or -1.0 match to no gt
  matched_iou = tf.reduce_max(iou, axis=-1)

  # background_box_mask <- bool, [batch_size, N]
  background_box_mask = tf.less_equal(matched_iou, 0.0)

  argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)

  matched_gt_boxes, matched_gt_classes = gather_instances(
      argmax_iou_indices, gt_boxes, gt_classes)
  matched_gt_boxes = tf.where(
      tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
      tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
      matched_gt_boxes)
  matched_gt_classes = tf.where(
      background_box_mask,
      tf.zeros_like(matched_gt_classes),
      matched_gt_classes)

  matched_gt_indices = tf.where(
      background_box_mask,
      -tf.ones_like(argmax_iou_indices),
      argmax_iou_indices)

  return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
          matched_iou, iou)
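

# Usage sketch (not from the original module; values are illustrative): how
# the five outputs of `box_matching` line up for a single proposal. Class 3
# is an arbitrary foreground label; `gather_instances` is defined earlier in
# this module.
def _box_matching_example():
  boxes = tf.constant([[[0., 0., 10., 10.]]])         # [1, 1, 4]
  gt_boxes = tf.constant([[[0., 0., 10., 10.],
                           [-1., -1., -1., -1.]]])    # [1, 2, 4]
  gt_classes = tf.constant([[3, -1]])                 # [1, 2]
  (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
   iou) = box_matching(boxes, gt_boxes, gt_classes)
  # matched_gt_classes == [[3]], matched_gt_indices == [[0]],
  # matched_iou ~ [[1.0]], and iou[..., 1] == -1 for the padded column.
  return matched_gt_boxes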
official/vision/ops/iou_similarity.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Region Similarity Calculators."""
import tensorflow as tf


def area(box):
  """Computes area of boxes.

  B: batch_size
  N: number of boxes

  Args:
    box: a float Tensor with [N, 4], or [B, N, 4].

  Returns:
    a float Tensor with [N], or [B, N]
  """
  with tf.name_scope('Area'):
    y_min, x_min, y_max, x_max = tf.split(
        value=box, num_or_size_splits=4, axis=-1)
    return tf.squeeze((y_max - y_min) * (x_max - x_min), axis=-1)


def intersection(gt_boxes, boxes):
  """Compute pairwise intersection areas between boxes.

  B: batch_size
  N: number of groundtruth boxes.
  M: number of anchor boxes.

  Args:
    gt_boxes: a float Tensor with [N, 4], or [B, N, 4]
    boxes: a float Tensor with [M, 4], or [B, M, 4]

  Returns:
    a float Tensor with shape [N, M] or [B, N, M] representing pairwise
      intersections.
  """
  with tf.name_scope('Intersection'):
    y_min1, x_min1, y_max1, x_max1 = tf.split(
        value=gt_boxes, num_or_size_splits=4, axis=-1)
    y_min2, x_min2, y_max2, x_max2 = tf.split(
        value=boxes, num_or_size_splits=4, axis=-1)

    boxes_rank = len(boxes.shape)
    perm = [1, 0] if boxes_rank == 2 else [0, 2, 1]
    # [N, M] or [B, N, M]
    y_min_max = tf.minimum(y_max1, tf.transpose(y_max2, perm))
    y_max_min = tf.maximum(y_min1, tf.transpose(y_min2, perm))
    x_min_max = tf.minimum(x_max1, tf.transpose(x_max2, perm))
    x_max_min = tf.maximum(x_min1, tf.transpose(x_min2, perm))

    intersect_heights = y_min_max - y_max_min
    intersect_widths = x_min_max - x_max_min
    zeros_t = tf.cast(0, intersect_heights.dtype)
    intersect_heights = tf.maximum(zeros_t, intersect_heights)
    intersect_widths = tf.maximum(zeros_t, intersect_widths)
    return intersect_heights * intersect_widths


def iou(gt_boxes, boxes):
  """Computes pairwise intersection-over-union between box collections.

  Args:
    gt_boxes: a float Tensor with [N, 4].
    boxes: a float Tensor with [M, 4].

  Returns:
    a Tensor with shape [N, M] representing pairwise iou scores.
  """
  with tf.name_scope('IOU'):
    intersections = intersection(gt_boxes, boxes)
    gt_boxes_areas = area(gt_boxes)
    boxes_areas = area(boxes)
    boxes_rank = len(boxes_areas.shape)
    boxes_axis = 1 if (boxes_rank == 2) else 0
    gt_boxes_areas = tf.expand_dims(gt_boxes_areas, -1)
    boxes_areas = tf.expand_dims(boxes_areas, boxes_axis)
    unions = gt_boxes_areas + boxes_areas
    unions = unions - intersections
    return tf.where(
        tf.equal(intersections, 0.0), tf.zeros_like(intersections),
        tf.truediv(intersections, unions))


class IouSimilarity:
  """Class to compute similarity based on Intersection over Union (IOU) metric.
  """

  def __init__(self, mask_val=-1):
    self.mask_val = mask_val

  def __call__(self, boxes_1, boxes_2, boxes_1_masks=None, boxes_2_masks=None):
    """Compute pairwise IOU similarity between ground truth boxes and anchors.

    B: batch_size
    N: Number of groundtruth boxes.
    M: Number of anchor boxes.

    Args:
      boxes_1: a float Tensor with N or B * N groundtruth boxes.
      boxes_2: a float Tensor with M or B * M anchor boxes; its rank must be
        less than or equal to the rank of `boxes_1`.
      boxes_1_masks: a boolean Tensor with N or B * N boxes. Optional.
      boxes_2_masks: a boolean Tensor with M or B * M boxes. Optional.

    Returns:
      A Tensor with shape [N, M] or [B, N, M] representing pairwise iou
        scores, one groundtruth box per row and one anchor per column.

    Input shape:
      boxes_1: [N, 4], or [B, N, 4]
      boxes_2: [M, 4], or [B, M, 4]
      boxes_1_masks: [N, 1], or [B, N, 1]
      boxes_2_masks: [M, 1], or [B, M, 1]

    Output shape:
      [N, M], or [B, N, M]
    """
    boxes_1 = tf.cast(boxes_1, tf.float32)
    boxes_2 = tf.cast(boxes_2, tf.float32)

    boxes_1_rank = len(boxes_1.shape)
    boxes_2_rank = len(boxes_2.shape)
    if boxes_1_rank < 2 or boxes_1_rank > 3:
      raise ValueError(
          '`groundtruth_boxes` must be rank 2 or 3, got {}'.format(
              boxes_1_rank))
    if boxes_2_rank < 2 or boxes_2_rank > 3:
      raise ValueError(
          '`anchors` must be rank 2 or 3, got {}'.format(boxes_2_rank))
    if boxes_1_rank < boxes_2_rank:
      raise ValueError(
          'Unbatched `groundtruth_boxes` with batched `anchors` is not a '
          'valid use case, got groundtruth_box rank {}, and anchors rank '
          '{}'.format(boxes_1_rank, boxes_2_rank))

    result = iou(boxes_1, boxes_2)
    if boxes_1_masks is None and boxes_2_masks is None:
      return result
    background_mask = None
    mask_val_t = tf.cast(self.mask_val, result.dtype) * tf.ones_like(result)
    perm = [1, 0] if boxes_2_rank == 2 else [0, 2, 1]
    if boxes_1_masks is not None and boxes_2_masks is not None:
      background_mask = tf.logical_or(boxes_1_masks,
                                      tf.transpose(boxes_2_masks, perm))
    elif boxes_1_masks is not None:
      background_mask = boxes_1_masks
    else:
      background_mask = tf.logical_or(
          tf.zeros(tf.shape(boxes_2)[:-1], dtype=tf.bool),
          tf.transpose(boxes_2_masks, perm))
    return tf.where(background_mask, mask_val_t, result)
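

# Usage sketch (not from the original module; values are illustrative): the
# optional mask arguments, which the unit tests in the next file do not
# exercise. A True entry marks a padded box whose row is filled with
# `mask_val`.
def _iou_similarity_mask_example():
  gt_boxes = tf.constant([[0., 0., 5., 5.],
                          [0., 0., 0., 0.]])   # second row is padding
  anchors = tf.constant([[0., 0., 5., 5.]])
  gt_masks = tf.constant([[False], [True]])    # flag the padded row
  sim = IouSimilarity(mask_val=-1)(gt_boxes, anchors, boxes_1_masks=gt_masks)
  # sim ~ [[1.0], [-1.0]]: the padded ground truth row is masked out.
  return sim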
official/vision/ops/iou_similarity_test.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for iou_similarity.py."""
import tensorflow as tf

from official.vision.ops import iou_similarity


class BoxMatcherTest(tf.test.TestCase):

  def test_similarity_unbatched(self):
    boxes = tf.constant(
        [
            [0, 0, 1, 1],
            [5, 0, 10, 5],
        ],
        dtype=tf.float32)

    gt_boxes = tf.constant(
        [
            [0, 0, 5, 5],
            [0, 5, 5, 10],
            [5, 0, 10, 5],
            [5, 5, 10, 10],
        ],
        dtype=tf.float32)

    sim_calc = iou_similarity.IouSimilarity()
    sim_matrix = sim_calc(boxes, gt_boxes)

    self.assertAllClose(
        sim_matrix.numpy(),
        [[0.04, 0, 0, 0],
         [0, 0, 1., 0]])

  def test_similarity_batched(self):
    boxes = tf.constant(
        [[
            [0, 0, 1, 1],
            [5, 0, 10, 5],
        ]],
        dtype=tf.float32)

    gt_boxes = tf.constant(
        [[
            [0, 0, 5, 5],
            [0, 5, 5, 10],
            [5, 0, 10, 5],
            [5, 5, 10, 10],
        ]],
        dtype=tf.float32)

    sim_calc = iou_similarity.IouSimilarity()
    sim_matrix = sim_calc(boxes, gt_boxes)

    self.assertAllClose(
        sim_matrix.numpy(),
        [[[0.04, 0, 0, 0],
          [0, 0, 1., 0]]])


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/mask_ops.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for segmentations."""
import math

# Import libraries

import cv2
import numpy as np


def paste_instance_masks(masks, detected_boxes, image_height, image_width):
  """Paste instance masks to generate the image segmentation results.

  Args:
    masks: a numpy array of shape [N, mask_height, mask_width] representing the
      instance masks w.r.t. the `detected_boxes`.
    detected_boxes: a numpy array of shape [N, 4] representing the reference
      bounding boxes.
    image_height: an integer representing the height of the image.
    image_width: an integer representing the width of the image.

  Returns:
    segms: a numpy array of shape [N, image_height, image_width] representing
      the instance masks *pasted* on the image canvas.
  """

  def expand_boxes(boxes, scale):
    """Expands an array of boxes by a given scale."""
    # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227  # pylint: disable=line-too-long
    # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
    # whereas `boxes` here is in [x1, y1, w, h] form.
    w_half = boxes[:, 2] * .5
    h_half = boxes[:, 3] * .5
    x_c = boxes[:, 0] + w_half
    y_c = boxes[:, 1] + h_half

    w_half *= scale
    h_half *= scale

    boxes_exp = np.zeros(boxes.shape)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half

    return boxes_exp

  # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812  # pylint: disable=line-too-long
  # To work around an issue with cv2.resize (it seems to automatically pad
  # with repeated border values), we manually zero-pad the masks by 1 pixel
  # prior to resizing back to the original image resolution. This prevents
  # "top hat" artifacts. We therefore need to expand the reference boxes by an
  # appropriate factor.
  _, mask_height, mask_width = masks.shape
  scale = max((mask_width + 2.0) / mask_width,
              (mask_height + 2.0) / mask_height)

  ref_boxes = expand_boxes(detected_boxes, scale)
  ref_boxes = ref_boxes.astype(np.int32)
  padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
  segms = []
  for mask_ind, mask in enumerate(masks):
    im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
    # Process mask inside bounding boxes.
    padded_mask[1:-1, 1:-1] = mask[:, :]

    ref_box = ref_boxes[mask_ind, :]
    w = ref_box[2] - ref_box[0] + 1
    h = ref_box[3] - ref_box[1] + 1
    w = np.maximum(w, 1)
    h = np.maximum(h, 1)

    mask = cv2.resize(padded_mask, (w, h))
    mask = np.array(mask > 0.5, dtype=np.uint8)

    x_0 = min(max(ref_box[0], 0), image_width)
    x_1 = min(max(ref_box[2] + 1, 0), image_width)
    y_0 = min(max(ref_box[1], 0), image_height)
    y_1 = min(max(ref_box[3] + 1, 0), image_height)

    im_mask[y_0:y_1, x_0:x_1] = mask[
        (y_0 - ref_box[1]):(y_1 - ref_box[1]),
        (x_0 - ref_box[0]):(x_1 - ref_box[0])]
    segms.append(im_mask)

  segms = np.array(segms)
  assert masks.shape[0] == segms.shape[0]
  return segms
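

# Usage sketch (not from the original module; values are illustrative):
# pastes one 2x2 mask into a 6x6 canvas. `detected_boxes` is in
# [x, y, w, h] pixel form, matching the expand_boxes comment above.
def _paste_instance_masks_example():
  masks = np.ones((1, 2, 2), dtype=np.float32)
  detected_boxes = np.array([[1.0, 1.0, 3.0, 3.0]])  # x, y, w, h
  segms = paste_instance_masks(masks, detected_boxes, 6, 6)
  # segms.shape == (1, 6, 6), with a solid uint8 patch around the box.
  return segms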


def paste_instance_masks_v2(masks, detected_boxes, image_height, image_width):
  """Paste instance masks to generate the image segmentation (v2).

  Args:
    masks: a numpy array of shape [N, mask_height, mask_width] representing the
      instance masks w.r.t. the `detected_boxes`.
    detected_boxes: a numpy array of shape [N, 4] representing the reference
      bounding boxes.
    image_height: an integer representing the height of the image.
    image_width: an integer representing the width of the image.

  Returns:
    segms: a numpy array of shape [N, image_height, image_width] representing
      the instance masks *pasted* on the image canvas.
  """
  _, mask_height, mask_width = masks.shape

  segms = []
  for i, mask in enumerate(masks):
    box = detected_boxes[i, :]
    xmin = box[0]
    ymin = box[1]
    xmax = xmin + box[2]
    ymax = ymin + box[3]

    # Sample points of the cropped mask w.r.t. the image grid.
    # Note that these coordinates may fall beyond the image.
    # Pixel clipping will happen after warping.
    xmin_int = int(math.floor(xmin))
    xmax_int = int(math.ceil(xmax))
    ymin_int = int(math.floor(ymin))
    ymax_int = int(math.ceil(ymax))

    alpha = box[2] / (1.0 * mask_width)
    beta = box[3] / (1.0 * mask_height)
    # pylint: disable=invalid-name
    # Transformation from mask pixel indices to image coordinate.
    M_mask_to_image = np.array(
        [[alpha, 0, xmin],
         [0, beta, ymin],
         [0, 0, 1]],
        dtype=np.float32)
    # Transformation from image to cropped mask coordinate.
    M_image_to_crop = np.array(
        [[1, 0, -xmin_int],
         [0, 1, -ymin_int],
         [0, 0, 1]],
        dtype=np.float32)
    M = np.dot(M_image_to_crop, M_mask_to_image)
    # Compensate the half pixel offset that OpenCV has in the
    # warpPerspective implementation: the top-left pixel is sampled
    # at (0,0), but we want it to be at (0.5, 0.5).
    M = np.dot(
        np.dot(
            np.array([[1, 0, -0.5],
                      [0, 1, -0.5],
                      [0, 0, 1]], np.float32), M),
        np.array([[1, 0, 0.5],
                  [0, 1, 0.5],
                  [0, 0, 1]], np.float32))
    # pylint: enable=invalid-name
    cropped_mask = cv2.warpPerspective(
        mask.astype(np.float32), M,
        (xmax_int - xmin_int, ymax_int - ymin_int))
    cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)

    img_mask = np.zeros((image_height, image_width))
    x0 = max(min(xmin_int, image_width), 0)
    x1 = max(min(xmax_int, image_width), 0)
    y0 = max(min(ymin_int, image_height), 0)
    y1 = max(min(ymax_int, image_height), 0)
    img_mask[y0:y1, x0:x1] = cropped_mask[
        (y0 - ymin_int):(y1 - ymin_int),
        (x0 - xmin_int):(x1 - xmin_int)]

    segms.append(img_mask)

  segms = np.array(segms)
  return segms
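

# Worked check (not from the original module): for a box with an integral
# corner (xmin_int == xmin) and a mask the same size as the box, alpha and
# beta are 1 and the composed transform M reduces to the identity, so each
# mask pixel lands on exactly one crop pixel.
def _paste_v2_transform_example():
  alpha, beta, xmin, ymin, xmin_int, ymin_int = 1., 1., 1., 1., 1, 1
  M_mask_to_image = np.array(
      [[alpha, 0, xmin], [0, beta, ymin], [0, 0, 1]], np.float32)
  M_image_to_crop = np.array(
      [[1, 0, -xmin_int], [0, 1, -ymin_int], [0, 0, 1]], np.float32)
  M = M_image_to_crop @ M_mask_to_image
  half_out = np.array([[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32)
  half_in = np.array([[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32)
  return half_out @ M @ half_in  # ~ np.eye(3) for this configuration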
official/vision/ops/mask_ops_test.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mask_ops.py."""
# Import libraries
import numpy as np
import tensorflow as tf

from official.vision.ops import mask_ops


class MaskUtilsTest(tf.test.TestCase):

  def testPasteInstanceMasks(self):
    image_height = 10
    image_width = 10
    mask_height = 6
    mask_width = 6
    masks = np.random.randint(0, 255, (1, mask_height, mask_width))
    detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])

    _ = mask_ops.paste_instance_masks(
        masks, detected_boxes, image_height, image_width)

  def testPasteInstanceMasksV2(self):
    image_height = 10
    image_width = 10
    mask_height = 6
    mask_width = 6
    masks = np.random.randint(0, 255, (1, mask_height, mask_width))
    detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]])

    image_masks = mask_ops.paste_instance_masks_v2(
        masks, detected_boxes, image_height, image_width)

    self.assertNDArrayNear(
        image_masks[:, 2:8, 0:6],
        np.array(masks > 0.5, dtype=np.uint8),
        1e-5)


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/nms.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow implementation of non max suppression."""
# Import libraries
import tensorflow as tf

from official.vision.ops import box_ops

NMS_TILE_SIZE = 512


def _self_suppression(iou, _, iou_sum):
  batch_size = tf.shape(iou)[0]
  can_suppress_others = tf.cast(
      tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
  iou_suppressed = tf.reshape(
      tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
      [batch_size, -1, 1]) * iou
  iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
  return [
      iou_suppressed,
      tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
  ]


def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
  batch_size = tf.shape(boxes)[0]
  new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
                       [batch_size, NMS_TILE_SIZE, 4])
  iou = box_ops.bbox_overlap(new_slice, box_slice)
  ret_slice = tf.expand_dims(
      tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
      2) * box_slice
  return boxes, ret_slice, iou_threshold, inner_idx + 1


def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
  """Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).

  Args:
    boxes: a tensor with a shape of [batch_size, anchors, 4].
    iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
    output_size: an int32 tensor of size [batch_size]. Representing the number
      of selected boxes for each batch.
    idx: an integer scalar representing induction variable.

  Returns:
    boxes: updated boxes.
    iou_threshold: pass down iou_threshold to the next iteration.
    output_size: the updated output_size.
    idx: the updated induction variable.
  """
  num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE
  batch_size = tf.shape(boxes)[0]

  # Iterates over tiles that can possibly suppress the current tile.
  box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
                       [batch_size, NMS_TILE_SIZE, 4])
  _, box_slice, _, _ = tf.while_loop(
      lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
      _cross_suppression,
      [boxes, box_slice, iou_threshold, tf.constant(0)])

  # Iterates over the current tile to compute self-suppression.
  iou = box_ops.bbox_overlap(box_slice, box_slice)
  mask = tf.expand_dims(
      tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
          tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
  iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
  suppressed_iou, _, _ = tf.while_loop(
      lambda _iou, loop_condition, _iou_sum: loop_condition,
      _self_suppression,
      [iou, tf.constant(True), tf.reduce_sum(iou, [1, 2])])
  suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
  box_slice *= tf.expand_dims(
      1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)

  # Uses box_slice to update the input boxes.
  mask = tf.reshape(
      tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
  boxes = tf.tile(tf.expand_dims(box_slice, [1]),
                  [1, num_tiles, 1, 1]) * mask + tf.reshape(
                      boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (
                          1 - mask)
  boxes = tf.reshape(boxes, [batch_size, -1, 4])

  # Updates output_size.
  output_size += tf.reduce_sum(
      tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
  return boxes, iou_threshold, output_size, idx + 1


def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
                                      iou_threshold):
  """A wrapper that handles non-maximum suppression.

  Assumption:
    * The boxes are sorted by scores unless the box is a dot (all coordinates
      are zero).
    * Boxes with higher scores can be used to suppress boxes with lower
      scores.

  The overall design of the algorithm is to handle boxes tile-by-tile:

  boxes = boxes.pad_to_multiple_of(tile_size)
  num_tiles = len(boxes) // tile_size
  output_boxes = []
  for i in range(num_tiles):
    box_tile = boxes[i*tile_size : (i+1)*tile_size]
    for j in range(i - 1):
      suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
      iou = bbox_overlap(box_tile, suppressing_tile)
      # if the box is suppressed in iou, clear it to a dot
      box_tile *= _update_boxes(iou)
    # Iteratively handle the diagonal tile.
    iou = _box_overlap(box_tile, box_tile)
    iou_changed = True
    while iou_changed:
      # boxes that are not suppressed by anything else
      suppressing_boxes = _get_suppressing_boxes(iou)
      # boxes that are suppressed by suppressing_boxes
      suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
      # clear iou to 0 for boxes that are suppressed, as they cannot be used
      # to suppress other boxes any more
      new_iou = _clear_iou(iou, suppressed_boxes)
      iou_changed = (new_iou != iou)
      iou = new_iou
    # remaining boxes that can still suppress others, are selected boxes.
    output_boxes.append(_get_suppressing_boxes(iou))
    if len(output_boxes) >= max_output_size:
      break

  Args:
    scores: a tensor with a shape of [batch_size, anchors].
    boxes: a tensor with a shape of [batch_size, anchors, 4].
    max_output_size: a scalar integer `Tensor` representing the maximum number
      of boxes to be selected by non max suppression.
    iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.

  Returns:
    nms_scores: a tensor with a shape of [batch_size, anchors]. It has same
      dtype as input scores.
    nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has
      same dtype as input boxes.
  """
  batch_size = tf.shape(boxes)[0]
  num_boxes = tf.shape(boxes)[1]
  pad = tf.cast(
      tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
      tf.int32) * NMS_TILE_SIZE - num_boxes
  boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
  scores = tf.pad(
      tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
  num_boxes += pad

  def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
    return tf.logical_and(
        tf.reduce_min(output_size) < max_output_size,
        idx < num_boxes // NMS_TILE_SIZE)

  selected_boxes, _, output_size, _ = tf.while_loop(
      _loop_cond, _suppression_loop_body,
      [boxes, iou_threshold,
       tf.zeros([batch_size], tf.int32),
       tf.constant(0)])
  idx = num_boxes - tf.cast(
      tf.nn.top_k(
          tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
          tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
      tf.int32)
  idx = tf.minimum(idx, num_boxes - 1)
  idx = tf.reshape(
      idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1])
  boxes = tf.reshape(
      tf.gather(tf.reshape(boxes, [-1, 4]), idx),
      [batch_size, max_output_size, 4])
  boxes = boxes * tf.cast(
      tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
          output_size, [-1, 1, 1]), boxes.dtype)
  scores = tf.reshape(
      tf.gather(tf.reshape(scores, [-1, 1]), idx),
      [batch_size, max_output_size])
  scores = scores * tf.cast(
      tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
          output_size, [-1, 1]), scores.dtype)
  return scores, boxes
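

# Usage sketch (not from the original module; values are illustrative):
# inputs must already be sorted by score, highest first; outputs are
# zero-padded past the number of surviving boxes.
def _sorted_nms_example():
  scores = tf.constant([[0.9, 0.8, 0.7]])
  boxes = tf.constant([[[0., 0., 10., 10.],
                        [0., 0., 9., 9.],        # IoU with box 0 ~ 0.81
                        [20., 20., 30., 30.]]])
  nms_scores, nms_boxes = sorted_non_max_suppression_padded(
      scores, boxes, max_output_size=2, iou_threshold=0.5)
  # Box 1 is suppressed by box 0; boxes 0 and 2 survive.
  return nms_scores, nms_boxes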
official/vision/ops/preprocess_ops.py
0 → 100644
View file @
c44482ab
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Preprocessing ops."""
import math
from typing import Optional, Tuple, Union

from six.moves import range
import tensorflow as tf

from official.vision.ops import augment
from official.vision.ops import box_ops

CENTER_CROP_FRACTION = 0.875


def clip_or_pad_to_fixed_size(input_tensor, size, constant_values=0):
  """Pads data to a fixed length at the first dimension.

  Args:
    input_tensor: `Tensor` with any dimension.
    size: `int` number for the first dimension of output Tensor.
    constant_values: `int` value assigned to the paddings.

  Returns:
    `Tensor` with the first dimension padded to `size`.
  """
  input_shape = input_tensor.get_shape().as_list()
  padding_shape = []

  # Computes the padding length on the first dimension, clip input tensor if
  # it is longer than `size`.
  input_length = tf.shape(input_tensor)[0]
  input_length = tf.clip_by_value(input_length, 0, size)
  input_tensor = input_tensor[:input_length]

  padding_length = tf.maximum(0, size - input_length)
  padding_shape.append(padding_length)

  # Copies shapes of the rest of input shape dimensions.
  for i in range(1, len(input_shape)):
    padding_shape.append(tf.shape(input_tensor)[i])

  # Pads input tensor to the fixed first dimension.
  paddings = tf.cast(constant_values * tf.ones(padding_shape),
                     input_tensor.dtype)
  padded_tensor = tf.concat([input_tensor, paddings], axis=0)
  output_shape = input_shape
  output_shape[0] = size
  padded_tensor.set_shape(output_shape)
  return padded_tensor
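

# Usage sketch (not from the original module; values are illustrative): this
# is how variable-length box lists are batched to a fixed first dimension.
def _clip_or_pad_example():
  boxes = tf.constant([[0., 0., 1., 1.],
                       [0., 0., 2., 2.]])                     # [2, 4]
  padded = clip_or_pad_to_fixed_size(boxes, 4, constant_values=-1)
  # padded.shape == [4, 4]; rows 2 and 3 are filled with -1.
  clipped = clip_or_pad_to_fixed_size(boxes, 1)
  # clipped.shape == [1, 4]; only the first row is kept.
  return padded, clipped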


def normalize_image(image,
                    offset=(0.485, 0.456, 0.406),
                    scale=(0.229, 0.224, 0.225)):
  """Normalizes the image to zero mean and unit variance."""
  with tf.name_scope('normalize_image'):
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    offset = tf.constant(offset)
    offset = tf.expand_dims(offset, axis=0)
    offset = tf.expand_dims(offset, axis=0)
    image -= offset

    scale = tf.constant(scale)
    scale = tf.expand_dims(scale, axis=0)
    scale = tf.expand_dims(scale, axis=0)
    image /= scale
    return image


def compute_padded_size(desired_size, stride):
  """Compute the padded size given the desired size and the stride.

  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride which is larger than the desired
  dimension. For example, if desired_size = (100, 200) and stride = 32,
  the output padded_size = (128, 224).

  Args:
    desired_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the target output image size.
    stride: an integer, the stride of the backbone network.

  Returns:
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size.
  """
  if isinstance(desired_size, list) or isinstance(desired_size, tuple):
    padded_size = [int(math.ceil(d * 1.0 / stride) * stride)
                   for d in desired_size]
  else:
    padded_size = tf.cast(
        tf.math.ceil(tf.cast(desired_size, dtype=tf.float32) / stride) *
        stride, tf.int32)
  return padded_size
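

# Worked check (not from the original module): mirrors the example in the
# docstring above; each side is rounded up to a multiple of the stride.
def _compute_padded_size_example():
  padded = compute_padded_size([100, 200], 32)
  # ceil(100 / 32) * 32 = 128 and ceil(200 / 32) * 32 = 224.
  assert padded == [128, 224]
  return padded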


def resize_and_crop_image(image,
                          desired_size,
                          padded_size,
                          aug_scale_min=1.0,
                          aug_scale_max=1.0,
                          seed=1,
                          method=tf.image.ResizeMethod.BILINEAR):
  """Resizes the input image to output size (RetinaNet style).

  Resize and pad images given the desired output size of the image and
  stride size.

  Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and rescale the image to make it
     the largest rectangle to be bounded by the rectangle specified by the
     `desired_size`.
  2. Pad the rescaled image to the padded_size.

  Args:
    image: a `Tensor` of shape [height, width, 3] representing an image.
    desired_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the desired actual output image size.
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size. Padding will be applied
      after scaling the image to the desired_size.
    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
      random scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
      random scale applied to desired_size for training scale jittering.
    seed: seed for random scale jittering.
    method: function to resize input image to scaled image.

  Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals to `output_size`.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [desired_height, desired_width],
       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """
  with tf.name_scope('resize_and_crop_image'):
    image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

    random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)

    if random_jittering:
      random_scale = tf.random.uniform(
          [], aug_scale_min, aug_scale_max, seed=seed)
      scaled_size = tf.round(random_scale * desired_size)
    else:
      scaled_size = desired_size

    scale = tf.minimum(
        scaled_size[0] / image_size[0], scaled_size[1] / image_size[1])
    scaled_size = tf.round(image_size * scale)

    # Computes 2D image_scale.
    image_scale = scaled_size / image_size

    # Selects non-zero random offset (x, y) if scaled image is larger than
    # desired_size.
    if random_jittering:
      max_offset = scaled_size - desired_size
      max_offset = tf.where(
          tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
      offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
      offset = tf.cast(offset, tf.int32)
    else:
      offset = tf.zeros((2,), tf.int32)

    scaled_image = tf.image.resize(
        image, tf.cast(scaled_size, tf.int32), method=method)

    if random_jittering:
      scaled_image = scaled_image[
          offset[0]:offset[0] + desired_size[0],
          offset[1]:offset[1] + desired_size[1], :]

    output_image = tf.image.pad_to_bounding_box(
        scaled_image, 0, 0, padded_size[0], padded_size[1])

    image_info = tf.stack([
        image_size,
        tf.constant(desired_size, dtype=tf.float32),
        image_scale,
        tf.cast(offset, tf.float32)])
    return output_image, image_info
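

# Usage sketch (not from the original module; values are illustrative): with
# no scale jittering, a 100x150 image is scaled to fit inside 256x256, then
# padded to a stride-aligned canvas.
def _resize_and_crop_image_example():
  image = tf.zeros([100, 150, 3])
  output_image, image_info = resize_and_crop_image(
      image, desired_size=[256, 256],
      padded_size=compute_padded_size([256, 256], 32))
  # scale = min(256 / 100, 256 / 150) ~ 1.707, scaled size ~ [171, 256].
  # image_info rows: original size, desired size, [y_scale, x_scale], offset.
  return output_image, image_info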


def resize_and_crop_image_v2(image,
                             short_side,
                             long_side,
                             padded_size,
                             aug_scale_min=1.0,
                             aug_scale_max=1.0,
                             seed=1,
                             method=tf.image.ResizeMethod.BILINEAR):
  """Resizes the input image to output size (Faster R-CNN style).

  Resize and pad images given the specified short / long side length and the
  stride size.

  Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and first try to rescale the
     short side of the original image to `short_side`.
  2. If the scaled image after 1 has a long side that exceeds `long_side`,
     keep the aspect ratio and rescale the long side of the image to
     `long_side`.
  3. Pad the rescaled image to the padded_size.

  Args:
    image: a `Tensor` of shape [height, width, 3] representing an image.
    short_side: a scalar `Tensor` or `int` representing the desired short side
      to be rescaled to.
    long_side: a scalar `Tensor` or `int` representing the desired long side
      to be rescaled to.
    padded_size: a `Tensor` or `int` list/tuple of two elements representing
      [height, width] of the padded output image size. Padding will be applied
      after scaling the image to the desired_size.
    aug_scale_min: a `float` with range between [0, 1.0] representing minimum
      random scale applied to desired_size for training scale jittering.
    aug_scale_max: a `float` with range between [1.0, inf] representing maximum
      random scale applied to desired_size for training scale jittering.
    seed: seed for random scale jittering.
    method: function to resize input image to scaled image.

  Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals to `output_size`.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [desired_height, desired_width],
       [y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """
  with tf.name_scope('resize_and_crop_image_v2'):
    image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

    scale_using_short_side = (
        short_side / tf.math.minimum(image_size[0], image_size[1]))
    scale_using_long_side = (
        long_side / tf.math.maximum(image_size[0], image_size[1]))

    scaled_size = tf.math.round(image_size * scale_using_short_side)
    scaled_size = tf.where(
        tf.math.greater(
            tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
        tf.math.round(image_size * scale_using_long_side),
        scaled_size)
    desired_size = scaled_size

    random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)

    if random_jittering:
      random_scale = tf.random.uniform(
          [], aug_scale_min, aug_scale_max, seed=seed)
      scaled_size = tf.math.round(random_scale * scaled_size)

    # Computes 2D image_scale.
    image_scale = scaled_size / image_size

    # Selects non-zero random offset (x, y) if scaled image is larger than
    # desired_size.
    if random_jittering:
      max_offset = scaled_size - desired_size
      max_offset = tf.where(
          tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
      offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed)
      offset = tf.cast(offset, tf.int32)
    else:
      offset = tf.zeros((2,), tf.int32)

    scaled_image = tf.image.resize(
        image, tf.cast(scaled_size, tf.int32), method=method)

    if random_jittering:
      scaled_image = scaled_image[
          offset[0]:offset[0] + desired_size[0],
          offset[1]:offset[1] + desired_size[1], :]

    output_image = tf.image.pad_to_bounding_box(
        scaled_image, 0, 0, padded_size[0], padded_size[1])

    image_info = tf.stack([
        image_size,
        tf.cast(desired_size, dtype=tf.float32),
        image_scale,
        tf.cast(offset, tf.float32)])
    return output_image, image_info


def resize_image(image: tf.Tensor,
                 size: Union[Tuple[int, int], int],
                 max_size: Optional[int] = None,
                 method: tf.image.ResizeMethod = tf.image.ResizeMethod.BILINEAR):
  """Resize image with size and max_size.

  Args:
    image: the image to be resized.
    size: if a list or tuple, resize to it. If a scalar, we keep the same
      aspect ratio and resize the short side to the value.
    max_size: only used when `size` is a scalar. If the longer side would
      exceed `max_size` after resizing with `size`, `max_size` is used to set
      the longer side instead, keeping the aspect ratio.
    method: the method argument passed to tf.image.resize.

  Returns:
    the resized image and image_info to be used for downstream processing.
    image_info: a 2D `Tensor` that encodes the information of the image and the
      applied preprocessing. It is in the format of
      [[original_height, original_width], [resized_height, resized_width],
       [y_scale, x_scale], [0, 0]], where [resized_height, resized_width]
      is the actual scaled image size, and [y_scale, x_scale] is the
      scaling factor, which is the ratio of
      scaled dimension / original dimension.
  """

  def get_size_with_aspect_ratio(image_size, size, max_size=None):
    h = image_size[0]
    w = image_size[1]
    if max_size is not None:
      min_original_size = tf.cast(tf.math.minimum(w, h), dtype=tf.float32)
      max_original_size = tf.cast(tf.math.maximum(w, h), dtype=tf.float32)
      if max_original_size / min_original_size * size > max_size:
        size = tf.cast(
            tf.math.floor(max_size * min_original_size / max_original_size),
            dtype=tf.int32)
      else:
        size = tf.cast(size, tf.int32)
    else:
      size = tf.cast(size, tf.int32)

    if (w <= h and w == size) or (h <= w and h == size):
      return tf.stack([h, w])

    if w < h:
      ow = size
      oh = tf.cast(
          (tf.cast(size, dtype=tf.float32) * tf.cast(h, dtype=tf.float32) /
           tf.cast(w, dtype=tf.float32)),
          dtype=tf.int32)
    else:
      oh = size
      ow = tf.cast(
          (tf.cast(size, dtype=tf.float32) * tf.cast(w, dtype=tf.float32) /
           tf.cast(h, dtype=tf.float32)),
          dtype=tf.int32)

    return tf.stack([oh, ow])

  def get_size(image_size, size, max_size=None):
    if isinstance(size, (list, tuple)):
      return size[::-1]
    else:
      return get_size_with_aspect_ratio(image_size, size, max_size)

  original_size = tf.shape(image)[0:2]
  size = get_size(original_size, size, max_size)
  rescaled_image = tf.image.resize(
      image, tf.cast(size, tf.int32), method=method)
  image_scale = size / original_size
  image_info = tf.stack([
      tf.cast(original_size, dtype=tf.float32),
      tf.cast(size, dtype=tf.float32),
      tf.cast(image_scale, tf.float32),
      tf.constant([0.0, 0.0], dtype=tf.float32)
  ])
  return rescaled_image, image_info


def center_crop_image(image):
  """Center crop a square shape slice from the input image.

  It crops a square shape slice from the image. The side of the actual crop
  is 224 / 256 = 0.875 of the short side of the original image. References:
  [1] Very Deep Convolutional Networks for Large-Scale Image Recognition
      https://arxiv.org/abs/1409.1556
  [2] Deep Residual Learning for Image Recognition
      https://arxiv.org/abs/1512.03385

  Args:
    image: a Tensor of shape [height, width, 3] representing the input image.

  Returns:
    cropped_image: a Tensor representing the center cropped image.
  """
  with tf.name_scope('center_crop_image'):
    image_size = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
    crop_size = (
        CENTER_CROP_FRACTION * tf.math.minimum(image_size[0], image_size[1]))
    crop_offset = tf.cast((image_size - crop_size) / 2.0, dtype=tf.int32)
    crop_size = tf.cast(crop_size, dtype=tf.int32)
    cropped_image = image[
        crop_offset[0]:crop_offset[0] + crop_size,
        crop_offset[1]:crop_offset[1] + crop_size, :]
    return cropped_image


def center_crop_image_v2(image_bytes, image_shape):
  """Center crop a square shape slice from the input image.

  It crops a square shape slice from the image. The side of the actual crop
  is 224 / 256 = 0.875 of the short side of the original image. References:
  [1] Very Deep Convolutional Networks for Large-Scale Image Recognition
      https://arxiv.org/abs/1409.1556
  [2] Deep Residual Learning for Image Recognition
      https://arxiv.org/abs/1512.03385

  This is a faster version of `center_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decodes the JPEG
  bytes according to the center crop.

  Args:
    image_bytes: a Tensor of type string representing the raw image bytes.
    image_shape: a Tensor specifying the shape of the raw image.

  Returns:
    cropped_image: a Tensor representing the center cropped image.
  """
  with tf.name_scope('center_image_crop_v2'):
    image_shape = tf.cast(image_shape, tf.float32)
    crop_size = (
        CENTER_CROP_FRACTION * tf.math.minimum(image_shape[0], image_shape[1]))
    crop_offset = tf.cast((image_shape - crop_size) / 2.0, dtype=tf.int32)
    crop_size = tf.cast(crop_size, dtype=tf.int32)
    crop_window = tf.stack(
        [crop_offset[0], crop_offset[1], crop_size, crop_size])
    cropped_image = tf.image.decode_and_crop_jpeg(
        image_bytes, crop_window, channels=3)
    return cropped_image


def random_crop_image(image,
                      aspect_ratio_range=(3. / 4., 4. / 3.),
                      area_range=(0.08, 1.0),
                      max_attempts=10,
                      seed=1):
  """Randomly crop an arbitrary shaped slice from the input image.

  Args:
    image: a Tensor of shape [height, width, 3] representing the input image.
    aspect_ratio_range: a list of floats. The cropped area of the image must
      have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image of the specified constraints. After max_attempts failures, return
      the entire image.
    seed: the seed of the random generator.

  Returns:
    cropped_image: a Tensor representing the random cropped image. Can be the
      original image if max_attempts is exhausted.
  """
  with tf.name_scope('random_crop_image'):
    crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
        seed=seed,
        min_object_covered=area_range[0],
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=max_attempts)

    cropped_image = tf.slice(image, crop_offset, crop_size)
    return cropped_image


def random_crop_image_v2(image_bytes,
                         image_shape,
                         aspect_ratio_range=(3. / 4., 4. / 3.),
                         area_range=(0.08, 1.0),
                         max_attempts=10,
                         seed=1):
  """Randomly crop an arbitrary shaped slice from the input image.

  This is a faster version of `random_crop_image` which takes the original
  image bytes and image size as the inputs, and partially decodes the JPEG
  bytes according to the generated crop.

  Args:
    image_bytes: a Tensor of type string representing the raw image bytes.
    image_shape: a Tensor specifying the shape of the raw image.
    aspect_ratio_range: a list of floats. The cropped area of the image must
      have an aspect ratio = width / height within this range.
    area_range: a list of floats. The cropped area of the image must contain
      a fraction of the input image within this range.
    max_attempts: the number of attempts at generating a cropped region of the
      image of the specified constraints. After max_attempts failures, return
      the entire image.
    seed: the seed of the random generator.

  Returns:
    cropped_image: a Tensor representing the random cropped image. Can be the
      original image if max_attempts is exhausted.
  """
  with tf.name_scope('random_crop_image_v2'):
    crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box(
        image_shape,
        tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
        seed=seed,
        min_object_covered=area_range[0],
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=max_attempts)

    offset_y, offset_x, _ = tf.unstack(crop_offset)
    crop_height, crop_width, _ = tf.unstack(crop_size)
    crop_window = tf.stack([offset_y, offset_x, crop_height, crop_width])
    cropped_image = tf.image.decode_and_crop_jpeg(
        image_bytes, crop_window, channels=3)
    return cropped_image


def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
  """Resizes boxes to output size with scale and offset.

  Args:
    boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      boxes.

  Returns:
    boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
  """
  with tf.name_scope('resize_and_crop_boxes'):
    # Adjusts box coordinates based on image_scale and offset.
    boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    # Clips the boxes.
    boxes = box_ops.clip_boxes(boxes, output_size)
    return boxes
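

# Usage sketch (not from the original module; values are illustrative): boxes
# follow the same scale-then-offset transform `resize_and_crop_image` applies
# to pixels, so both are typically driven from one `image_info`.
def _resize_and_crop_boxes_example():
  boxes = tf.constant([[10., 10., 20., 20.]])
  out = resize_and_crop_boxes(
      boxes, tf.constant([2.0, 2.0]), [64, 64], tf.constant([5.0, 5.0]))
  # Each coordinate is scaled by 2 and shifted by -5: [[15., 15., 35., 35.]].
  return out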


def resize_and_crop_masks(masks, image_scale, output_size, offset):
  """Resizes masks to output size with scale and offset.

  Args:
    masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      masks.

  Returns:
    masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
  """
  with tf.name_scope('resize_and_crop_masks'):
    mask_size = tf.cast(tf.shape(masks)[1:3], tf.float32)
    # Pad masks to avoid empty mask annotations.
    masks = tf.concat(
        [tf.zeros([1, mask_size[0], mask_size[1], 1]), masks], axis=0)

    scaled_size = tf.cast(image_scale * mask_size, tf.int32)
    scaled_masks = tf.image.resize(
        masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    offset = tf.cast(offset, tf.int32)
    scaled_masks = scaled_masks[
        :,
        offset[0]:offset[0] + output_size[0],
        offset[1]:offset[1] + output_size[1],
        :]

    output_masks = tf.image.pad_to_bounding_box(
        scaled_masks, 0, 0, output_size[0], output_size[1])
    # Remove padding.
    output_masks = output_masks[1::]
    return output_masks


def horizontal_flip_image(image):
  """Flips image horizontally."""
  return tf.image.flip_left_right(image)


def horizontal_flip_boxes(normalized_boxes):
  """Flips normalized boxes horizontally."""
  ymin, xmin, ymax, xmax = tf.split(
      value=normalized_boxes, num_or_size_splits=4, axis=1)
  flipped_xmin = tf.subtract(1.0, xmax)
  flipped_xmax = tf.subtract(1.0, xmin)
  flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1)
  return flipped_boxes


def horizontal_flip_masks(masks):
  """Flips masks horizontally."""
  return masks[:, :, ::-1]


def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1):
  """Randomly flips input image and bounding boxes."""
  with tf.name_scope('random_horizontal_flip'):
    do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)

    image = tf.cond(
        do_flip,
        lambda: horizontal_flip_image(image),
        lambda: image)

    if normalized_boxes is not None:
      normalized_boxes = tf.cond(
          do_flip,
          lambda: horizontal_flip_boxes(normalized_boxes),
          lambda: normalized_boxes)

    if masks is not None:
      masks = tf.cond(
          do_flip,
          lambda: horizontal_flip_masks(masks),
          lambda: masks)

    return image, normalized_boxes, masks
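

# Usage sketch (not from the original module; values are illustrative): the
# flip decision is sampled once and applied consistently to image, boxes and
# masks. Boxes must be normalized, since x-coordinates are reflected as 1 - x.
def _random_horizontal_flip_example():
  image = tf.random.uniform([8, 8, 3])
  boxes = tf.constant([[0.0, 0.0, 0.5, 0.25]])  # ymin, xmin, ymax, xmax
  image, boxes, _ = random_horizontal_flip(image, boxes, seed=1)
  # If flipped, the box becomes [[0.0, 0.75, 0.5, 1.0]].
  return image, boxes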


def color_jitter(image: tf.Tensor,
                 brightness: Optional[float] = 0.,
                 contrast: Optional[float] = 0.,
                 saturation: Optional[float] = 0.,
                 seed: Optional[int] = None) -> tf.Tensor:
  """Applies color jitter to an image, similarly to torchvision's ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    brightness (float, optional): Magnitude for brightness jitter. Defaults to
      0.
    contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
    saturation (float, optional): Magnitude for saturation jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  image = tf.cast(image, dtype=tf.uint8)
  image = random_brightness(image, brightness, seed=seed)
  image = random_contrast(image, contrast, seed=seed)
  image = random_saturation(image, saturation, seed=seed)
  return image


def random_brightness(image: tf.Tensor,
                      brightness: float = 0.,
                      seed: Optional[int] = None) -> tf.Tensor:
  """Jitters brightness of an image.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    brightness (float, optional): Magnitude for brightness jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert brightness >= 0, '`brightness` must be non-negative'
  brightness = tf.random.uniform([],
                                 max(0, 1 - brightness),
                                 1 + brightness,
                                 seed=seed,
                                 dtype=tf.float32)
  return augment.brightness(image, brightness)


def random_contrast(image: tf.Tensor,
                    contrast: float = 0.,
                    seed: Optional[int] = None) -> tf.Tensor:
  """Jitters contrast of an image, similarly to torchvision's ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    contrast (float, optional): Magnitude for contrast jitter. Defaults to 0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert contrast >= 0, '`contrast` must be non-negative'
  contrast = tf.random.uniform([],
                               max(0, 1 - contrast),
                               1 + contrast,
                               seed=seed,
                               dtype=tf.float32)
  return augment.contrast(image, contrast)


def random_saturation(image: tf.Tensor,
                      saturation: float = 0.,
                      seed: Optional[int] = None) -> tf.Tensor:
  """Jitters saturation of an image, similarly to torchvision's ColorJitter.

  Args:
    image (tf.Tensor): Of shape [height, width, 3] and type uint8.
    saturation (float, optional): Magnitude for saturation jitter. Defaults to
      0.
    seed (int, optional): Random seed. Defaults to None.

  Returns:
    tf.Tensor: The augmented `image` of type uint8.
  """
  assert saturation >= 0, '`saturation` must be non-negative'
  saturation = tf.random.uniform([],
                                 max(0, 1 - saturation),
                                 1 + saturation,
                                 seed=seed,
                                 dtype=tf.float32)
  return _saturation(image, saturation)


def _saturation(image: tf.Tensor,
                saturation: Optional[float] = 0.) -> tf.Tensor:
  return augment.blend(
      tf.repeat(tf.image.rgb_to_grayscale(image), 3, axis=-1), image,
      saturation)
def
random_crop_image_with_boxes_and_labels
(
img
,
boxes
,
labels
,
min_scale
,
aspect_ratio_range
,
min_overlap_params
,
max_retry
):
"""Crops a random slice from the input image.
  The function correspondingly recomputes the bounding boxes and filters out
  boxes whose centers fall outside the cropped region, along with their labels.

  References:
  [1] End-to-End Object Detection with Transformers
  https://arxiv.org/abs/2005.12872

  The preprocessing steps:
  1. Sample a minimum IoU overlap.
  2. For each trial, sample the new image width, height, and top-left corner.
  3. Compute the IoUs of bounding boxes with the cropped image and retry if
     the maximum IoU is below the sampled threshold.
  4. Find boxes whose centers are in the cropped image.
  5. Compute new bounding boxes in the cropped region and only select those
     boxes' labels.

  Args:
    img: a 'Tensor' of shape [height, width, 3] representing the input image.
    boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
      boxes with (ymin, xmin, ymax, xmax).
    labels: a 'Tensor' of shape [N,] representing the class labels of the
      boxes.
    min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
      scale variable.
    aspect_ratio_range: a list of two 'float' that specifies the lower and
      upper bound of the random aspect ratio.
    min_overlap_params: a list of four 'float' representing the min value, max
      value, step size, and offset for the minimum overlap sample.
    max_retry: an 'int' representing the number of trials for cropping. If it
      is exhausted, no cropping will be performed.

  Returns:
    img: a Tensor representing the random cropped image. Can be the original
      image if max_retry is exhausted.
    boxes: a Tensor representing the bounding boxes in the cropped image.
    labels: a Tensor representing the new bounding boxes' labels.
  """
  shape = tf.shape(img)
  original_h = shape[0]
  original_w = shape[1]

  minval, maxval, step, offset = min_overlap_params

  min_overlap = tf.math.floordiv(
      tf.random.uniform([], minval=minval, maxval=maxval),
      step) * step - offset

  min_overlap = tf.clip_by_value(min_overlap, 0.0, 1.1)

  if min_overlap > 1.0:
    return img, boxes, labels

  aspect_ratio_low = aspect_ratio_range[0]
  aspect_ratio_high = aspect_ratio_range[1]

  for _ in tf.range(max_retry):
    scale_h = tf.random.uniform([], min_scale, 1.0)
    scale_w = tf.random.uniform([], min_scale, 1.0)
    new_h = tf.cast(
        scale_h * tf.cast(original_h, dtype=tf.float32), dtype=tf.int32)
    new_w = tf.cast(
        scale_w * tf.cast(original_w, dtype=tf.float32), dtype=tf.int32)

    # Aspect ratio has to be in the prespecified range.
    aspect_ratio = new_h / new_w
    if aspect_ratio_low > aspect_ratio or aspect_ratio > aspect_ratio_high:
      continue

    left = tf.random.uniform([], 0, original_w - new_w, dtype=tf.int32)
    right = left + new_w
    top = tf.random.uniform([], 0, original_h - new_h, dtype=tf.int32)
    bottom = top + new_h

    normalized_left = tf.cast(
        left, dtype=tf.float32) / tf.cast(original_w, dtype=tf.float32)
    normalized_right = tf.cast(
        right, dtype=tf.float32) / tf.cast(original_w, dtype=tf.float32)
    normalized_top = tf.cast(
        top, dtype=tf.float32) / tf.cast(original_h, dtype=tf.float32)
    normalized_bottom = tf.cast(
        bottom, dtype=tf.float32) / tf.cast(original_h, dtype=tf.float32)

    cropped_box = tf.expand_dims(
        tf.stack([
            normalized_top,
            normalized_left,
            normalized_bottom,
            normalized_right,
        ]),
        axis=0)
    iou = box_ops.bbox_overlap(
        tf.expand_dims(cropped_box, axis=0),
        tf.expand_dims(boxes, axis=0))  # (1, 1, n_ground_truth)
    iou = tf.squeeze(iou, axis=[0, 1])

    # If not a single bounding box has a Jaccard overlap greater than the
    # minimum, try again.
    if tf.reduce_max(iou) < min_overlap:
      continue

    centroids = box_ops.yxyx_to_cycxhw(boxes)
    mask = tf.math.logical_and(
        tf.math.logical_and(centroids[:, 0] > normalized_top,
                            centroids[:, 0] < normalized_bottom),
        tf.math.logical_and(centroids[:, 1] > normalized_left,
                            centroids[:, 1] < normalized_right))

    # If not a single bounding box has its center in the crop, try again.
    if tf.reduce_sum(tf.cast(mask, dtype=tf.int32)) > 0:
      indices = tf.squeeze(tf.where(mask), axis=1)

      filtered_boxes = tf.gather(boxes, indices)

      boxes = tf.clip_by_value(
          (filtered_boxes[..., :] * tf.cast(
              tf.stack([original_h, original_w, original_h, original_w]),
              dtype=tf.float32) -
           tf.cast(tf.stack([top, left, top, left]), dtype=tf.float32)) /
          tf.cast(tf.stack([new_h, new_w, new_h, new_w]), dtype=tf.float32),
          0.0, 1.0)

      img = tf.image.crop_to_bounding_box(img, top, left, bottom - top,
                                          right - left)

      labels = tf.gather(labels, indices)
      break

  return img, boxes, labels
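To make step 5 concrete, here is a small standalone sketch (illustrative only, not part of the module; all numbers are hypothetical) that remaps one absolute box into crop-relative normalized coordinates with the same arithmetic as the loop body above:

import tensorflow as tf

# Hypothetical 100x200 image with a crop at top=20, left=50, new_h=60, new_w=100.
box = tf.constant([[0.3, 0.3, 0.7, 0.6]])      # normalized (ymin, xmin, ymax, xmax)
scale = tf.constant([100., 200., 100., 200.])  # [original_h, original_w] twice
shift = tf.constant([20., 50., 20., 50.])      # [top, left] twice
crop = tf.constant([60., 100., 60., 100.])     # [new_h, new_w] twice
# The absolute box (30, 60, 70, 120) maps to roughly (0.167, 0.1, 0.833, 0.7).
print(tf.clip_by_value((box * scale - shift) / crop, 0.0, 1.0).numpy())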
def random_crop(image,
                boxes,
                labels,
                min_scale=0.3,
                aspect_ratio_range=(0.5, 2.0),
                min_overlap_params=(0.0, 1.4, 0.2, 0.1),
                max_retry=50,
                seed=None):
  """Randomly crops the image and boxes, filtering the labels accordingly.

  Args:
    image: a 'Tensor' of shape [height, width, 3] representing the input image.
    boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
      boxes with (ymin, xmin, ymax, xmax).
    labels: a 'Tensor' of shape [N,] representing the class labels of the
      boxes.
    min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
      scale variable.
    aspect_ratio_range: a list of two 'float' that specifies the lower and
      upper bound of the random aspect ratio.
    min_overlap_params: a list of four 'float' representing the min value, max
      value, step size, and offset for the minimum overlap sample.
    max_retry: an 'int' representing the number of trials for cropping. If it
      is exhausted, no cropping will be performed.
    seed: an optional 'int' random seed; None for a nondeterministic seed.

  Returns:
    image: a Tensor representing the random cropped image. Can be the original
      image if max_retry is exhausted.
    boxes: a Tensor representing the bounding boxes in the cropped image.
    labels: a Tensor representing the new bounding boxes' labels.
  """
  with tf.name_scope('random_crop'):
    do_crop = tf.greater(tf.random.uniform([], seed=seed), 0.5)
    if do_crop:
      return random_crop_image_with_boxes_and_labels(image, boxes, labels,
                                                     min_scale,
                                                     aspect_ratio_range,
                                                     min_overlap_params,
                                                     max_retry)
    else:
      return image, boxes, labels
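A minimal usage sketch, assuming this module is importable as official.vision.ops.preprocess_ops and eager execution is enabled; the shapes, boxes, and labels below are made up for illustration:

import numpy as np
import tensorflow as tf

from official.vision.ops import preprocess_ops

image = tf.convert_to_tensor(np.random.rand(480, 640, 3), tf.float32)
boxes = tf.constant([[0.1, 0.1, 0.5, 0.5], [0.4, 0.2, 0.9, 0.8]], tf.float32)
labels = tf.constant([1, 7], tf.int64)
# With probability 0.5 the inputs come back unchanged; otherwise a crop is
# returned with remapped boxes and only the labels of the surviving boxes.
image, boxes, labels = preprocess_ops.random_crop(image, boxes, labels, seed=42)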
official/vision/ops/preprocess_ops_3d.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utils for processing video dataset features."""
from typing import Optional, Tuple

import tensorflow as tf


def _sample_or_pad_sequence_indices(sequence: tf.Tensor, num_steps: int,
                                    stride: int,
                                    offset: tf.Tensor) -> tf.Tensor:
  """Returns indices to take for sampling or padding sequences to fixed size."""
  sequence_length = tf.shape(sequence)[0]
  sel_idx = tf.range(sequence_length)

  # Repeats the sequence until num_steps are available in total.
  max_length = num_steps * stride + offset
  num_repeats = tf.math.floordiv(max_length + sequence_length - 1,
                                 sequence_length)
  sel_idx = tf.tile(sel_idx, [num_repeats])

  steps = tf.range(offset, offset + num_steps * stride, stride)
  return tf.gather(sel_idx, steps)
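The helper's behavior is easiest to see on a toy input; the values below are worked out by hand from the definitions above (eager mode assumed, illustrative only):

import tensorflow as tf

seq = tf.range(4)  # a 4-step stand-in for a frame sequence
idx = _sample_or_pad_sequence_indices(
    sequence=seq, num_steps=6, stride=2, offset=tf.constant(1))
# max_length = 6 * 2 + 1 = 13, so [0, 1, 2, 3] is tiled 4 times and every
# second index starting at 1 is taken: the sampling wraps around.
print(idx.numpy())  # [1 3 1 3 1 3]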
def sample_linspace_sequence(sequence: tf.Tensor, num_windows: int,
                             num_steps: int, stride: int) -> tf.Tensor:
  """Samples `num_windows` segments from sequence with linearly spaced offsets.

  The samples are concatenated in a single `tf.Tensor` in order to have the
  same format structure per timestep (e.g. a single frame). If `num_steps` *
  `stride` is bigger than the number of timesteps, the sequence is repeated.
  This function can be used in evaluation to extract enough segments to span
  the entire sequence.

  Args:
    sequence: Any tensor where the first dimension is timesteps.
    num_windows: Number of windows retrieved from the sequence.
    num_steps: Number of steps (e.g. frames) to take.
    stride: Distance to sample between timesteps.

  Returns:
    A single `tf.Tensor` with first dimension `num_windows` * `num_steps`. The
    tensor contains the concatenated list of `num_windows` tensors whose
    offsets have been linearly spaced over the input.
  """
  sequence_length = tf.shape(sequence)[0]
  max_offset = tf.maximum(0, sequence_length - num_steps * stride)
  offsets = tf.linspace(0.0, tf.cast(max_offset, tf.float32), num_windows)
  offsets = tf.cast(offsets, tf.int32)

  all_indices = []
  for i in range(num_windows):
    all_indices.append(
        _sample_or_pad_sequence_indices(
            sequence=sequence,
            num_steps=num_steps,
            stride=stride,
            offset=offsets[i]))

  indices = tf.concat(all_indices, axis=0)
  indices.set_shape((num_windows * num_steps,))
  return tf.gather(sequence, indices)
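Worked example (eager mode assumed): 7 windows of 10 steps over a 100-step sequence gives max_offset = 90, so the offsets are linspace(0, 90, 7) = [0, 15, 30, 45, 60, 75, 90] and 70 frames come back concatenated:

import tensorflow as tf

out = sample_linspace_sequence(
    tf.range(100), num_windows=7, num_steps=10, stride=1)
print(out.shape)         # (70,)
print(out.numpy()[:12])  # [0 1 2 3 4 5 6 7 8 9 15 16]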
def sample_sequence(sequence: tf.Tensor,
                    num_steps: int,
                    random: bool,
                    stride: int,
                    seed: Optional[int] = None) -> tf.Tensor:
  """Samples a single segment of size `num_steps` from a given sequence.

  If `random` is not `True`, this function simply samples the central window
  of the sequence. Otherwise, a random offset is chosen so that the desired
  `num_steps` can be extracted from the sequence.

  Args:
    sequence: Any tensor where the first dimension is timesteps.
    num_steps: Number of steps (e.g. frames) to take.
    random: A boolean indicating whether to randomly sample the single window.
      If `True`, the offset is randomized. If `False`, the middle frame minus
      half of `num_steps` is the first frame.
    stride: Distance to sample between timesteps.
    seed: A deterministic seed to use when sampling.

  Returns:
    A single `tf.Tensor` with first dimension `num_steps` with the sampled
    segment.
  """
  sequence_length = tf.shape(sequence)[0]

  if random:
    sequence_length = tf.cast(sequence_length, tf.float32)
    frame_stride = tf.cast(stride, tf.float32)
    max_offset = tf.cond(
        sequence_length > (num_steps - 1) * frame_stride,
        lambda: sequence_length - (num_steps - 1) * frame_stride,
        lambda: sequence_length)
    offset = tf.random.uniform(
        (),
        maxval=tf.cast(max_offset, dtype=tf.int32),
        dtype=tf.int32,
        seed=seed)
  else:
    offset = (sequence_length - num_steps * stride) // 2
    offset = tf.maximum(0, offset)

  indices = _sample_or_pad_sequence_indices(
      sequence=sequence, num_steps=num_steps, stride=stride, offset=offset)
  indices.set_shape((num_steps,))

  return tf.gather(sequence, indices)
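A short sketch of the two modes, using the function defined above (eager mode assumed): the central window is deterministic, while the random window only fixes its length.

import tensorflow as tf

frames = tf.range(100)  # stand-in for a [T, H, W, C] video tensor
center = sample_sequence(frames, num_steps=10, random=False, stride=1)
print(center.numpy())   # [45 ... 54], i.e. offset (100 - 10 * 1) // 2 = 45
clip = sample_sequence(frames, num_steps=10, random=True, stride=1, seed=0)
print(clip.shape)       # (10,), a random contiguous window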
def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
  """Decodes a string of raw JPEG bytes into an RGB uint8 Tensor.

  Args:
    image_string: A `tf.Tensor` of type strings with the raw JPEG bytes where
      the first dimension is timesteps.
    channels: Number of channels of the JPEG image. Allowed values are 0, 1
      and 3. If 0, the number of channels will be calculated at runtime and no
      static shape is set.

  Returns:
    A Tensor of shape [T, H, W, C] of type uint8 with the decoded images.
  """
  return tf.map_fn(
      lambda x: tf.image.decode_jpeg(x, channels=channels),
      image_string,
      back_prop=False,
      dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
               target_height: int,
               target_width: int,
               random: bool = False,
               num_crops: int = 1,
               seed: Optional[int] = None) -> tf.Tensor:
  """Crops the images in the given sequence of images.

  If the requested size is bigger than the image size, the image is padded
  with zeros. When not random cropping, a central crop is performed if
  num_crops is 1.

  Args:
    frames: A Tensor of dimension [timesteps, in_height, in_width, channels].
    target_height: Target cropped image height.
    target_width: Target cropped image width.
    random: A boolean indicating if crop should be randomized.
    num_crops: Number of crops (support 1 for central crop and 3 for 3-crop).
    seed: A deterministic seed to use when random cropping.

  Returns:
    A Tensor of shape [timesteps, out_height, out_width, channels] of type
    uint8 with the cropped images.
  """
  if random:
    # Random spatial crop.
    shape = tf.shape(frames)
    # If a static_shape is available (e.g. when using this method from the
    # add_image method), it will be used to produce an output tensor with
    # static shape.
    static_shape = frames.shape.as_list()
    seq_len = shape[0] if static_shape[0] is None else static_shape[0]
    channels = shape[3] if static_shape[3] is None else static_shape[3]
    frames = tf.image.random_crop(
        frames, (seq_len, target_height, target_width, channels), seed)
  else:
    if num_crops == 1:
      # Central crop or pad.
      frames = tf.image.resize_with_crop_or_pad(frames, target_height,
                                                target_width)
    elif num_crops == 3:
      # Three-crop evaluation.
      shape = tf.shape(frames)
      static_shape = frames.shape.as_list()
      seq_len = shape[0] if static_shape[0] is None else static_shape[0]
      height = shape[1] if static_shape[1] is None else static_shape[1]
      width = shape[2] if static_shape[2] is None else static_shape[2]
      channels = shape[3] if static_shape[3] is None else static_shape[3]

      size = tf.convert_to_tensor(
          (seq_len, target_height, target_width, channels))

      offset_1 = tf.broadcast_to([0, 0, 0, 0], [4])
      # pylint:disable=g-long-lambda
      offset_2 = tf.cond(
          tf.greater_equal(height, width),
          true_fn=lambda: tf.broadcast_to(
              [0, tf.cast(height, tf.float32) / 2 - target_height // 2, 0, 0],
              [4]),
          false_fn=lambda: tf.broadcast_to(
              [0, 0, tf.cast(width, tf.float32) / 2 - target_width // 2, 0],
              [4]))
      offset_3 = tf.cond(
          tf.greater_equal(height, width),
          true_fn=lambda: tf.broadcast_to(
              [0, tf.cast(height, tf.float32) - target_height, 0, 0], [4]),
          false_fn=lambda: tf.broadcast_to(
              [0, 0, tf.cast(width, tf.float32) - target_width, 0], [4]))
      # pylint:enable=g-long-lambda

      crops = []
      for offset in [offset_1, offset_2, offset_3]:
        offset = tf.cast(tf.math.round(offset), tf.int32)
        crops.append(tf.slice(frames, offset, size))
      frames = tf.concat(crops, axis=0)

    else:
      raise NotImplementedError(
          f"Only 1-crop and 3-crop are supported. Found {num_crops!r}.")

  return frames
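Usage sketch for the two non-random modes (illustrative shapes): the 3-crop stacks its crops along the time axis, so the frame count triples.

import tensorflow as tf

video = tf.zeros((8, 180, 240, 3), tf.uint8)
center = crop_image(video, 160, 160)              # -> (8, 160, 160, 3)
three = crop_image(video, 160, 160, num_crops=3)  # -> (24, 160, 160, 3)
# Downstream code must treat each group of 8 frames in `three` as one crop
# (left/top, center, right/bottom).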
def resize_smallest(frames: tf.Tensor, min_resize: int) -> tf.Tensor:
  """Resizes frames so that min(`height`, `width`) is equal to `min_resize`.

  This function will not do anything if min(`height`, `width`) is already
  equal to `min_resize`, which saves compute time.

  Args:
    frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
    min_resize: Minimum size of the final image dimensions.

  Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels] of type
    frames.dtype where min(output_h, output_w) = min_resize.
  """
  shape = tf.shape(frames)
  input_h = shape[1]
  input_w = shape[2]

  output_h = tf.maximum(min_resize, (input_h * min_resize) // input_w)
  output_w = tf.maximum(min_resize, (input_w * min_resize) // input_h)

  def resize_fn():
    frames_resized = tf.image.resize(frames, (output_h, output_w))
    return tf.cast(frames_resized, frames.dtype)

  should_resize = tf.math.logical_or(
      tf.not_equal(input_w, output_w), tf.not_equal(input_h, output_h))
  frames = tf.cond(should_resize, resize_fn, lambda: frames)

  return frames
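Worked example: for 90x120 frames and min_resize=45, output_h = max(45, 90 * 45 // 120) = 45 and output_w = max(45, 120 * 45 // 90) = 60, so the aspect ratio is preserved.

import tensorflow as tf

video = tf.zeros((6, 90, 120, 3), tf.uint8)
print(resize_smallest(video, 45).shape)  # (6, 45, 60, 3)
print(resize_smallest(video, 90).shape)  # (6, 90, 120, 3): already minimal, no-op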
def random_crop_resize(frames: tf.Tensor, output_h: int, output_w: int,
                       num_frames: int, num_channels: int,
                       aspect_ratio: Tuple[float, float],
                       area_range: Tuple[float, float]) -> tf.Tensor:
  """First crops clip with jittering and then resizes to (output_h, output_w).

  Args:
    frames: A Tensor of dimension [timesteps, input_h, input_w, channels].
    output_h: Resized image height.
    output_w: Resized image width.
    num_frames: Number of input frames per clip.
    num_channels: Number of channels of the clip.
    aspect_ratio: Float tuple with the aspect range for cropping.
    area_range: Float tuple with the area range for cropping.

  Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels] of type
    frames.dtype.
  """
  shape = tf.shape(frames)
  seq_len, _, _, channels = shape[0], shape[1], shape[2], shape[3]
  bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
  factor = output_w / output_h
  aspect_ratio = (aspect_ratio[0] * factor, aspect_ratio[1] * factor)
  sample_distorted_bbox = tf.image.sample_distorted_bounding_box(
      shape[1:],
      bounding_boxes=bbox,
      min_object_covered=0.1,
      aspect_ratio_range=aspect_ratio,
      area_range=area_range,
      max_attempts=100,
      use_image_if_no_bounding_boxes=True)
  bbox_begin, bbox_size, _ = sample_distorted_bbox
  offset_y, offset_x, _ = tf.unstack(bbox_begin)
  target_height, target_width, _ = tf.unstack(bbox_size)
  size = tf.convert_to_tensor((seq_len, target_height, target_width, channels))
  offset = tf.convert_to_tensor((0, offset_y, offset_x, 0))
  frames = tf.slice(frames, offset, size)
  frames = tf.cast(tf.image.resize(frames, (output_h, output_w)), frames.dtype)
  frames.set_shape((num_frames, output_h, output_w, num_channels))
  return frames
def random_flip_left_right(frames: tf.Tensor,
                           seed: Optional[int] = None) -> tf.Tensor:
  """Flips all the frames with a probability of 50%.

  Args:
    frames: A Tensor of shape [timesteps, input_h, input_w, channels].
    seed: A seed to use for the random sampling.

  Returns:
    A Tensor of shape [timesteps, output_h, output_w, channels], possibly
    flipped left to right.
  """
  is_flipped = tf.random.uniform(
      (), minval=0, maxval=2, dtype=tf.int32, seed=seed)

  frames = tf.cond(
      tf.equal(is_flipped, 1),
      true_fn=lambda: tf.image.flip_left_right(frames),
      false_fn=lambda: frames)
  return frames
def normalize_image(frames: tf.Tensor,
                    zero_centering_image: bool,
                    dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
  """Normalizes images.

  Args:
    frames: A Tensor of numbers.
    zero_centering_image: If True, results are in [-1, 1]; if False, results
      are in [0, 1].
    dtype: Type of output Tensor.

  Returns:
    A Tensor of same shape as the input and of the given type.
  """
  frames = tf.cast(frames, dtype)
  if zero_centering_image:
    return frames * (2.0 / 255.0) - 1.0
  else:
    return frames / 255.0
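A quick numeric check of the two output ranges (the values below are exact):

import tensorflow as tf

frames = tf.constant([0.0, 127.5, 255.0])
print(normalize_image(frames, zero_centering_image=False).numpy())  # [0.  0.5 1. ]
print(normalize_image(frames, zero_centering_image=True).numpy())   # [-1.  0.  1.]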
official/vision/ops/preprocess_ops_3d_test.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
import io
import itertools

import numpy as np
from PIL import Image
import tensorflow as tf

from official.vision.ops import preprocess_ops_3d


class ParserUtilsTest(tf.test.TestCase):

  def setUp(self):
    super().setUp()
    # [[0, 1, ..., 119], [1, 2, ..., 120], ..., [89, 90, ..., 208]].
    self._frames = tf.stack([tf.range(i, i + 120) for i in range(90)])
    self._frames = tf.cast(self._frames, tf.uint8)
    self._frames = self._frames[tf.newaxis, :, :, tf.newaxis]
    self._frames = tf.broadcast_to(self._frames, (6, 90, 120, 3))

    # Create an equivalent numpy array for assertions.
    self._np_frames = np.array([range(i, i + 120) for i in range(90)])
    self._np_frames = self._np_frames[np.newaxis, :, :, np.newaxis]
    self._np_frames = np.broadcast_to(self._np_frames, (6, 90, 120, 3))

  def test_sample_linspace_sequence(self):
    sequence = tf.range(100)
    sampled_seq_1 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 10, 10, 1)
    sampled_seq_2 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 7, 10, 1)
    sampled_seq_3 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 7, 5, 2)
    sampled_seq_4 = preprocess_ops_3d.sample_linspace_sequence(
        sequence, 101, 1, 1)
    self.assertAllEqual(sampled_seq_1, range(100))
    # [0, 1, 2, 3, 4, ..., 8, 9, 15, 16, ..., 97, 98, 99]
    self.assertAllEqual(
        sampled_seq_2,
        [15 * i + j for i, j in itertools.product(range(7), range(10))])
    # [0, 2, 4, 6, 8, 15, 17, 19, ..., 96, 98]
    self.assertAllEqual(
        sampled_seq_3,
        [15 * i + 2 * j for i, j in itertools.product(range(7), range(5))])
    self.assertAllEqual(sampled_seq_4, [0] + list(range(100)))

  def test_sample_sequence(self):
    sequence = tf.range(100)
    sampled_seq_1 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 1)
    sampled_seq_2 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 2)
    sampled_seq_3 = preprocess_ops_3d.sample_sequence(sequence, 10, True, 1)

    self.assertAllEqual(sampled_seq_1, range(45, 55))
    self.assertAllEqual(sampled_seq_2, range(40, 60, 2))

    offset_3 = sampled_seq_3[0]
    self.assertBetween(offset_3, 0, 99)
    self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10))

  def test_decode_jpeg(self):
    # Create a random RGB JPEG image.
    random_image = np.random.randint(
        0, 256, size=(263, 320, 3), dtype=np.uint8)
    random_image = Image.fromarray(random_image)
    with io.BytesIO() as buffer:
      random_image.save(buffer, format='JPEG')
      raw_image_bytes = buffer.getvalue()

    raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
    decoded_image = preprocess_ops_3d.decode_jpeg(raw_image, 3)

    self.assertEqual(decoded_image.shape.as_list()[3], 3)
    self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))

  def test_crop_image(self):
    cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70)
    cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200)
    cropped_image_3 = preprocess_ops_3d.crop_image(self._frames, 50, 70, True)
    cropped_image_4 = preprocess_ops_3d.crop_image(
        self._frames, 90, 90, False, 3)

    self.assertAllEqual(cropped_image_1.shape, (6, 50, 70, 3))
    self.assertAllEqual(cropped_image_1, self._np_frames[:, 20:70, 25:95, :])

    self.assertAllEqual(cropped_image_2.shape, (6, 200, 200, 3))
    expected = np.pad(
        self._np_frames, ((0, 0), (55, 55), (40, 40), (0, 0)), 'constant')
    self.assertAllEqual(cropped_image_2, expected)

    self.assertAllEqual(cropped_image_3.shape, (6, 50, 70, 3))
    offset = cropped_image_3[0, 0, 0, 0]
    expected = np.array(
        [range(i, i + 70) for i in range(offset, offset + 50)])
    expected = expected[np.newaxis, :, :, np.newaxis]
    expected = np.broadcast_to(expected, (6, 50, 70, 3))
    self.assertAllEqual(cropped_image_3, expected)

    self.assertAllEqual(cropped_image_4.shape, (18, 90, 90, 3))

  def test_resize_smallest(self):
    resized_frames_1 = preprocess_ops_3d.resize_smallest(self._frames, 180)
    resized_frames_2 = preprocess_ops_3d.resize_smallest(self._frames, 45)
    resized_frames_3 = preprocess_ops_3d.resize_smallest(self._frames, 90)
    resized_frames_4 = preprocess_ops_3d.resize_smallest(
        tf.transpose(self._frames, (0, 2, 1, 3)), 45)

    self.assertAllEqual(resized_frames_1.shape, (6, 180, 240, 3))
    self.assertAllEqual(resized_frames_2.shape, (6, 45, 60, 3))
    self.assertAllEqual(resized_frames_3.shape, (6, 90, 120, 3))
    self.assertAllEqual(resized_frames_4.shape, (6, 60, 45, 3))

  def test_random_crop_resize(self):
    resized_frames_1 = preprocess_ops_3d.random_crop_resize(
        self._frames, 256, 256, 6, 3, (0.5, 2), (0.3, 1))
    resized_frames_2 = preprocess_ops_3d.random_crop_resize(
        self._frames, 224, 224, 6, 3, (0.5, 2), (0.3, 1))
    resized_frames_3 = preprocess_ops_3d.random_crop_resize(
        self._frames, 256, 256, 6, 3, (0.8, 1.2), (0.3, 1))
    resized_frames_4 = preprocess_ops_3d.random_crop_resize(
        self._frames, 256, 256, 6, 3, (0.5, 2), (0.1, 1))

    self.assertAllEqual(resized_frames_1.shape, (6, 256, 256, 3))
    self.assertAllEqual(resized_frames_2.shape, (6, 224, 224, 3))
    self.assertAllEqual(resized_frames_3.shape, (6, 256, 256, 3))
    self.assertAllEqual(resized_frames_4.shape, (6, 256, 256, 3))

  def test_random_flip_left_right(self):
    flipped_frames = preprocess_ops_3d.random_flip_left_right(self._frames)

    flipped = np.fliplr(self._np_frames[0, :, :, 0])
    flipped = flipped[np.newaxis, :, :, np.newaxis]
    flipped = np.broadcast_to(flipped, (6, 90, 120, 3))
    self.assertTrue((flipped_frames == self._np_frames).numpy().all() or
                    (flipped_frames == flipped).numpy().all())

  def test_normalize_image(self):
    normalized_images_1 = preprocess_ops_3d.normalize_image(
        self._frames, False, tf.float32)
    normalized_images_2 = preprocess_ops_3d.normalize_image(
        self._frames, True, tf.float32)

    self.assertAllClose(normalized_images_1, self._np_frames / 255)
    self.assertAllClose(normalized_images_2, self._np_frames * 2 / 255 - 1.0)


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/preprocess_ops_test.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for preprocess_ops.py."""
import io

# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf

from official.vision.ops import preprocess_ops


def _encode_image(image_array, fmt):
  image = Image.fromarray(image_array)
  with io.BytesIO() as output:
    image.save(output, format=fmt)
    return output.getvalue()


class InputUtilsTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      ([1], 10),
      ([1, 2], 10),
      ([1, 2, 3], 10),
      ([11], 10),
      ([12, 2], 10),
      ([13, 2, 3], 10),
  )
  def test_pad_to_fixed_size(self, input_shape, output_size):
    # Copies input shape to padding shape.
    clip_shape = input_shape[:]
    clip_shape[0] = min(output_size, clip_shape[0])
    padding_shape = input_shape[:]
    padding_shape[0] = max(output_size - input_shape[0], 0)
    expected_outputs = np.concatenate(
        [np.ones(clip_shape), np.zeros(padding_shape)], axis=0)

    data = tf.ones(input_shape)
    output_data = preprocess_ops.clip_or_pad_to_fixed_size(
        data, output_size, constant_values=0)
    output_data = output_data.numpy()
    self.assertAllClose(output_size, output_data.shape[0])
    self.assertAllClose(expected_outputs, output_data)

  @parameterized.parameters(
      (100, 200, 100, 200, 32, 1.0, 1.0, 128, 224),
      (100, 256, 128, 256, 32, 1.0, 1.0, 128, 256),
      (200, 512, 200, 128, 32, 0.25, 0.25, 224, 128),
  )
  def test_resize_and_crop_image_rectangluar_case(
      self, input_height, input_width, desired_height, desired_width, stride,
      scale_y, scale_x, output_height, output_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))

    desired_size = (desired_height, desired_width)
    resized_image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        desired_size=desired_size,
        padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
    resized_image_shape = tf.shape(resized_image)

    self.assertAllEqual([output_height, output_width, 3],
                        resized_image_shape.numpy())
    self.assertNDArrayNear(
        [[input_height, input_width], [desired_height, desired_width],
         [scale_y, scale_x], [0.0, 0.0]],
        image_info.numpy(), 1e-5)

  @parameterized.parameters(
      (100, 200, 220, 220, 32, 1.1, 1.1, 224, 224),
      (512, 512, 1024, 1024, 32, 2.0, 2.0, 1024, 1024),
  )
  def test_resize_and_crop_image_square_case(
      self, input_height, input_width, desired_height, desired_width, stride,
      scale_y, scale_x, output_height, output_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))

    desired_size = (desired_height, desired_width)
    resized_image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        desired_size=desired_size,
        padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
    resized_image_shape = tf.shape(resized_image)

    self.assertAllEqual([output_height, output_width, 3],
                        resized_image_shape.numpy())
    self.assertNDArrayNear(
        [[input_height, input_width], [desired_height, desired_width],
         [scale_y, scale_x], [0.0, 0.0]],
        image_info.numpy(), 1e-5)

  @parameterized.parameters(
      (100, 200, 100, 300, 32, 1.0, 1.0, 100, 200, 128, 320),
      (200, 100, 100, 300, 32, 1.0, 1.0, 200, 100, 320, 128),
      (100, 200, 80, 100, 32, 0.5, 0.5, 50, 100, 96, 128),
      (200, 100, 80, 100, 32, 0.5, 0.5, 100, 50, 128, 96),
  )
  def test_resize_and_crop_image_v2(
      self, input_height, input_width, short_side, long_side, stride, scale_y,
      scale_x, desired_height, desired_width, output_height, output_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    image_shape = tf.shape(image)[0:2]

    desired_size = tf.where(
        tf.greater(image_shape[0], image_shape[1]),
        tf.constant([long_side, short_side], dtype=tf.int32),
        tf.constant([short_side, long_side], dtype=tf.int32))
    resized_image, image_info = preprocess_ops.resize_and_crop_image_v2(
        image,
        short_side=short_side,
        long_side=long_side,
        padded_size=preprocess_ops.compute_padded_size(desired_size, stride))
    resized_image_shape = tf.shape(resized_image)

    self.assertAllEqual([output_height, output_width, 3],
                        resized_image_shape.numpy())
    self.assertNDArrayNear(
        [[input_height, input_width], [desired_height, desired_width],
         [scale_y, scale_x], [0.0, 0.0]],
        image_info.numpy(), 1e-5)

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_center_crop_image(self, input_height, input_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    cropped_image = preprocess_ops.center_crop_image(image)
    cropped_image_shape = tf.shape(cropped_image)
    self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_center_crop_image_v2(self, input_height, input_width):
    image_bytes = tf.constant(
        _encode_image(
            np.uint8(np.random.rand(input_height, input_width, 3) * 255),
            fmt='JPEG'),
        dtype=tf.string)
    cropped_image = preprocess_ops.center_crop_image_v2(
        image_bytes, tf.constant([input_height, input_width, 3], tf.int32))
    cropped_image_shape = tf.shape(cropped_image)
    self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy())

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_random_crop_image(self, input_height, input_width):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    _ = preprocess_ops.random_crop_image(image)

  @parameterized.parameters(
      (400, 600),
      (600, 400),
  )
  def test_random_crop_image_v2(self, input_height, input_width):
    image_bytes = tf.constant(
        _encode_image(
            np.uint8(np.random.rand(input_height, input_width, 3) * 255),
            fmt='JPEG'),
        dtype=tf.string)
    _ = preprocess_ops.random_crop_image_v2(
        image_bytes, tf.constant([input_height, input_width, 3], tf.int32))

  @parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1.4))
  def testColorJitter(self, input_height, input_width, color_jitter):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    jittered_image = preprocess_ops.color_jitter(image, color_jitter,
                                                 color_jitter, color_jitter)
    assert jittered_image.shape == image.shape

  @parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1))
  def testSaturation(self, input_height, input_width, saturation):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    jittered_image = preprocess_ops._saturation(image, saturation)
    assert jittered_image.shape == image.shape

  @parameterized.parameters((640, 640, 20), (1280, 1280, 30))
  def test_random_crop(self, input_height, input_width, num_boxes):
    image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3))
    boxes_height = np.random.randint(0, input_height, size=(num_boxes, 1))
    top = np.random.randint(0, high=(input_height - boxes_height))
    down = top + boxes_height
    boxes_width = np.random.randint(0, input_width, size=(num_boxes, 1))
    left = np.random.randint(0, high=(input_width - boxes_width))
    right = left + boxes_width
    boxes = tf.constant(
        np.concatenate([top, left, down, right], axis=-1), tf.float32)
    labels = tf.constant(
        np.random.randint(low=0, high=num_boxes, size=(num_boxes,)), tf.int64)
    _ = preprocess_ops.random_crop(image, boxes, labels)

  @parameterized.parameters(
      ((640, 640, 3), (1000, 1000), None, (1000, 1000, 3)),
      ((1280, 640, 3), 320, None, (640, 320, 3)),
      ((640, 1280, 3), 320, None, (320, 640, 3)),
      ((640, 640, 3), 320, 100, (100, 100, 3)))
  def test_resize_image(self, input_shape, size, max_size, expected_shape):
    resized_img, image_info = preprocess_ops.resize_image(
        tf.zeros((input_shape)), size, max_size)
    self.assertAllEqual(tf.shape(resized_img), expected_shape)
    self.assertAllEqual(image_info[0], input_shape[:-1])
    self.assertAllEqual(image_info[1], expected_shape[:-1])
    self.assertAllEqual(
        image_info[2],
        np.array(expected_shape[:-1]) / np.array(input_shape[:-1]))
    self.assertAllEqual(image_info[3], [0, 0])


if __name__ == '__main__':
  tf.test.main()
official/vision/ops/sampling_ops.py
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Class to subsample minibatches by balancing positives and negatives.
Subsamples minibatches based on a pre-specified positive fraction in range
[0,1]. The class presumes there are many more negatives than positive examples:
if the desired batch_size cannot be achieved with the pre-specified positive
fraction, it fills the rest with negative examples. If this is not sufficient
for obtaining the desired batch_size, it returns fewer examples.
The main function to call is Subsample(self, indicator, labels). For convenience
one can also call SubsampleWeights(self, weights, labels) which is defined in
the minibatch_sampler base class.
When is_static is True, it implements a method that guarantees static shapes.
It also ensures that the output of the subsample always has length batch_size,
even when the number of examples set to True in indicator is less than
batch_size.
This is originally implemented in TensorFlow Object Detection API.
"""
# Import libraries
import tensorflow as tf
def combined_static_and_dynamic_shape(tensor):
  """Returns a list containing static and dynamic values for the dimensions.

  Returns a list of static and dynamic values for shape dimensions. This is
  useful to preserve static shapes when available in reshape operation.

  Args:
    tensor: A tensor of any type.

  Returns:
    A list of size tensor.shape.ndims containing integers or a scalar tensor.
  """
  static_tensor_shape = tensor.shape.as_list()
  dynamic_tensor_shape = tf.shape(input=tensor)
  combined_shape = []
  for index, dim in enumerate(static_tensor_shape):
    if dim is not None:
      combined_shape.append(dim)
    else:
      combined_shape.append(dynamic_tensor_shape[index])
  return combined_shape
def indices_to_dense_vector(indices,
                            size,
                            indices_value=1.,
                            default_value=0,
                            dtype=tf.float32):
  """Creates a dense vector with indices set to a specific value, rest zeros.

  This function exists because it is unclear if it is safe to use
  tf.sparse_to_dense(indices, [size], 1, validate_indices=False)
  with indices which are not ordered.
  This function accepts a dynamic size (e.g. tf.shape(tensor)[0]).

  Args:
    indices: 1d Tensor with integer indices which are to be set to
      indices_value.
    size: scalar with size (integer) of output Tensor.
    indices_value: values of elements specified by indices in the output
      vector.
    default_value: values of other elements in the output vector.
    dtype: data type.

  Returns:
    dense 1D Tensor of shape [size] with indices set to indices_value and the
    rest set to default_value.
  """
  size = tf.cast(size, dtype=tf.int32)
  zeros = tf.ones([size], dtype=dtype) * default_value
  values = tf.ones_like(indices, dtype=dtype) * indices_value

  return tf.dynamic_stitch(
      [tf.range(size), tf.cast(indices, dtype=tf.int32)], [zeros, values])
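Illustrative call (eager mode assumed): scatter 1.0 at positions 4 and 1 of a length-6 vector; the index order does not matter.

import tensorflow as tf

dense = indices_to_dense_vector(tf.constant([4, 1]), size=6)
print(dense.numpy())  # [0. 1. 0. 0. 1. 0.]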
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
  """Matrix multiplication based implementation of tf.gather on zeroth axis.

  TODO(rathodv, jonathanhuang): enable sparse matmul option.

  Args:
    params: A float32 Tensor. The tensor from which to gather values. Must be
      at least rank 1.
    indices: A Tensor. Must be one of the following types: int32, int64. Must
      be in range [0, params.shape[0])
    scope: A name for the operation (optional).

  Returns:
    A Tensor. Has the same type as params. Values from params gathered from
    indices given by indices, with shape indices.shape + params.shape[1:].
  """
  scope = scope or 'MatMulGather'
  with tf.name_scope(scope):
    params_shape = combined_static_and_dynamic_shape(params)
    indices_shape = combined_static_and_dynamic_shape(indices)
    params2d = tf.reshape(params, [params_shape[0], -1])
    indicator_matrix = tf.one_hot(indices, params_shape[0])
    gathered_result_flattened = tf.matmul(indicator_matrix, params2d)
    return tf.reshape(gathered_result_flattened,
                      tf.stack(indices_shape + params_shape[1:]))
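A small sanity sketch: the matmul-based gather agrees with tf.gather on axis 0, including repeated indices (eager mode assumed).

import tensorflow as tf

params = tf.random.uniform([5, 3])
indices = tf.constant([2, 2, 0])
a = matmul_gather_on_zeroth_axis(params, indices)
b = tf.gather(params, indices)
print(tf.reduce_max(tf.abs(a - b)).numpy())  # 0.0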
class BalancedPositiveNegativeSampler:
  """Subsamples minibatches to a desired balance of positives and negatives."""

  def __init__(self, positive_fraction=0.5, is_static=False):
    """Constructs a minibatch sampler.

    Args:
      positive_fraction: desired fraction of positive examples (scalar in
        [0,1]) in the batch.
      is_static: If True, uses an implementation with static shape guarantees.

    Raises:
      ValueError: if positive_fraction < 0, or positive_fraction > 1
    """
    if positive_fraction < 0 or positive_fraction > 1:
      raise ValueError('positive_fraction should be in range [0,1]. '
                       'Received: %s.' % positive_fraction)
    self._positive_fraction = positive_fraction
    self._is_static = is_static
  @staticmethod
  def subsample_indicator(indicator, num_samples):
    """Subsamples an indicator vector.

    Given a boolean indicator vector with M elements set to `True`, the
    function assigns all but `num_samples` of these previously `True` elements
    to `False`. If `num_samples` is greater than M, the original indicator
    vector is returned.

    Args:
      indicator: a 1-dimensional boolean tensor indicating which elements are
        allowed to be sampled and which are not.
      num_samples: int32 scalar tensor.

    Returns:
      A boolean tensor with the same shape as the input (indicator) tensor.
    """
    indices = tf.where(indicator)
    indices = tf.random.shuffle(indices)
    indices = tf.reshape(indices, [-1])

    num_samples = tf.minimum(tf.size(input=indices), num_samples)
    selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1]))

    selected_indicator = indices_to_dense_vector(
        selected_indices,
        tf.shape(input=indicator)[0])

    return tf.equal(selected_indicator, 1)
  def _get_num_pos_neg_samples(self, sorted_indices_tensor, sample_size):
    """Counts the number of positive and negative examples to be sampled.

    Args:
      sorted_indices_tensor: A sorted int32 tensor of shape [N] which contains
        the signed indices of the examples where the sign is based on the
        label value. The examples that cannot be sampled are set to 0. It
        samples at most sample_size*positive_fraction positive examples and
        the remaining from negative examples.
      sample_size: Size of subsamples.

    Returns:
      A tuple containing the number of positive and negative labels in the
      subsample.
    """
    input_length = tf.shape(input=sorted_indices_tensor)[0]
    valid_positive_index = tf.greater(sorted_indices_tensor,
                                      tf.zeros(input_length, tf.int32))
    num_sampled_pos = tf.reduce_sum(
        input_tensor=tf.cast(valid_positive_index, tf.int32))
    max_num_positive_samples = tf.constant(
        int(sample_size * self._positive_fraction), tf.int32)
    num_positive_samples = tf.minimum(max_num_positive_samples,
                                      num_sampled_pos)
    num_negative_samples = tf.constant(sample_size,
                                       tf.int32) - num_positive_samples

    return num_positive_samples, num_negative_samples
  def _get_values_from_start_and_end(self, input_tensor, num_start_samples,
                                     num_end_samples, total_num_samples):
    """Slices the first num_start_samples and last num_end_samples.

    Args:
      input_tensor: An int32 tensor of shape [N] to be sliced.
      num_start_samples: Number of examples to be sliced from the beginning
        of the input tensor.
      num_end_samples: Number of examples to be sliced from the end of the
        input tensor.
      total_num_samples: Sum of num_start_samples and num_end_samples. This
        should be a scalar.

    Returns:
      A tensor containing the first num_start_samples and last num_end_samples
      from input_tensor.
    """
    input_length = tf.shape(input=input_tensor)[0]
    start_positions = tf.less(tf.range(input_length), num_start_samples)
    end_positions = tf.greater_equal(
        tf.range(input_length), input_length - num_end_samples)
    selected_positions = tf.logical_or(start_positions, end_positions)
    selected_positions = tf.cast(selected_positions, tf.float32)
    indexed_positions = tf.multiply(tf.cumsum(selected_positions),
                                    selected_positions)
    one_hot_selector = tf.one_hot(
        tf.cast(indexed_positions, tf.int32) - 1,
        total_num_samples,
        dtype=tf.float32)
    return tf.cast(
        tf.tensordot(
            tf.cast(input_tensor, tf.float32), one_hot_selector,
            axes=[0, 0]),
        tf.int32)
  def _static_subsample(self, indicator, batch_size, labels):
    """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be
        sampled. N should be a compile-time constant.
      batch_size: desired batch size. This scalar cannot be None.
      labels: boolean tensor of shape [N] denoting positive(=True) and
        negative (=False) examples. N should be a compile-time constant.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries
        which are sampled. It ensures the length of the output of the
        subsample is always batch_size, even when the number of examples set
        to True in indicator is less than batch_size.

    Raises:
      ValueError: if labels and indicator are not 1D boolean tensors.
    """
    # Check if indicator and labels have a static size.
    if not indicator.shape.is_fully_defined():
      raise ValueError('indicator must be static in shape when is_static is '
                       'True')
    if not labels.shape.is_fully_defined():
      raise ValueError('labels must be static in shape when is_static is '
                       'True')
    if not isinstance(batch_size, int):
      raise ValueError('batch_size has to be an integer when is_static is '
                       'True.')

    input_length = tf.shape(input=indicator)[0]

    # Set the number of examples set True in indicator to be at least
    # batch_size.
    num_true_sampled = tf.reduce_sum(
        input_tensor=tf.cast(indicator, tf.float32))
    additional_false_sample = tf.less_equal(
        tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
        batch_size - num_true_sampled)
    indicator = tf.logical_or(indicator, additional_false_sample)

    # Shuffle indicator and label. Need to store the permutation to restore
    # the order post sampling.
    permutation = tf.random.shuffle(tf.range(input_length))
    indicator = matmul_gather_on_zeroth_axis(
        tf.cast(indicator, tf.float32), permutation)
    labels = matmul_gather_on_zeroth_axis(
        tf.cast(labels, tf.float32), permutation)

    # index (starting from 1) when indicator is True, 0 when False
    indicator_idx = tf.where(
        tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
        tf.zeros(input_length, tf.int32))

    # Replace -1 for negative, +1 for positive labels
    signed_label = tf.where(
        tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
        tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
    # negative of index for negative label, positive index for positive label,
    # 0 when indicator is False.
    signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
    sorted_signed_indicator_idx = tf.nn.top_k(
        signed_indicator_idx, input_length, sorted=True).values

    [num_positive_samples,
     num_negative_samples] = self._get_num_pos_neg_samples(
         sorted_signed_indicator_idx, batch_size)

    sampled_idx = self._get_values_from_start_and_end(
        sorted_signed_indicator_idx, num_positive_samples,
        num_negative_samples, batch_size)

    # Shift the indices to start from 0 and remove any samples that are set
    # as False.
    sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
    sampled_idx = tf.multiply(
        tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
        sampled_idx)

    sampled_idx_indicator = tf.cast(
        tf.reduce_sum(
            input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0),
        tf.bool)

    # Project back the order based on the stored permutation.
    reprojections = tf.one_hot(
        permutation, depth=input_length, dtype=tf.float32)
    return tf.cast(
        tf.tensordot(
            tf.cast(sampled_idx_indicator, tf.float32),
            reprojections,
            axes=[0, 0]), tf.bool)
  def subsample(self, indicator, batch_size, labels, scope=None):
    """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be
        sampled.
      batch_size: desired batch size. If None, keeps all positive samples and
        randomly selects negative samples so that the positive sample fraction
        matches self._positive_fraction. It cannot be None if is_static is
        True.
      labels: boolean tensor of shape [N] denoting positive(=True) and
        negative (=False) examples.
      scope: name scope.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries
        which are sampled.

    Raises:
      ValueError: if labels and indicator are not 1D boolean tensors.
    """
    if len(indicator.get_shape().as_list()) != 1:
      raise ValueError('indicator must be 1 dimensional, got a tensor of '
                       'shape %s' % indicator.get_shape())
    if len(labels.get_shape().as_list()) != 1:
      raise ValueError('labels must be 1 dimensional, got a tensor of '
                       'shape %s' % labels.get_shape())
    if labels.dtype != tf.bool:
      raise ValueError('labels should be of type bool. Received: %s' %
                       labels.dtype)
    if indicator.dtype != tf.bool:
      raise ValueError('indicator should be of type bool. Received: %s' %
                       indicator.dtype)
    scope = scope or 'BalancedPositiveNegativeSampler'
    with tf.name_scope(scope):
      if self._is_static:
        return self._static_subsample(indicator, batch_size, labels)
      else:
        # Only sample from indicated samples.
        negative_idx = tf.logical_not(labels)
        positive_idx = tf.logical_and(labels, indicator)
        negative_idx = tf.logical_and(negative_idx, indicator)

        # Sample positive and negative samples separately.
        if batch_size is None:
          max_num_pos = tf.reduce_sum(
              input_tensor=tf.cast(positive_idx, dtype=tf.int32))
        else:
          max_num_pos = int(self._positive_fraction * batch_size)
        sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos)
        num_sampled_pos = tf.reduce_sum(
            input_tensor=tf.cast(sampled_pos_idx, tf.int32))
        if batch_size is None:
          negative_positive_ratio = (
              1 - self._positive_fraction) / self._positive_fraction
          max_num_neg = tf.cast(
              negative_positive_ratio *
              tf.cast(num_sampled_pos, dtype=tf.float32),
              dtype=tf.int32)
        else:
          max_num_neg = batch_size - num_sampled_pos
        sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg)
        return tf.logical_or(sampled_pos_idx, sampled_neg_idx)
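A minimal end-to-end sketch (eager mode assumed; the labels below are made up): sample an 8-element minibatch at a 0.25 positive fraction from 3 positives and 7 negatives.

import tensorflow as tf

sampler = BalancedPositiveNegativeSampler(positive_fraction=0.25)
labels = tf.constant([True] * 3 + [False] * 7)
indicator = tf.ones([10], tf.bool)  # every example is eligible
picked = sampler.subsample(indicator, batch_size=8, labels=labels)
# At most int(0.25 * 8) = 2 positives are kept; negatives fill the rest.
print(tf.reduce_sum(tf.cast(picked, tf.int32)).numpy())  # 8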