Commit 7d1cfc1e authored by Yeqing Li's avatar Yeqing Li Committed by A. Unique TensorFlower

Adds files to utils folder.

PiperOrigin-RevId: 276317091
parent 638ba7a4
@@ -26,6 +26,75 @@ EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)
def yxyx_to_xywh(boxes):
"""Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a numpy array with the same shape as the input `boxes`, in the new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
boxes_ymin = boxes[..., 0]
boxes_xmin = boxes[..., 1]
boxes_width = boxes[..., 3] - boxes[..., 1]
boxes_height = boxes[..., 2] - boxes[..., 0]
new_boxes = np.stack([boxes_xmin, boxes_ymin, boxes_width, boxes_height],
axis=-1)
return new_boxes
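# Illustrative usage sketch (not part of this change), assuming `np` is the
# numpy module imported at the top of this file:
#
#   boxes = np.array([[10., 20., 50., 80.]])  # ymin, xmin, ymax, xmax
#   yxyx_to_xywh(boxes)                       # -> [[20., 10., 60., 40.]]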
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
noise_scale: a python float specifying the magnitude of the noise. A rule of
thumb is to set this within (0, 0.1]. The default value was found
empirically to best mimic noisy detections.
Returns:
jittered_boxes: a tensor whose shape is the same as `boxes` representing
the jittered boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('jitter_boxes'):
bbox_jitters = tf.random.normal(boxes.get_shape(), stddev=noise_scale)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
width = xmax - xmin
height = ymax - ymin
new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
jittered_boxes = tf.concat([
new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
new_center_y + new_height * 0.5, new_center_x + new_width * 0.5
],
axis=-1)
return jittered_boxes
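# Illustrative usage sketch (not part of this change), assuming `tf` is
# TensorFlow 2.x as imported at the top of this file:
#
#   boxes = tf.constant([[[10., 20., 50., 80.], [0., 0., 30., 30.]]])
#   jittered = jitter_boxes(boxes, noise_scale=0.025)
#   # `jittered` has the same shape as `boxes`; each box is randomly shifted
#   # and rescaled around its own center.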
def normalize_boxes(boxes, image_shape):
"""Converts boxes to the normalized coordinates.
@@ -44,8 +113,8 @@ def normalize_boxes(boxes, image_shape):
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[1] is {:d}, but must be 4.'.format(boxes.shape[1]))
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('normalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
@@ -86,13 +155,13 @@ def denormalize_boxes(boxes, image_shape):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0:1]
width = image_shape[..., 1:2]
height, width = tf.split(image_shape, 2, axis=-1)
ymin = boxes[..., 0:1] * height
xmin = boxes[..., 1:2] * width
ymax = boxes[..., 2:3] * height
xmax = boxes[..., 3:4] * width
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
ymin = ymin * height
xmin = xmin * width
ymax = ymax * height
xmax = xmax * width
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return denormalized_boxes
@@ -116,10 +185,10 @@ def clip_boxes(boxes, image_shape):
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[1] is {:d}, but must be 4.'.format(boxes.shape[1]))
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('crop_boxes'):
with tf.name_scope('clip_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
@@ -132,10 +201,10 @@ def clip_boxes(boxes, image_shape):
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
clipped_ymin = tf.maximum(tf.minimum(ymin, height - 1.0), 0.0)
clipped_ymax = tf.maximum(tf.minimum(ymax, height - 1.0), 0.0)
clipped_xmin = tf.maximum(tf.minimum(xmin, width - 1.0), 0.0)
clipped_xmax = tf.maximum(tf.minimum(xmax, width - 1.0), 0.0)
clipped_ymin = tf.math.maximum(tf.math.minimum(ymin, height - 1.0), 0.0)
clipped_ymax = tf.math.maximum(tf.math.minimum(ymax, height - 1.0), 0.0)
clipped_xmin = tf.math.maximum(tf.math.minimum(xmin, width - 1.0), 0.0)
clipped_xmax = tf.math.maximum(tf.math.minimum(xmax, width - 1.0), 0.0)
clipped_boxes = tf.concat(
[clipped_ymin, clipped_xmin, clipped_ymax, clipped_xmax],
@@ -143,14 +212,47 @@ def clip_boxes(boxes, image_shape):
return clipped_boxes
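# Illustrative usage sketch (not part of this change): clipping to a 480x640
# image keeps coordinates within [0, height - 1] x [0, width - 1]:
#
#   boxes = tf.constant([[-10., -10., 500., 700.]])
#   clip_boxes(boxes, [480, 640])   # -> [[0., 0., 479., 639.]]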
def compute_outer_boxes(boxes, image_shape, scale=1.0):
"""Computes the outer boxes that enclose the objects with a margin.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
scale: a float number specifying the scale of output outer boxes to input
`boxes`.
Returns:
outer_boxes: a tensor whose shape is the same as `boxes` representing the
outer boxes.
Raises:
ValueError: If `scale` is less than 1.0.
"""
if scale < 1.0:
raise ValueError(
'scale is {}, but outer box scale must be no less than 1.0.'.format(
scale))
centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
box_height = (boxes[..., 2] - boxes[..., 0]) * scale
box_width = (boxes[..., 3] - boxes[..., 1]) * scale
outer_boxes = tf.stack([
centers_y - box_height / 2.0, centers_x - box_width / 2.0,
centers_y + box_height / 2.0, centers_x + box_width / 2.0
],
axis=1)
outer_boxes = clip_boxes(outer_boxes, image_shape)
return outer_boxes
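# Illustrative usage sketch (not part of this change): a 40x40 box grown by
# scale=1.5 becomes a 60x60 box around the same center, then gets clipped to
# the image:
#
#   boxes = tf.constant([[10., 10., 50., 50.]])
#   compute_outer_boxes(boxes, [100, 100], scale=1.5)  # -> [[0., 0., 60., 60.]]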
def encode_boxes(boxes, anchors, weights=None):
"""Encode boxes to targets.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as `boxes` representing the
coordinates of anchors in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
@@ -161,8 +263,8 @@ def encode_boxes(boxes, anchors, weights=None):
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[1] is {:d}, but must be 4.'.format(boxes.shape[1]))
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('encode_boxes'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
@@ -206,14 +308,18 @@ def decode_boxes(encoded_boxes, anchors, weights=None):
Args:
encoded_boxes: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as `boxes` representing the
coordinates of anchors in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
representing the decoded boxes.
"""
if encoded_boxes.shape[-1] != 4:
raise ValueError('encoded_boxes.shape[-1] is {:d}, but must be 4.'.format(
encoded_boxes.shape[-1]))
with tf.name_scope('decode_boxes'):
encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
dy = encoded_boxes[..., 0:1]
@@ -225,8 +331,8 @@ def decode_boxes(encoded_boxes, anchors, weights=None):
dx /= weights[1]
dh /= weights[2]
dw /= weights[3]
dh = tf.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.minimum(dw, BBOX_XFORM_CLIP)
dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
@@ -239,8 +345,8 @@ def decode_boxes(encoded_boxes, anchors, weights=None):
decoded_boxes_yc = dy * anchor_h + anchor_yc
decoded_boxes_xc = dx * anchor_w + anchor_xc
decoded_boxes_h = tf.exp(dh) * anchor_h
decoded_boxes_w = tf.exp(dw) * anchor_w
decoded_boxes_h = tf.math.exp(dh) * anchor_h
decoded_boxes_w = tf.math.exp(dw) * anchor_w
decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
@@ -252,3 +358,178 @@ def decode_boxes(encoded_boxes, anchors, weights=None):
decoded_boxes_ymax, decoded_boxes_xmax],
axis=-1)
return decoded_boxes
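# Illustrative usage sketch (not part of this change): with the same
# `weights`, `decode_boxes` inverts `encode_boxes`, so a round trip
# approximately recovers the input boxes:
#
#   boxes = tf.constant([[[12., 14., 40., 42.]]])
#   anchors = tf.constant([[[10., 10., 50., 50.]]])
#   decoded = decode_boxes(encode_boxes(boxes, anchors), anchors)  # ~= boxes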
def filter_boxes(boxes, scores, image_shape, min_size_threshold):
"""Filter and remove boxes that are too small or fall outside the image.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
image_shape: a tensor whose shape is the same as, or `broadcastable` to
`boxes` except the last dimension, which is 2, representing [height,
width] of the scaled image.
min_size_threshold: a float representing the minimal box size in each side
(w.r.t. the scaled image). Boxes whose sides are smaller than it will be
filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with 0.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0]
width = image_shape[..., 1]
ymin = boxes[..., 0]
xmin = boxes[..., 1]
ymax = boxes[..., 2]
xmax = boxes[..., 3]
h = ymax - ymin + 1.0
w = xmax - xmin + 1.0
yc = ymin + 0.5 * h
xc = xmin + 0.5 * w
min_size = tf.cast(
tf.math.maximum(min_size_threshold, 1.0), dtype=boxes.dtype)
filtered_size_mask = tf.math.logical_and(
tf.math.greater(h, min_size), tf.math.greater(w, min_size))
filtered_center_mask = tf.logical_and(
tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
filtered_mask = tf.math.logical_and(filtered_size_mask,
filtered_center_mask)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
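# Illustrative usage sketch (not part of this change): the 2x2 box below is
# zeroed out in both outputs because its sides are under the 5-pixel minimum:
#
#   boxes = tf.constant([[[10., 10., 50., 50.], [0., 0., 1., 1.]]])
#   scores = tf.constant([[0.9, 0.8]])
#   filter_boxes(boxes, scores, [100, 100], min_size_threshold=5.0)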
def filter_boxes_by_scores(boxes, scores, min_score_threshold):
"""Filter and remove boxes whose scores are smaller than the threshold.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
min_score_threshold: a float representing the minimal box score threshold.
Boxes whose score are smaller than it will be filtered out.
Returns:
filtered_boxes: a tensor whose shape is the same as `boxes` but with
the positions of the filtered boxes filled with 0.
filtered_scores: a tensor whose shape is the same as `scores` but with
the positions of the filtered boxes filled with 0.
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes_by_scores'):
filtered_mask = tf.math.greater(scores, min_score_threshold)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
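# Illustrative usage sketch (not part of this change): the second box and its
# score are zeroed out because 0.05 falls below the threshold:
#
#   boxes = tf.constant([[[10., 10., 50., 50.], [0., 0., 20., 20.]]])
#   scores = tf.constant([[0.9, 0.05]])
#   filter_boxes_by_scores(boxes, scores, min_score_threshold=0.1)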
def top_k_boxes(boxes, scores, k):
"""Sort and select top k boxes according to the scores.
Args:
boxes: a tensor of shape [batch_size, N, 4] representing the coordinates of
the boxes. N is the number of boxes per image.
scores: a tensor of shape [batch_size, N] representing the scores of the
boxes.
k: an integer or a tensor indicating the top k number.
Returns:
selected_boxes: a tensor of shape [batch_size, k, 4] representing the
selected top k box coordinates.
selected_scores: a tensor of shape [batch_size, k] representing the selected
top k box scores.
"""
with tf.name_scope('top_k_boxes'):
selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
batch_size, _ = scores.get_shape().as_list()
if batch_size == 1:
selected_boxes = tf.squeeze(
tf.gather(boxes, top_k_indices, axis=1), axis=1)
else:
top_k_indices_shape = tf.shape(top_k_indices)
batch_indices = (
tf.expand_dims(tf.range(top_k_indices_shape[0]), axis=-1) *
tf.ones([1, top_k_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, top_k_indices], axis=-1)
selected_boxes = tf.gather_nd(boxes, gather_nd_indices)
return selected_boxes, selected_scores
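# Illustrative usage sketch (not part of this change): selecting the 10
# highest-scoring boxes per image from a batch of 100 candidates:
#
#   boxes = tf.random.uniform([2, 100, 4])
#   scores = tf.random.uniform([2, 100])
#   top_boxes, top_scores = top_k_boxes(boxes, scores, k=10)
#   # top_boxes: [2, 10, 4]; top_scores: [2, 10], sorted in descending order.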
def bbox_overlap(boxes, gt_boxes):
"""Calculates the overlap between proposal and ground truth boxes.
Some `gt_boxes` may have been padded. The returned `iou` tensor for these
boxes will be -1.
Args:
boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
tensor might have paddings with a negative value.
Returns:
iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
"""
with tf.name_scope('bbox_overlap'):
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the intersection area.
i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = tf.math.maximum((i_xmax - i_xmin), 0) * tf.math.maximum(
(i_ymax - i_ymin), 0)
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Fills -1 for IoU entries that involve the padded groundtruth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.logical_or(
tf.zeros_like(bb_x_min, dtype=tf.bool),
tf.transpose(gt_invalid_mask, [0, 2, 1]))
iou = tf.where(padding_mask, -tf.ones_like(iou), iou)
return iou
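# Illustrative usage sketch (not part of this change): IoU against one real
# and one padded (all -1) groundtruth box:
#
#   boxes = tf.constant([[[0., 0., 10., 10.], [5., 5., 15., 15.]]])
#   gt_boxes = tf.constant([[[0., 0., 10., 10.], [-1., -1., -1., -1.]]])
#   bbox_overlap(boxes, gt_boxes)
#   # -> approx [[[1.0, -1.0], [0.14, -1.0]]]; the padded column is -1.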
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for handling dataset object categories."""
def coco_split_class_ids(split_name):
"""Returns the COCO class split ids for the given split name.
Args:
split_name: The name of dataset split.
Returns:
class_ids: a python list of integers.
"""
if split_name == 'all':
return []
elif split_name == 'voc':
return [
1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67, 72
]
elif split_name == 'nonvoc':
return [
8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
57, 58, 59, 60, 61, 65, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
85, 86, 87, 88, 89, 90
]
else:
raise ValueError('Invalid split name {}!!!'.format(split_name))
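# Illustrative usage sketch (not part of this change):
#
#   coco_split_class_ids('voc')     # the 20 COCO ids that overlap PASCAL VOC
#   coco_split_class_ids('nonvoc')  # the remaining 60 COCO class ids
#   coco_split_class_ids('all')     # [] (no class restriction)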
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for dataloader."""
import tensorflow.compat.v2 as tf
from official.vision.detection.utils import input_utils
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int64),
false_fn=lambda: tf.identity(source_id))
return source_id
def pad_groundtruths_to_fixed_size(gt, n):
"""Pads the first dimension of groundtruths labels to the fixed size."""
gt['boxes'] = input_utils.pad_to_fixed_size(gt['boxes'], n, -1)
gt['is_crowds'] = input_utils.pad_to_fixed_size(gt['is_crowds'], n, 0)
gt['areas'] = input_utils.pad_to_fixed_size(gt['areas'], n, -1)
gt['classes'] = input_utils.pad_to_fixed_size(gt['classes'], n, -1)
return gt
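# Illustrative usage sketch (not part of this change), assuming `tf` is
# TensorFlow 2.x as imported above:
#
#   source_id = process_source_id(tf.constant('12345'))  # -> int64 tensor 12345
#
# `pad_groundtruths_to_fixed_size` pads `boxes`, `is_crowds`, `areas` and
# `classes` along their first dimension to exactly `n` rows, using -1 (or 0
# for `is_crowds`) as the fill value.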
@@ -182,6 +182,109 @@ def resize_and_crop_image(image,
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
padded_size[0], padded_size[1])
image_info = tf.stack(
[image_size, scaled_size, image_scale,
tf.cast(offset, tf.float32)])
return output_image, image_info
def resize_and_crop_image_v2(image,
short_side,
long_side,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (Faster R-CNN style).
Resizes and pads images given the specified short / long side lengths and the
padded output size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and first try to rescale the short
side of the original image to `short_side`.
2. If the scaled image after step 1 has a long side that exceeds `long_side`,
keep the aspect ratio and rescale the long side of the image to `long_side`
instead.
3. Pad the rescaled image to `padded_size`.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
short_side: a scalar `Tensor` or `int` representing the desired short side
to be rescaled to.
long_side: a scalar `Tensor` or `int` representing the desired long side to
be rescaled to.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
output_image: `Tensor` of shape [height, width, 3] where [height, width]
equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width],
[y_scale, x_scale], [y_offset, x_offset]], where [scaled_height,
scaled_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image_v2'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
scale_using_short_side = (
short_side / tf.math.minimum(image_size[0], image_size[1]))
scale_using_long_side = (
long_side / tf.math.maximum(image_size[0], image_size[1]))
scaled_size = tf.math.round(image_size * scale_using_short_side)
scaled_size = tf.where(
tf.math.greater(
tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
tf.math.round(image_size * scale_using_long_side), scaled_size)
desired_size = scaled_size
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform([],
aug_scale_min,
aug_scale_max,
seed=seed)
scaled_size = tf.math.round(random_scale * scaled_size)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([
2,
], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[
offset[0]:offset[0] + desired_size[0],
......
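# Worked example of the scaling rule above (illustrative, not part of this
# change), with short_side=800 and long_side=1333:
#   * a 600x400 image is scaled by 800/400 = 2.0 to 1200x800; its long side
#     (1200) does not exceed 1333, so that size is kept;
#   * a 500x2000 image scaled by 800/500 would become 800x3200, whose long
#     side exceeds 1333, so the long-side scale 1333/2000 is used instead,
#     giving a ~333x1333 image before padding.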
@@ -100,7 +100,9 @@ def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation,
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'FlipHorizontal'):
if not scope:
scope = 'FlipHorizontal'
with tf.name_scope(scope):
keypoints = tf.transpose(a=keypoints, perm=[1, 0, 2])
keypoints = tf.gather(keypoints, flip_permutation)
v, u = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
@@ -110,6 +112,70 @@ def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation,
return new_keypoints
def keypoint_change_coordinate_frame(keypoints, window, scope=None):
"""Changes coordinate frame of the keypoints to be relative to window's frame.
Given a window of the form [y_min, x_min, y_max, x_max], changes keypoint
coordinates from keypoints of shape [num_instances, num_keypoints, 2]
to be relative to this window.
An example use case is data augmentation, where we are given groundtruth
keypoints and would like to randomly crop the image to some window. In this
case we need to change the coordinate frame of each groundtruth keypoint to be
relative to this new window.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
window we should change the coordinate frame to.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
if not scope:
scope = 'ChangeCoordinateFrame'
with tf.name_scope(scope):
win_height = window[2] - window[0]
win_width = window[3] - window[1]
new_keypoints = box_list_ops.scale(keypoints - [window[0], window[1]],
1.0 / win_height, 1.0 / win_width)
return new_keypoints
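# Illustrative usage sketch (not part of this change): two keypoints expressed
# relative to a 100x200 window anchored at (10, 10); expected result per the
# docstring above:
#
#   keypoints = tf.constant([[[20., 30.], [40., 50.]]])   # (y, x) pairs
#   window = tf.constant([10., 10., 110., 210.])
#   keypoint_change_coordinate_frame(keypoints, window)
#   # expected -> [[[0.1, 0.1], [0.3, 0.2]]]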
def keypoint_prune_outside_window(keypoints, window, scope=None):
"""Prunes keypoints that fall outside a given window.
This function replaces keypoints that fall outside the given window with nan.
See also clip_to_window which clips any keypoints that fall outside the given
window.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
window outside of which the op should prune the keypoints.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
if not scope:
scope = 'PruneOutsideWindow'
with tf.name_scope(scope):
y, x = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
valid_indices = tf.logical_and(
tf.logical_and(y >= win_y_min, y <= win_y_max),
tf.logical_and(x >= win_x_min, x <= win_x_max))
new_y = tf.where(valid_indices, y, np.nan * tf.ones_like(y))
new_x = tf.where(valid_indices, x, np.nan * tf.ones_like(x))
new_keypoints = tf.concat([new_y, new_x], 2)
return new_keypoints
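# Illustrative usage sketch (not part of this change): the second keypoint
# falls outside the window, so both of its coordinates become NaN:
#
#   keypoints = tf.constant([[[20., 30.], [500., 30.]]])
#   window = tf.constant([0., 0., 100., 100.])
#   keypoint_prune_outside_window(keypoints, window)
#   # -> [[[20., 30.], [nan, nan]]]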
def random_horizontal_flip(image,
boxes=None,
masks=None,
@@ -334,7 +400,7 @@ def resize_to_range(image,
if len(image.get_shape()) != 3:
raise ValueError('Image should be 3D tensor')
with tf.name_scope('ResizeToRange', values=[image, min_dimension]):
with tf.name_scope('ResizeToRange'):
if image.get_shape().is_fully_defined():
new_size = _compute_new_static_size(image, min_dimension, max_dimension)
else:
@@ -389,7 +455,9 @@ def box_list_scale(boxlist, y_scale, x_scale, scope=None):
Returns:
boxlist: BoxList holding N boxes
"""
with tf.name_scope(scope, 'Scale'):
if not scope:
scope = 'Scale'
with tf.name_scope(scope):
y_scale = tf.cast(y_scale, tf.float32)
x_scale = tf.cast(x_scale, tf.float32)
y_min, x_min, y_max, x_max = tf.split(
@@ -415,7 +483,9 @@ def keypoint_scale(keypoints, y_scale, x_scale, scope=None):
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'Scale'):
if not scope:
scope = 'Scale'
with tf.name_scope(scope):
y_scale = tf.cast(y_scale, tf.float32)
x_scale = tf.cast(x_scale, tf.float32)
new_keypoints = keypoints * [[[y_scale, x_scale]]]
......