Commit d59e9237 authored by A. Unique TensorFlower's avatar A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 471573654
parent c63a5e72
@@ -18,12 +18,11 @@ import math
# Import libraries
import cv2
import numpy as np
+import tensorflow as tf


-def paste_instance_masks(masks,
-                         detected_boxes,
-                         image_height,
-                         image_width):
+def paste_instance_masks(masks: np.ndarray, detected_boxes: np.ndarray,
+                         image_height: int, image_width: int) -> np.ndarray:
"""Paste instance masks to generate the image segmentation results. """Paste instance masks to generate the image segmentation results.
Args: Args:
@@ -39,13 +38,13 @@ def paste_instance_masks(masks,
      the instance masks *pasted* on the image canvas.
  """

  def expand_boxes(boxes, scale):
-  def expand_boxes(boxes, scale):
+  def expand_boxes(boxes: np.ndarray, scale: float) -> np.ndarray:
    """Expands an array of boxes by a given scale."""
    # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227  # pylint: disable=line-too-long
    # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
    # whereas `boxes` here is in [x1, y1, w, h] form
-    w_half = boxes[:, 2] * .5
-    h_half = boxes[:, 3] * .5
+    w_half = boxes[:, 2] * 0.5
+    h_half = boxes[:, 3] * 0.5
    x_c = boxes[:, 0] + w_half
    y_c = boxes[:, 1] + h_half
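[Editor's sketch, not part of the diff: a worked example of the
center/half-size conversion above, using hypothetical values.]

    import numpy as np
    boxes = np.array([[2., 3., 4., 6.]])  # [x1, y1, w, h]
    w_half = boxes[:, 2] * 0.5            # -> [2.]
    h_half = boxes[:, 3] * 0.5            # -> [3.]
    x_c = boxes[:, 0] + w_half            # center x -> [4.]
    y_c = boxes[:, 1] + h_half            # center y -> [6.]
    # expand_boxes(boxes, scale=2.) doubles w_half and h_half while keeping
    # the center (x_c, y_c) fixed.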
@@ -104,10 +103,8 @@
  return segms


-def paste_instance_masks_v2(masks,
-                            detected_boxes,
-                            image_height,
-                            image_width):
+def paste_instance_masks_v2(masks: np.ndarray, detected_boxes: np.ndarray,
+                            image_height: int, image_width: int) -> np.ndarray:
"""Paste instance masks to generate the image segmentation (v2). """Paste instance masks to generate the image segmentation (v2).
Args: Args:
@@ -188,3 +185,65 @@ def paste_instance_masks_v2(masks,
  segms = np.array(segms)
  return segms

def bbox2mask(bbox: tf.Tensor,
              *,
              image_height: int,
              image_width: int,
              dtype: tf.DType = tf.bool) -> tf.Tensor:
  """Converts bounding boxes to bitmasks.

  Args:
    bbox: A tensor in shape (..., 4) with arbitrary numbers of batch
      dimensions, representing the absolute coordinates (ymin, xmin, ymax,
      xmax) for each bounding box.
    image_height: an integer representing the height of the image.
    image_width: an integer representing the width of the image.
    dtype: DType of the output bitmasks.

  Returns:
    A tensor in shape (..., height, width) which stores the bitmasks created
    from the bounding boxes. For example:

    >>> bbox2mask(tf.constant([[1, 2, 4, 4]]),
                  image_height=5,
                  image_width=5,
                  dtype=tf.int32)
    <tf.Tensor: shape=(1, 5, 5), dtype=int32, numpy=
    array([[[0, 0, 0, 0, 0],
            [0, 0, 1, 1, 0],
            [0, 0, 1, 1, 0],
            [0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0]]], dtype=int32)>
  """
  bbox_shape = bbox.get_shape().as_list()
  if bbox_shape[-1] != 4:
    raise ValueError(
        'Expected the last dimension of `bbox` to have size == 4, but the '
        'shape of `bbox` was: %s' % bbox_shape)

  # Slice each absolute coordinate into shape (..., 1).
  ymin = bbox[..., 0:1]
  xmin = bbox[..., 1:2]
  ymax = bbox[..., 2:3]
  xmax = bbox[..., 3:4]

  # Broadcast the coordinates over the image grid:
  # (..., 1, width)
  ymin = tf.expand_dims(tf.repeat(ymin, repeats=image_width, axis=-1), axis=-2)
  # (..., height, 1)
  xmin = tf.expand_dims(tf.repeat(xmin, repeats=image_height, axis=-1), axis=-1)
  # (..., 1, width)
  ymax = tf.expand_dims(tf.repeat(ymax, repeats=image_width, axis=-1), axis=-2)
  # (..., height, 1)
  xmax = tf.expand_dims(tf.repeat(xmax, repeats=image_height, axis=-1), axis=-1)

  # Pixel-index grids in shapes (height, 1) and (1, width).
  y_grid = tf.expand_dims(tf.range(image_height, dtype=bbox.dtype), axis=-1)
  x_grid = tf.expand_dims(tf.range(image_width, dtype=bbox.dtype), axis=-2)

  # A pixel is inside the box iff it passes all four half-open interval
  # tests; each comparison broadcasts to (..., height, width).
  ymin_mask = y_grid >= ymin
  xmin_mask = x_grid >= xmin
  ymax_mask = y_grid < ymax
  xmax_mask = x_grid < xmax
  return tf.cast(ymin_mask & xmin_mask & ymax_mask & xmax_mask, dtype)
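[Editor's sketch, not part of the commit: because the box coordinates and the
pixel grids broadcast against each other, bbox2mask also accepts batched
boxes without reshaping; the values below are hypothetical.]

    boxes = tf.constant([[[0., 0., 2., 2.], [1., 1., 3., 3.]]])  # (1, 2, 4)
    masks = bbox2mask(boxes, image_height=4, image_width=4)
    # masks.shape == (1, 2, 4, 4); dtype defaults to tf.bool.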
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mask_ops.py."""
-
# Import libraries
@@ -50,6 +49,57 @@ class MaskUtilsTest(tf.test.TestCase):
        np.array(masks > 0.5, dtype=np.uint8),
        1e-5)
  def testBbox2mask(self):
    bboxes = tf.constant([[1, 2, 4, 4], [-1, -1, 3, 3], [2, 3, 6, 8],
                          [1, 1, 2, 2], [1, 1, 1, 4]])
    masks = mask_ops.bbox2mask(
        bboxes, image_height=5, image_width=6, dtype=tf.int32)
    expected_masks = tf.constant(
        [
            [  # bbox = [1, 2, 4, 4]
                [0, 0, 0, 0, 0, 0],
                [0, 0, 1, 1, 0, 0],
                [0, 0, 1, 1, 0, 0],
                [0, 0, 1, 1, 0, 0],
                [0, 0, 0, 0, 0, 0],
            ],
            [  # bbox = [-1, -1, 3, 3]
                [1, 1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
            ],
            [  # bbox = [2, 3, 6, 8]
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 1, 1, 1],
                [0, 0, 0, 1, 1, 1],
                [0, 0, 0, 1, 1, 1],
            ],
            [  # bbox = [1, 1, 2, 2]
                [0, 0, 0, 0, 0, 0],
                [0, 1, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
            ],
            [  # bbox = [1, 1, 1, 4]
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0],
            ],
        ],
        dtype=tf.int32)
    self.assertAllEqual(expected_masks, masks)
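  # (Editor's note, not part of the diff: the [1, 1, 1, 4] box above is
  # expected to yield an all-zero mask because ymin == ymax == 1, so the
  # half-open interval test `y_grid >= ymin` & `y_grid < ymax` selects no
  # rows.)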

  def testBbox2maskInvalidInput(self):
    bboxes = tf.constant([[1, 2, 4, 4, 4], [-1, -1, 3, 3, 3]])
    with self.assertRaisesRegex(ValueError, 'bbox.*size == 4'):
      mask_ops.bbox2mask(bboxes, image_height=5, image_width=6, dtype=tf.int32)

if __name__ == '__main__':
  tf.test.main()
@@ -19,7 +19,8 @@ import tensorflow as tf
_EPSILON = 1e-8

-def _feature_bilinear_interpolation(features, kernel_y, kernel_x):
+def _feature_bilinear_interpolation(features: tf.Tensor, kernel_y: tf.Tensor,
+                                    kernel_x: tf.Tensor) -> tf.Tensor:
"""Feature bilinear interpolation. """Feature bilinear interpolation.
The RoIAlign feature f can be computed by bilinear interpolation The RoIAlign feature f can be computed by bilinear interpolation
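[Editor's sketch, not part of the diff: the bilinear form referenced above.
Assuming the four neighboring feature points are f00, f01, f10, f11 and the
kernels are kernel_y = [hy, ly] and kernel_x = [hx, lx], the interpolated
value is

    f(y, x) = hy*hx*f00 + hy*lx*f01 + ly*hx*f10 + ly*lx*f11,

i.e. a kernel-weighted sum of the four neighbors.]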
@@ -67,8 +68,12 @@ def _feature_bilinear_interpolation(features, kernel_y, kernel_x):
  return features

-def _compute_grid_positions(boxes, boundaries, output_size, sample_offset):
-  """Computes the grid position w.r.t. the corresponding feature map.
+def _compute_grid_positions(
+    boxes: tf.Tensor, boundaries: tf.Tensor, output_size: int,
+    sample_offset: float) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
+  """Computes the grid position w.r.t.
+
+  the corresponding feature map.

  Args:
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
@@ -135,10 +140,10 @@ def _compute_grid_positions(boxes, boundaries, output_size, sample_offset):
  return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1

-def multilevel_crop_and_resize(features,
-                               boxes,
-                               output_size=7,
-                               sample_offset=0.5):
+def multilevel_crop_and_resize(features: dict[str, tf.Tensor],
+                               boxes: tf.Tensor,
+                               output_size: int = 7,
+                               sample_offset: float = 0.5) -> tf.Tensor:
"""Crop and resize on multilevel feature pyramid. """Crop and resize on multilevel feature pyramid.
Generate the (output_size, output_size) set of pixels for each input box Generate the (output_size, output_size) set of pixels for each input box
@@ -282,13 +287,13 @@ def multilevel_crop_and_resize(features,
  return features_per_box

-def _selective_crop_and_resize(features,
-                               boxes,
-                               box_levels,
-                               boundaries,
-                               output_size=7,
-                               sample_offset=0.5,
-                               use_einsum_gather=False):
+def _selective_crop_and_resize(features: tf.Tensor,
+                               boxes: tf.Tensor,
+                               box_levels: tf.Tensor,
+                               boundaries: tf.Tensor,
+                               output_size: int = 7,
+                               sample_offset: float = 0.5,
+                               use_einsum_gather: bool = False) -> tf.Tensor:
"""Crop and resize boxes on a set of feature maps. """Crop and resize boxes on a set of feature maps.
Given multiple features maps indexed by different levels, and a set of boxes Given multiple features maps indexed by different levels, and a set of boxes
@@ -434,12 +439,12 @@ def _selective_crop_and_resize(features,
  return features_per_box

-def crop_mask_in_target_box(masks,
-                            boxes,
-                            target_boxes,
-                            output_size,
-                            sample_offset=0,
-                            use_einsum=True):
+def crop_mask_in_target_box(masks: tf.Tensor,
+                            boxes: tf.Tensor,
+                            target_boxes: tf.Tensor,
+                            output_size: int,
+                            sample_offset: float = 0.0,
+                            use_einsum: bool = True) -> tf.Tensor:
"""Crop masks in target boxes. """Crop masks in target boxes.
Args: Args:
@@ -515,7 +520,9 @@ def crop_mask_in_target_box(masks,
  return cropped_masks

-def nearest_upsampling(data, scale, use_keras_layer=False):
+def nearest_upsampling(data: tf.Tensor,
+                       scale: int,
+                       use_keras_layer: bool = False) -> tf.Tensor:
"""Nearest neighbor upsampling implementation. """Nearest neighbor upsampling implementation.
Args: Args:
@@ -542,3 +549,54 @@ def nearest_upsampling(data, scale, use_keras_layer=False):
  data = tf.tile(
      tf.reshape(data, [bs, h, 1, w, 1, c]), [1, 1, scale, 1, scale, 1])
  return tf.reshape(data, [bs, h * scale, w * scale, c])
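[Editor's sketch, not part of the diff: with the default use_keras_layer=False
path shown above, the tile/reshape pair turns each input pixel into a
scale x scale block; the values below are hypothetical.]

    x = tf.reshape(tf.range(4.), [1, 2, 2, 1])  # a 2x2, 1-channel image
    y = nearest_upsampling(x, scale=2)          # shape (1, 4, 4, 1)
    # y[0, :, :, 0] ==
    # [[0., 0., 1., 1.],
    #  [0., 0., 1., 1.],
    #  [2., 2., 3., 3.],
    #  [2., 2., 3., 3.]]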
def _gather_rows_from_matrix(input_matrix: tf.Tensor,
                             row_indices: tf.Tensor) -> tf.Tensor:
  """Gathers rows from the input matrix (2-D tensor).

  This operation is equivalent to tf.gather(input_matrix, row_indices), but
  is implemented via sparse matrix multiplication.

  Args:
    input_matrix: A 2-D tensor in shape (input_h, input_w) from which to
      gather values. The shape must be 2-D, since sparse matrix
      multiplication is currently only supported on 2-D matrices.
    row_indices: A 1-D int tensor in shape (output_h) which stores the row
      indices of the input.

  Returns:
    A tensor in shape (output_h, input_w) which stores the gathered rows.
  """
  input_matrix_shape = input_matrix.get_shape().as_list()
  if len(input_matrix_shape) != 2:
    raise ValueError(
        'Expected the input_matrix tensor (input_h, input_w) to have '
        'rank == 2, was: %s' % input_matrix_shape)
  row_indices_shape = row_indices.get_shape().as_list()
  if len(row_indices_shape) != 1:
    raise ValueError(
        'Expected the row_indices tensor (output_h) to have rank == 1, '
        'was: %s' % row_indices_shape)

  # One-hot selector matrix: (output_h, input_h).
  indices_one_hot = tf.one_hot(
      row_indices, depth=input_matrix_shape[0], dtype=input_matrix.dtype)
  # Matrix multiplication: (output_h, input_h) x (input_h, input_w)
  # -> (output_h, input_w)
  return tf.linalg.matmul(indices_one_hot, input_matrix, a_is_sparse=True)
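[Editor's sketch, not part of the diff: the one-hot matmul above is
row-for-row equivalent to tf.gather; `m` and `idx` are hypothetical example
values.]

    m = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
    idx = tf.constant([2, 0])
    _gather_rows_from_matrix(m, idx)
    # -> [[5., 6.], [1., 2.]], the same result as tf.gather(m, idx)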

def bilinear_resize_to_bbox(images: tf.Tensor, bbox: tf.Tensor,
                            output_size: tf.Tensor) -> tf.Tensor:
  # TODO(b/241944792): Implement in follow-up CLs
  raise NotImplementedError


def bilinear_resize_with_crop_and_pad(images: tf.Tensor, *,
                                      rescale_size: tf.Tensor,
                                      crop_offset: tf.Tensor,
                                      crop_size: tf.Tensor,
                                      output_size: tf.Tensor) -> tf.Tensor:
  # TODO(b/241944792): Implement in follow-up CLs
  raise NotImplementedError