Commit b1d0bf77 authored by Yu-hui Chen's avatar Yu-hui Chen Committed by TF Object Detection Team
Browse files

Updated the MoveNet.Multipose model postprocessing logic so that it avoids

using 5D tensors, which cause issues when the model is converted to TFLite.

PiperOrigin-RevId: 398301856
parent 19d2e2ac
......@@ -807,62 +807,58 @@ def _gaussian_weighted_map_const_multi(
y-coordinate of each pixel grid.
x_grid: A float tensor with shape [height, width] representing the
x-coordinate of each pixel grid.
heatmap: A float tensor with shape [batch_size, height, width,
num_keypoints] representing the heatmap to be rescored.
points_y: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the y coordinates of the target points for
each channel.
points_x: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the x coordinates of the target points for
each channel.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
heatmap: A float tensor with shape [height, width, num_keypoints]
representing the heatmap to be rescored.
points_y: A float tensor with shape [num_instances, num_keypoints]
representing the y coordinates of the target points for each channel.
points_x: A float tensor with shape [num_instances, num_keypoints]
representing the x coordinates of the target points for each channel.
boxes: A tensor of shape [num_instances, 4] with predicted bounding boxes
for each instance, expressed in the output coordinate frame.
gaussian_denom_ratio: A constant used in the above formula that determines
the denominator of the Gaussian kernel.
Returns:
A float tensor with shape [batch_size, height, width, channel] representing
A float tensor with shape [height, width, channel] representing
the rescored heatmap.
"""
batch_size, num_instances, _ = _get_shape(boxes, 3)
_, height, width, num_keypoints = _get_shape(heatmap, 4)
num_instances, _ = _get_shape(boxes, 2)
height, width, num_keypoints = _get_shape(heatmap, 3)
# [batch_size, height, width, num_instances, num_keypoints].
# [height, width, num_instances, num_keypoints].
# Note that we intentionally avoid using tf.newaxis as TfLite converter
# doesn't like it.
y_diff = (
tf.reshape(y_grid, [1, height, width, 1, 1]) -
tf.reshape(points_y, [batch_size, 1, 1, num_instances, num_keypoints]))
tf.reshape(y_grid, [height, width, 1, 1]) -
tf.reshape(points_y, [1, 1, num_instances, num_keypoints]))
x_diff = (
tf.reshape(x_grid, [1, height, width, 1, 1]) -
tf.reshape(points_x, [batch_size, 1, 1, num_instances, num_keypoints]))
distance_square = y_diff**2 + x_diff**2
tf.reshape(x_grid, [height, width, 1, 1]) -
tf.reshape(points_x, [1, 1, num_instances, num_keypoints]))
distance_square = y_diff * y_diff + x_diff * x_diff
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=2)
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=1)
# Make the mask with all 1.0 in the box regions.
# Shape: [batch_size, height, width, num_instances]
# Shape: [height, width, num_instances]
in_boxes = tf.math.logical_and(
tf.math.logical_and(
tf.reshape(y_grid, [1, height, width, 1]) >= tf.reshape(
y_min, [batch_size, 1, 1, num_instances]),
tf.reshape(y_grid, [1, height, width, 1]) < tf.reshape(
y_max, [batch_size, 1, 1, num_instances])),
tf.reshape(y_grid, [height, width, 1]) >= tf.reshape(
y_min, [1, 1, num_instances]),
tf.reshape(y_grid, [height, width, 1]) < tf.reshape(
y_max, [1, 1, num_instances])),
tf.math.logical_and(
tf.reshape(x_grid, [1, height, width, 1]) >= tf.reshape(
x_min, [batch_size, 1, 1, num_instances]),
tf.reshape(x_grid, [1, height, width, 1]) < tf.reshape(
x_max, [batch_size, 1, 1, num_instances])))
tf.reshape(x_grid, [height, width, 1]) >= tf.reshape(
x_min, [1, 1, num_instances]),
tf.reshape(x_grid, [height, width, 1]) < tf.reshape(
x_max, [1, 1, num_instances])))
in_boxes = tf.cast(in_boxes, dtype=tf.float32)
gaussian_denom = tf.cast(
tf.minimum(height, width), dtype=tf.float32) * gaussian_denom_ratio
# shape: [batch_size, height, width, num_instances, num_keypoints]
# shape: [height, width, num_instances, num_keypoints]
gaussian_map = tf.exp((-1 * distance_square) / gaussian_denom)
return tf.expand_dims(
heatmap, axis=3) * gaussian_map * tf.reshape(
in_boxes, [batch_size, height, width, num_instances, 1])
return tf.expand_dims(heatmap, axis=2) * gaussian_map * tf.reshape(
in_boxes, [height, width, num_instances, 1])
def prediction_tensors_to_multi_instance_kpts(
......@@ -876,62 +872,59 @@ def prediction_tensors_to_multi_instance_kpts(
have an additional 'num_instances' dimension for multi-instance prediction.
Args:
keypoint_heatmap_predictions: A float tensor of shape [batch_size, height,
keypoint_heatmap_predictions: A float tensor of shape [height,
width, num_instances, num_keypoints] representing the per-keypoint and
per-instance heatmaps which is used for finding the best keypoint
candidate locations.
keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
keypoint_heatmap_offsets: A float tensor of shape [height,
width, 2 * num_keypoints] representing the per-keypoint offsets.
keypoint_score_heatmap: (optional) A float tensor of shape [batch_size,
height, width, num_keypoints] representing the heatmap
which is used for reporting the confidence scores. If not provided, then
the values in the keypoint_heatmap_predictions will be used.
keypoint_score_heatmap: (optional) A float tensor of shape [height, width,
num_keypoints] representing the heatmap which is used for reporting the
confidence scores. If not provided, then the values in the
keypoint_heatmap_predictions will be used.
Returns:
keypoint_candidates: A tensor of shape
[batch_size, max_candidates, num_keypoints, 2] holding the
[1, max_candidates, num_keypoints, 2] holding the
location of keypoint candidates in [y, x] format (expressed in absolute
coordinates in the output coordinate frame).
keypoint_scores: A float tensor of shape
[batch_size, max_candidates, num_keypoints] with the scores for each
[1, max_candidates, num_keypoints] with the scores for each
keypoint candidate. The scores come directly from the heatmap predictions.
"""
batch_size, height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 5)
height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 4)
# [batch_size, height * width, num_instances * num_keypoints].
# [height * width, num_instances * num_keypoints].
feature_map_flattened = tf.reshape(
keypoint_heatmap_predictions,
[batch_size, -1, num_instances * num_keypoints])
[-1, num_instances * num_keypoints])
# [batch_size, num_instances * num_keypoints].
# [num_instances * num_keypoints].
peak_flat_indices = tf.math.argmax(
feature_map_flattened, axis=1, output_type=tf.dtypes.int32)
feature_map_flattened, axis=0, output_type=tf.dtypes.int32)
# Get x and y indices corresponding to the top indices in the flat array.
y_indices, x_indices = (
row_col_indices_from_flattened_indices(peak_flat_indices, width))
# [batch_size * num_instances * num_keypoints].
# [num_instances * num_keypoints].
y_indices = tf.reshape(y_indices, [-1])
x_indices = tf.reshape(x_indices, [-1])
# Prepare the indices to gather the offsets from the keypoint_heatmap_offsets.
batch_idx = _multi_range(
limit=batch_size, value_repetitions=num_keypoints * num_instances)
kpts_idx = _multi_range(
limit=num_keypoints, value_repetitions=1,
range_repetitions=batch_size * num_instances)
range_repetitions=num_instances)
combined_indices = tf.stack([
batch_idx,
y_indices,
x_indices,
kpts_idx
], axis=1)
keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, 2])
keypoint_heatmap_offsets, [height, width, num_keypoints, 2])
# Retrieve the keypoint offsets: shape:
# [batch_size * num_instance * num_keypoints, 2].
# [num_instance * num_keypoints, 2].
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
......@@ -941,15 +934,15 @@ def prediction_tensors_to_multi_instance_kpts(
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2)
keypoint_candidates = tf.reshape(
keypoint_candidates, [batch_size, num_instances, num_keypoints, 2])
keypoint_candidates, [num_instances, num_keypoints, 2])
if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd(
tf.reduce_max(keypoint_heatmap_predictions, axis=3), combined_indices)
tf.reduce_max(keypoint_heatmap_predictions, axis=2), combined_indices)
else:
keypoint_scores = tf.gather_nd(keypoint_score_heatmap, combined_indices)
return keypoint_candidates, tf.reshape(
keypoint_scores, [batch_size, num_instances, num_keypoints])
return tf.expand_dims(keypoint_candidates, axis=0), tf.reshape(
keypoint_scores, [1, num_instances, num_keypoints])
def prediction_to_keypoints_argmax(
......@@ -963,7 +956,9 @@ def prediction_to_keypoints_argmax(
This is a different implementation of the original keypoint postprocessing
function such that it avoids using topk op (replaced by argmax) as it runs
much slower in the browser.
much slower in the browser. Note that in this function, we assume the
batch_size to be 1 to avoid using 5D tensors which cause issues when
converting to the TfLite model.
Args:
prediction_dict: a dictionary holding predicted tensors, returned from the
......@@ -992,13 +987,13 @@ def prediction_to_keypoints_argmax(
Raises:
ValueError: if the candidate_ranking_mode is not supported.
"""
keypoint_heatmap = tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1])
keypoint_offset = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
keypoint_regression = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
batch_size, height, width, num_keypoints = _get_shape(keypoint_heatmap, 4)
keypoint_heatmap = tf.squeeze(tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]), axis=0)
keypoint_offset = tf.squeeze(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1], axis=0)
keypoint_regression = tf.squeeze(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1], axis=0)
height, width, num_keypoints = _get_shape(keypoint_heatmap, 3)
# Create the y,x grids: [height, width]
(y_grid, x_grid) = ta_utils.image_shape_to_grids(height, width)
......@@ -1006,7 +1001,6 @@ def prediction_to_keypoints_argmax(
# Prepare the indices to retrieve the information from object centers.
num_instances = _get_shape(object_y_indices, 2)[1]
combined_obj_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_instances),
tf.reshape(object_y_indices, [-1]),
tf.reshape(object_x_indices, [-1])
], axis=1)
......@@ -1015,24 +1009,24 @@ def prediction_to_keypoints_argmax(
selected_regression_flat = tf.gather_nd(
keypoint_regression, combined_obj_indices)
selected_regression = tf.reshape(
selected_regression_flat, [batch_size, num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=3)
selected_regression_flat, [num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=2)
# shape: [batch_size, num_instances, num_keypoints].
# shape: [num_instances, num_keypoints].
y_regressed = tf.cast(
tf.reshape(object_y_indices, [batch_size, num_instances, 1]),
tf.reshape(object_y_indices, [num_instances, 1]),
dtype=tf.float32) + y_reg
x_regressed = tf.cast(
tf.reshape(object_x_indices, [batch_size, num_instances, 1]),
tf.reshape(object_x_indices, [num_instances, 1]),
dtype=tf.float32) + x_reg
if kp_params.candidate_ranking_mode == 'gaussian_weighted_const':
rescored_heatmap = _gaussian_weighted_map_const_multi(
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed, boxes,
kp_params.gaussian_denom_ratio)
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed,
tf.squeeze(boxes, axis=0), kp_params.gaussian_denom_ratio)
# shape: [batch_size, height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=3)
# shape: [height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=2)
else:
raise ValueError(
'Unsupported ranking mode in the multipose no topk method: %s' %
......
......@@ -811,29 +811,29 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
{'provide_keypoint_score': False})
def test_prediction_to_multi_instance_keypoints(self, provide_keypoint_score):
image_size = (9, 9)
keypoint_heatmap_np = np.zeros((1, image_size[0], image_size[1], 3, 4),
keypoint_heatmap_np = np.zeros((image_size[0], image_size[1], 3, 4),
dtype=np.float32)
# Instance 0.
keypoint_heatmap_np[0, 1, 1, 0, 0] = 0.9
keypoint_heatmap_np[0, 1, 7, 0, 1] = 0.9
keypoint_heatmap_np[0, 7, 1, 0, 2] = 0.9
keypoint_heatmap_np[0, 7, 7, 0, 3] = 0.9
keypoint_heatmap_np[1, 1, 0, 0] = 0.9
keypoint_heatmap_np[1, 7, 0, 1] = 0.9
keypoint_heatmap_np[7, 1, 0, 2] = 0.9
keypoint_heatmap_np[7, 7, 0, 3] = 0.9
# Instance 1.
keypoint_heatmap_np[0, 2, 2, 1, 0] = 0.8
keypoint_heatmap_np[0, 2, 8, 1, 1] = 0.8
keypoint_heatmap_np[0, 8, 2, 1, 2] = 0.8
keypoint_heatmap_np[0, 8, 8, 1, 3] = 0.8
keypoint_heatmap_np[2, 2, 1, 0] = 0.8
keypoint_heatmap_np[2, 8, 1, 1] = 0.8
keypoint_heatmap_np[8, 2, 1, 2] = 0.8
keypoint_heatmap_np[8, 8, 1, 3] = 0.8
keypoint_offset_np = np.zeros((1, image_size[0], image_size[1], 8),
keypoint_offset_np = np.zeros((image_size[0], image_size[1], 8),
dtype=np.float32)
keypoint_offset_np[0, 1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[0, 7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[0, 2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[0, 8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
keypoint_offset_np[1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
......@@ -844,7 +844,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset,
tf.reduce_max(keypoint_heatmap, axis=3)))
tf.reduce_max(keypoint_heatmap, axis=2)))
else:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment