Commit b1d0bf77 authored by Yu-hui Chen's avatar Yu-hui Chen Committed by TF Object Detection Team
Browse files

Updated the MoveNet.Multipose model postprocessing logic so that it avoids

using 5D tensors, which cause issues when the model is converted to TFLite.

PiperOrigin-RevId: 398301856
parent 19d2e2ac
...@@ -807,62 +807,58 @@ def _gaussian_weighted_map_const_multi( ...@@ -807,62 +807,58 @@ def _gaussian_weighted_map_const_multi(
y-coordinate of each pixel grid. y-coordinate of each pixel grid.
x_grid: A float tensor with shape [height, width] representing the x_grid: A float tensor with shape [height, width] representing the
x-coordinate of each pixel grid. x-coordinate of each pixel grid.
heatmap: A float tensor with shape [batch_size, height, width, heatmap: A float tensor with shape [height, width, num_keypoints]
num_keypoints] representing the heatmap to be rescored. representing the heatmap to be rescored.
points_y: A float tensor with shape [batch_size, num_instances, points_y: A float tensor with shape [num_instances, num_keypoints]
num_keypoints] representing the y coordinates of the target points for representing the y coordinates of the target points for each channel.
each channel. points_x: A float tensor with shape [num_instances, num_keypoints]
points_x: A float tensor with shape [batch_size, num_instances, representing the x coordinates of the target points for each channel.
num_keypoints] representing the x coordinates of the target points for boxes: A tensor of shape [num_instances, 4] with predicted bounding boxes
each channel. for each instance, expressed in the output coordinate frame.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
gaussian_denom_ratio: A constant used in the above formula that determines gaussian_denom_ratio: A constant used in the above formula that determines
the denominator of the Gaussian kernel. the denominator of the Gaussian kernel.
Returns: Returns:
A float tensor with shape [batch_size, height, width, channel] representing A float tensor with shape [height, width, channel] representing
the rescored heatmap. the rescored heatmap.
""" """
batch_size, num_instances, _ = _get_shape(boxes, 3) num_instances, _ = _get_shape(boxes, 2)
_, height, width, num_keypoints = _get_shape(heatmap, 4) height, width, num_keypoints = _get_shape(heatmap, 3)
# [batch_size, height, width, num_instances, num_keypoints]. # [height, width, num_instances, num_keypoints].
# Note that we intentionally avoid using tf.newaxis as TfLite converter # Note that we intentionally avoid using tf.newaxis as TfLite converter
# doesn't like it. # doesn't like it.
y_diff = ( y_diff = (
tf.reshape(y_grid, [1, height, width, 1, 1]) - tf.reshape(y_grid, [height, width, 1, 1]) -
tf.reshape(points_y, [batch_size, 1, 1, num_instances, num_keypoints])) tf.reshape(points_y, [1, 1, num_instances, num_keypoints]))
x_diff = ( x_diff = (
tf.reshape(x_grid, [1, height, width, 1, 1]) - tf.reshape(x_grid, [height, width, 1, 1]) -
tf.reshape(points_x, [batch_size, 1, 1, num_instances, num_keypoints])) tf.reshape(points_x, [1, 1, num_instances, num_keypoints]))
distance_square = y_diff**2 + x_diff**2 distance_square = y_diff * y_diff + x_diff * x_diff
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=2) y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=1)
# Make the mask with all 1.0 in the box regions. # Make the mask with all 1.0 in the box regions.
# Shape: [batch_size, height, width, num_instances] # Shape: [height, width, num_instances]
in_boxes = tf.math.logical_and( in_boxes = tf.math.logical_and(
tf.math.logical_and( tf.math.logical_and(
tf.reshape(y_grid, [1, height, width, 1]) >= tf.reshape( tf.reshape(y_grid, [height, width, 1]) >= tf.reshape(
y_min, [batch_size, 1, 1, num_instances]), y_min, [1, 1, num_instances]),
tf.reshape(y_grid, [1, height, width, 1]) < tf.reshape( tf.reshape(y_grid, [height, width, 1]) < tf.reshape(
y_max, [batch_size, 1, 1, num_instances])), y_max, [1, 1, num_instances])),
tf.math.logical_and( tf.math.logical_and(
tf.reshape(x_grid, [1, height, width, 1]) >= tf.reshape( tf.reshape(x_grid, [height, width, 1]) >= tf.reshape(
x_min, [batch_size, 1, 1, num_instances]), x_min, [1, 1, num_instances]),
tf.reshape(x_grid, [1, height, width, 1]) < tf.reshape( tf.reshape(x_grid, [height, width, 1]) < tf.reshape(
x_max, [batch_size, 1, 1, num_instances]))) x_max, [1, 1, num_instances])))
in_boxes = tf.cast(in_boxes, dtype=tf.float32) in_boxes = tf.cast(in_boxes, dtype=tf.float32)
gaussian_denom = tf.cast( gaussian_denom = tf.cast(
tf.minimum(height, width), dtype=tf.float32) * gaussian_denom_ratio tf.minimum(height, width), dtype=tf.float32) * gaussian_denom_ratio
# shape: [batch_size, height, width, num_instances, num_keypoints] # shape: [height, width, num_instances, num_keypoints]
gaussian_map = tf.exp((-1 * distance_square) / gaussian_denom) gaussian_map = tf.exp((-1 * distance_square) / gaussian_denom)
return tf.expand_dims( return tf.expand_dims(heatmap, axis=2) * gaussian_map * tf.reshape(
heatmap, axis=3) * gaussian_map * tf.reshape( in_boxes, [height, width, num_instances, 1])
in_boxes, [batch_size, height, width, num_instances, 1])
def prediction_tensors_to_multi_instance_kpts( def prediction_tensors_to_multi_instance_kpts(
...@@ -876,62 +872,59 @@ def prediction_tensors_to_multi_instance_kpts( ...@@ -876,62 +872,59 @@ def prediction_tensors_to_multi_instance_kpts(
have an additional 'num_instances' dimension for multi-instance prediction. have an additional 'num_instances' dimension for multi-instance prediction.
Args: Args:
keypoint_heatmap_predictions: A float tensor of shape [batch_size, height, keypoint_heatmap_predictions: A float tensor of shape [height,
width, num_instances, num_keypoints] representing the per-keypoint and width, num_instances, num_keypoints] representing the per-keypoint and
per-instance heatmaps which is used for finding the best keypoint per-instance heatmaps which is used for finding the best keypoint
candidate locations. candidate locations.
keypoint_heatmap_offsets: A float tensor of shape [batch_size, height, keypoint_heatmap_offsets: A float tensor of shape [height,
width, 2 * num_keypoints] representing the per-keypoint offsets. width, 2 * num_keypoints] representing the per-keypoint offsets.
keypoint_score_heatmap: (optional) A float tensor of shape [batch_size, keypoint_score_heatmap: (optional) A float tensor of shape [height, width,
height, width, num_keypoints] representing the heatmap num_keypoints] representing the heatmap which is used for reporting the
which is used for reporting the confidence scores. If not provided, then confidence scores. If not provided, then the values in the
the values in the keypoint_heatmap_predictions will be used. keypoint_heatmap_predictions will be used.
Returns: Returns:
keypoint_candidates: A tensor of shape keypoint_candidates: A tensor of shape
[batch_size, max_candidates, num_keypoints, 2] holding the [1, max_candidates, num_keypoints, 2] holding the
location of keypoint candidates in [y, x] format (expressed in absolute location of keypoint candidates in [y, x] format (expressed in absolute
coordinates in the output coordinate frame). coordinates in the output coordinate frame).
keypoint_scores: A float tensor of shape keypoint_scores: A float tensor of shape
[batch_size, max_candidates, num_keypoints] with the scores for each [1, max_candidates, num_keypoints] with the scores for each
keypoint candidate. The scores come directly from the heatmap predictions. keypoint candidate. The scores come directly from the heatmap predictions.
""" """
batch_size, height, width, num_instances, num_keypoints = _get_shape( height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 5) keypoint_heatmap_predictions, 4)
# [batch_size, height * width, num_instances * num_keypoints]. # [height * width, num_instances * num_keypoints].
feature_map_flattened = tf.reshape( feature_map_flattened = tf.reshape(
keypoint_heatmap_predictions, keypoint_heatmap_predictions,
[batch_size, -1, num_instances * num_keypoints]) [-1, num_instances * num_keypoints])
# [batch_size, num_instances * num_keypoints]. # [num_instances * num_keypoints].
peak_flat_indices = tf.math.argmax( peak_flat_indices = tf.math.argmax(
feature_map_flattened, axis=1, output_type=tf.dtypes.int32) feature_map_flattened, axis=0, output_type=tf.dtypes.int32)
# Get x and y indices corresponding to the top indices in the flat array. # Get x and y indices corresponding to the top indices in the flat array.
y_indices, x_indices = ( y_indices, x_indices = (
row_col_indices_from_flattened_indices(peak_flat_indices, width)) row_col_indices_from_flattened_indices(peak_flat_indices, width))
# [batch_size * num_instances * num_keypoints]. # [num_instances * num_keypoints].
y_indices = tf.reshape(y_indices, [-1]) y_indices = tf.reshape(y_indices, [-1])
x_indices = tf.reshape(x_indices, [-1]) x_indices = tf.reshape(x_indices, [-1])
# Prepare the indices to gather the offsets from the keypoint_heatmap_offsets. # Prepare the indices to gather the offsets from the keypoint_heatmap_offsets.
batch_idx = _multi_range(
limit=batch_size, value_repetitions=num_keypoints * num_instances)
kpts_idx = _multi_range( kpts_idx = _multi_range(
limit=num_keypoints, value_repetitions=1, limit=num_keypoints, value_repetitions=1,
range_repetitions=batch_size * num_instances) range_repetitions=num_instances)
combined_indices = tf.stack([ combined_indices = tf.stack([
batch_idx,
y_indices, y_indices,
x_indices, x_indices,
kpts_idx kpts_idx
], axis=1) ], axis=1)
keypoint_heatmap_offsets = tf.reshape( keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, 2]) keypoint_heatmap_offsets, [height, width, num_keypoints, 2])
# Retrieve the keypoint offsets: shape: # Retrieve the keypoint offsets: shape:
# [batch_size * num_instance * num_keypoints, 2]. # [num_instance * num_keypoints, 2].
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets, selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices) combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1) y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
...@@ -941,15 +934,15 @@ def prediction_tensors_to_multi_instance_kpts( ...@@ -941,15 +934,15 @@ def prediction_tensors_to_multi_instance_kpts(
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0) tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2) ], axis=2)
keypoint_candidates = tf.reshape( keypoint_candidates = tf.reshape(
keypoint_candidates, [batch_size, num_instances, num_keypoints, 2]) keypoint_candidates, [num_instances, num_keypoints, 2])
if keypoint_score_heatmap is None: if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd( keypoint_scores = tf.gather_nd(
tf.reduce_max(keypoint_heatmap_predictions, axis=3), combined_indices) tf.reduce_max(keypoint_heatmap_predictions, axis=2), combined_indices)
else: else:
keypoint_scores = tf.gather_nd(keypoint_score_heatmap, combined_indices) keypoint_scores = tf.gather_nd(keypoint_score_heatmap, combined_indices)
return keypoint_candidates, tf.reshape( return tf.expand_dims(keypoint_candidates, axis=0), tf.reshape(
keypoint_scores, [batch_size, num_instances, num_keypoints]) keypoint_scores, [1, num_instances, num_keypoints])
def prediction_to_keypoints_argmax( def prediction_to_keypoints_argmax(
...@@ -963,7 +956,9 @@ def prediction_to_keypoints_argmax( ...@@ -963,7 +956,9 @@ def prediction_to_keypoints_argmax(
This is a different implementation of the original keypoint postprocessing This is a different implementation of the original keypoint postprocessing
function such that it avoids using topk op (replaced by argmax) as it runs function such that it avoids using topk op (replaced by argmax) as it runs
much slower in the browser. much slower in the browser. Note that in this function, we assume the
batch_size to be 1 to avoid using 5D tensors which cause issues when
converting to the TfLite model.
Args: Args:
prediction_dict: a dictionary holding predicted tensors, returned from the prediction_dict: a dictionary holding predicted tensors, returned from the
...@@ -992,13 +987,13 @@ def prediction_to_keypoints_argmax( ...@@ -992,13 +987,13 @@ def prediction_to_keypoints_argmax(
Raises: Raises:
ValueError: if the candidate_ranking_mode is not supported. ValueError: if the candidate_ranking_mode is not supported.
""" """
keypoint_heatmap = tf.nn.sigmoid(prediction_dict[ keypoint_heatmap = tf.squeeze(tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]) get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]), axis=0)
keypoint_offset = prediction_dict[ keypoint_offset = tf.squeeze(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1] get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1], axis=0)
keypoint_regression = prediction_dict[ keypoint_regression = tf.squeeze(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1] get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1], axis=0)
batch_size, height, width, num_keypoints = _get_shape(keypoint_heatmap, 4) height, width, num_keypoints = _get_shape(keypoint_heatmap, 3)
# Create the y,x grids: [height, width] # Create the y,x grids: [height, width]
(y_grid, x_grid) = ta_utils.image_shape_to_grids(height, width) (y_grid, x_grid) = ta_utils.image_shape_to_grids(height, width)
...@@ -1006,7 +1001,6 @@ def prediction_to_keypoints_argmax( ...@@ -1006,7 +1001,6 @@ def prediction_to_keypoints_argmax(
# Prepare the indices to retrieve the information from object centers. # Prepare the indices to retrieve the information from object centers.
num_instances = _get_shape(object_y_indices, 2)[1] num_instances = _get_shape(object_y_indices, 2)[1]
combined_obj_indices = tf.stack([ combined_obj_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_instances),
tf.reshape(object_y_indices, [-1]), tf.reshape(object_y_indices, [-1]),
tf.reshape(object_x_indices, [-1]) tf.reshape(object_x_indices, [-1])
], axis=1) ], axis=1)
...@@ -1015,24 +1009,24 @@ def prediction_to_keypoints_argmax( ...@@ -1015,24 +1009,24 @@ def prediction_to_keypoints_argmax(
selected_regression_flat = tf.gather_nd( selected_regression_flat = tf.gather_nd(
keypoint_regression, combined_obj_indices) keypoint_regression, combined_obj_indices)
selected_regression = tf.reshape( selected_regression = tf.reshape(
selected_regression_flat, [batch_size, num_instances, num_keypoints, 2]) selected_regression_flat, [num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=3) (y_reg, x_reg) = tf.unstack(selected_regression, axis=2)
# shape: [batch_size, num_instances, num_keypoints]. # shape: [num_instances, num_keypoints].
y_regressed = tf.cast( y_regressed = tf.cast(
tf.reshape(object_y_indices, [batch_size, num_instances, 1]), tf.reshape(object_y_indices, [num_instances, 1]),
dtype=tf.float32) + y_reg dtype=tf.float32) + y_reg
x_regressed = tf.cast( x_regressed = tf.cast(
tf.reshape(object_x_indices, [batch_size, num_instances, 1]), tf.reshape(object_x_indices, [num_instances, 1]),
dtype=tf.float32) + x_reg dtype=tf.float32) + x_reg
if kp_params.candidate_ranking_mode == 'gaussian_weighted_const': if kp_params.candidate_ranking_mode == 'gaussian_weighted_const':
rescored_heatmap = _gaussian_weighted_map_const_multi( rescored_heatmap = _gaussian_weighted_map_const_multi(
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed, boxes, y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed,
kp_params.gaussian_denom_ratio) tf.squeeze(boxes, axis=0), kp_params.gaussian_denom_ratio)
# shape: [batch_size, height, width, num_keypoints]. # shape: [height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=3) keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=2)
else: else:
raise ValueError( raise ValueError(
'Unsupported ranking mode in the multipose no topk method: %s' % 'Unsupported ranking mode in the multipose no topk method: %s' %
......
...@@ -811,29 +811,29 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase): ...@@ -811,29 +811,29 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
{'provide_keypoint_score': False}) {'provide_keypoint_score': False})
def test_prediction_to_multi_instance_keypoints(self, provide_keypoint_score): def test_prediction_to_multi_instance_keypoints(self, provide_keypoint_score):
image_size = (9, 9) image_size = (9, 9)
keypoint_heatmap_np = np.zeros((1, image_size[0], image_size[1], 3, 4), keypoint_heatmap_np = np.zeros((image_size[0], image_size[1], 3, 4),
dtype=np.float32) dtype=np.float32)
# Instance 0. # Instance 0.
keypoint_heatmap_np[0, 1, 1, 0, 0] = 0.9 keypoint_heatmap_np[1, 1, 0, 0] = 0.9
keypoint_heatmap_np[0, 1, 7, 0, 1] = 0.9 keypoint_heatmap_np[1, 7, 0, 1] = 0.9
keypoint_heatmap_np[0, 7, 1, 0, 2] = 0.9 keypoint_heatmap_np[7, 1, 0, 2] = 0.9
keypoint_heatmap_np[0, 7, 7, 0, 3] = 0.9 keypoint_heatmap_np[7, 7, 0, 3] = 0.9
# Instance 1. # Instance 1.
keypoint_heatmap_np[0, 2, 2, 1, 0] = 0.8 keypoint_heatmap_np[2, 2, 1, 0] = 0.8
keypoint_heatmap_np[0, 2, 8, 1, 1] = 0.8 keypoint_heatmap_np[2, 8, 1, 1] = 0.8
keypoint_heatmap_np[0, 8, 2, 1, 2] = 0.8 keypoint_heatmap_np[8, 2, 1, 2] = 0.8
keypoint_heatmap_np[0, 8, 8, 1, 3] = 0.8 keypoint_heatmap_np[8, 8, 1, 3] = 0.8
keypoint_offset_np = np.zeros((1, image_size[0], image_size[1], 8), keypoint_offset_np = np.zeros((image_size[0], image_size[1], 8),
dtype=np.float32) dtype=np.float32)
keypoint_offset_np[0, 1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] keypoint_offset_np[1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0] keypoint_offset_np[1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0] keypoint_offset_np[7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[0, 7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5] keypoint_offset_np[7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[0, 2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] keypoint_offset_np[2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0] keypoint_offset_np[2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0] keypoint_offset_np[8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[0, 8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3] keypoint_offset_np[8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
def graph_fn(): def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32) keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
...@@ -844,7 +844,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase): ...@@ -844,7 +844,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
cnma.prediction_tensors_to_multi_instance_kpts( cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap, keypoint_heatmap,
keypoint_offset, keypoint_offset,
tf.reduce_max(keypoint_heatmap, axis=3))) tf.reduce_max(keypoint_heatmap, axis=2)))
else: else:
(keypoint_cands, keypoint_scores) = ( (keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts( cnma.prediction_tensors_to_multi_instance_kpts(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment