Commit b1d0bf77 authored by Yu-hui Chen's avatar Yu-hui Chen Committed by TF Object Detection Team
Browse files

Updated the MoveNet.Multipose model postprocessing logic so that it avoids

using 5D tensors, which cause issues when the model is converted to TFLite.

PiperOrigin-RevId: 398301856
parent 19d2e2ac
......@@ -807,62 +807,58 @@ def _gaussian_weighted_map_const_multi(
y-coordinate of each pixel grid.
x_grid: A float tensor with shape [height, width] representing the
x-coordinate of each pixel grid.
heatmap: A float tensor with shape [batch_size, height, width,
num_keypoints] representing the heatmap to be rescored.
points_y: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the y coordinates of the target points for
each channel.
points_x: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the x coordinates of the target points for
each channel.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
heatmap: A float tensor with shape [height, width, num_keypoints]
representing the heatmap to be rescored.
points_y: A float tensor with shape [num_instances, num_keypoints]
representing the y coordinates of the target points for each channel.
points_x: A float tensor with shape [num_instances, num_keypoints]
representing the x coordinates of the target points for each channel.
boxes: A tensor of shape [num_instances, 4] with predicted bounding boxes
for each instance, expressed in the output coordinate frame.
gaussian_denom_ratio: A constant used in the above formula that determines
the denominator of the Gaussian kernel.
Returns:
A float tensor with shape [batch_size, height, width, channel] representing
A float tensor with shape [height, width, channel] representing
the rescored heatmap.
"""
batch_size, num_instances, _ = _get_shape(boxes, 3)
_, height, width, num_keypoints = _get_shape(heatmap, 4)
num_instances, _ = _get_shape(boxes, 2)
height, width, num_keypoints = _get_shape(heatmap, 3)
# [batch_size, height, width, num_instances, num_keypoints].
# [height, width, num_instances, num_keypoints].
# Note that we intentionally avoid using tf.newaxis as TfLite converter
# doesn't like it.
y_diff = (
tf.reshape(y_grid, [1, height, width, 1, 1]) -
tf.reshape(points_y, [batch_size, 1, 1, num_instances, num_keypoints]))
tf.reshape(y_grid, [height, width, 1, 1]) -
tf.reshape(points_y, [1, 1, num_instances, num_keypoints]))
x_diff = (
tf.reshape(x_grid, [1, height, width, 1, 1]) -
tf.reshape(points_x, [batch_size, 1, 1, num_instances, num_keypoints]))
distance_square = y_diff**2 + x_diff**2
tf.reshape(x_grid, [height, width, 1, 1]) -
tf.reshape(points_x, [1, 1, num_instances, num_keypoints]))
distance_square = y_diff * y_diff + x_diff * x_diff
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=2)
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=1)
# Make the mask with all 1.0 in the box regions.
# Shape: [batch_size, height, width, num_instances]
# Shape: [height, width, num_instances]
in_boxes = tf.math.logical_and(
tf.math.logical_and(
tf.reshape(y_grid, [1, height, width, 1]) >= tf.reshape(
y_min, [batch_size, 1, 1, num_instances]),
tf.reshape(y_grid, [1, height, width, 1]) < tf.reshape(
y_max, [batch_size, 1, 1, num_instances])),
tf.reshape(y_grid, [height, width, 1]) >= tf.reshape(
y_min, [1, 1, num_instances]),
tf.reshape(y_grid, [height, width, 1]) < tf.reshape(
y_max, [1, 1, num_instances])),
tf.math.logical_and(
tf.reshape(x_grid, [1, height, width, 1]) >= tf.reshape(
x_min, [batch_size, 1, 1, num_instances]),
tf.reshape(x_grid, [1, height, width, 1]) < tf.reshape(
x_max, [batch_size, 1, 1, num_instances])))
tf.reshape(x_grid, [height, width, 1]) >= tf.reshape(
x_min, [1, 1, num_instances]),
tf.reshape(x_grid, [height, width, 1]) < tf.reshape(
x_max, [1, 1, num_instances])))
in_boxes = tf.cast(in_boxes, dtype=tf.float32)
gaussian_denom = tf.cast(
tf.minimum(height, width), dtype=tf.float32) * gaussian_denom_ratio
# shape: [batch_size, height, width, num_instances, num_keypoints]
# shape: [height, width, num_instances, num_keypoints]
gaussian_map = tf.exp((-1 * distance_square) / gaussian_denom)
return tf.expand_dims(
heatmap, axis=3) * gaussian_map * tf.reshape(
in_boxes, [batch_size, height, width, num_instances, 1])
return tf.expand_dims(heatmap, axis=2) * gaussian_map * tf.reshape(
in_boxes, [height, width, num_instances, 1])
def prediction_tensors_to_multi_instance_kpts(
......@@ -876,62 +872,59 @@ def prediction_tensors_to_multi_instance_kpts(
have an additional 'num_instances' dimension for multi-instance prediction.
Args:
keypoint_heatmap_predictions: A float tensor of shape [batch_size, height,
keypoint_heatmap_predictions: A float tensor of shape [height,
width, num_instances, num_keypoints] representing the per-keypoint and
per-instance heatmaps which is used for finding the best keypoint
candidate locations.
keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
keypoint_heatmap_offsets: A float tensor of shape [height,
width, 2 * num_keypoints] representing the per-keypoint offsets.
keypoint_score_heatmap: (optional) A float tensor of shape [batch_size,
height, width, num_keypoints] representing the heatmap
which is used for reporting the confidence scores. If not provided, then
the values in the keypoint_heatmap_predictions will be used.
keypoint_score_heatmap: (optional) A float tensor of shape [height, width,
num_keypoints] representing the heatmap which is used for reporting the
confidence scores. If not provided, then the values in the
keypoint_heatmap_predictions will be used.
Returns:
keypoint_candidates: A tensor of shape
[batch_size, max_candidates, num_keypoints, 2] holding the
[1, max_candidates, num_keypoints, 2] holding the
location of keypoint candidates in [y, x] format (expressed in absolute
coordinates in the output coordinate frame).
keypoint_scores: A float tensor of shape
[batch_size, max_candidates, num_keypoints] with the scores for each
[1, max_candidates, num_keypoints] with the scores for each
keypoint candidate. The scores come directly from the heatmap predictions.
"""
batch_size, height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 5)
height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 4)
# [batch_size, height * width, num_instances * num_keypoints].
# [height * width, num_instances * num_keypoints].
feature_map_flattened = tf.reshape(
keypoint_heatmap_predictions,
[batch_size, -1, num_instances * num_keypoints])
[-1, num_instances * num_keypoints])
# [batch_size, num_instances * num_keypoints].
# [num_instances * num_keypoints].
peak_flat_indices = tf.math.argmax(
feature_map_flattened, axis=1, output_type=tf.dtypes.int32)
feature_map_flattened, axis=0, output_type=tf.dtypes.int32)
# Get x and y indices corresponding to the top indices in the flat array.
y_indices, x_indices = (
row_col_indices_from_flattened_indices(peak_flat_indices, width))
# [batch_size * num_instances * num_keypoints].
# [num_instances * num_keypoints].
y_indices = tf.reshape(y_indices, [-1])
x_indices = tf.reshape(x_indices, [-1])
# Prepare the indices to gather the offsets from the keypoint_heatmap_offsets.
batch_idx = _multi_range(
limit=batch_size, value_repetitions=num_keypoints * num_instances)
kpts_idx = _multi_range(
limit=num_keypoints, value_repetitions=1,
range_repetitions=batch_size * num_instances)
range_repetitions=num_instances)
combined_indices = tf.stack([
batch_idx,
y_indices,
x_indices,
kpts_idx
], axis=1)
keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, 2])
keypoint_heatmap_offsets, [height, width, num_keypoints, 2])
# Retrieve the keypoint offsets: shape:
# [batch_size * num_instance * num_keypoints, 2].
# [num_instance * num_keypoints, 2].
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
......@@ -941,15 +934,15 @@ def prediction_tensors_to_multi_instance_kpts(
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2)
keypoint_candidates = tf.reshape(
keypoint_candidates, [batch_size, num_instances, num_keypoints, 2])
keypoint_candidates, [num_instances, num_keypoints, 2])
if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd(
tf.reduce_max(keypoint_heatmap_predictions, axis=3), combined_indices)
tf.reduce_max(keypoint_heatmap_predictions, axis=2), combined_indices)
else:
keypoint_scores = tf.gather_nd(keypoint_score_heatmap, combined_indices)
return keypoint_candidates, tf.reshape(
keypoint_scores, [batch_size, num_instances, num_keypoints])
return tf.expand_dims(keypoint_candidates, axis=0), tf.reshape(
keypoint_scores, [1, num_instances, num_keypoints])
def prediction_to_keypoints_argmax(
......@@ -963,7 +956,9 @@ def prediction_to_keypoints_argmax(
This is a different implementation of the original keypoint postprocessing
function such that it avoids using topk op (replaced by argmax) as it runs
much slower in the browser.
much slower in the browser. Note that in this function, we assume the
batch_size to be 1 to avoid using 5D tensors which cause issues when
converting to the TfLite model.
Args:
prediction_dict: a dictionary holding predicted tensors, returned from the
......@@ -992,13 +987,13 @@ def prediction_to_keypoints_argmax(
Raises:
ValueError: if the candidate_ranking_mode is not supported.
"""
keypoint_heatmap = tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1])
keypoint_offset = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
keypoint_regression = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
batch_size, height, width, num_keypoints = _get_shape(keypoint_heatmap, 4)
keypoint_heatmap = tf.squeeze(tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]), axis=0)
keypoint_offset = tf.squeeze(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1], axis=0)
keypoint_regression = tf.squeeze(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1], axis=0)
height, width, num_keypoints = _get_shape(keypoint_heatmap, 3)
# Create the y,x grids: [height, width]
(y_grid, x_grid) = ta_utils.image_shape_to_grids(height, width)
......@@ -1006,7 +1001,6 @@ def prediction_to_keypoints_argmax(
# Prepare the indices to retrieve the information from object centers.
num_instances = _get_shape(object_y_indices, 2)[1]
combined_obj_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_instances),
tf.reshape(object_y_indices, [-1]),
tf.reshape(object_x_indices, [-1])
], axis=1)
......@@ -1015,24 +1009,24 @@ def prediction_to_keypoints_argmax(
selected_regression_flat = tf.gather_nd(
keypoint_regression, combined_obj_indices)
selected_regression = tf.reshape(
selected_regression_flat, [batch_size, num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=3)
selected_regression_flat, [num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=2)
# shape: [batch_size, num_instances, num_keypoints].
# shape: [num_instances, num_keypoints].
y_regressed = tf.cast(
tf.reshape(object_y_indices, [batch_size, num_instances, 1]),
tf.reshape(object_y_indices, [num_instances, 1]),
dtype=tf.float32) + y_reg
x_regressed = tf.cast(
tf.reshape(object_x_indices, [batch_size, num_instances, 1]),
tf.reshape(object_x_indices, [num_instances, 1]),
dtype=tf.float32) + x_reg
if kp_params.candidate_ranking_mode == 'gaussian_weighted_const':
rescored_heatmap = _gaussian_weighted_map_const_multi(
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed, boxes,
kp_params.gaussian_denom_ratio)
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed,
tf.squeeze(boxes, axis=0), kp_params.gaussian_denom_ratio)
# shape: [batch_size, height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=3)
# shape: [height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=2)
else:
raise ValueError(
'Unsupported ranking mode in the multipose no topk method: %s' %
......
......@@ -811,29 +811,29 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
{'provide_keypoint_score': False})
def test_prediction_to_multi_instance_keypoints(self, provide_keypoint_score):
image_size = (9, 9)
keypoint_heatmap_np = np.zeros((1, image_size[0], image_size[1], 3, 4),
keypoint_heatmap_np = np.zeros((image_size[0], image_size[1], 3, 4),
dtype=np.float32)
# Instance 0.
keypoint_heatmap_np[0, 1, 1, 0, 0] = 0.9
keypoint_heatmap_np[0, 1, 7, 0, 1] = 0.9
keypoint_heatmap_np[0, 7, 1, 0, 2] = 0.9
keypoint_heatmap_np[0, 7, 7, 0, 3] = 0.9
keypoint_heatmap_np[1, 1, 0, 0] = 0.9
keypoint_heatmap_np[1, 7, 0, 1] = 0.9
keypoint_heatmap_np[7, 1, 0, 2] = 0.9
keypoint_heatmap_np[7, 7, 0, 3] = 0.9
# Instance 1.
keypoint_heatmap_np[0, 2, 2, 1, 0] = 0.8
keypoint_heatmap_np[0, 2, 8, 1, 1] = 0.8
keypoint_heatmap_np[0, 8, 2, 1, 2] = 0.8
keypoint_heatmap_np[0, 8, 8, 1, 3] = 0.8
keypoint_heatmap_np[2, 2, 1, 0] = 0.8
keypoint_heatmap_np[2, 8, 1, 1] = 0.8
keypoint_heatmap_np[8, 2, 1, 2] = 0.8
keypoint_heatmap_np[8, 8, 1, 3] = 0.8
keypoint_offset_np = np.zeros((1, image_size[0], image_size[1], 8),
keypoint_offset_np = np.zeros((image_size[0], image_size[1], 8),
dtype=np.float32)
keypoint_offset_np[0, 1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[0, 7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[0, 2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[0, 8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
keypoint_offset_np[1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
......@@ -844,7 +844,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset,
tf.reduce_max(keypoint_heatmap, axis=3)))
tf.reduce_max(keypoint_heatmap, axis=2)))
else:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment