Updated the target_assigner to prepare the prediction targets for CenterNet keypoint depth. Updated model_lib files to populate the keypoint depth information.

PiperOrigin-RevId: 355261727

Updated the target_assigner to prepare the prediction targets for CenterNet
keypoint depth. Updated model_lib files to populate the keypoint depth information.
PiperOrigin-RevId: 355261727
81434df3 · Yu-hui Chen · TF Object Detection Team · 6a761cc8 · 81434df3 · 81434df3
Commit 81434df3 authored Feb 02, 2021 by Yu-hui Chen Committed by TF Object Detection Team Feb 02, 2021
4 changed files
--- a/research/object_detection/core/target_assigner.py
+++ b/research/object_detection/core/target_assigner.py
@@ -1468,6 +1468,175 @@ class CenterNetKeypointTargetAssigner(object):
    batch_offsets = tf.concat(batch_offsets, axis=0)
    return (batch_indices, batch_offsets, batch_weights)
+  def assign_keypoints_depth_targets(self,
+                                     height,
+                                     width,
+                                     gt_keypoints_list,
+                                     gt_classes_list,
+                                     gt_keypoint_depths_list,
+                                     gt_keypoint_depth_weights_list,
+                                     gt_keypoints_weights_list=None,
+                                     gt_weights_list=None):
+    """Returns the target depths of the keypoints.
+
+    The returned values are the relative depth information of each keypoint.
+    The depth of every keypoint selected by self._keypoint_indices is
+    replicated over the neighboring output pixels returned by
+    ta_utils.get_surrounding_grids (called with self._peak_radius). Targets
+    whose depth or resulting weight is NaN are assigned zero depth and zero
+    weight.
+
+    Args:
+      height: int, height of input to the CenterNet model. This is used to
+        determine the height of the output (height // self._stride).
+      width: int, width of the input to the CenterNet model. This is used to
+        determine the width of the output (width // self._stride).
+      gt_keypoints_list: A list of tensors with shape [num_instances,
+        num_total_keypoints, 2]. See class-level description for more detail.
+      gt_classes_list: A list of tensors with shape [num_instances,
+        num_classes]. See class-level description for more detail.
+      gt_keypoint_depths_list: A list of tensors with shape [num_instances,
+        num_total_keypoints] corresponding to the relative depth of the
+        keypoints.
+      gt_keypoint_depth_weights_list: A list of tensors with shape
+        [num_instances, num_total_keypoints] corresponding to the weights of
+        the relative depth.
+      gt_keypoints_weights_list: A list of tensors with shape [num_instances,
+        num_total_keypoints] corresponding to the weight of each keypoint. If
+        None, a None entry is forwarded for every image.
+      gt_weights_list: A list of float tensors with shape [num_instances]. See
+        class-level description for more detail. If None, a None entry is
+        forwarded for every image.
+
+    Returns:
+      batch_indices: an integer tensor of shape [num_total_instances, 3] (or
+        [num_total_instances, 4] if 'per_keypoint_offset' is set True) holding
+        the indices inside the predicted tensor which should be penalized. The
+        first column indicates the index along the batch dimension and the
+        second and third columns indicate the index along the y and x
+        dimensions respectively. The fourth column corresponds to the channel
+        dimension (if 'per_keypoint_offset' is set True).
+      batch_depths: a float tensor of shape [num_total_instances, 1] indicating
+        the target depth of each keypoint.
+      batch_weights: a float tensor of shape [num_total_instances] indicating
+        the weight of each prediction.
+      Note that num_total_instances = batch_size * num_instances *
+                                      num_keypoints * num_neighbors
+    """
+    # Per-image results, concatenated along the first dimension at the end.
+    batch_indices = []
+    batch_weights = []
+    batch_depths = []
+    if gt_keypoints_weights_list is None:
+      gt_keypoints_weights_list = [None] * len(gt_keypoints_list)
+    if gt_weights_list is None:
+      gt_weights_list = [None] * len(gt_classes_list)
+    # NOTE(review): a None depth entry created here is later passed to
+    # tf.gather, and gt_keypoint_depth_weights_list has no equivalent default
+    # before being zipped below -- confirm callers always provide both lists.
+    if gt_keypoint_depths_list is None:
+      gt_keypoint_depths_list = [None] * len(gt_classes_list)
+    # `i` is the image's position in the batch; it becomes the first column of
+    # the returned indices.
+    for i, (keypoints, classes, kp_weights, weights,
+            keypoint_depths, keypoint_depth_weights) in enumerate(
+                zip(gt_keypoints_list, gt_classes_list,
+                    gt_keypoints_weights_list, gt_weights_list,
+                    gt_keypoint_depths_list, gt_keypoint_depth_weights_list)):
+      keypoints_absolute, kp_weights = self._preprocess_keypoints_and_weights(
+          out_height=height // self._stride,
+          out_width=width // self._stride,
+          keypoints=keypoints,
+          class_onehot=classes,
+          class_weights=weights,
+          keypoint_weights=kp_weights)
+      num_instances, num_keypoints, _ = (
+          shape_utils.combined_static_and_dynamic_shape(keypoints_absolute))
+      # [num_instances * num_keypoints]
+      y_source = tf.keras.backend.flatten(keypoints_absolute[:, :, 0])
+      x_source = tf.keras.backend.flatten(keypoints_absolute[:, :, 1])
+      # All keypoint coordinates and their neighbors:
+      # [num_instance * num_keypoints, num_neighbors]
+      (y_source_neighbors, x_source_neighbors,
+       valid_sources) = ta_utils.get_surrounding_grids(height // self._stride,
+                                                       width // self._stride,
+                                                       y_source, x_source,
+                                                       self._peak_radius)
+      _, num_neighbors = shape_utils.combined_static_and_dynamic_shape(
+          y_source_neighbors)
+      # Update the valid keypoint weights.
+      # [num_instance * num_keypoints, num_neighbors]
+      valid_keypoints = tf.cast(
+          valid_sources, dtype=tf.float32) * tf.stack(
+              [tf.keras.backend.flatten(kp_weights)] * num_neighbors, axis=-1)
+      # Compute the offsets and indices of the box centers. Only the indices
+      # are used here; the offsets are discarded. Shape:
+      #   indices: [num_instances * num_keypoints, num_neighbors, 2]
+      _, indices = ta_utils.compute_floor_offsets_with_indices(
+          y_source=y_source_neighbors,
+          x_source=x_source_neighbors,
+          y_target=y_source,
+          x_target=x_source)
+      # Reshape to:
+      #   indices: [num_instances * num_keypoints * num_neighbors, 2]
+      indices = tf.reshape(indices, [-1, 2])
+      # Gather the keypoint depth from corresponding keypoint indices:
+      #   [num_instances, num_keypoints]
+      keypoint_depths = tf.gather(
+          keypoint_depths, self._keypoint_indices, axis=1)
+      # Tile the depth target to surrounding pixels.
+      #   [num_instances, num_keypoints, num_neighbors]
+      tiled_keypoint_depths = tf.tile(
+          tf.expand_dims(keypoint_depths, axis=-1),
+          multiples=[1, 1, num_neighbors])
+      # [num_instances, num_keypoints]
+      keypoint_depth_weights = tf.gather(
+          keypoint_depth_weights, self._keypoint_indices, axis=1)
+      # [num_instances, num_keypoints, num_neighbors]
+      keypoint_depth_weights = tf.tile(
+          tf.expand_dims(keypoint_depth_weights, axis=-1),
+          multiples=[1, 1, num_neighbors])
+      # Update the weights of keypoint depth by the weights of the keypoints.
+      # A keypoint depth target is valid only if its corresponding keypoint
+      # target is also valid.
+      # [num_instances, num_keypoints, num_neighbors]
+      tiled_depth_weights = (
+          tf.reshape(valid_keypoints,
+                     [num_instances, num_keypoints, num_neighbors]) *
+          keypoint_depth_weights)
+      # A target is invalid when either its combined weight or its depth is
+      # NaN.
+      invalid_depths = tf.logical_or(
+          tf.math.is_nan(tiled_depth_weights),
+          tf.math.is_nan(tiled_keypoint_depths))
+      # Assign zero values and weights to NaN values.
+      final_keypoint_depths = tf.where(invalid_depths,
+                                       tf.zeros_like(tiled_keypoint_depths),
+                                       tiled_keypoint_depths)
+      final_keypoint_depth_weights = tf.where(
+          invalid_depths,
+          tf.zeros_like(tiled_depth_weights),
+          tiled_depth_weights)
+      # [num_instances * num_keypoints * num_neighbors, 1]
+      batch_depths.append(tf.reshape(final_keypoint_depths, [-1, 1]))
+      # Prepare the batch indices to be prepended.
+      batch_index = tf.fill(
+          [num_instances * num_keypoints * num_neighbors, 1], i)
+      if self._per_keypoint_offset:
+        # Append the keypoint type as a fourth (channel) column.
+        tiled_keypoint_types = self._get_keypoint_types(
+            num_instances, num_keypoints, num_neighbors)
+        batch_indices.append(
+            tf.concat([batch_index, indices,
+                       tf.reshape(tiled_keypoint_types, [-1, 1])], axis=1))
+      else:
+        batch_indices.append(tf.concat([batch_index, indices], axis=1))
+      batch_weights.append(
+          tf.keras.backend.flatten(final_keypoint_depth_weights))
+    # Concatenate the tensors in the batch in the first dimension:
+    # shape: [batch_size * num_instances * num_keypoints * num_neighbors, 3] or
+    # [batch_size * num_instances * num_keypoints * num_neighbors, 4] if
+    # 'per_keypoint_offset' is set to True.
+    batch_indices = tf.concat(batch_indices, axis=0)
+    # shape: [batch_size * num_instances * num_keypoints * num_neighbors]
+    batch_weights = tf.concat(batch_weights, axis=0)
+    # shape: [batch_size * num_instances * num_keypoints * num_neighbors, 1]
+    batch_depths = tf.concat(batch_depths, axis=0)
+    return (batch_indices, batch_depths, batch_weights)
  def assign_joint_regression_targets(self,
                                      height,
                                      width,

--- a/research/object_detection/core/target_assigner_test.py
+++ b/research/object_detection/core/target_assigner_test.py
@@ -1683,6 +1683,121 @@ class CenterNetKeypointTargetAssignerTest(test_case.TestCase):
    np.testing.assert_array_equal([0, 3, 2], indices[7, :])
    np.testing.assert_array_almost_equal([0.6, 0.4], offsets[7, :])
+  def test_assign_keypoint_depths_target(self):
+    """Checks depth targets, weights and indices without per-keypoint offset.
+
+    One image with 4 instances and 5 keypoints is assigned with
+    keypoint_indices=[0, 2]. The expected output has 40 rows, of which only
+    the last 5 carry a non-zero weight (0.5) and the depth value 7.0.
+    """
+    def graph_fn():
+      gt_classes_list = [
+          tf.one_hot([0, 1, 0, 1], depth=4),
+      ]
+      # The same value is used for both coordinates of every keypoint; NaN
+      # marks a missing keypoint.
+      coordinates = tf.expand_dims(
+          tf.constant(
+              np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
+                        [float('nan'), 0.7, 0.7, 0.9, 0.4],
+                        [0.4, 0.1, 0.4, 0.2, 0.0],
+                        [float('nan'), 0.0, 0.12, 0.7, 0.4]]),
+              dtype=tf.float32),
+          axis=2)
+      gt_keypoints_list = [tf.concat([coordinates, coordinates], axis=2)]
+      # Last instance, keypoint index 2 (depth 7.0) is the target checked
+      # below.
+      depths = tf.constant(
+          np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
+                    [float('nan'), 0.7, float('nan'), 0.9, 0.4],
+                    [0.4, 0.1, 0.4, 0.2, 0.0],
+                    [0.5, 0.0, 7.0, 0.7, 0.4]]),
+          dtype=tf.float32)
+      gt_keypoint_depths_list = [depths]
+      # Last instance, keypoint index 2 has depth weight 0.5.
+      gt_keypoint_depth_weights = tf.constant(
+          np.array([[1.0, 1.0, 1.0, 1.0, 1.0],
+                    [float('nan'), 0.0, 1.0, 0.0, 0.0],
+                    [1.0, 1.0, 1.0, 1.0, 1.0],
+                    [1.0, 1.0, 0.5, 1.0, 1.0]]),
+          dtype=tf.float32)
+      gt_keypoint_depth_weights_list = [gt_keypoint_depth_weights]
+      cn_assigner = targetassigner.CenterNetKeypointTargetAssigner(
+          stride=4,
+          class_id=1,
+          keypoint_indices=[0, 2],
+          peak_radius=1)
+      (indices, depths, weights) = cn_assigner.assign_keypoints_depth_targets(
+          height=120,
+          width=80,
+          gt_keypoints_list=gt_keypoints_list,
+          gt_classes_list=gt_classes_list,
+          gt_keypoint_depths_list=gt_keypoint_depths_list,
+          gt_keypoint_depth_weights_list=gt_keypoint_depth_weights_list)
+      return indices, depths, weights
+    indices, depths, weights = self.execute(graph_fn, [])
+    # Only the last 5 elements have positive weight.
+    np.testing.assert_array_almost_equal([
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5
+    ], weights)
+    # Validate the last 5 elements' depth value.
+    np.testing.assert_array_almost_equal(
+        [7.0, 7.0, 7.0, 7.0, 7.0], depths[35:, 0])
+    # Without per-keypoint offset each index row is (batch, y, x).
+    self.assertEqual((40, 3), indices.shape)
+    np.testing.assert_array_equal([0, 2, 2], indices[35, :])
+  def test_assign_keypoint_depths_per_keypoints(self):
+    """Checks depth targets when per_keypoint_offset=True.
+
+    Uses the same groundtruth as the test without per-keypoint offset. The
+    expected weights and depths are the same, but each index row has a fourth
+    column.
+    """
+    def graph_fn():
+      gt_classes_list = [
+          tf.one_hot([0, 1, 0, 1], depth=4),
+      ]
+      # The same value is used for both coordinates of every keypoint; NaN
+      # marks a missing keypoint.
+      coordinates = tf.expand_dims(
+          tf.constant(
+              np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
+                        [float('nan'), 0.7, 0.7, 0.9, 0.4],
+                        [0.4, 0.1, 0.4, 0.2, 0.0],
+                        [float('nan'), 0.0, 0.12, 0.7, 0.4]]),
+              dtype=tf.float32),
+          axis=2)
+      gt_keypoints_list = [tf.concat([coordinates, coordinates], axis=2)]
+      # Last instance, keypoint index 2 (depth 7.0) is the target checked
+      # below.
+      depths = tf.constant(
+          np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
+                    [float('nan'), 0.7, float('nan'), 0.9, 0.4],
+                    [0.4, 0.1, 0.4, 0.2, 0.0],
+                    [0.5, 0.0, 7.0, 0.7, 0.4]]),
+          dtype=tf.float32)
+      gt_keypoint_depths_list = [depths]
+      # Last instance, keypoint index 2 has depth weight 0.5.
+      gt_keypoint_depth_weights = tf.constant(
+          np.array([[1.0, 1.0, 1.0, 1.0, 1.0],
+                    [float('nan'), 0.0, 1.0, 0.0, 0.0],
+                    [1.0, 1.0, 1.0, 1.0, 1.0],
+                    [1.0, 1.0, 0.5, 1.0, 1.0]]),
+          dtype=tf.float32)
+      gt_keypoint_depth_weights_list = [gt_keypoint_depth_weights]
+      cn_assigner = targetassigner.CenterNetKeypointTargetAssigner(
+          stride=4,
+          class_id=1,
+          keypoint_indices=[0, 2],
+          peak_radius=1,
+          per_keypoint_offset=True)
+      (indices, depths, weights) = cn_assigner.assign_keypoints_depth_targets(
+          height=120,
+          width=80,
+          gt_keypoints_list=gt_keypoints_list,
+          gt_classes_list=gt_classes_list,
+          gt_keypoint_depths_list=gt_keypoint_depths_list,
+          gt_keypoint_depth_weights_list=gt_keypoint_depth_weights_list)
+      return indices, depths, weights
+    indices, depths, weights = self.execute(graph_fn, [])
+    # Only the last 5 elements have positive weight.
+    np.testing.assert_array_almost_equal([
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5
+    ], weights)
+    # Validate the last 5 elements' depth value.
+    np.testing.assert_array_almost_equal(
+        [7.0, 7.0, 7.0, 7.0, 7.0], depths[35:, 0])
+    # With per-keypoint offset each index row is (batch, y, x, channel); the
+    # channel value 1 presumably refers to the second entry of
+    # keypoint_indices -- confirm against _get_keypoint_types.
+    self.assertEqual((40, 4), indices.shape)
+    np.testing.assert_array_equal([0, 2, 2, 1], indices[35, :])
  def test_assign_keypoints_offset_targets_radius(self):
    def graph_fn():
      gt_classes_list = [

--- a/research/object_detection/model_lib.py
+++ b/research/object_detection/model_lib.py
@@ -152,6 +152,15 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
    groundtruth[input_data_fields.groundtruth_keypoints] = tf.stack(
        detection_model.groundtruth_lists(fields.BoxListFields.keypoints))
+  if detection_model.groundtruth_has_field(
+      fields.BoxListFields.keypoint_depths):
+    groundtruth[input_data_fields.groundtruth_keypoint_depths] = tf.stack(
+        detection_model.groundtruth_lists(fields.BoxListFields.keypoint_depths))
+    groundtruth[
+        input_data_fields.groundtruth_keypoint_depth_weights] = tf.stack(
+            detection_model.groundtruth_lists(
+                fields.BoxListFields.keypoint_depth_weights))
  if detection_model.groundtruth_has_field(
      fields.BoxListFields.keypoint_visibilities):
    groundtruth[input_data_fields.groundtruth_keypoint_visibilities] = tf.stack(
@@ -260,6 +269,8 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
        fields.InputDataFields.groundtruth_classes,
        fields.InputDataFields.groundtruth_boxes,
        fields.InputDataFields.groundtruth_keypoints,
+        fields.InputDataFields.groundtruth_keypoint_depths,
+        fields.InputDataFields.groundtruth_keypoint_depth_weights,
        fields.InputDataFields.groundtruth_keypoint_visibilities,
        fields.InputDataFields.groundtruth_dp_num_points,
        fields.InputDataFields.groundtruth_dp_part_ids,
@@ -311,6 +322,13 @@ def provide_groundtruth(model, labels):
  gt_keypoints_list = None
  if fields.InputDataFields.groundtruth_keypoints in labels:
    gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
+  gt_keypoint_depths_list = None
+  gt_keypoint_depth_weights_list = None
+  if fields.InputDataFields.groundtruth_keypoint_depths in labels:
+    gt_keypoint_depths_list = (
+        labels[fields.InputDataFields.groundtruth_keypoint_depths])
+    gt_keypoint_depth_weights_list = (
+        labels[fields.InputDataFields.groundtruth_keypoint_depth_weights])
  gt_keypoint_visibilities_list = None
  if fields.InputDataFields.groundtruth_keypoint_visibilities in labels:
    gt_keypoint_visibilities_list = labels[
@@ -376,7 +394,9 @@ def provide_groundtruth(model, labels):
      groundtruth_area_list=gt_area_list,
      groundtruth_track_ids_list=gt_track_ids_list,
      groundtruth_verified_neg_classes=gt_verified_neg_classes,
-      groundtruth_not_exhaustive_classes=gt_not_exhaustive_classes)
+      groundtruth_not_exhaustive_classes=gt_not_exhaustive_classes,
+      groundtruth_keypoint_depths_list=gt_keypoint_depths_list,
+      groundtruth_keypoint_depth_weights_list=gt_keypoint_depth_weights_list)
 def create_model_fn(detection_model_fn, configs, hparams=None, use_tpu=False,

--- a/research/object_detection/model_lib_v2.py
+++ b/research/object_detection/model_lib_v2.py
@@ -99,6 +99,10 @@ def _compute_losses_and_predictions_dicts(
          k-hot tensor of classes.
        labels[fields.InputDataFields.groundtruth_track_ids] is a int32
          tensor of track IDs.
+        labels[fields.InputDataFields.groundtruth_keypoint_depths] is a
+          float32 tensor containing keypoint depths information.
+        labels[fields.InputDataFields.groundtruth_keypoint_depth_weights] is a
+          float32 tensor containing the weights of the keypoint depth feature.
    add_regularization_loss: Whether or not to include the model's
      regularization loss in the losses dictionary.
@@ -213,6 +217,10 @@ def eager_train_step(detection_model,
          k-hot tensor of classes.
        labels[fields.InputDataFields.groundtruth_track_ids] is a int32
          tensor of track IDs.
+        labels[fields.InputDataFields.groundtruth_keypoint_depths] is a
+          float32 tensor containing keypoint depths information.
+        labels[fields.InputDataFields.groundtruth_keypoint_depth_weights] is a
+          float32 tensor containing the weights of the keypoint depth feature.
    unpad_groundtruth_tensors: A parameter passed to unstack_batch.
    optimizer: The training optimizer that will update the variables.
    learning_rate: The learning rate tensor for the current training step.