Commit 0c85c06c authored by Yu-hui Chen, committed by TF Object Detection Team

Extended CenterNet model to predict keypoint depth information.

PiperOrigin-RevId: 359344675
parent 3cfd0ba0
......@@ -868,7 +868,10 @@ def keypoint_proto_to_params(kp_config, keypoint_map_dict):
candidate_search_scale=kp_config.candidate_search_scale,
candidate_ranking_mode=kp_config.candidate_ranking_mode,
offset_peak_radius=kp_config.offset_peak_radius,
per_keypoint_offset=kp_config.per_keypoint_offset)
per_keypoint_offset=kp_config.per_keypoint_offset,
predict_depth=kp_config.predict_depth,
per_keypoint_depth=kp_config.per_keypoint_depth,
keypoint_depth_loss_weight=kp_config.keypoint_depth_loss_weight)
def object_detection_proto_to_params(od_config):
......
......@@ -116,6 +116,9 @@ class ModelBuilderTF2Test(model_builder_test.ModelBuilderTest):
candidate_ranking_mode: "score_distance_ratio"
offset_peak_radius: 3
per_keypoint_offset: true
predict_depth: true
per_keypoint_depth: true
keypoint_depth_loss_weight: 0.3
"""
config = text_format.Merge(task_proto_txt,
center_net_pb2.CenterNet.KeypointEstimation())
......@@ -264,6 +267,9 @@ class ModelBuilderTF2Test(model_builder_test.ModelBuilderTest):
self.assertEqual(kp_params.candidate_ranking_mode, 'score_distance_ratio')
self.assertEqual(kp_params.offset_peak_radius, 3)
self.assertEqual(kp_params.per_keypoint_offset, True)
self.assertEqual(kp_params.predict_depth, True)
self.assertEqual(kp_params.per_keypoint_depth, True)
self.assertAlmostEqual(kp_params.keypoint_depth_loss_weight, 0.3)
# Check mask related parameters.
self.assertAlmostEqual(model._mask_params.task_loss_weight, 0.7)
......
......@@ -423,12 +423,12 @@ def prediction_tensors_to_temporal_offsets(
return offsets
def prediction_tensors_to_keypoint_candidates(
keypoint_heatmap_predictions,
keypoint_heatmap_offsets,
keypoint_score_threshold=0.1,
max_pool_kernel_size=1,
max_candidates=20):
def prediction_tensors_to_keypoint_candidates(keypoint_heatmap_predictions,
keypoint_heatmap_offsets,
keypoint_score_threshold=0.1,
max_pool_kernel_size=1,
max_candidates=20,
keypoint_depths=None):
"""Convert keypoint heatmap predictions and offsets to keypoint candidates.
Args:
......@@ -437,14 +437,17 @@ def prediction_tensors_to_keypoint_candidates(
keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
width, 2] (or [batch_size, height, width, 2 * num_keypoints] if
'per_keypoint_offset' is set True) representing the per-keypoint offsets.
keypoint_score_threshold: float, the threshold for considering a keypoint
a candidate.
keypoint_score_threshold: float, the threshold for considering a keypoint a
candidate.
max_pool_kernel_size: integer, the max pool kernel size to use to pull off
peak score locations in a neighborhood. For example, to make sure no two
neighboring values for the same keypoint are returned, set
max_pool_kernel_size=3. If None or 1, will not apply any local filtering.
max_candidates: integer, maximum number of keypoint candidates per
keypoint type.
max_candidates: integer, maximum number of keypoint candidates per keypoint
type.
keypoint_depths: (optional) A float tensor of shape [batch_size, height,
width, 1] (or [batch_size, height, width, num_keypoints] if
'per_keypoint_depth' is set True) representing the per-keypoint depths.
Returns:
keypoint_candidates: A tensor of shape
......@@ -458,6 +461,9 @@ def prediction_tensors_to_keypoint_candidates(
[batch_size, num_keypoints] with the number of candidates for each
keypoint type, as it's possible to filter some candidates due to the score
threshold.
depth_candidates: A tensor of shape [batch_size, max_candidates,
num_keypoints] representing the estimated depth of each keypoint
candidate. Returns None if the input keypoint_depths is None.
"""
batch_size, _, _, num_keypoints = _get_shape(keypoint_heatmap_predictions, 4)
# Get x, y and channel indices corresponding to the top indices in the
......@@ -499,13 +505,13 @@ def prediction_tensors_to_keypoint_candidates(
# TF Lite does not support tf.gather with batch_dims > 0, so we need to use
# tf_gather_nd instead and here we prepare the indices for that. In this
# case, channel_indices indicates which keypoint to use the offset from.
combined_indices = tf.stack([
channel_combined_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_indices),
_multi_range(num_indices, range_repetitions=batch_size),
tf.reshape(channel_indices, [-1])
], axis=1)
offsets = tf.gather_nd(reshaped_offsets, combined_indices)
offsets = tf.gather_nd(reshaped_offsets, channel_combined_indices)
offsets = tf.reshape(offsets, [batch_size, num_indices, -1])
else:
offsets = selected_offsets
......@@ -524,14 +530,38 @@ def prediction_tensors_to_keypoint_candidates(
num_candidates = tf.reduce_sum(
tf.to_int32(keypoint_scores >= keypoint_score_threshold), axis=1)
return keypoint_candidates, keypoint_scores, num_candidates
depth_candidates = None
if keypoint_depths is not None:
selected_depth_flat = tf.gather_nd(keypoint_depths, combined_indices)
selected_depth = tf.reshape(selected_depth_flat,
[batch_size, num_indices, -1])
_, _, num_depth_channels = _get_shape(selected_depth, 3)
if num_depth_channels > 1:
combined_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_indices),
_multi_range(num_indices, range_repetitions=batch_size),
tf.reshape(channel_indices, [-1])
], axis=1)
depth = tf.gather_nd(selected_depth, combined_indices)
depth = tf.reshape(depth, [batch_size, num_indices, -1])
else:
depth = selected_depth
depth_candidates = tf.reshape(depth,
[batch_size, num_keypoints, max_candidates])
depth_candidates = tf.transpose(depth_candidates, [0, 2, 1])
return keypoint_candidates, keypoint_scores, num_candidates, depth_candidates
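For orientation, here is a minimal calling sketch of the extended function; the shapes are illustrative, and cnma stands for the center_net_meta_arch module, following the convention used in the tests below:

import tensorflow as tf
from object_detection.meta_architectures import center_net_meta_arch as cnma

# Illustrative shapes: batch of 2, an 8x8 feature map, 3 keypoint types.
heatmap = tf.random.uniform([2, 8, 8, 3])
offsets = tf.zeros([2, 8, 8, 2])
depths = tf.zeros([2, 8, 8, 1])  # Or [2, 8, 8, 3] with per_keypoint_depth.
(candidates, scores, num_candidates, depth_candidates) = (
    cnma.prediction_tensors_to_keypoint_candidates(
        heatmap,
        offsets,
        keypoint_score_threshold=0.1,
        max_candidates=20,
        keypoint_depths=depths))
# depth_candidates has shape [2, 20, 3]; it is None when keypoint_depths
# is None.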
def prediction_to_single_instance_keypoints(object_heatmap, keypoint_heatmap,
def prediction_to_single_instance_keypoints(object_heatmap,
keypoint_heatmap,
keypoint_offset,
keypoint_regression, stride,
keypoint_regression,
stride,
object_center_std_dev,
keypoint_std_dev, kp_params):
keypoint_std_dev,
kp_params,
keypoint_depths=None):
"""Postprocess function to predict single instance keypoints.
This is a simplified postprocessing function based on the assumption that
......@@ -569,6 +599,9 @@ def prediction_to_single_instance_keypoints(object_heatmap, keypoint_heatmap,
representing the standard deviation corresponding to each joint.
kp_params: A `KeypointEstimationParams` object with parameters for a single
keypoint class.
keypoint_depths: (optional) A float tensor of shape [batch_size, height,
width, 1] (or [batch_size, height, width, num_keypoints] if
'per_keypoint_depth' is set True) representing the per-keypoint depths.
Returns:
A tuple of two tensors:
......@@ -577,6 +610,9 @@ def prediction_to_single_instance_keypoints(object_heatmap, keypoint_heatmap,
map space.
keypoint_scores: A float tensor with shape [1, 1, num_keypoints]
representing the keypoint prediction scores.
keypoint_depths: A float tensor with shape [1, 1, num_keypoints]
representing the estimated keypoint depths. Returns None if the input
keypoint_depths is None.
Raises:
ValueError: if the input keypoint_std_dev doesn't have valid number of
......@@ -636,14 +672,16 @@ def prediction_to_single_instance_keypoints(object_heatmap, keypoint_heatmap,
# Get the keypoint locations/scores:
# keypoint_candidates: [1, 1, num_keypoints, 2]
# keypoint_scores: [1, 1, num_keypoints]
(keypoint_candidates, keypoint_scores,
_) = prediction_tensors_to_keypoint_candidates(
# depth_candidates: [1, 1, num_keypoints]
(keypoint_candidates, keypoint_scores, _,
depth_candidates) = prediction_tensors_to_keypoint_candidates(
keypoint_predictions,
keypoint_offset,
keypoint_score_threshold=kp_params.keypoint_candidate_score_threshold,
max_pool_kernel_size=kp_params.peak_max_pool_kernel_size,
max_candidates=1)
return keypoint_candidates, keypoint_scores
max_candidates=1,
keypoint_depths=keypoint_depths)
return keypoint_candidates, keypoint_scores, depth_candidates
def regressed_keypoints_at_object_centers(regressed_keypoint_predictions,
......@@ -697,11 +735,16 @@ def regressed_keypoints_at_object_centers(regressed_keypoint_predictions,
[batch_size, num_instances, -1])
def refine_keypoints(regressed_keypoints, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=None,
unmatched_keypoint_score=0.1, box_scale=1.2,
def refine_keypoints(regressed_keypoints,
keypoint_candidates,
keypoint_scores,
num_keypoint_candidates,
bboxes=None,
unmatched_keypoint_score=0.1,
box_scale=1.2,
candidate_search_scale=0.3,
candidate_ranking_mode='min_distance'):
candidate_ranking_mode='min_distance',
keypoint_depth_candidates=None):
"""Refines regressed keypoints by snapping to the nearest candidate keypoints.
The initial regressed keypoints represent a full set of keypoints regressed
......@@ -757,6 +800,9 @@ def refine_keypoints(regressed_keypoints, keypoint_candidates, keypoint_scores,
candidate_ranking_mode: A string as one of ['min_distance',
'score_distance_ratio'] indicating how to select the candidate. If an
invalid value is provided, a ValueError will be raised.
keypoint_depth_candidates: (optional) A float tensor of shape
[batch_size, max_candidates, num_keypoints] indicating the depths for
keypoint candidates.
Returns:
A tuple with:
......@@ -836,9 +882,11 @@ def refine_keypoints(regressed_keypoints, keypoint_candidates, keypoint_scores,
# Gather the coordinates and scores corresponding to the closest candidates.
# Shape of tensors are [batch_size, num_instances, num_keypoints, 2] and
# [batch_size, num_instances, num_keypoints], respectively.
nearby_candidate_coords, nearby_candidate_scores = (
_gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
nearby_candidate_inds))
(nearby_candidate_coords, nearby_candidate_scores,
nearby_candidate_depths) = (
_gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
nearby_candidate_inds,
keypoint_depth_candidates))
if bboxes is None:
# Create bboxes from regressed keypoints.
......@@ -895,7 +943,12 @@ def refine_keypoints(regressed_keypoints, keypoint_candidates, keypoint_scores,
unmatched_keypoint_score * tf.ones_like(nearby_candidate_scores),
nearby_candidate_scores)
return refined_keypoints, refined_scores
refined_depths = None
if nearby_candidate_depths is not None:
refined_depths = tf.where(mask, tf.zeros_like(nearby_candidate_depths),
nearby_candidate_depths)
return refined_keypoints, refined_scores, refined_depths
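Note the asymmetry in the fallback behavior: unmatched keypoints receive unmatched_keypoint_score as their score, but their depths are simply zeroed. A tiny sketch of the tf.where pattern used above, with made-up values:

import tensorflow as tf

mask = tf.constant([[True, False]])  # True marks unmatched keypoints.
depths = tf.constant([[1.5, -0.7]])
refined = tf.where(mask, tf.zeros_like(depths), depths)
# refined == [[0.0, -0.7]]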
def _pad_to_full_keypoint_dim(keypoint_coords, keypoint_scores, keypoint_inds,
......@@ -976,8 +1029,10 @@ def _pad_to_full_instance_dim(keypoint_coords, keypoint_scores, instance_inds,
return keypoint_coords_padded, keypoint_scores_padded
def _gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
indices):
def _gather_candidates_at_indices(keypoint_candidates,
keypoint_scores,
indices,
keypoint_depth_candidates=None):
"""Gathers keypoint candidate coordinates and scores at indices.
Args:
......@@ -987,13 +1042,18 @@ def _gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
num_keypoints] with keypoint scores.
indices: an integer tensor of shape [batch_size, num_indices, num_keypoints]
with indices.
keypoint_depth_candidates: (optional) a float tensor of shape [batch_size,
max_candidates, num_keypoints] with keypoint depths.
Returns:
A tuple with
gathered_keypoint_candidates: a float tensor of shape [batch_size,
num_indices, num_keypoints, 2] with gathered coordinates.
gathered_keypoint_scores: a float tensor of shape [batch_size,
num_indices, num_keypoints, 2].
num_indices, num_keypoints].
gathered_keypoint_depths: a float tensor of shape [batch_size,
num_indices, num_keypoints]. Returns None if the input
keypoint_depth_candidates is None.
"""
batch_size, num_indices, num_keypoints = _get_shape(indices, 3)
......@@ -1035,7 +1095,19 @@ def _gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
gathered_keypoint_scores = tf.transpose(nearby_candidate_scores_transposed,
[0, 2, 1])
return gathered_keypoint_candidates, gathered_keypoint_scores
gathered_keypoint_depths = None
if keypoint_depth_candidates is not None:
keypoint_depths_transposed = tf.transpose(keypoint_depth_candidates,
[0, 2, 1])
nearby_candidate_depths_transposed = tf.gather_nd(
keypoint_depths_transposed, combined_indices)
nearby_candidate_depths_transposed = tf.reshape(
nearby_candidate_depths_transposed,
[batch_size, num_keypoints, num_indices])
gathered_keypoint_depths = tf.transpose(nearby_candidate_depths_transposed,
[0, 2, 1])
return (gathered_keypoint_candidates, gathered_keypoint_scores,
gathered_keypoint_depths)
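The depth gather reuses the same TF-Lite-friendly pattern as the coordinate and score gathers: transpose so the keypoint axis leads, gather with explicit (batch, keypoint, candidate) index triples via tf.gather_nd, then transpose back. A standalone sketch of that pattern with toy shapes, where tf.repeat/tf.tile stand in for the module's _multi_range helper:

import tensorflow as tf

batch_size, max_candidates, num_keypoints, num_indices = 2, 5, 3, 4
depths = tf.random.uniform([batch_size, max_candidates, num_keypoints])
indices = tf.zeros([batch_size, num_indices, num_keypoints], tf.int32)

# [batch, keypoints, candidates]: each keypoint's candidates are contiguous.
depths_t = tf.transpose(depths, [0, 2, 1])
combined = tf.stack([
    tf.repeat(tf.range(batch_size), num_keypoints * num_indices),
    tf.tile(tf.repeat(tf.range(num_keypoints), num_indices), [batch_size]),
    tf.reshape(tf.transpose(indices, [0, 2, 1]), [-1]),
], axis=1)
gathered = tf.reshape(
    tf.gather_nd(depths_t, combined),
    [batch_size, num_keypoints, num_indices])
gathered = tf.transpose(gathered, [0, 2, 1])  # [batch, indices, keypoints].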
def flattened_indices_from_row_col_indices(row_indices, col_indices, num_cols):
......@@ -1517,7 +1589,8 @@ class KeypointEstimationParams(
'heatmap_bias_init', 'num_candidates_per_keypoint', 'task_loss_weight',
'peak_max_pool_kernel_size', 'unmatched_keypoint_score', 'box_scale',
'candidate_search_scale', 'candidate_ranking_mode',
'offset_peak_radius', 'per_keypoint_offset'
'offset_peak_radius', 'per_keypoint_offset', 'predict_depth',
'per_keypoint_depth', 'keypoint_depth_loss_weight'
])):
"""Namedtuple to host object detection related parameters.
......@@ -1550,7 +1623,10 @@ class KeypointEstimationParams(
candidate_search_scale=0.3,
candidate_ranking_mode='min_distance',
offset_peak_radius=0,
per_keypoint_offset=False):
per_keypoint_offset=False,
predict_depth=False,
per_keypoint_depth=False,
keypoint_depth_loss_weight=1.0):
"""Constructor with default values for KeypointEstimationParams.
Args:
......@@ -1614,6 +1690,12 @@ class KeypointEstimationParams(
original paper). If set True, the output offset target has the shape
[batch_size, out_height, out_width, 2 * num_keypoints] (recommended when
the offset_peak_radius is not zero).
predict_depth: A bool indicating whether to predict the depth of each
keypoint.
per_keypoint_depth: A bool indicating whether the model predicts the depth
of each keypoint in independent channels. Similar to
per_keypoint_offset, but for the keypoint depth.
keypoint_depth_loss_weight: The weight of the keypoint depth loss.
Returns:
An initialized KeypointEstimationParams namedtuple.
......@@ -1626,7 +1708,8 @@ class KeypointEstimationParams(
heatmap_bias_init, num_candidates_per_keypoint, task_loss_weight,
peak_max_pool_kernel_size, unmatched_keypoint_score, box_scale,
candidate_search_scale, candidate_ranking_mode, offset_peak_radius,
per_keypoint_offset)
per_keypoint_offset, predict_depth, per_keypoint_depth,
keypoint_depth_loss_weight)
class ObjectCenterParams(
......@@ -1839,6 +1922,7 @@ BOX_OFFSET = 'box/offset'
KEYPOINT_REGRESSION = 'keypoint/regression'
KEYPOINT_HEATMAP = 'keypoint/heatmap'
KEYPOINT_OFFSET = 'keypoint/offset'
KEYPOINT_DEPTH = 'keypoint/depth'
SEGMENTATION_TASK = 'segmentation_task'
SEGMENTATION_HEATMAP = 'segmentation/heatmap'
DENSEPOSE_TASK = 'densepose_task'
......@@ -2055,6 +2139,15 @@ class CenterNetMetaArch(model.DetectionModel):
use_depthwise=self._use_depthwise)
for _ in range(num_feature_outputs)
]
if kp_params.predict_depth:
num_depth_channel = (
num_keypoints if kp_params.per_keypoint_depth else 1)
prediction_heads[get_keypoint_name(task_name, KEYPOINT_DEPTH)] = [
make_prediction_net(
num_depth_channel, use_depthwise=self._use_depthwise)
for _ in range(num_feature_outputs)
]
# pylint: disable=g-complex-comprehension
if self._mask_params is not None:
prediction_heads[SEGMENTATION_HEATMAP] = [
......@@ -2305,6 +2398,7 @@ class CenterNetMetaArch(model.DetectionModel):
heatmap_key = get_keypoint_name(task_name, KEYPOINT_HEATMAP)
offset_key = get_keypoint_name(task_name, KEYPOINT_OFFSET)
regression_key = get_keypoint_name(task_name, KEYPOINT_REGRESSION)
depth_key = get_keypoint_name(task_name, KEYPOINT_DEPTH)
heatmap_loss = self._compute_kp_heatmap_loss(
input_height=input_height,
input_width=input_width,
......@@ -2332,6 +2426,14 @@ class CenterNetMetaArch(model.DetectionModel):
kp_params.keypoint_offset_loss_weight * offset_loss)
loss_dict[regression_key] = (
kp_params.keypoint_regression_loss_weight * reg_loss)
if kp_params.predict_depth:
depth_loss = self._compute_kp_depth_loss(
input_height=input_height,
input_width=input_width,
task_name=task_name,
depth_predictions=prediction_dict[depth_key],
localization_loss_fn=kp_params.localization_loss)
loss_dict[depth_key] = kp_params.keypoint_depth_loss_weight * depth_loss
return loss_dict
def _compute_kp_heatmap_loss(self, input_height, input_width, task_name,
......@@ -2501,6 +2603,68 @@ class CenterNetMetaArch(model.DetectionModel):
tf.maximum(tf.reduce_sum(batch_weights), 1.0))
return loss
def _compute_kp_depth_loss(self, input_height, input_width, task_name,
depth_predictions, localization_loss_fn):
"""Computes the loss of the keypoint depth estimation.
Args:
input_height: An integer scalar tensor representing input image height.
input_width: An integer scalar tensor representing input image width.
task_name: A string representing the name of the keypoint task.
depth_predictions: A list of float tensors of shape [batch_size,
out_height, out_width, 1 (or num_keypoints)] representing the prediction
heads of the model for keypoint depth.
localization_loss_fn: An object_detection.core.losses.Loss object to
compute the loss for the keypoint depth predictions in CenterNet.
Returns:
loss: A float scalar tensor representing the keypoint depth loss
normalized by number of total keypoints.
"""
kp_params = self._kp_params_dict[task_name]
gt_keypoints_list = self.groundtruth_lists(fields.BoxListFields.keypoints)
gt_classes_list = self.groundtruth_lists(fields.BoxListFields.classes)
gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
gt_keypoint_depths_list = self.groundtruth_lists(
fields.BoxListFields.keypoint_depths)
gt_keypoint_depth_weights_list = self.groundtruth_lists(
fields.BoxListFields.keypoint_depth_weights)
assigner = self._target_assigner_dict[task_name]
(batch_indices, batch_depths,
batch_weights) = assigner.assign_keypoints_depth_targets(
height=input_height,
width=input_width,
gt_keypoints_list=gt_keypoints_list,
gt_weights_list=gt_weights_list,
gt_classes_list=gt_classes_list,
gt_keypoint_depths_list=gt_keypoint_depths_list,
gt_keypoint_depth_weights_list=gt_keypoint_depth_weights_list)
if kp_params.per_keypoint_offset and not kp_params.per_keypoint_depth:
batch_indices = batch_indices[:, 0:3]
# Keypoint depth loss.
loss = 0.0
for prediction in depth_predictions:
selected_depths = cn_assigner.get_batch_predictions_from_indices(
prediction, batch_indices)
if kp_params.per_keypoint_offset and kp_params.per_keypoint_depth:
selected_depths = tf.expand_dims(selected_depths, axis=-1)
# The dimensions passed do not match the loss function's docstring, but
# the loss still computes the correct value.
unweighted_loss = localization_loss_fn(
selected_depths,
batch_depths,
weights=tf.expand_dims(tf.ones_like(batch_weights), -1))
# Apply the weights after the loss function to have full control over it.
loss += batch_weights * tf.squeeze(unweighted_loss, axis=1)
loss = tf.reduce_sum(loss) / (
float(len(depth_predictions)) *
tf.maximum(tf.reduce_sum(batch_weights), 1.0))
return loss
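To make the normalization concrete, a hedged numeric check mirroring the per_keypoint_depth test expectation further down (it assumes an L1 loss, a single groundtruth keypoint depth of 3.0, and offset_peak_radius=1, i.e. a 5-pixel disk of targets where only the center pixel predicts the true depth):

predictions = [3.0, 0.0, 0.0, 0.0, 0.0]  # Center pixel + 4 neighbors.
target = 3.0
weights = [1.0] * 5
loss = sum(w * abs(p - target) for p, w in zip(predictions, weights))
loss /= max(sum(weights), 1.0)           # Normalize by total weight.
assert abs(loss - 2.4) < 1e-6            # Matches (4 * |3 - 0|) / 5.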
def _compute_segmentation_losses(self, prediction_dict, per_pixel_weights):
"""Computes all the losses associated with segmentation.
......@@ -3051,9 +3215,10 @@ class CenterNetMetaArch(model.DetectionModel):
# keypoint, we fall back to a simpler postprocessing function which uses
# the ops that are supported by tf.lite on GPU.
if len(self._kp_params_dict) == 1 and self._num_classes == 1:
keypoints, keypoint_scores = self._postprocess_keypoints_single_class(
prediction_dict, classes, y_indices, x_indices,
boxes_strided, num_detections)
(keypoints, keypoint_scores,
keypoint_depths) = self._postprocess_keypoints_single_class(
prediction_dict, classes, y_indices, x_indices, boxes_strided,
num_detections)
# The map_fn used to clip out-of-frame keypoints creates issues when
# converting to a tf.lite model, so we disable it and let users handle
# those out-of-frame keypoints.
......@@ -3061,7 +3226,18 @@ class CenterNetMetaArch(model.DetectionModel):
convert_strided_predictions_to_normalized_keypoints(
keypoints, keypoint_scores, self._stride, true_image_shapes,
clip_out_of_frame_keypoints=False))
if keypoint_depths is not None:
postprocess_dict.update({
fields.DetectionResultFields.detection_keypoint_depths:
keypoint_depths
})
else:
# Multi-class keypoint estimation task does not support depth
# estimation.
assert all([
not kp_dict.predict_depth
for kp_dict in self._kp_params_dict.values()
])
keypoints, keypoint_scores = self._postprocess_keypoints_multi_class(
prediction_dict, classes, y_indices, x_indices,
boxes_strided, num_detections)
......@@ -3200,7 +3376,11 @@ class CenterNetMetaArch(model.DetectionModel):
task_name, KEYPOINT_REGRESSION)][-1]
object_heatmap = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
keypoints, keypoint_scores = (
keypoint_depths = None
if kp_params.predict_depth:
keypoint_depths = prediction_dict[get_keypoint_name(
task_name, KEYPOINT_DEPTH)][-1]
keypoints, keypoint_scores, keypoint_depths = (
prediction_to_single_instance_keypoints(
object_heatmap=object_heatmap,
keypoint_heatmap=keypoint_heatmap,
......@@ -3209,7 +3389,8 @@ class CenterNetMetaArch(model.DetectionModel):
stride=self._stride,
object_center_std_dev=object_center_std_dev,
keypoint_std_dev=keypoint_std_dev,
kp_params=kp_params))
kp_params=kp_params,
keypoint_depths=keypoint_depths))
keypoints, keypoint_scores = (
convert_strided_predictions_to_normalized_keypoints(
......@@ -3222,6 +3403,12 @@ class CenterNetMetaArch(model.DetectionModel):
fields.DetectionResultFields.detection_keypoints: keypoints,
fields.DetectionResultFields.detection_keypoint_scores: keypoint_scores
}
if kp_params.predict_depth:
postprocess_dict.update({
fields.DetectionResultFields.detection_keypoint_depths:
keypoint_depths
})
return postprocess_dict
def _postprocess_embeddings(self, prediction_dict, y_indices, x_indices):
......@@ -3316,7 +3503,7 @@ class CenterNetMetaArch(model.DetectionModel):
# [1, num_instances_i, num_keypoints_i], respectively. Note that
# num_instances_i and num_keypoints_i refer to the number of
# instances and keypoints for class i, respectively.
kpt_coords_for_class, kpt_scores_for_class = (
(kpt_coords_for_class, kpt_scores_for_class, _) = (
self._postprocess_keypoints_for_class_and_image(
keypoint_heatmap, keypoint_offsets, keypoint_regression,
classes, y_indices_for_kpt_class, x_indices_for_kpt_class,
......@@ -3426,21 +3613,35 @@ class CenterNetMetaArch(model.DetectionModel):
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
keypoint_regression = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
keypoint_depth_predictions = None
if kp_params.predict_depth:
keypoint_depth_predictions = prediction_dict[get_keypoint_name(
task_name, KEYPOINT_DEPTH)][-1]
batch_size, _, _ = _get_shape(boxes, 3)
kpt_coords_for_example_list = []
kpt_scores_for_example_list = []
kpt_depths_for_example_list = []
for ex_ind in range(batch_size):
# Postprocess keypoints and scores for class and single image. Shapes
# are [1, max_detections, num_keypoints, 2] and
# [1, max_detections, num_keypoints], respectively.
kpt_coords_for_class, kpt_scores_for_class = (
(kpt_coords_for_class, kpt_scores_for_class, kpt_depths_for_class) = (
self._postprocess_keypoints_for_class_and_image(
keypoint_heatmap, keypoint_offsets, keypoint_regression, classes,
y_indices, x_indices, boxes, ex_ind, kp_params))
keypoint_heatmap,
keypoint_offsets,
keypoint_regression,
classes,
y_indices,
x_indices,
boxes,
ex_ind,
kp_params,
keypoint_depth_predictions=keypoint_depth_predictions))
kpt_coords_for_example_list.append(kpt_coords_for_class)
kpt_scores_for_example_list.append(kpt_scores_for_class)
kpt_depths_for_example_list.append(kpt_depths_for_class)
# Concatenate all keypoints and scores from all examples in the batch.
# Shapes are [batch_size, max_detections, num_keypoints, 2] and
......@@ -3448,7 +3649,11 @@ class CenterNetMetaArch(model.DetectionModel):
keypoints = tf.concat(kpt_coords_for_example_list, axis=0)
keypoint_scores = tf.concat(kpt_scores_for_example_list, axis=0)
return keypoints, keypoint_scores
keypoint_depths = None
if kp_params.predict_depth:
keypoint_depths = tf.concat(kpt_depths_for_example_list, axis=0)
return keypoints, keypoint_scores, keypoint_depths
def _get_instance_indices(self, classes, num_detections, batch_index,
class_id):
......@@ -3482,8 +3687,17 @@ class CenterNetMetaArch(model.DetectionModel):
return tf.cast(instance_inds, tf.int32)
def _postprocess_keypoints_for_class_and_image(
self, keypoint_heatmap, keypoint_offsets, keypoint_regression, classes,
y_indices, x_indices, boxes, batch_index, kp_params):
self,
keypoint_heatmap,
keypoint_offsets,
keypoint_regression,
classes,
y_indices,
x_indices,
boxes,
batch_index,
kp_params,
keypoint_depth_predictions=None):
"""Postprocess keypoints for a single image and class.
Args:
......@@ -3504,6 +3718,8 @@ class CenterNetMetaArch(model.DetectionModel):
batch_index: An integer specifying the index for an example in the batch.
kp_params: A `KeypointEstimationParams` object with parameters for a
single keypoint class.
keypoint_depth_predictions: (optional) A [batch_size, height, width, 1]
float32 tensor representing the keypoint depth prediction.
Returns:
A tuple of
......@@ -3514,6 +3730,9 @@ class CenterNetMetaArch(model.DetectionModel):
for the specific class.
refined_scores: A [1, num_instances, num_keypoints] float32 tensor with
keypoint scores.
refined_depths: A [1, num_instances, num_keypoints] float32 tensor with
keypoint depths. Returns None if the input keypoint_depth_predictions is
None.
"""
num_keypoints = len(kp_params.keypoint_indices)
......@@ -3521,6 +3740,10 @@ class CenterNetMetaArch(model.DetectionModel):
keypoint_heatmap[batch_index:batch_index+1, ...])
keypoint_offsets = keypoint_offsets[batch_index:batch_index+1, ...]
keypoint_regression = keypoint_regression[batch_index:batch_index+1, ...]
keypoint_depths = None
if keypoint_depth_predictions is not None:
keypoint_depths = keypoint_depth_predictions[batch_index:batch_index + 1,
...]
y_indices = y_indices[batch_index:batch_index+1, ...]
x_indices = x_indices[batch_index:batch_index+1, ...]
boxes_slice = boxes[batch_index:batch_index+1, ...]
......@@ -3536,26 +3759,33 @@ class CenterNetMetaArch(model.DetectionModel):
# The shape of keypoint_candidates and keypoint_scores is:
# [1, num_candidates_per_keypoint, num_keypoints, 2] and
# [1, num_candidates_per_keypoint, num_keypoints], respectively.
keypoint_candidates, keypoint_scores, num_keypoint_candidates = (
prediction_tensors_to_keypoint_candidates(
keypoint_heatmap, keypoint_offsets,
keypoint_score_threshold=(
kp_params.keypoint_candidate_score_threshold),
max_pool_kernel_size=kp_params.peak_max_pool_kernel_size,
max_candidates=kp_params.num_candidates_per_keypoint))
(keypoint_candidates, keypoint_scores, num_keypoint_candidates,
keypoint_depth_candidates) = (
prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_offsets,
keypoint_score_threshold=(
kp_params.keypoint_candidate_score_threshold),
max_pool_kernel_size=kp_params.peak_max_pool_kernel_size,
max_candidates=kp_params.num_candidates_per_keypoint,
keypoint_depths=keypoint_depths))
# Get the refined keypoints and scores, of shape
# [1, num_instances, num_keypoints, 2] and
# [1, num_instances, num_keypoints], respectively.
refined_keypoints, refined_scores = refine_keypoints(
regressed_keypoints_for_objects, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=boxes_slice,
(refined_keypoints, refined_scores, refined_depths) = refine_keypoints(
regressed_keypoints_for_objects,
keypoint_candidates,
keypoint_scores,
num_keypoint_candidates,
bboxes=boxes_slice,
unmatched_keypoint_score=kp_params.unmatched_keypoint_score,
box_scale=kp_params.box_scale,
candidate_search_scale=kp_params.candidate_search_scale,
candidate_ranking_mode=kp_params.candidate_ranking_mode)
candidate_ranking_mode=kp_params.candidate_ranking_mode,
keypoint_depth_candidates=keypoint_depth_candidates)
return refined_keypoints, refined_scores
return refined_keypoints, refined_scores, refined_depths
def regularization_losses(self):
return []
......
......@@ -695,7 +695,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
keypoint_heatmap_offsets = tf.constant(
keypoint_heatmap_offsets_np, dtype=tf.float32)
keypoint_cands, keypoint_scores, num_keypoint_candidates = (
(keypoint_cands, keypoint_scores, num_keypoint_candidates, _) = (
cnma.prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_heatmap_offsets,
......@@ -780,7 +780,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
keypoint_regression = tf.constant(
keypoint_regression_np, dtype=tf.float32)
(keypoint_cands, keypoint_scores) = (
(keypoint_cands, keypoint_scores, _) = (
cnma.prediction_to_single_instance_keypoints(
object_heatmap,
keypoint_heatmap,
......@@ -839,7 +839,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
keypoint_heatmap_offsets = tf.constant(
keypoint_heatmap_offsets_np, dtype=tf.float32)
keypoint_cands, keypoint_scores, num_keypoint_candidates = (
(keypoint_cands, keypoint_scores, num_keypoint_candidates, _) = (
cnma.prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_heatmap_offsets,
......@@ -880,6 +880,89 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
np.testing.assert_array_equal(expected_num_keypoint_candidates,
num_keypoint_candidates)
@parameterized.parameters({'per_keypoint_depth': True},
{'per_keypoint_depth': False})
def test_keypoint_candidate_prediction_depth(self, per_keypoint_depth):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0
keypoint_heatmap_np[0, 2, 1, 0] = 0.7
keypoint_heatmap_np[0, 1, 1, 0] = 0.6
keypoint_heatmap_np[0, 0, 2, 1] = 0.7
keypoint_heatmap_np[0, 1, 1, 1] = 0.3 # Filtered by low score.
keypoint_heatmap_np[0, 2, 2, 1] = 0.2
keypoint_heatmap_np[1, 1, 0, 0] = 0.6
keypoint_heatmap_np[1, 2, 1, 0] = 0.5
keypoint_heatmap_np[1, 0, 0, 0] = 0.4
keypoint_heatmap_np[1, 0, 0, 1] = 1.0
keypoint_heatmap_np[1, 0, 1, 1] = 0.9
keypoint_heatmap_np[1, 2, 0, 1] = 0.8
if per_keypoint_depth:
keypoint_depths_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_depths_np[0, 0, 0, 0] = -1.5
keypoint_depths_np[0, 2, 1, 0] = -1.0
keypoint_depths_np[0, 0, 2, 1] = 1.5
else:
keypoint_depths_np = np.zeros((2, 3, 3, 1), dtype=np.float32)
keypoint_depths_np[0, 0, 0, 0] = -1.5
keypoint_depths_np[0, 2, 1, 0] = -1.0
keypoint_depths_np[0, 0, 2, 0] = 1.5
keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25]
keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5]
keypoint_heatmap_offsets_np[0, 1, 1] = [0.0, 0.0]
keypoint_heatmap_offsets_np[0, 0, 2] = [1.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 2] = [1.0, 1.0]
keypoint_heatmap_offsets_np[1, 1, 0] = [0.25, 0.5]
keypoint_heatmap_offsets_np[1, 2, 1] = [0.5, 0.0]
keypoint_heatmap_offsets_np[1, 0, 0] = [0.0, -0.5]
keypoint_heatmap_offsets_np[1, 0, 1] = [0.5, -0.5]
keypoint_heatmap_offsets_np[1, 2, 0] = [-1.0, -0.5]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_heatmap_offsets = tf.constant(
keypoint_heatmap_offsets_np, dtype=tf.float32)
keypoint_depths = tf.constant(keypoint_depths_np, dtype=tf.float32)
(keypoint_cands, keypoint_scores, num_keypoint_candidates,
keypoint_depths) = (
cnma.prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_heatmap_offsets,
keypoint_score_threshold=0.5,
max_pool_kernel_size=1,
max_candidates=2,
keypoint_depths=keypoint_depths))
return (keypoint_cands, keypoint_scores, num_keypoint_candidates,
keypoint_depths)
(_, keypoint_scores, _, keypoint_depths) = self.execute(graph_fn, [])
expected_keypoint_scores = [
[ # Example 0.
[1.0, 0.7], # Keypoint 1.
[0.7, 0.3], # Keypoint 2.
],
[ # Example 1.
[0.6, 1.0], # Keypoint 1.
[0.5, 0.9], # Keypoint 2.
],
]
expected_keypoint_depths = [
[
[-1.5, 1.5],
[-1.0, 0.0],
],
[
[0., 0.],
[0., 0.],
],
]
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
np.testing.assert_allclose(expected_keypoint_depths, keypoint_depths)
def test_regressed_keypoints_at_object_centers(self):
batch_size = 2
num_keypoints = 5
......@@ -985,11 +1068,15 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
num_keypoint_candidates = tf.constant(num_keypoints_candidates_np,
dtype=tf.int32)
refined_keypoints, refined_scores = cnma.refine_keypoints(
regressed_keypoints, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=None,
(refined_keypoints, refined_scores, _) = cnma.refine_keypoints(
regressed_keypoints,
keypoint_candidates,
keypoint_scores,
num_keypoint_candidates,
bboxes=None,
unmatched_keypoint_score=unmatched_keypoint_score,
box_scale=1.2, candidate_search_scale=0.3,
box_scale=1.2,
candidate_search_scale=0.3,
candidate_ranking_mode=candidate_ranking_mode)
return refined_keypoints, refined_scores
......@@ -1057,7 +1144,8 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
np.testing.assert_allclose(expected_refined_keypoints, refined_keypoints)
np.testing.assert_allclose(expected_refined_scores, refined_scores)
def test_refine_keypoints_with_bboxes(self):
@parameterized.parameters({'predict_depth': True}, {'predict_depth': False})
def test_refine_keypoints_with_bboxes(self, predict_depth):
regressed_keypoints_np = np.array(
[
# Example 0.
......@@ -1096,7 +1184,22 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
[0.7, 0.4, 0.0], # Candidate 0.
[0.6, 0.1, 0.0], # Candidate 1.
]
], dtype=np.float32)
],
dtype=np.float32)
keypoint_depths_np = np.array(
[
# Example 0.
[
[-0.8, -0.9, -1.0], # Candidate 0.
[-0.6, -0.1, -0.9], # Candidate 1.
],
# Example 1.
[
[-0.7, -0.4, -0.0], # Candidate 0.
[-0.6, -0.1, -0.0], # Candidate 1.
]
],
dtype=np.float32)
num_keypoints_candidates_np = np.array(
[
# Example 0.
......@@ -1125,17 +1228,28 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
keypoint_candidates = tf.constant(
keypoint_candidates_np, dtype=tf.float32)
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
if predict_depth:
keypoint_depths = tf.constant(keypoint_depths_np, dtype=tf.float32)
else:
keypoint_depths = None
num_keypoint_candidates = tf.constant(num_keypoints_candidates_np,
dtype=tf.int32)
bboxes = tf.constant(bboxes_np, dtype=tf.float32)
refined_keypoints, refined_scores = cnma.refine_keypoints(
regressed_keypoints, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=bboxes,
unmatched_keypoint_score=unmatched_keypoint_score,
box_scale=1.0, candidate_search_scale=0.3)
return refined_keypoints, refined_scores
refined_keypoints, refined_scores = self.execute(graph_fn, [])
(refined_keypoints, refined_scores,
refined_depths) = cnma.refine_keypoints(
regressed_keypoints,
keypoint_candidates,
keypoint_scores,
num_keypoint_candidates,
bboxes=bboxes,
unmatched_keypoint_score=unmatched_keypoint_score,
box_scale=1.0,
candidate_search_scale=0.3,
keypoint_depth_candidates=keypoint_depths)
if predict_depth:
return refined_keypoints, refined_scores, refined_depths
else:
return refined_keypoints, refined_scores
expected_refined_keypoints = np.array(
[
......@@ -1166,8 +1280,17 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
],
], dtype=np.float32)
np.testing.assert_allclose(expected_refined_keypoints, refined_keypoints)
np.testing.assert_allclose(expected_refined_scores, refined_scores)
if predict_depth:
refined_keypoints, refined_scores, refined_depths = self.execute(
graph_fn, [])
expected_refined_depths = np.array([[[-0.8, 0.0, 0.0], [0.0, 0.0, -1.0]],
[[-0.7, -0.1, 0.0], [-0.7, -0.4,
0.0]]])
np.testing.assert_allclose(expected_refined_depths, refined_depths)
else:
refined_keypoints, refined_scores = self.execute(graph_fn, [])
np.testing.assert_allclose(expected_refined_keypoints, refined_keypoints)
np.testing.assert_allclose(expected_refined_scores, refined_scores)
def test_pad_to_full_keypoint_dim(self):
batch_size = 4
......@@ -1296,7 +1419,11 @@ def get_fake_od_params():
scale_loss_weight=0.1)
def get_fake_kp_params(num_candidates_per_keypoint=100):
def get_fake_kp_params(num_candidates_per_keypoint=100,
per_keypoint_offset=False,
predict_depth=False,
per_keypoint_depth=False,
peak_radius=0):
"""Returns the fake keypoint estimation parameter namedtuple."""
return cnma.KeypointEstimationParams(
task_name=_TASK_NAME,
......@@ -1306,7 +1433,11 @@ def get_fake_kp_params(num_candidates_per_keypoint=100):
classification_loss=losses.WeightedSigmoidClassificationLoss(),
localization_loss=losses.L1LocalizationLoss(),
keypoint_candidate_score_threshold=0.1,
num_candidates_per_keypoint=num_candidates_per_keypoint)
num_candidates_per_keypoint=num_candidates_per_keypoint,
per_keypoint_offset=per_keypoint_offset,
predict_depth=predict_depth,
per_keypoint_depth=per_keypoint_depth,
offset_peak_radius=peak_radius)
def get_fake_mask_params():
......@@ -1353,7 +1484,11 @@ def build_center_net_meta_arch(build_resnet=False,
num_classes=_NUM_CLASSES,
max_box_predictions=5,
apply_non_max_suppression=False,
detection_only=False):
detection_only=False,
per_keypoint_offset=False,
predict_depth=False,
per_keypoint_depth=False,
peak_radius=0):
"""Builds the CenterNet meta architecture."""
if build_resnet:
feature_extractor = (
......@@ -1407,7 +1542,10 @@ def build_center_net_meta_arch(build_resnet=False,
object_center_params=get_fake_center_params(max_box_predictions),
object_detection_params=get_fake_od_params(),
keypoint_params_dict={
_TASK_NAME: get_fake_kp_params(num_candidates_per_keypoint)
_TASK_NAME:
get_fake_kp_params(num_candidates_per_keypoint,
per_keypoint_offset, predict_depth,
per_keypoint_depth, peak_radius)
},
non_max_suppression_fn=non_max_suppression_fn)
else:
......@@ -1992,6 +2130,84 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllEqual([1, 1, num_keypoints],
detections['detection_keypoint_scores'].shape)
@parameterized.parameters(
{'per_keypoint_depth': False},
{'per_keypoint_depth': True},
)
def test_postprocess_single_class_depth(self, per_keypoint_depth):
"""Test the postprocess function."""
model = build_center_net_meta_arch(
num_classes=1,
per_keypoint_offset=per_keypoint_depth,
predict_depth=True,
per_keypoint_depth=per_keypoint_depth)
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 1), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32)
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(1)
class_probs[0] = _logit(0.75)
class_center[0, 16, 16] = class_probs
height_width[0, 16, 16] = [5, 10]
offset[0, 16, 16] = [.25, .5]
keypoint_regression[0, 16, 16] = [-1., -1., -1., 1., 1., -1., 1., 1.]
keypoint_heatmaps[0, 14, 14, 0] = _logit(0.9)
keypoint_heatmaps[0, 14, 18, 1] = _logit(0.9)
keypoint_heatmaps[0, 18, 14, 2] = _logit(0.9)
keypoint_heatmaps[0, 18, 18, 3] = _logit(0.05) # Note the low score.
if per_keypoint_depth:
keypoint_depth = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32)
keypoint_depth[0, 14, 14, 0] = -1.0
keypoint_depth[0, 14, 18, 1] = -1.1
keypoint_depth[0, 18, 14, 2] = -1.2
keypoint_depth[0, 18, 18, 3] = -1.3
else:
keypoint_depth = np.zeros((1, 32, 32, 1), dtype=np.float32)
keypoint_depth[0, 14, 14, 0] = -1.0
keypoint_depth[0, 14, 18, 0] = -1.1
keypoint_depth[0, 18, 14, 0] = -1.2
keypoint_depth[0, 18, 18, 0] = -1.3
class_center = tf.constant(class_center)
height_width = tf.constant(height_width)
offset = tf.constant(offset)
keypoint_heatmaps = tf.constant(keypoint_heatmaps, dtype=tf.float32)
keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
keypoint_depth = tf.constant(keypoint_depth, dtype=tf.float32)
prediction_dict = {
cnma.OBJECT_CENTER: [class_center],
cnma.BOX_SCALE: [height_width],
cnma.BOX_OFFSET: [offset],
cnma.get_keypoint_name(_TASK_NAME,
cnma.KEYPOINT_HEATMAP): [keypoint_heatmaps],
cnma.get_keypoint_name(_TASK_NAME,
cnma.KEYPOINT_OFFSET): [keypoint_offsets],
cnma.get_keypoint_name(_TASK_NAME,
cnma.KEYPOINT_REGRESSION): [keypoint_regression],
cnma.get_keypoint_name(_TASK_NAME,
cnma.KEYPOINT_DEPTH): [keypoint_depth]
}
def graph_fn():
detections = model.postprocess(prediction_dict,
tf.constant([[128, 128, 3]]))
return detections
detections = self.execute_cpu(graph_fn, [])
self.assertAllClose(detections['detection_keypoint_depths'][0, 0],
np.array([-1.0, -1.1, -1.2, 0.0]))
self.assertAllClose(detections['detection_keypoint_scores'][0, 0],
np.array([0.9, 0.9, 0.9, 0.1]))
def test_get_instance_indices(self):
classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
num_detections = tf.constant([1, 3], dtype=tf.int32)
......@@ -2003,7 +2219,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllEqual(valid_indices.numpy(), [0, 2])
def get_fake_prediction_dict(input_height, input_width, stride):
def get_fake_prediction_dict(input_height,
input_width,
stride,
per_keypoint_depth=False):
"""Prepares the fake prediction dictionary."""
output_height = input_height // stride
output_width = input_width // stride
......@@ -2038,6 +2257,11 @@ def get_fake_prediction_dict(input_height, input_width, stride):
dtype=np.float32)
keypoint_offset[0, 2, 4] = 0.2, 0.4
keypoint_depth = np.zeros((2, output_height, output_width,
_NUM_KEYPOINTS if per_keypoint_depth else 1),
dtype=np.float32)
keypoint_depth[0, 2, 4] = 3.0
keypoint_regression = np.zeros(
(2, output_height, output_width, 2 * _NUM_KEYPOINTS), dtype=np.float32)
keypoint_regression[0, 2, 4] = 0.0, 0.0, 0.2, 0.4, 0.0, 0.0, 0.2, 0.4
......@@ -2073,14 +2297,10 @@ def get_fake_prediction_dict(input_height, input_width, stride):
tf.constant(object_center),
tf.constant(object_center)
],
cnma.BOX_SCALE: [
tf.constant(object_scale),
tf.constant(object_scale)
],
cnma.BOX_OFFSET: [
tf.constant(object_offset),
tf.constant(object_offset)
],
cnma.BOX_SCALE: [tf.constant(object_scale),
tf.constant(object_scale)],
cnma.BOX_OFFSET: [tf.constant(object_offset),
tf.constant(object_offset)],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP): [
tf.constant(keypoint_heatmap),
tf.constant(keypoint_heatmap)
......@@ -2093,6 +2313,10 @@ def get_fake_prediction_dict(input_height, input_width, stride):
tf.constant(keypoint_regression),
tf.constant(keypoint_regression)
],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_DEPTH): [
tf.constant(keypoint_depth),
tf.constant(keypoint_depth)
],
cnma.SEGMENTATION_HEATMAP: [
tf.constant(mask_heatmap),
tf.constant(mask_heatmap)
......@@ -2117,7 +2341,10 @@ def get_fake_prediction_dict(input_height, input_width, stride):
return prediction_dict
def get_fake_groundtruth_dict(input_height, input_width, stride):
def get_fake_groundtruth_dict(input_height,
input_width,
stride,
has_depth=False):
"""Prepares the fake groundtruth dictionary."""
# A small box with center at (0.55, 0.55).
boxes = [
......@@ -2146,6 +2373,26 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
axis=2),
multiples=[1, 1, 2]),
]
if has_depth:
keypoint_depths = [
tf.constant([[float('nan'), 3.0,
float('nan'), 3.0, 0.55, 0.0]]),
tf.constant([[float('nan'), 0.55,
float('nan'), 0.55, 0.55, 0.0]])
]
keypoint_depth_weights = [
tf.constant([[1.0, 1.0, 1.0, 1.0, 0.0, 0.0]]),
tf.constant([[1.0, 1.0, 1.0, 1.0, 0.0, 0.0]])
]
else:
keypoint_depths = [
tf.constant([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]),
tf.constant([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
]
keypoint_depth_weights = [
tf.constant([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]),
tf.constant([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
]
labeled_classes = [
tf.one_hot([1], depth=_NUM_CLASSES) + tf.one_hot([2], depth=_NUM_CLASSES),
tf.one_hot([0], depth=_NUM_CLASSES) + tf.one_hot([1], depth=_NUM_CLASSES),
......@@ -2187,11 +2434,12 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
fields.BoxListFields.weights: weights,
fields.BoxListFields.classes: classes,
fields.BoxListFields.keypoints: keypoints,
fields.BoxListFields.keypoint_depths: keypoint_depths,
fields.BoxListFields.keypoint_depth_weights: keypoint_depth_weights,
fields.BoxListFields.masks: masks,
fields.BoxListFields.densepose_num_points: densepose_num_points,
fields.BoxListFields.densepose_part_ids: densepose_part_ids,
fields.BoxListFields.densepose_surface_coords:
densepose_surface_coords,
fields.BoxListFields.densepose_surface_coords: densepose_surface_coords,
fields.BoxListFields.track_ids: track_ids,
fields.BoxListFields.temporal_offsets: temporal_offsets,
fields.BoxListFields.track_match_flags: track_match_flags,
......@@ -2201,7 +2449,7 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaComputeLossTest(test_case.TestCase):
class CenterNetMetaComputeLossTest(test_case.TestCase, parameterized.TestCase):
"""Test for CenterNet loss compuation related functions."""
def setUp(self):
......@@ -2328,6 +2576,45 @@ class CenterNetMetaComputeLossTest(test_case.TestCase):
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
@parameterized.parameters(
{'per_keypoint_depth': False},
{'per_keypoint_depth': True},
)
def test_compute_kp_depth_loss(self, per_keypoint_depth):
prediction_dict = get_fake_prediction_dict(
self.input_height,
self.input_width,
self.stride,
per_keypoint_depth=per_keypoint_depth)
model = build_center_net_meta_arch(
num_classes=1,
per_keypoint_offset=per_keypoint_depth,
predict_depth=True,
per_keypoint_depth=per_keypoint_depth,
peak_radius=1 if per_keypoint_depth else 0)
model._groundtruth_lists = get_fake_groundtruth_dict(
self.input_height, self.input_width, self.stride, has_depth=True)
def graph_fn():
loss = model._compute_kp_depth_loss(
input_height=self.input_height,
input_width=self.input_width,
task_name=_TASK_NAME,
depth_predictions=prediction_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_DEPTH)],
localization_loss_fn=self.localization_loss_fn)
return loss
loss = self.execute(graph_fn, [])
if per_keypoint_depth:
# The loss is computed on a disk with radius 1 but only the center pixel
# has the accurate prediction. The final loss is (4 * |3-0|) / 5 = 2.4
self.assertAlmostEqual(2.4, loss, delta=1e-4)
else:
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
def test_compute_track_embedding_loss(self):
default_fc = self.model.track_reid_classification_net
# Initialize the kernel to extreme values so that the classification score
......
......@@ -165,6 +165,21 @@ message CenterNet {
// out_height, out_width, 2 * num_keypoints] (recommended when the
// offset_peak_radius is not zero).
optional bool per_keypoint_offset = 18 [default = false];
// Indicates whether to predict the depth of each keypoint. Note that this
// is only supported in the single-class keypoint task.
optional bool predict_depth = 19 [default = false];
// Indicates whether to predict depths for each keypoint channel
// separately. If set False, the output depth target has the shape
// [batch_size, out_height, out_width, 1]. If set True, the output depth
// target has the shape [batch_size, out_height, out_width,
// num_keypoints]. It is recommended to set this field and
// "per_keypoint_offset" to True at the same time.
optional bool per_keypoint_depth = 20 [default = false];
// The weight of the keypoint depth loss.
optional float keypoint_depth_loss_weight = 21 [default = 1.0];
}
repeated KeypointEstimation keypoint_estimation_task = 7;
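A minimal pipeline-config fragment enabling the new fields (the values are illustrative and mirror the builder test above):

keypoint_estimation_task {
  # Other keypoint estimation fields elided.
  predict_depth: true
  per_keypoint_depth: true
  keypoint_depth_loss_weight: 0.3
}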
......@@ -278,7 +293,6 @@ message CenterNet {
// from CenterNet. Use this optional parameter to apply traditional non max
// suppression and score thresholding.
optional PostProcessing post_processing = 24;
}
message CenterNetFeatureExtractor {
......