Unverified Commit 52bb4ab1 authored by vivek rathod, committed by GitHub

Merged commit includes the following changes: (#8803)



320117767  by ronnyvotel:

    DensePose postprocessing implementation.

--
320065853  by ronnyvotel:

    Updating how masks are reframed so that reframing works on both float and uint8 masks.

--
320061717  by yuhuic:

    Updated CenterNet restore_from_objects to allow the model to load the
    checkpoints saved during training.

--
319835172  by ronnyvotel:

    Updating how the DensePose UV Symmetries MAT file path is constructed and loaded.

--
319834678  by ronnyvotel:

    First update to CenterNetMetaArch for DensePose. Adding prediction and loss functionality.

--
319810261  by rathodv:

    Create a setup.py file to simplify installation.

    Usage:
    "python object_detection/packages/tf1/setup.py install" for TF1.
    "python object_detection/packages/tf2/setup.py install" for TF2.

    or to create source distribution
    "python object_detection/packages/tf1/setup.py sdist" for TF1.
    "python object_detection/packages/tf2/setup.py sdist" for TF2.

--
319803041  by sbeery:

    Updating documentation for export

--
319688087  by rathodv:

    Update as_matrix() to to_numpy() to avoid failures with Python 3.6.
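
    The pandas change in a nutshell (illustrative snippet; `.as_matrix()` was
    deprecated and later removed from pandas, while `.to_numpy()` is the
    supported accessor):

        import pandas as pd

        s = pd.Series([0.1, 0.2, 0.3])
        arr = s.to_numpy()  # replaces the removed s.as_matrix()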

--
319686183  by vighneshb:

    Require tpu_name when use_tpu is set.

--
319613327  by aom:

    EfficientDet-style Data Augmentation.

--
319572180  by rathodv:

    Add TF2 SSD FPN (a.k.a. RetinaNet) configs.

--
319553823  by rathodv:

    Internal Change.

--

PiperOrigin-RevId: 320117767
Co-authored-by: TF Object Detection Team <no-reply@google.com>
parent 3b56ba8d
......@@ -17,9 +17,8 @@
"""Tests for box_predictor_builder."""
import unittest
import mock
from unittest import mock # pylint: disable=g-importing-member
import tensorflow.compat.v1 as tf
from google.protobuf import text_format
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
......
......@@ -14,7 +14,7 @@
# ==============================================================================
"""Tests for graph_rewriter_builder."""
import unittest
import mock
from unittest import mock # pylint: disable=g-importing-member
import tensorflow.compat.v1 as tf
import tf_slim as slim
......
......@@ -417,4 +417,12 @@ def build(preprocessor_step_config):
'num_scales': config.num_scales
}
if step_type == 'random_scale_crop_and_pad_to_square':
config = preprocessor_step_config.random_scale_crop_and_pad_to_square
return preprocessor.random_scale_crop_and_pad_to_square, {
'scale_min': config.scale_min,
'scale_max': config.scale_max,
'output_size': config.output_size,
}
raise ValueError('Unknown preprocessing step.')
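
# Hedged sketch of exercising the builder above with a text-format proto.
# The message and field names mirror the keys this builder reads; the rest
# is illustrative.
from google.protobuf import text_format
from object_detection.builders import preprocessor_builder
from object_detection.protos import preprocessor_pb2

step = text_format.Parse(
    """
    random_scale_crop_and_pad_to_square {
      scale_min: 0.1
      scale_max: 2.0
      output_size: 512
    }
    """, preprocessor_pb2.PreprocessingStep())
fn, kwargs = preprocessor_builder.build(step)  # -> (function, keyword args)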
......@@ -42,9 +42,6 @@ PART_NAMES = [
b'left_face',
]
_SRC_PATH = ('google3/third_party/tensorflow_models/object_detection/'
'dataset_tools/densepose')
def scale(dp_surface_coords, y_scale, x_scale, scope=None):
"""Scales DensePose coordinates in y and x dimensions.
......@@ -266,10 +263,14 @@ class DensePoseHorizontalFlip(object):
def __init__(self):
"""Constructor."""
uv_symmetry_transforms_path = os.path.join(
tf.resource_loader.get_data_files_path(), '..', 'dataset_tools',
'densepose', 'UV_symmetry_transforms.mat')
data = scipy.io.loadmat(uv_symmetry_transforms_path)
path = os.path.dirname(os.path.abspath(__file__))
uv_symmetry_transforms_path = tf.resource_loader.get_path_to_datafile(
os.path.join(path, '..', 'dataset_tools', 'densepose',
'UV_symmetry_transforms.mat'))
tf.logging.info('Loading DensePose symmetry transforms file from {}'.format(
uv_symmetry_transforms_path))
with tf.io.gfile.GFile(uv_symmetry_transforms_path, 'rb') as f:
data = scipy.io.loadmat(f)
# Create lookup maps which indicate how a VU coordinate changes after a
# horizontal flip.
......
......@@ -102,7 +102,7 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
Args:
field: a string key, options are
fields.BoxListFields.{boxes,classes,masks,keypoints,
keypoint_visibilities} or
        keypoint_visibilities, densepose_*} or
fields.InputDataFields.is_annotated.
Returns:
......@@ -123,7 +123,7 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
Args:
field: a string key, options are
fields.BoxListFields.{boxes,classes,masks,keypoints,
keypoint_visibilities} or
keypoint_visibilities, densepose_*} or
fields.InputDataFields.is_annotated.
Returns:
......@@ -288,19 +288,23 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
"""
pass
def provide_groundtruth(self,
groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list=None,
groundtruth_keypoints_list=None,
groundtruth_keypoint_visibilities_list=None,
groundtruth_weights_list=None,
groundtruth_confidences_list=None,
groundtruth_is_crowd_list=None,
groundtruth_group_of_list=None,
groundtruth_area_list=None,
is_annotated_list=None,
groundtruth_labeled_classes=None):
def provide_groundtruth(
self,
groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list=None,
groundtruth_keypoints_list=None,
groundtruth_keypoint_visibilities_list=None,
groundtruth_dp_num_points_list=None,
groundtruth_dp_part_ids_list=None,
groundtruth_dp_surface_coords_list=None,
groundtruth_weights_list=None,
groundtruth_confidences_list=None,
groundtruth_is_crowd_list=None,
groundtruth_group_of_list=None,
groundtruth_area_list=None,
is_annotated_list=None,
groundtruth_labeled_classes=None):
"""Provide groundtruth tensors.
Args:
......@@ -324,6 +328,15 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
`groundtruth_keypoint_visibilities_list`).
groundtruth_keypoint_visibilities_list: a list of 3-D tf.bool tensors
of shape [num_boxes, num_keypoints] containing keypoint visibilities.
groundtruth_dp_num_points_list: a list of 1-D tf.int32 tensors of shape
[num_boxes] containing the number of DensePose sampled points.
groundtruth_dp_part_ids_list: a list of 2-D tf.int32 tensors of shape
[num_boxes, max_sampled_points] containing the DensePose part ids
(0-indexed) for each sampled point. Note that there may be padding.
groundtruth_dp_surface_coords_list: a list of 3-D tf.float32 tensors of
shape [num_boxes, max_sampled_points, 4] containing the DensePose
surface coordinates for each sampled point. Note that there may be
padding.
groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape
[num_boxes] containing weights for groundtruth boxes.
groundtruth_confidences_list: A list of 2-D tf.float32 tensors of shape
......@@ -361,6 +374,18 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
self._groundtruth_lists[
fields.BoxListFields.keypoint_visibilities] = (
groundtruth_keypoint_visibilities_list)
if groundtruth_dp_num_points_list:
self._groundtruth_lists[
fields.BoxListFields.densepose_num_points] = (
groundtruth_dp_num_points_list)
if groundtruth_dp_part_ids_list:
self._groundtruth_lists[
fields.BoxListFields.densepose_part_ids] = (
groundtruth_dp_part_ids_list)
if groundtruth_dp_surface_coords_list:
self._groundtruth_lists[
fields.BoxListFields.densepose_surface_coords] = (
groundtruth_dp_surface_coords_list)
if groundtruth_is_crowd_list:
self._groundtruth_lists[
fields.BoxListFields.is_crowd] = groundtruth_is_crowd_list
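
# Hedged usage sketch for the new DensePose groundtruth arguments. Shapes
# follow the Args documented above; `model`, `boxes` and `classes` are
# assumed to exist. One image, two boxes, padded to max_sampled_points=3.
dp_num_points = [tf.constant([3, 1], dtype=tf.int32)]
dp_part_ids = [tf.constant([[0, 5, 5], [12, 0, 0]], dtype=tf.int32)]
dp_surface_coords = [tf.zeros([2, 3, 4], dtype=tf.float32)]
model.provide_groundtruth(
    groundtruth_boxes_list=[boxes],
    groundtruth_classes_list=[classes],
    groundtruth_dp_num_points_list=dp_num_points,
    groundtruth_dp_part_ids_list=dp_part_ids,
    groundtruth_dp_surface_coords_list=dp_surface_coords)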
......
......@@ -3984,7 +3984,7 @@ def random_square_crop_by_scale(image, boxes, labels, label_weights,
Args:
image: rank 3 float32 tensor containing 1 image ->
[height, width,channels].
[height, width, channels].
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
Boxes are in normalized form meaning their coordinates vary
between [0, 1]. Each row is in the form of [ymin, xmin, ymax, xmax].
......@@ -4128,6 +4128,131 @@ def random_square_crop_by_scale(image, boxes, labels, label_weights,
return return_values
def random_scale_crop_and_pad_to_square(
image,
boxes,
labels,
label_weights,
masks=None,
keypoints=None,
scale_min=0.1,
scale_max=2.0,
output_size=512,
resize_method=tf.image.ResizeMethod.BILINEAR,
seed=None):
"""Randomly scale, crop, and then pad an image to fixed square dimensions.
Randomly scale, crop, and then pad an image to the desired square output
dimensions. Specifically, this method first samples a random_scale factor
from a uniform distribution between scale_min and scale_max, and then resizes
  the image such that its maximum dimension is (output_size * random_scale).
Secondly, a square output_size crop is extracted from the resized image
(note, this will only occur when random_scale > 1.0). Lastly, the cropped
region is padded to the desired square output_size, by filling with zeros.
The augmentation is borrowed from [1]
[1]: https://arxiv.org/abs/1911.09070
Args:
image: rank 3 float32 tensor containing 1 image ->
[height, width, channels].
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. Boxes
are in normalized form meaning their coordinates vary between [0, 1]. Each
row is in the form of [ymin, xmin, ymax, xmax]. Boxes on the crop boundary
are clipped to the boundary and boxes falling outside the crop are
ignored.
labels: rank 1 int32 tensor containing the object classes.
label_weights: float32 tensor of shape [num_instances] representing the
weight for each box.
masks: (optional) rank 3 float32 tensor with shape [num_instances, height,
width] containing instance masks. The masks are of the same height, width
as the input `image`.
keypoints: (optional) rank 3 float32 tensor with shape [num_instances,
num_keypoints, 2]. The keypoints are in y-x normalized coordinates.
scale_min: float, the minimum value for the random scale factor.
scale_max: float, the maximum value for the random scale factor.
output_size: int, the desired (square) output image size.
resize_method: tf.image.ResizeMethod, resize method to use when scaling the
input images.
seed: random seed.
Returns:
image: image which is the same rank as input image.
boxes: boxes which is the same rank as input boxes.
Boxes are in normalized form.
labels: new labels.
label_weights: rank 1 float32 tensor with shape [num_instances].
masks: rank 3 float32 tensor with shape [num_instances, height, width]
containing instance masks.
"""
img_shape = tf.shape(image)
input_height, input_width = img_shape[0], img_shape[1]
random_scale = tf.random_uniform([], scale_min, scale_max, seed=seed)
# Compute the scaled height and width from the random scale.
max_input_dim = tf.cast(tf.maximum(input_height, input_width), tf.float32)
input_ar_y = tf.cast(input_height, tf.float32) / max_input_dim
input_ar_x = tf.cast(input_width, tf.float32) / max_input_dim
scaled_height = tf.cast(random_scale * output_size * input_ar_y, tf.int32)
scaled_width = tf.cast(random_scale * output_size * input_ar_x, tf.int32)
# Compute the offsets:
offset_y = tf.cast(scaled_height - output_size, tf.float32)
offset_x = tf.cast(scaled_width - output_size, tf.float32)
offset_y = tf.maximum(0.0, offset_y) * tf.random_uniform([], 0, 1, seed=seed)
offset_x = tf.maximum(0.0, offset_x) * tf.random_uniform([], 0, 1, seed=seed)
offset_y = tf.cast(offset_y, tf.int32)
offset_x = tf.cast(offset_x, tf.int32)
# Scale, crop, and pad the input image.
scaled_image = tf.image.resize_images(
image, [scaled_height, scaled_width], method=resize_method)
scaled_image = scaled_image[offset_y:offset_y + output_size,
offset_x:offset_x + output_size, :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0, output_size,
output_size)
# Update the boxes.
new_window = tf.cast(
tf.stack([offset_y, offset_x,
offset_y + output_size, offset_x + output_size]),
dtype=tf.float32)
new_window /= tf.cast(
tf.stack([scaled_height, scaled_width, scaled_height, scaled_width]),
dtype=tf.float32)
boxlist = box_list.BoxList(boxes)
boxlist = box_list_ops.change_coordinate_frame(boxlist, new_window)
boxlist, indices = box_list_ops.prune_completely_outside_window(
boxlist, [0.0, 0.0, 1.0, 1.0])
boxlist = box_list_ops.clip_to_window(
boxlist, [0.0, 0.0, 1.0, 1.0], filter_nonoverlapping=False)
return_values = [output_image, boxlist.get(),
tf.gather(labels, indices),
tf.gather(label_weights, indices)]
if masks is not None:
new_masks = tf.expand_dims(masks, -1)
new_masks = tf.image.resize_images(
new_masks, [scaled_height, scaled_width], method=resize_method)
new_masks = new_masks[:, offset_y:offset_y + output_size,
offset_x:offset_x + output_size, :]
new_masks = tf.image.pad_to_bounding_box(
new_masks, 0, 0, output_size, output_size)
new_masks = tf.squeeze(new_masks, [-1])
return_values.append(tf.gather(new_masks, indices))
if keypoints is not None:
keypoints = tf.gather(keypoints, indices)
keypoints = keypoint_ops.change_coordinate_frame(keypoints, new_window)
keypoints = keypoint_ops.prune_outside_window(
keypoints, [0.0, 0.0, 1.0, 1.0])
return_values.append(keypoints)
return return_values
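
# Hedged usage sketch (TF1-compat graph mode). The function is defined in
# this file; the tensors below are illustrative.
image = tf.random_uniform([384, 512, 3])
boxes = tf.constant([[0.2, 0.2, 0.4, 0.4]])
labels = tf.constant([1], dtype=tf.int32)
weights = tf.constant([1.0])
(out_image, out_boxes, out_labels,
 out_weights) = random_scale_crop_and_pad_to_square(
     image, boxes, labels, weights,
     scale_min=0.5, scale_max=1.5, output_size=512)
# out_image is always [512, 512, 3]; out_boxes are renormalized to the crop
# window, and boxes falling completely outside the crop are pruned.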
def get_default_func_arg_map(include_label_weights=True,
include_label_confidences=False,
include_multiclass_scores=False,
......@@ -4230,15 +4355,14 @@ def get_default_func_arg_map(include_label_weights=True,
random_adjust_saturation: (fields.InputDataFields.image,),
random_distort_color: (fields.InputDataFields.image,),
random_jitter_boxes: (fields.InputDataFields.groundtruth_boxes,),
random_crop_image: (fields.InputDataFields.image,
fields.InputDataFields.groundtruth_boxes,
fields.InputDataFields.groundtruth_classes,
groundtruth_label_weights,
groundtruth_label_confidences, multiclass_scores,
groundtruth_instance_masks, groundtruth_keypoints,
groundtruth_keypoint_visibilities,
groundtruth_dp_num_points, groundtruth_dp_part_ids,
groundtruth_dp_surface_coords),
random_crop_image:
(fields.InputDataFields.image,
fields.InputDataFields.groundtruth_boxes,
fields.InputDataFields.groundtruth_classes,
groundtruth_label_weights, groundtruth_label_confidences,
multiclass_scores, groundtruth_instance_masks, groundtruth_keypoints,
groundtruth_keypoint_visibilities, groundtruth_dp_num_points,
groundtruth_dp_part_ids, groundtruth_dp_surface_coords),
random_pad_image:
(fields.InputDataFields.image,
fields.InputDataFields.groundtruth_boxes, groundtruth_instance_masks,
......@@ -4361,6 +4485,12 @@ def get_default_func_arg_map(include_label_weights=True,
fields.InputDataFields.groundtruth_classes,
groundtruth_label_weights, groundtruth_instance_masks,
groundtruth_keypoints),
random_scale_crop_and_pad_to_square:
(fields.InputDataFields.image,
fields.InputDataFields.groundtruth_boxes,
fields.InputDataFields.groundtruth_classes,
groundtruth_label_weights, groundtruth_instance_masks,
groundtruth_keypoints),
}
return prep_func_arg_map
......
......@@ -712,76 +712,6 @@ class PreprocessorTest(test_case.TestCase, parameterized.TestCase):
test_masks=True,
test_keypoints=True)
@parameterized.parameters(
{'include_dense_pose': False},
{'include_dense_pose': True}
)
def testRunRandomHorizontalFlipWithMaskAndKeypoints(self, include_dense_pose):
def graph_fn():
preprocess_options = [(preprocessor.random_horizontal_flip, {})]
image_height = 3
image_width = 3
images = tf.random_uniform([1, image_height, image_width, 3])
boxes = self.createTestBoxes()
masks = self.createTestMasks()
keypoints, keypoint_visibilities = self.createTestKeypoints()
dp_num_point, dp_part_ids, dp_surface_coords = self.createTestDensePose()
keypoint_flip_permutation = self.createKeypointFlipPermutation()
tensor_dict = {
fields.InputDataFields.image:
images,
fields.InputDataFields.groundtruth_boxes:
boxes,
fields.InputDataFields.groundtruth_instance_masks:
masks,
fields.InputDataFields.groundtruth_keypoints:
keypoints,
fields.InputDataFields.groundtruth_keypoint_visibilities:
keypoint_visibilities
}
if include_dense_pose:
tensor_dict.update({
fields.InputDataFields.groundtruth_dp_num_points: dp_num_point,
fields.InputDataFields.groundtruth_dp_part_ids: dp_part_ids,
fields.InputDataFields.groundtruth_dp_surface_coords:
dp_surface_coords
})
preprocess_options = [(preprocessor.random_horizontal_flip, {
'keypoint_flip_permutation': keypoint_flip_permutation
})]
preprocessor_arg_map = preprocessor.get_default_func_arg_map(
include_instance_masks=True,
include_keypoints=True,
include_keypoint_visibilities=True,
include_dense_pose=include_dense_pose)
tensor_dict = preprocessor.preprocess(
tensor_dict, preprocess_options, func_arg_map=preprocessor_arg_map)
boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes]
masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
keypoints = tensor_dict[fields.InputDataFields.groundtruth_keypoints]
keypoint_visibilities = tensor_dict[
fields.InputDataFields.groundtruth_keypoint_visibilities]
output_tensors = [boxes, masks, keypoints, keypoint_visibilities]
if include_dense_pose:
dp_num_points = tensor_dict[
fields.InputDataFields.groundtruth_dp_num_points]
dp_part_ids = tensor_dict[
fields.InputDataFields.groundtruth_dp_part_ids]
dp_surface_coords = tensor_dict[
fields.InputDataFields.groundtruth_dp_surface_coords]
output_tensors.extend([dp_num_points, dp_part_ids, dp_surface_coords])
return output_tensors
output_tensors = self.execute_cpu(graph_fn, [])
self.assertIsNotNone(output_tensors[0]) # Boxes.
self.assertIsNotNone(output_tensors[1]) # Masks.
self.assertIsNotNone(output_tensors[2]) # Keypoints
self.assertIsNotNone(output_tensors[3]) # Keypoint Visibilities.
if include_dense_pose:
self.assertIsNotNone(output_tensors[4]) # DensePose Num Points.
self.assertIsNotNone(output_tensors[5]) # DensePose Part IDs.
self.assertIsNotNone(output_tensors[6]) # DensePose Surface Coords
def testRandomVerticalFlip(self):
......@@ -2380,7 +2310,6 @@ class PreprocessorTest(test_case.TestCase, parameterized.TestCase):
@parameterized.parameters(
{'include_dense_pose': False},
{'include_dense_pose': True}
)
def testRandomPadImageWithKeypointsAndMasks(self, include_dense_pose):
def graph_fn():
......@@ -3912,6 +3841,90 @@ class PreprocessorTest(test_case.TestCase, parameterized.TestCase):
size = max(image.shape)
self.assertAlmostEqual(scale * 256.0, size)
self.assertAllClose(image[:, :, 0], masks[0, :, :])
@parameterized.named_parameters(('scale_0_1', 0.1), ('scale_1_0', 1.0),
('scale_2_0', 2.0))
def test_random_scale_crop_and_pad_to_square(self, scale):
def graph_fn():
image = np.random.randn(512, 256, 1)
box_centers = [0.25, 0.5, 0.75]
box_size = 0.1
box_corners = []
box_labels = []
box_label_weights = []
keypoints = []
masks = []
for center_y in box_centers:
for center_x in box_centers:
box_corners.append(
[center_y - box_size / 2.0, center_x - box_size / 2.0,
center_y + box_size / 2.0, center_x + box_size / 2.0])
box_labels.append([1])
box_label_weights.append([1.])
keypoints.append(
[[center_y - box_size / 2.0, center_x - box_size / 2.0],
[center_y + box_size / 2.0, center_x + box_size / 2.0]])
masks.append(image[:, :, 0].reshape(512, 256))
image = tf.constant(image)
boxes = tf.constant(box_corners)
labels = tf.constant(box_labels)
label_weights = tf.constant(box_label_weights)
keypoints = tf.constant(keypoints)
masks = tf.constant(np.stack(masks))
(new_image, new_boxes, _, _, new_masks,
new_keypoints) = preprocessor.random_scale_crop_and_pad_to_square(
image,
boxes,
labels,
label_weights,
masks=masks,
keypoints=keypoints,
scale_min=scale,
scale_max=scale,
output_size=512)
return new_image, new_boxes, new_masks, new_keypoints
image, boxes, masks, keypoints = self.execute_cpu(graph_fn, [])
# Since random_scale_crop_and_pad_to_square may prune and clip boxes,
# we only need to find one of the boxes that was not clipped and check
# that it matches the expected dimensions. Note, assertAlmostEqual(a, b)
# is equivalent to round(a-b, 7) == 0.
any_box_has_correct_size = False
effective_scale_y = int(scale * 512) / 512.0
effective_scale_x = int(scale * 256) / 512.0
expected_size_y = 0.1 * effective_scale_y
expected_size_x = 0.1 * effective_scale_x
for box in boxes:
ymin, xmin, ymax, xmax = box
any_box_has_correct_size |= (
(round(ymin, 7) != 0.0) and (round(xmin, 7) != 0.0) and
(round(ymax, 7) != 1.0) and (round(xmax, 7) != 1.0) and
(round((ymax - ymin) - expected_size_y, 7) == 0.0) and
(round((xmax - xmin) - expected_size_x, 7) == 0.0))
self.assertTrue(any_box_has_correct_size)
# Similar to the approach above where we check for at least one box with the
# expected dimensions, we check for at least one pair of keypoints whose
# distance matches the expected dimensions.
any_keypoint_pair_has_correct_dist = False
for keypoint_pair in keypoints:
ymin, xmin = keypoint_pair[0]
ymax, xmax = keypoint_pair[1]
any_keypoint_pair_has_correct_dist |= (
(round(ymin, 7) != 0.0) and (round(xmin, 7) != 0.0) and
(round(ymax, 7) != 1.0) and (round(xmax, 7) != 1.0) and
(round((ymax - ymin) - expected_size_y, 7) == 0.0) and
(round((xmax - xmin) - expected_size_x, 7) == 0.0))
self.assertTrue(any_keypoint_pair_has_correct_dist)
self.assertAlmostEqual(512.0, image.shape[0])
self.assertAlmostEqual(512.0, image.shape[1])
self.assertAllClose(image[:, :, 0],
masks[0, :, :])
......
......@@ -141,6 +141,8 @@ class DetectionResultFields(object):
for detection boxes in the image including background class.
detection_classes: detection-level class labels.
detection_masks: contains a segmentation mask for each detection box.
detection_surface_coords: contains DensePose surface coordinates for each
box.
detection_boundaries: contains an object boundary for each detection box.
detection_keypoints: contains detection keypoints for each detection box.
detection_keypoint_scores: contains detection keypoint scores.
......@@ -161,6 +163,7 @@ class DetectionResultFields(object):
detection_features = 'detection_features'
detection_classes = 'detection_classes'
detection_masks = 'detection_masks'
detection_surface_coords = 'detection_surface_coords'
detection_boundaries = 'detection_boundaries'
detection_keypoints = 'detection_keypoints'
detection_keypoint_scores = 'detection_keypoint_scores'
......@@ -182,7 +185,11 @@ class BoxListFields(object):
masks: masks per bounding box.
boundaries: boundaries per bounding box.
keypoints: keypoints per bounding box.
keypoint_visibilities: keypoint visibilities per bounding box.
keypoint_heatmaps: keypoint heatmaps per bounding box.
densepose_num_points: number of DensePose points per bounding box.
densepose_part_ids: DensePose part ids per bounding box.
densepose_surface_coords: DensePose surface coordinates per bounding box.
is_crowd: is_crowd annotation per bounding box.
"""
boxes = 'boxes'
......@@ -196,6 +203,9 @@ class BoxListFields(object):
keypoints = 'keypoints'
keypoint_visibilities = 'keypoint_visibilities'
keypoint_heatmaps = 'keypoint_heatmaps'
densepose_num_points = 'densepose_num_points'
densepose_part_ids = 'densepose_part_ids'
densepose_surface_coords = 'densepose_surface_coords'
is_crowd = 'is_crowd'
group_of = 'group_of'
......
......@@ -50,13 +50,16 @@ import io
import itertools
import json
import os
import apache_beam as beam
import numpy as np
import PIL.Image
import six
import tensorflow.compat.v1 as tf
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
class ReKeyDataFn(beam.DoFn):
"""Re-keys tfrecords by sequence_key.
......
......@@ -22,7 +22,7 @@ import datetime
import os
import tempfile
import unittest
import apache_beam as beam
import numpy as np
import six
import tensorflow.compat.v1 as tf
......@@ -31,6 +31,12 @@ from object_detection.dataset_tools.context_rcnn import add_context_to_examples
from object_detection.utils import tf_version
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
@contextlib.contextmanager
def InMemoryTFRecord(entries):
temp = tempfile.NamedTemporaryFile(delete=False)
......
......@@ -39,12 +39,16 @@ import io
import json
import logging
import os
import apache_beam as beam
import numpy as np
import PIL.Image
import tensorflow.compat.v1 as tf
from object_detection.utils import dataset_util
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
class ParseImage(beam.DoFn):
"""A DoFn that parses a COCO-CameraTraps json and emits TFRecords."""
......
......@@ -22,7 +22,6 @@ import os
import tempfile
import unittest
import apache_beam as beam
import numpy as np
from PIL import Image
......@@ -30,6 +29,11 @@ import tensorflow.compat.v1 as tf
from object_detection.dataset_tools.context_rcnn import create_cococameratraps_tfexample_main
from object_detection.utils import tf_version
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class CreateCOCOCameraTrapsTfexampleTest(tf.test.TestCase):
......
......@@ -48,8 +48,11 @@ from __future__ import print_function
import argparse
import os
import threading
import apache_beam as beam
import tensorflow.compat.v1 as tf
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
class GenerateDetectionDataFn(beam.DoFn):
......
......@@ -22,7 +22,6 @@ import contextlib
import os
import tempfile
import unittest
import apache_beam as beam
import numpy as np
import six
import tensorflow.compat.v1 as tf
......@@ -39,6 +38,11 @@ if six.PY2:
else:
mock = unittest.mock
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
class FakeModel(model.DetectionModel):
"""A Fake Detection model with expected output nodes from post-processing."""
......
......@@ -34,7 +34,8 @@ python tensorflow_models/object_detection/export_inference_graph.py \
--input_type tf_example \
--pipeline_config_path path/to/faster_rcnn_model.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory
--output_directory path/to/exported_model_directory \
--additional_output_tensor_names detection_features
python generate_embedding_data.py \
--alsologtostderr \
......@@ -52,11 +53,15 @@ import datetime
import os
import threading
import apache_beam as beam
import numpy as np
import six
import tensorflow.compat.v1 as tf
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
class GenerateEmbeddingDataFn(beam.DoFn):
"""Generates embedding data for camera trap images.
......
......@@ -21,7 +21,6 @@ import contextlib
import os
import tempfile
import unittest
import apache_beam as beam
import numpy as np
import six
import tensorflow.compat.v1 as tf
......@@ -38,6 +37,11 @@ if six.PY2:
else:
mock = unittest.mock
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
class FakeModel(model.DetectionModel):
"""A Fake Detection model with expected output nodes from post-processing."""
......
......@@ -51,25 +51,25 @@ def tf_example_from_annotations_data_frame(annotations_data_frame, label_map,
feature_map = {
standard_fields.TfExampleFields.object_bbox_ymin:
dataset_util.float_list_feature(
filtered_data_frame_boxes.YMin.as_matrix()),
filtered_data_frame_boxes.YMin.to_numpy()),
standard_fields.TfExampleFields.object_bbox_xmin:
dataset_util.float_list_feature(
filtered_data_frame_boxes.XMin.as_matrix()),
filtered_data_frame_boxes.XMin.to_numpy()),
standard_fields.TfExampleFields.object_bbox_ymax:
dataset_util.float_list_feature(
filtered_data_frame_boxes.YMax.as_matrix()),
filtered_data_frame_boxes.YMax.to_numpy()),
standard_fields.TfExampleFields.object_bbox_xmax:
dataset_util.float_list_feature(
filtered_data_frame_boxes.XMax.as_matrix()),
filtered_data_frame_boxes.XMax.to_numpy()),
standard_fields.TfExampleFields.object_class_text:
dataset_util.bytes_list_feature([
six.ensure_binary(label_text)
for label_text in filtered_data_frame_boxes.LabelName.as_matrix()
for label_text in filtered_data_frame_boxes.LabelName.to_numpy()
]),
standard_fields.TfExampleFields.object_class_label:
dataset_util.int64_list_feature(
filtered_data_frame_boxes.LabelName.map(
lambda x: label_map[x]).as_matrix()),
lambda x: label_map[x]).to_numpy()),
standard_fields.TfExampleFields.filename:
dataset_util.bytes_feature(
six.ensure_binary('{}.jpg'.format(image_id))),
......@@ -82,31 +82,31 @@ def tf_example_from_annotations_data_frame(annotations_data_frame, label_map,
if 'IsGroupOf' in filtered_data_frame.columns:
feature_map[standard_fields.TfExampleFields.
object_group_of] = dataset_util.int64_list_feature(
filtered_data_frame_boxes.IsGroupOf.as_matrix().astype(int))
filtered_data_frame_boxes.IsGroupOf.to_numpy().astype(int))
if 'IsOccluded' in filtered_data_frame.columns:
feature_map[standard_fields.TfExampleFields.
object_occluded] = dataset_util.int64_list_feature(
filtered_data_frame_boxes.IsOccluded.as_matrix().astype(
filtered_data_frame_boxes.IsOccluded.to_numpy().astype(
int))
if 'IsTruncated' in filtered_data_frame.columns:
feature_map[standard_fields.TfExampleFields.
object_truncated] = dataset_util.int64_list_feature(
filtered_data_frame_boxes.IsTruncated.as_matrix().astype(
filtered_data_frame_boxes.IsTruncated.to_numpy().astype(
int))
if 'IsDepiction' in filtered_data_frame.columns:
feature_map[standard_fields.TfExampleFields.
object_depiction] = dataset_util.int64_list_feature(
filtered_data_frame_boxes.IsDepiction.as_matrix().astype(
filtered_data_frame_boxes.IsDepiction.to_numpy().astype(
int))
if 'ConfidenceImageLabel' in filtered_data_frame_labels.columns:
feature_map[standard_fields.TfExampleFields.
image_class_label] = dataset_util.int64_list_feature(
filtered_data_frame_labels.LabelName.map(
lambda x: label_map[x]).as_matrix())
lambda x: label_map[x]).to_numpy())
feature_map[standard_fields.TfExampleFields
.image_class_text] = dataset_util.bytes_list_feature([
six.ensure_binary(label_text) for label_text in
filtered_data_frame_labels.LabelName.as_matrix()
filtered_data_frame_labels.LabelName.to_numpy()
]),
return tf.train.Example(features=tf.train.Features(feature=feature_map))
......@@ -552,7 +552,11 @@ def _resize_detection_masks(args):
detection_boxes, detection_masks, image_shape = args
detection_masks_reframed = ops.reframe_box_masks_to_image_masks(
detection_masks, detection_boxes, image_shape[0], image_shape[1])
return tf.cast(tf.greater(detection_masks_reframed, 0.5), tf.uint8)
# If the masks are currently float, binarize them. Otherwise keep them as
# integers, since they have already been thresholded.
if detection_masks_reframed.dtype == tf.float32:
detection_masks_reframed = tf.greater(detection_masks_reframed, 0.5)
return tf.cast(detection_masks_reframed, tf.uint8)
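
# The dtype guard above means float mask probabilities get binarized at 0.5,
# while already-thresholded masks (e.g. uint8 part labels) pass through
# unchanged. Illustrative values:
float_masks = tf.constant([[0.2, 0.7]])              # -> [[0, 1]] after cast
uint8_masks = tf.constant([[0, 3]], dtype=tf.uint8)  # kept as-is: [[0, 3]]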
def _resize_groundtruth_masks(args):
......@@ -570,6 +574,17 @@ def _resize_groundtruth_masks(args):
return tf.cast(tf.squeeze(mask, 3), tf.uint8)
def _resize_surface_coordinate_masks(args):
detection_boxes, surface_coords, image_shape = args
surface_coords_v, surface_coords_u = tf.unstack(surface_coords, axis=-1)
surface_coords_v_reframed = ops.reframe_box_masks_to_image_masks(
surface_coords_v, detection_boxes, image_shape[0], image_shape[1])
surface_coords_u_reframed = ops.reframe_box_masks_to_image_masks(
surface_coords_u, detection_boxes, image_shape[0], image_shape[1])
return tf.stack([surface_coords_v_reframed, surface_coords_u_reframed],
axis=-1)
def _scale_keypoint_to_absolute(args):
keypoints, image_shape = args
return keypoint_ops.scale(keypoints, image_shape[0], image_shape[1])
......@@ -720,6 +735,12 @@ def result_dict_for_batched_example(images,
num_keypoints] bool tensor with keypoint visibilities (Optional).
'groundtruth_labeled_classes': [batch_size, num_classes] int64
tensor of 1-indexed classes. (Optional)
'groundtruth_dp_num_points': [batch_size, max_number_of_boxes] int32
tensor. (Optional)
'groundtruth_dp_part_ids': [batch_size, max_number_of_boxes,
max_sampled_points] int32 tensor. (Optional)
      'groundtruth_dp_surface_coords': [batch_size, max_number_of_boxes,
max_sampled_points, 4] float32 tensor. (Optional)
class_agnostic: Boolean indicating whether the detections are class-agnostic
(i.e. binary). Default False.
scale_to_absolute: Boolean indicating whether boxes and keypoints should be
......@@ -747,12 +768,16 @@ def result_dict_for_batched_example(images,
'detection_scores': [batch_size, max_detections] float32 tensor of scores.
'detection_classes': [batch_size, max_detections] int64 tensor of 1-indexed
classes.
'detection_masks': [batch_size, max_detections, H, W] float32 tensor of
binarized masks, reframed to full image masks. (Optional)
'detection_masks': [batch_size, max_detections, H, W] uint8 tensor of
instance masks, reframed to full image masks. Note that these may be
binarized (e.g. {0, 1}), or may contain 1-indexed part labels. (Optional)
'detection_keypoints': [batch_size, max_detections, num_keypoints, 2]
float32 tensor containing keypoint coordinates. (Optional)
'detection_keypoint_scores': [batch_size, max_detections, num_keypoints]
float32 tensor containing keypoint scores. (Optional)
'detection_surface_coords': [batch_size, max_detection, H, W, 2] float32
tensor with normalized surface coordinates (e.g. DensePose UV
coordinates). (Optional)
'num_detections': [batch_size] int64 tensor containing number of valid
detections.
'groundtruth_boxes': [batch_size, num_boxes, 4] float32 tensor of boxes, in
......@@ -844,14 +869,21 @@ def result_dict_for_batched_example(images,
if detection_fields.detection_masks in detections:
detection_masks = detections[detection_fields.detection_masks]
# TODO(rathodv): This should be done in model's postprocess
# function ideally.
output_dict[detection_fields.detection_masks] = (
shape_utils.static_or_dynamic_map_fn(
_resize_detection_masks,
elems=[detection_boxes, detection_masks,
original_image_spatial_shapes],
dtype=tf.uint8))
if detection_fields.detection_surface_coords in detections:
detection_surface_coords = detections[
detection_fields.detection_surface_coords]
output_dict[detection_fields.detection_surface_coords] = (
shape_utils.static_or_dynamic_map_fn(
_resize_surface_coordinate_masks,
elems=[detection_boxes, detection_surface_coords,
original_image_spatial_shapes],
dtype=tf.float32))
if detection_fields.detection_keypoints in detections:
detection_keypoints = detections[detection_fields.detection_keypoints]
......@@ -1074,3 +1106,8 @@ def evaluator_options_from_eval_config(eval_config):
'recall_upper_bound': (eval_config.recall_upper_bound)
}
return evaluator_options
def has_densepose(eval_dict):
return (fields.DetectionResultFields.detection_masks in eval_dict and
fields.DetectionResultFields.detection_surface_coords in eval_dict)
......@@ -924,13 +924,16 @@ def convert_strided_predictions_to_normalized_keypoints(
def convert_strided_predictions_to_instance_masks(
boxes, classes, masks, stride, mask_height, mask_width,
true_image_shapes, score_threshold=0.5):
boxes, classes, masks, true_image_shapes,
densepose_part_heatmap=None, densepose_surface_coords=None, stride=4,
mask_height=256, mask_width=256, score_threshold=0.5,
densepose_class_index=-1):
"""Converts predicted full-image masks into instance masks.
For each predicted detection box:
* Crop and resize the predicted mask based on the detected bounding box
coordinates and class prediction. Uses bilinear resampling.
* Crop and resize the predicted mask (and optionally DensePose coordinates)
based on the detected bounding box coordinates and class prediction. Uses
bilinear resampling.
* Binarize the mask using the provided score threshold.
Args:
......@@ -940,57 +943,212 @@ def convert_strided_predictions_to_instance_masks(
detected class for each box (0-indexed).
masks: A [batch, output_height, output_width, num_classes] float32
tensor with class probabilities.
true_image_shapes: A tensor of shape [batch, 3] representing the true
shape of the inputs not considering padding.
densepose_part_heatmap: (Optional) A [batch, output_height, output_width,
num_parts] float32 tensor with part scores (i.e. logits).
densepose_surface_coords: (Optional) A [batch, output_height, output_width,
2 * num_parts] float32 tensor with predicted part coordinates (in
vu-format).
stride: The stride in the output space.
mask_height: The desired resized height for instance masks.
mask_width: The desired resized width for instance masks.
true_image_shapes: A tensor of shape [batch, 3] representing the true
shape of the inputs not considering padding.
score_threshold: The threshold at which to convert predicted mask
into foreground pixels.
densepose_class_index: The class index (0-indexed) corresponding to the
class which has DensePose labels (e.g. person class).
Returns:
A [batch_size, max_detections, mask_height, mask_width] uint8 tensor with
predicted foreground mask for each instance. The masks take values in
{0, 1}.
A tuple of masks and surface_coords.
instance_masks: A [batch_size, max_detections, mask_height, mask_width]
uint8 tensor with predicted foreground mask for each
instance. If DensePose tensors are provided, then each pixel value in the
mask encodes the 1-indexed part.
surface_coords: A [batch_size, max_detections, mask_height, mask_width, 2]
float32 tensor with (v, u) coordinates. Note that v, u coordinates are
only defined on instance masks, and the coordinates at each location of
the foreground mask correspond to coordinates on a local part coordinate
system (the specific part can be inferred from the `instance_masks`
      output). If DensePose feature maps are not passed to this function, this
output will be None.
Raises:
ValueError: If one but not both of `densepose_part_heatmap` and
`densepose_surface_coords` is provided.
"""
_, output_height, output_width, _ = (
batch_size, output_height, output_width, _ = (
shape_utils.combined_static_and_dynamic_shape(masks))
input_height = stride * output_height
input_width = stride * output_width
true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
# If necessary, create dummy DensePose tensors to simplify the map function.
densepose_present = True
if ((densepose_part_heatmap is not None) ^
(densepose_surface_coords is not None)):
raise ValueError('To use DensePose, both `densepose_part_heatmap` and '
'`densepose_surface_coords` must be provided')
if densepose_part_heatmap is None and densepose_surface_coords is None:
densepose_present = False
densepose_part_heatmap = tf.zeros(
(batch_size, output_height, output_width, 1), dtype=tf.float32)
densepose_surface_coords = tf.zeros(
(batch_size, output_height, output_width, 2), dtype=tf.float32)
crop_and_threshold_fn = functools.partial(
crop_and_threshold_masks, input_height=input_height,
input_width=input_width, mask_height=mask_height, mask_width=mask_width,
score_threshold=score_threshold,
densepose_class_index=densepose_class_index)
instance_masks, surface_coords = shape_utils.static_or_dynamic_map_fn(
crop_and_threshold_fn,
elems=[boxes, classes, masks, densepose_part_heatmap,
densepose_surface_coords, true_heights, true_widths],
dtype=[tf.uint8, tf.float32],
back_prop=False)
surface_coords = surface_coords if densepose_present else None
return instance_masks, surface_coords
def crop_and_threshold_masks(elems, input_height, input_width, mask_height=256,
mask_width=256, score_threshold=0.5,
densepose_class_index=-1):
"""Crops and thresholds masks based on detection boxes.
Args:
elems: A tuple of
boxes - float32 tensor of shape [max_detections, 4]
classes - int32 tensor of shape [max_detections] (0-indexed)
masks - float32 tensor of shape [output_height, output_width, num_classes]
part_heatmap - float32 tensor of shape [output_height, output_width,
num_parts]
surf_coords - float32 tensor of shape [output_height, output_width,
2 * num_parts]
true_height - scalar int tensor
true_width - scalar int tensor
input_height: Input height to network.
input_width: Input width to network.
mask_height: Height for resizing mask crops.
mask_width: Width for resizing mask crops.
score_threshold: The threshold at which to convert predicted mask
into foreground pixels.
densepose_class_index: scalar int tensor with the class index (0-indexed)
for DensePose.
Returns:
A tuple of
all_instances: A [max_detections, mask_height, mask_width] uint8 tensor
with a predicted foreground mask for each instance. Background is encoded
as 0, and foreground is encoded as a positive integer. Specific part
indices are encoded as 1-indexed parts (for classes that have part
information).
surface_coords: A [max_detections, mask_height, mask_width, 2]
      float32 tensor with (v, u) coordinates for each part.
"""
(boxes, classes, masks, part_heatmap, surf_coords, true_height,
true_width) = elems
# Boxes are in normalized coordinates relative to true image shapes. Convert
# coordinates to be normalized relative to input image shapes (since masks
# may still have padding).
# Then crop and resize each mask.
def crop_and_threshold_masks(args):
"""Crops masks based on detection boxes."""
boxes, classes, masks, true_height, true_width = args
boxlist = box_list.BoxList(boxes)
y_scale = true_height / input_height
x_scale = true_width / input_width
boxlist = box_list_ops.scale(boxlist, y_scale, x_scale)
boxes = boxlist.get()
# Convert masks from [input_height, input_width, num_classes] to
# [num_classes, input_height, input_width, 1].
masks_4d = tf.transpose(masks, perm=[2, 0, 1])[:, :, :, tf.newaxis]
cropped_masks = tf2.image.crop_and_resize(
masks_4d,
boxes=boxes,
box_indices=classes,
crop_size=[mask_height, mask_width],
method='bilinear')
masks_3d = tf.squeeze(cropped_masks, axis=3)
masks_binarized = tf.math.greater_equal(masks_3d, score_threshold)
return tf.cast(masks_binarized, tf.uint8)
boxlist = box_list.BoxList(boxes)
y_scale = true_height / input_height
x_scale = true_width / input_width
boxlist = box_list_ops.scale(boxlist, y_scale, x_scale)
boxes = boxlist.get()
# Convert masks from [output_height, output_width, num_classes] to
# [num_classes, output_height, output_width, 1].
num_classes = tf.shape(masks)[-1]
masks_4d = tf.transpose(masks, perm=[2, 0, 1])[:, :, :, tf.newaxis]
# Tile part and surface coordinate masks for all classes.
part_heatmap_4d = tf.tile(part_heatmap[tf.newaxis, :, :, :],
multiples=[num_classes, 1, 1, 1])
surf_coords_4d = tf.tile(surf_coords[tf.newaxis, :, :, :],
multiples=[num_classes, 1, 1, 1])
feature_maps_concat = tf.concat([masks_4d, part_heatmap_4d, surf_coords_4d],
axis=-1)
# The following tensor has shape
# [max_detections, mask_height, mask_width, 1 + 3 * num_parts].
cropped_masks = tf2.image.crop_and_resize(
feature_maps_concat,
boxes=boxes,
box_indices=classes,
crop_size=[mask_height, mask_width],
method='bilinear')
# Split the cropped masks back into instance masks, part masks, and surface
# coordinates.
num_parts = tf.shape(part_heatmap)[-1]
instance_masks, part_heatmap_cropped, surface_coords_cropped = tf.split(
cropped_masks, [1, num_parts, 2 * num_parts], axis=-1)
# Threshold the instance masks. Resulting tensor has shape
# [max_detections, mask_height, mask_width, 1].
instance_masks_int = tf.cast(
tf.math.greater_equal(instance_masks, score_threshold), dtype=tf.int32)
# Produce a binary mask that is 1.0 only:
# - in the foreground region for an instance
# - in detections corresponding to the DensePose class
det_with_parts = tf.equal(classes, densepose_class_index)
det_with_parts = tf.cast(
tf.reshape(det_with_parts, [-1, 1, 1, 1]), dtype=tf.int32)
instance_masks_with_parts = tf.math.multiply(instance_masks_int,
det_with_parts)
# Similarly, produce a binary mask that holds the foreground masks only for
# instances without parts (i.e. non-DensePose classes).
det_without_parts = 1 - det_with_parts
instance_masks_without_parts = tf.math.multiply(instance_masks_int,
det_without_parts)
# Assemble a tensor that has standard instance segmentation masks for
# non-DensePose classes (with values in [0, 1]), and part segmentation masks
  # for DensePose classes (with values in [0, 1, ..., num_parts]).
part_mask_int_zero_indexed = tf.math.argmax(
part_heatmap_cropped, axis=-1, output_type=tf.int32)[:, :, :, tf.newaxis]
part_mask_int_one_indexed = part_mask_int_zero_indexed + 1
all_instances = (instance_masks_without_parts +
instance_masks_with_parts * part_mask_int_one_indexed)
# Gather the surface coordinates for the parts.
surface_coords_cropped = tf.reshape(
surface_coords_cropped, [-1, mask_height, mask_width, num_parts, 2])
surface_coords = gather_surface_coords_for_parts(surface_coords_cropped,
part_mask_int_zero_indexed)
surface_coords = (
surface_coords * tf.cast(instance_masks_with_parts, tf.float32))
return [tf.squeeze(all_instances, axis=3), surface_coords]
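
# NumPy sanity check of the part-label encoding above (hypothetical values):
# background stays 0, foreground pixels of DensePose-class detections carry
# 1-indexed part ids, and other detections stay binary.
import numpy as np
instance_fg = np.array([[1, 1], [0, 1]])           # thresholded foreground
det_has_parts = 1                                  # DensePose-class detection
part_ids_one_indexed = np.array([[3, 5], [2, 7]])
with_parts = instance_fg * det_has_parts
without_parts = instance_fg * (1 - det_has_parts)
encoded = without_parts + with_parts * part_ids_one_indexed
# encoded == [[3, 5], [0, 7]]; with det_has_parts == 0 it would stay binary.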
def gather_surface_coords_for_parts(surface_coords_cropped,
highest_scoring_part):
"""Gathers the (v, u) coordinates for the highest scoring DensePose parts.
true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
masks_for_image = shape_utils.static_or_dynamic_map_fn(
crop_and_threshold_masks,
elems=[boxes, classes, masks, true_heights, true_widths],
dtype=tf.uint8,
back_prop=False)
masks = tf.stack(masks_for_image, axis=0)
return masks
Args:
surface_coords_cropped: A [max_detections, height, width, num_parts, 2]
float32 tensor with (v, u) surface coordinates.
highest_scoring_part: A [max_detections, height, width] integer tensor with
the highest scoring part (0-indexed) indices for each location.
Returns:
A [max_detections, height, width, 2] float32 tensor with the (v, u)
coordinates selected from the highest scoring parts.
"""
max_detections, height, width, num_parts, _ = (
shape_utils.combined_static_and_dynamic_shape(surface_coords_cropped))
flattened_surface_coords = tf.reshape(surface_coords_cropped, [-1, 2])
flattened_part_ids = tf.reshape(highest_scoring_part, [-1])
# Produce lookup indices that represent the locations of the highest scoring
# parts in the `flattened_surface_coords` tensor.
flattened_lookup_indices = (
num_parts * tf.range(max_detections * height * width) +
flattened_part_ids)
vu_coords_flattened = tf.gather(flattened_surface_coords,
flattened_lookup_indices, axis=0)
return tf.reshape(vu_coords_flattened, [max_detections, height, width, 2])
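
# Toy check of the flattened lookup arithmetic (2 locations, 3 parts,
# illustrative values): rows of `coords` are ordered (location, part), so
# part p at location i lives at row 3 * i + p.
import numpy as np
coords = np.arange(2 * 3 * 2).reshape(2 * 3, 2)
part_ids = np.array([2, 0])           # winning part per location
lookup = 3 * np.arange(2) + part_ids  # -> [2, 3]
picked = coords[lookup]               # -> [[4, 5], [6, 7]]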
class ObjectDetectionParams(
......@@ -1235,6 +1393,64 @@ class MaskParams(
score_threshold, heatmap_bias_init)
class DensePoseParams(
collections.namedtuple('DensePoseParams', [
'class_id', 'classification_loss', 'localization_loss',
'part_loss_weight', 'coordinate_loss_weight', 'num_parts',
'task_loss_weight', 'upsample_to_input_res', 'upsample_method',
'heatmap_bias_init'
])):
"""Namedtuple to store DensePose prediction related parameters."""
__slots__ = ()
def __new__(cls,
class_id,
classification_loss,
localization_loss,
part_loss_weight=1.0,
coordinate_loss_weight=1.0,
num_parts=24,
task_loss_weight=1.0,
upsample_to_input_res=True,
upsample_method='bilinear',
heatmap_bias_init=-2.19):
"""Constructor with default values for DensePoseParams.
Args:
class_id: the ID of the class that contains the DensePose groundtruth.
This should typically correspond to the "person" class. Note that the ID
is 0-based, meaning that class 0 corresponds to the first non-background
object class.
classification_loss: an object_detection.core.losses.Loss object to
compute the loss for the body part predictions in CenterNet.
localization_loss: an object_detection.core.losses.Loss object to compute
the loss for the surface coordinate regression in CenterNet.
part_loss_weight: The loss weight to apply to part prediction.
coordinate_loss_weight: The loss weight to apply to surface coordinate
prediction.
num_parts: The number of DensePose parts to predict.
task_loss_weight: float, the loss weight for the DensePose task.
upsample_to_input_res: Whether to upsample the DensePose feature maps to
the input resolution before applying loss. Note that the prediction
outputs are still at the standard CenterNet output stride.
upsample_method: Method for upsampling DensePose feature maps. Options are
        either 'bilinear' or 'nearest'. This has no effect when
`upsample_to_input_res` is False.
heatmap_bias_init: float, the initial value of bias in the convolutional
kernel of the part prediction head. If set to None, the
bias is initialized with zeros.
Returns:
An initialized DensePoseParams namedtuple.
"""
return super(DensePoseParams,
cls).__new__(cls, class_id, classification_loss,
localization_loss, part_loss_weight,
coordinate_loss_weight, num_parts,
task_loss_weight, upsample_to_input_res,
upsample_method, heatmap_bias_init)
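
# Hedged construction example. The specific loss objects are assumptions;
# per the Args above, any object_detection.core.losses.Loss should work.
from object_detection.core import losses

densepose_params = DensePoseParams(
    class_id=0,  # assuming "person" is the first non-background class
    classification_loss=losses.WeightedSoftmaxClassificationLoss(),
    localization_loss=losses.L1LocalizationLoss(),
    num_parts=24,
    upsample_to_input_res=True)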
# The following constants are used to generate the keys of the
# (prediction, loss, target assigner,...) dictionaries used in CenterNetMetaArch
# class.
......@@ -1247,6 +1463,9 @@ KEYPOINT_HEATMAP = 'keypoint/heatmap'
KEYPOINT_OFFSET = 'keypoint/offset'
SEGMENTATION_TASK = 'segmentation_task'
SEGMENTATION_HEATMAP = 'segmentation/heatmap'
DENSEPOSE_TASK = 'densepose_task'
DENSEPOSE_HEATMAP = 'densepose/heatmap'
DENSEPOSE_REGRESSION = 'densepose/regression'
LOSS_KEY_PREFIX = 'Loss'
......@@ -1290,7 +1509,8 @@ class CenterNetMetaArch(model.DetectionModel):
object_center_params,
object_detection_params=None,
keypoint_params_dict=None,
mask_params=None):
mask_params=None,
densepose_params=None):
"""Initializes a CenterNet model.
Args:
......@@ -1318,6 +1538,10 @@ class CenterNetMetaArch(model.DetectionModel):
mask_params: A MaskParams namedtuple. This object
holds the hyper-parameters for segmentation. Please see the class
definition for more details.
densepose_params: A DensePoseParams namedtuple. This object holds the
hyper-parameters for DensePose prediction. Please see the class
definition for more details. Note that if this is provided, it is
expected that `mask_params` is also provided.
"""
assert object_detection_params or keypoint_params_dict
# Shorten the name for convenience and better formatting.
......@@ -1333,6 +1557,10 @@ class CenterNetMetaArch(model.DetectionModel):
self._od_params = object_detection_params
self._kp_params_dict = keypoint_params_dict
self._mask_params = mask_params
if densepose_params is not None and mask_params is None:
raise ValueError('To run DensePose prediction, `mask_params` must also '
'be supplied.')
self._densepose_params = densepose_params
# Construct the prediction head nets.
self._prediction_head_dict = self._construct_prediction_heads(
......@@ -1413,8 +1641,18 @@ class CenterNetMetaArch(model.DetectionModel):
if self._mask_params is not None:
prediction_heads[SEGMENTATION_HEATMAP] = [
make_prediction_net(num_classes,
bias_fill=class_prediction_bias_init)
bias_fill=self._mask_params.heatmap_bias_init)
for _ in range(num_feature_outputs)]
if self._densepose_params is not None:
prediction_heads[DENSEPOSE_HEATMAP] = [
make_prediction_net( # pylint: disable=g-complex-comprehension
self._densepose_params.num_parts,
bias_fill=self._densepose_params.heatmap_bias_init)
for _ in range(num_feature_outputs)]
prediction_heads[DENSEPOSE_REGRESSION] = [
make_prediction_net(2 * self._densepose_params.num_parts)
for _ in range(num_feature_outputs)
]
return prediction_heads
def _initialize_target_assigners(self, stride, min_box_overlap_iou):
......@@ -1449,6 +1687,10 @@ class CenterNetMetaArch(model.DetectionModel):
if self._mask_params is not None:
target_assigners[SEGMENTATION_TASK] = (
cn_assigner.CenterNetMaskTargetAssigner(stride))
if self._densepose_params is not None:
dp_stride = 1 if self._densepose_params.upsample_to_input_res else stride
target_assigners[DENSEPOSE_TASK] = (
cn_assigner.CenterNetDensePoseTargetAssigner(dp_stride))
return target_assigners
......@@ -1860,6 +2102,113 @@ class CenterNetMetaArch(model.DetectionModel):
float(len(segmentation_predictions)) * total_pixels_in_loss)
return total_loss
def _compute_densepose_losses(self, input_height, input_width,
prediction_dict):
"""Computes the weighted DensePose losses.
Args:
input_height: An integer scalar tensor representing input image height.
input_width: An integer scalar tensor representing input image width.
prediction_dict: A dictionary holding predicted tensors output by the
"predict" function. See the "predict" function for more detailed
description.
Returns:
A dictionary of scalar float tensors representing the weighted losses for
the DensePose task:
DENSEPOSE_HEATMAP: the weighted part segmentation loss.
DENSEPOSE_REGRESSION: the weighted part surface coordinate loss.
"""
dp_heatmap_loss, dp_regression_loss = (
self._compute_densepose_part_and_coordinate_losses(
input_height=input_height,
input_width=input_width,
part_predictions=prediction_dict[DENSEPOSE_HEATMAP],
surface_coord_predictions=prediction_dict[DENSEPOSE_REGRESSION]))
loss_dict = {}
loss_dict[DENSEPOSE_HEATMAP] = (
self._densepose_params.part_loss_weight * dp_heatmap_loss)
loss_dict[DENSEPOSE_REGRESSION] = (
self._densepose_params.coordinate_loss_weight * dp_regression_loss)
return loss_dict
def _compute_densepose_part_and_coordinate_losses(
self, input_height, input_width, part_predictions,
surface_coord_predictions):
"""Computes the individual losses for the DensePose task.
Args:
input_height: An integer scalar tensor representing input image height.
input_width: An integer scalar tensor representing input image width.
part_predictions: A list of float tensors of shape [batch_size,
out_height, out_width, num_parts].
surface_coord_predictions: A list of float tensors of shape [batch_size,
out_height, out_width, 2 * num_parts].
Returns:
A tuple with two scalar loss tensors: part_prediction_loss and
surface_coord_loss.
"""
gt_dp_num_points_list = self.groundtruth_lists(
fields.BoxListFields.densepose_num_points)
gt_dp_part_ids_list = self.groundtruth_lists(
fields.BoxListFields.densepose_part_ids)
gt_dp_surface_coords_list = self.groundtruth_lists(
fields.BoxListFields.densepose_surface_coords)
gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
assigner = self._target_assigner_dict[DENSEPOSE_TASK]
batch_indices, batch_part_ids, batch_surface_coords, batch_weights = (
assigner.assign_part_and_coordinate_targets(
height=input_height,
width=input_width,
gt_dp_num_points_list=gt_dp_num_points_list,
gt_dp_part_ids_list=gt_dp_part_ids_list,
gt_dp_surface_coords_list=gt_dp_surface_coords_list,
gt_weights_list=gt_weights_list))
part_prediction_loss = 0
surface_coord_loss = 0
classification_loss_fn = self._densepose_params.classification_loss
localization_loss_fn = self._densepose_params.localization_loss
num_predictions = float(len(part_predictions))
num_valid_points = tf.math.count_nonzero(batch_weights)
num_valid_points = tf.cast(tf.math.maximum(num_valid_points, 1), tf.float32)
for part_pred, surface_coord_pred in zip(part_predictions,
surface_coord_predictions):
# Potentially upsample the feature maps, so that better quality (i.e.
# higher res) groundtruth can be applied.
if self._densepose_params.upsample_to_input_res:
part_pred = tf.keras.layers.UpSampling2D(
self._stride, interpolation=self._densepose_params.upsample_method)(
part_pred)
surface_coord_pred = tf.keras.layers.UpSampling2D(
self._stride, interpolation=self._densepose_params.upsample_method)(
surface_coord_pred)
# Compute the part prediction loss.
part_pred = cn_assigner.get_batch_predictions_from_indices(
part_pred, batch_indices[:, 0:3])
part_prediction_loss += classification_loss_fn(
part_pred[:, tf.newaxis, :],
batch_part_ids[:, tf.newaxis, :],
weights=batch_weights[:, tf.newaxis, tf.newaxis])
# Compute the surface coordinate loss.
batch_size, out_height, out_width, _ = _get_shape(
surface_coord_pred, 4)
surface_coord_pred = tf.reshape(
surface_coord_pred, [batch_size, out_height, out_width, -1, 2])
surface_coord_pred = cn_assigner.get_batch_predictions_from_indices(
surface_coord_pred, batch_indices)
surface_coord_loss += localization_loss_fn(
surface_coord_pred,
batch_surface_coords,
weights=batch_weights[:, tf.newaxis])
part_prediction_loss = tf.reduce_sum(part_prediction_loss) / (
num_predictions * num_valid_points)
surface_coord_loss = tf.reduce_sum(surface_coord_loss) / (
num_predictions * num_valid_points)
return part_prediction_loss, surface_coord_loss
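
# Both losses above are normalized by the number of feature outputs and by
# the number of valid (non-padded) sampled points. Illustratively, with two
# feature outputs and a summed part loss of 12.0 over 4 valid points:
num_predictions, num_valid_points = 2.0, 4.0
part_prediction_loss = 12.0 / (num_predictions * num_valid_points)  # 1.5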
def preprocess(self, inputs):
outputs = shape_utils.resize_images_and_return_shapes(
inputs, self._image_resizer_fn)
......@@ -1909,6 +2258,13 @@ class CenterNetMetaArch(model.DetectionModel):
'segmentation/heatmap' - [optional] A list of size num_feature_outputs
holding float tensors of size [batch_size, output_height,
output_width, num_classes] representing the mask logits.
'densepose/heatmap' - [optional] A list of size num_feature_outputs
holding float tensors of size [batch_size, output_height,
output_width, num_parts] representing the mask logits for each part.
'densepose/regression' - [optional] A list of size num_feature_outputs
holding float tensors of size [batch_size, output_height,
output_width, 2 * num_parts] representing the DensePose surface
coordinate predictions.
Note the $TASK_NAME is provided by the KeypointEstimation namedtuple
used to differentiate between different keypoint tasks.
"""
......@@ -1938,10 +2294,16 @@ class CenterNetMetaArch(model.DetectionModel):
scope: Optional scope name.
Returns:
A dictionary mapping the keys ['Loss/object_center', 'Loss/box/scale',
'Loss/box/offset', 'Loss/$TASK_NAME/keypoint/heatmap',
'Loss/$TASK_NAME/keypoint/offset',
'Loss/$TASK_NAME/keypoint/regression', 'Loss/segmentation/heatmap'] to
A dictionary mapping the keys [
'Loss/object_center',
'Loss/box/scale', (optional)
'Loss/box/offset', (optional)
'Loss/$TASK_NAME/keypoint/heatmap', (optional)
'Loss/$TASK_NAME/keypoint/offset', (optional)
'Loss/$TASK_NAME/keypoint/regression', (optional)
'Loss/segmentation/heatmap', (optional)
'Loss/densepose/heatmap', (optional)
        'Loss/densepose/regression'] (optional)
      to scalar tensors corresponding to the losses for different tasks. Note the
$TASK_NAME is provided by the KeypointEstimation namedtuple used to
differentiate between different keypoint tasks.
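
      For illustration (editor's sketch, not part of this change): since every
      value in the returned dictionary is a scalar tensor, a caller can reduce
      it to a single training loss; `model`, `prediction_dict`, and
      `true_image_shapes` below are hypothetical.

        losses_dict = model.loss(prediction_dict, true_image_shapes)
        total_loss = tf.add_n(list(losses_dict.values()))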
......@@ -1999,6 +2361,16 @@ class CenterNetMetaArch(model.DetectionModel):
seg_losses[key] = seg_losses[key] * self._mask_params.task_loss_weight
losses.update(seg_losses)
if self._densepose_params is not None:
densepose_losses = self._compute_densepose_losses(
input_height=input_height,
input_width=input_width,
prediction_dict=prediction_dict)
for key in densepose_losses:
densepose_losses[key] = (
densepose_losses[key] * self._densepose_params.task_loss_weight)
losses.update(densepose_losses)
# Prepend the LOSS_KEY_PREFIX to the keys in the dictionary such that the
# losses will be grouped together in Tensorboard.
return dict([('%s/%s' % (LOSS_KEY_PREFIX, key), val)
......@@ -2033,9 +2405,14 @@ class CenterNetMetaArch(model.DetectionModel):
invalid keypoints have their coordinates and scores set to 0.0.
detection_keypoint_scores: (Optional) A float tensor of shape [batch,
max_detection, num_keypoints] with scores for each keypoint.
detection_masks: (Optional) An int tensor of shape [batch,
max_detections, mask_height, mask_width] with binarized masks for each
detection.
detection_masks: (Optional) A uint8 tensor of shape [batch,
max_detections, mask_height, mask_width] with masks for each
detection. Background is specified with 0, and foreground is specified
with positive integers (1 for standard instance segmentation mask, and
1-indexed parts for DensePose task).
detection_surface_coords: (Optional) A float32 tensor of shape [batch,
max_detection, mask_height, mask_width, 2] with DensePose surface
coordinates, in (v, u) format.
"""
object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
# Get x, y and channel indices corresponding to the top indices in the class
......@@ -2076,14 +2453,27 @@ class CenterNetMetaArch(model.DetectionModel):
if self._mask_params:
masks = tf.nn.sigmoid(prediction_dict[SEGMENTATION_HEATMAP][-1])
instance_masks = convert_strided_predictions_to_instance_masks(
boxes, classes, masks, self._stride, self._mask_params.mask_height,
self._mask_params.mask_width, true_image_shapes,
self._mask_params.score_threshold)
postprocess_dict.update({
fields.DetectionResultFields.detection_masks:
instance_masks
})
densepose_part_heatmap, densepose_surface_coords = None, None
densepose_class_index = 0
if self._densepose_params:
densepose_part_heatmap = prediction_dict[DENSEPOSE_HEATMAP][-1]
densepose_surface_coords = prediction_dict[DENSEPOSE_REGRESSION][-1]
densepose_class_index = self._densepose_params.class_id
instance_masks, surface_coords = (
convert_strided_predictions_to_instance_masks(
boxes, classes, masks, true_image_shapes,
densepose_part_heatmap, densepose_surface_coords,
stride=self._stride, mask_height=self._mask_params.mask_height,
mask_width=self._mask_params.mask_width,
score_threshold=self._mask_params.score_threshold,
densepose_class_index=densepose_class_index))
postprocess_dict[
fields.DetectionResultFields.detection_masks] = instance_masks
if self._densepose_params:
postprocess_dict[
fields.DetectionResultFields.detection_surface_coords] = (
surface_coords)
return postprocess_dict
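
  # Editor's sketch (not part of this change): decoding the DensePose outputs
  # returned above for a single detection. detection_masks holds 0 for
  # background and 1-indexed part ids for foreground pixels, while
  # detection_surface_coords holds the matching (v, u) surface coordinates.
  # `detections` is a hypothetical postprocess() output, read with NumPy:
  #
  #   part_mask = detections['detection_masks'][0, 0]    # [H, W], uint8
  #   vu = detections['detection_surface_coords'][0, 0]  # [H, W, 2]
  #   ys, xs = np.where(part_mask > 0)
  #   for y, x in zip(ys, xs):
  #     part_id = part_mask[y, x] - 1  # back to a 0-indexed part id
  #     v, u = vu[y, x]
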
def _postprocess_keypoints(self, prediction_dict, classes, y_indices,
......@@ -2368,7 +2758,9 @@ class CenterNetMetaArch(model.DetectionModel):
return {'feature_extractor': self._feature_extractor.get_base_model()}
if fine_tune_checkpoint_type == 'detection':
return {'feature_extractor': self._feature_extractor.get_model()}
fake_model = tf.train.Checkpoint(
_feature_extractor=self._feature_extractor)
return {'model': fake_model}
else:
raise ValueError('Not supported fine tune checkpoint type - {}'.format(
......
......@@ -266,7 +266,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
masks_np[0, :, :3, 1] = 1 # Class 1.
masks = tf.constant(masks_np)
true_image_shapes = tf.constant([[6, 8, 3]])
instance_masks = cnma.convert_strided_predictions_to_instance_masks(
instance_masks, _ = cnma.convert_strided_predictions_to_instance_masks(
boxes, classes, masks, stride=2, mask_height=2, mask_width=2,
true_image_shapes=true_image_shapes)
return instance_masks
......@@ -289,6 +289,104 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
])
np.testing.assert_array_equal(expected_instance_masks, instance_masks)
def test_convert_strided_predictions_raises_error_with_one_tensor(self):
def graph_fn():
boxes = tf.constant(
[
[[0.5, 0.5, 1.0, 1.0],
[0.0, 0.5, 0.5, 1.0],
[0.0, 0.0, 0.0, 0.0]],
], tf.float32)
classes = tf.constant(
[
[0, 1, 0],
], tf.int32)
masks_np = np.zeros((1, 4, 4, 2), dtype=np.float32)
masks_np[0, :, 2:, 0] = 1 # Class 0.
masks_np[0, :, :3, 1] = 1 # Class 1.
masks = tf.constant(masks_np)
true_image_shapes = tf.constant([[6, 8, 3]])
densepose_part_heatmap = tf.random.uniform(
[1, 4, 4, 24])
instance_masks, _ = cnma.convert_strided_predictions_to_instance_masks(
boxes, classes, masks, true_image_shapes,
densepose_part_heatmap=densepose_part_heatmap,
densepose_surface_coords=None)
return instance_masks
with self.assertRaises(ValueError):
self.execute_cpu(graph_fn, [])
def test_crop_and_threshold_masks(self):
boxes_np = np.array(
[[0., 0., 0.5, 0.5],
[0.25, 0.25, 1.0, 1.0]], dtype=np.float32)
classes_np = np.array([0, 2], dtype=np.int32)
masks_np = np.zeros((4, 4, _NUM_CLASSES), dtype=np.float32)
masks_np[0, 0, 0] = 0.8
masks_np[1, 1, 0] = 0.6
masks_np[3, 3, 2] = 0.7
part_heatmap_np = np.zeros((4, 4, _DENSEPOSE_NUM_PARTS), dtype=np.float32)
part_heatmap_np[0, 0, 4] = 1
part_heatmap_np[0, 0, 2] = 0.6 # Lower scoring.
part_heatmap_np[1, 1, 8] = 0.2
part_heatmap_np[3, 3, 4] = 0.5
surf_coords_np = np.zeros((4, 4, 2 * _DENSEPOSE_NUM_PARTS),
dtype=np.float32)
surf_coords_np[:, :, 8:10] = 0.2, 0.9
surf_coords_np[:, :, 16:18] = 0.3, 0.5
true_height, true_width = 10, 10
input_height, input_width = 10, 10
mask_height = 4
mask_width = 4
def graph_fn():
elems = [
tf.constant(boxes_np),
tf.constant(classes_np),
tf.constant(masks_np),
tf.constant(part_heatmap_np),
tf.constant(surf_coords_np),
tf.constant(true_height, dtype=tf.int32),
tf.constant(true_width, dtype=tf.int32)
]
part_masks, surface_coords = cnma.crop_and_threshold_masks(
elems, input_height, input_width, mask_height=mask_height,
mask_width=mask_width, densepose_class_index=0)
return part_masks, surface_coords
part_masks, surface_coords = self.execute_cpu(graph_fn, [])
expected_part_masks = np.zeros((2, 4, 4), dtype=np.uint8)
    expected_part_masks[0, 0, 0] = 5  # Recall part ids are 1-indexed in output.
    expected_part_masks[0, 2, 2] = 9  # Recall part ids are 1-indexed in output.
expected_part_masks[1, 3, 3] = 1 # Standard instance segmentation mask.
expected_surface_coords = np.zeros((2, 4, 4, 2), dtype=np.float32)
expected_surface_coords[0, 0, 0, :] = 0.2, 0.9
expected_surface_coords[0, 2, 2, :] = 0.3, 0.5
np.testing.assert_allclose(expected_part_masks, part_masks)
np.testing.assert_allclose(expected_surface_coords, surface_coords)
def test_gather_surface_coords_for_parts(self):
surface_coords_cropped_np = np.zeros((2, 5, 5, _DENSEPOSE_NUM_PARTS, 2),
dtype=np.float32)
surface_coords_cropped_np[0, 0, 0, 5] = 0.3, 0.4
surface_coords_cropped_np[0, 1, 0, 9] = 0.5, 0.6
highest_scoring_part_np = np.zeros((2, 5, 5), dtype=np.int32)
highest_scoring_part_np[0, 0, 0] = 5
highest_scoring_part_np[0, 1, 0] = 9
def graph_fn():
surface_coords_cropped = tf.constant(surface_coords_cropped_np,
tf.float32)
highest_scoring_part = tf.constant(highest_scoring_part_np, tf.int32)
surface_coords_gathered = cnma.gather_surface_coords_for_parts(
surface_coords_cropped, highest_scoring_part)
return surface_coords_gathered
surface_coords_gathered = self.execute_cpu(graph_fn, [])
np.testing.assert_allclose([0.3, 0.4], surface_coords_gathered[0, 0, 0])
np.testing.assert_allclose([0.5, 0.6], surface_coords_gathered[0, 1, 0])
def test_top_k_feature_map_locations(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 1] = 1.0
......@@ -535,6 +633,8 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
keypoint_heatmap_np[1, 0, 1, 1] = 0.9
keypoint_heatmap_np[1, 2, 0, 1] = 0.8
    # Note that the keypoint offsets are now per keypoint (as opposed to
    # keypoint agnostic, as in test_keypoint_candidate_prediction).
keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 4), dtype=np.float32)
keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5, 0.0, 0.0]
......@@ -949,6 +1049,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
_NUM_CLASSES = 10
_KEYPOINT_INDICES = [0, 1, 2, 3]
_NUM_KEYPOINTS = len(_KEYPOINT_INDICES)
_DENSEPOSE_NUM_PARTS = 24
_TASK_NAME = 'human_pose'
......@@ -991,6 +1092,20 @@ def get_fake_mask_params():
mask_width=4)
def get_fake_densepose_params():
"""Returns the fake DensePose estimation parameter namedtuple."""
return cnma.DensePoseParams(
class_id=1,
classification_loss=losses.WeightedSoftmaxClassificationLoss(),
localization_loss=losses.L1LocalizationLoss(),
part_loss_weight=1.0,
coordinate_loss_weight=1.0,
num_parts=_DENSEPOSE_NUM_PARTS,
task_loss_weight=1.0,
upsample_to_input_res=True,
upsample_method='nearest')
def build_center_net_meta_arch(build_resnet=False):
"""Builds the CenterNet meta architecture."""
if build_resnet:
......@@ -1018,7 +1133,8 @@ def build_center_net_meta_arch(build_resnet=False):
object_center_params=get_fake_center_params(),
object_detection_params=get_fake_od_params(),
keypoint_params_dict={_TASK_NAME: get_fake_kp_params()},
mask_params=get_fake_mask_params())
mask_params=get_fake_mask_params(),
densepose_params=get_fake_densepose_params())
def _logit(p):
......@@ -1102,6 +1218,16 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_CLASSES), output.shape)
# "densepose parts" head:
output = model._prediction_head_dict[cnma.DENSEPOSE_HEATMAP][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _DENSEPOSE_NUM_PARTS), output.shape)
# "densepose surface coordinates" head:
output = model._prediction_head_dict[cnma.DENSEPOSE_REGRESSION][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, 2 * _DENSEPOSE_NUM_PARTS), output.shape)
def test_initialize_target_assigners(self):
model = build_center_net_meta_arch()
assigner_dict = model._initialize_target_assigners(
......@@ -1125,6 +1251,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertIsInstance(assigner_dict[cnma.SEGMENTATION_TASK],
cn_assigner.CenterNetMaskTargetAssigner)
# DensePose estimation target assigner:
self.assertIsInstance(assigner_dict[cnma.DENSEPOSE_TASK],
cn_assigner.CenterNetDensePoseTargetAssigner)
def test_predict(self):
"""Test the predict function."""
......@@ -1145,6 +1275,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
(2, 32, 32, 2))
self.assertEqual(prediction_dict[cnma.SEGMENTATION_HEATMAP][0].shape,
(2, 32, 32, _NUM_CLASSES))
self.assertEqual(prediction_dict[cnma.DENSEPOSE_HEATMAP][0].shape,
(2, 32, 32, _DENSEPOSE_NUM_PARTS))
self.assertEqual(prediction_dict[cnma.DENSEPOSE_REGRESSION][0].shape,
(2, 32, 32, 2 * _DENSEPOSE_NUM_PARTS))
def test_loss(self):
"""Test the loss function."""
......@@ -1157,7 +1291,13 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
groundtruth_keypoints_list=groundtruth_dict[
fields.BoxListFields.keypoints],
groundtruth_masks_list=groundtruth_dict[
fields.BoxListFields.masks])
fields.BoxListFields.masks],
groundtruth_dp_num_points_list=groundtruth_dict[
fields.BoxListFields.densepose_num_points],
groundtruth_dp_part_ids_list=groundtruth_dict[
fields.BoxListFields.densepose_part_ids],
groundtruth_dp_surface_coords_list=groundtruth_dict[
fields.BoxListFields.densepose_surface_coords])
prediction_dict = get_fake_prediction_dict(
input_height=16, input_width=32, stride=4)
......@@ -1193,6 +1333,12 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.SEGMENTATION_HEATMAP)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.DENSEPOSE_HEATMAP)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.DENSEPOSE_REGRESSION)])
@parameterized.parameters(
{'target_class_id': 1},
......@@ -1230,6 +1376,14 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
segmentation_heatmap[:, 14:18, 14:18, target_class_id] = 1.0
segmentation_heatmap = _logit(segmentation_heatmap)
dp_part_ind = 4
dp_part_heatmap = np.zeros((1, 32, 32, _DENSEPOSE_NUM_PARTS),
dtype=np.float32)
dp_part_heatmap[0, 14:18, 14:18, dp_part_ind] = 1.0
dp_part_heatmap = _logit(dp_part_heatmap)
dp_surf_coords = np.random.randn(1, 32, 32, 2 * _DENSEPOSE_NUM_PARTS)
class_center = tf.constant(class_center)
height_width = tf.constant(height_width)
offset = tf.constant(offset)
......@@ -1237,6 +1391,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
segmentation_heatmap = tf.constant(segmentation_heatmap, dtype=tf.float32)
dp_part_heatmap = tf.constant(dp_part_heatmap, dtype=tf.float32)
dp_surf_coords = tf.constant(dp_surf_coords, dtype=tf.float32)
prediction_dict = {
cnma.OBJECT_CENTER: [class_center],
......@@ -1249,6 +1405,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION):
[keypoint_regression],
cnma.SEGMENTATION_HEATMAP: [segmentation_heatmap],
cnma.DENSEPOSE_HEATMAP: [dp_part_heatmap],
cnma.DENSEPOSE_REGRESSION: [dp_surf_coords]
}
def graph_fn():
......@@ -1271,12 +1429,13 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllEqual([1, max_detection, 4, 4],
detections['detection_masks'].shape)
    # There should be some section of the first mask (corresponding to the only
    # detection) with non-zero mask values.
self.assertGreater(np.sum(detections['detection_masks'][0, 0, :, :] > 0), 0)
# Masks should be empty for everything but the first detection.
self.assertAllEqual(
detections['detection_masks'][0, 1:, :, :],
np.zeros_like(detections['detection_masks'][0, 1:, :, :]))
self.assertAllEqual(
detections['detection_surface_coords'][0, 1:, :, :],
np.zeros_like(detections['detection_surface_coords'][0, 1:, :, :]))
if target_class_id == 1:
expected_kpts_for_obj_0 = np.array(
......@@ -1287,6 +1446,12 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
expected_kpts_for_obj_0, rtol=1e-6)
np.testing.assert_allclose(detections['detection_keypoint_scores'][0][0],
expected_kpt_scores_for_obj_0, rtol=1e-6)
# First detection has DensePose parts.
self.assertSameElements(
np.unique(detections['detection_masks'][0, 0, :, :]),
set([0, dp_part_ind + 1]))
self.assertGreater(np.sum(np.abs(detections['detection_surface_coords'])),
0.0)
else:
# All keypoint outputs should be zeros.
np.testing.assert_allclose(
......@@ -1297,6 +1462,14 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
detections['detection_keypoint_scores'][0][0],
np.zeros([num_keypoints], np.float),
rtol=1e-6)
# Binary segmentation mask.
self.assertSameElements(
np.unique(detections['detection_masks'][0, 0, :, :]),
set([0, 1]))
# No DensePose surface coordinates.
np.testing.assert_allclose(
detections['detection_surface_coords'][0, 0, :, :],
np.zeros_like(detections['detection_surface_coords'][0, 0, :, :]))
def test_get_instance_indices(self):
classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
......@@ -1353,6 +1526,17 @@ def get_fake_prediction_dict(input_height, input_width, stride):
mask_heatmap[0, 2, 4, 1] = 1.0
mask_heatmap = _logit(mask_heatmap)
densepose_heatmap = np.zeros((2, output_height, output_width,
_DENSEPOSE_NUM_PARTS), dtype=np.float32)
densepose_heatmap[0, 2, 4, 5] = 1.0
densepose_heatmap = _logit(densepose_heatmap)
densepose_regression = np.zeros((2, output_height, output_width,
2 * _DENSEPOSE_NUM_PARTS), dtype=np.float32)
# The surface coordinate indices for part index 5 are:
# (5 * 2, 5 * 2 + 1), or (10, 11).
densepose_regression[0, 2, 4, 10:12] = 0.4, 0.7
prediction_dict = {
'preprocessed_inputs':
tf.zeros((2, input_height, input_width, 3)),
......@@ -1383,6 +1567,14 @@ def get_fake_prediction_dict(input_height, input_width, stride):
cnma.SEGMENTATION_HEATMAP: [
tf.constant(mask_heatmap),
tf.constant(mask_heatmap)
],
cnma.DENSEPOSE_HEATMAP: [
tf.constant(densepose_heatmap),
tf.constant(densepose_heatmap),
],
cnma.DENSEPOSE_REGRESSION: [
tf.constant(densepose_regression),
tf.constant(densepose_regression),
]
}
return prediction_dict
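

# Editor's sketch (not part of this change): the regression tensor packs the
# (v, u) channels for 0-indexed part k at indices (2 * k, 2 * k + 1), which is
# why part index 5 above lands in channels 10:12. A hypothetical helper:
def _surface_coord_channels(part_index):
  """Returns the (v, u) channel indices for a 0-indexed DensePose part."""
  return 2 * part_index, 2 * part_index + 1
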
......@@ -1427,12 +1619,30 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
tf.constant(mask),
tf.zeros_like(mask),
]
densepose_num_points = [
tf.constant([1], dtype=tf.int32),
tf.constant([0], dtype=tf.int32),
]
densepose_part_ids = [
tf.constant([[5, 0, 0]], dtype=tf.int32),
tf.constant([[0, 0, 0]], dtype=tf.int32),
]
densepose_surface_coords_np = np.zeros((1, 3, 4), dtype=np.float32)
densepose_surface_coords_np[0, 0, :] = 0.55, 0.55, 0.4, 0.7
densepose_surface_coords = [
tf.constant(densepose_surface_coords_np),
tf.zeros_like(densepose_surface_coords_np)
]
groundtruth_dict = {
fields.BoxListFields.boxes: boxes,
fields.BoxListFields.weights: weights,
fields.BoxListFields.classes: classes,
fields.BoxListFields.keypoints: keypoints,
fields.BoxListFields.masks: masks,
fields.BoxListFields.densepose_num_points: densepose_num_points,
fields.BoxListFields.densepose_part_ids: densepose_part_ids,
fields.BoxListFields.densepose_surface_coords:
densepose_surface_coords,
fields.InputDataFields.groundtruth_labeled_classes: labeled_classes,
}
return groundtruth_dict
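
# Editor's note (not part of this change): each groundtruth DensePose point
# above is stored as (y, x, v, u) -- a normalized image location followed by
# its surface coordinates -- so the single valid point unpacks as:
#
#   y, x, v, u = densepose_surface_coords_np[0, 0, :]  # 0.55, 0.55, 0.4, 0.7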
......