Merged commit includes the following changes: (#8761)

319125512 by aom: Internal change -- 319108395 by rathodv: Internal Change -- 319106259 by ronnyvotel: Updating input pipeline to return DensePose labels. -- PiperOrigin-RevId: 319125512 Co-authored-by: Zhichao Lu <lzc@google.com>

Merged commit includes the following changes: (#8761)
319125512 by aom: Internal change -- 319108395 by rathodv: Internal Change -- 319106259 by ronnyvotel: Updating input pipeline to return DensePose labels. -- PiperOrigin-RevId: 319125512 Co-authored-by: Zhichao Lu <lzc@google.com>
e6017471 · vivek rathod · GitHub · 58d19c67 · e6017471 · e6017471
Unverified Commit e6017471 authored Jun 30, 2020 by vivek rathod Committed by GitHub Jun 30, 2020
4 changed files
--- a/research/object_detection/builders/decoder_builder.py
+++ b/research/object_detection/builders/decoder_builder.py
@@ -58,7 +58,8 @@ def build(input_reader_config):
          use_display_name=input_reader_config.use_display_name,
          num_additional_channels=input_reader_config.num_additional_channels,
          num_keypoints=input_reader_config.num_keypoints,
-          expand_hierarchy_labels=input_reader_config.expand_labels_hierarchy)
+          expand_hierarchy_labels=input_reader_config.expand_labels_hierarchy,
+          load_dense_pose=input_reader_config.load_dense_pose)
      return decoder
    elif input_type == input_reader_pb2.InputType.Value('TF_SEQUENCE_EXAMPLE'):
      decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder(

--- a/research/object_detection/inputs.py
+++ b/research/object_detection/inputs.py
@@ -27,6 +27,7 @@ from object_detection.builders import model_builder
 from object_detection.builders import preprocessor_builder
 from object_detection.core import box_list
 from object_detection.core import box_list_ops
+from object_detection.core import densepose_ops
 from object_detection.core import keypoint_ops
 from object_detection.core import preprocessor
 from object_detection.core import standard_fields as fields
@@ -289,6 +290,13 @@ def transform_input_data(tensor_dict,
            out_tensor_dict[flds_gt_kpt_vis],
            keypoint_type_weight))

+  dp_surface_coords_fld = fields.InputDataFields.groundtruth_dp_surface_coords
+  if dp_surface_coords_fld in tensor_dict:
+    dp_surface_coords = out_tensor_dict[dp_surface_coords_fld]
+    realigned_dp_surface_coords = densepose_ops.change_coordinate_frame(
+        dp_surface_coords, im_box)
+    out_tensor_dict[dp_surface_coords_fld] = realigned_dp_surface_coords
+
  if use_bfloat16:
    preprocessed_resized_image = tf.cast(
        preprocessed_resized_image, tf.bfloat16)
@@ -355,7 +363,8 @@ def pad_input_data_to_static_shapes(tensor_dict,
                                    num_classes,
                                    spatial_image_shape=None,
                                    max_num_context_features=None,
-                                    context_feature_length=None):
+                                    context_feature_length=None,
+                                    max_dp_points=336):
  """Pads input tensors to static shapes.

  In case num_additional_channels > 0, we assume that the additional channels
@@ -372,6 +381,11 @@ def pad_input_data_to_static_shapes(tensor_dict,
    max_num_context_features (optional): The maximum number of context
      features needed to compute shapes padding.
    context_feature_length (optional): The length of the context feature.
+    max_dp_points (optional): The maximum number of DensePose sampled points per
+      instance. The default (336) is selected since the original DensePose paper
+      (https://arxiv.org/pdf/1802.00434.pdf) indicates that the maximum number
+      of samples per part is 14, and therefore 24 * 14 = 336 is the maximum
+      number of samples per instance.

  Returns:
    A dictionary keyed by fields.InputDataFields containing padding shapes for
@@ -476,6 +490,15 @@ def pad_input_data_to_static_shapes(tensor_dict,
    padding_shape = [max_num_boxes, shape_utils.get_dim_as_int(tensor_shape[1])]
    padding_shapes[fields.InputDataFields.
                   groundtruth_keypoint_weights] = padding_shape
+  if fields.InputDataFields.groundtruth_dp_num_points in tensor_dict:
+    padding_shapes[
+        fields.InputDataFields.groundtruth_dp_num_points] = [max_num_boxes]
+    padding_shapes[
+        fields.InputDataFields.groundtruth_dp_part_ids] = [
+            max_num_boxes, max_dp_points]
+    padding_shapes[
+        fields.InputDataFields.groundtruth_dp_surface_coords] = [
+            max_num_boxes, max_dp_points, 4]

  # Prepare for ContextRCNN related fields.
  if fields.InputDataFields.context_features in tensor_dict:
@@ -535,6 +558,10 @@ def augment_input_data(tensor_dict, data_augmentation_options):
                               in tensor_dict)
  include_multiclass_scores = (fields.InputDataFields.multiclass_scores in
                               tensor_dict)
+  dense_pose_fields = [fields.InputDataFields.groundtruth_dp_num_points,
+                       fields.InputDataFields.groundtruth_dp_part_ids,
+                       fields.InputDataFields.groundtruth_dp_surface_coords]
+  include_dense_pose = all(field in tensor_dict for field in dense_pose_fields)
  tensor_dict = preprocessor.preprocess(
      tensor_dict, data_augmentation_options,
      func_arg_map=preprocessor.get_default_func_arg_map(
@@ -543,7 +570,8 @@ def augment_input_data(tensor_dict, data_augmentation_options):
          include_multiclass_scores=include_multiclass_scores,
          include_instance_masks=include_instance_masks,
          include_keypoints=include_keypoints,
-          include_keypoint_visibilities=include_keypoint_visibilities))
+          include_keypoint_visibilities=include_keypoint_visibilities,
+          include_dense_pose=include_dense_pose))
  tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      tensor_dict[fields.InputDataFields.image], axis=0)
  return tensor_dict
@@ -572,6 +600,9 @@ def _get_labels_dict(input_dict):
      fields.InputDataFields.groundtruth_difficult,
      fields.InputDataFields.groundtruth_keypoint_visibilities,
      fields.InputDataFields.groundtruth_keypoint_weights,
+      fields.InputDataFields.groundtruth_dp_num_points,
+      fields.InputDataFields.groundtruth_dp_part_ids,
+      fields.InputDataFields.groundtruth_dp_surface_coords
  ]

  for key in optional_label_keys:
@@ -720,6 +751,17 @@ def train_input(train_config, train_input_config,
        groundtruth visibilities for each keypoint.
      labels[fields.InputDataFields.groundtruth_labeled_classes] is a
        [batch_size, num_classes] float32 k-hot tensor of classes.
+      labels[fields.InputDataFields.groundtruth_dp_num_points] is a
+        [batch_size, num_boxes] int32 tensor with the number of sampled
+        DensePose points per object.
+      labels[fields.InputDataFields.groundtruth_dp_part_ids] is a
+        [batch_size, num_boxes, max_sampled_points] int32 tensor with the
+        DensePose part ids (0-indexed) per object.
+      labels[fields.InputDataFields.groundtruth_dp_surface_coords] is a
+        [batch_size, num_boxes, max_sampled_points, 4] float32 tensor with the
+        DensePose surface coordinates. The format is (y, x, v, u), where (y, x)
+        are normalized image coordinates and (v, u) are normalized surface part
+        coordinates.

  Raises:
    TypeError: if the `train_config`, `train_input_config` or `model_config`
@@ -861,6 +903,17 @@ def eval_input(eval_config, eval_input_config, model_config,
        same class which heavily occlude each other.
      labels[fields.InputDataFields.groundtruth_labeled_classes] is a
        [num_boxes, num_classes] float32 k-hot tensor of classes.
+      labels[fields.InputDataFields.groundtruth_dp_num_points] is a
+        [batch_size, num_boxes] int32 tensor with the number of sampled
+        DensePose points per object.
+      labels[fields.InputDataFields.groundtruth_dp_part_ids] is a
+        [batch_size, num_boxes, max_sampled_points] int32 tensor with the
+        DensePose part ids (0-indexed) per object.
+      labels[fields.InputDataFields.groundtruth_dp_surface_coords] is a
+        [batch_size, num_boxes, max_sampled_points, 4] float32 tensor with the
+        DensePose surface coordinates. The format is (y, x, v, u), where (y, x)
+        are normalized image coordinates and (v, u) are normalized surface part
+        coordinates.

  Raises:
    TypeError: if the `eval_config`, `eval_input_config` or `model_config`

--- a/research/object_detection/inputs_test.py
+++ b/research/object_detection/inputs_test.py
@@ -1293,6 +1293,51 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
        groundtruth_keypoint_weights,
        [[1.0, 1.0], [1.0, 1.0]])

+  def test_groundtruth_dense_pose(self):
+    def graph_fn():
+      tensor_dict = {
+          fields.InputDataFields.image:
+              tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
+          fields.InputDataFields.groundtruth_boxes:
+              tf.constant(np.array([[.5, .5, 1, 1], [.0, .0, .5, .5]],
+                                   np.float32)),
+          fields.InputDataFields.groundtruth_classes:
+              tf.constant(np.array([1, 2], np.int32)),
+          fields.InputDataFields.groundtruth_dp_num_points:
+              tf.constant([0, 2], dtype=tf.int32),
+          fields.InputDataFields.groundtruth_dp_part_ids:
+              tf.constant([[0, 0], [4, 23]], dtype=tf.int32),
+          fields.InputDataFields.groundtruth_dp_surface_coords:
+              tf.constant([[[0., 0., 0., 0.,], [0., 0., 0., 0.,]],
+                           [[0.1, 0.2, 0.3, 0.4,], [0.6, 0.8, 0.6, 0.7,]]],
+                          dtype=tf.float32),
+      }
+
+      num_classes = 1
+      input_transformation_fn = functools.partial(
+          inputs.transform_input_data,
+          model_preprocess_fn=_fake_resize50_preprocess_fn,
+          image_resizer_fn=_fake_image_resizer_fn,
+          num_classes=num_classes)
+      transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
+      transformed_dp_num_points = transformed_inputs[
+          fields.InputDataFields.groundtruth_dp_num_points]
+      transformed_dp_part_ids = transformed_inputs[
+          fields.InputDataFields.groundtruth_dp_part_ids]
+      transformed_dp_surface_coords = transformed_inputs[
+          fields.InputDataFields.groundtruth_dp_surface_coords]
+      return (transformed_dp_num_points, transformed_dp_part_ids,
+              transformed_dp_surface_coords)
+
+    dp_num_points, dp_part_ids, dp_surface_coords = self.execute_cpu(
+        graph_fn, [])
+    self.assertAllEqual(dp_num_points, [0, 2])
+    self.assertAllEqual(dp_part_ids, [[0, 0], [4, 23]])
+    self.assertAllClose(
+        dp_surface_coords,
+        [[[0., 0., 0., 0.,], [0., 0., 0., 0.,]],
+         [[0.1, 0.1, 0.3, 0.4,], [0.6, 0.4, 0.6, 0.7,]]])
+

 class PadInputDataToStaticShapesFnTest(test_case.TestCase):

@@ -1454,6 +1499,35 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
            fields.InputDataFields.groundtruth_keypoint_visibilities]
        .shape.as_list(), [3, 16])

+  def test_dense_pose(self):
+    input_tensor_dict = {
+        fields.InputDataFields.groundtruth_dp_num_points:
+            tf.constant([0, 2], dtype=tf.int32),
+        fields.InputDataFields.groundtruth_dp_part_ids:
+            tf.constant([[0, 0], [4, 23]], dtype=tf.int32),
+        fields.InputDataFields.groundtruth_dp_surface_coords:
+            tf.constant([[[0., 0., 0., 0.,], [0., 0., 0., 0.,]],
+                         [[0.1, 0.2, 0.3, 0.4,], [0.6, 0.8, 0.6, 0.7,]]],
+                        dtype=tf.float32),
+    }
+
+    padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
+        tensor_dict=input_tensor_dict,
+        max_num_boxes=3,
+        num_classes=1,
+        spatial_image_shape=[128, 128],
+        max_dp_points=200)
+
+    self.assertAllEqual(
+        padded_tensor_dict[fields.InputDataFields.groundtruth_dp_num_points]
+        .shape.as_list(), [3])
+    self.assertAllEqual(
+        padded_tensor_dict[fields.InputDataFields.groundtruth_dp_part_ids]
+        .shape.as_list(), [3, 200])
+    self.assertAllEqual(
+        padded_tensor_dict[fields.InputDataFields.groundtruth_dp_surface_coords]
+        .shape.as_list(), [3, 200, 4])
+
  def test_context_features(self):
    context_memory_size = 8
    context_feature_length = 10

--- a/research/object_detection/meta_architectures/ssd_meta_arch.py
+++ b/research/object_detection/meta_architectures/ssd_meta_arch.py
@@ -479,12 +479,9 @@ class SSDMetaArch(model.DetectionModel):
      ValueError: if inputs tensor does not have type tf.float32
    """
    with tf.name_scope('Preprocessor'):
-      (resized_inputs,
-       true_image_shapes) = shape_utils.resize_images_and_return_shapes(
-           inputs, self._image_resizer_fn)
-
-      return (self._feature_extractor.preprocess(resized_inputs),
-              true_image_shapes)
+      normalized_inputs = self._feature_extractor.preprocess(inputs)
+      return shape_utils.resize_images_and_return_shapes(
+          normalized_inputs, self._image_resizer_fn)

  def _compute_clip_window(self, preprocessed_images, true_image_shapes):
    """Computes clip window to use during post_processing.