"graphbolt/vscode:/vscode.git/clone" did not exist on "9c36ddcd2b7a1a2e1f5b5362a379538916de23cd"
Commit bd303e0a authored by Yu-hui Chen, committed by TF Object Detection Team

Updated the tf_example_decoder to support the case where keypoint labels appear in the image/object/keypoint/text feature but not in the keypoint label map.

PiperOrigin-RevId: 404372105
parent dd933006
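
Before the diff itself, a minimal sketch of the idea the change implements, with made-up tensors (`kpts_idx`, `instance_idx`, and `coords` below are illustrative, not the decoder's actual variables): keypoint texts that are missing from the label map come back from the name-to-id lookup with a negative index, and those entries are dropped with a boolean mask before `tf.scatter_nd` instead of being scattered with an invalid index.

import tensorflow as tf

# Illustrative inputs: two instances, five keypoint slots; -1 marks a keypoint
# text that was not found in the keypoint label map.
kpts_idx = tf.constant([3, 1, -1, 2])
instance_idx = tf.constant([0, 0, 1, 1])
coords = tf.constant([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]])

# [num_entries, 2] scatter indices of (instance, keypoint slot).
full_kpt_idx = tf.stack([instance_idx, kpts_idx], axis=1)
# Keep only entries whose keypoint text exists in the label map.
valid_mask = tf.greater_equal(kpts_idx, 0)
gt_kpt = tf.scatter_nd(
    tf.boolean_mask(full_kpt_idx, valid_mask),
    tf.boolean_mask(coords, valid_mask),
    shape=[2, 5, 2])  # [num_instances, num_keypoints, 2]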
@@ -611,7 +611,6 @@ class TfExampleDecoder(data_decoder.DataDecoder):
              np.nan * tf.ones_like(tensor_dict[gt_kpt_fld]))
        else:
          num_instances = tf.shape(tensor_dict['groundtruth_classes'])[0]

          def true_fn(num_instances):
            """Logics to process the tensor when num_instances is not zero."""
            kpts_idx = tf.cast(self._kpts_name_to_id_table.lookup(
@@ -625,19 +624,25 @@ class TfExampleDecoder(data_decoder.DataDecoder):
                [1, num_kpt_texts])
            # Prepare the index of the keypoints to scatter the keypoint
            # coordinates: [num_kpts_texts * num_instances, 2].
-            kpt_idx = tf.concat([
+            full_kpt_idx = tf.concat([
                tf.reshape(
                    instance_idx, shape=[num_kpt_texts * num_instances, 1]),
                tf.expand_dims(kpts_idx, axis=-1)
            ], axis=1)
+            # Get the mask and gather only the keypoints with non-negative
+            # indices, i.e. drop keypoint labels that appear in the
+            # image/object/keypoint/text feature but do not exist in the
+            # label map.
+            valid_mask = tf.greater_equal(kpts_idx, 0)
+            full_kpt_idx = tf.boolean_mask(full_kpt_idx, valid_mask)
            gt_kpt = tf.scatter_nd(
-                kpt_idx,
-                tensor_dict[gt_kpt_fld],
+                full_kpt_idx,
+                tf.boolean_mask(tensor_dict[gt_kpt_fld], valid_mask),
                shape=[num_instances, self._num_keypoints, 2])
            gt_kpt_vis = tf.cast(tf.scatter_nd(
-                kpt_idx,
-                tensor_dict[gt_kpt_vis_fld],
+                full_kpt_idx,
+                tf.boolean_mask(tensor_dict[gt_kpt_vis_fld], valid_mask),
                shape=[num_instances, self._num_keypoints]), dtype=tf.bool)
            visibilities_tiled = tf.tile(
                tf.expand_dims(gt_kpt_vis, axis=-1), [1, 1, 2])

@@ -1091,3 +1096,4 @@ class TfExampleDecoder(data_decoder.DataDecoder):
    new_object_field = tf.repeat(
        object_field, tf.reduce_sum(expanded_indices, axis=1), axis=0)
    return new_object_field
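
The negative indices filtered above come from the keypoint name-to-id lookup: names that the label map does not define fall through to the table's default value, presumably -1 given the `tf.greater_equal(kpts_idx, 0)` check. Below is a standalone sketch of that behavior only; the table is a hypothetical stand-in, not how `_kpts_name_to_id_table` is actually built from the label map proto. The test file diff (TfExampleDecoderTest) follows.

import tensorflow as tf

# Hypothetical stand-in for the decoder's keypoint name-to-id table.
kpt_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=['missing_part', 'left_eye', 'right_eye', 'nose'],
        values=[0, 1, 2, 3]),
    default_value=-1)

ids = kpt_table.lookup(tf.constant(['nose', 'left_ear', 'right_eye']))
print(ids.numpy())  # [ 3 -1  2]: 'left_ear' is absent from the map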
@@ -620,6 +620,116 @@ class TfExampleDecoderTest(test_case.TestCase):
    self.assertAllEqual(
        np.zeros([2, 2], dtype=np.bool), output[gt_kpts_vis_fld][:, 3:])

+  def testDecodeKeypointWithKptsLabelsNotInText(self):
+    image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
+    encoded_jpeg, _ = self._create_encoded_and_decoded_data(
+        image_tensor, 'jpeg')
+    bbox_classes = [0, 1]
+    bbox_ymins = [0.0, 4.0]
+    bbox_xmins = [1.0, 5.0]
+    bbox_ymaxs = [2.0, 6.0]
+    bbox_xmaxs = [3.0, 7.0]
+    keypoint_ys = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
+    keypoint_xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+    keypoint_visibility = [1, 2, 0, 1, 0, 2]
+    keypoint_texts = [
+        six.b('nose'), six.b('left_eye'), six.b('right_eye'), six.b('nose'),
+        six.b('left_eye'), six.b('right_eye')
+    ]
+    label_map_string = """
+      item: {
+        id: 1
+        name: 'face'
+        display_name: 'face'
+        keypoints {
+          id: 0
+          label: "missing_part"
+        }
+        keypoints {
+          id: 2
+          label: "right_eye"
+        }
+        keypoints {
+          id: 3
+          label: "nose"
+        }
+      }
+      item: {
+        id: 2
+        name: 'person'
+        display_name: 'person'
+        keypoints {
+          id: 1
+          label: "left_eye"
+        }
+      }
+    """
+    label_map_proto_file = os.path.join(self.get_temp_dir(), 'label_map.pbtxt')
+    with tf.gfile.Open(label_map_proto_file, 'wb') as f:
+      f.write(label_map_string)
+
+    def graph_fn():
+      example = tf.train.Example(
+          features=tf.train.Features(
+              feature={
+                  'image/encoded':
+                      dataset_util.bytes_feature(encoded_jpeg),
+                  'image/format':
+                      dataset_util.bytes_feature(six.b('jpeg')),
+                  'image/object/bbox/ymin':
+                      dataset_util.float_list_feature(bbox_ymins),
+                  'image/object/bbox/xmin':
+                      dataset_util.float_list_feature(bbox_xmins),
+                  'image/object/bbox/ymax':
+                      dataset_util.float_list_feature(bbox_ymaxs),
+                  'image/object/bbox/xmax':
+                      dataset_util.float_list_feature(bbox_xmaxs),
+                  'image/object/keypoint/y':
+                      dataset_util.float_list_feature(keypoint_ys),
+                  'image/object/keypoint/x':
+                      dataset_util.float_list_feature(keypoint_xs),
+                  'image/object/keypoint/visibility':
+                      dataset_util.int64_list_feature(keypoint_visibility),
+                  'image/object/keypoint/text':
+                      dataset_util.bytes_list_feature(keypoint_texts),
+                  'image/object/class/label':
+                      dataset_util.int64_list_feature(bbox_classes),
+              })).SerializeToString()
+      example_decoder = tf_example_decoder.TfExampleDecoder(
+          label_map_proto_file=label_map_proto_file, num_keypoints=5,
+          use_keypoint_label_map=True)
+      output = example_decoder.decode(tf.convert_to_tensor(example))
+      self.assertAllEqual((output[
+          fields.InputDataFields.groundtruth_boxes].get_shape().as_list()),
+                          [None, 4])
+      self.assertAllEqual((output[
+          fields.InputDataFields.groundtruth_keypoints].get_shape().as_list()),
+                          [None, 5, 2])
+      return output
+
+    output = self.execute_cpu(graph_fn, [])
+    expected_boxes = np.vstack([bbox_ymins, bbox_xmins, bbox_ymaxs,
+                                bbox_xmaxs]).transpose()
+    self.assertAllEqual(expected_boxes,
+                        output[fields.InputDataFields.groundtruth_boxes])
+    expected_keypoints = [[[np.nan, np.nan], [1., 2.], [np.nan, np.nan],
+                           [0., 1.], [np.nan, np.nan]],
+                          [[np.nan, np.nan], [np.nan, np.nan], [5., 6.],
+                           [3., 4.], [np.nan, np.nan]]]
+    self.assertAllClose(expected_keypoints,
+                        output[fields.InputDataFields.groundtruth_keypoints])
+    expected_visibility = [[False, True, False, True, False],
+                           [False, False, True, True, False]]
+    gt_kpts_vis_fld = fields.InputDataFields.groundtruth_keypoint_visibilities
+    self.assertAllEqual(expected_visibility, output[gt_kpts_vis_fld])
+
  def testDecodeKeypointNoVisibilities(self):
    image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
    encoded_jpeg, _ = self._create_encoded_and_decoded_data(
...
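
As a cross-check of the `expected_keypoints` and `expected_visibility` literals in the new test, the same values can be rebuilt with plain NumPy. The slot order 0..4 = (missing_part, left_eye, right_eye, nose, unused) follows the keypoint ids in the test's label map; that an input visibility of 0 decodes to NaN coordinates and False visibility is inferred from the expected output rather than stated in the diff.

import numpy as np

slot = {'nose': 3, 'left_eye': 1, 'right_eye': 2}  # ids from the test label map
texts = ['nose', 'left_eye', 'right_eye', 'nose', 'left_eye', 'right_eye']
ys = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
vis = [1, 2, 0, 1, 0, 2]

keypoints = np.full((2, 5, 2), np.nan)   # [num_instances, num_keypoints, 2]
visibility = np.zeros((2, 5), dtype=bool)
for i, (t, y, x, v) in enumerate(zip(texts, ys, xs, vis)):
  if v > 0:  # visibility 0 is treated as "not labeled"
    keypoints[i // 3, slot[t]] = [y, x]
    visibility[i // 3, slot[t]] = True

print(keypoints)    # matches expected_keypoints
print(visibility)   # matches expected_visibility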