Unverified commit 57557684, authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab

Parents: c4ce3a9e 2f9266ac
@@ -271,6 +271,7 @@ class Parser(parser.Parser):
     self._min_resize = input_params.min_image_size
     self._crop_size = input_params.feature_shape[1]
     self._num_crops = input_params.num_test_crops
+    self._zero_centering_image = input_params.zero_centering_image
     self._one_hot_label = input_params.one_hot
     self._num_classes = input_params.num_classes
     self._image_key = image_key
@@ -317,7 +318,8 @@ class Parser(parser.Parser):
         max_aspect_ratio=self._max_aspect_ratio,
         min_area_ratio=self._min_area_ratio,
         max_area_ratio=self._max_area_ratio,
-        augmenter=self._augmenter)
+        augmenter=self._augmenter,
+        zero_centering_image=self._zero_centering_image)
     image = tf.cast(image, dtype=self._dtype)
     features = {'image': image}
@@ -349,7 +351,8 @@ class Parser(parser.Parser):
         num_test_clips=self._num_test_clips,
         min_resize=self._min_resize,
         crop_size=self._crop_size,
-        num_crops=self._num_crops)
+        num_crops=self._num_crops,
+        zero_centering_image=self._zero_centering_image)
     image = tf.cast(image, dtype=self._dtype)
     features = {'image': image}
...
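`zero_centering_image` selects the value range the video preprocessing emits. A minimal sketch of the two conventions (the flag name comes from this diff; the arithmetic below is the usual [0, 1] vs. [-1, 1] normalization, not a verbatim copy of the library routine):

    import tensorflow as tf

    def normalize(frames: tf.Tensor, zero_centering_image: bool) -> tf.Tensor:
      # Sketch only: uint8 frames -> float32 in the selected range.
      frames = tf.cast(frames, tf.float32)
      if zero_centering_image:
        return frames * (2.0 / 255.0) - 1.0  # pixels in [-1, 1]
      return frames / 255.0                  # pixels in [0, 1]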
@@ -21,7 +21,7 @@ branch from `official/vision/beta/train.py` and make changes.
 from absl import app

 from official.common import flags as tfm_flags
-from official.vision.beta import train
+from official.vision import train
 from official.vision.examples.starter import registry_imports  # pylint: disable=unused-import
...
@@ -697,7 +697,7 @@ class GlobalAveragePool3D(tf.keras.layers.Layer):
   def call(self,
            inputs: tf.Tensor,
            states: Optional[States] = None,
-           output_states: bool = True
+           output_states: bool = False
            ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
     """Calls the layer with the given inputs.
@@ -813,13 +813,14 @@ class SpatialAveragePool3D(tf.keras.layers.Layer):
     super(SpatialAveragePool3D, self).build(input_shape)

-  def call(self, inputs):
+  def call(self, inputs, states=None, output_states: bool = False):
     """Calls the layer with the given inputs."""
     if inputs.shape.rank != 5:
       raise ValueError(
           'Input should have rank {}, got {}'.format(5, inputs.shape.rank))
-    return tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims)
+    output = tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims)
+    return (output, states) if output_states else output


 class CausalConvMixin:
...
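With the default flipped to `output_states=False`, both pooling layers return a plain tensor unless the caller opts into streaming state, which keeps them drop-in compatible with standard Keras models. A minimal sketch of the two calling conventions (module path and constructor argument assumed from the TF Model Garden layout):

    import tensorflow as tf
    from official.vision.modeling.layers import nn_layers  # assumed path

    gap = nn_layers.GlobalAveragePool3D(keepdims=True)  # assumed constructor
    video = tf.random.normal([1, 8, 16, 16, 3])

    pooled = gap(video)                      # plain tensor, Keras-friendly
    pooled, states = gap(video, states={}, output_states=True)  # streaming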
@@ -134,14 +134,14 @@ class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
     inputs = tf.range(4, dtype=tf.float32) + 1.
     inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
     inputs = tf.tile(inputs, [1, 1, 2, 2, 3])

-    expected, _ = gap(inputs)
+    expected, _ = gap(inputs, output_states=True)

     for num_splits in [1, 2, 4]:
       frames = tf.split(inputs, num_splits, axis=1)
       states = {}
       predicted = None
       for frame in frames:
-        predicted, states = gap(frame, states=states)
+        predicted, states = gap(frame, states=states, output_states=True)

       self.assertEqual(predicted.shape, expected.shape)
       self.assertAllClose(predicted, expected)
@@ -155,14 +155,14 @@ class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
     inputs = tf.range(4, dtype=tf.float32) + 1.
     inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
     inputs = tf.tile(inputs, [1, 1, 2, 2, 3])

-    expected, _ = gap(inputs)
+    expected, _ = gap(inputs, output_states=True)

     for num_splits in [1, 2, 4]:
       frames = tf.split(inputs, num_splits, axis=1)
       states = {}
       predicted = []
       for frame in frames:
-        x, states = gap(frame, states=states)
+        x, states = gap(frame, states=states, output_states=True)
         predicted.append(x)

       predicted = tf.concat(predicted, axis=1)
...
@@ -45,12 +45,16 @@ class SegmentationModule(export_base.ExportModule):
         offset=MEAN_RGB,
         scale=STDDEV_RGB)

+    if self.params.task.train_data.preserve_aspect_ratio:
       image, image_info = preprocess_ops.resize_and_crop_image(
           image,
           self._input_image_size,
           padded_size=self._input_image_size,
           aug_scale_min=1.0,
           aug_scale_max=1.0)
+    else:
+      image, image_info = preprocess_ops.resize_image(image,
+                                                      self._input_image_size)
     return image, image_info

   def serve(self, images):
@@ -80,6 +84,25 @@ class SegmentationModule(export_base.ExportModule):
             parallel_iterations=32))

     outputs = self.inference_step(images)

+    # Optionally resize prediction to the input image size.
+    if self.params.task.export_config.rescale_output:
+      logits = outputs['logits']
+      if logits.shape[0] != 1:
+        raise ValueError('Batch size cannot be more than 1.')
+      image_shape = tf.cast(image_info[0, 0, :], tf.int32)
+      if self.params.task.train_data.preserve_aspect_ratio:
+        rescale_size = tf.cast(
+            tf.math.ceil(image_info[0, 1, :] / image_info[0, 2, :]), tf.int32)
+        offsets = tf.cast(image_info[0, 3, :], tf.int32)
+        logits = tf.image.resize(logits, rescale_size, method='bilinear')
+        outputs['logits'] = tf.image.crop_to_bounding_box(
+            logits, offsets[0], offsets[1], image_shape[0], image_shape[1])
+      else:
+        outputs['logits'] = tf.image.resize(
+            logits, [image_shape[0], image_shape[1]], method='bilinear')
+    else:
       outputs['logits'] = tf.image.resize(
           outputs['logits'], self._input_image_size, method='bilinear')
...
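The rescale branch reads the `image_info` tensor produced during preprocessing. A small numeric sketch of the [4, 2] layout it assumes (rows follow `preprocess_ops.resize_and_crop_image`: original size, desired size, y/x scale, y/x offset; the 480x640 values are illustrative):

    import tensorflow as tf

    image_info = tf.constant([[[480., 640.],    # row 0: original (h, w)
                               [112., 112.],    # row 1: desired (h, w)
                               [0.175, 0.175],  # row 2: (y_scale, x_scale)
                               [0., 0.]]])      # row 3: (y_offset, x_offset)

    image_shape = tf.cast(image_info[0, 0, :], tf.int32)  # [480, 640]
    rescale_size = tf.cast(
        tf.math.ceil(image_info[0, 1, :] / image_info[0, 2, :]), tf.int32)
    # rescale_size == [640, 640]: dividing out the scale recovers the padded
    # extent at full resolution; the offsets then crop back to [480, 640].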
@@ -29,11 +29,17 @@ from official.vision.serving import semantic_segmentation
 class SemanticSegmentationExportTest(tf.test.TestCase, parameterized.TestCase):

-  def _get_segmentation_module(self, input_type):
+  def _get_segmentation_module(self,
+                               input_type,
+                               rescale_output,
+                               preserve_aspect_ratio,
+                               batch_size=1):
     params = exp_factory.get_exp_config('mnv2_deeplabv3_pascal')
+    params.task.export_config.rescale_output = rescale_output
+    params.task.train_data.preserve_aspect_ratio = preserve_aspect_ratio
     segmentation_module = semantic_segmentation.SegmentationModule(
         params,
-        batch_size=1,
+        batch_size=batch_size,
         input_image_size=[112, 112],
         input_type=input_type)
     return segmentation_module
@@ -43,18 +49,20 @@ class SemanticSegmentationExportTest(tf.test.TestCase, parameterized.TestCase):
         {input_type: 'serving_default'})
     tf.saved_model.save(module, save_directory, signatures=signatures)

-  def _get_dummy_input(self, input_type):
+  def _get_dummy_input(self, input_type, input_image_size):
     """Get dummy input for the given input type."""
+    height = input_image_size[0]
+    width = input_image_size[1]
     if input_type == 'image_tensor':
-      return tf.zeros((1, 112, 112, 3), dtype=np.uint8)
+      return tf.zeros((1, height, width, 3), dtype=np.uint8)
     elif input_type == 'image_bytes':
-      image = Image.fromarray(np.zeros((112, 112, 3), dtype=np.uint8))
+      image = Image.fromarray(np.zeros((height, width, 3), dtype=np.uint8))
       byte_io = io.BytesIO()
       image.save(byte_io, 'PNG')
       return [byte_io.getvalue()]
     elif input_type == 'tf_example':
-      image_tensor = tf.zeros((112, 112, 3), dtype=tf.uint8)
+      image_tensor = tf.zeros((height, width, 3), dtype=tf.uint8)
       encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).numpy()
       example = tf.train.Example(
           features=tf.train.Features(
@@ -65,17 +73,24 @@ class SemanticSegmentationExportTest(tf.test.TestCase, parameterized.TestCase):
           })).SerializeToString()
       return [example]
     elif input_type == 'tflite':
-      return tf.zeros((1, 112, 112, 3), dtype=np.float32)
+      return tf.zeros((1, height, width, 3), dtype=np.float32)

   @parameterized.parameters(
-      {'input_type': 'image_tensor'},
-      {'input_type': 'image_bytes'},
-      {'input_type': 'tf_example'},
-      {'input_type': 'tflite'},
+      ('image_tensor', False, [112, 112], False),
+      ('image_bytes', False, [112, 112], False),
+      ('tf_example', False, [112, 112], True),
+      ('tflite', False, [112, 112], False),
+      ('image_tensor', True, [112, 56], True),
+      ('image_bytes', True, [112, 56], True),
+      ('tf_example', True, [56, 112], False),
   )
-  def test_export(self, input_type='image_tensor'):
+  def test_export(self, input_type, rescale_output, input_image_size,
+                  preserve_aspect_ratio):
     tmp_dir = self.get_temp_dir()
-    module = self._get_segmentation_module(input_type)
+    module = self._get_segmentation_module(
+        input_type=input_type,
+        rescale_output=rescale_output,
+        preserve_aspect_ratio=preserve_aspect_ratio)

     self._export_from_module(module, input_type, tmp_dir)
@@ -90,7 +105,7 @@ class SemanticSegmentationExportTest(tf.test.TestCase, parameterized.TestCase):
     imported = tf.saved_model.load(tmp_dir)
     segmentation_fn = imported.signatures['serving_default']

-    images = self._get_dummy_input(input_type)
+    images = self._get_dummy_input(input_type, input_image_size)

     if input_type != 'tflite':
       processed_images, _ = tf.nest.map_structure(
           tf.stop_gradient,
@@ -103,12 +118,28 @@ class SemanticSegmentationExportTest(tf.test.TestCase, parameterized.TestCase):
                   shape=[4, 2], dtype=tf.float32))))
     else:
       processed_images = images

-    expected_output = tf.image.resize(
-        module.model(processed_images, training=False)['logits'], [112, 112],
-        method='bilinear')
+    logits = module.model(processed_images, training=False)['logits']
+    if rescale_output:
+      expected_output = tf.image.resize(
+          logits, input_image_size, method='bilinear')
+    else:
+      expected_output = tf.image.resize(logits, [112, 112], method='bilinear')

     out = segmentation_fn(tf.constant(images))
     self.assertAllClose(out['logits'].numpy(), expected_output.numpy())

+  def test_export_invalid_batch_size(self):
+    batch_size = 3
+    tmp_dir = self.get_temp_dir()
+    module = self._get_segmentation_module(
+        input_type='image_tensor',
+        rescale_output=True,
+        preserve_aspect_ratio=False,
+        batch_size=batch_size)
+    with self.assertRaisesRegex(ValueError,
+                                'Batch size cannot be more than 1.'):
+      self._export_from_module(module, 'image_tensor', tmp_dir)


 if __name__ == '__main__':
   tf.test.main()
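Outside the test harness, consuming the exported artifact follows the same pattern the test exercises; a minimal sketch (the path and the 112x112 input size are placeholders):

    import tensorflow as tf

    imported = tf.saved_model.load('/tmp/segmentation_model')  # placeholder
    segmentation_fn = imported.signatures['serving_default']
    images = tf.zeros((1, 112, 112, 3), dtype=tf.uint8)
    logits = segmentation_fn(tf.constant(images))['logits']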
@@ -22,6 +22,8 @@ the coordinates of the keypoint.
 import numpy as np
 import tensorflow.compat.v1 as tf

+from object_detection.utils import shape_utils


 def scale(keypoints, y_scale, x_scale, scope=None):
   """Scales keypoint coordinates in x and y dimensions.
@@ -345,7 +347,8 @@ def keypoint_weights_from_visibilities(keypoint_visibilities,
   """
   keypoint_visibilities.get_shape().assert_has_rank(2)
   if per_keypoint_weights is None:
-    num_keypoints = keypoint_visibilities.shape.as_list()[1]
+    num_keypoints = shape_utils.combined_static_and_dynamic_shape(
+        keypoint_visibilities)[1]
     per_keypoint_weight_mult = tf.ones((1, num_keypoints,), dtype=tf.float32)
   else:
     per_keypoint_weight_mult = tf.expand_dims(per_keypoint_weights, axis=0)
...
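The swap matters when the keypoint dimension is unknown at graph-construction time: `shape.as_list()` returns None for such dimensions, which `tf.ones` cannot consume, whereas `combined_static_and_dynamic_shape` substitutes a dynamic `tf.shape` slice exactly where the static value is missing. A minimal graph-mode sketch:

    import tensorflow.compat.v1 as tf
    from object_detection.utils import shape_utils

    with tf.Graph().as_default():
      visibilities = tf.placeholder(tf.float32, shape=[None, None])
      print(visibilities.shape.as_list())  # [None, None]: unusable for tf.ones
      num_keypoints = shape_utils.combined_static_and_dynamic_shape(
          visibilities)[1]                 # scalar int32 tensor
      weights = tf.ones((1, num_keypoints), dtype=tf.float32)  # works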
@@ -34,6 +34,7 @@ MASK_LOGITS_GT_BOXES = 'MASK_LOGITS_GT_BOXES'
 DEEP_MASK_ESTIMATION = 'deep_mask_estimation'
 DEEP_MASK_BOX_CONSISTENCY = 'deep_mask_box_consistency'
 DEEP_MASK_COLOR_CONSISTENCY = 'deep_mask_color_consistency'
+DEEP_MASK_POINTLY_SUPERVISED = 'deep_mask_pointly_supervised'
 SELF_SUPERVISED_DEAUGMENTED_MASK_LOGITS = (
     'SELF_SUPERVISED_DEAUGMENTED_MASK_LOGITS')
 DEEP_MASK_AUGMENTED_SELF_SUPERVISION = 'deep_mask_augmented_self_supervision'
@@ -41,8 +42,10 @@ LOSS_KEY_PREFIX = center_net_meta_arch.LOSS_KEY_PREFIX
 NEIGHBORS_2D = [[-1, -1], [-1, 0], [-1, 1],
                 [0, -1], [0, 1],
                 [1, -1], [1, 0], [1, 1]]

 WEAK_LOSSES = [DEEP_MASK_BOX_CONSISTENCY, DEEP_MASK_COLOR_CONSISTENCY,
-               DEEP_MASK_AUGMENTED_SELF_SUPERVISION]
+               DEEP_MASK_AUGMENTED_SELF_SUPERVISION,
+               DEEP_MASK_POINTLY_SUPERVISED]

 MASK_LOSSES = WEAK_LOSSES + [DEEP_MASK_ESTIMATION]
@@ -64,7 +67,8 @@ DeepMACParams = collections.namedtuple('DeepMACParams', [
     'augmented_self_supervision_warmup_steps',
     'augmented_self_supervision_loss',
     'augmented_self_supervision_scale_min',
-    'augmented_self_supervision_scale_max'
+    'augmented_self_supervision_scale_max',
+    'pointly_supervised_keypoint_loss_weight'
 ])
@@ -78,6 +82,8 @@ def _get_loss_weight(loss_name, config):
     return config.box_consistency_loss_weight
   elif loss_name == DEEP_MASK_AUGMENTED_SELF_SUPERVISION:
     return config.augmented_self_supervision_loss_weight
+  elif loss_name == DEEP_MASK_POINTLY_SUPERVISED:
+    return config.pointly_supervised_keypoint_loss_weight
   else:
     raise ValueError('Unknown loss - {}'.format(loss_name))
@@ -1356,6 +1362,11 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
       loss: A [batch_size, num_instances] shaped tensor with the loss for each
         instance.
     """
+    if mask_gt is None:
+      logging.info('No mask GT provided, mask loss is 0.')
+      return tf.zeros_like(boxes[:, :, 0])

     batch_size, num_instances = tf.shape(boxes)[0], tf.shape(boxes)[1]
     mask_logits = self._resize_logits_like_gt(mask_logits, mask_gt)
@@ -1572,9 +1583,86 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
     return loss

+  def _compute_pointly_supervised_loss_from_keypoints(
+      self, mask_logits, keypoints_gt, keypoints_depth_gt):
+    """Computes per-point mask loss from keypoints.
+
+    Args:
+      mask_logits: A [batch_size, num_instances, height, width] float tensor
+        denoting predicted masks.
+      keypoints_gt: A [batch_size, num_instances, num_keypoints, 2] float
+        tensor of normalized keypoint coordinates.
+      keypoints_depth_gt: A [batch_size, num_instances, num_keypoints] float
+        tensor of keypoint depths. We assume that +1 is foreground and -1
+        is background.
+
+    Returns:
+      loss: Pointly supervised loss with shape [batch_size, num_instances].
+    """
+    if keypoints_gt is None:
+      logging.info(('Returning 0 pointly supervised loss because '
+                    'keypoints are not given.'))
+      return tf.zeros(tf.shape(mask_logits)[:2])
+
+    if keypoints_depth_gt is None:
+      logging.info(('Returning 0 pointly supervised loss because '
+                    'keypoint depths are not given.'))
+      return tf.zeros(tf.shape(mask_logits)[:2])
+
+    if not self._deepmac_params.predict_full_resolution_masks:
+      raise NotImplementedError(
+          'Pointly supervised loss not implemented with RoIAlign.')
+
+    num_keypoints = tf.shape(keypoints_gt)[2]
+    keypoints_nan = tf.math.is_nan(keypoints_gt)
+    keypoints_gt = tf.where(
+        keypoints_nan, tf.zeros_like(keypoints_gt), keypoints_gt)
+    weights = tf.cast(
+        tf.logical_not(tf.reduce_any(keypoints_nan, axis=3)), tf.float32)
+
+    height, width = tf.shape(mask_logits)[2], tf.shape(mask_logits)[3]
+    ky, kx = tf.unstack(keypoints_gt, axis=3)
+    height_f, width_f = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
+    ky = tf.clip_by_value(tf.cast(ky * height_f, tf.int32), 0, height - 1)
+    kx = tf.clip_by_value(tf.cast(kx * width_f, tf.int32), 0, width - 1)
+    keypoints_gt_int = tf.stack([ky, kx], axis=3)
+
+    mask_logits_flat, batch_size, num_instances = flatten_first2_dims(
+        mask_logits)
+    keypoints_gt_int_flat, _, _ = flatten_first2_dims(keypoints_gt_int)
+    keypoint_depths_flat, _, _ = flatten_first2_dims(keypoints_depth_gt)
+    weights_flat, _, _ = flatten_first2_dims(weights)
+
+    # TODO(vighneshb): Replace with bilinear interpolation.
+    point_mask_logits = tf.gather_nd(
+        mask_logits_flat, keypoints_gt_int_flat, batch_dims=1)
+
+    point_mask_logits = tf.reshape(
+        point_mask_logits, [batch_size * num_instances, num_keypoints, 1])
+    labels = tf.cast(keypoint_depths_flat > 0.0, tf.float32)
+    labels = tf.reshape(
+        labels, [batch_size * num_instances, num_keypoints, 1])
+    weights_flat = tf.reshape(
+        weights_flat, [batch_size * num_instances, num_keypoints, 1])
+
+    loss = self._deepmac_params.classification_loss(
+        prediction_tensor=point_mask_logits, target_tensor=labels,
+        weights=weights_flat)
+    loss = self._aggregate_classification_loss(
+        loss, gt=labels, pred=point_mask_logits, method='normalize_auto')
+    return tf.reshape(loss, [batch_size, num_instances])

   def _compute_deepmac_losses(
       self, boxes, masks_logits, masks_gt, image,
-      self_supervised_masks_logits=None):
+      self_supervised_masks_logits=None, keypoints_gt=None,
+      keypoints_depth_gt=None):
     """Returns the mask loss per instance.

     Args:
@@ -1584,19 +1672,28 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
         float tensor containing the instance mask predictions in their logit
         form.
       masks_gt: A [batch_size, num_instances, output_height, output_width] float
-        tensor containing the groundtruth masks.
+        tensor containing the groundtruth masks. If masks_gt is None,
+        DEEP_MASK_ESTIMATION is filled with 0s.
       image: [batch_size, output_height, output_width, channels] float tensor
         denoting the input image.
       self_supervised_masks_logits: Optional self-supervised mask logits to
         compare against of same shape as mask_logits.
+      keypoints_gt: A float tensor of shape
+        [batch_size, num_instances, num_keypoints, 2], representing the points
+        where we have mask supervision.
+      keypoints_depth_gt: A float tensor of shape
+        [batch_size, num_instances, num_keypoints] of keypoint depths which
+        indicate the mask label at the keypoint locations. depth=+1 is
+        foreground and depth=-1 is background.

     Returns:
-      mask_prediction_loss: A [batch_size, num_instances] shaped float tensor
-        containing the mask loss for each instance in the batch.
-      box_consistency_loss: A [batch_size, num_instances] shaped float tensor
-        containing the box consistency loss for each instance in the batch.
-      box_consistency_loss: A [batch_size, num_instances] shaped float tensor
-        containing the color consistency loss in the batch.
+      tensor_dict: A dictionary with 5 keys, each mapping to a tensor of shape
+        [batch_size, num_instances]. The 5 keys are:
+        - DEEP_MASK_ESTIMATION
+        - DEEP_MASK_BOX_CONSISTENCY
+        - DEEP_MASK_COLOR_CONSISTENCY
+        - DEEP_MASK_AUGMENTED_SELF_SUPERVISION
+        - DEEP_MASK_POINTLY_SUPERVISED
     """

     if tf.keras.backend.learning_phase():
@@ -1611,11 +1708,11 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
     else:
       boxes_for_crop = boxes

-    mask_gt = self._get_groundtruth_mask_output(
+    if masks_gt is not None:
+      masks_gt = self._get_groundtruth_mask_output(
           boxes_for_crop, masks_gt)
     mask_prediction_loss = self._compute_mask_prediction_loss(
-        boxes_for_crop, masks_logits, mask_gt)
+        boxes_for_crop, masks_logits, masks_gt)

     box_consistency_loss = self._compute_box_consistency_loss(
         boxes, boxes_for_crop, masks_logits)
@@ -1627,11 +1724,16 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
         masks_logits, self_supervised_masks_logits, boxes,
     )

+    pointly_supervised_loss = (
+        self._compute_pointly_supervised_loss_from_keypoints(
+            masks_logits, keypoints_gt, keypoints_depth_gt))

     return {
         DEEP_MASK_ESTIMATION: mask_prediction_loss,
         DEEP_MASK_BOX_CONSISTENCY: box_consistency_loss,
         DEEP_MASK_COLOR_CONSISTENCY: color_consistency_loss,
-        DEEP_MASK_AUGMENTED_SELF_SUPERVISION: self_supervised_loss
+        DEEP_MASK_AUGMENTED_SELF_SUPERVISION: self_supervised_loss,
+        DEEP_MASK_POINTLY_SUPERVISED: pointly_supervised_loss,
     }

   def _get_lab_image(self, preprocessed_image):
@@ -1644,6 +1746,13 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
           ' consistency loss is not supported in TF1.'))
     return tfio.experimental.color.rgb_to_lab(raw_image)

+  def _maybe_get_gt_batch(self, field):
+    """Returns a batch of groundtruth tensors if available, else None."""
+    if self.groundtruth_has_field(field):
+      return _batch_gt_list(self.groundtruth_lists(field))
+    else:
+      return None

   def _compute_masks_loss(self, prediction_dict):
     """Computes the mask loss.
@@ -1671,16 +1780,12 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
         prediction_dict['preprocessed_inputs'], (height, width))
     image = self._get_lab_image(preprocessed_image)

-    # Iterate over multiple preidctions by backbone (for hourglass length=2)
-    gt_boxes = _batch_gt_list(
-        self.groundtruth_lists(fields.BoxListFields.boxes))
-    gt_weights = _batch_gt_list(
-        self.groundtruth_lists(fields.BoxListFields.weights))
-    gt_masks = _batch_gt_list(
-        self.groundtruth_lists(fields.BoxListFields.masks))
-    gt_classes = _batch_gt_list(
-        self.groundtruth_lists(fields.BoxListFields.classes))
+    gt_boxes = self._maybe_get_gt_batch(fields.BoxListFields.boxes)
+    gt_weights = self._maybe_get_gt_batch(fields.BoxListFields.weights)
+    gt_classes = self._maybe_get_gt_batch(fields.BoxListFields.classes)
+    gt_masks = self._maybe_get_gt_batch(fields.BoxListFields.masks)
+    gt_keypoints = self._maybe_get_gt_batch(fields.BoxListFields.keypoints)
+    gt_depths = self._maybe_get_gt_batch(fields.BoxListFields.keypoint_depths)

     mask_logits_list = prediction_dict[MASK_LOGITS_GT_BOXES]
     self_supervised_mask_logits_list = prediction_dict.get(
@@ -1688,6 +1793,7 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
         [None] * len(mask_logits_list))
     assert len(mask_logits_list) == len(self_supervised_mask_logits_list)

+    # Iterate over multiple predictions by the backbone (for hourglass, length=2).
     for (mask_logits, self_supervised_mask_logits) in zip(
         mask_logits_list, self_supervised_mask_logits_list):
@@ -1698,9 +1804,11 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
       sample_loss_dict = self._compute_deepmac_losses(
           gt_boxes, mask_logits, gt_masks, image,
-          self_supervised_masks_logits=self_supervised_mask_logits)
+          self_supervised_masks_logits=self_supervised_mask_logits,
+          keypoints_gt=gt_keypoints, keypoints_depth_gt=gt_depths)

       sample_loss_dict[DEEP_MASK_ESTIMATION] *= valid_mask_weights
       for loss_name in WEAK_LOSSES:
         sample_loss_dict[loss_name] *= gt_weights
...
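To make the point-supervision gather concrete, here is a small standalone numeric sketch with toy shapes; the real method additionally clips indices, masks NaN keypoints, and routes through the configured classification loss:

    import tensorflow as tf

    # One image, one instance, a 4x4 mask grid, and two labeled points.
    mask_logits = tf.reshape(tf.range(16, dtype=tf.float32), [1, 1, 4, 4])
    keypoints = tf.constant([[[[0.5, 0.5],      # maps to pixel (2, 2)
                               [0.0, 0.75]]]])  # maps to pixel (0, 3)
    depths = tf.constant([[[1.0, -1.0]]])       # +1 foreground, -1 background

    ky = tf.cast(keypoints[..., 0] * 4.0, tf.int32)
    kx = tf.cast(keypoints[..., 1] * 4.0, tf.int32)
    indices = tf.stack([ky, kx], axis=-1)               # [1, 1, 2, 2]

    flat_logits = tf.reshape(mask_logits, [1, 4, 4])    # merge (batch, instance)
    flat_indices = tf.reshape(indices, [1, 2, 2])
    point_logits = tf.gather_nd(flat_logits, flat_indices, batch_dims=1)
    # point_logits == [[10., 3.]]: the logits under the two points.

    labels = tf.cast(tf.reshape(depths, [1, 2]) > 0.0, tf.float32)  # [[1., 0.]]
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=point_logits)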
@@ -108,7 +108,8 @@ def build_meta_arch(**override_params):
       augmented_self_supervision_warmup_steps=0,
       augmented_self_supervision_loss='loss_dice',
       augmented_self_supervision_scale_min=1.0,
-      augmented_self_supervision_scale_max=1.0)
+      augmented_self_supervision_scale_max=1.0,
+      pointly_supervised_keypoint_loss_weight=1.0)

   params.update(override_params)
@@ -197,6 +198,7 @@ DEEPMAC_PROTO_TEXT = """
   augmented_self_supervision_flip_probability: 0.9
   augmented_self_supervision_scale_min: 0.42
   augmented_self_supervision_scale_max: 1.42
+  pointly_supervised_keypoint_loss_weight: 0.13
 """
@@ -225,6 +227,8 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
         params.augmented_self_supervision_scale_min, 0.42)
     self.assertAlmostEqual(
         params.augmented_self_supervision_scale_max, 1.42)
+    self.assertAlmostEqual(
+        params.pointly_supervised_keypoint_loss_weight, 0.13)

   def test_subsample_trivial(self):
     """Test subsampling masks."""
@@ -1440,9 +1444,12 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
                            loss_at_100[loss_key].numpy())

   def test_loss_keys(self):
-    model = build_meta_arch(use_dice_loss=True,
-                            augmented_self_supervision_loss_weight=1.0,
-                            augmented_self_supervision_max_translation=0.5)
+    model = build_meta_arch(
+        use_dice_loss=True,
+        augmented_self_supervision_loss_weight=1.0,
+        augmented_self_supervision_max_translation=0.5,
+        predict_full_resolution_masks=True)
     prediction = {
         'preprocessed_inputs': tf.random.normal((3, 32, 32, 3)),
         'MASK_LOGITS_GT_BOXES': [tf.random.normal((3, 5, 8, 8))] * 2,
@@ -1457,7 +1464,9 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
             tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)] * 3,
         groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)] * 3,
         groundtruth_weights_list=[tf.ones(5)] * 3,
-        groundtruth_masks_list=[tf.ones((5, 32, 32))] * 3)
+        groundtruth_masks_list=[tf.ones((5, 32, 32))] * 3,
+        groundtruth_keypoints_list=[tf.zeros((5, 10, 2))] * 3,
+        groundtruth_keypoint_depths_list=[tf.zeros((5, 10))] * 3)
     loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
     self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
@@ -1495,11 +1504,15 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
     classes = [tf.one_hot([1, 0, 1, 1, 1], depth=6)]
     weights = [tf.ones(5)]
     masks = [tf.ones((5, 32, 32))]
+    keypoints = [tf.zeros((5, 10, 2))]
+    keypoint_depths = [tf.ones((5, 10))]

     model.provide_groundtruth(
         groundtruth_boxes_list=boxes,
         groundtruth_classes_list=classes,
         groundtruth_weights_list=weights,
-        groundtruth_masks_list=masks)
+        groundtruth_masks_list=masks,
+        groundtruth_keypoints_list=keypoints,
+        groundtruth_keypoint_depths_list=keypoint_depths)
     loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
     self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
@@ -1513,7 +1526,8 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
         deepmac_meta_arch.DEEP_MASK_BOX_CONSISTENCY: rng.uniform(1, 5),
         deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY: rng.uniform(1, 5),
         deepmac_meta_arch.DEEP_MASK_AUGMENTED_SELF_SUPERVISION: (
-            rng.uniform(1, 5))
+            rng.uniform(1, 5)),
+        deepmac_meta_arch.DEEP_MASK_POINTLY_SUPERVISED: rng.uniform(1, 5)
     }

     weighted_model = build_meta_arch(
@@ -1531,14 +1545,18 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
             loss_weights[deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY]),
         augmented_self_supervision_loss_weight=(
             loss_weights[deepmac_meta_arch.DEEP_MASK_AUGMENTED_SELF_SUPERVISION]
-        )
+        ),
+        pointly_supervised_keypoint_loss_weight=(
+            loss_weights[deepmac_meta_arch.DEEP_MASK_POINTLY_SUPERVISED])
     )

     weighted_model.provide_groundtruth(
         groundtruth_boxes_list=boxes,
         groundtruth_classes_list=classes,
         groundtruth_weights_list=weights,
-        groundtruth_masks_list=masks)
+        groundtruth_masks_list=masks,
+        groundtruth_keypoints_list=keypoints,
+        groundtruth_keypoint_depths_list=keypoint_depths)

     weighted_loss = weighted_model.loss(prediction, tf.constant([[32, 32, 3]]))

     for mask_loss in deepmac_meta_arch.MASK_LOSSES:
@@ -1613,6 +1631,36 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
       self.assertAlmostEqual(loss_at_20[loss_key].numpy(),
                              loss_at_100[loss_key].numpy())

+  def test_pointly_supervised_loss(self):
+    tf.keras.backend.set_learning_phase(True)
+    model = build_meta_arch(
+        use_dice_loss=False,
+        predict_full_resolution_masks=True,
+        network_type='cond_inst1',
+        dim=9,
+        pixel_embedding_dim=8,
+        use_instance_embedding=False,
+        use_xy=False,
+        pointly_supervised_keypoint_loss_weight=1.0)
+
+    mask_logits = np.zeros((1, 1, 32, 32), dtype=np.float32)
+    keypoints = np.zeros((1, 1, 1, 2), dtype=np.float32)
+    keypoint_depths = np.zeros((1, 1, 1), dtype=np.float32)
+
+    keypoints[..., 0] = 0.5
+    keypoints[..., 1] = 0.5
+    keypoint_depths[..., 0] = 1.0
+    mask_logits[:, :, 16, 16] = 1.0
+
+    expected_loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        logits=[[1.0]], labels=[[1.0]]).numpy()
+    loss = model._compute_pointly_supervised_loss_from_keypoints(
+        mask_logits, keypoints, keypoint_depths)
+
+    self.assertEqual(loss.shape, (1, 1))
+    self.assertAllClose(expected_loss, loss)


 @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
 class FullyConnectedMaskHeadTest(tf.test.TestCase):
...
@@ -41,6 +41,7 @@ from object_detection.utils import visualization_utils as vutils
 MODEL_BUILD_UTIL_MAP = model_lib.MODEL_BUILD_UTIL_MAP

 NUM_STEPS_PER_ITERATION = 100
+LOG_EVERY = 100

 RESTORE_MAP_ERROR_TEMPLATE = (
@@ -536,8 +537,7 @@ def train_loop(
   # Write the as-run pipeline config to disk.
   if save_final_config:
-    tf.logging.info('Saving pipeline config file to directory {}'.format(
-        model_dir))
+    tf.logging.info('Saving pipeline config file to directory %s', model_dir)
     pipeline_config_final = create_pipeline_proto_from_configs(configs)
     config_util.save_pipeline_config(pipeline_config_final, model_dir)
@@ -699,7 +699,7 @@ def train_loop(
             for key, val in logged_dict.items():
               tf.compat.v2.summary.scalar(key, val, step=global_step)

-            if global_step.value() - logged_step >= 100:
+            if global_step.value() - logged_step >= LOG_EVERY:
               logged_dict_np = {name: value.numpy() for name, value in
                                 logged_dict.items()}
               tf.logging.info(
@@ -1091,8 +1091,7 @@ def eval_continuously(
   configs = merge_external_params_with_configs(
       configs, None, kwargs_dict=kwargs)
   if model_dir and save_final_config:
-    tf.logging.info('Saving pipeline config file to directory {}'.format(
-        model_dir))
+    tf.logging.info('Saving pipeline config file to directory %s', model_dir)
     pipeline_config_final = create_pipeline_proto_from_configs(configs)
     config_util.save_pipeline_config(pipeline_config_final, model_dir)
@@ -1104,11 +1103,11 @@ def eval_continuously(
   eval_on_train_input_config.sample_1_of_n_examples = (
       sample_1_of_n_eval_on_train_examples)
   if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
-    tf.logging.warning('Expected number of evaluation epochs is 1, but '
-                       'instead encountered `eval_on_train_input_config'
-                       '.num_epochs` = '
-                       '{}. Overwriting `num_epochs` to 1.'.format(
-                           eval_on_train_input_config.num_epochs))
+    tf.logging.warning(
+        ('Expected number of evaluation epochs is 1, but '
+         'instead encountered `eval_on_train_input_config'
+         '.num_epochs` = %d. Overwriting `num_epochs` to 1.'),
+        eval_on_train_input_config.num_epochs)
     eval_on_train_input_config.num_epochs = 1

   if kwargs['use_bfloat16']:
...
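The logging changes above share one pattern: pass format arguments to the logger instead of pre-formatting the string, so interpolation only happens if the record is actually emitted. A minimal sketch of the difference (the path is a placeholder):

    import tensorflow.compat.v1 as tf

    model_dir = '/tmp/model'  # placeholder

    # Eager: the string is built even when INFO logging is filtered out.
    tf.logging.info('Saving pipeline config file to directory {}'.format(model_dir))

    # Lazy %-style: the logger interpolates only when it emits the record.
    tf.logging.info('Saving pipeline config file to directory %s', model_dir)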
@@ -403,7 +403,7 @@ message CenterNet {
 // Mask prediction support using DeepMAC. See https://arxiv.org/abs/2104.00613
-// Next ID 33
+// Next ID 34
 message DeepMACMaskEstimation {
   // The loss used for penalizing mask predictions.
   optional ClassificationLoss classification_loss = 1;
@@ -520,6 +520,17 @@ message CenterNet {
   optional float augmented_self_supervision_scale_min = 31 [default=1.0];
   optional float augmented_self_supervision_scale_max = 32 [default=1.0];

+  // The loss weight for the pointly supervised loss as defined in the paper
+  // https://arxiv.org/abs/2104.06404
+  // We assume that point supervision is given through a keypoint dataset,
+  // where each keypoint represents a sampled point, and its depth indicates
+  // whether it is a foreground or background point.
+  // Depth = +1 is assumed to be foreground and
+  // Depth = -1 is assumed to be background.
+  optional float pointly_supervised_keypoint_loss_weight = 33 [default = 0.0];
 }

 optional DeepMACMaskEstimation deepmac_mask_estimation = 14;
...