"profiler/vscode:/vscode.git/clone" did not exist on "880fbee95782a30fb16654f830502d03dd92fae2"
Commit 31ca3b97 authored by Kaushik Shivakumar

resolve merge conflicts

parents 3e9d886d 7fcd7cba
......@@ -185,6 +185,9 @@ class FakeDetectionModel(model.DetectionModel):
"""
return {var.op.name: var for var in tf.global_variables()}
def restore_from_objects(self, fine_tune_checkpoint_type):
pass
def updates(self):
"""Returns a list of update operators for this model.
......
......@@ -924,13 +924,16 @@ def convert_strided_predictions_to_normalized_keypoints(
def convert_strided_predictions_to_instance_masks(
boxes, classes, masks, stride, mask_height, mask_width,
true_image_shapes, score_threshold=0.5):
boxes, classes, masks, true_image_shapes,
densepose_part_heatmap=None, densepose_surface_coords=None, stride=4,
mask_height=256, mask_width=256, score_threshold=0.5,
densepose_class_index=-1):
"""Converts predicted full-image masks into instance masks.
For each predicted detection box:
* Crop and resize the predicted mask based on the detected bounding box
coordinates and class prediction. Uses bilinear resampling.
* Crop and resize the predicted mask (and optionally DensePose coordinates)
based on the detected bounding box coordinates and class prediction. Uses
bilinear resampling.
* Binarize the mask using the provided score threshold.
Args:
......@@ -940,57 +943,212 @@ def convert_strided_predictions_to_instance_masks(
detected class for each box (0-indexed).
masks: A [batch, output_height, output_width, num_classes] float32
tensor with class probabilities.
true_image_shapes: A tensor of shape [batch, 3] representing the true
shape of the inputs not considering padding.
densepose_part_heatmap: (Optional) A [batch, output_height, output_width,
num_parts] float32 tensor with part scores (i.e. logits).
densepose_surface_coords: (Optional) A [batch, output_height, output_width,
2 * num_parts] float32 tensor with predicted part coordinates (in
vu-format).
stride: The stride in the output space.
mask_height: The desired resized height for instance masks.
mask_width: The desired resized width for instance masks.
true_image_shapes: A tensor of shape [batch, 3] representing the true
shape of the inputs not considering padding.
score_threshold: The threshold at which to convert predicted mask
into foreground pixels.
densepose_class_index: The class index (0-indexed) corresponding to the
class which has DensePose labels (e.g. person class).
Returns:
A [batch_size, max_detections, mask_height, mask_width] uint8 tensor with
predicted foreground mask for each instance. The masks take values in
{0, 1}.
A tuple of masks and surface_coords.
instance_masks: A [batch_size, max_detections, mask_height, mask_width]
uint8 tensor with predicted foreground mask for each
instance. If DensePose tensors are provided, then each pixel value in the
mask encodes the 1-indexed part.
surface_coords: A [batch_size, max_detections, mask_height, mask_width, 2]
float32 tensor with (v, u) coordinates. Note that v, u coordinates are
only defined on instance masks, and the coordinates at each location of
the foreground mask correspond to coordinates on a local part coordinate
system (the specific part can be inferred from the `instance_masks`
output). If DensePose feature maps are not passed to this function, this
output will be None.
Raises:
ValueError: If one but not both of `densepose_part_heatmap` and
`densepose_surface_coords` is provided.
"""
_, output_height, output_width, _ = (
batch_size, output_height, output_width, _ = (
shape_utils.combined_static_and_dynamic_shape(masks))
input_height = stride * output_height
input_width = stride * output_width
true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
# If necessary, create dummy DensePose tensors to simplify the map function.
densepose_present = True
if ((densepose_part_heatmap is not None) ^
(densepose_surface_coords is not None)):
raise ValueError('To use DensePose, both `densepose_part_heatmap` and '
'`densepose_surface_coords` must be provided')
if densepose_part_heatmap is None and densepose_surface_coords is None:
densepose_present = False
densepose_part_heatmap = tf.zeros(
(batch_size, output_height, output_width, 1), dtype=tf.float32)
densepose_surface_coords = tf.zeros(
(batch_size, output_height, output_width, 2), dtype=tf.float32)
crop_and_threshold_fn = functools.partial(
crop_and_threshold_masks, input_height=input_height,
input_width=input_width, mask_height=mask_height, mask_width=mask_width,
score_threshold=score_threshold,
densepose_class_index=densepose_class_index)
instance_masks, surface_coords = shape_utils.static_or_dynamic_map_fn(
crop_and_threshold_fn,
elems=[boxes, classes, masks, densepose_part_heatmap,
densepose_surface_coords, true_heights, true_widths],
dtype=[tf.uint8, tf.float32],
back_prop=False)
surface_coords = surface_coords if densepose_present else None
return instance_masks, surface_coords
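A minimal, standalone sketch (toy tensors; `sketch_masks_from_heatmaps` is a hypothetical helper, not part of this change) of the core trick used above: transpose the class channel onto the batch axis so a single tf.image.crop_and_resize call can pick, for each detection, the mask channel of its predicted class before thresholding.
import tensorflow as tf
def sketch_masks_from_heatmaps(boxes, classes, masks, score_threshold=0.5,
                               mask_height=8, mask_width=8):
  # masks: [height, width, num_classes] class probabilities for one image.
  # Move the class axis to the batch axis so crop_and_resize can select the
  # channel matching each detection via box_indices=classes.
  masks_4d = tf.transpose(masks, perm=[2, 0, 1])[:, :, :, tf.newaxis]
  crops = tf.image.crop_and_resize(
      masks_4d, boxes=boxes, box_indices=classes,
      crop_size=[mask_height, mask_width], method='bilinear')
  # Binarize: [max_detections, mask_height, mask_width] with values in {0, 1}.
  return tf.cast(tf.squeeze(crops, axis=3) >= score_threshold, tf.uint8)
boxes = tf.constant([[0., 0., 0.5, 0.5], [0.5, 0.5, 1., 1.]])
classes = tf.constant([0, 2], dtype=tf.int32)
masks = tf.random.uniform([4, 4, 3])
print(sketch_masks_from_heatmaps(boxes, classes, masks).shape)  # (2, 8, 8)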
def crop_and_threshold_masks(elems, input_height, input_width, mask_height=256,
mask_width=256, score_threshold=0.5,
densepose_class_index=-1):
"""Crops and thresholds masks based on detection boxes.
Args:
elems: A tuple of
boxes - float32 tensor of shape [max_detections, 4]
classes - int32 tensor of shape [max_detections] (0-indexed)
masks - float32 tensor of shape [output_height, output_width, num_classes]
part_heatmap - float32 tensor of shape [output_height, output_width,
num_parts]
surf_coords - float32 tensor of shape [output_height, output_width,
2 * num_parts]
true_height - scalar int tensor
true_width - scalar int tensor
input_height: Input height to network.
input_width: Input width to network.
mask_height: Height for resizing mask crops.
mask_width: Width for resizing mask crops.
score_threshold: The threshold at which to convert predicted mask
into foreground pixels.
densepose_class_index: scalar int tensor with the class index (0-indexed)
for DensePose.
Returns:
A tuple of
all_instances: A [max_detections, mask_height, mask_width] uint8 tensor
with a predicted foreground mask for each instance. Background is encoded
as 0, and foreground is encoded as a positive integer. Specific part
indices are encoded as 1-indexed parts (for classes that have part
information).
surface_coords: A [max_detections, mask_height, mask_width, 2]
float32 tensor with (v, u) coordinates for each part.
"""
(boxes, classes, masks, part_heatmap, surf_coords, true_height,
true_width) = elems
# Boxes are in normalized coordinates relative to true image shapes. Convert
# coordinates to be normalized relative to input image shapes (since masks
# may still have padding).
# Then crop and resize each mask.
def crop_and_threshold_masks(args):
"""Crops masks based on detection boxes."""
boxes, classes, masks, true_height, true_width = args
boxlist = box_list.BoxList(boxes)
y_scale = true_height / input_height
x_scale = true_width / input_width
boxlist = box_list_ops.scale(boxlist, y_scale, x_scale)
boxes = boxlist.get()
# Convert masks from [input_height, input_width, num_classes] to
# [num_classes, input_height, input_width, 1].
masks_4d = tf.transpose(masks, perm=[2, 0, 1])[:, :, :, tf.newaxis]
cropped_masks = tf2.image.crop_and_resize(
masks_4d,
boxes=boxes,
box_indices=classes,
crop_size=[mask_height, mask_width],
method='bilinear')
masks_3d = tf.squeeze(cropped_masks, axis=3)
masks_binarized = tf.math.greater_equal(masks_3d, score_threshold)
return tf.cast(masks_binarized, tf.uint8)
boxlist = box_list.BoxList(boxes)
y_scale = true_height / input_height
x_scale = true_width / input_width
boxlist = box_list_ops.scale(boxlist, y_scale, x_scale)
boxes = boxlist.get()
# Convert masks from [output_height, output_width, num_classes] to
# [num_classes, output_height, output_width, 1].
num_classes = tf.shape(masks)[-1]
masks_4d = tf.transpose(masks, perm=[2, 0, 1])[:, :, :, tf.newaxis]
# Tile part and surface coordinate masks for all classes.
part_heatmap_4d = tf.tile(part_heatmap[tf.newaxis, :, :, :],
multiples=[num_classes, 1, 1, 1])
surf_coords_4d = tf.tile(surf_coords[tf.newaxis, :, :, :],
multiples=[num_classes, 1, 1, 1])
feature_maps_concat = tf.concat([masks_4d, part_heatmap_4d, surf_coords_4d],
axis=-1)
# The following tensor has shape
# [max_detections, mask_height, mask_width, 1 + 3 * num_parts].
cropped_masks = tf2.image.crop_and_resize(
feature_maps_concat,
boxes=boxes,
box_indices=classes,
crop_size=[mask_height, mask_width],
method='bilinear')
# Split the cropped masks back into instance masks, part masks, and surface
# coordinates.
num_parts = tf.shape(part_heatmap)[-1]
instance_masks, part_heatmap_cropped, surface_coords_cropped = tf.split(
cropped_masks, [1, num_parts, 2 * num_parts], axis=-1)
# Threshold the instance masks. Resulting tensor has shape
# [max_detections, mask_height, mask_width, 1].
instance_masks_int = tf.cast(
tf.math.greater_equal(instance_masks, score_threshold), dtype=tf.int32)
# Produce a binary mask that is 1.0 only:
# - in the foreground region for an instance
# - in detections corresponding to the DensePose class
det_with_parts = tf.equal(classes, densepose_class_index)
det_with_parts = tf.cast(
tf.reshape(det_with_parts, [-1, 1, 1, 1]), dtype=tf.int32)
instance_masks_with_parts = tf.math.multiply(instance_masks_int,
det_with_parts)
# Similarly, produce a binary mask that holds the foreground masks only for
# instances without parts (i.e. non-DensePose classes).
det_without_parts = 1 - det_with_parts
instance_masks_without_parts = tf.math.multiply(instance_masks_int,
det_without_parts)
# Assemble a tensor that has standard instance segmentation masks for
# non-DensePose classes (with values in [0, 1]), and part segmentation masks
# for DensePose classes (with values in [0, 1, ..., num_parts]).
part_mask_int_zero_indexed = tf.math.argmax(
part_heatmap_cropped, axis=-1, output_type=tf.int32)[:, :, :, tf.newaxis]
part_mask_int_one_indexed = part_mask_int_zero_indexed + 1
all_instances = (instance_masks_without_parts +
instance_masks_with_parts * part_mask_int_one_indexed)
# Gather the surface coordinates for the parts.
surface_coords_cropped = tf.reshape(
surface_coords_cropped, [-1, mask_height, mask_width, num_parts, 2])
surface_coords = gather_surface_coords_for_parts(surface_coords_cropped,
part_mask_int_zero_indexed)
surface_coords = (
surface_coords * tf.cast(instance_masks_with_parts, tf.float32))
return [tf.squeeze(all_instances, axis=3), surface_coords]
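A tiny NumPy illustration (toy values, not from this change) of how `all_instances` is assembled above: detections of non-DensePose classes keep a {0, 1} mask, while DensePose-class detections replace foreground pixels with 1-indexed part ids.
import numpy as np
instance_mask = np.array([[1, 0],
                          [1, 1]], dtype=np.int32)  # thresholded foreground
part_argmax = np.array([[4, 2],
                        [4, 7]], dtype=np.int32)    # 0-indexed best parts
is_densepose_class = 1                              # class == densepose_class_index
with_parts = instance_mask * is_densepose_class
without_parts = instance_mask * (1 - is_densepose_class)
all_instances = without_parts + with_parts * (part_argmax + 1)
print(all_instances)
# [[5 0]
#  [5 8]]  -> foreground pixels carry 1-indexed part ids; background stays 0.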
def gather_surface_coords_for_parts(surface_coords_cropped,
highest_scoring_part):
"""Gathers the (v, u) coordinates for the highest scoring DensePose parts.
true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
masks_for_image = shape_utils.static_or_dynamic_map_fn(
crop_and_threshold_masks,
elems=[boxes, classes, masks, true_heights, true_widths],
dtype=tf.uint8,
back_prop=False)
masks = tf.stack(masks_for_image, axis=0)
return masks
Args:
surface_coords_cropped: A [max_detections, height, width, num_parts, 2]
float32 tensor with (v, u) surface coordinates.
highest_scoring_part: A [max_detections, height, width] integer tensor with
the highest scoring part (0-indexed) indices for each location.
Returns:
A [max_detections, height, width, 2] float32 tensor with the (v, u)
coordinates selected from the highest scoring parts.
"""
max_detections, height, width, num_parts, _ = (
shape_utils.combined_static_and_dynamic_shape(surface_coords_cropped))
flattened_surface_coords = tf.reshape(surface_coords_cropped, [-1, 2])
flattened_part_ids = tf.reshape(highest_scoring_part, [-1])
# Produce lookup indices that represent the locations of the highest scoring
# parts in the `flattened_surface_coords` tensor.
flattened_lookup_indices = (
num_parts * tf.range(max_detections * height * width) +
flattened_part_ids)
vu_coords_flattened = tf.gather(flattened_surface_coords,
flattened_lookup_indices, axis=0)
return tf.reshape(vu_coords_flattened, [max_detections, height, width, 2])
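A worked toy example (assumed values) of the flattened gather above: after reshaping to [N * num_parts, 2], the (v, u) row for pixel i's best part sits at index num_parts * i + part_id.
import numpy as np
num_parts = 3
coords = np.arange(2 * 1 * 1 * num_parts * 2).reshape(2, 1, 1, num_parts, 2)
best_part = np.array([[[2]], [[0]]])         # per-pixel argmax part index
flat_coords = coords.reshape(-1, 2)          # [N * num_parts, 2]
flat_ids = best_part.reshape(-1)
lookup = num_parts * np.arange(flat_ids.size) + flat_ids
print(flat_coords[lookup])                   # [[4 5], [6 7]]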
class ObjectDetectionParams(
......@@ -1235,6 +1393,64 @@ class MaskParams(
score_threshold, heatmap_bias_init)
class DensePoseParams(
collections.namedtuple('DensePoseParams', [
'class_id', 'classification_loss', 'localization_loss',
'part_loss_weight', 'coordinate_loss_weight', 'num_parts',
'task_loss_weight', 'upsample_to_input_res', 'upsample_method',
'heatmap_bias_init'
])):
"""Namedtuple to store DensePose prediction related parameters."""
__slots__ = ()
def __new__(cls,
class_id,
classification_loss,
localization_loss,
part_loss_weight=1.0,
coordinate_loss_weight=1.0,
num_parts=24,
task_loss_weight=1.0,
upsample_to_input_res=True,
upsample_method='bilinear',
heatmap_bias_init=-2.19):
"""Constructor with default values for DensePoseParams.
Args:
class_id: the ID of the class that contains the DensePose groundtruth.
This should typically correspond to the "person" class. Note that the ID
is 0-based, meaning that class 0 corresponds to the first non-background
object class.
classification_loss: an object_detection.core.losses.Loss object to
compute the loss for the body part predictions in CenterNet.
localization_loss: an object_detection.core.losses.Loss object to compute
the loss for the surface coordinate regression in CenterNet.
part_loss_weight: The loss weight to apply to part prediction.
coordinate_loss_weight: The loss weight to apply to surface coordinate
prediction.
num_parts: The number of DensePose parts to predict.
task_loss_weight: float, the loss weight for the DensePose task.
upsample_to_input_res: Whether to upsample the DensePose feature maps to
the input resolution before applying loss. Note that the prediction
outputs are still at the standard CenterNet output stride.
upsample_method: Method for upsampling DensePose feature maps. Options are
either 'bilinear' or 'nearest'. This has no effect when
`upsample_to_input_res` is False.
heatmap_bias_init: float, the initial value of bias in the convolutional
kernel of the part prediction head. If set to None, the
bias is initialized with zeros.
Returns:
An initialized DensePoseParams namedtuple.
"""
return super(DensePoseParams,
cls).__new__(cls, class_id, classification_loss,
localization_loss, part_loss_weight,
coordinate_loss_weight, num_parts,
task_loss_weight, upsample_to_input_res,
upsample_method, heatmap_bias_init)
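A minimal construction sketch, mirroring the defaults above and the test helper further down; the import paths follow the tests and are assumptions about this repository layout.
from object_detection.core import losses
from object_detection.meta_architectures import center_net_meta_arch as cnma
densepose_params = cnma.DensePoseParams(
    class_id=0,  # e.g. the "person" class (0-indexed)
    classification_loss=losses.WeightedSoftmaxClassificationLoss(),
    localization_loss=losses.L1LocalizationLoss(),
    num_parts=24,
    upsample_to_input_res=True,
    upsample_method='bilinear')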
# The following constants are used to generate the keys of the
# (prediction, loss, target assigner,...) dictionaries used in CenterNetMetaArch
# class.
......@@ -1247,6 +1463,9 @@ KEYPOINT_HEATMAP = 'keypoint/heatmap'
KEYPOINT_OFFSET = 'keypoint/offset'
SEGMENTATION_TASK = 'segmentation_task'
SEGMENTATION_HEATMAP = 'segmentation/heatmap'
DENSEPOSE_TASK = 'densepose_task'
DENSEPOSE_HEATMAP = 'densepose/heatmap'
DENSEPOSE_REGRESSION = 'densepose/regression'
LOSS_KEY_PREFIX = 'Loss'
......@@ -1290,7 +1509,8 @@ class CenterNetMetaArch(model.DetectionModel):
object_center_params,
object_detection_params=None,
keypoint_params_dict=None,
mask_params=None):
mask_params=None,
densepose_params=None):
"""Initializes a CenterNet model.
Args:
......@@ -1318,6 +1538,10 @@ class CenterNetMetaArch(model.DetectionModel):
mask_params: A MaskParams namedtuple. This object
holds the hyper-parameters for segmentation. Please see the class
definition for more details.
densepose_params: A DensePoseParams namedtuple. This object holds the
hyper-parameters for DensePose prediction. Please see the class
definition for more details. Note that if this is provided, it is
expected that `mask_params` is also provided.
"""
assert object_detection_params or keypoint_params_dict
# Shorten the name for convenience and better formatting.
......@@ -1333,6 +1557,10 @@ class CenterNetMetaArch(model.DetectionModel):
self._od_params = object_detection_params
self._kp_params_dict = keypoint_params_dict
self._mask_params = mask_params
if densepose_params is not None and mask_params is None:
raise ValueError('To run DensePose prediction, `mask_params` must also '
'be supplied.')
self._densepose_params = densepose_params
# Construct the prediction head nets.
self._prediction_head_dict = self._construct_prediction_heads(
......@@ -1413,8 +1641,18 @@ class CenterNetMetaArch(model.DetectionModel):
if self._mask_params is not None:
prediction_heads[SEGMENTATION_HEATMAP] = [
make_prediction_net(num_classes,
bias_fill=class_prediction_bias_init)
bias_fill=self._mask_params.heatmap_bias_init)
for _ in range(num_feature_outputs)]
if self._densepose_params is not None:
prediction_heads[DENSEPOSE_HEATMAP] = [
make_prediction_net( # pylint: disable=g-complex-comprehension
self._densepose_params.num_parts,
bias_fill=self._densepose_params.heatmap_bias_init)
for _ in range(num_feature_outputs)]
prediction_heads[DENSEPOSE_REGRESSION] = [
make_prediction_net(2 * self._densepose_params.num_parts)
for _ in range(num_feature_outputs)
]
return prediction_heads
def _initialize_target_assigners(self, stride, min_box_overlap_iou):
......@@ -1449,6 +1687,10 @@ class CenterNetMetaArch(model.DetectionModel):
if self._mask_params is not None:
target_assigners[SEGMENTATION_TASK] = (
cn_assigner.CenterNetMaskTargetAssigner(stride))
if self._densepose_params is not None:
dp_stride = 1 if self._densepose_params.upsample_to_input_res else stride
target_assigners[DENSEPOSE_TASK] = (
cn_assigner.CenterNetDensePoseTargetAssigner(dp_stride))
return target_assigners
......@@ -1860,6 +2102,113 @@ class CenterNetMetaArch(model.DetectionModel):
float(len(segmentation_predictions)) * total_pixels_in_loss)
return total_loss
def _compute_densepose_losses(self, input_height, input_width,
prediction_dict):
"""Computes the weighted DensePose losses.
Args:
input_height: An integer scalar tensor representing input image height.
input_width: An integer scalar tensor representing input image width.
prediction_dict: A dictionary holding predicted tensors output by the
"predict" function. See the "predict" function for more detailed
description.
Returns:
A dictionary of scalar float tensors representing the weighted losses for
the DensePose task:
DENSEPOSE_HEATMAP: the weighted part segmentation loss.
DENSEPOSE_REGRESSION: the weighted part surface coordinate loss.
"""
dp_heatmap_loss, dp_regression_loss = (
self._compute_densepose_part_and_coordinate_losses(
input_height=input_height,
input_width=input_width,
part_predictions=prediction_dict[DENSEPOSE_HEATMAP],
surface_coord_predictions=prediction_dict[DENSEPOSE_REGRESSION]))
loss_dict = {}
loss_dict[DENSEPOSE_HEATMAP] = (
self._densepose_params.part_loss_weight * dp_heatmap_loss)
loss_dict[DENSEPOSE_REGRESSION] = (
self._densepose_params.coordinate_loss_weight * dp_regression_loss)
return loss_dict
def _compute_densepose_part_and_coordinate_losses(
self, input_height, input_width, part_predictions,
surface_coord_predictions):
"""Computes the individual losses for the DensePose task.
Args:
input_height: An integer scalar tensor representing input image height.
input_width: An integer scalar tensor representing input image width.
part_predictions: A list of float tensors of shape [batch_size,
out_height, out_width, num_parts].
surface_coord_predictions: A list of float tensors of shape [batch_size,
out_height, out_width, 2 * num_parts].
Returns:
A tuple with two scalar loss tensors: part_prediction_loss and
surface_coord_loss.
"""
gt_dp_num_points_list = self.groundtruth_lists(
fields.BoxListFields.densepose_num_points)
gt_dp_part_ids_list = self.groundtruth_lists(
fields.BoxListFields.densepose_part_ids)
gt_dp_surface_coords_list = self.groundtruth_lists(
fields.BoxListFields.densepose_surface_coords)
gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
assigner = self._target_assigner_dict[DENSEPOSE_TASK]
batch_indices, batch_part_ids, batch_surface_coords, batch_weights = (
assigner.assign_part_and_coordinate_targets(
height=input_height,
width=input_width,
gt_dp_num_points_list=gt_dp_num_points_list,
gt_dp_part_ids_list=gt_dp_part_ids_list,
gt_dp_surface_coords_list=gt_dp_surface_coords_list,
gt_weights_list=gt_weights_list))
part_prediction_loss = 0
surface_coord_loss = 0
classification_loss_fn = self._densepose_params.classification_loss
localization_loss_fn = self._densepose_params.localization_loss
num_predictions = float(len(part_predictions))
num_valid_points = tf.math.count_nonzero(batch_weights)
num_valid_points = tf.cast(tf.math.maximum(num_valid_points, 1), tf.float32)
for part_pred, surface_coord_pred in zip(part_predictions,
surface_coord_predictions):
# Potentially upsample the feature maps, so that better quality (i.e.
# higher res) groundtruth can be applied.
if self._densepose_params.upsample_to_input_res:
part_pred = tf.keras.layers.UpSampling2D(
self._stride, interpolation=self._densepose_params.upsample_method)(
part_pred)
surface_coord_pred = tf.keras.layers.UpSampling2D(
self._stride, interpolation=self._densepose_params.upsample_method)(
surface_coord_pred)
# Compute the part prediction loss.
part_pred = cn_assigner.get_batch_predictions_from_indices(
part_pred, batch_indices[:, 0:3])
part_prediction_loss += classification_loss_fn(
part_pred[:, tf.newaxis, :],
batch_part_ids[:, tf.newaxis, :],
weights=batch_weights[:, tf.newaxis, tf.newaxis])
# Compute the surface coordinate loss.
batch_size, out_height, out_width, _ = _get_shape(
surface_coord_pred, 4)
surface_coord_pred = tf.reshape(
surface_coord_pred, [batch_size, out_height, out_width, -1, 2])
surface_coord_pred = cn_assigner.get_batch_predictions_from_indices(
surface_coord_pred, batch_indices)
surface_coord_loss += localization_loss_fn(
surface_coord_pred,
batch_surface_coords,
weights=batch_weights[:, tf.newaxis])
part_prediction_loss = tf.reduce_sum(part_prediction_loss) / (
num_predictions * num_valid_points)
surface_coord_loss = tf.reduce_sum(surface_coord_loss) / (
num_predictions * num_valid_points)
return part_prediction_loss, surface_coord_loss
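A toy check (assumed numbers) of the normalization at the end of this function: the summed per-point losses from every prediction head are divided by (number of heads x number of valid sampled points), with the point count clamped to at least 1 so images without DensePose labels do not divide by zero.
import tensorflow as tf
batch_weights = tf.constant([1., 1., 0., 1.])              # 3 valid points
per_head_loss_sums = [tf.constant(1.2), tf.constant(1.2)]  # one sum per head
num_predictions = float(len(per_head_loss_sums))
num_valid_points = tf.cast(
    tf.math.maximum(tf.math.count_nonzero(batch_weights), 1), tf.float32)
normalized = tf.add_n(per_head_loss_sums) / (num_predictions * num_valid_points)
print(float(normalized))  # 2.4 / (2 * 3) = 0.4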
def preprocess(self, inputs):
outputs = shape_utils.resize_images_and_return_shapes(
inputs, self._image_resizer_fn)
......@@ -1909,6 +2258,13 @@ class CenterNetMetaArch(model.DetectionModel):
'segmentation/heatmap' - [optional] A list of size num_feature_outputs
holding float tensors of size [batch_size, output_height,
output_width, num_classes] representing the mask logits.
'densepose/heatmap' - [optional] A list of size num_feature_outputs
holding float tensors of size [batch_size, output_height,
output_width, num_parts] representing the mask logits for each part.
'densepose/regression' - [optional] A list of size num_feature_outputs
holding float tensors of size [batch_size, output_height,
output_width, 2 * num_parts] representing the DensePose surface
coordinate predictions.
Note the $TASK_NAME is provided by the KeypointEstimation namedtuple
used to differentiate between different keypoint tasks.
"""
......@@ -1938,10 +2294,16 @@ class CenterNetMetaArch(model.DetectionModel):
scope: Optional scope name.
Returns:
A dictionary mapping the keys ['Loss/object_center', 'Loss/box/scale',
'Loss/box/offset', 'Loss/$TASK_NAME/keypoint/heatmap',
'Loss/$TASK_NAME/keypoint/offset',
'Loss/$TASK_NAME/keypoint/regression', 'Loss/segmentation/heatmap'] to
A dictionary mapping the keys [
'Loss/object_center',
'Loss/box/scale', (optional)
'Loss/box/offset', (optional)
'Loss/$TASK_NAME/keypoint/heatmap', (optional)
'Loss/$TASK_NAME/keypoint/offset', (optional)
'Loss/$TASK_NAME/keypoint/regression', (optional)
'Loss/segmentation/heatmap', (optional)
'Loss/densepose/heatmap', (optional)
'Loss/densepose/regression'] (optional)
scalar tensors corresponding to the losses for different tasks. Note the
$TASK_NAME is provided by the KeypointEstimation namedtuple used to
differentiate between different keypoint tasks.
......@@ -1999,6 +2361,16 @@ class CenterNetMetaArch(model.DetectionModel):
seg_losses[key] = seg_losses[key] * self._mask_params.task_loss_weight
losses.update(seg_losses)
if self._densepose_params is not None:
densepose_losses = self._compute_densepose_losses(
input_height=input_height,
input_width=input_width,
prediction_dict=prediction_dict)
for key in densepose_losses:
densepose_losses[key] = (
densepose_losses[key] * self._densepose_params.task_loss_weight)
losses.update(densepose_losses)
# Prepend the LOSS_KEY_PREFIX to the keys in the dictionary such that the
# losses will be grouped together in Tensorboard.
return dict([('%s/%s' % (LOSS_KEY_PREFIX, key), val)
......@@ -2033,9 +2405,14 @@ class CenterNetMetaArch(model.DetectionModel):
invalid keypoints have their coordinates and scores set to 0.0.
detection_keypoint_scores: (Optional) A float tensor of shape [batch,
max_detection, num_keypoints] with scores for each keypoint.
detection_masks: (Optional) An int tensor of shape [batch,
max_detections, mask_height, mask_width] with binarized masks for each
detection.
detection_masks: (Optional) A uint8 tensor of shape [batch,
max_detections, mask_height, mask_width] with masks for each
detection. Background is specified with 0, and foreground is specified
with positive integers (1 for standard instance segmentation mask, and
1-indexed parts for DensePose task).
detection_surface_coords: (Optional) A float32 tensor of shape [batch,
max_detection, mask_height, mask_width, 2] with DensePose surface
coordinates, in (v, u) format.
"""
object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
# Get x, y and channel indices corresponding to the top indices in the class
......@@ -2076,14 +2453,27 @@ class CenterNetMetaArch(model.DetectionModel):
if self._mask_params:
masks = tf.nn.sigmoid(prediction_dict[SEGMENTATION_HEATMAP][-1])
instance_masks = convert_strided_predictions_to_instance_masks(
boxes, classes, masks, self._stride, self._mask_params.mask_height,
self._mask_params.mask_width, true_image_shapes,
self._mask_params.score_threshold)
postprocess_dict.update({
fields.DetectionResultFields.detection_masks:
instance_masks
})
densepose_part_heatmap, densepose_surface_coords = None, None
densepose_class_index = 0
if self._densepose_params:
densepose_part_heatmap = prediction_dict[DENSEPOSE_HEATMAP][-1]
densepose_surface_coords = prediction_dict[DENSEPOSE_REGRESSION][-1]
densepose_class_index = self._densepose_params.class_id
instance_masks, surface_coords = (
convert_strided_predictions_to_instance_masks(
boxes, classes, masks, true_image_shapes,
densepose_part_heatmap, densepose_surface_coords,
stride=self._stride, mask_height=self._mask_params.mask_height,
mask_width=self._mask_params.mask_width,
score_threshold=self._mask_params.score_threshold,
densepose_class_index=densepose_class_index))
postprocess_dict[
fields.DetectionResultFields.detection_masks] = instance_masks
if self._densepose_params:
postprocess_dict[
fields.DetectionResultFields.detection_surface_coords] = (
surface_coords)
return postprocess_dict
def _postprocess_keypoints(self, prediction_dict, classes, y_indices,
......@@ -2330,17 +2720,61 @@ class CenterNetMetaArch(model.DetectionModel):
def regularization_losses(self):
return []
def restore_map(self, fine_tune_checkpoint_type='classification',
def restore_map(self,
fine_tune_checkpoint_type='detection',
load_all_detection_checkpoint_vars=False):
raise RuntimeError('CenterNetMetaArch not supported under TF1.x.')
def restore_from_objects(self, fine_tune_checkpoint_type='detection'):
"""Returns a map of Trackable objects to load from a foreign checkpoint.
Returns a dictionary of Tensorflow 2 Trackable objects (e.g. tf.Module
or Checkpoint). This enables the model to initialize based on weights from
another task. For example, the feature extractor variables from a
classification model can be used to bootstrap training of an object
detector. When loading from an object detection model, the checkpoint model
should have the same parameters as this detection model with exception of
the num_classes parameter.
Note that this function is intended to be used to restore Keras-based
models when running Tensorflow 2, whereas restore_map (not implemented
in CenterNet) is intended to be used to restore Slim-based models when
running Tensorflow 1.x.
TODO(jonathanhuang): Make this function consistent with other
meta-architectures.
Args:
fine_tune_checkpoint_type: whether to restore from a full detection
checkpoint (with compatible variable names) or to restore from a
classification checkpoint for initialization prior to training.
Valid values: `detection`, `classification`, `fine_tune`. Default
'detection'.
'detection': used when loading the Hourglass model pre-trained on
another detection task.
'classification': used when loading the ResNet model pre-trained on an
image classification task. Note that only the image feature encoding
part is loaded, not the upsampling layers.
'fine_tune': used when loading the entire CenterNet feature extractor
pre-trained on other tasks. The checkpoints saved during CenterNet
model training can be directly loaded using this mode.
Returns:
A dict mapping keys to Trackable objects (tf.Module or Checkpoint).
"""
if fine_tune_checkpoint_type == 'classification':
return {'feature_extractor': self._feature_extractor.get_base_model()}
if fine_tune_checkpoint_type == 'detection':
elif fine_tune_checkpoint_type == 'detection':
return {'feature_extractor': self._feature_extractor.get_model()}
elif fine_tune_checkpoint_type == 'fine_tune':
feature_extractor_model = tf.train.Checkpoint(
_feature_extractor=self._feature_extractor)
return {'model': feature_extractor_model}
else:
raise ValueError('Unknown fine tune checkpoint type - {}'.format(
raise ValueError('Not supported fine tune checkpoint type - {}'.format(
fine_tune_checkpoint_type))
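A short usage sketch (the checkpoint path and `detection_model` are placeholders) of how the returned Trackable map is typically consumed: wrap it in a tf.train.Checkpoint and restore, tolerating variables the foreign checkpoint does not provide.
restore_map = detection_model.restore_from_objects(
    fine_tune_checkpoint_type='classification')
ckpt = tf.train.Checkpoint(**restore_map)
ckpt.restore('/path/to/foreign_checkpoint').expect_partial()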
def updates(self):
......
......@@ -266,7 +266,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
masks_np[0, :, :3, 1] = 1 # Class 1.
masks = tf.constant(masks_np)
true_image_shapes = tf.constant([[6, 8, 3]])
instance_masks = cnma.convert_strided_predictions_to_instance_masks(
instance_masks, _ = cnma.convert_strided_predictions_to_instance_masks(
boxes, classes, masks, stride=2, mask_height=2, mask_width=2,
true_image_shapes=true_image_shapes)
return instance_masks
......@@ -289,6 +289,104 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
])
np.testing.assert_array_equal(expected_instance_masks, instance_masks)
def test_convert_strided_predictions_raises_error_with_one_tensor(self):
def graph_fn():
boxes = tf.constant(
[
[[0.5, 0.5, 1.0, 1.0],
[0.0, 0.5, 0.5, 1.0],
[0.0, 0.0, 0.0, 0.0]],
], tf.float32)
classes = tf.constant(
[
[0, 1, 0],
], tf.int32)
masks_np = np.zeros((1, 4, 4, 2), dtype=np.float32)
masks_np[0, :, 2:, 0] = 1 # Class 0.
masks_np[0, :, :3, 1] = 1 # Class 1.
masks = tf.constant(masks_np)
true_image_shapes = tf.constant([[6, 8, 3]])
densepose_part_heatmap = tf.random.uniform(
[1, 4, 4, 24])
instance_masks, _ = cnma.convert_strided_predictions_to_instance_masks(
boxes, classes, masks, true_image_shapes,
densepose_part_heatmap=densepose_part_heatmap,
densepose_surface_coords=None)
return instance_masks
with self.assertRaises(ValueError):
self.execute_cpu(graph_fn, [])
def test_crop_and_threshold_masks(self):
boxes_np = np.array(
[[0., 0., 0.5, 0.5],
[0.25, 0.25, 1.0, 1.0]], dtype=np.float32)
classes_np = np.array([0, 2], dtype=np.int32)
masks_np = np.zeros((4, 4, _NUM_CLASSES), dtype=np.float32)
masks_np[0, 0, 0] = 0.8
masks_np[1, 1, 0] = 0.6
masks_np[3, 3, 2] = 0.7
part_heatmap_np = np.zeros((4, 4, _DENSEPOSE_NUM_PARTS), dtype=np.float32)
part_heatmap_np[0, 0, 4] = 1
part_heatmap_np[0, 0, 2] = 0.6 # Lower scoring.
part_heatmap_np[1, 1, 8] = 0.2
part_heatmap_np[3, 3, 4] = 0.5
surf_coords_np = np.zeros((4, 4, 2 * _DENSEPOSE_NUM_PARTS),
dtype=np.float32)
surf_coords_np[:, :, 8:10] = 0.2, 0.9
surf_coords_np[:, :, 16:18] = 0.3, 0.5
true_height, true_width = 10, 10
input_height, input_width = 10, 10
mask_height = 4
mask_width = 4
def graph_fn():
elems = [
tf.constant(boxes_np),
tf.constant(classes_np),
tf.constant(masks_np),
tf.constant(part_heatmap_np),
tf.constant(surf_coords_np),
tf.constant(true_height, dtype=tf.int32),
tf.constant(true_width, dtype=tf.int32)
]
part_masks, surface_coords = cnma.crop_and_threshold_masks(
elems, input_height, input_width, mask_height=mask_height,
mask_width=mask_width, densepose_class_index=0)
return part_masks, surface_coords
part_masks, surface_coords = self.execute_cpu(graph_fn, [])
expected_part_masks = np.zeros((2, 4, 4), dtype=np.uint8)
expected_part_masks[0, 0, 0] = 5  # Recall parts are 1-indexed in output.
expected_part_masks[0, 2, 2] = 9  # Recall parts are 1-indexed in output.
expected_part_masks[1, 3, 3] = 1 # Standard instance segmentation mask.
expected_surface_coords = np.zeros((2, 4, 4, 2), dtype=np.float32)
expected_surface_coords[0, 0, 0, :] = 0.2, 0.9
expected_surface_coords[0, 2, 2, :] = 0.3, 0.5
np.testing.assert_allclose(expected_part_masks, part_masks)
np.testing.assert_allclose(expected_surface_coords, surface_coords)
def test_gather_surface_coords_for_parts(self):
surface_coords_cropped_np = np.zeros((2, 5, 5, _DENSEPOSE_NUM_PARTS, 2),
dtype=np.float32)
surface_coords_cropped_np[0, 0, 0, 5] = 0.3, 0.4
surface_coords_cropped_np[0, 1, 0, 9] = 0.5, 0.6
highest_scoring_part_np = np.zeros((2, 5, 5), dtype=np.int32)
highest_scoring_part_np[0, 0, 0] = 5
highest_scoring_part_np[0, 1, 0] = 9
def graph_fn():
surface_coords_cropped = tf.constant(surface_coords_cropped_np,
tf.float32)
highest_scoring_part = tf.constant(highest_scoring_part_np, tf.int32)
surface_coords_gathered = cnma.gather_surface_coords_for_parts(
surface_coords_cropped, highest_scoring_part)
return surface_coords_gathered
surface_coords_gathered = self.execute_cpu(graph_fn, [])
np.testing.assert_allclose([0.3, 0.4], surface_coords_gathered[0, 0, 0])
np.testing.assert_allclose([0.5, 0.6], surface_coords_gathered[0, 1, 0])
def test_top_k_feature_map_locations(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 1] = 1.0
......@@ -535,6 +633,8 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
keypoint_heatmap_np[1, 0, 1, 1] = 0.9
keypoint_heatmap_np[1, 2, 0, 1] = 0.8
# Note that the keypoint offsets are now per keypoint (as opposed to
# keypoint agnostic, in the test test_keypoint_candidate_prediction).
keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 4), dtype=np.float32)
keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5, 0.0, 0.0]
......@@ -949,6 +1049,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
_NUM_CLASSES = 10
_KEYPOINT_INDICES = [0, 1, 2, 3]
_NUM_KEYPOINTS = len(_KEYPOINT_INDICES)
_DENSEPOSE_NUM_PARTS = 24
_TASK_NAME = 'human_pose'
......@@ -991,6 +1092,20 @@ def get_fake_mask_params():
mask_width=4)
def get_fake_densepose_params():
"""Returns the fake DensePose estimation parameter namedtuple."""
return cnma.DensePoseParams(
class_id=1,
classification_loss=losses.WeightedSoftmaxClassificationLoss(),
localization_loss=losses.L1LocalizationLoss(),
part_loss_weight=1.0,
coordinate_loss_weight=1.0,
num_parts=_DENSEPOSE_NUM_PARTS,
task_loss_weight=1.0,
upsample_to_input_res=True,
upsample_method='nearest')
def build_center_net_meta_arch(build_resnet=False):
"""Builds the CenterNet meta architecture."""
if build_resnet:
......@@ -1018,7 +1133,8 @@ def build_center_net_meta_arch(build_resnet=False):
object_center_params=get_fake_center_params(),
object_detection_params=get_fake_od_params(),
keypoint_params_dict={_TASK_NAME: get_fake_kp_params()},
mask_params=get_fake_mask_params())
mask_params=get_fake_mask_params(),
densepose_params=get_fake_densepose_params())
def _logit(p):
......@@ -1102,6 +1218,16 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_CLASSES), output.shape)
# "densepose parts" head:
output = model._prediction_head_dict[cnma.DENSEPOSE_HEATMAP][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _DENSEPOSE_NUM_PARTS), output.shape)
# "densepose surface coordinates" head:
output = model._prediction_head_dict[cnma.DENSEPOSE_REGRESSION][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, 2 * _DENSEPOSE_NUM_PARTS), output.shape)
def test_initialize_target_assigners(self):
model = build_center_net_meta_arch()
assigner_dict = model._initialize_target_assigners(
......@@ -1125,6 +1251,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertIsInstance(assigner_dict[cnma.SEGMENTATION_TASK],
cn_assigner.CenterNetMaskTargetAssigner)
# DensePose estimation target assigner:
self.assertIsInstance(assigner_dict[cnma.DENSEPOSE_TASK],
cn_assigner.CenterNetDensePoseTargetAssigner)
def test_predict(self):
"""Test the predict function."""
......@@ -1145,6 +1275,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
(2, 32, 32, 2))
self.assertEqual(prediction_dict[cnma.SEGMENTATION_HEATMAP][0].shape,
(2, 32, 32, _NUM_CLASSES))
self.assertEqual(prediction_dict[cnma.DENSEPOSE_HEATMAP][0].shape,
(2, 32, 32, _DENSEPOSE_NUM_PARTS))
self.assertEqual(prediction_dict[cnma.DENSEPOSE_REGRESSION][0].shape,
(2, 32, 32, 2 * _DENSEPOSE_NUM_PARTS))
def test_loss(self):
"""Test the loss function."""
......@@ -1157,7 +1291,13 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
groundtruth_keypoints_list=groundtruth_dict[
fields.BoxListFields.keypoints],
groundtruth_masks_list=groundtruth_dict[
fields.BoxListFields.masks])
fields.BoxListFields.masks],
groundtruth_dp_num_points_list=groundtruth_dict[
fields.BoxListFields.densepose_num_points],
groundtruth_dp_part_ids_list=groundtruth_dict[
fields.BoxListFields.densepose_part_ids],
groundtruth_dp_surface_coords_list=groundtruth_dict[
fields.BoxListFields.densepose_surface_coords])
prediction_dict = get_fake_prediction_dict(
input_height=16, input_width=32, stride=4)
......@@ -1193,6 +1333,12 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.SEGMENTATION_HEATMAP)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.DENSEPOSE_HEATMAP)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.DENSEPOSE_REGRESSION)])
@parameterized.parameters(
{'target_class_id': 1},
......@@ -1230,6 +1376,14 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
segmentation_heatmap[:, 14:18, 14:18, target_class_id] = 1.0
segmentation_heatmap = _logit(segmentation_heatmap)
dp_part_ind = 4
dp_part_heatmap = np.zeros((1, 32, 32, _DENSEPOSE_NUM_PARTS),
dtype=np.float32)
dp_part_heatmap[0, 14:18, 14:18, dp_part_ind] = 1.0
dp_part_heatmap = _logit(dp_part_heatmap)
dp_surf_coords = np.random.randn(1, 32, 32, 2 * _DENSEPOSE_NUM_PARTS)
class_center = tf.constant(class_center)
height_width = tf.constant(height_width)
offset = tf.constant(offset)
......@@ -1237,6 +1391,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
segmentation_heatmap = tf.constant(segmentation_heatmap, dtype=tf.float32)
dp_part_heatmap = tf.constant(dp_part_heatmap, dtype=tf.float32)
dp_surf_coords = tf.constant(dp_surf_coords, dtype=tf.float32)
prediction_dict = {
cnma.OBJECT_CENTER: [class_center],
......@@ -1249,6 +1405,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION):
[keypoint_regression],
cnma.SEGMENTATION_HEATMAP: [segmentation_heatmap],
cnma.DENSEPOSE_HEATMAP: [dp_part_heatmap],
cnma.DENSEPOSE_REGRESSION: [dp_surf_coords]
}
def graph_fn():
......@@ -1271,12 +1429,13 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllEqual([1, max_detection, 4, 4],
detections['detection_masks'].shape)
# There should be some section of the first mask (corresponding to the only
# detection) with non-zero mask values.
self.assertGreater(np.sum(detections['detection_masks'][0, 0, :, :] > 0), 0)
# Masks should be empty for everything but the first detection.
self.assertAllEqual(
detections['detection_masks'][0, 1:, :, :],
np.zeros_like(detections['detection_masks'][0, 1:, :, :]))
self.assertAllEqual(
detections['detection_surface_coords'][0, 1:, :, :],
np.zeros_like(detections['detection_surface_coords'][0, 1:, :, :]))
if target_class_id == 1:
expected_kpts_for_obj_0 = np.array(
......@@ -1287,6 +1446,12 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
expected_kpts_for_obj_0, rtol=1e-6)
np.testing.assert_allclose(detections['detection_keypoint_scores'][0][0],
expected_kpt_scores_for_obj_0, rtol=1e-6)
# First detection has DensePose parts.
self.assertSameElements(
np.unique(detections['detection_masks'][0, 0, :, :]),
set([0, dp_part_ind + 1]))
self.assertGreater(np.sum(np.abs(detections['detection_surface_coords'])),
0.0)
else:
# All keypoint outputs should be zeros.
np.testing.assert_allclose(
......@@ -1297,6 +1462,14 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
detections['detection_keypoint_scores'][0][0],
np.zeros([num_keypoints], np.float),
rtol=1e-6)
# Binary segmentation mask.
self.assertSameElements(
np.unique(detections['detection_masks'][0, 0, :, :]),
set([0, 1]))
# No DensePose surface coordinates.
np.testing.assert_allclose(
detections['detection_surface_coords'][0, 0, :, :],
np.zeros_like(detections['detection_surface_coords'][0, 0, :, :]))
def test_get_instance_indices(self):
classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
......@@ -1353,6 +1526,17 @@ def get_fake_prediction_dict(input_height, input_width, stride):
mask_heatmap[0, 2, 4, 1] = 1.0
mask_heatmap = _logit(mask_heatmap)
densepose_heatmap = np.zeros((2, output_height, output_width,
_DENSEPOSE_NUM_PARTS), dtype=np.float32)
densepose_heatmap[0, 2, 4, 5] = 1.0
densepose_heatmap = _logit(densepose_heatmap)
densepose_regression = np.zeros((2, output_height, output_width,
2 * _DENSEPOSE_NUM_PARTS), dtype=np.float32)
# The surface coordinate indices for part index 5 are:
# (5 * 2, 5 * 2 + 1), or (10, 11).
densepose_regression[0, 2, 4, 10:12] = 0.4, 0.7
prediction_dict = {
'preprocessed_inputs':
tf.zeros((2, input_height, input_width, 3)),
......@@ -1383,6 +1567,14 @@ def get_fake_prediction_dict(input_height, input_width, stride):
cnma.SEGMENTATION_HEATMAP: [
tf.constant(mask_heatmap),
tf.constant(mask_heatmap)
],
cnma.DENSEPOSE_HEATMAP: [
tf.constant(densepose_heatmap),
tf.constant(densepose_heatmap),
],
cnma.DENSEPOSE_REGRESSION: [
tf.constant(densepose_regression),
tf.constant(densepose_regression),
]
}
return prediction_dict
......@@ -1427,12 +1619,30 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
tf.constant(mask),
tf.zeros_like(mask),
]
densepose_num_points = [
tf.constant([1], dtype=tf.int32),
tf.constant([0], dtype=tf.int32),
]
densepose_part_ids = [
tf.constant([[5, 0, 0]], dtype=tf.int32),
tf.constant([[0, 0, 0]], dtype=tf.int32),
]
densepose_surface_coords_np = np.zeros((1, 3, 4), dtype=np.float32)
densepose_surface_coords_np[0, 0, :] = 0.55, 0.55, 0.4, 0.7
densepose_surface_coords = [
tf.constant(densepose_surface_coords_np),
tf.zeros_like(densepose_surface_coords_np)
]
groundtruth_dict = {
fields.BoxListFields.boxes: boxes,
fields.BoxListFields.weights: weights,
fields.BoxListFields.classes: classes,
fields.BoxListFields.keypoints: keypoints,
fields.BoxListFields.masks: masks,
fields.BoxListFields.densepose_num_points: densepose_num_points,
fields.BoxListFields.densepose_part_ids: densepose_part_ids,
fields.BoxListFields.densepose_surface_coords:
densepose_surface_coords,
fields.InputDataFields.groundtruth_labeled_classes: labeled_classes,
}
return groundtruth_dict
......@@ -1574,8 +1784,9 @@ class CenterNetMetaArchRestoreTest(test_case.TestCase):
"""Test restore map for a resnet backbone."""
model = build_center_net_meta_arch(build_resnet=True)
restore_map = model.restore_map('classification')
self.assertIsInstance(restore_map['feature_extractor'], tf.keras.Model)
restore_from_objects_map = model.restore_from_objects('classification')
self.assertIsInstance(restore_from_objects_map['feature_extractor'],
tf.keras.Model)
class DummyFeatureExtractor(cnma.CenterNetFeatureExtractor):
......@@ -1601,9 +1812,6 @@ class DummyFeatureExtractor(cnma.CenterNetFeatureExtractor):
def postprocess(self):
pass
def restore_map(self):
pass
def call(self, inputs):
batch_size, input_height, input_width, _ = inputs.shape
fake_output = tf.ones([
......
......@@ -324,7 +324,7 @@ class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
A float32 Tensor with shape [K, new_height, new_width, depth].
"""
box_features = self._crop_and_resize_fn(
features_to_crop, proposal_boxes_normalized,
[features_to_crop], proposal_boxes_normalized, None,
[self._initial_crop_size, self._initial_crop_size])
attention_features = self._context_feature_extract_fn(
......
......@@ -20,8 +20,8 @@ from __future__ import print_function
import functools
import unittest
from unittest import mock # pylint: disable=g-importing-member
from absl.testing import parameterized
import mock
import tensorflow.compat.v1 as tf
import tf_slim as slim
......@@ -41,7 +41,7 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.protos import box_predictor_pb2
from object_detection.protos import hyperparams_pb2
from object_detection.protos import post_processing_pb2
from object_detection.utils import ops
from object_detection.utils import spatial_transform_ops as spatial_ops
from object_detection.utils import test_case
from object_detection.utils import test_utils
from object_detection.utils import tf_version
......@@ -363,8 +363,9 @@ class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
max_negatives_per_positive=None)
crop_and_resize_fn = (
ops.matmul_crop_and_resize
if use_matmul_crop_and_resize else ops.native_crop_and_resize)
spatial_ops.multilevel_matmul_crop_and_resize
if use_matmul_crop_and_resize
else spatial_ops.multilevel_native_crop_and_resize)
common_kwargs = {
'is_training':
is_training,
......
......@@ -261,31 +261,6 @@ class FasterRCNNKerasFeatureExtractor(object):
"""Get model that extracts second stage box classifier features."""
pass
def restore_from_classification_checkpoint_fn(
self,
first_stage_feature_extractor_scope,
second_stage_feature_extractor_scope):
"""Returns a map of variables to load from a foreign checkpoint.
Args:
first_stage_feature_extractor_scope: A scope name for the first stage
feature extractor.
second_stage_feature_extractor_scope: A scope name for the second stage
feature extractor.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
"""
variables_to_restore = {}
for variable in variables_helper.get_global_variables_safely():
for scope_name in [first_stage_feature_extractor_scope,
second_stage_feature_extractor_scope]:
if variable.op.name.startswith(scope_name):
var_name = variable.op.name.replace(scope_name + '/', '')
variables_to_restore[var_name] = variable
return variables_to_restore
class FasterRCNNMetaArch(model.DetectionModel):
"""Faster R-CNN Meta-architecture definition."""
......@@ -1973,9 +1948,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
Returns:
A float32 tensor with shape [K, new_height, new_width, depth].
"""
features_to_crop = [features_to_crop]
num_levels = len(features_to_crop)
box_levels = None
if num_levels != 1:
# If there are multiple levels to select, get the box levels
box_levels = ops.fpn_feature_levels(num_levels, num_levels - 1,
1.0/224, proposal_boxes_normalized)
cropped_regions = self._flatten_first_two_dimensions(
self._crop_and_resize_fn(
features_to_crop, proposal_boxes_normalized,
features_to_crop, proposal_boxes_normalized, box_levels,
[self._initial_crop_size, self._initial_crop_size]))
return self._maxpool_layer(cropped_regions)
......@@ -2542,8 +2524,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
image_shape[1], image_shape[2], check_range=False).get()
flat_cropped_gt_mask = self._crop_and_resize_fn(
tf.expand_dims(flat_gt_masks, -1),
tf.expand_dims(flat_normalized_proposals, axis=1),
[tf.expand_dims(flat_gt_masks, -1)],
tf.expand_dims(flat_normalized_proposals, axis=1), None,
[mask_height, mask_width])
# Without stopping gradients into cropped groundtruth masks the
# performance with 100-padded groundtruth masks when batch size > 1 is
......@@ -2572,7 +2554,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
if second_stage_mask_loss is not None:
mask_loss = tf.multiply(self._second_stage_mask_loss_weight,
second_stage_mask_loss, name='mask_loss')
loss_dict[mask_loss.op.name] = mask_loss
loss_dict['Loss/BoxClassifierLoss/mask_loss'] = mask_loss
return loss_dict
def _get_mask_proposal_boxes_and_classes(
......@@ -2801,6 +2783,46 @@ class FasterRCNNMetaArch(model.DetectionModel):
variables_to_restore, include_patterns=include_patterns)
return {var.op.name: var for var in feature_extractor_variables}
def restore_from_objects(self, fine_tune_checkpoint_type='detection'):
"""Returns a map of Trackable objects to load from a foreign checkpoint.
Returns a dictionary of Tensorflow 2 Trackable objects (e.g. tf.Module
or Checkpoint). This enables the model to initialize based on weights from
another task. For example, the feature extractor variables from a
classification model can be used to bootstrap training of an object
detector. When loading from an object detection model, the checkpoint model
should have the same parameters as this detection model with exception of
the num_classes parameter.
Note that this function is intended to be used to restore Keras-based
models when running Tensorflow 2, whereas restore_map (above) is intended
to be used to restore Slim-based models when running Tensorflow 1.x.
Args:
fine_tune_checkpoint_type: whether to restore from a full detection
checkpoint (with compatible variable names) or to restore from a
classification checkpoint for initialization prior to training.
Valid values: `detection`, `classification`. Default 'detection'.
Returns:
A dict mapping keys to Trackable objects (tf.Module or Checkpoint).
"""
if fine_tune_checkpoint_type == 'classification':
return {
'feature_extractor':
self._feature_extractor.classification_backbone
}
elif fine_tune_checkpoint_type == 'detection':
fake_model = tf.train.Checkpoint(
_feature_extractor_for_box_classifier_features=
self._feature_extractor_for_box_classifier_features,
_feature_extractor_for_proposal_features=
self._feature_extractor_for_proposal_features)
return {'model': fake_model}
else:
raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
fine_tune_checkpoint_type))
def updates(self):
"""Returns a list of update operators for this model.
......
......@@ -34,7 +34,7 @@ from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.protos import box_predictor_pb2
from object_detection.protos import hyperparams_pb2
from object_detection.protos import post_processing_pb2
from object_detection.utils import ops
from object_detection.utils import spatial_transform_ops as spatial_ops
from object_detection.utils import test_case
from object_detection.utils import test_utils
from object_detection.utils import tf_version
......@@ -377,8 +377,9 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
max_negatives_per_positive=None)
crop_and_resize_fn = (
ops.matmul_crop_and_resize
if use_matmul_crop_and_resize else ops.native_crop_and_resize)
spatial_ops.multilevel_matmul_crop_and_resize
if use_matmul_crop_and_resize
else spatial_ops.multilevel_native_crop_and_resize)
common_kwargs = {
'is_training':
is_training,
......
......@@ -250,35 +250,6 @@ class SSDKerasFeatureExtractor(tf.keras.Model):
def call(self, inputs, **kwargs):
return self._extract_features(inputs)
def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
"""Returns a map of variables to load from a foreign checkpoint.
Args:
feature_extractor_scope: A scope name for the feature extractor.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
"""
variables_to_restore = {}
if tf.executing_eagerly():
for variable in self.variables:
# variable.name includes ":0" at the end, but the names in the
# checkpoint do not have the suffix ":0". So, we strip it here.
var_name = variable.name[:-2]
if var_name.startswith(feature_extractor_scope + '/'):
var_name = var_name.replace(feature_extractor_scope + '/', '')
variables_to_restore[var_name] = variable
else:
# b/137854499: use global_variables.
for variable in variables_helper.get_global_variables_safely():
var_name = variable.op.name
if var_name.startswith(feature_extractor_scope + '/'):
var_name = var_name.replace(feature_extractor_scope + '/', '')
variables_to_restore[var_name] = variable
return variables_to_restore
class SSDMetaArch(model.DetectionModel):
"""SSD Meta-architecture definition."""
......@@ -508,12 +479,9 @@ class SSDMetaArch(model.DetectionModel):
ValueError: if inputs tensor does not have type tf.float32
"""
with tf.name_scope('Preprocessor'):
(resized_inputs,
true_image_shapes) = shape_utils.resize_images_and_return_shapes(
inputs, self._image_resizer_fn)
return (self._feature_extractor.preprocess(resized_inputs),
true_image_shapes)
normalized_inputs = self._feature_extractor.preprocess(inputs)
return shape_utils.resize_images_and_return_shapes(
normalized_inputs, self._image_resizer_fn)
def _compute_clip_window(self, preprocessed_images, true_image_shapes):
"""Computes clip window to use during post_processing.
......@@ -1295,8 +1263,8 @@ class SSDMetaArch(model.DetectionModel):
classification checkpoint for initialization prior to training.
Valid values: `detection`, `classification`. Default 'detection'.
load_all_detection_checkpoint_vars: whether to load all variables (when
`fine_tune_checkpoint_type='detection'`). If False, only variables
within the appropriate scopes are included. Default False.
`fine_tune_checkpoint_type` is `detection`). If False, only variables
within the feature extractor scope are included. Default False.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
......@@ -1311,36 +1279,56 @@ class SSDMetaArch(model.DetectionModel):
elif fine_tune_checkpoint_type == 'detection':
variables_to_restore = {}
if tf.executing_eagerly():
for variable in variables_helper.get_global_variables_safely():
var_name = variable.op.name
if load_all_detection_checkpoint_vars:
# Grab all detection vars by name
for variable in self.variables:
# variable.name includes ":0" at the end, but the names in the
# checkpoint do not have the suffix ":0". So, we strip it here.
var_name = variable.name[:-2]
variables_to_restore[var_name] = variable
variables_to_restore[var_name] = variable
else:
# Grab just the feature extractor vars by name
for variable in self._feature_extractor.variables:
# variable.name includes ":0" at the end, but the names in the
# checkpoint do not have the suffix ":0". So, we strip it here.
var_name = variable.name[:-2]
variables_to_restore[var_name] = variable
else:
for variable in variables_helper.get_global_variables_safely():
var_name = variable.op.name
if load_all_detection_checkpoint_vars:
if var_name.startswith(self._extract_features_scope):
variables_to_restore[var_name] = variable
else:
if var_name.startswith(self._extract_features_scope):
variables_to_restore[var_name] = variable
return variables_to_restore
else:
raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
fine_tune_checkpoint_type))
def restore_from_objects(self, fine_tune_checkpoint_type='detection'):
"""Returns a map of Trackable objects to load from a foreign checkpoint.
Returns a dictionary of Tensorflow 2 Trackable objects (e.g. tf.Module
or Checkpoint). This enables the model to initialize based on weights from
another task. For example, the feature extractor variables from a
classification model can be used to bootstrap training of an object
detector. When loading from an object detection model, the checkpoint model
should have the same parameters as this detection model with exception of
the num_classes parameter.
Note that this function is intended to be used to restore Keras-based
models when running Tensorflow 2, whereas restore_map (above) is intended
to be used to restore Slim-based models when running Tensorflow 1.x.
Args:
fine_tune_checkpoint_type: whether to restore from a full detection
checkpoint (with compatible variable names) or to restore from a
classification checkpoint for initialization prior to training.
Valid values: `detection`, `classification`. Default 'detection'.
Returns:
A dict mapping keys to Trackable objects (tf.Module or Checkpoint).
"""
if fine_tune_checkpoint_type == 'classification':
return {
'feature_extractor':
self._feature_extractor.classification_backbone
}
elif fine_tune_checkpoint_type == 'detection':
fake_model = tf.train.Checkpoint(
_feature_extractor=self._feature_extractor)
return {'model': fake_model}
else:
raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
fine_tune_checkpoint_type))
def updates(self):
"""Returns a list of update operators for this model.
......
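A rough usage sketch of the new `restore_from_objects` API (the variable names below are placeholders, not part of this change): a fine-tuning script wraps the returned Trackable map in a `tf.train.Checkpoint` and restores it, which is the same pattern `load_fine_tune_checkpoint` in model_lib_v2.py follows later in this diff.

# Hedged sketch; `detection_model` and `checkpoint_path` are assumed to exist.
restore_map = detection_model.restore_from_objects(
    fine_tune_checkpoint_type='classification')
ckpt = tf.train.Checkpoint(**restore_map)
ckpt.restore(checkpoint_path).assert_existing_objects_matched()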
......@@ -432,14 +432,9 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
return eval_metric_ops
def _check_mask_type_and_value(array_name, masks):
"""Checks whether mask dtype is uint8 and the values are either 0 or 1."""
if masks.dtype != np.uint8:
raise ValueError('{} must be of type np.uint8. Found {}.'.format(
array_name, masks.dtype))
if np.any(np.logical_and(masks != 0, masks != 1)):
raise ValueError('{} elements can only be either 0 or 1.'.format(
array_name))
def convert_masks_to_binary(masks):
"""Converts masks to 0 or 1 and uint8 type."""
return (masks > 0).astype(np.uint8)
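# Illustrative example (not part of this change): any non-zero value counts as
# foreground, so a mask filled with 5s binarizes to 1s, matching the updated
# CocoMaskEvaluationTest below.
#   convert_masks_to_binary(np.array([[0, 5], [1, 0]], dtype=np.uint8))
#   => array([[0, 1],
#             [1, 0]], dtype=uint8)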
class CocoKeypointEvaluator(CocoDetectionEvaluator):
......@@ -952,9 +947,8 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
groundtruth_instance_masks = groundtruth_dict[
standard_fields.InputDataFields.groundtruth_instance_masks]
_check_mask_type_and_value(standard_fields.InputDataFields.
groundtruth_instance_masks,
groundtruth_instance_masks)
groundtruth_instance_masks = convert_masks_to_binary(
groundtruth_instance_masks)
self._groundtruth_list.extend(
coco_tools.
ExportSingleImageGroundtruthToCoco(
......@@ -1013,9 +1007,7 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
'are incompatible: {} vs {}'.format(
groundtruth_masks_shape,
detection_masks.shape))
_check_mask_type_and_value(standard_fields.DetectionResultFields.
detection_masks,
detection_masks)
detection_masks = convert_masks_to_binary(detection_masks)
self._detection_masks_list.extend(
coco_tools.ExportSingleImageDetectionMasksToCoco(
image_id=image_id,
......
......@@ -1424,14 +1424,16 @@ class CocoMaskEvaluationTest(tf.test.TestCase):
image_id='image3',
detections_dict={
standard_fields.DetectionResultFields.detection_boxes:
np.array([[25., 25., 50., 50.]]),
np.array([[25., 25., 50., 50.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1]),
np.array([1]),
standard_fields.DetectionResultFields.detection_masks:
np.pad(np.ones([1, 25, 25], dtype=np.uint8),
((0, 0), (10, 10), (10, 10)), mode='constant')
# The value of 5 is equivalent to 1, since masks will be
# thresholded and binarized before evaluation.
np.pad(5 * np.ones([1, 25, 25], dtype=np.uint8),
((0, 0), (10, 10), (10, 10)), mode='constant')
})
metrics = coco_evaluator.evaluate()
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP'], 1.0)
......
......@@ -136,15 +136,15 @@ def build_groundtruth_dictionary(data, class_label_map):
dictionary = {
standard_fields.InputDataFields.groundtruth_boxes:
data_location[['YMin', 'XMin', 'YMax', 'XMax']].as_matrix(),
data_location[['YMin', 'XMin', 'YMax', 'XMax']].to_numpy(),
standard_fields.InputDataFields.groundtruth_classes:
data_location['LabelName'].map(lambda x: class_label_map[x]
).as_matrix(),
).to_numpy(),
standard_fields.InputDataFields.groundtruth_group_of:
data_location['IsGroupOf'].as_matrix().astype(int),
data_location['IsGroupOf'].to_numpy().astype(int),
standard_fields.InputDataFields.groundtruth_image_classes:
data_labels['LabelName'].map(lambda x: class_label_map[x]
).as_matrix(),
).to_numpy(),
}
if 'Mask' in data_location:
......@@ -179,9 +179,9 @@ def build_predictions_dictionary(data, class_label_map):
"""
dictionary = {
standard_fields.DetectionResultFields.detection_classes:
data['LabelName'].map(lambda x: class_label_map[x]).as_matrix(),
data['LabelName'].map(lambda x: class_label_map[x]).to_numpy(),
standard_fields.DetectionResultFields.detection_scores:
data['Score'].as_matrix()
data['Score'].to_numpy()
}
if 'Mask' in data:
......@@ -192,6 +192,6 @@ def build_predictions_dictionary(data, class_label_map):
else:
dictionary[standard_fields.DetectionResultFields.detection_boxes] = data[[
'YMin', 'XMin', 'YMax', 'XMax'
]].as_matrix()
]].to_numpy()
return dictionary
......@@ -53,16 +53,16 @@ def build_groundtruth_vrd_dictionary(data, class_label_map,
boxes = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.vrd_box_data_type)
boxes['subject'] = data_boxes[['YMin1', 'XMin1', 'YMax1',
'XMax1']].as_matrix()
boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2', 'XMax2']].as_matrix()
'XMax1']].to_numpy()
boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2', 'XMax2']].to_numpy()
labels = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.label_data_type)
labels['subject'] = data_boxes['LabelName1'].map(
lambda x: class_label_map[x]).as_matrix()
lambda x: class_label_map[x]).to_numpy()
labels['object'] = data_boxes['LabelName2'].map(
lambda x: class_label_map[x]).as_matrix()
lambda x: class_label_map[x]).to_numpy()
labels['relation'] = data_boxes['RelationshipLabel'].map(
lambda x: relationship_label_map[x]).as_matrix()
lambda x: relationship_label_map[x]).to_numpy()
return {
standard_fields.InputDataFields.groundtruth_boxes:
......@@ -71,7 +71,7 @@ def build_groundtruth_vrd_dictionary(data, class_label_map,
labels,
standard_fields.InputDataFields.groundtruth_image_classes:
data_labels['LabelName'].map(lambda x: class_label_map[x])
.as_matrix(),
.to_numpy(),
}
......@@ -104,16 +104,16 @@ def build_predictions_vrd_dictionary(data, class_label_map,
boxes = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.vrd_box_data_type)
boxes['subject'] = data_boxes[['YMin1', 'XMin1', 'YMax1',
'XMax1']].as_matrix()
boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2', 'XMax2']].as_matrix()
'XMax1']].to_numpy()
boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2', 'XMax2']].to_numpy()
labels = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.label_data_type)
labels['subject'] = data_boxes['LabelName1'].map(
lambda x: class_label_map[x]).as_matrix()
lambda x: class_label_map[x]).to_numpy()
labels['object'] = data_boxes['LabelName2'].map(
lambda x: class_label_map[x]).as_matrix()
lambda x: class_label_map[x]).to_numpy()
labels['relation'] = data_boxes['RelationshipLabel'].map(
lambda x: relationship_label_map[x]).as_matrix()
lambda x: relationship_label_map[x]).to_numpy()
return {
standard_fields.DetectionResultFields.detection_boxes:
......@@ -121,5 +121,5 @@ def build_predictions_vrd_dictionary(data, class_label_map,
standard_fields.DetectionResultFields.detection_classes:
labels,
standard_fields.DetectionResultFields.detection_scores:
data_boxes['Score'].as_matrix()
data_boxes['Score'].to_numpy()
}
......@@ -43,7 +43,6 @@ from object_detection.utils import visualization_utils as vis_utils
# pylint: disable=g-import-not-at-top
try:
from tensorflow.contrib import learn as contrib_learn
from tensorflow.contrib import tpu as contrib_tpu
except ImportError:
# TF 2.0 doesn't ship with contrib.
pass
......@@ -94,6 +93,15 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
of groundtruth boxes per image.
'groundtruth_keypoints': [batch_size, num_boxes, num_keypoints, 2] float32
tensor of keypoints (if provided in groundtruth).
'groundtruth_dp_num_points_list': [batch_size, num_boxes] int32 tensor
with the number of DensePose points for each instance (if provided in
groundtruth).
'groundtruth_dp_part_ids_list': [batch_size, num_boxes,
max_sampled_points] int32 tensor with the part ids for each DensePose
sampled point (if provided in groundtruth).
'groundtruth_dp_surface_coords_list': [batch_size, num_boxes,
max_sampled_points, 4] float32 tensor with the DensePose surface coordinates for
each sampled point (if provided in groundtruth).
'groundtruth_group_of': [batch_size, num_boxes] bool tensor indicating
group_of annotations (if provided in groundtruth).
'groundtruth_labeled_classes': [batch_size, num_classes] int64
......@@ -164,6 +172,21 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
groundtruth[input_data_fields.groundtruth_labeled_classes] = tf.stack(
labeled_classes)
if detection_model.groundtruth_has_field(
fields.BoxListFields.densepose_num_points):
groundtruth[input_data_fields.groundtruth_dp_num_points] = tf.stack(
detection_model.groundtruth_lists(
fields.BoxListFields.densepose_num_points))
if detection_model.groundtruth_has_field(
fields.BoxListFields.densepose_part_ids):
groundtruth[input_data_fields.groundtruth_dp_part_ids] = tf.stack(
detection_model.groundtruth_lists(
fields.BoxListFields.densepose_part_ids))
if detection_model.groundtruth_has_field(
fields.BoxListFields.densepose_surface_coords):
groundtruth[input_data_fields.groundtruth_dp_surface_coords] = tf.stack(
detection_model.groundtruth_lists(
fields.BoxListFields.densepose_surface_coords))
groundtruth[input_data_fields.num_groundtruth_boxes] = (
tf.tile([max_number_of_boxes], multiples=[groundtruth_boxes_shape[0]]))
return groundtruth
......@@ -219,6 +242,9 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
fields.InputDataFields.groundtruth_boxes,
fields.InputDataFields.groundtruth_keypoints,
fields.InputDataFields.groundtruth_keypoint_visibilities,
fields.InputDataFields.groundtruth_dp_num_points,
fields.InputDataFields.groundtruth_dp_part_ids,
fields.InputDataFields.groundtruth_dp_surface_coords,
fields.InputDataFields.groundtruth_group_of,
fields.InputDataFields.groundtruth_difficult,
fields.InputDataFields.groundtruth_is_crowd,
......@@ -269,6 +295,18 @@ def provide_groundtruth(model, labels):
if fields.InputDataFields.groundtruth_keypoint_visibilities in labels:
gt_keypoint_visibilities_list = labels[
fields.InputDataFields.groundtruth_keypoint_visibilities]
gt_dp_num_points_list = None
if fields.InputDataFields.groundtruth_dp_num_points in labels:
gt_dp_num_points_list = labels[
fields.InputDataFields.groundtruth_dp_num_points]
gt_dp_part_ids_list = None
if fields.InputDataFields.groundtruth_dp_part_ids in labels:
gt_dp_part_ids_list = labels[
fields.InputDataFields.groundtruth_dp_part_ids]
gt_dp_surface_coords_list = None
if fields.InputDataFields.groundtruth_dp_surface_coords in labels:
gt_dp_surface_coords_list = labels[
fields.InputDataFields.groundtruth_dp_surface_coords]
gt_weights_list = None
if fields.InputDataFields.groundtruth_weights in labels:
gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
......@@ -297,13 +335,16 @@ def provide_groundtruth(model, labels):
groundtruth_masks_list=gt_masks_list,
groundtruth_keypoints_list=gt_keypoints_list,
groundtruth_keypoint_visibilities_list=gt_keypoint_visibilities_list,
groundtruth_dp_num_points_list=gt_dp_num_points_list,
groundtruth_dp_part_ids_list=gt_dp_part_ids_list,
groundtruth_dp_surface_coords_list=gt_dp_surface_coords_list,
groundtruth_weights_list=gt_weights_list,
groundtruth_is_crowd_list=gt_is_crowd_list,
groundtruth_group_of_list=gt_group_of_list,
groundtruth_area_list=gt_area_list)
def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
def create_model_fn(detection_model_fn, configs, hparams=None, use_tpu=False,
postprocess_on_cpu=False):
"""Creates a model function for `Estimator`.
......@@ -377,7 +418,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
side_inputs = detection_model.get_side_inputs(features)
if use_tpu and train_config.use_bfloat16:
with contrib_tpu.bfloat16_scope():
with tf.tpu.bfloat16_scope():
prediction_dict = detection_model.predict(
preprocessed_images,
features[fields.InputDataFields.true_image_shape], **side_inputs)
......@@ -392,7 +433,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
if use_tpu and postprocess_on_cpu:
detections = contrib_tpu.outside_compilation(
detections = tf.tpu.outside_compilation(
postprocess_wrapper,
(prediction_dict,
features[fields.InputDataFields.true_image_shape]))
......@@ -468,7 +509,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
if mode == tf.estimator.ModeKeys.TRAIN:
if use_tpu:
training_optimizer = contrib_tpu.CrossShardOptimizer(training_optimizer)
training_optimizer = tf.tpu.CrossShardOptimizer(training_optimizer)
# Optionally freeze some layers by setting their gradients to be zero.
trainable_variables = None
......@@ -588,7 +629,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
# EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
return contrib_tpu.TPUEstimatorSpec(
return tf.estimator.tpu.TPUEstimatorSpec(
mode=mode,
scaffold_fn=scaffold_fn,
predictions=detections,
......@@ -619,8 +660,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
def create_estimator_and_inputs(run_config,
hparams,
pipeline_config_path,
hparams=None,
pipeline_config_path=None,
config_override=None,
train_steps=None,
sample_1_of_n_eval_examples=1,
......@@ -639,7 +680,7 @@ def create_estimator_and_inputs(run_config,
Args:
run_config: A `RunConfig`.
hparams: A `HParams`.
hparams: (optional) A `HParams`.
pipeline_config_path: A path to a pipeline config file.
config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
override the config from `pipeline_config_path`.
......@@ -762,14 +803,14 @@ def create_estimator_and_inputs(run_config,
model_config=model_config, predict_input_config=eval_input_configs[0])
# Read export_to_tpu from hparams if not passed.
if export_to_tpu is None:
if export_to_tpu is None and hparams is not None:
export_to_tpu = hparams.get('export_to_tpu', False)
tf.logging.info('create_estimator_and_inputs: use_tpu %s, export_to_tpu %s',
use_tpu, export_to_tpu)
model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu,
postprocess_on_cpu)
if use_tpu_estimator:
estimator = contrib_tpu.TPUEstimator(
estimator = tf.estimator.tpu.TPUEstimator(
model_fn=model_fn,
train_batch_size=train_config.batch_size,
# For each core, only batch size 1 is supported for eval.
......
......@@ -123,6 +123,9 @@ class SimpleModel(model.DetectionModel):
return []
def restore_map(self, *args, **kwargs):
pass
def restore_from_objects(self, fine_tune_checkpoint_type):
return {'model': self}
def preprocess(self, _):
......@@ -174,7 +177,7 @@ class ModelCheckpointTest(tf.test.TestCase):
class IncompatibleModel(SimpleModel):
def restore_map(self, *args, **kwargs):
def restore_from_objects(self, *args, **kwargs):
return {'weight': self.weight}
......@@ -207,7 +210,6 @@ class CheckpointV2Test(tf.test.TestCase):
model_lib_v2.load_fine_tune_checkpoint(
self._model, self._ckpt_path, checkpoint_type='',
checkpoint_version=train_pb2.CheckpointVersion.V2,
load_all_detection_checkpoint_vars=True,
input_dataset=self._train_input_fn(),
unpad_groundtruth_tensors=True)
np.testing.assert_allclose(self._model.weight.numpy(), 42)
......@@ -220,7 +222,6 @@ class CheckpointV2Test(tf.test.TestCase):
model_lib_v2.load_fine_tune_checkpoint(
IncompatibleModel(), self._ckpt_path, checkpoint_type='',
checkpoint_version=train_pb2.CheckpointVersion.V2,
load_all_detection_checkpoint_vars=True,
input_dataset=self._train_input_fn(),
unpad_groundtruth_tensors=True)
......
......@@ -34,7 +34,6 @@ from object_detection.protos import train_pb2
from object_detection.utils import config_util
from object_detection.utils import label_map_util
from object_detection.utils import ops
from object_detection.utils import variables_helper
from object_detection.utils import visualization_utils as vutils
# pylint: disable=g-import-not-at-top
......@@ -47,13 +46,6 @@ except ImportError:
MODEL_BUILD_UTIL_MAP = model_lib.MODEL_BUILD_UTIL_MAP
### NOTE: This file is a wip.
### TODO(kaftan): Explore adding unit tests for individual methods
### TODO(kaftan): Add unit test that checks training on a single image w/
#### groundtruth, and verify that loss goes to zero.
#### Possibly have version that takes it as the whole train & eval dataset,
#### & verify the loss output from the eval_loop method.
### TODO(kaftan): Make sure the unit tests run in TAP presubmits or Kokoro
RESTORE_MAP_ERROR_TEMPLATE = (
'Since we are restoring a v2 style checkpoint'
......@@ -101,6 +93,12 @@ def _compute_losses_and_predictions_dicts(
instance masks for objects.
labels[fields.InputDataFields.groundtruth_keypoints] is a
float32 tensor containing keypoints for each box.
labels[fields.InputDataFields.groundtruth_dp_num_points] is an int32
tensor with the number of sampled DensePose points per object.
labels[fields.InputDataFields.groundtruth_dp_part_ids] is an int32
tensor with the DensePose part ids (0-indexed) per object.
labels[fields.InputDataFields.groundtruth_dp_surface_coords] is a
float32 tensor with the DensePose surface coordinates.
labels[fields.InputDataFields.groundtruth_group_of] is a tf.bool tensor
containing group_of annotations.
labels[fields.InputDataFields.groundtruth_labeled_classes] is a float32
......@@ -203,6 +201,17 @@ def eager_train_step(detection_model,
labels[fields.InputDataFields.groundtruth_keypoints] is a
[batch_size, num_boxes, num_keypoints, 2] float32 tensor containing
keypoints for each box.
labels[fields.InputDataFields.groundtruth_dp_num_points] is a
[batch_size, num_boxes] int32 tensor with the number of DensePose
sampled points per instance.
labels[fields.InputDataFields.groundtruth_dp_part_ids] is a
[batch_size, num_boxes, max_sampled_points] int32 tensor with the
part ids (0-indexed) for each instance.
labels[fields.InputDataFields.groundtruth_dp_surface_coords] is a
[batch_size, num_boxes, max_sampled_points, 4] float32 tensor with the
surface coordinates for each point. Each surface coordinate is of the
form (y, x, v, u) where (y, x) are normalized image locations and
(v, u) are part-relative normalized surface coordinates.
labels[fields.InputDataFields.groundtruth_labeled_classes] is a float32
k-hot tensor of classes.
unpad_groundtruth_tensors: A parameter passed to unstack_batch.
......@@ -277,14 +286,21 @@ def validate_tf_v2_checkpoint_restore_map(checkpoint_restore_map):
"""
for key, value in checkpoint_restore_map.items():
if not (isinstance(key, str) and isinstance(value, tf.Module)):
if not (isinstance(key, str) and
(isinstance(value, tf.Module)
or isinstance(value, tf.train.Checkpoint))):
raise TypeError(RESTORE_MAP_ERROR_TEMPLATE.format(
key.__class__.__name__, value.__class__.__name__))
def is_object_based_checkpoint(checkpoint_path):
"""Returns true if `checkpoint_path` points to an object-based checkpoint."""
var_names = [var[0] for var in tf.train.list_variables(checkpoint_path)]
return '_CHECKPOINTABLE_OBJECT_GRAPH' in var_names
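# Illustrative guard (path is a placeholder, not part of this change); this is
# the same check load_fine_tune_checkpoint performs below before restoring:
#   if not is_object_based_checkpoint('/tmp/model_dir/ckpt-0'):
#     raise IOError('Checkpoint is expected to be an object-based checkpoint.')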
def load_fine_tune_checkpoint(
model, checkpoint_path, checkpoint_type, checkpoint_version,
load_all_detection_checkpoint_vars, input_dataset,
model, checkpoint_path, checkpoint_type, checkpoint_version, input_dataset,
unpad_groundtruth_tensors):
"""Load a fine tuning classification or detection checkpoint.
......@@ -292,8 +308,7 @@ def load_fine_tune_checkpoint(
the model by computing a dummy loss. (Models might not have built their
variables before their first execution)
It then loads a variable-name based classification or detection checkpoint
that comes from converted TF 1.x slim model checkpoints.
It then loads an object-based classification or detection checkpoint.
This method updates the model in-place and does not return a value.
......@@ -306,14 +321,22 @@ def load_fine_tune_checkpoint(
classification checkpoint for initialization prior to training.
Valid values: `detection`, `classification`.
checkpoint_version: train_pb2.CheckpointVersion.V1 or V2 enum indicating
whether to load checkpoints in V1 style or V2 style.
load_all_detection_checkpoint_vars: whether to load all variables (when
`fine_tune_checkpoint_type` is `detection`). If False, only variables
within the feature extractor scopes are included. Default False.
whether to load checkpoints in V1 style or V2 style. In this binary
we only support V2 style (object-based) checkpoints.
input_dataset: The tf.data Dataset the model is being trained on. Needed
to get the shapes for the dummy loss computation.
unpad_groundtruth_tensors: A parameter passed to unstack_batch.
Raises:
IOError: if `checkpoint_path` does not point at a valid object-based
checkpoint
ValueError: if `checkpoint_version` is not train_pb2.CheckpointVersion.V2
"""
if not is_object_based_checkpoint(checkpoint_path):
raise IOError('Checkpoint is expected to be an object-based checkpoint.')
if checkpoint_version == train_pb2.CheckpointVersion.V1:
raise ValueError('Checkpoint version should be V2')
features, labels = iter(input_dataset).next()
@tf.function
......@@ -330,32 +353,24 @@ def load_fine_tune_checkpoint(
labels)
strategy = tf.compat.v2.distribute.get_strategy()
strategy.experimental_run_v2(
_dummy_computation_fn, args=(
features,
labels,
))
if hasattr(tf.distribute.Strategy, 'run'):
strategy.run(
_dummy_computation_fn, args=(
features,
labels,
))
else:
strategy.experimental_run_v2(
_dummy_computation_fn, args=(
features,
labels,
))
if checkpoint_version == train_pb2.CheckpointVersion.V1:
var_map = model.restore_map(
fine_tune_checkpoint_type=checkpoint_type,
load_all_detection_checkpoint_vars=(
load_all_detection_checkpoint_vars))
available_var_map = variables_helper.get_variables_available_in_checkpoint(
var_map,
checkpoint_path,
include_global_step=False)
tf.train.init_from_checkpoint(checkpoint_path,
available_var_map)
elif checkpoint_version == train_pb2.CheckpointVersion.V2:
restore_map = model.restore_map(
fine_tune_checkpoint_type=checkpoint_type,
load_all_detection_checkpoint_vars=(
load_all_detection_checkpoint_vars))
validate_tf_v2_checkpoint_restore_map(restore_map)
ckpt = tf.train.Checkpoint(**restore_map)
ckpt.restore(checkpoint_path).assert_existing_objects_matched()
restore_from_objects_dict = model.restore_from_objects(
fine_tune_checkpoint_type=checkpoint_type)
validate_tf_v2_checkpoint_restore_map(restore_from_objects_dict)
ckpt = tf.train.Checkpoint(**restore_from_objects_dict)
ckpt.restore(checkpoint_path).assert_existing_objects_matched()
def get_filepath(strategy, filepath):
......@@ -398,7 +413,7 @@ def train_loop(
train_steps=None,
use_tpu=False,
save_final_config=False,
checkpoint_every_n=1000,
checkpoint_every_n=5000,
checkpoint_max_to_keep=7,
**kwargs):
"""Trains a model using eager + functions.
......@@ -464,8 +479,10 @@ def train_loop(
if kwargs['use_bfloat16']:
tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
load_all_detection_checkpoint_vars = (
train_config.load_all_detection_checkpoint_vars)
if train_config.load_all_detection_checkpoint_vars:
raise ValueError('train_pb2.load_all_detection_checkpoint_vars '
'unsupported in TF2')
config_util.update_fine_tune_checkpoint_type(train_config)
fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version
......@@ -533,7 +550,6 @@ def train_loop(
train_config.fine_tune_checkpoint,
fine_tune_checkpoint_type,
fine_tune_checkpoint_version,
load_all_detection_checkpoint_vars,
train_input,
unpad_groundtruth_tensors)
......@@ -570,8 +586,12 @@ def train_loop(
def _sample_and_train(strategy, train_step_fn, data_iterator):
features, labels = data_iterator.next()
per_replica_losses = strategy.experimental_run_v2(
train_step_fn, args=(features, labels))
if hasattr(tf.distribute.Strategy, 'run'):
per_replica_losses = strategy.run(
train_step_fn, args=(features, labels))
else:
per_replica_losses = strategy.experimental_run_v2(
train_step_fn, args=(features, labels))
# TODO(anjalisridhar): explore if it is safe to remove the
## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
return strategy.reduce(tf.distribute.ReduceOp.SUM,
......@@ -744,28 +764,25 @@ def eager_eval_loop(
return eval_dict, losses_dict, class_agnostic
agnostic_categories = label_map_util.create_class_agnostic_category_index()
per_class_categories = label_map_util.create_category_index_from_labelmap(
eval_input_config.label_map_path)
keypoint_edges = [
(kp.start, kp.end) for kp in eval_config.keypoint_edge]
for i, (features, labels) in enumerate(eval_dataset):
eval_dict, losses_dict, class_agnostic = compute_eval_dict(features, labels)
if class_agnostic:
category_index = agnostic_categories
else:
category_index = per_class_categories
if i % 100 == 0:
tf.logging.info('Finished eval step %d', i)
use_original_images = fields.InputDataFields.original_image in features
if not use_tpu and use_original_images:
# Summary for input images.
tf.compat.v2.summary.image(
name='eval_input_images',
step=global_step,
data=eval_dict['original_image'],
max_outputs=1)
# Summary for prediction/groundtruth side-by-side images.
if class_agnostic:
category_index = label_map_util.create_class_agnostic_category_index()
else:
category_index = label_map_util.create_category_index_from_labelmap(
eval_input_config.label_map_path)
keypoint_edges = [
(kp.start, kp.end) for kp in eval_config.keypoint_edge]
if use_original_images and i < eval_config.num_visualizations:
sbys_image_list = vutils.draw_side_by_side_evaluation_image(
eval_dict,
category_index=category_index,
......@@ -775,10 +792,19 @@ def eager_eval_loop(
keypoint_edges=keypoint_edges or None)
sbys_images = tf.concat(sbys_image_list, axis=0)
tf.compat.v2.summary.image(
name='eval_side_by_side',
name='eval_side_by_side_' + str(i),
step=global_step,
data=sbys_images,
max_outputs=eval_config.num_visualizations)
if eval_util.has_densepose(eval_dict):
dp_image_list = vutils.draw_densepose_visualizations(
eval_dict)
dp_images = tf.concat(dp_image_list, axis=0)
tf.compat.v2.summary.image(
name='densepose_detections_' + str(i),
step=global_step,
data=dp_images,
max_outputs=eval_config.num_visualizations)
if evaluators is None:
if class_agnostic:
......@@ -807,8 +833,10 @@ def eager_eval_loop(
eval_metrics[loss_key] = loss_metrics[loss_key].result()
eval_metrics = {str(k): v for k, v in eval_metrics.items()}
tf.logging.info('Eval metrics at step %d', global_step)
for k in eval_metrics:
tf.compat.v2.summary.scalar(k, eval_metrics[k], step=global_step)
tf.logging.info('\t+ %s: %f', k, eval_metrics[k])
return eval_metrics
......@@ -826,6 +854,7 @@ def eval_continuously(
checkpoint_dir=None,
wait_interval=180,
timeout=3600,
eval_index=None,
**kwargs):
"""Run continuous evaluation of a detection model eagerly.
......@@ -855,6 +884,8 @@ def eval_continuously(
new checkpoint.
timeout: The maximum number of seconds to wait for a checkpoint. Execution
will terminate if no new checkpoints are found after that many seconds.
eval_index: int, optional. If given, only evaluate the dataset at the given
index.
**kwargs: Additional keyword arguments for configuration override.
"""
......@@ -908,6 +939,11 @@ def eval_continuously(
model=detection_model)
eval_inputs.append((eval_input_config.name, next_eval_input))
if eval_index is not None:
eval_inputs = [eval_inputs[eval_index]]
tf.logging.info('eval_index selected - {}'.format(
eval_inputs))
global_step = tf.compat.v2.Variable(
0, trainable=False, dtype=tf.compat.v2.dtypes.int64)
......@@ -920,7 +956,7 @@ def eval_continuously(
for eval_name, eval_input in eval_inputs:
summary_writer = tf.compat.v2.summary.create_file_writer(
model_dir + '/eval' + eval_name)
os.path.join(model_dir, 'eval', eval_name))
with summary_writer.as_default():
eager_eval_loop(
detection_model,
......
......@@ -22,7 +22,6 @@ from absl import flags
import tensorflow.compat.v1 as tf
from object_detection import model_hparams
from object_detection import model_lib
flags.DEFINE_string(
......@@ -41,10 +40,6 @@ flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
'one of every n train input examples for evaluation, '
'where n is provided. This is only used if '
'`eval_training_data` is True.')
flags.DEFINE_string(
'hparams_overrides', None, 'Hyperparameter overrides, '
'represented as a string containing comma-separated '
'hparam_name=value pairs.')
flags.DEFINE_string(
'checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
'`checkpoint_dir` is provided, this binary operates in eval-only mode, '
......@@ -68,7 +63,6 @@ def main(unused_argv):
train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config=config,
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path,
train_steps=FLAGS.num_train_steps,
sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
......
......@@ -16,14 +16,6 @@
r"""Creates and runs TF2 object detection models.
##################################
NOTE: This module has not been fully tested; please bear with us while we iron
out the kinks.
##################################
When a TPU device is available, this binary uses TPUStrategy. Otherwise, it uses
GPUS with MirroredStrategy/MultiWorkerMirroredStrategy.
For local training/evaluation run:
PIPELINE_CONFIG_PATH=path/to/pipeline.config
MODEL_DIR=/tmp/model_outputs
......@@ -61,6 +53,12 @@ flags.DEFINE_string(
flags.DEFINE_integer('eval_timeout', 3600, 'Number of seconds to wait for an '
'evaluation checkpoint before exiting.')
flags.DEFINE_bool('use_tpu', False, 'Whether the job is executing on a TPU.')
flags.DEFINE_string(
'tpu_name',
default=None,
help='Name of the Cloud TPU for Cluster Resolvers.')
flags.DEFINE_integer(
'num_workers', 1, 'When num_workers > 1, training uses '
'MultiWorkerMirroredStrategy. When num_workers = 1 it uses '
......@@ -86,7 +84,10 @@ def main(unused_argv):
wait_interval=300, timeout=FLAGS.eval_timeout)
else:
if FLAGS.use_tpu:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
# TPU is automatically inferred if tpu_name is None and
# we are running under cloud ai-platform.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
FLAGS.tpu_name)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
......
......@@ -26,18 +26,8 @@ from absl import flags
import tensorflow.compat.v1 as tf
from object_detection import model_hparams
from object_detection import model_lib
# pylint: disable=g-import-not-at-top
try:
from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver
from tensorflow.contrib import tpu as contrib_tpu
except ImportError:
# TF 2.0 doesn't ship with contrib.
pass
# pylint: enable=g-import-not-at-top
tf.flags.DEFINE_bool('use_tpu', True, 'Use TPUs rather than plain CPUs')
# Cloud TPU Cluster Resolvers
......@@ -67,10 +57,6 @@ flags.DEFINE_string('mode', 'train',
flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If '
'this is not provided, batch size is read from training '
'config.')
flags.DEFINE_string(
'hparams_overrides', None, 'Comma-separated list of '
'hyperparameters to override defaults.')
flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
flags.DEFINE_boolean('eval_training_data', False,
'If training data should be evaluated for this job.')
......@@ -99,15 +85,15 @@ def main(unused_argv):
flags.mark_flag_as_required('pipeline_config_path')
tpu_cluster_resolver = (
contrib_cluster_resolver.TPUClusterResolver(
tf.distribute.cluster_resolver.TPUClusterResolver(
tpu=[FLAGS.tpu_name], zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
tpu_grpc_url = tpu_cluster_resolver.get_master()
config = contrib_tpu.RunConfig(
config = tf.estimator.tpu.RunConfig(
master=tpu_grpc_url,
evaluation_master=tpu_grpc_url,
model_dir=FLAGS.model_dir,
tpu_config=contrib_tpu.TPUConfig(
tpu_config=tf.estimator.tpu.TPUConfig(
iterations_per_loop=FLAGS.iterations_per_loop,
num_shards=FLAGS.num_shards))
......@@ -117,7 +103,6 @@ def main(unused_argv):
train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config=config,
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path,
train_steps=FLAGS.num_train_steps,
sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to generate bidirectional feature pyramids based on image features.
Provides bidirectional feature pyramid network (BiFPN) generators that can be
used to build object detection feature extractors, as proposed by Tan et al.
See https://arxiv.org/abs/1911.09070 for more details.
"""
import collections
import functools
from six.moves import range
from six.moves import zip
import tensorflow as tf
from object_detection.utils import bifpn_utils
def _create_bifpn_input_config(fpn_min_level,
fpn_max_level,
input_max_level,
level_scales=None):
"""Creates a BiFPN input config for the input levels from a backbone network.
Args:
fpn_min_level: the minimum pyramid level (highest feature map resolution) to
use in the BiFPN.
fpn_max_level: the maximum pyramid level (lowest feature map resolution) to
use in the BiFPN.
input_max_level: the maximum pyramid level that will be provided as input to
the BiFPN. Accordingly, the BiFPN will compute additional pyramid levels
from input_max_level, up to the desired fpn_max_level.
level_scales: a list of pyramid level scale factors. If 'None', each level's
scale is set to 2^level by default, which corresponds to each successive
feature map scaling by a factor of 2.
Returns:
A list of dictionaries for each feature map expected as input to the BiFPN,
where each has entries for the feature map 'name' and 'scale'.
"""
if not level_scales:
level_scales = [2**i for i in range(fpn_min_level, fpn_max_level + 1)]
bifpn_input_params = []
for i in range(fpn_min_level, min(fpn_max_level, input_max_level) + 1):
bifpn_input_params.append({
'name': '0_up_lvl_{}'.format(i),
'scale': level_scales[i - fpn_min_level]
})
return bifpn_input_params
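# Illustrative sketch (not part of the module) of the return value for the
# levels used by the tests below (fpn_min_level=3, fpn_max_level=7,
# input_max_level=5); only levels up to input_max_level appear here, and the
# higher levels are added later as BiFPN nodes:
#   _create_bifpn_input_config(fpn_min_level=3, fpn_max_level=7,
#                              input_max_level=5)
#   => [{'name': '0_up_lvl_3', 'scale': 8},
#       {'name': '0_up_lvl_4', 'scale': 16},
#       {'name': '0_up_lvl_5', 'scale': 32}]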
def _get_bifpn_output_node_names(fpn_min_level, fpn_max_level, node_config):
"""Returns a list of BiFPN output node names, given a BiFPN node config.
Args:
fpn_min_level: the minimum pyramid level (highest feature map resolution)
used by the BiFPN.
fpn_max_level: the maximum pyramid level (lowest feature map resolution)
used by the BiFPN.
node_config: the BiFPN node_config, a list of dictionaries corresponding to
each node in the BiFPN computation graph, where each entry should have an
associated 'name'.
Returns:
A list of strings corresponding to the names of the output BiFPN nodes.
"""
num_output_nodes = fpn_max_level - fpn_min_level + 1
return [node['name'] for node in node_config[-num_output_nodes:]]
def _create_bifpn_node_config(bifpn_num_iterations,
bifpn_num_filters,
fpn_min_level,
fpn_max_level,
input_max_level,
bifpn_node_params=None,
level_scales=None):
"""Creates a config specifying a bidirectional feature pyramid network.
Args:
bifpn_num_iterations: the number of top-down bottom-up feature computations
to repeat in the BiFPN.
bifpn_num_filters: the number of filters (channels) for every feature map
used in the BiFPN.
fpn_min_level: the minimum pyramid level (highest feature map resolution) to
use in the BiFPN.
fpn_max_level: the maximum pyramid level (lowest feature map resolution) to
use in the BiFPN.
input_max_level: the maximum pyramid level that will be provided as input to
the BiFPN. Accordingly, the BiFPN will compute additional pyramid levels
from input_max_level, up to the desired fpn_max_level.
bifpn_node_params: If not 'None', a dictionary of additional default BiFPN
node parameters that will be applied to all BiFPN nodes.
level_scales: a list of pyramid level scale factors. If 'None', each level's
scale is set to 2^level by default, which corresponds to each successive
feature map scaling by a factor of 2.
Returns:
A list of dictionaries used to define nodes in the BiFPN computation graph,
as proposed by EfficientDet, Tan et al (https://arxiv.org/abs/1911.09070).
Each node's entry has the corresponding keys:
name: String. The name of this node in the BiFPN. The node name follows
the format '{bifpn_iteration}_{dn|up}_lvl_{pyramid_level}', where 'dn'
or 'up' refers to whether the node is in the top-down or bottom-up
portion of a single BiFPN iteration.
scale: the scale factor for this node, by default 2^level.
inputs: A list of names of nodes which are inputs to this node.
num_channels: The number of channels for this node.
combine_method: String. Name of the method used to combine input
node feature maps, 'fast_attention' by default for nodes which have more
than one input. Otherwise, 'None' for nodes with only one input node.
input_op: A (partial) function which is called to construct the layers
that will be applied to this BiFPN node's inputs. This function is
called with the arguments:
input_op(name, input_scale, input_num_channels, output_scale,
output_num_channels, conv_hyperparams, is_training,
freeze_batchnorm)
post_combine_op: A (partial) function which is called to construct the
layers that will be applied to the result of the combine operation for
this BiFPN node. This function will be called with the arguments:
post_combine_op(name, conv_hyperparams, is_training, freeze_batchnorm)
If 'None', then no layers will be applied after the combine operation
for this node.
"""
if not level_scales:
level_scales = [2**i for i in range(fpn_min_level, fpn_max_level + 1)]
default_node_params = {
'num_channels':
bifpn_num_filters,
'combine_method':
'fast_attention',
'input_op':
functools.partial(
_create_bifpn_resample_block, downsample_method='max_pooling'),
'post_combine_op':
functools.partial(
bifpn_utils.create_conv_block,
num_filters=bifpn_num_filters,
kernel_size=3,
strides=1,
padding='SAME',
use_separable=True,
apply_batchnorm=True,
apply_activation=True,
conv_bn_act_pattern=False),
}
if bifpn_node_params:
default_node_params.update(bifpn_node_params)
bifpn_node_params = []
# Create additional base pyramid levels not provided as input to the BiFPN.
# Note, combine_method and post_combine_op are set to None for additional
# base pyramid levels because they do not combine multiple input BiFPN nodes.
for i in range(input_max_level + 1, fpn_max_level + 1):
node_params = dict(default_node_params)
node_params.update({
'name': '0_up_lvl_{}'.format(i),
'scale': level_scales[i - fpn_min_level],
'inputs': ['0_up_lvl_{}'.format(i - 1)],
'combine_method': None,
'post_combine_op': None,
})
bifpn_node_params.append(node_params)
for i in range(bifpn_num_iterations):
# The first bottom-up feature pyramid (which includes the input pyramid
# levels from the backbone network and the additional base pyramid levels)
# is indexed at 0. So, the first top-down bottom-up pass of the BiFPN is
# indexed from 1, and repeated for bifpn_num_iterations iterations.
bifpn_i = i + 1
# Create top-down nodes.
for level_i in reversed(range(fpn_min_level, fpn_max_level)):
inputs = []
# BiFPN nodes in the top-down pass receive input from the corresponding
# level from the previous BiFPN iteration's bottom-up pass, except for the
# bottom-most (min) level node, which is computed once in the initial
# bottom-up pass, and is afterwards only computed in each top-down pass.
if level_i > fpn_min_level or bifpn_i == 1:
inputs.append('{}_up_lvl_{}'.format(bifpn_i - 1, level_i))
else:
inputs.append('{}_dn_lvl_{}'.format(bifpn_i - 1, level_i))
inputs.append(bifpn_node_params[-1]['name'])
node_params = dict(default_node_params)
node_params.update({
'name': '{}_dn_lvl_{}'.format(bifpn_i, level_i),
'scale': level_scales[level_i - fpn_min_level],
'inputs': inputs
})
bifpn_node_params.append(node_params)
# Create bottom-up nodes.
for level_i in range(fpn_min_level + 1, fpn_max_level + 1):
# BiFPN nodes in the bottom-up pass receive input from the corresponding
# level from the preceding top-down pass, except for the top (max) level
# which does not have a corresponding node in the top-down pass.
inputs = ['{}_up_lvl_{}'.format(bifpn_i - 1, level_i)]
if level_i < fpn_max_level:
inputs.append('{}_dn_lvl_{}'.format(bifpn_i, level_i))
inputs.append(bifpn_node_params[-1]['name'])
node_params = dict(default_node_params)
node_params.update({
'name': '{}_up_lvl_{}'.format(bifpn_i, level_i),
'scale': level_scales[level_i - fpn_min_level],
'inputs': inputs
})
bifpn_node_params.append(node_params)
return bifpn_node_params
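# Illustrative sketch (not part of the module) of the node names this config
# produces for the settings used by the tests below (bifpn_num_iterations=2,
# bifpn_num_filters=128, fpn_min_level=3, fpn_max_level=7, input_max_level=5):
#   node_config = _create_bifpn_node_config(2, 128, 3, 7, 5)
#   [node['name'] for node in node_config]
#   => ['0_up_lvl_6', '0_up_lvl_7',
#       '1_dn_lvl_6', '1_dn_lvl_5', '1_dn_lvl_4', '1_dn_lvl_3',
#       '1_up_lvl_4', '1_up_lvl_5', '1_up_lvl_6', '1_up_lvl_7',
#       '2_dn_lvl_6', '2_dn_lvl_5', '2_dn_lvl_4', '2_dn_lvl_3',
#       '2_up_lvl_4', '2_up_lvl_5', '2_up_lvl_6', '2_up_lvl_7']
# The last fpn_max_level - fpn_min_level + 1 names ('2_dn_lvl_3' through
# '2_up_lvl_7') are the BiFPN outputs returned by _get_bifpn_output_node_names.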
def _create_bifpn_resample_block(name,
input_scale,
input_num_channels,
output_scale,
output_num_channels,
conv_hyperparams,
is_training,
freeze_batchnorm,
downsample_method=None,
use_native_resize_op=False,
maybe_apply_1x1_conv=True,
apply_1x1_pre_sampling=True,
apply_1x1_post_sampling=False):
"""Creates resample block layers for input feature maps to BiFPN nodes.
Args:
name: String. Name used for this block of layers.
input_scale: Scale factor of the input feature map.
input_num_channels: Number of channels in the input feature map.
output_scale: Scale factor of the output feature map.
output_num_channels: Number of channels in the output feature map.
conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
containing hyperparameters for convolution ops.
is_training: Indicates whether the feature generator is in training mode.
freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
training or not. When training with a small batch size (e.g. 1), it is
desirable to freeze batch norm update and use pretrained batch norm
params.
downsample_method: String. Method to use when downsampling feature maps.
use_native_resize_op: Bool. Whether to use the native resize op when
upsampling feature maps.
maybe_apply_1x1_conv: Bool. If 'True', a 1x1 convolution will only be
applied if the input_num_channels differs from the output_num_channels.
apply_1x1_pre_sampling: Bool. Whether a 1x1 convolution will be applied to
the input feature map before the up/down-sampling operation.
apply_1x1_post_sampling: Bool. Whether a 1x1 convolution will be applied to
the input feature map after the up/down-sampling operation.
Returns:
A list of layers which may be applied to the input feature maps in order to
compute feature maps with the specified scale and number of channels.
"""
# By default, 1x1 convolutions are only applied before sampling when the
# number of input and output channels differ.
if maybe_apply_1x1_conv and output_num_channels == input_num_channels:
apply_1x1_pre_sampling = False
apply_1x1_post_sampling = False
apply_bn_for_resampling = True
layers = []
if apply_1x1_pre_sampling:
layers.extend(
bifpn_utils.create_conv_block(
name=name + '1x1_pre_sample/',
num_filters=output_num_channels,
kernel_size=1,
strides=1,
padding='SAME',
use_separable=False,
apply_batchnorm=apply_bn_for_resampling,
apply_activation=False,
conv_hyperparams=conv_hyperparams,
is_training=is_training,
freeze_batchnorm=freeze_batchnorm))
layers.extend(
bifpn_utils.create_resample_feature_map_ops(input_scale, output_scale,
downsample_method,
use_native_resize_op,
conv_hyperparams, is_training,
freeze_batchnorm, name))
if apply_1x1_post_sampling:
layers.extend(
bifpn_utils.create_conv_block(
name=name + '1x1_post_sample/',
num_filters=output_num_channels,
kernel_size=1,
strides=1,
padding='SAME',
use_separable=False,
apply_batchnorm=apply_bn_for_resampling,
apply_activation=False,
conv_hyperparams=conv_hyperparams,
is_training=is_training,
freeze_batchnorm=freeze_batchnorm))
return layers
def _create_bifpn_combine_op(num_inputs, name, combine_method):
"""Creates a BiFPN output config, a list of the output BiFPN node names.
Args:
num_inputs: The number of inputs to this combine operation.
name: String. The name of this combine operation.
combine_method: String. The method used to combine input feature maps.
Returns:
A function which may be called with a list of num_inputs feature maps
and which will return a single feature map.
"""
combine_op = None
if num_inputs < 1:
raise ValueError('Expected at least 1 input for BiFPN combine.')
elif num_inputs == 1:
combine_op = lambda x: x[0]
else:
combine_op = bifpn_utils.BiFPNCombineLayer(
combine_method=combine_method, name=name)
return combine_op
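# Illustrative sketch (not part of the module): with one input the combine op
# is an identity on the single element, and with several inputs it is a
# BiFPNCombineLayer that is called on a list of same-shaped feature maps:
#   combine = _create_bifpn_combine_op(num_inputs=2, name='1_dn_lvl_6/combine',
#                                      combine_method='fast_attention')
#   fused = combine([feature_map_a, feature_map_b])  # placeholder tensors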
class KerasBiFpnFeatureMaps(tf.keras.Model):
"""Generates Keras based BiFPN feature maps from an input feature map pyramid.
A Keras model that generates multi-scale feature maps for detection by
iteratively computing top-down and bottom-up feature pyramids, as in the
EfficientDet paper by Tan et al, see arxiv.org/abs/1911.09070 for details.
"""
def __init__(self,
bifpn_num_iterations,
bifpn_num_filters,
fpn_min_level,
fpn_max_level,
input_max_level,
is_training,
conv_hyperparams,
freeze_batchnorm,
bifpn_node_params=None,
name=None):
"""Constructor.
Args:
bifpn_num_iterations: The number of top-down bottom-up iterations.
bifpn_num_filters: The number of filters (channels) to be used for all
feature maps in this BiFPN.
fpn_min_level: The minimum pyramid level (highest feature map resolution)
to use in the BiFPN.
fpn_max_level: The maximum pyramid level (lowest feature map resolution)
to use in the BiFPN.
input_max_level: The maximum pyramid level that will be provided as input
to the BiFPN. Accordingly, the BiFPN will compute any additional pyramid
levels from input_max_level up to the desired fpn_max_level, with each
successive level downsampling by a scale factor of 2 by default.
is_training: Indicates whether the feature generator is in training mode.
conv_hyperparams: A `hyperparams_builder.KerasLayerHyperparams` object
containing hyperparameters for convolution ops.
freeze_batchnorm: Bool. Whether to freeze batch norm parameters during
training or not. When training with a small batch size (e.g. 1), it is
desirable to freeze batch norm update and use pretrained batch norm
params.
bifpn_node_params: An optional dictionary that may be used to specify
default parameters for BiFPN nodes, without the need to provide a custom
bifpn_node_config. For example, if '{ combine_method: 'sum' }', then all
BiFPN nodes will combine input feature maps by summation, rather than
by the default fast attention method.
name: A string name scope to assign to the model. If 'None', Keras
will auto-generate one from the class name.
"""
super(KerasBiFpnFeatureMaps, self).__init__(name=name)
bifpn_node_config = _create_bifpn_node_config(
bifpn_num_iterations, bifpn_num_filters, fpn_min_level, fpn_max_level,
input_max_level, bifpn_node_params)
bifpn_input_config = _create_bifpn_input_config(
fpn_min_level, fpn_max_level, input_max_level)
bifpn_output_node_names = _get_bifpn_output_node_names(
fpn_min_level, fpn_max_level, bifpn_node_config)
self.bifpn_node_config = bifpn_node_config
self.bifpn_output_node_names = bifpn_output_node_names
self.node_input_blocks = []
self.node_combine_op = []
self.node_post_combine_block = []
all_node_params = bifpn_input_config
all_node_names = [node['name'] for node in all_node_params]
for node_config in bifpn_node_config:
# Maybe transform and/or resample input feature maps.
input_blocks = []
for input_name in node_config['inputs']:
if input_name not in all_node_names:
raise ValueError(
'Input feature map ({}) does not exist.'.format(input_name))
input_index = all_node_names.index(input_name)
input_params = all_node_params[input_index]
input_block = node_config['input_op'](
name='{}/input_{}/'.format(node_config['name'], input_name),
input_scale=input_params['scale'],
input_num_channels=input_params.get('num_channels', None),
output_scale=node_config['scale'],
output_num_channels=node_config['num_channels'],
conv_hyperparams=conv_hyperparams,
is_training=is_training,
freeze_batchnorm=freeze_batchnorm)
input_blocks.append((input_index, input_block))
# Combine input feature maps.
combine_op = _create_bifpn_combine_op(
num_inputs=len(input_blocks),
name=(node_config['name'] + '/combine'),
combine_method=node_config['combine_method'])
# Post-combine layers.
post_combine_block = []
if node_config['post_combine_op']:
post_combine_block.extend(node_config['post_combine_op'](
name=node_config['name'] + '/post_combine/',
conv_hyperparams=conv_hyperparams,
is_training=is_training,
freeze_batchnorm=freeze_batchnorm))
self.node_input_blocks.append(input_blocks)
self.node_combine_op.append(combine_op)
self.node_post_combine_block.append(post_combine_block)
all_node_params.append(node_config)
all_node_names.append(node_config['name'])
def call(self, feature_pyramid):
"""Compute BiFPN feature maps from input feature pyramid.
Executed when calling the `.__call__` method on input.
Args:
feature_pyramid: list of tuples of (tensor_name, image_feature_tensor).
Returns:
feature_maps: an OrderedDict mapping keys (feature map names) to
tensors where each tensor has shape [batch, height_i, width_i, depth_i].
"""
feature_maps = [el[1] for el in feature_pyramid]
output_feature_maps = [None for node in self.bifpn_output_node_names]
for index, node in enumerate(self.bifpn_node_config):
node_scope = 'node_{:02d}'.format(index)
with tf.name_scope(node_scope):
# Apply layer blocks to this node's input feature maps.
input_block_results = []
for input_index, input_block in self.node_input_blocks[index]:
block_result = feature_maps[input_index]
for layer in input_block:
block_result = layer(block_result)
input_block_results.append(block_result)
# Combine the resulting feature maps.
node_result = self.node_combine_op[index](input_block_results)
# Apply post-combine layer block if applicable.
for layer in self.node_post_combine_block[index]:
node_result = layer(node_result)
feature_maps.append(node_result)
if node['name'] in self.bifpn_output_node_names:
index = self.bifpn_output_node_names.index(node['name'])
output_feature_maps[index] = node_result
return collections.OrderedDict(
zip(self.bifpn_output_node_names, output_feature_maps))
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for bidirectional feature pyramid generators."""
import unittest
from absl.testing import parameterized
import tensorflow.compat.v1 as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.models import bidirectional_feature_pyramid_generators as bifpn_generators
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
from object_detection.utils import test_utils
from object_detection.utils import tf_version
@parameterized.parameters({'bifpn_num_iterations': 2},
{'bifpn_num_iterations': 8})
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class BiFPNFeaturePyramidGeneratorTest(test_case.TestCase):
def _build_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams()
conv_hyperparams_text_proto = """
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
force_use_bias: true
"""
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)
def test_get_expected_feature_map_shapes(self, bifpn_num_iterations):
with test_utils.GraphContextOrNone() as g:
image_features = [
('block3', tf.random_uniform([4, 16, 16, 256], dtype=tf.float32)),
('block4', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
('block5', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32))
]
bifpn_generator = bifpn_generators.KerasBiFpnFeatureMaps(
bifpn_num_iterations=bifpn_num_iterations,
bifpn_num_filters=128,
fpn_min_level=3,
fpn_max_level=7,
input_max_level=5,
is_training=True,
conv_hyperparams=self._build_conv_hyperparams(),
freeze_batchnorm=False)
def graph_fn():
feature_maps = bifpn_generator(image_features)
return feature_maps
expected_feature_map_shapes = {
'{}_dn_lvl_3'.format(bifpn_num_iterations): (4, 16, 16, 128),
'{}_up_lvl_4'.format(bifpn_num_iterations): (4, 8, 8, 128),
'{}_up_lvl_5'.format(bifpn_num_iterations): (4, 4, 4, 128),
'{}_up_lvl_6'.format(bifpn_num_iterations): (4, 2, 2, 128),
'{}_up_lvl_7'.format(bifpn_num_iterations): (4, 1, 1, 128)}
out_feature_maps = self.execute(graph_fn, [], g)
out_feature_map_shapes = dict(
(key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_get_expected_variable_names(self, bifpn_num_iterations):
with test_utils.GraphContextOrNone() as g:
image_features = [
('block3', tf.random_uniform([4, 16, 16, 256], dtype=tf.float32)),
('block4', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
('block5', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32))
]
bifpn_generator = bifpn_generators.KerasBiFpnFeatureMaps(
bifpn_num_iterations=bifpn_num_iterations,
bifpn_num_filters=128,
fpn_min_level=3,
fpn_max_level=7,
input_max_level=5,
is_training=True,
conv_hyperparams=self._build_conv_hyperparams(),
freeze_batchnorm=False,
name='bifpn')
def graph_fn():
return bifpn_generator(image_features)
self.execute(graph_fn, [], g)
expected_variables = [
'bifpn/node_00/0_up_lvl_6/input_0_up_lvl_5/1x1_pre_sample/conv/bias',
'bifpn/node_00/0_up_lvl_6/input_0_up_lvl_5/1x1_pre_sample/conv/kernel',
'bifpn/node_03/1_dn_lvl_5/input_0_up_lvl_5/1x1_pre_sample/conv/bias',
'bifpn/node_03/1_dn_lvl_5/input_0_up_lvl_5/1x1_pre_sample/conv/kernel',
'bifpn/node_04/1_dn_lvl_4/input_0_up_lvl_4/1x1_pre_sample/conv/bias',
'bifpn/node_04/1_dn_lvl_4/input_0_up_lvl_4/1x1_pre_sample/conv/kernel',
'bifpn/node_05/1_dn_lvl_3/input_0_up_lvl_3/1x1_pre_sample/conv/bias',
'bifpn/node_05/1_dn_lvl_3/input_0_up_lvl_3/1x1_pre_sample/conv/kernel',
'bifpn/node_06/1_up_lvl_4/input_0_up_lvl_4/1x1_pre_sample/conv/bias',
'bifpn/node_06/1_up_lvl_4/input_0_up_lvl_4/1x1_pre_sample/conv/kernel',
'bifpn/node_07/1_up_lvl_5/input_0_up_lvl_5/1x1_pre_sample/conv/bias',
'bifpn/node_07/1_up_lvl_5/input_0_up_lvl_5/1x1_pre_sample/conv/kernel']
expected_node_variable_patterns = [
['bifpn/node_{:02}/{}_dn_lvl_6/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_dn_lvl_6/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_dn_lvl_6/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_dn_lvl_6/post_combine/separable_conv/pointwise_kernel'],
['bifpn/node_{:02}/{}_dn_lvl_5/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_dn_lvl_5/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_dn_lvl_5/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_dn_lvl_5/post_combine/separable_conv/pointwise_kernel'],
['bifpn/node_{:02}/{}_dn_lvl_4/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_dn_lvl_4/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_dn_lvl_4/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_dn_lvl_4/post_combine/separable_conv/pointwise_kernel'],
['bifpn/node_{:02}/{}_dn_lvl_3/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_dn_lvl_3/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_dn_lvl_3/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_dn_lvl_3/post_combine/separable_conv/pointwise_kernel'],
['bifpn/node_{:02}/{}_up_lvl_4/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_up_lvl_4/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_up_lvl_4/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_up_lvl_4/post_combine/separable_conv/pointwise_kernel'],
['bifpn/node_{:02}/{}_up_lvl_5/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_up_lvl_5/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_up_lvl_5/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_up_lvl_5/post_combine/separable_conv/pointwise_kernel'],
['bifpn/node_{:02}/{}_up_lvl_6/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_up_lvl_6/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_up_lvl_6/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_up_lvl_6/post_combine/separable_conv/pointwise_kernel'],
['bifpn/node_{:02}/{}_up_lvl_7/combine/bifpn_combine_weights',
'bifpn/node_{:02}/{}_up_lvl_7/post_combine/separable_conv/bias',
'bifpn/node_{:02}/{}_up_lvl_7/post_combine/separable_conv/depthwise_kernel',
'bifpn/node_{:02}/{}_up_lvl_7/post_combine/separable_conv/pointwise_kernel']]
node_i = 2
for iter_i in range(1, bifpn_num_iterations+1):
for node_variable_patterns in expected_node_variable_patterns:
for pattern in node_variable_patterns:
expected_variables.append(pattern.format(node_i, iter_i))
node_i += 1
expected_variables = set(expected_variables)
actual_variable_set = set(
[var.name.split(':')[0] for var in bifpn_generator.variables])
self.assertSetEqual(expected_variables, actual_variable_set)
# TODO(aom): Tests for create_bifpn_combine_op.
if __name__ == '__main__':
tf.test.main()