ModelZoo / ResNet50_tensorflow · Commit e2385734

Commit e2385734, authored Jul 10, 2020 by Kaushik Shivakumar

Merge branch 'master' of https://github.com/tensorflow/models into latest

Parents: 30c14aa9, 1bfb577d
Changes: 199
Showing 20 changed files with 2043 additions and 144 deletions (+2043 / -144)
research/object_detection/g3doc/using_your_own_dataset.md (+1 / -1)
research/object_detection/meta_architectures/center_net_meta_arch.py (+460 / -57)
research/object_detection/meta_architectures/center_net_meta_arch_tf2_test.py (+216 / -6)
research/object_detection/meta_architectures/context_rcnn_meta_arch_tf1_test.py (+1 / -1)
research/object_detection/metrics/coco_evaluation.py (+6 / -14)
research/object_detection/metrics/coco_evaluation_test.py (+7 / -5)
research/object_detection/metrics/oid_challenge_evaluation_utils.py (+7 / -7)
research/object_detection/metrics/oid_vrd_challenge_evaluation_utils.py (+12 / -12)
research/object_detection/model_lib.py (+52 / -11)
research/object_detection/model_lib_v2.py (+27 / -1)
research/object_detection/model_main.py (+0 / -6)
research/object_detection/model_main_tf2.py (+8 / -1)
research/object_detection/model_tpu_main.py (+3 / -18)
research/object_detection/models/ssd_efficientnet_bifpn_feature_extractor.py (+925 / -0)
research/object_detection/models/ssd_efficientnet_bifpn_feature_extractor_tf2_test.py (+179 / -0)
research/object_detection/packages/tf1/setup.py (+27 / -0)
research/object_detection/packages/tf2/setup.py (+30 / -0)
research/object_detection/protos/center_net.proto (+35 / -0)
research/object_detection/protos/preprocessor.proto (+19 / -1)
research/object_detection/protos/ssd.proto (+28 / -3)
research/object_detection/g3doc/using_your_own_dataset.md  (view file @ e2385734)

@@ -2,7 +2,7 @@
[TOC]

To use your own dataset in the TensorFlow Object Detection API, you must convert it
into the [TFRecord file format](https://www.tensorflow.org/api_guides/python/python_io#tfrecords_format_details).
This document outlines how to write a script to generate the TFRecord file.

(the only change in this hunk is the spelling fix "Tensorflow" -> "TensorFlow")
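For reference, a conversion script of the kind that guide describes boils down to serializing one tf.train.Example per image. The sketch below is a minimal illustration under assumed field names (`encoded_jpeg`, `xmins`, `labels` are placeholders, not the exact keys the Object Detection API expects):

import tensorflow as tf

def create_tf_example(example):
  # `example` is assumed to be a dict produced from your own annotation format.
  feature = {
      'image/encoded': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[example['encoded_jpeg']])),
      'image/object/bbox/xmin': tf.train.Feature(
          float_list=tf.train.FloatList(value=example['xmins'])),
      'image/object/class/label': tf.train.Feature(
          int64_list=tf.train.Int64List(value=example['labels'])),
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))

def write_tfrecord(examples, output_path):
  # Serialize every example into a single TFRecord file.
  with tf.io.TFRecordWriter(output_path) as writer:
    for example in examples:
      writer.write(create_tf_example(example).SerializeToString())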
research/object_detection/meta_architectures/center_net_meta_arch.py  (view file @ e2385734)

@@ -924,13 +924,16 @@ def convert_strided_predictions_to_normalized_keypoints(

The signature of convert_strided_predictions_to_instance_masks changes from

def convert_strided_predictions_to_instance_masks(
    boxes, classes, masks, stride, mask_height, mask_width,
    true_image_shapes, score_threshold=0.5):

to

def convert_strided_predictions_to_instance_masks(
    boxes, classes, masks, true_image_shapes, densepose_part_heatmap=None,
    densepose_surface_coords=None, stride=4, mask_height=256, mask_width=256,
    score_threshold=0.5, densepose_class_index=-1):
  """Converts predicted full-image masks into instance masks.

  For each predicted detection box:
  * Crop and resize the predicted mask (and optionally DensePose coordinates)
    based on the detected bounding box coordinates and class prediction. Uses
    bilinear resampling.
  * Binarize the mask using the provided score threshold.

  Args:
@@ -940,57 +943,212 @@ def convert_strided_predictions_to_instance_masks(
      detected class for each box (0-indexed).
    masks: A [batch, output_height, output_width, num_classes] float32
      tensor with class probabilities.
    true_image_shapes: A tensor of shape [batch, 3] representing the true
      shape of the inputs not considering padding.
    densepose_part_heatmap: (Optional) A [batch, output_height, output_width,
      num_parts] float32 tensor with part scores (i.e. logits).
    densepose_surface_coords: (Optional) A [batch, output_height, output_width,
      2 * num_parts] float32 tensor with predicted part coordinates (in
      vu-format).
    stride: The stride in the output space.
    mask_height: The desired resized height for instance masks.
    mask_width: The desired resized width for instance masks.
    score_threshold: The threshold at which to convert predicted mask
      into foreground pixels.
    densepose_class_index: The class index (0-indexed) corresponding to the
      class which has DensePose labels (e.g. the person class).

  Returns:
    A tuple of masks and surface_coords.
    instance_masks: A [batch_size, max_detections, mask_height, mask_width]
      uint8 tensor with a predicted foreground mask for each instance. If
      DensePose tensors are provided, then each pixel value in the mask
      encodes the 1-indexed part.
    surface_coords: A [batch_size, max_detections, mask_height, mask_width, 2]
      float32 tensor with (v, u) coordinates. Note that v, u coordinates are
      only defined on instance masks, and the coordinates at each location of
      the foreground mask correspond to coordinates on a local part coordinate
      system (the specific part can be inferred from the `instance_masks`
      output). If DensePose feature maps are not passed to this function, this
      output will be None.

  Raises:
    ValueError: If one but not both of `densepose_part_heatmap` and
      `densepose_surface_coords` is provided.
  """
  batch_size, output_height, output_width, _ = (
      shape_utils.combined_static_and_dynamic_shape(masks))
  input_height = stride * output_height
  input_width = stride * output_width

  true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
  # If necessary, create dummy DensePose tensors to simplify the map function.
  densepose_present = True
  if ((densepose_part_heatmap is not None) ^
      (densepose_surface_coords is not None)):
    raise ValueError('To use DensePose, both `densepose_part_heatmap` and '
                     '`densepose_surface_coords` must be provided')
  if densepose_part_heatmap is None and densepose_surface_coords is None:
    densepose_present = False
    densepose_part_heatmap = tf.zeros(
        (batch_size, output_height, output_width, 1), dtype=tf.float32)
    densepose_surface_coords = tf.zeros(
        (batch_size, output_height, output_width, 2), dtype=tf.float32)
  crop_and_threshold_fn = functools.partial(
      crop_and_threshold_masks, input_height=input_height,
      input_width=input_width, mask_height=mask_height, mask_width=mask_width,
      score_threshold=score_threshold,
      densepose_class_index=densepose_class_index)

  instance_masks, surface_coords = shape_utils.static_or_dynamic_map_fn(
      crop_and_threshold_fn,
      elems=[boxes, classes, masks, densepose_part_heatmap,
             densepose_surface_coords, true_heights, true_widths],
      dtype=[tf.uint8, tf.float32],
      back_prop=False)
  surface_coords = surface_coords if densepose_present else None
  return instance_masks, surface_coords

(This replaces the previous body, which unpacked only the output shape,
defined a nested crop_and_threshold_masks(args) helper that cropped class
masks with tf2.image.crop_and_resize and binarized them against the score
threshold, mapped that helper over the batch with
shape_utils.static_or_dynamic_map_fn, and returned a single stacked uint8
masks tensor.)


def crop_and_threshold_masks(elems, input_height, input_width, mask_height=256,
                             mask_width=256, score_threshold=0.5,
                             densepose_class_index=-1):
  """Crops and thresholds masks based on detection boxes.

  Args:
    elems: A tuple of
      boxes - float32 tensor of shape [max_detections, 4]
      classes - int32 tensor of shape [max_detections] (0-indexed)
      masks - float32 tensor of shape [output_height, output_width,
        num_classes]
      part_heatmap - float32 tensor of shape [output_height, output_width,
        num_parts]
      surf_coords - float32 tensor of shape [output_height, output_width,
        2 * num_parts]
      true_height - scalar int tensor
      true_width - scalar int tensor
    input_height: Input height to network.
    input_width: Input width to network.
    mask_height: Height for resizing mask crops.
    mask_width: Width for resizing mask crops.
    score_threshold: The threshold at which to convert predicted mask
      into foreground pixels.
    densepose_class_index: scalar int tensor with the class index (0-indexed)
      for DensePose.

  Returns:
    A tuple of
    all_instances: A [max_detections, mask_height, mask_width] uint8 tensor
      with a predicted foreground mask for each instance. Background is
      encoded as 0, and foreground is encoded as a positive integer. Specific
      part indices are encoded as 1-indexed parts (for classes that have part
      information).
    surface_coords: A [max_detections, mask_height, mask_width, 2] float32
      tensor with (v, u) coordinates, for each part.
  """
  (boxes, classes, masks, part_heatmap, surf_coords, true_height,
   true_width) = elems
  # Boxes are in normalized coordinates relative to true image shapes. Convert
  # coordinates to be normalized relative to input image shapes (since masks
  # may still have padding).
  boxlist = box_list.BoxList(boxes)
  y_scale = true_height / input_height
  x_scale = true_width / input_width
  boxlist = box_list_ops.scale(boxlist, y_scale, x_scale)
  boxes = boxlist.get()
  # Convert masks from [output_height, output_width, num_classes] to
  # [num_classes, output_height, output_width, 1].
  num_classes = tf.shape(masks)[-1]
  masks_4d = tf.transpose(masks, perm=[2, 0, 1])[:, :, :, tf.newaxis]
  # Tile part and surface coordinate masks for all classes.
  part_heatmap_4d = tf.tile(part_heatmap[tf.newaxis, :, :, :],
                            multiples=[num_classes, 1, 1, 1])
  surf_coords_4d = tf.tile(surf_coords[tf.newaxis, :, :, :],
                           multiples=[num_classes, 1, 1, 1])
  feature_maps_concat = tf.concat([masks_4d, part_heatmap_4d,
                                   surf_coords_4d], axis=-1)
  # The following tensor has shape
  # [max_detections, mask_height, mask_width, 1 + 3 * num_parts].
  cropped_masks = tf2.image.crop_and_resize(
      feature_maps_concat,
      boxes=boxes,
      box_indices=classes,
      crop_size=[mask_height, mask_width],
      method='bilinear')

  # Split the cropped masks back into instance masks, part masks, and surface
  # coordinates.
  num_parts = tf.shape(part_heatmap)[-1]
  instance_masks, part_heatmap_cropped, surface_coords_cropped = tf.split(
      cropped_masks, [1, num_parts, 2 * num_parts], axis=-1)

  # Threshold the instance masks. Resulting tensor has shape
  # [max_detections, mask_height, mask_width, 1].
  instance_masks_int = tf.cast(
      tf.math.greater_equal(instance_masks, score_threshold), dtype=tf.int32)

  # Produce a binary mask that is 1.0 only:
  #  - in the foreground region for an instance
  #  - in detections corresponding to the DensePose class
  det_with_parts = tf.equal(classes, densepose_class_index)
  det_with_parts = tf.cast(
      tf.reshape(det_with_parts, [-1, 1, 1, 1]), dtype=tf.int32)
  instance_masks_with_parts = tf.math.multiply(instance_masks_int,
                                               det_with_parts)

  # Similarly, produce a binary mask that holds the foreground masks only for
  # instances without parts (i.e. non-DensePose classes).
  det_without_parts = 1 - det_with_parts
  instance_masks_without_parts = tf.math.multiply(instance_masks_int,
                                                  det_without_parts)

  # Assemble a tensor that has standard instance segmentation masks for
  # non-DensePose classes (with values in [0, 1]), and part segmentation masks
  # for DensePose classes (with values in [0, 1, ..., num_parts]).
  part_mask_int_zero_indexed = tf.math.argmax(
      part_heatmap_cropped, axis=-1, output_type=tf.int32)[:, :, :, tf.newaxis]
  part_mask_int_one_indexed = part_mask_int_zero_indexed + 1
  all_instances = (instance_masks_without_parts +
                   instance_masks_with_parts * part_mask_int_one_indexed)

  # Gather the surface coordinates for the parts.
  surface_coords_cropped = tf.reshape(
      surface_coords_cropped, [-1, mask_height, mask_width, num_parts, 2])
  surface_coords = gather_surface_coords_for_parts(surface_coords_cropped,
                                                   part_mask_int_zero_indexed)
  surface_coords = (
      surface_coords * tf.cast(instance_masks_with_parts, tf.float32))

  return [tf.squeeze(all_instances, axis=3), surface_coords]


def gather_surface_coords_for_parts(surface_coords_cropped,
                                    highest_scoring_part):
  """Gathers the (v, u) coordinates for the highest scoring DensePose parts.

  Args:
    surface_coords_cropped: A [max_detections, height, width, num_parts, 2]
      float32 tensor with (v, u) surface coordinates.
    highest_scoring_part: A [max_detections, height, width] integer tensor with
      the highest scoring part (0-indexed) indices for each location.

  Returns:
    A [max_detections, height, width, 2] float32 tensor with the (v, u)
    coordinates selected from the highest scoring parts.
  """
  max_detections, height, width, num_parts, _ = (
      shape_utils.combined_static_and_dynamic_shape(surface_coords_cropped))
  flattened_surface_coords = tf.reshape(surface_coords_cropped, [-1, 2])
  flattened_part_ids = tf.reshape(highest_scoring_part, [-1])

  # Produce lookup indices that represent the locations of the highest scoring
  # parts in the `flattened_surface_coords` tensor.
  flattened_lookup_indices = (
      num_parts * tf.range(max_detections * height * width) +
      flattened_part_ids)

  vu_coords_flattened = tf.gather(flattened_surface_coords,
                                  flattened_lookup_indices, axis=0)
  return tf.reshape(vu_coords_flattened, [max_detections, height, width, 2])
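The flattened-gather trick above relies on each pixel owning a contiguous block of num_parts rows once the coordinates are reshaped to [-1, 2], so the selected row is block_start + part_id. A small sketch with toy shapes (2 detections, 2x2 crops, 3 parts; all values assumed for illustration):

import tensorflow as tf

num_parts = 3
# [max_detections, height, width, num_parts, 2] toy surface coordinates.
coords = tf.reshape(tf.range(2 * 2 * 2 * num_parts * 2, dtype=tf.float32),
                    [2, 2, 2, num_parts, 2])
# [max_detections, height, width] winning part index per pixel.
part_ids = tf.constant([[[0, 2], [1, 0]], [[2, 2], [0, 1]]], dtype=tf.int32)

flat_coords = tf.reshape(coords, [-1, 2])   # [N*H*W*num_parts, 2]
flat_parts = tf.reshape(part_ids, [-1])     # [N*H*W]
# Row index of the winning part inside each pixel's block of num_parts rows.
lookup = num_parts * tf.range(2 * 2 * 2) + flat_parts
picked = tf.reshape(tf.gather(flat_coords, lookup), [2, 2, 2, 2])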
class ObjectDetectionParams(
...

@@ -1235,6 +1393,64 @@ class MaskParams(
        score_threshold, heatmap_bias_init)


class DensePoseParams(
    collections.namedtuple('DensePoseParams', [
        'class_id', 'classification_loss', 'localization_loss',
        'part_loss_weight', 'coordinate_loss_weight', 'num_parts',
        'task_loss_weight', 'upsample_to_input_res', 'upsample_method',
        'heatmap_bias_init'
    ])):
  """Namedtuple to store DensePose prediction related parameters."""

  __slots__ = ()

  def __new__(cls,
              class_id,
              classification_loss,
              localization_loss,
              part_loss_weight=1.0,
              coordinate_loss_weight=1.0,
              num_parts=24,
              task_loss_weight=1.0,
              upsample_to_input_res=True,
              upsample_method='bilinear',
              heatmap_bias_init=-2.19):
    """Constructor with default values for DensePoseParams.

    Args:
      class_id: the ID of the class that contains the DensePose groundtruth.
        This should typically correspond to the "person" class. Note that the
        ID is 0-based, meaning that class 0 corresponds to the first
        non-background object class.
      classification_loss: an object_detection.core.losses.Loss object to
        compute the loss for the body part predictions in CenterNet.
      localization_loss: an object_detection.core.losses.Loss object to
        compute the loss for the surface coordinate regression in CenterNet.
      part_loss_weight: The loss weight to apply to part prediction.
      coordinate_loss_weight: The loss weight to apply to surface coordinate
        prediction.
      num_parts: The number of DensePose parts to predict.
      task_loss_weight: float, the loss weight for the DensePose task.
      upsample_to_input_res: Whether to upsample the DensePose feature maps to
        the input resolution before applying loss. Note that the prediction
        outputs are still at the standard CenterNet output stride.
      upsample_method: Method for upsampling DensePose feature maps. Options
        are either 'bilinear' or 'nearest'. This has no effect when
        `upsample_to_input_res` is False.
      heatmap_bias_init: float, the initial value of bias in the convolutional
        kernel of the part prediction head. If set to None, the bias is
        initialized with zeros.

    Returns:
      An initialized DensePoseParams namedtuple.
    """
    return super(DensePoseParams, cls).__new__(
        cls, class_id, classification_loss, localization_loss,
        part_loss_weight, coordinate_loss_weight, num_parts, task_loss_weight,
        upsample_to_input_res, upsample_method, heatmap_bias_init)


# The following constants are used to generate the keys of the
# (prediction, loss, target assigner, ...) dictionaries used in the
# CenterNetMetaArch class.
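A minimal construction sketch for the new namedtuple (mirroring get_fake_densepose_params in the test file later in this commit; the class_id value and the losses module import are assumptions for illustration):

from object_detection.core import losses

densepose_params = DensePoseParams(
    class_id=0,  # assumed: the "person" class is class 0 here
    classification_loss=losses.WeightedSoftmaxClassificationLoss(),
    localization_loss=losses.L1LocalizationLoss(),
    num_parts=24,
    upsample_to_input_res=True,
    upsample_method='bilinear')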
...
...
@@ -1247,6 +1463,9 @@ KEYPOINT_HEATMAP = 'keypoint/heatmap'
KEYPOINT_OFFSET = 'keypoint/offset'
SEGMENTATION_TASK = 'segmentation_task'
SEGMENTATION_HEATMAP = 'segmentation/heatmap'
DENSEPOSE_TASK = 'densepose_task'
DENSEPOSE_HEATMAP = 'densepose/heatmap'
DENSEPOSE_REGRESSION = 'densepose/regression'
LOSS_KEY_PREFIX = 'Loss'
...
...
@@ -1290,7 +1509,8 @@ class CenterNetMetaArch(model.DetectionModel):
               object_center_params,
               object_detection_params=None,
               keypoint_params_dict=None,
               mask_params=None,
               densepose_params=None):
    """Initializes a CenterNet model.

    Args:
...
...
@@ -1318,6 +1538,10 @@ class CenterNetMetaArch(model.DetectionModel):
      mask_params: A MaskParams namedtuple. This object
        holds the hyper-parameters for segmentation. Please see the class
        definition for more details.
      densepose_params: A DensePoseParams namedtuple. This object holds the
        hyper-parameters for DensePose prediction. Please see the class
        definition for more details. Note that if this is provided, it is
        expected that `mask_params` is also provided.
    """
    assert object_detection_params or keypoint_params_dict

    # Shorten the name for convenience and better formatting.
...
...
@@ -1333,6 +1557,10 @@ class CenterNetMetaArch(model.DetectionModel):
    self._od_params = object_detection_params
    self._kp_params_dict = keypoint_params_dict
    self._mask_params = mask_params
    if densepose_params is not None and mask_params is None:
      raise ValueError('To run DensePose prediction, `mask_params` must also '
                       'be supplied.')
    self._densepose_params = densepose_params

    # Construct the prediction head nets.
    self._prediction_head_dict = self._construct_prediction_heads(
...
...
@@ -1413,8 +1641,18 @@ class CenterNetMetaArch(model.DetectionModel):
(the segmentation head's bias_fill argument changes from class_prediction_bias_init to self._mask_params.heatmap_bias_init)

    if self._mask_params is not None:
      prediction_heads[SEGMENTATION_HEATMAP] = [
          make_prediction_net(num_classes,
                              bias_fill=self._mask_params.heatmap_bias_init)
          for _ in range(num_feature_outputs)]
    if self._densepose_params is not None:
      prediction_heads[DENSEPOSE_HEATMAP] = [
          make_prediction_net(  # pylint: disable=g-complex-comprehension
              self._densepose_params.num_parts,
              bias_fill=self._densepose_params.heatmap_bias_init)
          for _ in range(num_feature_outputs)]
      prediction_heads[DENSEPOSE_REGRESSION] = [
          make_prediction_net(2 * self._densepose_params.num_parts)
          for _ in range(num_feature_outputs)
      ]
    return prediction_heads

  def _initialize_target_assigners(self, stride, min_box_overlap_iou):
...
...
@@ -1449,6 +1687,10 @@ class CenterNetMetaArch(model.DetectionModel):
    if self._mask_params is not None:
      target_assigners[SEGMENTATION_TASK] = (
          cn_assigner.CenterNetMaskTargetAssigner(stride))
    if self._densepose_params is not None:
      dp_stride = 1 if self._densepose_params.upsample_to_input_res else stride
      target_assigners[DENSEPOSE_TASK] = (
          cn_assigner.CenterNetDensePoseTargetAssigner(dp_stride))

    return target_assigners
...
...
@@ -1860,6 +2102,113 @@ class CenterNetMetaArch(model.DetectionModel):
            float(len(segmentation_predictions)) * total_pixels_in_loss)
    return total_loss

  def _compute_densepose_losses(self, input_height, input_width,
                                prediction_dict):
    """Computes the weighted DensePose losses.

    Args:
      input_height: An integer scalar tensor representing input image height.
      input_width: An integer scalar tensor representing input image width.
      prediction_dict: A dictionary holding predicted tensors output by the
        "predict" function. See the "predict" function for more detailed
        description.

    Returns:
      A dictionary of scalar float tensors representing the weighted losses for
      the DensePose task:
        DENSEPOSE_HEATMAP: the weighted part segmentation loss.
        DENSEPOSE_REGRESSION: the weighted part surface coordinate loss.
    """
    dp_heatmap_loss, dp_regression_loss = (
        self._compute_densepose_part_and_coordinate_losses(
            input_height=input_height,
            input_width=input_width,
            part_predictions=prediction_dict[DENSEPOSE_HEATMAP],
            surface_coord_predictions=prediction_dict[DENSEPOSE_REGRESSION]))
    loss_dict = {}
    loss_dict[DENSEPOSE_HEATMAP] = (
        self._densepose_params.part_loss_weight * dp_heatmap_loss)
    loss_dict[DENSEPOSE_REGRESSION] = (
        self._densepose_params.coordinate_loss_weight * dp_regression_loss)
    return loss_dict

  def _compute_densepose_part_and_coordinate_losses(
      self, input_height, input_width, part_predictions,
      surface_coord_predictions):
    """Computes the individual losses for the DensePose task.

    Args:
      input_height: An integer scalar tensor representing input image height.
      input_width: An integer scalar tensor representing input image width.
      part_predictions: A list of float tensors of shape [batch_size,
        out_height, out_width, num_parts].
      surface_coord_predictions: A list of float tensors of shape [batch_size,
        out_height, out_width, 2 * num_parts].

    Returns:
      A tuple with two scalar loss tensors: part_prediction_loss and
      surface_coord_loss.
    """
    gt_dp_num_points_list = self.groundtruth_lists(
        fields.BoxListFields.densepose_num_points)
    gt_dp_part_ids_list = self.groundtruth_lists(
        fields.BoxListFields.densepose_part_ids)
    gt_dp_surface_coords_list = self.groundtruth_lists(
        fields.BoxListFields.densepose_surface_coords)
    gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)

    assigner = self._target_assigner_dict[DENSEPOSE_TASK]
    batch_indices, batch_part_ids, batch_surface_coords, batch_weights = (
        assigner.assign_part_and_coordinate_targets(
            height=input_height,
            width=input_width,
            gt_dp_num_points_list=gt_dp_num_points_list,
            gt_dp_part_ids_list=gt_dp_part_ids_list,
            gt_dp_surface_coords_list=gt_dp_surface_coords_list,
            gt_weights_list=gt_weights_list))

    part_prediction_loss = 0
    surface_coord_loss = 0
    classification_loss_fn = self._densepose_params.classification_loss
    localization_loss_fn = self._densepose_params.localization_loss
    num_predictions = float(len(part_predictions))
    num_valid_points = tf.math.count_nonzero(batch_weights)
    num_valid_points = tf.cast(tf.math.maximum(num_valid_points, 1),
                               tf.float32)
    for part_pred, surface_coord_pred in zip(part_predictions,
                                             surface_coord_predictions):
      # Potentially upsample the feature maps, so that better quality (i.e.
      # higher res) groundtruth can be applied.
      if self._densepose_params.upsample_to_input_res:
        part_pred = tf.keras.layers.UpSampling2D(
            self._stride,
            interpolation=self._densepose_params.upsample_method)(part_pred)
        surface_coord_pred = tf.keras.layers.UpSampling2D(
            self._stride,
            interpolation=self._densepose_params.upsample_method)(
                surface_coord_pred)
      # Compute the part prediction loss.
      part_pred = cn_assigner.get_batch_predictions_from_indices(
          part_pred, batch_indices[:, 0:3])
      part_prediction_loss += classification_loss_fn(
          part_pred[:, tf.newaxis, :],
          batch_part_ids[:, tf.newaxis, :],
          weights=batch_weights[:, tf.newaxis, tf.newaxis])
      # Compute the surface coordinate loss.
      batch_size, out_height, out_width, _ = _get_shape(
          surface_coord_pred, 4)
      surface_coord_pred = tf.reshape(
          surface_coord_pred, [batch_size, out_height, out_width, -1, 2])
      surface_coord_pred = cn_assigner.get_batch_predictions_from_indices(
          surface_coord_pred, batch_indices)
      surface_coord_loss += localization_loss_fn(
          surface_coord_pred,
          batch_surface_coords,
          weights=batch_weights[:, tf.newaxis])
    part_prediction_loss = tf.reduce_sum(part_prediction_loss) / (
        num_predictions * num_valid_points)
    surface_coord_loss = tf.reduce_sum(surface_coord_loss) / (
        num_predictions * num_valid_points)
    return part_prediction_loss, surface_coord_loss

  def preprocess(self, inputs):
    outputs = shape_utils.resize_images_and_return_shapes(
        inputs, self._image_resizer_fn)
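As a worked example of the normalization at the end of the loop above: with 2 DensePose feature outputs (num_predictions = 2) and 50 valid sampled points in the batch (num_valid_points = 50), a summed part loss of 10.0 is reported as 10.0 / (2 * 50) = 0.1; the surface coordinate loss is normalized the same way.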
...
...
@@ -1909,6 +2258,13 @@ class CenterNetMetaArch(model.DetectionModel):
      'segmentation/heatmap' - [optional] A list of size num_feature_outputs
        holding float tensors of size [batch_size, output_height,
        output_width, num_classes] representing the mask logits.
      'densepose/heatmap' - [optional] A list of size num_feature_outputs
        holding float tensors of size [batch_size, output_height,
        output_width, num_parts] representing the mask logits for each part.
      'densepose/regression' - [optional] A list of size num_feature_outputs
        holding float tensors of size [batch_size, output_height,
        output_width, 2 * num_parts] representing the DensePose surface
        coordinate predictions.
      Note the $TASK_NAME is provided by the KeypointEstimation namedtuple
      used to differentiate between different keypoint tasks.
    """
...
...
@@ -1938,10 +2294,16 @@ class CenterNetMetaArch(model.DetectionModel):
      scope: Optional scope name.

    Returns:
      A dictionary mapping the keys [
        'Loss/object_center',
        'Loss/box/scale', (optional)
        'Loss/box/offset', (optional)
        'Loss/$TASK_NAME/keypoint/heatmap', (optional)
        'Loss/$TASK_NAME/keypoint/offset', (optional)
        'Loss/$TASK_NAME/keypoint/regression', (optional)
        'Loss/segmentation/heatmap', (optional)
        'Loss/densepose/heatmap', (optional)
        'Loss/densepose/regression'] (optional)
      to scalar tensors corresponding to the losses for different tasks. Note
      the $TASK_NAME is provided by the KeypointEstimation namedtuple used to
      differentiate between different keypoint tasks.
...
...
@@ -1999,6 +2361,16 @@ class CenterNetMetaArch(model.DetectionModel):
        seg_losses[key] = seg_losses[key] * self._mask_params.task_loss_weight
      losses.update(seg_losses)
    if self._densepose_params is not None:
      densepose_losses = self._compute_densepose_losses(
          input_height=input_height,
          input_width=input_width,
          prediction_dict=prediction_dict)
      for key in densepose_losses:
        densepose_losses[key] = (
            densepose_losses[key] * self._densepose_params.task_loss_weight)
      losses.update(densepose_losses)

    # Prepend the LOSS_KEY_PREFIX to the keys in the dictionary such that the
    # losses will be grouped together in Tensorboard.
    return dict([('%s/%s' % (LOSS_KEY_PREFIX, key), val)
...
...
@@ -2033,9 +2405,14 @@ class CenterNetMetaArch(model.DetectionModel):
        invalid keypoints have their coordinates and scores set to 0.0.
      detection_keypoint_scores: (Optional) A float tensor of shape [batch,
        max_detection, num_keypoints] with scores for each keypoint.
      detection_masks: (Optional) A uint8 tensor of shape [batch,
        max_detections, mask_height, mask_width] with masks for each
        detection. Background is specified with 0, and foreground is specified
        with positive integers (1 for standard instance segmentation mask, and
        1-indexed parts for the DensePose task).
      detection_surface_coords: (Optional) A float32 tensor of shape [batch,
        max_detection, mask_height, mask_width, 2] with DensePose surface
        coordinates, in (v, u) format.
    """
    object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
    # Get x, y and channel indices corresponding to the top indices in the class
# Get x, y and channel indices corresponding to the top indices in the class
...
...
@@ -2076,14 +2453,27 @@ class CenterNetMetaArch(model.DetectionModel):
    if self._mask_params:
      masks = tf.nn.sigmoid(prediction_dict[SEGMENTATION_HEATMAP][-1])
      densepose_part_heatmap, densepose_surface_coords = None, None
      densepose_class_index = 0
      if self._densepose_params:
        densepose_part_heatmap = prediction_dict[DENSEPOSE_HEATMAP][-1]
        densepose_surface_coords = prediction_dict[DENSEPOSE_REGRESSION][-1]
        densepose_class_index = self._densepose_params.class_id
      instance_masks, surface_coords = (
          convert_strided_predictions_to_instance_masks(
              boxes, classes, masks, true_image_shapes,
              densepose_part_heatmap, densepose_surface_coords,
              stride=self._stride,
              mask_height=self._mask_params.mask_height,
              mask_width=self._mask_params.mask_width,
              score_threshold=self._mask_params.score_threshold,
              densepose_class_index=densepose_class_index))
      postprocess_dict[
          fields.DetectionResultFields.detection_masks] = instance_masks
      if self._densepose_params:
        postprocess_dict[
            fields.DetectionResultFields.detection_surface_coords] = (
                surface_coords)

    return postprocess_dict

(Previously this branch called convert_strided_predictions_to_instance_masks
with positional stride, mask_height, mask_width and score_threshold arguments
and only updated detection_masks via postprocess_dict.update.)

  def _postprocess_keypoints(self, prediction_dict, classes, y_indices,
...
...
@@ -2359,6 +2749,14 @@ class CenterNetMetaArch(model.DetectionModel):
        checkpoint (with compatible variable names) or to restore from a
        classification checkpoint for initialization prior to training.
        Valid values: `detection`, `classification`. Default 'detection'.
        'detection': used when loading in the Hourglass model pre-trained on
          another detection task.
        'classification': used when loading in the ResNet model pre-trained on
          an image classification task. Note that only the image feature
          encoding part is loaded, not the upsampling layers.
        'fine_tune': used when loading the entire CenterNet feature extractor
          pre-trained on other tasks. The checkpoints saved during CenterNet
          model training can be directly loaded using this mode.

    Returns:
      A dict mapping keys to Trackable objects (tf.Module or Checkpoint).
...
...
@@ -2367,9 +2765,14 @@ class CenterNetMetaArch(model.DetectionModel):
    if fine_tune_checkpoint_type == 'classification':
      return {'feature_extractor': self._feature_extractor.get_base_model()}
    elif fine_tune_checkpoint_type == 'detection':
      return {'feature_extractor': self._feature_extractor.get_model()}
    elif fine_tune_checkpoint_type == 'fine_tune':
      feature_extractor_model = tf.train.Checkpoint(
          _feature_extractor=self._feature_extractor)
      return {'model': feature_extractor_model}
    else:
      raise ValueError('Not supported fine tune checkpoint type - {}'.format(
          fine_tune_checkpoint_type))
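A hedged sketch of how the returned dict of Trackables can be used to restore weights in the new 'fine_tune' mode; the model-side method name and `checkpoint_path` are assumptions here, only the dict layout (e.g. {'model': ...}) comes from the hunk above:

restore_dict = model.restore_from_objects(      # method name assumed
    fine_tune_checkpoint_type='fine_tune')
ckpt = tf.train.Checkpoint(**restore_dict)      # e.g. {'model': feature_extractor_model}
ckpt.restore(checkpoint_path).expect_partial()  # load a previously saved CenterNet checkpoint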
...
...
research/object_detection/meta_architectures/center_net_meta_arch_tf2_test.py  (view file @ e2385734)

@@ -266,7 +266,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
      masks_np[0, :, :3, 1] = 1  # Class 1.
      masks = tf.constant(masks_np)
      true_image_shapes = tf.constant([[6, 8, 3]])
      instance_masks, _ = cnma.convert_strided_predictions_to_instance_masks(
          boxes, classes, masks, stride=2, mask_height=2, mask_width=2,
          true_image_shapes=true_image_shapes)
      return instance_masks

(previously the call assigned a single output: instance_masks = cnma.convert_strided_predictions_to_instance_masks(...))
...
...
@@ -289,6 +289,104 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
    ])
    np.testing.assert_array_equal(expected_instance_masks, instance_masks)

  def test_convert_strided_predictions_raises_error_with_one_tensor(self):
    def graph_fn():
      boxes = tf.constant(
          [
              [[0.5, 0.5, 1.0, 1.0],
               [0.0, 0.5, 0.5, 1.0],
               [0.0, 0.0, 0.0, 0.0]],
          ], tf.float32)
      classes = tf.constant(
          [
              [0, 1, 0],
          ], tf.int32)
      masks_np = np.zeros((1, 4, 4, 2), dtype=np.float32)
      masks_np[0, :, 2:, 0] = 1  # Class 0.
      masks_np[0, :, :3, 1] = 1  # Class 1.
      masks = tf.constant(masks_np)
      true_image_shapes = tf.constant([[6, 8, 3]])
      densepose_part_heatmap = tf.random.uniform([1, 4, 4, 24])
      instance_masks, _ = cnma.convert_strided_predictions_to_instance_masks(
          boxes, classes, masks, true_image_shapes,
          densepose_part_heatmap=densepose_part_heatmap,
          densepose_surface_coords=None)
      return instance_masks
    with self.assertRaises(ValueError):
      self.execute_cpu(graph_fn, [])

  def test_crop_and_threshold_masks(self):
    boxes_np = np.array(
        [[0., 0., 0.5, 0.5],
         [0.25, 0.25, 1.0, 1.0]], dtype=np.float32)
    classes_np = np.array([0, 2], dtype=np.int32)
    masks_np = np.zeros((4, 4, _NUM_CLASSES), dtype=np.float32)
    masks_np[0, 0, 0] = 0.8
    masks_np[1, 1, 0] = 0.6
    masks_np[3, 3, 2] = 0.7
    part_heatmap_np = np.zeros((4, 4, _DENSEPOSE_NUM_PARTS), dtype=np.float32)
    part_heatmap_np[0, 0, 4] = 1
    part_heatmap_np[0, 0, 2] = 0.6  # Lower scoring.
    part_heatmap_np[1, 1, 8] = 0.2
    part_heatmap_np[3, 3, 4] = 0.5
    surf_coords_np = np.zeros((4, 4, 2 * _DENSEPOSE_NUM_PARTS),
                              dtype=np.float32)
    surf_coords_np[:, :, 8:10] = 0.2, 0.9
    surf_coords_np[:, :, 16:18] = 0.3, 0.5
    true_height, true_width = 10, 10
    input_height, input_width = 10, 10
    mask_height = 4
    mask_width = 4
    def graph_fn():
      elems = [
          tf.constant(boxes_np),
          tf.constant(classes_np),
          tf.constant(masks_np),
          tf.constant(part_heatmap_np),
          tf.constant(surf_coords_np),
          tf.constant(true_height, dtype=tf.int32),
          tf.constant(true_width, dtype=tf.int32)
      ]
      part_masks, surface_coords = cnma.crop_and_threshold_masks(
          elems, input_height, input_width, mask_height=mask_height,
          mask_width=mask_width, densepose_class_index=0)
      return part_masks, surface_coords

    part_masks, surface_coords = self.execute_cpu(graph_fn, [])

    expected_part_masks = np.zeros((2, 4, 4), dtype=np.uint8)
    expected_part_masks[0, 0, 0] = 5  # Recall classes are 1-indexed in output.
    expected_part_masks[0, 2, 2] = 9  # Recall classes are 1-indexed in output.
    expected_part_masks[1, 3, 3] = 1  # Standard instance segmentation mask.
    expected_surface_coords = np.zeros((2, 4, 4, 2), dtype=np.float32)
    expected_surface_coords[0, 0, 0, :] = 0.2, 0.9
    expected_surface_coords[0, 2, 2, :] = 0.3, 0.5
    np.testing.assert_allclose(expected_part_masks, part_masks)
    np.testing.assert_allclose(expected_surface_coords, surface_coords)

  def test_gather_surface_coords_for_parts(self):
    surface_coords_cropped_np = np.zeros((2, 5, 5, _DENSEPOSE_NUM_PARTS, 2),
                                         dtype=np.float32)
    surface_coords_cropped_np[0, 0, 0, 5] = 0.3, 0.4
    surface_coords_cropped_np[0, 1, 0, 9] = 0.5, 0.6
    highest_scoring_part_np = np.zeros((2, 5, 5), dtype=np.int32)
    highest_scoring_part_np[0, 0, 0] = 5
    highest_scoring_part_np[0, 1, 0] = 9
    def graph_fn():
      surface_coords_cropped = tf.constant(surface_coords_cropped_np,
                                           tf.float32)
      highest_scoring_part = tf.constant(highest_scoring_part_np, tf.int32)
      surface_coords_gathered = cnma.gather_surface_coords_for_parts(
          surface_coords_cropped, highest_scoring_part)
      return surface_coords_gathered

    surface_coords_gathered = self.execute_cpu(graph_fn, [])
    np.testing.assert_allclose([0.3, 0.4], surface_coords_gathered[0, 0, 0])
    np.testing.assert_allclose([0.5, 0.6], surface_coords_gathered[0, 1, 0])

  def test_top_k_feature_map_locations(self):
    feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
    feature_map_np[0, 2, 0, 1] = 1.0

...

@@ -535,6 +633,8 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
    keypoint_heatmap_np[1, 0, 1, 1] = 0.9
    keypoint_heatmap_np[1, 2, 0, 1] = 0.8

    # Note that the keypoint offsets are now per keypoint (as opposed to
    # keypoint agnostic, in the test test_keypoint_candidate_prediction).
    keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 4), dtype=np.float32)
    keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25, 0.0, 0.0]
    keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5, 0.0, 0.0]

...

@@ -949,6 +1049,7 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
_NUM_CLASSES = 10
_KEYPOINT_INDICES = [0, 1, 2, 3]
_NUM_KEYPOINTS = len(_KEYPOINT_INDICES)
_DENSEPOSE_NUM_PARTS = 24
_TASK_NAME = 'human_pose'
...
...
@@ -991,6 +1092,20 @@ def get_fake_mask_params():
      mask_width=4)


def get_fake_densepose_params():
  """Returns the fake DensePose estimation parameter namedtuple."""
  return cnma.DensePoseParams(
      class_id=1,
      classification_loss=losses.WeightedSoftmaxClassificationLoss(),
      localization_loss=losses.L1LocalizationLoss(),
      part_loss_weight=1.0,
      coordinate_loss_weight=1.0,
      num_parts=_DENSEPOSE_NUM_PARTS,
      task_loss_weight=1.0,
      upsample_to_input_res=True,
      upsample_method='nearest')


def build_center_net_meta_arch(build_resnet=False):
  """Builds the CenterNet meta architecture."""
  if build_resnet:

...

@@ -1018,7 +1133,8 @@ def build_center_net_meta_arch(build_resnet=False):
      object_center_params=get_fake_center_params(),
      object_detection_params=get_fake_od_params(),
      keypoint_params_dict={_TASK_NAME: get_fake_kp_params()},
      mask_params=get_fake_mask_params(),
      densepose_params=get_fake_densepose_params())


def _logit(p):

...

@@ -1102,6 +1218,16 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
        fake_feature_map)
    self.assertEqual((4, 128, 128, _NUM_CLASSES), output.shape)

    # "densepose parts" head:
    output = model._prediction_head_dict[cnma.DENSEPOSE_HEATMAP][-1](
        fake_feature_map)
    self.assertEqual((4, 128, 128, _DENSEPOSE_NUM_PARTS), output.shape)

    # "densepose surface coordinates" head:
    output = model._prediction_head_dict[cnma.DENSEPOSE_REGRESSION][-1](
        fake_feature_map)
    self.assertEqual((4, 128, 128, 2 * _DENSEPOSE_NUM_PARTS), output.shape)

  def test_initialize_target_assigners(self):
    model = build_center_net_meta_arch()
    assigner_dict = model._initialize_target_assigners(

...

@@ -1125,6 +1251,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    self.assertIsInstance(assigner_dict[cnma.SEGMENTATION_TASK],
                          cn_assigner.CenterNetMaskTargetAssigner)

    # DensePose estimation target assigner:
    self.assertIsInstance(assigner_dict[cnma.DENSEPOSE_TASK],
                          cn_assigner.CenterNetDensePoseTargetAssigner)

  def test_predict(self):
    """Test the predict function."""
...
...
@@ -1145,6 +1275,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
                     (2, 32, 32, 2))
    self.assertEqual(prediction_dict[cnma.SEGMENTATION_HEATMAP][0].shape,
                     (2, 32, 32, _NUM_CLASSES))
    self.assertEqual(prediction_dict[cnma.DENSEPOSE_HEATMAP][0].shape,
                     (2, 32, 32, _DENSEPOSE_NUM_PARTS))
    self.assertEqual(prediction_dict[cnma.DENSEPOSE_REGRESSION][0].shape,
                     (2, 32, 32, 2 * _DENSEPOSE_NUM_PARTS))

  def test_loss(self):
    """Test the loss function."""

...

@@ -1157,7 +1291,13 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
        groundtruth_keypoints_list=groundtruth_dict[
            fields.BoxListFields.keypoints],
        groundtruth_masks_list=groundtruth_dict[
            fields.BoxListFields.masks],
        groundtruth_dp_num_points_list=groundtruth_dict[
            fields.BoxListFields.densepose_num_points],
        groundtruth_dp_part_ids_list=groundtruth_dict[
            fields.BoxListFields.densepose_part_ids],
        groundtruth_dp_surface_coords_list=groundtruth_dict[
            fields.BoxListFields.densepose_surface_coords])
    prediction_dict = get_fake_prediction_dict(
        input_height=16, input_width=32, stride=4)

...

@@ -1193,6 +1333,12 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    self.assertGreater(
        0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
                                   cnma.SEGMENTATION_HEATMAP)])
    self.assertGreater(
        0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
                                   cnma.DENSEPOSE_HEATMAP)])
    self.assertGreater(
        0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
                                   cnma.DENSEPOSE_REGRESSION)])

  @parameterized.parameters(
      {'target_class_id': 1},

...

@@ -1230,6 +1376,14 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    segmentation_heatmap[:, 14:18, 14:18, target_class_id] = 1.0
    segmentation_heatmap = _logit(segmentation_heatmap)

    dp_part_ind = 4
    dp_part_heatmap = np.zeros((1, 32, 32, _DENSEPOSE_NUM_PARTS),
                               dtype=np.float32)
    dp_part_heatmap[0, 14:18, 14:18, dp_part_ind] = 1.0
    dp_part_heatmap = _logit(dp_part_heatmap)

    dp_surf_coords = np.random.randn(1, 32, 32, 2 * _DENSEPOSE_NUM_PARTS)

    class_center = tf.constant(class_center)
    height_width = tf.constant(height_width)
    offset = tf.constant(offset)
...
...
@@ -1237,6 +1391,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
    keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
    segmentation_heatmap = tf.constant(segmentation_heatmap, dtype=tf.float32)
    dp_part_heatmap = tf.constant(dp_part_heatmap, dtype=tf.float32)
    dp_surf_coords = tf.constant(dp_surf_coords, dtype=tf.float32)

    prediction_dict = {
        cnma.OBJECT_CENTER: [class_center],

...

@@ -1249,6 +1405,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
        cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION):
            [keypoint_regression],
        cnma.SEGMENTATION_HEATMAP: [segmentation_heatmap],
        cnma.DENSEPOSE_HEATMAP: [dp_part_heatmap],
        cnma.DENSEPOSE_REGRESSION: [dp_surf_coords]
    }

    def graph_fn():

...

@@ -1271,12 +1429,13 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    self.assertAllEqual([1, max_detection, 4, 4],
                        detections['detection_masks'].shape)

    # There should be some section of the first mask (corresponding to the only
    # detection) with non-zero mask values.
    self.assertGreater(
        np.sum(detections['detection_masks'][0, 0, :, :] > 0), 0)
    # Masks should be empty for everything but the first detection.
    self.assertAllEqual(
        detections['detection_masks'][0, 1:, :, :],
        np.zeros_like(detections['detection_masks'][0, 1:, :, :]))
    self.assertAllEqual(
        detections['detection_surface_coords'][0, 1:, :, :],
        np.zeros_like(detections['detection_surface_coords'][0, 1:, :, :]))

    if target_class_id == 1:
      expected_kpts_for_obj_0 = np.array(

...

@@ -1287,6 +1446,12 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
          expected_kpts_for_obj_0, rtol=1e-6)
      np.testing.assert_allclose(
          detections['detection_keypoint_scores'][0][0],
          expected_kpt_scores_for_obj_0, rtol=1e-6)
      # First detection has DensePose parts.
      self.assertSameElements(
          np.unique(detections['detection_masks'][0, 0, :, :]),
          set([0, dp_part_ind + 1]))
      self.assertGreater(
          np.sum(np.abs(detections['detection_surface_coords'])), 0.0)
    else:
      # All keypoint outputs should be zeros.
      np.testing.assert_allclose(

...

@@ -1297,6 +1462,14 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
          detections['detection_keypoint_scores'][0][0],
          np.zeros([num_keypoints], np.float), rtol=1e-6)
      # Binary segmentation mask.
      self.assertSameElements(
          np.unique(detections['detection_masks'][0, 0, :, :]),
          set([0, 1]))
      # No DensePose surface coordinates.
      np.testing.assert_allclose(
          detections['detection_surface_coords'][0, 0, :, :],
          np.zeros_like(detections['detection_surface_coords'][0, 0, :, :]))

  def test_get_instance_indices(self):
    classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
...
...
@@ -1353,6 +1526,17 @@ def get_fake_prediction_dict(input_height, input_width, stride):
  mask_heatmap[0, 2, 4, 1] = 1.0
  mask_heatmap = _logit(mask_heatmap)

  densepose_heatmap = np.zeros((2, output_height, output_width,
                                _DENSEPOSE_NUM_PARTS), dtype=np.float32)
  densepose_heatmap[0, 2, 4, 5] = 1.0
  densepose_heatmap = _logit(densepose_heatmap)

  densepose_regression = np.zeros((2, output_height, output_width,
                                   2 * _DENSEPOSE_NUM_PARTS), dtype=np.float32)
  # The surface coordinate indices for part index 5 are:
  # (5 * 2, 5 * 2 + 1), or (10, 11).
  densepose_regression[0, 2, 4, 10:12] = 0.4, 0.7

  prediction_dict = {
      'preprocessed_inputs': tf.zeros((2, input_height, input_width, 3)),

...

@@ -1383,6 +1567,14 @@ def get_fake_prediction_dict(input_height, input_width, stride):
      cnma.SEGMENTATION_HEATMAP: [
          tf.constant(mask_heatmap),
          tf.constant(mask_heatmap)
      ],
      cnma.DENSEPOSE_HEATMAP: [
          tf.constant(densepose_heatmap),
          tf.constant(densepose_heatmap),
      ],
      cnma.DENSEPOSE_REGRESSION: [
          tf.constant(densepose_regression),
          tf.constant(densepose_regression),
      ]
  }
  return prediction_dict

...

@@ -1427,12 +1619,30 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
      tf.constant(mask),
      tf.zeros_like(mask),
  ]
  densepose_num_points = [
      tf.constant([1], dtype=tf.int32),
      tf.constant([0], dtype=tf.int32),
  ]
  densepose_part_ids = [
      tf.constant([[5, 0, 0]], dtype=tf.int32),
      tf.constant([[0, 0, 0]], dtype=tf.int32),
  ]
  densepose_surface_coords_np = np.zeros((1, 3, 4), dtype=np.float32)
  densepose_surface_coords_np[0, 0, :] = 0.55, 0.55, 0.4, 0.7
  densepose_surface_coords = [
      tf.constant(densepose_surface_coords_np),
      tf.zeros_like(densepose_surface_coords_np)
  ]
  groundtruth_dict = {
      fields.BoxListFields.boxes: boxes,
      fields.BoxListFields.weights: weights,
      fields.BoxListFields.classes: classes,
      fields.BoxListFields.keypoints: keypoints,
      fields.BoxListFields.masks: masks,
      fields.BoxListFields.densepose_num_points: densepose_num_points,
      fields.BoxListFields.densepose_part_ids: densepose_part_ids,
      fields.BoxListFields.densepose_surface_coords: densepose_surface_coords,
      fields.InputDataFields.groundtruth_labeled_classes: labeled_classes,
  }
  return groundtruth_dict
...
...
research/object_detection/meta_architectures/context_rcnn_meta_arch_tf1_test.py  (view file @ e2385734)

@@ -20,8 +20,8 @@ from __future__ import print_function
import functools
import unittest
from unittest import mock  # pylint: disable=g-importing-member
from absl.testing import parameterized
import tensorflow.compat.v1 as tf
import tf_slim as slim

(the standalone `import mock` that followed the absl.testing import is removed in favor of `from unittest import mock`)
...
...
research/object_detection/metrics/coco_evaluation.py  (view file @ e2385734)

@@ -432,14 +432,9 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
    return eval_metric_ops


def convert_masks_to_binary(masks):
  """Converts masks to 0 or 1 and uint8 type."""
  return (masks > 0).astype(np.uint8)

(replaces _check_mask_type_and_value(array_name, masks), which raised a
ValueError when the mask dtype was not np.uint8 or when any element was
neither 0 nor 1)


class CocoKeypointEvaluator(CocoDetectionEvaluator):
...
...
@@ -952,9 +947,8 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
    groundtruth_instance_masks = groundtruth_dict[
        standard_fields.InputDataFields.groundtruth_instance_masks]
    groundtruth_instance_masks = convert_masks_to_binary(
        groundtruth_instance_masks)
    self._groundtruth_list.extend(
        coco_tools.ExportSingleImageGroundtruthToCoco(

...

@@ -1013,9 +1007,7 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
              'are incompatible: {} vs {}'.format(groundtruth_masks_shape,
                                                  detection_masks.shape))
    detection_masks = convert_masks_to_binary(detection_masks)
    self._detection_masks_list.extend(
        coco_tools.ExportSingleImageDetectionMasksToCoco(
            image_id=image_id,

(in both hunks, the previous _check_mask_type_and_value(...) calls are replaced by convert_masks_to_binary(...))
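For reference, the new behavior simply treats any positive value as foreground; a tiny numpy sketch (toy values):

import numpy as np

masks = np.array([[0, 5], [1, 0]], dtype=np.uint8)
binary = (masks > 0).astype(np.uint8)  # same as convert_masks_to_binary(masks)
# binary == [[0, 1], [1, 0]]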
...
...
research/object_detection/metrics/coco_evaluation_test.py  (view file @ e2385734)

@@ -1424,14 +1424,16 @@ class CocoMaskEvaluationTest(tf.test.TestCase):
        image_id='image3',
        detections_dict={
            standard_fields.DetectionResultFields.detection_boxes:
                np.array([[25., 25., 50., 50.]]),
            standard_fields.DetectionResultFields.detection_scores:
                np.array([.8]),
            standard_fields.DetectionResultFields.detection_classes:
                np.array([1]),
            standard_fields.DetectionResultFields.detection_masks:
                # The value of 5 is equivalent to 1, since masks will be
                # thresholded and binarized before evaluation.
                np.pad(5 * np.ones([1, 25, 25], dtype=np.uint8),
                       ((0, 0), (10, 10), (10, 10)), mode='constant')
        })
    metrics = coco_evaluator.evaluate()
    self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP'], 1.0)

(the detection mask previously used np.ones([1, 25, 25], dtype=np.uint8) without the factor of 5)
...
...
research/object_detection/metrics/oid_challenge_evaluation_utils.py  (view file @ e2385734)

Every pandas `.as_matrix()` call in these dictionaries becomes `.to_numpy()`:

@@ -136,15 +136,15 @@ def build_groundtruth_dictionary(data, class_label_map):
  dictionary = {
      standard_fields.InputDataFields.groundtruth_boxes:
          data_location[['YMin', 'XMin', 'YMax', 'XMax']].to_numpy(),
      standard_fields.InputDataFields.groundtruth_classes:
          data_location['LabelName'].map(lambda x: class_label_map[x]
                                        ).to_numpy(),
      standard_fields.InputDataFields.groundtruth_group_of:
          data_location['IsGroupOf'].to_numpy().astype(int),
      standard_fields.InputDataFields.groundtruth_image_classes:
          data_labels['LabelName'].map(lambda x: class_label_map[x]
                                      ).to_numpy(),
  }
  if 'Mask' in data_location:

...

@@ -179,9 +179,9 @@ def build_predictions_dictionary(data, class_label_map):
  """
  dictionary = {
      standard_fields.DetectionResultFields.detection_classes:
          data['LabelName'].map(lambda x: class_label_map[x]).to_numpy(),
      standard_fields.DetectionResultFields.detection_scores:
          data['Score'].to_numpy()
  }
  if 'Mask' in data:

...

@@ -192,6 +192,6 @@ def build_predictions_dictionary(data, class_label_map):
  else:
    dictionary[standard_fields.DetectionResultFields.detection_boxes] = data[[
        'YMin', 'XMin', 'YMax', 'XMax']].to_numpy()

  return dictionary
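The as_matrix-to-to_numpy switch tracks the pandas API: DataFrame.as_matrix() was deprecated and later removed, and DataFrame.to_numpy() is its replacement, returning the same ndarray. A small sketch with made-up values:

import pandas as pd

df = pd.DataFrame({'YMin': [0.1], 'XMin': [0.2], 'YMax': [0.8], 'XMax': [0.9]})
boxes = df[['YMin', 'XMin', 'YMax', 'XMax']].to_numpy()  # ndarray of shape (1, 4)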
research/object_detection/metrics/oid_vrd_challenge_evaluation_utils.py  (view file @ e2385734)

As in the previous file, every `.as_matrix()` call becomes `.to_numpy()`:

@@ -53,16 +53,16 @@ def build_groundtruth_vrd_dictionary(data, class_label_map,
  boxes = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.vrd_box_data_type)
  boxes['subject'] = data_boxes[['YMin1', 'XMin1', 'YMax1',
                                 'XMax1']].to_numpy()
  boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2',
                                'XMax2']].to_numpy()

  labels = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.label_data_type)
  labels['subject'] = data_boxes['LabelName1'].map(
      lambda x: class_label_map[x]).to_numpy()
  labels['object'] = data_boxes['LabelName2'].map(
      lambda x: class_label_map[x]).to_numpy()
  labels['relation'] = data_boxes['RelationshipLabel'].map(
      lambda x: relationship_label_map[x]).to_numpy()

  return {
      standard_fields.InputDataFields.groundtruth_boxes:

...

@@ -71,7 +71,7 @@ def build_groundtruth_vrd_dictionary(data, class_label_map,
          labels,
      standard_fields.InputDataFields.groundtruth_image_classes:
          data_labels['LabelName'].map(lambda x: class_label_map[x])
          .to_numpy(),
  }

...

@@ -104,16 +104,16 @@ def build_predictions_vrd_dictionary(data, class_label_map,
  boxes = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.vrd_box_data_type)
  boxes['subject'] = data_boxes[['YMin1', 'XMin1', 'YMax1',
                                 'XMax1']].to_numpy()
  boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2',
                                'XMax2']].to_numpy()

  labels = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.label_data_type)
  labels['subject'] = data_boxes['LabelName1'].map(
      lambda x: class_label_map[x]).to_numpy()
  labels['object'] = data_boxes['LabelName2'].map(
      lambda x: class_label_map[x]).to_numpy()
  labels['relation'] = data_boxes['RelationshipLabel'].map(
      lambda x: relationship_label_map[x]).to_numpy()

  return {
      standard_fields.DetectionResultFields.detection_boxes:

...

@@ -121,5 +121,5 @@ def build_predictions_vrd_dictionary(data, class_label_map,
      standard_fields.DetectionResultFields.detection_classes:
          labels,
      standard_fields.DetectionResultFields.detection_scores:
          data_boxes['Score'].to_numpy()
  }
research/object_detection/model_lib.py
View file @
e2385734
...
...
@@ -43,7 +43,6 @@ from object_detection.utils import visualization_utils as vis_utils
# pylint: disable=g-import-not-at-top
try
:
from
tensorflow.contrib
import
learn
as
contrib_learn
from
tensorflow.contrib
import
tpu
as
contrib_tpu
except
ImportError
:
# TF 2.0 doesn't ship with contrib.
pass
...
...
@@ -94,6 +93,15 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
of groundtruth boxes per image..
'groundtruth_keypoints': [batch_size, num_boxes, num_keypoints, 2] float32
tensor of keypoints (if provided in groundtruth).
'groundtruth_dp_num_points_list': [batch_size, num_boxes] int32 tensor
with the number of DensePose points for each instance (if provided in
groundtruth).
'groundtruth_dp_part_ids_list': [batch_size, num_boxes,
max_sampled_points] int32 tensor with the part ids for each DensePose
sampled point (if provided in groundtruth).
'groundtruth_dp_surface_coords_list': [batch_size, num_boxes,
max_sampled_points, 4] containing the DensePose surface coordinates for
each sampled point (if provided in groundtruth).
'groundtruth_group_of': [batch_size, num_boxes] bool tensor indicating
group_of annotations (if provided in groundtruth).
'groundtruth_labeled_classes': [batch_size, num_classes] int64
...
...
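For readers skimming the new DensePose fields, a shape-only sketch (illustrative, not part of the commit; the sizes are made up) of arrays matching the documented tensors:

import numpy as np

batch_size, num_boxes, max_sampled_points = 2, 3, 5  # illustrative sizes only
dp_num_points = np.zeros((batch_size, num_boxes), dtype=np.int32)
dp_part_ids = np.zeros((batch_size, num_boxes, max_sampled_points), dtype=np.int32)
# Each DensePose surface coordinate is (y, x, v, u), hence the trailing 4.
dp_surface_coords = np.zeros(
    (batch_size, num_boxes, max_sampled_points, 4), dtype=np.float32)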
@@ -164,6 +172,21 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
    groundtruth[input_data_fields.groundtruth_labeled_classes] = tf.stack(
        labeled_classes)
+ if detection_model.groundtruth_has_field(
+     fields.BoxListFields.densepose_num_points):
+   groundtruth[input_data_fields.groundtruth_dp_num_points] = tf.stack(
+       detection_model.groundtruth_lists(
+           fields.BoxListFields.densepose_num_points))
+ if detection_model.groundtruth_has_field(fields.BoxListFields.densepose_part_ids):
+   groundtruth[input_data_fields.groundtruth_dp_part_ids] = tf.stack(
+       detection_model.groundtruth_lists(fields.BoxListFields.densepose_part_ids))
+ if detection_model.groundtruth_has_field(
+     fields.BoxListFields.densepose_surface_coords):
+   groundtruth[input_data_fields.groundtruth_dp_surface_coords] = tf.stack(
+       detection_model.groundtruth_lists(
+           fields.BoxListFields.densepose_surface_coords))
  groundtruth[input_data_fields.num_groundtruth_boxes] = (
      tf.tile([max_number_of_boxes], multiples=[groundtruth_boxes_shape[0]]))
  return groundtruth
...
...
@@ -219,6 +242,9 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
      fields.InputDataFields.groundtruth_boxes,
      fields.InputDataFields.groundtruth_keypoints,
      fields.InputDataFields.groundtruth_keypoint_visibilities,
+     fields.InputDataFields.groundtruth_dp_num_points,
+     fields.InputDataFields.groundtruth_dp_part_ids,
+     fields.InputDataFields.groundtruth_dp_surface_coords,
      fields.InputDataFields.groundtruth_group_of,
      fields.InputDataFields.groundtruth_difficult,
      fields.InputDataFields.groundtruth_is_crowd,
...
...
@@ -269,6 +295,18 @@ def provide_groundtruth(model, labels):
  if fields.InputDataFields.groundtruth_keypoint_visibilities in labels:
    gt_keypoint_visibilities_list = labels[
        fields.InputDataFields.groundtruth_keypoint_visibilities]
+ gt_dp_num_points_list = None
+ if fields.InputDataFields.groundtruth_dp_num_points in labels:
+   gt_dp_num_points_list = labels[
+       fields.InputDataFields.groundtruth_dp_num_points]
+ gt_dp_part_ids_list = None
+ if fields.InputDataFields.groundtruth_dp_part_ids in labels:
+   gt_dp_part_ids_list = labels[
+       fields.InputDataFields.groundtruth_dp_part_ids]
+ gt_dp_surface_coords_list = None
+ if fields.InputDataFields.groundtruth_dp_surface_coords in labels:
+   gt_dp_surface_coords_list = labels[
+       fields.InputDataFields.groundtruth_dp_surface_coords]
  gt_weights_list = None
  if fields.InputDataFields.groundtruth_weights in labels:
    gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
...
...
@@ -297,13 +335,16 @@ def provide_groundtruth(model, labels):
      groundtruth_masks_list=gt_masks_list,
      groundtruth_keypoints_list=gt_keypoints_list,
      groundtruth_keypoint_visibilities_list=gt_keypoint_visibilities_list,
+     groundtruth_dp_num_points_list=gt_dp_num_points_list,
+     groundtruth_dp_part_ids_list=gt_dp_part_ids_list,
+     groundtruth_dp_surface_coords_list=gt_dp_surface_coords_list,
      groundtruth_weights_list=gt_weights_list,
      groundtruth_is_crowd_list=gt_is_crowd_list,
      groundtruth_group_of_list=gt_group_of_list,
      groundtruth_area_list=gt_area_list)


-def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
+def create_model_fn(detection_model_fn, configs, hparams=None, use_tpu=False,
                     postprocess_on_cpu=False):
"""Creates a model function for `Estimator`.
...
...
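Because hparams now defaults to None, a TF1 caller with no overrides can omit it. A rough sketch, not from this commit ('pipeline.config' is a placeholder path; the helpers are the repo's existing config and model builders):

import functools

from object_detection import model_lib
from object_detection.builders import model_builder
from object_detection.utils import config_util

configs = config_util.get_configs_from_pipeline_file('pipeline.config')
detection_model_fn = functools.partial(
    model_builder.build, model_config=configs['model'])
model_fn = model_lib.create_model_fn(detection_model_fn, configs)  # hparams omitted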
@@ -377,7 +418,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
      side_inputs = detection_model.get_side_inputs(features)
    if use_tpu and train_config.use_bfloat16:
-     with contrib_tpu.bfloat16_scope():
+     with tf.tpu.bfloat16_scope():
        prediction_dict = detection_model.predict(
            preprocessed_images,
            features[fields.InputDataFields.true_image_shape],
            **side_inputs)
...
...
@@ -392,7 +433,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
    if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
      if use_tpu and postprocess_on_cpu:
-       detections = contrib_tpu.outside_compilation(
+       detections = tf.tpu.outside_compilation(
            postprocess_wrapper,
            (prediction_dict,
             features[fields.InputDataFields.true_image_shape]))
...
...
@@ -468,7 +509,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
    if mode == tf.estimator.ModeKeys.TRAIN:
      if use_tpu:
-       training_optimizer = contrib_tpu.CrossShardOptimizer(training_optimizer)
+       training_optimizer = tf.tpu.CrossShardOptimizer(training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
...
...
@@ -588,7 +629,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
    # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
    if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
-     return contrib_tpu.TPUEstimatorSpec(
+     return tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
...
...
@@ -619,8 +660,8 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False,
 def create_estimator_and_inputs(run_config,
-                                hparams,
-                                pipeline_config_path,
+                                hparams=None,
+                                pipeline_config_path=None,
                                 config_override=None,
                                 train_steps=None,
                                 sample_1_of_n_eval_examples=1,
...
...
@@ -639,7 +680,7 @@ def create_estimator_and_inputs(run_config,
   Args:
     run_config: A `RunConfig`.
-    hparams: A `HParams`.
+    hparams: (optional) A `HParams`.
     pipeline_config_path: A path to a pipeline config file.
     config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
       override the config from `pipeline_config_path`.
...
...
@@ -762,14 +803,14 @@ def create_estimator_and_inputs(run_config,
        model_config=model_config,
        predict_input_config=eval_input_configs[0])

  # Read export_to_tpu from hparams if not passed.
- if export_to_tpu is None:
+ if export_to_tpu is None and hparams is not None:
    export_to_tpu = hparams.get('export_to_tpu', False)
  tf.logging.info('create_estimator_and_inputs: use_tpu %s, export_to_tpu %s',
                  use_tpu, export_to_tpu)
  model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu,
                              postprocess_on_cpu)
  if use_tpu_estimator:
-   estimator = contrib_tpu.TPUEstimator(
+   estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn,
        train_batch_size=train_config.batch_size,
        # For each core, only batch size 1 is supported for eval.
...
...
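The pattern across the model_lib.py hunks above is a mechanical move off tensorflow.contrib, which does not exist in TF 2.x. Collected in one place, as a plain mapping that only restates the renames shown above and adds no new API:

# tensorflow.contrib symbol used before -> symbol used after this commit
CONTRIB_TO_CORE = {
    'contrib_tpu.bfloat16_scope': 'tf.tpu.bfloat16_scope',
    'contrib_tpu.outside_compilation': 'tf.tpu.outside_compilation',
    'contrib_tpu.CrossShardOptimizer': 'tf.tpu.CrossShardOptimizer',
    'contrib_tpu.TPUEstimatorSpec': 'tf.estimator.tpu.TPUEstimatorSpec',
    'contrib_tpu.TPUEstimator': 'tf.estimator.tpu.TPUEstimator',
}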
research/object_detection/model_lib_v2.py
View file @
e2385734
...
...
@@ -93,6 +93,12 @@ def _compute_losses_and_predictions_dicts(
instance masks for objects.
labels[fields.InputDataFields.groundtruth_keypoints] is a
float32 tensor containing keypoints for each box.
labels[fields.InputDataFields.groundtruth_dp_num_points] is an int32
tensor with the number of sampled DensePose points per object.
labels[fields.InputDataFields.groundtruth_dp_part_ids] is an int32
tensor with the DensePose part ids (0-indexed) per object.
labels[fields.InputDataFields.groundtruth_dp_surface_coords] is a
float32 tensor with the DensePose surface coordinates.
labels[fields.InputDataFields.groundtruth_group_of] is a tf.bool tensor
containing group_of annotations.
labels[fields.InputDataFields.groundtruth_labeled_classes] is a float32
...
...
@@ -195,6 +201,17 @@ def eager_train_step(detection_model,
labels[fields.InputDataFields.groundtruth_keypoints] is a
[batch_size, num_boxes, num_keypoints, 2] float32 tensor containing
keypoints for each box.
labels[fields.InputDataFields.groundtruth_dp_num_points] is a
[batch_size, num_boxes] int32 tensor with the number of DensePose
sampled points per instance.
labels[fields.InputDataFields.groundtruth_dp_part_ids] is a
[batch_size, num_boxes, max_sampled_points] int32 tensor with the
part ids (0-indexed) for each instance.
labels[fields.InputDataFields.groundtruth_dp_surface_coords] is a
[batch_size, num_boxes, max_sampled_points, 4] float32 tensor with the
surface coordinates for each point. Each surface coordinate is of the
form (y, x, v, u) where (y, x) are normalized image locations and
(v, u) are part-relative normalized surface coordinates.
labels[fields.InputDataFields.groundtruth_labeled_classes] is a float32
k-hot tensor of classes.
unpad_groundtruth_tensors: A parameter passed to unstack_batch.
...
...
@@ -767,7 +784,16 @@ def eager_eval_loop(
            name='eval_side_by_side_' + str(i),
            step=global_step,
            data=sbys_images,
-           max_outputs=1)
+           max_outputs=eval_config.num_visualizations)
+       if eval_util.has_densepose(eval_dict):
+         dp_image_list = vutils.draw_densepose_visualizations(eval_dict)
+         dp_images = tf.concat(dp_image_list, axis=0)
+         tf.compat.v2.summary.image(
+             name='densepose_detections_' + str(i),
+             step=global_step,
+             data=dp_images,
+             max_outputs=eval_config.num_visualizations)
  if evaluators is None:
    if class_agnostic:
...
...
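A standalone sketch of the TF2 summary pattern the new DensePose branch follows (illustrative only; the log directory and the zero-filled image batch are placeholders):

import tensorflow.compat.v2 as tf

writer = tf.summary.create_file_writer('/tmp/eval_summaries')  # placeholder logdir
dp_images = tf.zeros([4, 64, 64, 3], dtype=tf.float32)  # stand-in visualizations
with writer.as_default():
  tf.summary.image(name='densepose_detections_0', data=dp_images,
                   step=0, max_outputs=4)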
research/object_detection/model_main.py
View file @
e2385734
...
...
@@ -22,7 +22,6 @@ from absl import flags
 import tensorflow.compat.v1 as tf

-from object_detection import model_hparams
 from object_detection import model_lib

 flags.DEFINE_string(
...
...
@@ -41,10 +40,6 @@ flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
                     'one of every n train input examples for evaluation, '
                     'where n is provided. This is only used if '
                     '`eval_training_data` is True.')
-flags.DEFINE_string(
-    'hparams_overrides', None, 'Hyperparameter overrides, '
-    'represented as a string containing comma-separated '
-    'hparam_name=value pairs.')
 flags.DEFINE_string(
     'checkpoint_dir', None, 'Path to directory holding a checkpoint.  If '
     '`checkpoint_dir` is provided, this binary operates in eval-only mode, '
...
...
@@ -68,7 +63,6 @@ def main(unused_argv):
   train_and_eval_dict = model_lib.create_estimator_and_inputs(
       run_config=config,
-      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
       pipeline_config_path=FLAGS.pipeline_config_path,
       train_steps=FLAGS.num_train_steps,
       sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
...
...
research/object_detection/model_main_tf2.py
View file @
e2385734
...
...
@@ -54,6 +54,10 @@ flags.DEFINE_integer('eval_timeout', 3600, 'Number of seconds to wait for an'
                     'evaluation checkpoint before exiting.')

 flags.DEFINE_bool('use_tpu', False, 'Whether the job is executing on a TPU.')
+flags.DEFINE_string(
+    'tpu_name',
+    default=None,
+    help='Name of the Cloud TPU for Cluster Resolvers.')
 flags.DEFINE_integer('num_workers', 1, 'When num_workers > 1, training uses '
                      'MultiWorkerMirroredStrategy. When num_workers = 1 it uses '
...
...
@@ -79,7 +83,10 @@ def main(unused_argv):
        wait_interval=300, timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
-     resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
+     # TPU is automatically inferred if tpu_name is None and
+     # we are running under cloud ai-platform.
+     resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
+         FLAGS.tpu_name)
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
...
...
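For context, the TF2 TPU bring-up sequence this hunk switches to, as a self-contained sketch (requires a real TPU; 'my-tpu' is a placeholder name, and on Cloud AI Platform the name can be None, as the new comment notes):

import tensorflow.compat.v2 as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver('my-tpu')
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])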
research/object_detection/model_tpu_main.py
View file @
e2385734
...
...
@@ -26,18 +26,8 @@ from absl import flags
 import tensorflow.compat.v1 as tf

-from object_detection import model_hparams
 from object_detection import model_lib

-# pylint: disable=g-import-not-at-top
-try:
-  from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver
-  from tensorflow.contrib import tpu as contrib_tpu
-except ImportError:
-  # TF 2.0 doesn't ship with contrib.
-  pass
-# pylint: enable=g-import-not-at-top

 tf.flags.DEFINE_bool('use_tpu', True, 'Use TPUs rather than plain CPUs')

 # Cloud TPU Cluster Resolvers
...
...
@@ -67,10 +57,6 @@ flags.DEFINE_string('mode', 'train',
 flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If '
                      'this is not provided, batch size is read from training '
                      'config.')
-flags.DEFINE_string('hparams_overrides', None, 'Comma-separated list of '
-                    'hyperparameters to override defaults.')
 flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
 flags.DEFINE_boolean('eval_training_data', False,
                      'If training data should be evaluated for this job.')
...
...
@@ -99,15 +85,15 @@ def main(unused_argv):
  flags.mark_flag_as_required('pipeline_config_path')

  tpu_cluster_resolver = (
-     contrib_cluster_resolver.TPUClusterResolver(
+     tf.distribute.cluster_resolver.TPUClusterResolver(
          tpu=[FLAGS.tpu_name], zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
  tpu_grpc_url = tpu_cluster_resolver.get_master()

- config = contrib_tpu.RunConfig(
+ config = tf.estimator.tpu.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
-     tpu_config=contrib_tpu.TPUConfig(
+     tpu_config=tf.estimator.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))
...
...
@@ -117,7 +103,6 @@ def main(unused_argv):
   train_and_eval_dict = model_lib.create_estimator_and_inputs(
       run_config=config,
-      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
       pipeline_config_path=FLAGS.pipeline_config_path,
       train_steps=FLAGS.num_train_steps,
       sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
...
...
research/object_detection/models/ssd_efficientnet_bifpn_feature_extractor.py
0 → 100644
View file @
e2385734
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD Keras-based EfficientNet + BiFPN (EfficientDet) Feature Extractor."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import logging
from six.moves import range
from six.moves import zip
import tensorflow.compat.v2 as tf

from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import bidirectional_feature_pyramid_generators as bifpn_generators
from object_detection.utils import ops
from object_detection.utils import shape_utils
from object_detection.utils import tf_version

# pylint: disable=g-import-not-at-top
if tf_version.is_tf2():
  from official.vision.image_classification.efficientnet import efficientnet_model

_EFFICIENTNET_LEVEL_ENDPOINTS = {
    1: 'stack_0/block_0/project_bn',
    2: 'stack_1/block_1/add',
    3: 'stack_2/block_1/add',
    4: 'stack_4/block_2/add',
    5: 'stack_6/block_0/project_bn',
}
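These endpoints follow the usual stride = 2**level convention, so each level halves the spatial resolution; the shape assertions in the TF2 test further down (levels 3-7 on a 256x256 input) are consistent with this. A quick arithmetic check, for illustration only:

for level in range(1, 6):
  print(level, 256 // 2**level)  # level 1 -> 128, 2 -> 64, 3 -> 32, 4 -> 16, 5 -> 8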
class SSDEfficientNetBiFPNKerasFeatureExtractor(
    ssd_meta_arch.SSDKerasFeatureExtractor):
"""SSD Keras-based EfficientNetBiFPN (EfficientDet) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level,
               bifpn_max_level,
               bifpn_num_iterations,
               bifpn_num_filters,
               bifpn_combine_method,
               efficientnet_version,
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name=None):
"""SSD Keras-based EfficientNetBiFPN (EfficientDet) feature extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
efficientnet_version: the EfficientNet version to use for this feature
extractor's backbone.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetBiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        use_explicit_padding=None,
        use_depthwise=None,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)
    if depth_multiplier != 1.0:
      raise ValueError('EfficientNetBiFPN does not support a non-default '
                       'depth_multiplier.')
    if use_explicit_padding:
      raise ValueError('EfficientNetBiFPN does not support explicit padding.')
    if use_depthwise:
      raise ValueError('EfficientNetBiFPN does not support use_depthwise.')
    if override_base_feature_extractor_hyperparams:
      raise ValueError('EfficientNetBiFPN does not support '
                       'override_base_feature_extractor_hyperparams.')

    self._bifpn_min_level = bifpn_min_level
    self._bifpn_max_level = bifpn_max_level
    self._bifpn_num_iterations = bifpn_num_iterations
    self._bifpn_num_filters = max(bifpn_num_filters, min_depth)
    self._bifpn_node_params = {'combine_method': bifpn_combine_method}
    self._efficientnet_version = efficientnet_version

    logging.info('EfficientDet EfficientNet backbone version: %s',
                 self._efficientnet_version)
    logging.info('EfficientDet BiFPN num filters: %d', self._bifpn_num_filters)
    logging.info('EfficientDet BiFPN num iterations: %d',
                 self._bifpn_num_iterations)

    self._backbone_max_level = min(
        max(_EFFICIENTNET_LEVEL_ENDPOINTS.keys()), bifpn_max_level)
    self._output_layer_names = [
        _EFFICIENTNET_LEVEL_ENDPOINTS[i]
        for i in range(bifpn_min_level, self._backbone_max_level + 1)]
    self._output_layer_alias = [
        'level_{}'.format(i)
        for i in range(bifpn_min_level, self._backbone_max_level + 1)]

    # Initialize the EfficientNet backbone.
    # Note, this is currently done in the init method rather than in the build
    # method, since doing so introduces an error which is not well understood.
    efficientnet_base = efficientnet_model.EfficientNet.from_name(
        model_name=self._efficientnet_version,
        overrides={'rescale_input': False})
    outputs = [efficientnet_base.get_layer(output_layer_name).output
               for output_layer_name in self._output_layer_names]
    self._efficientnet = tf.keras.Model(
        inputs=efficientnet_base.inputs, outputs=outputs)
    self.classification_backbone = efficientnet_base
    self._bifpn_stage = None
  def build(self, input_shape):
    self._bifpn_stage = bifpn_generators.KerasBiFpnFeatureMaps(
        bifpn_num_iterations=self._bifpn_num_iterations,
        bifpn_num_filters=self._bifpn_num_filters,
        fpn_min_level=self._bifpn_min_level,
        fpn_max_level=self._bifpn_max_level,
        input_max_level=self._backbone_max_level,
        is_training=self._is_training,
        conv_hyperparams=self._conv_hyperparams,
        freeze_batchnorm=self._freeze_batchnorm,
        bifpn_node_params=self._bifpn_node_params,
        name='bifpn')
    self.built = True

  def preprocess(self, inputs):
"""SSD preprocessing.
Channel-wise mean subtraction and scaling.
Args:
inputs: a [batch, height, width, channels] float tensor representing a
batch of images.
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
"""
    if inputs.shape.as_list()[3] == 3:
      # Input images are expected to be in the range [0, 255].
      channel_offset = [0.485, 0.456, 0.406]
      channel_scale = [0.229, 0.224, 0.225]
      return ((inputs / 255.0) - [[channel_offset]]) / [[channel_scale]]
    else:
      return inputs
  def _extract_features(self, preprocessed_inputs):
"""Extract features from preprocessed inputs.
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
"""
    preprocessed_inputs = shape_utils.check_min_image_dim(
        129, preprocessed_inputs)

    base_feature_maps = self._efficientnet(
        ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple))

    output_feature_map_dict = self._bifpn_stage(
        list(zip(self._output_layer_alias, base_feature_maps)))

    return list(output_feature_map_dict.values())
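A hedged usage sketch of the base extractor, mirroring _create_feature_extractor in the TF2 test further down; conv_hyperparams is assumed to be a hyperparams_builder.KerasLayerHyperparams built as in that test and is not constructed here:

import numpy as np

extractor = SSDEfficientNetBiFPNKerasFeatureExtractor(
    is_training=True, depth_multiplier=1.0, min_depth=16, pad_to_multiple=1,
    conv_hyperparams=conv_hyperparams,  # assumed to exist, see note above
    freeze_batchnorm=False, inplace_batchnorm_update=False,
    bifpn_min_level=3, bifpn_max_level=7, bifpn_num_iterations=3,
    bifpn_num_filters=64, bifpn_combine_method='fast_attention',
    efficientnet_version='efficientnet-b0')
feature_maps = extractor(np.zeros((2, 256, 256, 3), dtype=np.float32))
# Five maps for BiFPN levels 3..7; spatial sizes 32, 16, 8, 4, 2 for this input.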
class SSDEfficientNetB0BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b0 BiFPN (EfficientDet-d0) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=3,
               bifpn_num_filters=64,
               bifpn_combine_method='fast_attention',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientDet-D0'):
"""SSD Keras EfficientNet-b0 BiFPN (EfficientDet-d0) Feature Extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB0BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b0',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)


class SSDEfficientNetB1BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b1 BiFPN (EfficientDet-d1) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=4,
               bifpn_num_filters=88,
               bifpn_combine_method='fast_attention',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientDet-D1'):
"""SSD Keras EfficientNet-b1 BiFPN (EfficientDet-d1) Feature Extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB1BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b1',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)


class SSDEfficientNetB2BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b2 BiFPN (EfficientDet-d2) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=5,
               bifpn_num_filters=112,
               bifpn_combine_method='fast_attention',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientDet-D2'):
"""SSD Keras EfficientNet-b2 BiFPN (EfficientDet-d2) Feature Extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB2BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b2',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)


class SSDEfficientNetB3BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b3 BiFPN (EfficientDet-d3) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=6,
               bifpn_num_filters=160,
               bifpn_combine_method='fast_attention',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientDet-D3'):
"""SSD Keras EfficientNet-b3 BiFPN (EfficientDet-d3) Feature Extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB3BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b3',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)


class SSDEfficientNetB4BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b4 BiFPN (EfficientDet-d4) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=7,
               bifpn_num_filters=224,
               bifpn_combine_method='fast_attention',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientDet-D4'):
"""SSD Keras EfficientNet-b4 BiFPN (EfficientDet-d4) Feature Extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB4BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b4',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)


class SSDEfficientNetB5BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b5 BiFPN (EfficientDet-d5) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=7,
               bifpn_num_filters=288,
               bifpn_combine_method='fast_attention',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientDet-D5'):
"""SSD Keras EfficientNet-b5 BiFPN (EfficientDet-d5) Feature Extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB5BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b5',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)


class SSDEfficientNetB6BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b6 BiFPN (EfficientDet-d[6,7]) Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=8,
               bifpn_num_filters=384,
               bifpn_combine_method='sum',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientDet-D6-D7'):
"""SSD Keras EfficientNet-b6 BiFPN (EfficientDet-d[6,7]) Feature Extractor.
SSD Keras EfficientNet-b6 BiFPN Feature Extractor, a.k.a. EfficientDet-d6
and EfficientDet-d7. The EfficientDet-d[6,7] models use the same backbone
EfficientNet-b6 and the same BiFPN architecture, and therefore have the same
number of parameters. They only differ in their input resolutions.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB6BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b6',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)


class SSDEfficientNetB7BiFPNKerasFeatureExtractor(
    SSDEfficientNetBiFPNKerasFeatureExtractor):
"""SSD Keras EfficientNet-b7 BiFPN Feature Extractor."""
  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               freeze_batchnorm,
               inplace_batchnorm_update,
               bifpn_min_level=3,
               bifpn_max_level=7,
               bifpn_num_iterations=8,
               bifpn_num_filters=384,
               bifpn_combine_method='sum',
               use_explicit_padding=None,
               use_depthwise=None,
               override_base_feature_extractor_hyperparams=None,
               name='EfficientNet-B7_BiFPN'):
"""SSD Keras EfficientNet-b7 BiFPN Feature Extractor.
Args:
is_training: whether the network is in training mode.
depth_multiplier: unsupported by EfficientNetBiFPN. float, depth
multiplier for the feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: a `hyperparams_builder.KerasLayerHyperparams` object
containing convolution hyperparameters for the layers added on top of
the base feature extractor.
freeze_batchnorm: whether to freeze batch norm parameters during training
or not. When training with a small batch size (e.g. 1), it is desirable
to freeze batch norm update and use pretrained batch norm params.
inplace_batchnorm_update: whether to update batch norm moving average
values inplace. When this is false train op must add a control
dependency on tf.graphkeys.UPDATE_OPS collection in order to update
batch norm statistics.
bifpn_min_level: the highest resolution feature map to use in BiFPN. The
valid values are {2, 3, 4, 5} which map to Resnet blocks {1, 2, 3, 4}
respectively.
bifpn_max_level: the smallest resolution feature map to use in the BiFPN.
BiFPN constructions uses features maps starting from bifpn_min_level
upto the bifpn_max_level. In the case that there are not enough feature
maps in the backbone network, additional feature maps are created by
applying stride 2 convolutions until we get the desired number of BiFPN
levels.
bifpn_num_iterations: number of BiFPN iterations. Overrided if
efficientdet_version is provided.
bifpn_num_filters: number of filters (channels) in all BiFPN layers.
Overrided if efficientdet_version is provided.
bifpn_combine_method: the method used to combine BiFPN nodes.
use_explicit_padding: unsupported by EfficientNetBiFPN. Whether to use
explicit padding when extracting features.
use_depthwise: unsupported by EfficientNetBiFPN, since BiFPN uses regular
convolutions when inputs to a node have a differing number of channels,
and use separable convolutions after combine operations.
override_base_feature_extractor_hyperparams: unsupported. Whether to
override hyperparameters of the base feature extractor with the one from
`conv_hyperparams`.
name: a string name scope to assign to the model. If 'None', Keras will
auto-generate one from the class name.
"""
    super(SSDEfficientNetB7BiFPNKerasFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams=conv_hyperparams,
        freeze_batchnorm=freeze_batchnorm,
        inplace_batchnorm_update=inplace_batchnorm_update,
        bifpn_min_level=bifpn_min_level,
        bifpn_max_level=bifpn_max_level,
        bifpn_num_iterations=bifpn_num_iterations,
        bifpn_num_filters=bifpn_num_filters,
        bifpn_combine_method=bifpn_combine_method,
        efficientnet_version='efficientnet-b7',
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=override_base_feature_extractor_hyperparams,
        name=name)
research/object_detection/models/ssd_efficientnet_bifpn_feature_extractor_tf2_test.py
0 → 100644
View file @
e2385734
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the ssd_efficientnet_bifpn_feature_extractor."""
import unittest
from absl.testing import parameterized
import numpy as np
import tensorflow.compat.v2 as tf

from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.models import ssd_efficientnet_bifpn_feature_extractor
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
from object_detection.utils import tf_version


def _count_params(model, trainable_only=True):
  """Returns the count of all model parameters, or just trainable ones."""
  if not trainable_only:
    return model.count_params()
  else:
    return int(np.sum([
        tf.keras.backend.count_params(p) for p in model.trainable_weights]))
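A small usage check of the helper above on a throwaway Keras model (illustrative only; it reuses the tf import from this file and assumes _count_params is in scope):

model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
print(_count_params(model))  # 3 * 4 weights + 4 biases = 16 trainable parameters
print(_count_params(model, trainable_only=False))  # same here, nothing is frozen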
@parameterized.parameters(
    {'efficientdet_version': 'efficientdet-d0',
     'efficientnet_version': 'efficientnet-b0',
     'bifpn_num_iterations': 3,
     'bifpn_num_filters': 64,
     'bifpn_combine_method': 'fast_attention'},
    {'efficientdet_version': 'efficientdet-d1',
     'efficientnet_version': 'efficientnet-b1',
     'bifpn_num_iterations': 4,
     'bifpn_num_filters': 88,
     'bifpn_combine_method': 'fast_attention'},
    {'efficientdet_version': 'efficientdet-d2',
     'efficientnet_version': 'efficientnet-b2',
     'bifpn_num_iterations': 5,
     'bifpn_num_filters': 112,
     'bifpn_combine_method': 'fast_attention'},
    {'efficientdet_version': 'efficientdet-d3',
     'efficientnet_version': 'efficientnet-b3',
     'bifpn_num_iterations': 6,
     'bifpn_num_filters': 160,
     'bifpn_combine_method': 'fast_attention'},
    {'efficientdet_version': 'efficientdet-d4',
     'efficientnet_version': 'efficientnet-b4',
     'bifpn_num_iterations': 7,
     'bifpn_num_filters': 224,
     'bifpn_combine_method': 'fast_attention'},
    {'efficientdet_version': 'efficientdet-d5',
     'efficientnet_version': 'efficientnet-b5',
     'bifpn_num_iterations': 7,
     'bifpn_num_filters': 288,
     'bifpn_combine_method': 'fast_attention'},
    # efficientdet-d6 and efficientdet-d7 only differ in input size.
    {'efficientdet_version': 'efficientdet-d6-d7',
     'efficientnet_version': 'efficientnet-b6',
     'bifpn_num_iterations': 8,
     'bifpn_num_filters': 384,
     'bifpn_combine_method': 'sum'})
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class SSDEfficientNetBiFPNFeatureExtractorTest(
    test_case.TestCase, parameterized.TestCase):
_build_conv_hyperparams
(
self
,
add_batch_norm
=
True
):
conv_hyperparams
=
hyperparams_pb2
.
Hyperparams
()
conv_hyperparams_text_proto
=
"""
force_use_bias: true
activation: SWISH
regularizer {
l2_regularizer {
weight: 0.0004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
"""
if
add_batch_norm
:
batch_norm_proto
=
"""
batch_norm {
scale: true,
decay: 0.99,
epsilon: 0.001,
}
"""
conv_hyperparams_text_proto
+=
batch_norm_proto
text_format
.
Merge
(
conv_hyperparams_text_proto
,
conv_hyperparams
)
return
hyperparams_builder
.
KerasLayerHyperparams
(
conv_hyperparams
)
def
_create_feature_extractor
(
self
,
efficientnet_version
=
'efficientnet-b0'
,
bifpn_num_iterations
=
3
,
bifpn_num_filters
=
64
,
bifpn_combine_method
=
'fast_attention'
):
"""Constructs a new EfficientNetBiFPN feature extractor."""
depth_multiplier
=
1.0
pad_to_multiple
=
1
min_depth
=
16
return
(
ssd_efficientnet_bifpn_feature_extractor
.
SSDEfficientNetBiFPNKerasFeatureExtractor
(
is_training
=
True
,
depth_multiplier
=
depth_multiplier
,
min_depth
=
min_depth
,
pad_to_multiple
=
pad_to_multiple
,
conv_hyperparams
=
self
.
_build_conv_hyperparams
(),
freeze_batchnorm
=
False
,
inplace_batchnorm_update
=
False
,
bifpn_min_level
=
3
,
bifpn_max_level
=
7
,
bifpn_num_iterations
=
bifpn_num_iterations
,
bifpn_num_filters
=
bifpn_num_filters
,
bifpn_combine_method
=
bifpn_combine_method
,
efficientnet_version
=
efficientnet_version
))
def
test_efficientdet_feature_extractor_shapes
(
self
,
efficientdet_version
,
efficientnet_version
,
bifpn_num_iterations
,
bifpn_num_filters
,
bifpn_combine_method
):
feature_extractor
=
self
.
_create_feature_extractor
(
efficientnet_version
=
efficientnet_version
,
bifpn_num_iterations
=
bifpn_num_iterations
,
bifpn_num_filters
=
bifpn_num_filters
,
bifpn_combine_method
=
bifpn_combine_method
)
outputs
=
feature_extractor
(
np
.
zeros
((
2
,
256
,
256
,
3
),
dtype
=
np
.
float32
))
self
.
assertEqual
(
outputs
[
0
].
shape
,
(
2
,
32
,
32
,
bifpn_num_filters
))
self
.
assertEqual
(
outputs
[
1
].
shape
,
(
2
,
16
,
16
,
bifpn_num_filters
))
self
.
assertEqual
(
outputs
[
2
].
shape
,
(
2
,
8
,
8
,
bifpn_num_filters
))
self
.
assertEqual
(
outputs
[
3
].
shape
,
(
2
,
4
,
4
,
bifpn_num_filters
))
self
.
assertEqual
(
outputs
[
4
].
shape
,
(
2
,
2
,
2
,
bifpn_num_filters
))
def
test_efficientdet_feature_extractor_params
(
self
,
efficientdet_version
,
efficientnet_version
,
bifpn_num_iterations
,
bifpn_num_filters
,
bifpn_combine_method
):
feature_extractor
=
self
.
_create_feature_extractor
(
efficientnet_version
=
efficientnet_version
,
bifpn_num_iterations
=
bifpn_num_iterations
,
bifpn_num_filters
=
bifpn_num_filters
,
bifpn_combine_method
=
bifpn_combine_method
)
_
=
feature_extractor
(
np
.
zeros
((
2
,
256
,
256
,
3
),
dtype
=
np
.
float32
))
expected_params
=
{
'efficientdet-d0'
:
5484829
,
'efficientdet-d1'
:
8185156
,
'efficientdet-d2'
:
9818153
,
'efficientdet-d3'
:
13792706
,
'efficientdet-d4'
:
22691445
,
'efficientdet-d5'
:
35795677
,
'efficientdet-d6-d7'
:
53624512
,
}
num_params
=
_count_params
(
feature_extractor
)
self
.
assertEqual
(
expected_params
[
efficientdet_version
],
num_params
)
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
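For reference, a small self-contained sketch of what the _count_params helper above reports, using a toy Keras model whose layer sizes are arbitrary assumptions: freezing a layer removes its weights from the trainable count while the total count is unchanged.

import numpy as np
import tensorflow.compat.v2 as tf

# Toy two-layer model; the sizes (8 -> 4 -> 2) are illustrative only.
toy_model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, input_shape=(8,)),   # 8*4 + 4 = 36 parameters
    tf.keras.layers.Dense(2),                     # 4*2 + 2 = 10 parameters
])
toy_model.layers[0].trainable = False             # freeze the first layer

total = toy_model.count_params()
trainable = int(np.sum(
    [tf.keras.backend.count_params(p) for p in toy_model.trainable_weights]))
print(total, trainable)  # expected: 46 10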
research/object_detection/packages/tf1/setup.py
0 → 100644
"""Setup script for object_detection with TF1.0."""
import os
from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = ['apache-beam', 'pillow', 'lxml', 'matplotlib', 'Cython',
                     'contextlib2', 'tf-slim', 'six', 'pycocotools', 'scipy',
                     'pandas']

setup(
    name='object_detection',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    include_package_data=True,
    packages=(
        [p for p in find_packages() if p.startswith('object_detection')] +
        find_packages(where=os.path.join('.', 'slim'))),
    package_dir={
        'datasets': os.path.join('slim', 'datasets'),
        'nets': os.path.join('slim', 'nets'),
        'preprocessing': os.path.join('slim', 'preprocessing'),
        'deployment': os.path.join('slim', 'deployment'),
        'scripts': os.path.join('slim', 'scripts'),
    },
    description='Tensorflow Object Detection Library with TF1.0',
    python_requires='>3.6',
)
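As a quick sanity check, assuming the object_detection package has been installed using this setup script, the package_dir mapping above is what makes the slim modules importable under their short names alongside object_detection; a minimal sketch:

import importlib

# Module names taken from the package_dir mapping in the setup call above.
for module_name in ('object_detection', 'nets', 'datasets', 'preprocessing'):
  module = importlib.import_module(module_name)
  print(module_name, '->', module.__file__)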
research/object_detection/packages/tf2/setup.py
0 → 100644
"""Setup script for object_detection with TF2.0."""
import os
from setuptools import find_packages
from setuptools import setup

# Note: adding apache-beam to the required packages causes a conflict with the
# tf-models-official requirements, because the two request incompatible
# versions of the oauth2client package.
REQUIRED_PACKAGES = ['pillow', 'lxml', 'matplotlib', 'Cython', 'contextlib2',
                     'tf-slim', 'six', 'pycocotools', 'scipy', 'pandas',
                     'tf-models-official']

setup(
    name='object_detection',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    include_package_data=True,
    packages=(
        [p for p in find_packages() if p.startswith('object_detection')] +
        find_packages(where=os.path.join('.', 'slim'))),
    package_dir={
        'datasets': os.path.join('slim', 'datasets'),
        'nets': os.path.join('slim', 'nets'),
        'preprocessing': os.path.join('slim', 'preprocessing'),
        'deployment': os.path.join('slim', 'deployment'),
        'scripts': os.path.join('slim', 'scripts'),
    },
    description='Tensorflow Object Detection Library',
    python_requires='>3.6',
)
research/object_detection/protos/center_net.proto
...
...
@@ -183,6 +183,41 @@ message CenterNet {
    optional float heatmap_bias_init = 3 [default = -2.19];
  }
  optional MaskEstimation mask_estimation_task = 8;

  // Parameters which are related to the DensePose estimation task.
  // http://densepose.org/
  message DensePoseEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Class ID (0-indexed) that corresponds to the object in the label map
    // that contains DensePose data.
    optional int32 class_id = 2;

    // Loss configuration for DensePose heatmap and regression losses. Note
    // that the localization loss is used for surface coordinate losses and
    // classification loss is used for part classification losses.
    optional Loss loss = 3;

    // The number of body parts.
    optional int32 num_parts = 4 [default = 24];

    // Loss weights for the two DensePose heads.
    optional float part_loss_weight = 5 [default = 1.0];
    optional float coordinate_loss_weight = 6 [default = 1.0];

    // Whether to upsample the prediction feature maps back to the original
    // input dimension prior to applying the loss. This has the benefit of
    // maintaining finer groundtruth location information.
    optional bool upsample_to_input_res = 7 [default = true];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 8 [default = -2.19];
  }
  optional DensePoseEstimation densepose_estimation_task = 9;
}

message CenterNetFeatureExtractor {
...
...
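A minimal sketch of how the new densepose_estimation_task message might be filled in via the protobuf text format, assuming the compiled center_net_pb2 module is importable; the field values here are illustrative assumptions, not a recommended configuration:

from google.protobuf import text_format
from object_detection.protos import center_net_pb2

densepose_text = """
  densepose_estimation_task {
    task_loss_weight: 1.0
    class_id: 0
    num_parts: 24
    part_loss_weight: 1.0
    coordinate_loss_weight: 1.0
    upsample_to_input_res: true
    heatmap_bias_init: -2.19
  }
"""
# Merge the snippet into an otherwise-empty CenterNet config and read it back.
config = text_format.Merge(densepose_text, center_net_pb2.CenterNet())
print(config.densepose_estimation_task.num_parts)  # -> 24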
research/object_detection/protos/preprocessor.proto
...
...
@@ -4,7 +4,7 @@ package object_detection.protos;
// Message for defining a preprocessing operation on input data.
// See: //third_party/tensorflow_models/object_detection/core/preprocessor.py
-// Next ID: 38
+// Next ID: 39
message PreprocessingStep {
  oneof preprocessing_step {
    NormalizeImage normalize_image = 1;
...
...
@@ -44,6 +44,7 @@ message PreprocessingStep {
    RandomDownscaleToTargetPixels random_downscale_to_target_pixels = 35;
    RandomPatchGaussian random_patch_gaussian = 36;
    RandomSquareCropByScale random_square_crop_by_scale = 37;
    RandomScaleCropAndPadToSquare random_scale_crop_and_pad_to_square = 38;
  }
}
...
...
@@ -572,3 +573,20 @@ message RandomSquareCropByScale {
  // [min_scale, max_scale]
  optional int32 num_scales = 4 [default = 8];
}

// Randomly scale, crop, and then pad an image to the desired square output
// dimensions. Specifically, this method first samples a random_scale factor
// from a uniform distribution between scale_min and scale_max, and then resizes
// the image such that its maximum dimension is (output_size * random_scale).
// Secondly, a square output_size crop is extracted from the resized image, and
// finally the cropped region is padded to the desired square output_size.
// The augmentation is borrowed from [1].
// [1]: https://arxiv.org/abs/1911.09070
message RandomScaleCropAndPadToSquare {
  // The (square) output image size.
  optional int32 output_size = 1 [default = 512];

  // The minimum and maximum values from which to sample the random scale.
  optional float scale_min = 2 [default = 0.1];
  optional float scale_max = 3 [default = 2.0];
}
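A rough NumPy sketch of the augmentation described above (sample a scale, resize so the longest side equals output_size * random_scale, take a square crop, pad back to output_size); this is an illustration of the steps, not the actual implementation in object_detection/core/preprocessor.py:

import numpy as np


def random_scale_crop_and_pad_to_square(image, output_size=512,
                                        scale_min=0.1, scale_max=2.0,
                                        rng=np.random):
  """Scales, crops, and pads an (H, W, C) image to a square output_size."""
  random_scale = rng.uniform(scale_min, scale_max)
  height, width = image.shape[:2]
  # Resize so that the maximum dimension becomes output_size * random_scale.
  resize_factor = (output_size * random_scale) / max(height, width)
  new_h = max(int(height * resize_factor), 1)
  new_w = max(int(width * resize_factor), 1)
  # Nearest-neighbour resize keeps this sketch dependency-free.
  rows = (np.arange(new_h) / resize_factor).astype(int).clip(0, height - 1)
  cols = (np.arange(new_w) / resize_factor).astype(int).clip(0, width - 1)
  resized = image[rows][:, cols]
  # Take a square crop of side at most output_size from the resized image.
  offset_y = rng.randint(0, max(new_h - output_size, 0) + 1)
  offset_x = rng.randint(0, max(new_w - output_size, 0) + 1)
  crop = resized[offset_y:offset_y + output_size,
                 offset_x:offset_x + output_size]
  # Pad the crop up to the desired square output size.
  padded = np.zeros((output_size, output_size, image.shape[2]), image.dtype)
  padded[:crop.shape[0], :crop.shape[1]] = crop
  return padded


augmented = random_scale_crop_and_pad_to_square(
    np.zeros((480, 640, 3), dtype=np.float32), output_size=512)
print(augmented.shape)  # -> (512, 512, 3)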
research/object_detection/protos/ssd.proto
...
...
@@ -145,7 +145,7 @@ message Ssd {
  optional MaskHead mask_head_config = 25;
}

-// Next id: 19.
+// Next id: 20.
message SsdFeatureExtractor {
  reserved 6;
...
...
@@ -185,8 +185,13 @@ message SsdFeatureExtractor {
  // feature maps added by SSD.
  optional bool use_depthwise = 8 [default = false];

-  // Feature Pyramid Networks config.
-  optional FeaturePyramidNetworks fpn = 10;
+  oneof feature_pyramid_oneof {
+    // Feature Pyramid Networks config.
+    FeaturePyramidNetworks fpn = 10;
+    // Bidirectional Feature Pyramid Networks config.
+    BidirectionalFeaturePyramidNetworks bifpn = 19;
+  }
// If true, replace preprocess function of feature extractor with a
// placeholder. This should only be used if all the image preprocessing steps
...
...
@@ -225,3 +230,23 @@ message FeaturePyramidNetworks {
}

// Configuration for Bidirectional Feature Pyramid Networks.
message BidirectionalFeaturePyramidNetworks {
  // Minimum level in the feature pyramid.
  optional int32 min_level = 1 [default = 3];

  // Maximum level in the feature pyramid.
  optional int32 max_level = 2 [default = 7];

  // The number of repeated top-down bottom-up iterations for BiFPN-based
  // feature extractors (bidirectional feature pyramid networks).
  optional int32 num_iterations = 3;

  // The number of filters (channels) to use in feature pyramid layers for
  // BiFPN-based feature extractors (bidirectional feature pyramid networks).
  optional int32 num_filters = 4;

  // Method used to combine inputs to BiFPN nodes.
  optional string combine_method = 5 [default = 'fast_attention'];
}
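A minimal sketch of selecting the new bifpn branch of the feature_pyramid_oneof via the protobuf text format, assuming the compiled ssd_pb2 module is importable; the type string and field values below mirror the efficientdet-d0 settings exercised in the test above and are assumptions, not a recommended configuration:

from google.protobuf import text_format
from object_detection.protos import ssd_pb2

feature_extractor_text = """
  type: 'ssd_efficientnet-b0_bifpn_keras'
  bifpn {
    min_level: 3
    max_level: 7
    num_iterations: 3
    num_filters: 64
    combine_method: 'fast_attention'
  }
"""
# Merge the snippet and confirm which branch of the oneof is set.
feature_extractor = text_format.Merge(
    feature_extractor_text, ssd_pb2.SsdFeatureExtractor())
print(feature_extractor.WhichOneof('feature_pyramid_oneof'))  # -> bifpn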