Commit b1025b3b authored by syiming

Merge remote-tracking branch 'upstream/master' into fasterrcnn_fpn_keras_feature_extractor

parents 69ce1c45 e9df75ab
......@@ -17,15 +17,17 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
from six.moves import zip
import tensorflow.compat.v1 as tf
from object_detection.core import freezable_batch_norm
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class FreezableBatchNormTest(tf.test.TestCase):
"""Tests for FreezableBatchNorm operations."""
......
......@@ -681,3 +681,95 @@ class HardExampleMiner(object):
num_positives, num_negatives)
class PenaltyReducedLogisticFocalLoss(Loss):
"""Penalty-reduced pixelwise logistic regression with focal loss.
The loss is defined in Equation (1) of the Objects as Points[1] paper.
Although the loss is defined per-pixel in the output space, this class
assumes that each pixel is an anchor to be compatible with the base class.
[1]: https://arxiv.org/abs/1904.07850
"""
def __init__(self, alpha=2.0, beta=4.0, sigmoid_clip_value=1e-4):
"""Constructor.
Args:
alpha: Focussing parameter of the focal loss. Increasing this will
decrease the loss contribution of the well classified examples.
beta: The local penalty reduction factor. Increasing this will decrease
the contribution of loss due to negative pixels near the keypoint.
sigmoid_clip_value: The sigmoid operation used internally will be clipped
between [sigmoid_clip_value, 1 - sigmoid_clip_value].
"""
self._alpha = alpha
self._beta = beta
self._sigmoid_clip_value = sigmoid_clip_value
super(PenaltyReducedLogisticFocalLoss, self).__init__()
def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function.
In all input tensors, `num_anchors` is the total number of pixels in the
output space.
Args:
prediction_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing the predicted unscaled logits for each class.
The function will compute sigmoid on this tensor internally.
target_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing a tensor with the 'splatted' keypoints,
possibly using a gaussian kernel. This function assumes that
the target is bounded between [0, 1].
weights: a float tensor of shape either [batch_size, num_anchors,
num_classes] or [batch_size, num_anchors, 1]. If the shape is
[batch_size, num_anchors, 1], all the classes are equally weighted.
Returns:
loss: a float tensor of shape [batch_size, num_anchors, num_classes]
representing the value of the loss function.
"""
is_present_tensor = tf.math.equal(target_tensor, 1.0)
prediction_tensor = tf.clip_by_value(tf.sigmoid(prediction_tensor),
self._sigmoid_clip_value,
1 - self._sigmoid_clip_value)
positive_loss = (tf.math.pow((1 - prediction_tensor), self._alpha)*
tf.math.log(prediction_tensor))
negative_loss = (tf.math.pow((1 - target_tensor), self._beta)*
tf.math.pow(prediction_tensor, self._alpha)*
tf.math.log(1 - prediction_tensor))
loss = -tf.where(is_present_tensor, positive_loss, negative_loss)
return loss * weights
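# A minimal usage sketch of the loss above (values are illustrative): one
# image, three "anchor" pixels, two classes. The internal `_compute_loss`
# method is called directly to mirror the docstring shapes.
focal_loss = PenaltyReducedLogisticFocalLoss(alpha=2.0, beta=4.0)
logits = tf.constant([[[2.0, -1.0], [0.5, 0.0], [-2.0, 3.0]]])   # [1, 3, 2]
targets = tf.constant([[[1.0, 0.0], [0.3, 0.0], [0.0, 1.0]]])    # [1, 3, 2]
example_weights = tf.constant([[[1.0], [1.0], [0.0]]])           # [1, 3, 1]
per_pixel_loss = focal_loss._compute_loss(logits, targets, example_weights)  # [1, 3, 2]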
class L1LocalizationLoss(Loss):
"""L1 loss or absolute difference.
When used in a per-pixel manner, each pixel should be given as an anchor.
"""
def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function.
Args:
prediction_tensor: A float tensor of shape [batch_size, num_anchors]
representing the (encoded) predicted locations of objects.
target_tensor: A float tensor of shape [batch_size, num_anchors]
representing the regression targets
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a float tensor of shape [batch_size, num_anchors]
representing the value of the loss function.
"""
return tf.losses.absolute_difference(
target_tensor,
prediction_tensor,
weights=weights,
loss_collection=None,
reduction=tf.losses.Reduction.NONE
)
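# A minimal usage sketch of L1LocalizationLoss (values are illustrative): the
# result is the element-wise absolute difference, weighted per anchor.
l1_loss = L1LocalizationLoss()
l1_preds = tf.constant([[0.0, 1.0, 2.0], [1.0, 1.0, 1.0]])       # [2, 3]
l1_targets = tf.constant([[0.5, 1.0, 0.0], [1.0, 2.0, 1.0]])     # [2, 3]
l1_weights = tf.ones_like(l1_preds)
per_anchor_loss = l1_loss._compute_loss(l1_preds, l1_targets, l1_weights)  # [2, 3]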
......@@ -16,10 +16,6 @@
"""Provides functions to prefetch tensors to feed into models."""
import tensorflow.compat.v1 as tf
from object_detection.utils import tf_version
if not tf_version.is_tf1():
raise ValueError('`prefetcher.py` is only supported in Tensorflow 1.X')
def prefetch(tensor_dict, capacity):
"""Creates a prefetch queue for tensors.
......
......@@ -18,16 +18,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from six.moves import range
import tensorflow.compat.v1 as tf
# pylint: disable=g-bad-import-order,
from object_detection.core import prefetcher
import tf_slim as slim
# pylint: disable=g-bad-import-order
from object_detection.core import prefetcher
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class PrefetcherTest(tf.test.TestCase):
"""Test class for prefetcher."""
......
......@@ -19,6 +19,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from absl.testing import parameterized
import numpy as np
import six
......@@ -30,11 +31,12 @@ from object_detection.core import preprocessor
from object_detection.core import preprocessor_cache
from object_detection.core import standard_fields as fields
from object_detection.utils import test_case
from object_detection.utils import tf_version
if six.PY2:
import mock # pylint: disable=g-import-not-at-top
else:
from unittest import mock # pylint: disable=g-import-not-at-top
mock = unittest.mock # pylint: disable=g-import-not-at-top
class PreprocessorTest(test_case.TestCase, parameterized.TestCase):
......@@ -2819,6 +2821,7 @@ class PreprocessorTest(test_case.TestCase, parameterized.TestCase):
self.assertAllEqual(images_shape, patched_images_shape)
self.assertAllEqual(images, patched_images)
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
def testAutoAugmentImage(self):
def graph_fn():
preprocessing_options = []
......
......@@ -50,10 +50,12 @@ from object_detection.core import matcher as mat
from object_detection.core import region_similarity_calculator as sim_calc
from object_detection.core import standard_fields as fields
from object_detection.matchers import argmax_matcher
from object_detection.matchers import bipartite_matcher
from object_detection.utils import shape_utils
from object_detection.utils import target_assigner_utils as ta_utils
from object_detection.utils import tf_version
if tf_version.is_tf1():
from object_detection.matchers import bipartite_matcher # pylint: disable=g-import-not-at-top
ResizeMethod = tf2.image.ResizeMethod
......@@ -398,6 +400,8 @@ def create_target_assigner(reference, stage=None,
ValueError: if combination reference+stage is invalid.
"""
if reference == 'Multibox' and stage == 'proposal':
if tf_version.is_tf2():
raise ValueError('GreedyBipartiteMatcher is not supported in TF 2.X.')
similarity_calc = sim_calc.NegSqDistSimilarity()
matcher = bipartite_matcher.GreedyBipartiteMatcher()
box_coder_instance = mean_stddev_box_coder.MeanStddevBoxCoder()
......@@ -713,3 +717,943 @@ def batch_assign_confidences(target_assigner,
batch_reg_weights, batch_match)
def _smallest_positive_root(a, b, c):
"""Returns the smallest positive root of a quadratic equation."""
discriminant = tf.sqrt(b ** 2 - 4 * a * c)
# TODO(vighneshb) We are currently using the slightly incorrect
# CenterNet implementation. The commented lines implement the fixed version
# in https://github.com/princeton-vl/CornerNet. Change the implementation
# after verifying it has no negative impact.
# root1 = (-b - discriminant) / (2 * a)
# root2 = (-b + discriminant) / (2 * a)
# return tf.where(tf.less(root1, 0), root2, root1)
return (-b + discriminant) / (2.0)
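# Worked example of the formula above: for a = 2, b = 15, c = -25 the
# discriminant is sqrt(225 + 200) ~= 20.62, so the function returns
# (-15 + 20.62) / 2 ~= 2.81. Note that the true smallest positive root would
# divide by 2a = 4 (giving ~1.40); the current code intentionally reproduces
# the original CenterNet behaviour instead, as the TODO above explains.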
def max_distance_for_overlap(height, width, min_iou):
"""Computes how far apart bbox corners can lie while maintaining the iou.
Given a bounding box size, this function returns a lower bound on how far
apart the corners of another box can lie while still maintaining the given
IoU. The implementation is based on the `gaussian_radius` function in the
Objects as Points github repo: https://github.com/xingyizhou/CenterNet
Args:
height: A 1-D float Tensor representing height of the ground truth boxes.
width: A 1-D float Tensor representing width of the ground truth boxes.
min_iou: A float representing the minimum IoU desired.
Returns:
distance: A 1-D Tensor of distances, of the same length as the input
height and width tensors.
"""
# Given that the detected box is displaced at a distance `d`, the exact
# IoU value will depend on the angle at which each corner is displaced.
# We simplify our computation by assuming that each corner is displaced by
# a distance `d` in both x and y direction. This gives us a lower IoU than
# what is actually realizable and ensures that any box with corners less
# than `d` distance apart will always have an IoU greater than or equal
# to `min_iou`
# The following 3 cases can be worked on geometrically and come down to
# solving a quadratic inequality. In each case, to ensure `min_iou` we use
# the smallest positive root of the equation.
# Case where detected box is offset from ground truth and no box completely
# contains the other.
distance_detection_offset = _smallest_positive_root(
a=1, b=-(height + width),
c=width * height * ((1 - min_iou) / (1 + min_iou))
)
# Case where detection is smaller than ground truth and completely contained
# in it.
distance_detection_in_gt = _smallest_positive_root(
a=4, b=-2 * (height + width),
c=(1 - min_iou) * width * height
)
# Case where ground truth is smaller than detection and completely contained
# in it.
distance_gt_in_detection = _smallest_positive_root(
a=4 * min_iou, b=(2 * min_iou) * (width + height),
c=(min_iou - 1) * width * height
)
return tf.reduce_min([distance_detection_offset,
distance_gt_in_detection,
distance_detection_in_gt], axis=0)
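# Usage sketch (the value matches the one checked in the target assigner
# tests): a 10 x 5 box with min_iou = 0.5 yields a radius of roughly 2.8078,
# which comes from the "ground truth contained in detection" case above.
radius = max_distance_for_overlap(
    height=tf.constant([10.0]), width=tf.constant([5.0]), min_iou=0.5)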
def get_batch_predictions_from_indices(batch_predictions, indices):
"""Gets the values of predictions in a batch at the given indices.
The indices are expected to come from the offset targets generation functions
in this library. The returned value is intended to be used inside a loss
function.
Args:
batch_predictions: A tensor of shape [batch_size, height, width, 2] for
single class offsets and [batch_size, height, width, class, 2] for
multiple classes offsets (e.g. keypoint joint offsets) representing the
(height, width) or (y_offset, x_offset) predictions over a batch.
indices: A tensor of shape [num_instances, 3] for single class offset and
[num_instances, 4] for multiple classes offsets representing the indices
in the batch to be penalized in a loss function
Returns:
values: A tensor of shape [num_instances, 2] holding the predicted values
at the given indices.
"""
return tf.gather_nd(batch_predictions, indices)
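# A minimal sketch of how the indices select predictions (shapes are
# illustrative): each row of `indices` is (batch_index, y, x), and
# tf.gather_nd pulls out the 2-vector prediction at that output cell.
prediction_map = tf.zeros([2, 40, 20, 2])                       # [batch, h, w, 2]
example_indices = tf.constant([[0, 10, 10], [1, 7, 11]])        # [num_instances, 3]
picked = get_batch_predictions_from_indices(prediction_map, example_indices)  # [2, 2]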
def _compute_std_dev_from_box_size(boxes_height, boxes_width, min_overlap):
"""Computes the standard deviation of the Gaussian kernel from box size.
Args:
boxes_height: A 1D tensor with shape [num_instances] representing the height
of each box.
boxes_width: A 1D tensor with shape [num_instances] representing the width
of each box.
min_overlap: The minimum IOU overlap that boxes need to have to not be
penalized.
Returns:
A 1D tensor with shape [num_instances] representing the computed Gaussian
sigma for each box.
"""
# We are dividing by 3 so that points closer than the computed
# distance have a >99% CDF.
sigma = max_distance_for_overlap(boxes_height, boxes_width, min_overlap)
sigma = (2 * tf.math.maximum(tf.math.floor(sigma), 0.0) + 1) / 6.0
return sigma
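# Worked example (assuming the max_distance_for_overlap behaviour above): for
# a 10 x 5 box with min_overlap = 0.7 the distance comes out to roughly 1.84,
# so sigma = (2 * floor(1.84) + 1) / 6 = 3 / 6 = 0.5.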
class CenterNetCenterHeatmapTargetAssigner(object):
"""Wrapper to compute the object center heatmap."""
def __init__(self, stride, min_overlap=0.7):
"""Initializes the target assigner.
Args:
stride: int, the stride of the network in output pixels.
min_overlap: The minimum IOU overlap that boxes need to have to not be
penalized.
"""
self._stride = stride
self._min_overlap = min_overlap
def assign_center_targets_from_boxes(self,
height,
width,
gt_boxes_list,
gt_classes_list,
gt_weights_list=None):
"""Computes the object center heatmap target.
Args:
height: int, height of input to the model. This is used to
determine the height of the output.
width: int, width of the input to the model. This is used to
determine the width of the output.
gt_boxes_list: A list of float tensors with shape [num_boxes, 4]
representing the groundtruth detection bounding boxes for each sample in
the batch. The box coordinates are expected in normalized coordinates.
gt_classes_list: A list of float tensors with shape [num_boxes,
num_classes] representing the one-hot encoded class labels for each box
in the gt_boxes_list.
gt_weights_list: A list of float tensors with shape [num_boxes]
representing the weight of each groundtruth detection box.
Returns:
heatmap: A Tensor of size [batch_size, output_height, output_width,
num_classes] representing the per class center heatmap. output_height
and output_width are computed by dividing the input height and width by
the stride specified during initialization.
"""
out_height = tf.cast(height // self._stride, tf.float32)
out_width = tf.cast(width // self._stride, tf.float32)
# Compute the yx-grid to be used to generate the heatmap. Each returned
# tensor has shape of [out_height, out_width]
(y_grid, x_grid) = ta_utils.image_shape_to_grids(out_height, out_width)
heatmaps = []
if gt_weights_list is None:
gt_weights_list = [None] * len(gt_boxes_list)
# TODO(vighneshb) Replace the for loop with a batch version.
for boxes, class_targets, weights in zip(gt_boxes_list, gt_classes_list,
gt_weights_list):
boxes = box_list.BoxList(boxes)
# Convert the box coordinates to absolute output image dimension space.
boxes = box_list_ops.to_absolute_coordinates(boxes,
height // self._stride,
width // self._stride)
# Get the box center coordinates. Each returned tensor has the shape of
# [num_instances]
(y_center, x_center, boxes_height,
boxes_width) = boxes.get_center_coordinates_and_sizes()
# Compute the sigma from box size. The tensor shape: [num_instances].
sigma = _compute_std_dev_from_box_size(boxes_height, boxes_width,
self._min_overlap)
# Apply the Gaussian kernel to the center coordinates. Returned heatmap
# has shape of [out_height, out_width, num_classes]
heatmap = ta_utils.coordinates_to_heatmap(
y_grid=y_grid,
x_grid=x_grid,
y_coordinates=y_center,
x_coordinates=x_center,
sigma=sigma,
channel_onehot=class_targets,
channel_weights=weights)
heatmaps.append(heatmap)
# Return the stacked heatmaps over the batch.
return tf.stack(heatmaps, axis=0)
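# Usage sketch (mirrors the unit tests for this assigner): with stride 4 and
# an 80 x 80 input, the heatmap is 20 x 20 and a box covering the whole image
# peaks at the output-space center (10, 10).
center_assigner = CenterNetCenterHeatmapTargetAssigner(stride=4)
example_boxes = [tf.constant([[0.0, 0.0, 1.0, 1.0]])]
example_classes = [tf.one_hot([0], depth=4)]
center_heatmap = center_assigner.assign_center_targets_from_boxes(
    80, 80, example_boxes, example_classes)                     # [1, 20, 20, 4]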
class CenterNetBoxTargetAssigner(object):
"""Wrapper to compute target tensors for the object detection task.
This class has methods that take as input a batch of ground truth tensors
(in the form of a list) and return the targets required to train the object
detection task.
"""
def __init__(self, stride):
"""Initializes the target assigner.
Args:
stride: int, the stride of the network in output pixels.
"""
self._stride = stride
def assign_size_and_offset_targets(self,
height,
width,
gt_boxes_list,
gt_weights_list=None):
"""Returns the box height/width and center offset targets and their indices.
The returned values are expected to be used with predicted tensors
of size (batch_size, height//self._stride, width//self._stride, 2). The
predicted values at the relevant indices can be retrieved with the
get_batch_predictions_from_indices function.
Args:
height: int, height of input to the model. This is used to determine the
height of the output.
width: int, width of the input to the model. This is used to determine the
width of the output.
gt_boxes_list: A list of float tensors with shape [num_boxes, 4]
representing the groundtruth detection bounding boxes for each sample in
the batch. The coordinates are expected in normalized coordinates.
gt_weights_list: A list of tensors with shape [num_boxes] corresponding to
the weight of each groundtruth detection box.
Returns:
batch_indices: an integer tensor of shape [num_boxes, 3] holding the
indices inside the predicted tensor which should be penalized. The
first column indicates the index along the batch dimension and the
second and third columns indicate the index along the y and x
dimensions respectively.
batch_box_height_width: a float tensor of shape [num_boxes, 2] holding
expected height and width of each box in the output space.
batch_offsets: a float tensor of shape [num_boxes, 2] holding the
expected y and x offset of each box in the output space.
batch_weights: a float tensor of shape [num_boxes] indicating the
weight of each prediction.
"""
if gt_weights_list is None:
gt_weights_list = [None] * len(gt_boxes_list)
batch_indices = []
batch_box_height_width = []
batch_weights = []
batch_offsets = []
for i, (boxes, weights) in enumerate(zip(gt_boxes_list, gt_weights_list)):
boxes = box_list.BoxList(boxes)
boxes = box_list_ops.to_absolute_coordinates(boxes,
height // self._stride,
width // self._stride)
# Get the box center coordinates. Each returned tensor has the shape of
# [num_boxes]
(y_center, x_center, boxes_height,
boxes_width) = boxes.get_center_coordinates_and_sizes()
num_boxes = tf.shape(x_center)
# Compute the offsets and indices of the box centers. Shape:
# offsets: [num_boxes, 2]
# indices: [num_boxes, 2]
(offsets, indices) = ta_utils.compute_floor_offsets_with_indices(
y_source=y_center, x_source=x_center)
# Assign ones if weights are not provided.
if weights is None:
weights = tf.ones(num_boxes, dtype=tf.float32)
# Shape of [num_boxes, 1] integer tensor filled with current batch index.
batch_index = i * tf.ones_like(indices[:, 0:1], dtype=tf.int32)
batch_indices.append(tf.concat([batch_index, indices], axis=1))
batch_box_height_width.append(
tf.stack([boxes_height, boxes_width], axis=1))
batch_weights.append(weights)
batch_offsets.append(offsets)
batch_indices = tf.concat(batch_indices, axis=0)
batch_box_height_width = tf.concat(batch_box_height_width, axis=0)
batch_weights = tf.concat(batch_weights, axis=0)
batch_offsets = tf.concat(batch_offsets, axis=0)
return (batch_indices, batch_box_height_width, batch_offsets, batch_weights)
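# Usage sketch (mirrors the unit tests for this assigner): one image with a
# single box covering the whole 80 x 80 input and stride 4.
box_assigner = CenterNetBoxTargetAssigner(stride=4)
example_gt_boxes = [tf.constant([[0.0, 0.0, 1.0, 1.0]])]
(box_indices, box_hw, box_offsets, box_weights) = (
    box_assigner.assign_size_and_offset_targets(80, 80, example_gt_boxes))
# box_indices -> [[0, 10, 10]], box_hw -> [[20., 20.]], box_offsets -> [[0., 0.]]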
# TODO(yuhuic): Update this class to handle the instance/keypoint weights.
# Currently those weights are used as "mask" to indicate whether an
# instance/keypoint should be considered or not (expecting only either 0 or 1
# value). In reality, the weights can be any value and this class should handle
# those values properly.
class CenterNetKeypointTargetAssigner(object):
"""Wrapper to compute target tensors for the CenterNet keypoint estimation.
This class has methods that take as input a batch of groundtruth tensors
(in the form of a list) and returns the targets required to train the
CenterNet model for keypoint estimation. Specifically, the class methods
expect the groundtruth in the following formats (consistent with the
standard Object Detection API). Note that usually the groundtruth tensors are
packed in a list, which represents the batch dimension:
gt_classes_list: [Required] a list of 2D tf.float32 one-hot
(or k-hot) tensors of shape [num_instances, num_classes] containing the
class targets with the 0th index assumed to map to the first non-background
class.
gt_keypoints_list: [Required] a list of 3D tf.float32 tensors of
shape [num_instances, num_total_keypoints, 2] containing keypoint
coordinates. Note that the "num_total_keypoints" should be the sum of the
num_keypoints over all possible keypoint types, e.g. human pose, face.
For example, if a dataset contains both 17 human pose keypoints and 5 face
keypoints, then num_total_keypoints = 17 + 5 = 22.
If an instance contains only a subset of keypoints (e.g. human pose keypoints
but not face keypoints), the face keypoints will be filled with zeros.
Also note that keypoints are assumed to be provided in normalized
coordinates and missing keypoints should be encoded as NaN.
gt_keypoints_weights_list: [Optional] a list of 2D tf.float32 tensors of shape
[num_instances, num_total_keypoints] representing the weights of each
keypoint. If not provided, then all non-NaN keypoints will be equally
weighted.
gt_boxes_list: [Optional] a list of 2D tf.float32 tensors of shape
[num_instances, 4] containing coordinates of the groundtruth boxes.
Groundtruth boxes are provided in [y_min, x_min, y_max, x_max] format and
assumed to be normalized and clipped relative to the image window with
y_min <= y_max and x_min <= x_max.
Note that the boxes are only used to compute the center targets but are not
considered a required output of the keypoint task. If the boxes are not
provided, the center targets will be inferred from the keypoints
[not implemented yet].
gt_weights_list: [Optional] A list of 1D tf.float32 tensors of shape
[num_instances] containing weights for groundtruth boxes. Only useful when
gt_boxes_list is also provided.
"""
def __init__(self,
stride,
class_id,
keypoint_indices,
keypoint_std_dev=None,
per_keypoint_offset=False,
peak_radius=0):
"""Initializes a CenterNet keypoints target assigner.
Args:
stride: int, the stride of the network in output pixels.
class_id: int, the ID of the class (0-indexed) that contains the target
keypoints to consider in this task. For example, if the task is human
pose estimation, the class id should correspond to the "human" class.
keypoint_indices: A list of integers representing the indices of the
keypoints to be considered in this task. This is used to retrieve the
subset of the keypoints from gt_keypoints that should be considered in
this task.
keypoint_std_dev: A list of floats representing the standard deviation of the
Gaussian kernel used to generate the keypoint heatmap (in the unit of
output pixels). It is to provide the flexibility of using different
sizes of Gaussian kernel for each keypoint type. If not provided, then
all standard deviations will be the same as the default value (10.0 in
the output pixel space). If provided, the length of keypoint_std_dev
needs to be the same as the length of keypoint_indices, indicating the
standard deviation of each keypoint type.
per_keypoint_offset: boolean, indicating whether to assign offset for
each keypoint channel. If set False, the output offset target will have
the shape [batch_size, out_height, out_width, 2]. If set True, the
output offset target will have the shape [batch_size, out_height,
out_width, 2 * num_keypoints].
peak_radius: int, the radius (in the unit of output pixel) around heatmap
peak to assign the offset targets.
"""
self._stride = stride
self._class_id = class_id
self._keypoint_indices = keypoint_indices
self._per_keypoint_offset = per_keypoint_offset
self._peak_radius = peak_radius
if keypoint_std_dev is None:
self._keypoint_std_dev = ([_DEFAULT_KEYPOINT_OFFSET_STD_DEV] *
len(keypoint_indices))
else:
assert len(keypoint_indices) == len(keypoint_std_dev)
self._keypoint_std_dev = keypoint_std_dev
def _preprocess_keypoints_and_weights(self, out_height, out_width, keypoints,
class_onehot, class_weights,
keypoint_weights):
"""Preprocesses the keypoints and the corresponding keypoint weights.
This function performs several common steps to preprocess the keypoints and
keypoint weights features, including:
1) Select the subset of keypoints based on the keypoint indices, fill the
keypoint NaN values with zeros and convert to absolute coordinates.
2) Generate the weights of the keypoint using the following information:
a. The class of the instance.
b. The NaN value of the keypoint coordinates.
c. The provided keypoint weights.
Args:
out_height: An integer or an integer tensor indicating the output height
of the model.
out_width: An integer or an integer tensor indicating the output width of
the model.
keypoints: A float tensor of shape [num_instances, num_total_keypoints, 2]
representing the original keypoint groundtruth coordinates.
class_onehot: A float tensor of shape [num_instances, num_classes]
containing the class targets with the 0th index assumed to map to the
first non-background class.
class_weights: A float tensor of shape [num_instances] containing weights
for groundtruth instances.
keypoint_weights: A float tensor of shape
[num_instances, num_total_keypoints] representing the weights of each
keypoint.
Returns:
A tuple of two tensors:
keypoint_absolute: A float tensor of shape
[num_instances, num_keypoints, 2] which is the selected and updated
keypoint coordinates.
keypoint_weights: A float tensor of shape [num_instances, num_keypoints]
representing the updated weight of each keypoint.
"""
# Select the target keypoints by their type ids and generate the mask
# of valid elements.
valid_mask, keypoints = ta_utils.get_valid_keypoint_mask_for_class(
keypoint_coordinates=keypoints,
class_id=self._class_id,
class_onehot=class_onehot,
class_weights=class_weights,
keypoint_indices=self._keypoint_indices)
# Keypoint coordinates in absolute coordinate system.
# The shape of the tensors: [num_instances, num_keypoints, 2].
keypoints_absolute = keypoint_ops.to_absolute_coordinates(
keypoints, out_height, out_width)
# Assign default weights for the keypoints.
if keypoint_weights is None:
keypoint_weights = tf.ones_like(keypoints[:, :, 0])
else:
keypoint_weights = tf.gather(
keypoint_weights, indices=self._keypoint_indices, axis=1)
keypoint_weights = keypoint_weights * valid_mask
return keypoints_absolute, keypoint_weights
def assign_keypoint_heatmap_targets(self,
height,
width,
gt_keypoints_list,
gt_classes_list,
gt_keypoints_weights_list=None,
gt_weights_list=None,
gt_boxes_list=None):
"""Returns the keypoint heatmap targets for the CenterNet model.
Args:
height: int, height of input to the CenterNet model. This is used to
determine the height of the output.
width: int, width of the input to the CenterNet model. This is used to
determine the width of the output.
gt_keypoints_list: A list of float tensors with shape [num_instances,
num_total_keypoints, 2]. See class-level description for more detail.
gt_classes_list: A list of float tensors with shape [num_instances,
num_classes]. See class-level description for more detail.
gt_keypoints_weights_list: A list of tensors with shape [num_instances,
num_total_keypoints] corresponding to the weight of each keypoint.
gt_weights_list: A list of float tensors with shape [num_instances]. See
class-level description for more detail.
gt_boxes_list: A list of float tensors with shape [num_instances, 4]. See
class-level description for more detail. If provided, the keypoint
standard deviations will be scaled based on the box sizes.
Returns:
heatmap: A float tensor of shape [batch_size, output_height, output_width,
num_keypoints] representing the per keypoint type center heatmap.
output_height and output_width are computed by dividing the input height
and width by the stride specified during initialization. Note that the
"num_keypoints" is defined by the length of keypoint_indices, which is
not necessarily equal to "num_total_keypoints".
num_instances_batch: A 2D int tensor of shape
[batch_size, num_keypoints] representing number of instances for each
keypoint type.
valid_mask: A float tensor with shape [batch_size, output_height,
output_width] where all values within the regions of the blackout boxes
are 0.0, and 1.0 elsewhere.
"""
out_width = tf.cast(width // self._stride, tf.float32)
out_height = tf.cast(height // self._stride, tf.float32)
# Compute the yx-grid to be used to generate the heatmap. Each returned
# tensor has shape of [out_height, out_width]
y_grid, x_grid = ta_utils.image_shape_to_grids(out_height, out_width)
if gt_keypoints_weights_list is None:
gt_keypoints_weights_list = [None] * len(gt_keypoints_list)
if gt_weights_list is None:
gt_weights_list = [None] * len(gt_classes_list)
if gt_boxes_list is None:
gt_boxes_list = [None] * len(gt_keypoints_list)
heatmaps = []
num_instances_list = []
valid_mask_list = []
for keypoints, classes, kp_weights, weights, boxes in zip(
gt_keypoints_list, gt_classes_list, gt_keypoints_weights_list,
gt_weights_list, gt_boxes_list):
keypoints_absolute, kp_weights = self._preprocess_keypoints_and_weights(
out_height=out_height,
out_width=out_width,
keypoints=keypoints,
class_onehot=classes,
class_weights=weights,
keypoint_weights=kp_weights)
num_instances, num_keypoints, _ = (
shape_utils.combined_static_and_dynamic_shape(keypoints_absolute))
# A tensor of shape [num_instances, num_keypoints] with
# each element representing the type dimension for each corresponding
# keypoint:
# [[0, 1, ..., k-1],
# [0, 1, ..., k-1],
# :
# [0, 1, ..., k-1]]
keypoint_types = tf.tile(
input=tf.expand_dims(tf.range(num_keypoints), axis=0),
multiples=[num_instances, 1])
# A tensor of shape [num_instances, num_keypoints] with
# each element representing the sigma of the Gaussian kernel for each
# keypoint.
keypoint_std_dev = tf.tile(
input=tf.expand_dims(tf.constant(self._keypoint_std_dev), axis=0),
multiples=[num_instances, 1])
# If boxes is not None, then scale the standard deviation based on the
# size of the object bounding boxes similar to object center heatmap.
if boxes is not None:
boxes = box_list.BoxList(boxes)
# Convert the box coordinates to absolute output image dimension space.
boxes = box_list_ops.to_absolute_coordinates(boxes,
height // self._stride,
width // self._stride)
# Get the box height and width. Each returned tensor has the shape
# of [num_instances]
(_, _, boxes_height,
boxes_width) = boxes.get_center_coordinates_and_sizes()
# Compute the sigma from box size. The tensor shape: [num_instances].
sigma = _compute_std_dev_from_box_size(boxes_height, boxes_width, 0.7)
keypoint_std_dev = keypoint_std_dev * tf.stack(
[sigma] * num_keypoints, axis=1)
# Generate the valid region mask to ignore regions with target class but
# no corresponding keypoints.
# Shape: [num_instances].
blackout = tf.logical_and(classes[:, self._class_id] > 0,
tf.reduce_max(kp_weights, axis=1) < 1e-3)
valid_mask = ta_utils.blackout_pixel_weights_by_box_regions(
out_height, out_width, boxes.get(), blackout)
valid_mask_list.append(valid_mask)
# Apply the Gaussian kernel to the keypoint coordinates. Returned heatmap
# has shape of [out_height, out_width, num_keypoints].
heatmap = ta_utils.coordinates_to_heatmap(
y_grid=y_grid,
x_grid=x_grid,
y_coordinates=tf.keras.backend.flatten(keypoints_absolute[:, :, 0]),
x_coordinates=tf.keras.backend.flatten(keypoints_absolute[:, :, 1]),
sigma=tf.keras.backend.flatten(keypoint_std_dev),
channel_onehot=tf.one_hot(
tf.keras.backend.flatten(keypoint_types), depth=num_keypoints),
channel_weights=tf.keras.backend.flatten(kp_weights))
num_instances_list.append(
tf.cast(tf.reduce_sum(kp_weights, axis=0), dtype=tf.int32))
heatmaps.append(heatmap)
return (tf.stack(heatmaps, axis=0), tf.stack(num_instances_list, axis=0),
tf.stack(valid_mask_list, axis=0))
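# Usage sketch (mirrors the keypoint target assigner tests): one instance of
# class 1 with three keypoint types, of which only types 0 and 2 are kept.
kp_assigner = CenterNetKeypointTargetAssigner(
    stride=4, class_id=1, keypoint_indices=[0, 2])
example_kps = [tf.constant([[[0.5, 0.5], [0.4, 0.1], [0.6, 0.9]]])]  # [1, 3, 2]
example_kp_classes = [tf.one_hot([1], depth=2)]
example_kp_boxes = [tf.constant([[0.0, 0.0, 1.0, 1.0]])]
kp_heatmap, kp_num_instances, kp_valid_mask = (
    kp_assigner.assign_keypoint_heatmap_targets(
        120, 80, example_kps, example_kp_classes,
        gt_boxes_list=example_kp_boxes))
# kp_heatmap shape: [1, 30, 20, 2]; kp_valid_mask shape: [1, 30, 20].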
def _get_keypoint_types(self, num_instances, num_keypoints, num_neighbors):
"""Gets keypoint type index tensor.
The function prepares the tensor of keypoint indices with shape
[num_instances, num_keypoints, num_neighbors]. Each element represents the
keypoint type index for each corresponding keypoint and tiled along the 3rd
axis:
[[0, 1, ..., num_keypoints - 1],
[0, 1, ..., num_keypoints - 1],
:
[0, 1, ..., num_keypoints - 1]]
Args:
num_instances: int, the number of instances, used to define the 1st
dimension.
num_keypoints: int, the number of keypoint types, used to define the 2nd
dimension.
num_neighbors: int, the number of neighborhood pixels to consider for each
keypoint, used to define the 3rd dimension.
Returns:
A integer tensor of shape [num_instances, num_keypoints, num_neighbors].
"""
keypoint_types = tf.range(num_keypoints)[tf.newaxis, :, tf.newaxis]
tiled_keypoint_types = tf.tile(keypoint_types,
multiples=[num_instances, 1, num_neighbors])
return tiled_keypoint_types
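# For example, _get_keypoint_types(num_instances=2, num_keypoints=3,
# num_neighbors=1) yields a [2, 3, 1] tensor:
# [[[0], [1], [2]],
#  [[0], [1], [2]]]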
def assign_keypoints_offset_targets(self,
height,
width,
gt_keypoints_list,
gt_classes_list,
gt_keypoints_weights_list=None,
gt_weights_list=None):
"""Returns the offsets and indices of the keypoints for location refinement.
The returned values are used to refine the location of each keypoint in the
heatmap. The predicted values at the relevant indices can be retrieved with
the get_batch_predictions_from_indices function.
Args:
height: int, height of input to the CenterNet model. This is used to
determine the height of the output.
width: int, width of the input to the CenterNet model. This is used to
determine the width of the output.
gt_keypoints_list: A list of tensors with shape [num_instances,
num_total_keypoints, 2]. See class-level description for more detail.
gt_classes_list: A list of tensors with shape [num_instances,
num_classes]. See class-level description for more detail.
gt_keypoints_weights_list: A list of tensors with shape [num_instances,
num_total_keypoints] corresponding to the weight of each keypoint.
gt_weights_list: A list of float tensors with shape [num_instances]. See
class-level description for more detail.
Returns:
batch_indices: an integer tensor of shape [num_total_instances, 3] (or
[num_total_instances, 4] if 'per_keypoint_offset' is set True) holding
the indices inside the predicted tensor which should be penalized. The
first column indicates the index along the batch dimension and the
second and third columns indicate the index along the y and x
dimensions respectively. The fourth column corresponds to the channel
dimension (if 'per_keypoint_offset' is set True).
batch_offsets: a float tensor of shape [num_total_instances, 2] holding
the expected y and x offset of each box in the output space.
batch_weights: a float tensor of shape [num_total_instances] indicating
the weight of each prediction.
Note that num_total_instances = batch_size * num_instances *
num_keypoints * num_neighbors
"""
batch_indices = []
batch_offsets = []
batch_weights = []
if gt_keypoints_weights_list is None:
gt_keypoints_weights_list = [None] * len(gt_keypoints_list)
if gt_weights_list is None:
gt_weights_list = [None] * len(gt_classes_list)
for i, (keypoints, classes, kp_weights, weights) in enumerate(
zip(gt_keypoints_list, gt_classes_list, gt_keypoints_weights_list,
gt_weights_list)):
keypoints_absolute, kp_weights = self._preprocess_keypoints_and_weights(
out_height=height // self._stride,
out_width=width // self._stride,
keypoints=keypoints,
class_onehot=classes,
class_weights=weights,
keypoint_weights=kp_weights)
num_instances, num_keypoints, _ = (
shape_utils.combined_static_and_dynamic_shape(keypoints_absolute))
# [num_instances * num_keypoints]
y_source = tf.keras.backend.flatten(keypoints_absolute[:, :, 0])
x_source = tf.keras.backend.flatten(keypoints_absolute[:, :, 1])
# All keypoint coordinates and their neighbors:
# [num_instance * num_keypoints, num_neighbors]
(y_source_neighbors, x_source_neighbors,
valid_sources) = ta_utils.get_surrounding_grids(height // self._stride,
width // self._stride,
y_source, x_source,
self._peak_radius)
_, num_neighbors = shape_utils.combined_static_and_dynamic_shape(
y_source_neighbors)
# Update the valid keypoint weights.
# [num_instance * num_keypoints, num_neighbors]
valid_keypoints = tf.cast(
valid_sources, dtype=tf.float32) * tf.stack(
[tf.keras.backend.flatten(kp_weights)] * num_neighbors, axis=-1)
# Compute the offsets and indices of the box centers. Shape:
# offsets: [num_instances * num_keypoints, num_neighbors, 2]
# indices: [num_instances * num_keypoints, num_neighbors, 2]
offsets, indices = ta_utils.compute_floor_offsets_with_indices(
y_source=y_source_neighbors,
x_source=x_source_neighbors,
y_target=y_source,
x_target=x_source)
# Reshape to:
# offsets: [num_instances * num_keypoints * num_neighbors, 2]
# indices: [num_instances * num_keypoints * num_neighbors, 2]
offsets = tf.reshape(offsets, [-1, 2])
indices = tf.reshape(indices, [-1, 2])
# Prepare the batch indices to be prepended.
batch_index = tf.fill(
[num_instances * num_keypoints * num_neighbors, 1], i)
if self._per_keypoint_offset:
tiled_keypoint_types = self._get_keypoint_types(
num_instances, num_keypoints, num_neighbors)
batch_indices.append(
tf.concat([batch_index, indices,
tf.reshape(tiled_keypoint_types, [-1, 1])], axis=1))
else:
batch_indices.append(tf.concat([batch_index, indices], axis=1))
batch_offsets.append(offsets)
batch_weights.append(tf.keras.backend.flatten(valid_keypoints))
# Concatenate the tensors in the batch in the first dimension:
# shape: [batch_size * num_instances * num_keypoints * num_neighbors, 3] or
# [batch_size * num_instances * num_keypoints * num_neighbors, 4] if
# 'per_keypoint_offset' is set to True.
batch_indices = tf.concat(batch_indices, axis=0)
# shape: [batch_size * num_instances * num_keypoints * num_neighbors]
batch_weights = tf.concat(batch_weights, axis=0)
# shape: [batch_size * num_instances * num_keypoints * num_neighbors, 2]
batch_offsets = tf.concat(batch_offsets, axis=0)
return (batch_indices, batch_offsets, batch_weights)
def assign_joint_regression_targets(self,
height,
width,
gt_keypoints_list,
gt_classes_list,
gt_boxes_list=None,
gt_keypoints_weights_list=None,
gt_weights_list=None):
"""Returns the joint regression from center grid to keypoints.
The joint regression is used as the grouping cue from the estimated
keypoints to instance center. The offsets are the vectors from the floored
object center coordinates to the keypoint coordinates.
Args:
height: int, height of input to the CenterNet model. This is used to
determine the height of the output.
width: int, width of the input to the CenterNet model. This is used to
determine the width of the output.
gt_keypoints_list: A list of float tensors with shape [num_instances,
num_total_keypoints, 2]. See class-level description for more detail.
gt_classes_list: A list of float tensors with shape [num_instances,
num_classes]. See class-level description for more detail.
gt_boxes_list: A list of float tensors with shape [num_instances, 4]. See
class-level description for more detail. If provided, then the center
targets will be computed based on the center of the boxes.
gt_keypoints_weights_list: A list of float tensors with shape
[num_instances, num_total_keypoints] representing the weight of each
keypoint.
gt_weights_list: A list of float tensors with shape [num_instances]. See
class-level description for more detail.
Returns:
batch_indices: an integer tensor of shape [num_total_instances, 4] holding the
indices inside the predicted tensor which should be penalized. The
first column indicates the index along the batch dimension and the
second and third columns indicate the index along the y and x
dimensions respectively, the last dimension refers to the keypoint type
dimension.
batch_offsets: a float tensor of shape [num_total_instances, 2] holding the
expected y and x offset of each box in the output space.
batch_weights: a float tensor of shape [num_total_instances] indicating the
weight of each prediction.
Note that num_total_instances = batch_size * num_instances * num_keypoints
Raises:
NotImplementedError: currently the object center coordinates need to be
computed from groundtruth bounding boxes. The functionality of
generating the object center coordinates from keypoints is not
implemented yet.
"""
batch_indices = []
batch_offsets = []
batch_weights = []
batch_size = len(gt_keypoints_list)
if gt_keypoints_weights_list is None:
gt_keypoints_weights_list = [None] * batch_size
if gt_boxes_list is None:
gt_boxes_list = [None] * batch_size
if gt_weights_list is None:
gt_weights_list = [None] * len(gt_classes_list)
for i, (keypoints, classes, boxes, kp_weights, weights) in enumerate(
zip(gt_keypoints_list, gt_classes_list,
gt_boxes_list, gt_keypoints_weights_list, gt_weights_list)):
keypoints_absolute, kp_weights = self._preprocess_keypoints_and_weights(
out_height=height // self._stride,
out_width=width // self._stride,
keypoints=keypoints,
class_onehot=classes,
class_weights=weights,
keypoint_weights=kp_weights)
num_instances, num_keypoints, _ = (
shape_utils.combined_static_and_dynamic_shape(keypoints_absolute))
# If boxes are provided, compute the joint center from it.
if boxes is not None:
# Compute joint center from boxes.
boxes = box_list.BoxList(boxes)
boxes = box_list_ops.to_absolute_coordinates(boxes,
height // self._stride,
width // self._stride)
y_center, x_center, _, _ = boxes.get_center_coordinates_and_sizes()
else:
# TODO(yuhuic): Add the logic to generate object centers from keypoints.
raise NotImplementedError((
'The functionality of generating object centers from keypoints is'
' not implemented yet. Please provide groundtruth bounding boxes.'
))
# Tile the yx center coordinates to be the same shape as keypoints.
y_center_tiled = tf.tile(
tf.reshape(y_center, shape=[num_instances, 1]),
multiples=[1, num_keypoints])
x_center_tiled = tf.tile(
tf.reshape(x_center, shape=[num_instances, 1]),
multiples=[1, num_keypoints])
# [num_instance * num_keypoints, num_neighbors]
(y_source_neighbors, x_source_neighbors,
valid_sources) = ta_utils.get_surrounding_grids(
height // self._stride, width // self._stride,
tf.keras.backend.flatten(y_center_tiled),
tf.keras.backend.flatten(x_center_tiled), self._peak_radius)
_, num_neighbors = shape_utils.combined_static_and_dynamic_shape(
y_source_neighbors)
valid_keypoints = tf.cast(
valid_sources, dtype=tf.float32) * tf.stack(
[tf.keras.backend.flatten(kp_weights)] * num_neighbors, axis=-1)
# Compute the offsets and indices of the box centers. Shape:
# offsets: [num_instances * num_keypoints, 2]
# indices: [num_instances * num_keypoints, 2]
(offsets, indices) = ta_utils.compute_floor_offsets_with_indices(
y_source=y_source_neighbors,
x_source=x_source_neighbors,
y_target=tf.keras.backend.flatten(keypoints_absolute[:, :, 0]),
x_target=tf.keras.backend.flatten(keypoints_absolute[:, :, 1]))
# Reshape to:
# offsets: [num_instances * num_keypoints * num_neighbors, 2]
# indices: [num_instances * num_keypoints * num_neighbors, 2]
offsets = tf.reshape(offsets, [-1, 2])
indices = tf.reshape(indices, [-1, 2])
# keypoint type tensor: [num_instances, num_keypoints, num_neighbors].
tiled_keypoint_types = self._get_keypoint_types(
num_instances, num_keypoints, num_neighbors)
batch_index = tf.fill(
[num_instances * num_keypoints * num_neighbors, 1], i)
batch_indices.append(
tf.concat([batch_index, indices,
tf.reshape(tiled_keypoint_types, [-1, 1])], axis=1))
batch_offsets.append(offsets)
batch_weights.append(tf.keras.backend.flatten(valid_keypoints))
# Concatenate the tensors in the batch in the first dimension:
# shape: [batch_size * num_instances * num_keypoints, 4]
batch_indices = tf.concat(batch_indices, axis=0)
# shape: [batch_size * num_instances * num_keypoints]
batch_weights = tf.concat(batch_weights, axis=0)
# shape: [batch_size * num_instances * num_keypoints, 2]
batch_offsets = tf.concat(batch_offsets, axis=0)
return (batch_indices, batch_offsets, batch_weights)
class CenterNetMaskTargetAssigner(object):
"""Wrapper to compute targets for segmentation masks."""
def __init__(self, stride):
self._stride = stride
def assign_segmentation_targets(
self, gt_masks_list, gt_classes_list,
mask_resize_method=ResizeMethod.BILINEAR):
"""Computes the segmentation targets.
This utility produces a semantic segmentation mask for each class, starting
with whole image instance segmentation masks. Effectively, each per-class
segmentation target is the union of all masks from that class.
Args:
gt_masks_list: A list of float tensors with shape [num_boxes,
input_height, input_width] with values in {0, 1} representing instance
masks for each object.
gt_classes_list: A list of float tensors with shape [num_boxes,
num_classes] representing the one-hot encoded class labels for each box
in the gt_masks_list.
mask_resize_method: A `tf.compat.v2.image.ResizeMethod`. The method to use
when resizing masks from input resolution to output resolution.
Returns:
segmentation_targets: A float tensor of size [batch_size, output_height,
output_width, num_classes] representing the class of each location in
the output space.
"""
# TODO(ronnyvotel): Handle groundtruth weights.
_, num_classes = shape_utils.combined_static_and_dynamic_shape(
gt_classes_list[0])
_, input_height, input_width = (
shape_utils.combined_static_and_dynamic_shape(gt_masks_list[0]))
output_height = input_height // self._stride
output_width = input_width // self._stride
segmentation_targets_list = []
for gt_masks, gt_classes in zip(gt_masks_list, gt_classes_list):
# Resize segmentation masks to conform to output dimensions. Use TF2
# image resize because TF1's version is buggy:
# https://yaqs.corp.google.com/eng/q/4970450458378240
gt_masks = tf2.image.resize(
gt_masks[:, :, :, tf.newaxis],
size=(output_height, output_width),
method=mask_resize_method)
gt_classes_reshaped = tf.reshape(gt_classes, [-1, 1, 1, num_classes])
# Shape: [h, w, num_classes].
segmentations_for_image = tf.reduce_max(
gt_masks * gt_classes_reshaped, axis=0)
segmentation_targets_list.append(segmentations_for_image)
segmentation_target = tf.stack(segmentation_targets_list, axis=0)
return segmentation_target
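# Usage sketch (shapes are illustrative): two full-image instance masks on a
# 32 x 32 input with stride 4 collapse into an 8 x 8 per-class map.
mask_assigner = CenterNetMaskTargetAssigner(stride=4)
example_masks = [tf.ones([2, 32, 32], dtype=tf.float32)]
example_mask_classes = [tf.one_hot([0, 2], depth=3)]
segmentation_targets = mask_assigner.assign_segmentation_targets(
    example_masks, example_mask_classes)                        # [1, 8, 8, 3]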
......@@ -24,9 +24,9 @@ from object_detection.core import region_similarity_calculator
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner as targetassigner
from object_detection.matchers import argmax_matcher
from object_detection.matchers import bipartite_matcher
from object_detection.utils import np_box_ops
from object_detection.utils import test_case
from object_detection.utils import tf_version
class TargetAssignerTest(test_case.TestCase):
......@@ -439,7 +439,7 @@ class TargetAssignerTest(test_case.TestCase):
def test_raises_error_on_incompatible_groundtruth_boxes_and_labels(self):
similarity_calc = region_similarity_calculator.NegSqDistSimilarity()
matcher = bipartite_matcher.GreedyBipartiteMatcher()
matcher = argmax_matcher.ArgMaxMatcher(0.5)
box_coder = mean_stddev_box_coder.MeanStddevBoxCoder()
unmatched_class_label = tf.constant([1, 0, 0, 0, 0, 0, 0], tf.float32)
target_assigner = targetassigner.TargetAssigner(
......@@ -469,7 +469,7 @@ class TargetAssignerTest(test_case.TestCase):
def test_raises_error_on_invalid_groundtruth_labels(self):
similarity_calc = region_similarity_calculator.NegSqDistSimilarity()
matcher = bipartite_matcher.GreedyBipartiteMatcher()
matcher = argmax_matcher.ArgMaxMatcher(0.5)
box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=1.0)
unmatched_class_label = tf.constant([[0, 0], [0, 0], [0, 0]], tf.float32)
target_assigner = targetassigner.TargetAssigner(
......@@ -1191,7 +1191,7 @@ class BatchTargetAssignConfidencesTest(test_case.TestCase):
])
class CreateTargetAssignerTest(tf.test.TestCase):
class CreateTargetAssignerTest(test_case.TestCase):
def test_create_target_assigner(self):
"""Tests that named constructor gives working target assigners.
......@@ -1202,9 +1202,10 @@ class CreateTargetAssignerTest(tf.test.TestCase):
groundtruth = box_list.BoxList(tf.constant(corners))
priors = box_list.BoxList(tf.constant(corners))
multibox_ta = (targetassigner
.create_target_assigner('Multibox', stage='proposal'))
multibox_ta.assign(priors, groundtruth)
if tf_version.is_tf1():
multibox_ta = (targetassigner
.create_target_assigner('Multibox', stage='proposal'))
multibox_ta.assign(priors, groundtruth)
# No tests on output, as that may vary arbitrarily as new target assigners
# are added. As long as it is constructed correctly and runs without errors,
# tests on the individual assigners cover correctness of the assignments.
......@@ -1229,6 +1230,681 @@ class CreateTargetAssignerTest(tf.test.TestCase):
stage='invalid_stage')
def _array_argmax(array):
return np.unravel_index(np.argmax(array), array.shape)
class CenterNetCenterHeatmapTargetAssignerTest(test_case.TestCase):
def setUp(self):
super(CenterNetCenterHeatmapTargetAssignerTest, self).setUp()
self._box_center = [0.0, 0.0, 1.0, 1.0]
self._box_center_small = [0.25, 0.25, 0.75, 0.75]
self._box_lower_left = [0.5, 0.0, 1.0, 0.5]
self._box_center_offset = [0.1, 0.05, 1.0, 1.0]
self._box_odd_coordinates = [0.1625, 0.2125, 0.5625, 0.9625]
def test_center_location(self):
"""Test that the centers are at the correct location."""
def graph_fn():
box_batch = [tf.constant([self._box_center, self._box_lower_left])]
classes = [
tf.one_hot([0, 1], depth=4),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(4)
targets = assigner.assign_center_targets_from_boxes(80, 80, box_batch,
classes)
return targets
targets = self.execute(graph_fn, [])
self.assertEqual((10, 10), _array_argmax(targets[0, :, :, 0]))
self.assertAlmostEqual(1.0, targets[0, 10, 10, 0])
self.assertEqual((15, 5), _array_argmax(targets[0, :, :, 1]))
self.assertAlmostEqual(1.0, targets[0, 15, 5, 1])
def test_center_batch_shape(self):
"""Test that the shape of the target for a batch is correct."""
def graph_fn():
box_batch = [
tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_center]),
tf.constant([self._box_center_small]),
]
classes = [
tf.one_hot([0, 1], depth=4),
tf.one_hot([2], depth=4),
tf.one_hot([3], depth=4),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(4)
targets = assigner.assign_center_targets_from_boxes(80, 80, box_batch,
classes)
return targets
targets = self.execute(graph_fn, [])
self.assertEqual((3, 20, 20, 4), targets.shape)
def test_center_overlap_maximum(self):
"""Test that when boxes overlap we, are computing the maximum."""
def graph_fn():
box_batch = [
tf.constant([
self._box_center, self._box_center_offset, self._box_center,
self._box_center_offset
])
]
classes = [
tf.one_hot([0, 0, 1, 2], depth=4),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(4)
targets = assigner.assign_center_targets_from_boxes(80, 80, box_batch,
classes)
return targets
targets = self.execute(graph_fn, [])
class0_targets = targets[0, :, :, 0]
class1_targets = targets[0, :, :, 1]
class2_targets = targets[0, :, :, 2]
np.testing.assert_allclose(class0_targets,
np.maximum(class1_targets, class2_targets))
def test_size_blur(self):
"""Test that the heatmap of a larger box is more blurred."""
def graph_fn():
box_batch = [tf.constant([self._box_center, self._box_center_small])]
classes = [
tf.one_hot([0, 1], depth=4),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(4)
targets = assigner.assign_center_targets_from_boxes(80, 80, box_batch,
classes)
return targets
targets = self.execute(graph_fn, [])
self.assertGreater(
np.count_nonzero(targets[:, :, :, 0]),
np.count_nonzero(targets[:, :, :, 1]))
def test_weights(self):
"""Test that the weights correctly ignore ground truth."""
def graph1_fn():
box_batch = [
tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_center]),
tf.constant([self._box_center_small]),
]
classes = [
tf.one_hot([0, 1], depth=4),
tf.one_hot([2], depth=4),
tf.one_hot([3], depth=4),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(4)
targets = assigner.assign_center_targets_from_boxes(80, 80, box_batch,
classes)
return targets
targets = self.execute(graph1_fn, [])
self.assertAlmostEqual(1.0, targets[0, :, :, 0].max())
self.assertAlmostEqual(1.0, targets[0, :, :, 1].max())
self.assertAlmostEqual(1.0, targets[1, :, :, 2].max())
self.assertAlmostEqual(1.0, targets[2, :, :, 3].max())
self.assertAlmostEqual(0.0, targets[0, :, :, [2, 3]].max())
self.assertAlmostEqual(0.0, targets[1, :, :, [0, 1, 3]].max())
self.assertAlmostEqual(0.0, targets[2, :, :, :3].max())
def graph2_fn():
weights = [
tf.constant([0., 1.]),
tf.constant([1.]),
tf.constant([1.]),
]
box_batch = [
tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_center]),
tf.constant([self._box_center_small]),
]
classes = [
tf.one_hot([0, 1], depth=4),
tf.one_hot([2], depth=4),
tf.one_hot([3], depth=4),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(4)
targets = assigner.assign_center_targets_from_boxes(80, 80, box_batch,
classes,
weights)
return targets
targets = self.execute(graph2_fn, [])
self.assertAlmostEqual(1.0, targets[0, :, :, 1].max())
self.assertAlmostEqual(1.0, targets[1, :, :, 2].max())
self.assertAlmostEqual(1.0, targets[2, :, :, 3].max())
self.assertAlmostEqual(0.0, targets[0, :, :, [0, 2, 3]].max())
self.assertAlmostEqual(0.0, targets[1, :, :, [0, 1, 3]].max())
self.assertAlmostEqual(0.0, targets[2, :, :, :3].max())
def test_low_overlap(self):
def graph1_fn():
box_batch = [tf.constant([self._box_center])]
classes = [
tf.one_hot([0], depth=2),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
4, min_overlap=0.1)
targets_low_overlap = assigner.assign_center_targets_from_boxes(
80, 80, box_batch, classes)
return targets_low_overlap
targets_low_overlap = self.execute(graph1_fn, [])
self.assertLess(1, np.count_nonzero(targets_low_overlap))
def graph2_fn():
box_batch = [tf.constant([self._box_center])]
classes = [
tf.one_hot([0], depth=2),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
4, min_overlap=0.6)
targets_medium_overlap = assigner.assign_center_targets_from_boxes(
80, 80, box_batch, classes)
return targets_medium_overlap
targets_medium_overlap = self.execute(graph2_fn, [])
self.assertLess(1, np.count_nonzero(targets_medium_overlap))
def graph3_fn():
box_batch = [tf.constant([self._box_center])]
classes = [
tf.one_hot([0], depth=2),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
4, min_overlap=0.99)
targets_high_overlap = assigner.assign_center_targets_from_boxes(
80, 80, box_batch, classes)
return targets_high_overlap
targets_high_overlap = self.execute(graph3_fn, [])
self.assertTrue(np.all(targets_low_overlap >= targets_medium_overlap))
self.assertTrue(np.all(targets_medium_overlap >= targets_high_overlap))
def test_empty_box_list(self):
"""Test that an empty box list gives an all 0 heatmap."""
def graph_fn():
box_batch = [
tf.zeros((0, 4), dtype=tf.float32),
]
classes = [
tf.zeros((0, 5), dtype=tf.float32),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
4, min_overlap=0.1)
targets = assigner.assign_center_targets_from_boxes(
80, 80, box_batch, classes)
return targets
targets = self.execute(graph_fn, [])
np.testing.assert_allclose(targets, 0.)
class CenterNetBoxTargetAssignerTest(test_case.TestCase):
def setUp(self):
super(CenterNetBoxTargetAssignerTest, self).setUp()
self._box_center = [0.0, 0.0, 1.0, 1.0]
self._box_center_small = [0.25, 0.25, 0.75, 0.75]
self._box_lower_left = [0.5, 0.0, 1.0, 0.5]
self._box_center_offset = [0.1, 0.05, 1.0, 1.0]
self._box_odd_coordinates = [0.1625, 0.2125, 0.5625, 0.9625]
def test_max_distance_for_overlap(self):
"""Test that the distance ensures the IoU with random boxes."""
# TODO(vighneshb) remove this after the `_smallest_positive_root`
# function is fixed.
self.skipTest(('Skipping test because we are using an incorrect version of'
' the `max_distance_for_overlap` function to reproduce'
' results.'))
rng = np.random.RandomState(0)
n_samples = 100
width = rng.uniform(1, 100, size=n_samples)
height = rng.uniform(1, 100, size=n_samples)
min_iou = rng.uniform(0.1, 1.0, size=n_samples)
def graph_fn():
max_dist = targetassigner.max_distance_for_overlap(height, width, min_iou)
return max_dist
max_dist = self.execute(graph_fn, [])
xmin1 = np.zeros(n_samples)
ymin1 = np.zeros(n_samples)
xmax1 = np.zeros(n_samples) + width
ymax1 = np.zeros(n_samples) + height
xmin2 = max_dist * np.cos(rng.uniform(0, 2 * np.pi))
ymin2 = max_dist * np.sin(rng.uniform(0, 2 * np.pi))
xmax2 = width + max_dist * np.cos(rng.uniform(0, 2 * np.pi))
ymax2 = height + max_dist * np.sin(rng.uniform(0, 2 * np.pi))
boxes1 = np.vstack([ymin1, xmin1, ymax1, xmax1]).T
boxes2 = np.vstack([ymin2, xmin2, ymax2, xmax2]).T
iou = np.diag(np_box_ops.iou(boxes1, boxes2))
self.assertTrue(np.all(iou >= min_iou))
def test_max_distance_for_overlap_centernet(self):
"""Test the version of the function used in the CenterNet paper."""
def graph_fn():
distance = targetassigner.max_distance_for_overlap(10, 5, 0.5)
return distance
distance = self.execute(graph_fn, [])
self.assertAlmostEqual(2.807764064, distance)
def test_assign_size_and_offset_targets(self):
"""Test the assign_size_and_offset_targets function."""
def graph_fn():
box_batch = [
tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_center_offset]),
tf.constant([self._box_center_small, self._box_odd_coordinates]),
]
assigner = targetassigner.CenterNetBoxTargetAssigner(4)
indices, hw, yx_offset, weights = assigner.assign_size_and_offset_targets(
80, 80, box_batch)
return indices, hw, yx_offset, weights
indices, hw, yx_offset, weights = self.execute(graph_fn, [])
self.assertEqual(indices.shape, (5, 3))
self.assertEqual(hw.shape, (5, 2))
self.assertEqual(yx_offset.shape, (5, 2))
self.assertEqual(weights.shape, (5,))
np.testing.assert_array_equal(
indices,
[[0, 10, 10], [0, 15, 5], [1, 11, 10], [2, 10, 10], [2, 7, 11]])
np.testing.assert_array_equal(
hw, [[20, 20], [10, 10], [18, 19], [10, 10], [8, 15]])
np.testing.assert_array_equal(
yx_offset, [[0, 0], [0, 0], [0, 0.5], [0, 0], [0.25, 0.75]])
np.testing.assert_array_equal(weights, 1)
def test_assign_size_and_offset_targets_weights(self):
"""Test the assign_size_and_offset_targets function with box weights."""
def graph_fn():
box_batch = [
tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_lower_left, self._box_center_small]),
tf.constant([self._box_center_small, self._box_odd_coordinates]),
]
cn_assigner = targetassigner.CenterNetBoxTargetAssigner(4)
weights_batch = [
tf.constant([0.0, 1.0]),
tf.constant([1.0, 1.0]),
tf.constant([0.0, 0.0])
]
indices, hw, yx_offset, weights = cn_assigner.assign_size_and_offset_targets(
80, 80, box_batch, weights_batch)
return indices, hw, yx_offset, weights
indices, hw, yx_offset, weights = self.execute(graph_fn, [])
self.assertEqual(indices.shape, (6, 3))
self.assertEqual(hw.shape, (6, 2))
self.assertEqual(yx_offset.shape, (6, 2))
self.assertEqual(weights.shape, (6,))
np.testing.assert_array_equal(indices,
[[0, 10, 10], [0, 15, 5], [1, 15, 5],
[1, 10, 10], [2, 10, 10], [2, 7, 11]])
np.testing.assert_array_equal(
hw, [[20, 20], [10, 10], [10, 10], [10, 10], [10, 10], [8, 15]])
np.testing.assert_array_equal(
yx_offset, [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0.25, 0.75]])
np.testing.assert_array_equal(weights, [0, 1, 1, 1, 0, 0])
def test_get_batch_predictions_from_indices(self):
"""Test the get_batch_predictions_from_indices function.
This test verifies that the indices returned by
assign_size_and_offset_targets function work as expected with a predicted
tensor.
"""
def graph_fn():
box_batch = [
tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_center_small, self._box_odd_coordinates]),
]
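      # The (y, x) locations below are the stride-4 output cells that contain
      # each box's center (e.g. the full box centered at (80, 40) maps to cell
      # (20, 10)), so the gathered predictions should come back in box order.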
pred_array = np.ones((2, 40, 20, 2), dtype=np.int32) * -1000
pred_array[0, 20, 10] = [1, 2]
pred_array[0, 30, 5] = [3, 4]
pred_array[1, 20, 10] = [5, 6]
pred_array[1, 14, 11] = [7, 8]
pred_tensor = tf.constant(pred_array)
cn_assigner = targetassigner.CenterNetBoxTargetAssigner(4)
indices, _, _, _ = cn_assigner.assign_size_and_offset_targets(
160, 80, box_batch)
preds = targetassigner.get_batch_predictions_from_indices(
pred_tensor, indices)
return preds
preds = self.execute(graph_fn, [])
np.testing.assert_array_equal(preds, [[1, 2], [3, 4], [5, 6], [7, 8]])
class CenterNetKeypointTargetAssignerTest(test_case.TestCase):
def test_keypoint_heatmap_targets(self):
def graph_fn():
gt_classes_list = [
tf.one_hot([0, 1, 0, 1], depth=4),
]
coordinates = tf.expand_dims(
tf.constant(
np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
[float('nan'), 0.7, float('nan'), 0.9, 1.0],
[0.4, 0.1, 0.4, 0.2, 0.1],
[float('nan'), 0.1, 0.5, 0.7, 0.6]]),
dtype=tf.float32),
axis=2)
gt_keypoints_list = [tf.concat([coordinates, coordinates], axis=2)]
gt_boxes_list = [
tf.constant(
np.array([[0.0, 0.0, 0.3, 0.3],
[0.0, 0.0, 0.5, 0.5],
[0.0, 0.0, 0.5, 0.5],
[0.0, 0.0, 1.0, 1.0]]),
dtype=tf.float32)
]
cn_assigner = targetassigner.CenterNetKeypointTargetAssigner(
stride=4,
class_id=1,
keypoint_indices=[0, 2])
(targets, num_instances_batch,
valid_mask) = cn_assigner.assign_keypoint_heatmap_targets(
120,
80,
gt_keypoints_list,
gt_classes_list,
gt_boxes_list=gt_boxes_list)
return targets, num_instances_batch, valid_mask
targets, num_instances_batch, valid_mask = self.execute(graph_fn, [])
# keypoint (0.5, 0.5) is selected. The peak is expected to appear at the
# center of the image.
self.assertEqual((15, 10), _array_argmax(targets[0, :, :, 1]))
self.assertAlmostEqual(1.0, targets[0, 15, 10, 1])
    # No peak for the first keypoint channel since its keypoint values are NaN.
self.assertAlmostEqual(0.0, targets[0, 15, 10, 0])
# Verify the output heatmap shape.
self.assertAllEqual([1, 30, 20, 2], targets.shape)
# Verify the number of instances is correct.
np.testing.assert_array_almost_equal([[0, 1]],
num_instances_batch)
    # When calling the function, we specify the class id to be 1 (the 2nd and
    # 4th instances) and the keypoint indices to be [0, 2]. The 2nd instance
    # belongs to the target class but has no valid keypoints, so the region of
    # its bounding box (0.0, 0.0, 0.5, 0.5) should be blacked out, which maps
    # to (0, 0, 15, 10) in the absolute output space.
self.assertAlmostEqual(np.sum(valid_mask[:, 0:16, 0:11]), 0.0)
# All other values are 1.0 so the sum is: 30 * 20 - 16 * 11 = 424.
self.assertAlmostEqual(np.sum(valid_mask), 424.0)
def test_assign_keypoints_offset_targets(self):
def graph_fn():
gt_classes_list = [
tf.one_hot([0, 1, 0, 1], depth=4),
]
coordinates = tf.expand_dims(
tf.constant(
np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
[float('nan'), 0.7, float('nan'), 0.9, 0.4],
[0.4, 0.1, 0.4, 0.2, 0.0],
[float('nan'), 0.0, 0.12, 0.7, 0.4]]),
dtype=tf.float32),
axis=2)
gt_keypoints_list = [tf.concat([coordinates, coordinates], axis=2)]
cn_assigner = targetassigner.CenterNetKeypointTargetAssigner(
stride=4,
class_id=1,
keypoint_indices=[0, 2])
(indices, offsets, weights) = cn_assigner.assign_keypoints_offset_targets(
height=120,
width=80,
gt_keypoints_list=gt_keypoints_list,
gt_classes_list=gt_classes_list)
return indices, weights, offsets
indices, weights, offsets = self.execute(graph_fn, [])
# Only the last element has positive weight.
np.testing.assert_array_almost_equal(
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], weights)
# Validate the last element's indices and offsets.
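    # The only valid keypoint is 0.12 (4th instance, keypoint index 2):
    # 0.12 * 120 / 4 = 3.6 and 0.12 * 80 / 4 = 2.4, giving cell (3, 2) with
    # offsets (0.6, 0.4).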
np.testing.assert_array_equal([0, 3, 2], indices[7, :])
np.testing.assert_array_almost_equal([0.6, 0.4], offsets[7, :])
def test_assign_keypoints_offset_targets_radius(self):
def graph_fn():
gt_classes_list = [
tf.one_hot([0, 1, 0, 1], depth=4),
]
coordinates = tf.expand_dims(
tf.constant(
np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
[float('nan'), 0.7, float('nan'), 0.9, 0.4],
[0.4, 0.1, 0.4, 0.2, 0.0],
[float('nan'), 0.0, 0.12, 0.7, 0.4]]),
dtype=tf.float32),
axis=2)
gt_keypoints_list = [tf.concat([coordinates, coordinates], axis=2)]
cn_assigner = targetassigner.CenterNetKeypointTargetAssigner(
stride=4,
class_id=1,
keypoint_indices=[0, 2],
peak_radius=1,
per_keypoint_offset=True)
(indices, offsets, weights) = cn_assigner.assign_keypoints_offset_targets(
height=120,
width=80,
gt_keypoints_list=gt_keypoints_list,
gt_classes_list=gt_classes_list)
return indices, weights, offsets
indices, weights, offsets = self.execute(graph_fn, [])
    # There are a total of 8 * 5 (neighbors) = 40 targets.
self.assertAllEqual(indices.shape, [40, 4])
self.assertAllEqual(offsets.shape, [40, 2])
self.assertAllEqual(weights.shape, [40])
    # Only the last 5 elements (radius 1 generates 5 valid points) have
    # positive weights.
np.testing.assert_array_almost_equal([
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0
], weights)
# Validate the last element's (with neighbors) indices and offsets.
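    # Each neighbor's offset points back to the true keypoint location
    # (3.6, 2.4); e.g. cell (2, 2) stores (3.6 - 2, 2.4 - 2) = (1.6, 0.4).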
np.testing.assert_array_equal([0, 2, 2, 1], indices[35, :])
np.testing.assert_array_equal([0, 3, 1, 1], indices[36, :])
np.testing.assert_array_equal([0, 3, 2, 1], indices[37, :])
np.testing.assert_array_equal([0, 3, 3, 1], indices[38, :])
np.testing.assert_array_equal([0, 4, 2, 1], indices[39, :])
np.testing.assert_array_almost_equal([1.6, 0.4], offsets[35, :])
np.testing.assert_array_almost_equal([0.6, 1.4], offsets[36, :])
np.testing.assert_array_almost_equal([0.6, 0.4], offsets[37, :])
np.testing.assert_array_almost_equal([0.6, -0.6], offsets[38, :])
np.testing.assert_array_almost_equal([-0.4, 0.4], offsets[39, :])
def test_assign_joint_regression_targets(self):
def graph_fn():
gt_boxes_list = [
tf.constant(
np.array([[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 1.0, 1.0]]),
dtype=tf.float32)
]
gt_classes_list = [
tf.one_hot([0, 1, 0, 1], depth=4),
]
coordinates = tf.expand_dims(
tf.constant(
np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
[float('nan'), 0.7, float('nan'), 0.9, 0.4],
[0.4, 0.1, 0.4, 0.2, 0.0],
[float('nan'), 0.0, 0.12, 0.7, 0.4]]),
dtype=tf.float32),
axis=2)
gt_keypoints_list = [tf.concat([coordinates, coordinates], axis=2)]
cn_assigner = targetassigner.CenterNetKeypointTargetAssigner(
stride=4,
class_id=1,
keypoint_indices=[0, 2])
(indices, offsets, weights) = cn_assigner.assign_joint_regression_targets(
height=120,
width=80,
gt_keypoints_list=gt_keypoints_list,
gt_classes_list=gt_classes_list,
gt_boxes_list=gt_boxes_list)
return indices, offsets, weights
indices, offsets, weights = self.execute(graph_fn, [])
np.testing.assert_array_almost_equal(
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], weights)
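    # The regression target is stored at the object's center cell (15, 10) and
    # points to the keypoint at (3.6, 2.4), i.e. offsets (-11.4, -7.6).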
np.testing.assert_array_equal([0, 15, 10, 1], indices[7, :])
np.testing.assert_array_almost_equal([-11.4, -7.6], offsets[7, :])
def test_assign_joint_regression_targets_radius(self):
def graph_fn():
gt_boxes_list = [
tf.constant(
np.array([[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 1.0, 1.0]]),
dtype=tf.float32)
]
gt_classes_list = [
tf.one_hot([0, 1, 0, 1], depth=4),
]
coordinates = tf.expand_dims(
tf.constant(
np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
[float('nan'), 0.7, float('nan'), 0.9, 0.4],
[0.4, 0.1, 0.4, 0.2, 0.0],
[float('nan'), 0.0, 0.12, 0.7, 0.4]]),
dtype=tf.float32),
axis=2)
gt_keypoints_list = [tf.concat([coordinates, coordinates], axis=2)]
cn_assigner = targetassigner.CenterNetKeypointTargetAssigner(
stride=4,
class_id=1,
keypoint_indices=[0, 2],
peak_radius=1)
(indices, offsets, weights) = cn_assigner.assign_joint_regression_targets(
height=120,
width=80,
gt_keypoints_list=gt_keypoints_list,
gt_classes_list=gt_classes_list,
gt_boxes_list=gt_boxes_list)
return indices, offsets, weights
indices, offsets, weights = self.execute(graph_fn, [])
    # There are a total of 8 * 5 (neighbors) = 40 targets.
self.assertAllEqual(indices.shape, [40, 4])
self.assertAllEqual(offsets.shape, [40, 2])
self.assertAllEqual(weights.shape, [40])
    # Only the last 5 elements (radius 1 generates 5 valid points) have
    # positive weights.
np.testing.assert_array_almost_equal([
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0
], weights)
# Test the values of the indices and offsets of the last 5 elements.
np.testing.assert_array_equal([0, 14, 10, 1], indices[35, :])
np.testing.assert_array_equal([0, 15, 9, 1], indices[36, :])
np.testing.assert_array_equal([0, 15, 10, 1], indices[37, :])
np.testing.assert_array_equal([0, 15, 11, 1], indices[38, :])
np.testing.assert_array_equal([0, 16, 10, 1], indices[39, :])
np.testing.assert_array_almost_equal([-10.4, -7.6], offsets[35, :])
np.testing.assert_array_almost_equal([-11.4, -6.6], offsets[36, :])
np.testing.assert_array_almost_equal([-11.4, -7.6], offsets[37, :])
np.testing.assert_array_almost_equal([-11.4, -8.6], offsets[38, :])
np.testing.assert_array_almost_equal([-12.4, -7.6], offsets[39, :])
class CenterNetMaskTargetAssignerTest(test_case.TestCase):
def test_assign_segmentation_targets(self):
def graph_fn():
gt_masks_list = [
# Example 0.
tf.constant([
[
[1., 0., 0., 0.],
[1., 1., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
],
[
[0., 0., 0., 0.],
[0., 0., 0., 1.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
],
[
[1., 1., 0., 0.],
[1., 1., 0., 0.],
[0., 0., 1., 1.],
[0., 0., 1., 1.],
]
], dtype=tf.float32),
# Example 1.
tf.constant([
[
[1., 1., 0., 1.],
[1., 1., 1., 1.],
[0., 0., 1., 1.],
[0., 0., 0., 1.],
],
[
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[1., 1., 0., 0.],
[1., 1., 0., 0.],
],
], dtype=tf.float32),
]
gt_classes_list = [
# Example 0.
tf.constant([[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.]], dtype=tf.float32),
# Example 1.
tf.constant([[0., 1., 0.],
[0., 1., 0.]], dtype=tf.float32)
]
cn_assigner = targetassigner.CenterNetMaskTargetAssigner(stride=2)
segmentation_target = cn_assigner.assign_segmentation_targets(
gt_masks_list=gt_masks_list,
gt_classes_list=gt_classes_list,
mask_resize_method=targetassigner.ResizeMethod.NEAREST_NEIGHBOR)
return segmentation_target
segmentation_target = self.execute(graph_fn, [])
expected_seg_target = np.array([
# Example 0 [[class 0, class 1], [background, class 0]]
[[[1, 0, 0], [0, 1, 0]],
[[0, 0, 0], [1, 0, 0]]],
# Example 1 [[class 1, class 1], [class 1, class 1]]
[[[0, 1, 0], [0, 1, 0]],
[[0, 1, 0], [0, 1, 0]]],
], dtype=np.float32)
np.testing.assert_array_almost_equal(
expected_seg_target, segmentation_target)
if __name__ == '__main__':
  tf.enable_v2_behavior()
  tf.test.main()
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A Beam job to add contextual memory banks to tf.Examples.
This tool groups images containing bounding boxes and embedded context features
by a key, either `image/location` or `image/seq_id`, and time horizon,
then uses these groups to build up a contextual memory bank from the embedded
context features from each image in the group and adds that context to the
output tf.Examples for each image in the group.
Steps to generate a dataset with context from one with bounding boxes and
embedded context features:
1. Use object_detection/export_inference_graph.py to get a `saved_model` for
inference. The input node must accept a tf.Example proto.
2. Run this tool with `saved_model` from step 1 and a TFRecord of tf.Example
protos containing images, bounding boxes, and embedded context features.
The context features can be added to tf.Examples using
generate_embedding_data.py.
Example Usage:
--------------
python add_context_to_examples.py \
--input_tfrecord path/to/input_tfrecords* \
--output_tfrecord path/to/output_tfrecords \
--sequence_key image/location \
--time_horizon month
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import datetime
import io
import itertools
import json
import os
from absl import app
from absl import flags
import apache_beam as beam
import numpy as np
import PIL.Image
import six
import tensorflow as tf
from apache_beam import runners
flags.DEFINE_string('input_tfrecord', None, 'TFRecord containing images in '
                    'tf.Example format for object detection, with bounding '
                    'boxes and contextual feature embeddings.')
flags.DEFINE_string('output_tfrecord', None,
'TFRecord containing images in tf.Example format, with '
'added contextual memory banks.')
flags.DEFINE_string('sequence_key', None, 'Key to use when grouping sequences: '
'so far supports `image/seq_id` and `image/location`.')
flags.DEFINE_string('time_horizon', None, 'What time horizon to use when '
'splitting the data, if any. Options are: `year`, `month`,'
' `week`, `day `, `hour`, `minute`, `None`.')
flags.DEFINE_integer('subsample_context_features_rate', 0, 'Whether to '
'subsample the context_features, and if so how many to '
'sample. If the rate is set to X, it will sample context '
'from 1 out of every X images. Default is sampling from '
'every image, which is X=0.')
flags.DEFINE_boolean('reduce_image_size', True, 'downsamples images to '
                     'have longest side max_image_dimension, maintaining '
                     'aspect ratio')
flags.DEFINE_integer('max_image_dimension', 1024, 'sets max image dimension')
flags.DEFINE_boolean('add_context_features', True, 'adds a memory bank of '
                     'embeddings to each clip')
flags.DEFINE_boolean('sorted_image_ids', True, 'whether the image source_ids '
'are sortable to deal with date_captured tie-breaks')
flags.DEFINE_string('image_ids_to_keep', 'All', 'path to .json list of image '
                    'ids to keep, used for ground truth eval creation')
flags.DEFINE_boolean('keep_context_features_image_id_list', False, 'Whether or '
'not to keep a list of the image_ids corresponding to the '
'memory bank')
flags.DEFINE_boolean('keep_only_positives', False, 'Whether or not to '
'keep only positive boxes based on score')
flags.DEFINE_boolean('keep_only_positives_gt', False, 'Whether or not to '
'keep only positive boxes based on gt class')
flags.DEFINE_float('context_features_score_threshold', 0.7, 'What score '
'threshold to use for boxes in context_features')
flags.DEFINE_integer('max_num_elements_in_context_features', 2000, 'Sets max '
'num elements per memory bank')
flags.DEFINE_integer('num_shards', 0, 'Number of output shards.')
flags.DEFINE_string('output_type', 'tf_sequence_example', 'Output type, one of '
'`tf_example`, `tf_sequence_example`')
flags.DEFINE_integer('max_clip_length', None, 'Max length for sequence '
'example outputs.')
FLAGS = flags.FLAGS
DEFAULT_FEATURE_LENGTH = 2057
class ReKeyDataFn(beam.DoFn):
"""Re-keys tfrecords by sequence_key.
This Beam DoFn re-keys the tfrecords by a user-defined sequence_key
"""
def __init__(self, sequence_key, time_horizon,
reduce_image_size, max_image_dimension):
"""Initialization function.
Args:
sequence_key: A feature name to use as a key for grouping sequences.
Must point to a key of type bytes_list
time_horizon: What length of time to use to partition the data when
building the memory banks. Options: `year`, `month`, `week`, `day `,
`hour`, `minute`, None
reduce_image_size: Whether to reduce the sizes of the stored images.
max_image_dimension: maximum dimension of reduced images
"""
self._sequence_key = sequence_key
if time_horizon is None or time_horizon in {'year', 'month', 'week', 'day',
'hour', 'minute'}:
self._time_horizon = time_horizon
else:
raise ValueError('Time horizon not supported.')
self._reduce_image_size = reduce_image_size
self._max_image_dimension = max_image_dimension
self._session = None
self._num_examples_processed = beam.metrics.Metrics.counter(
'data_rekey', 'num_tf_examples_processed')
self._num_images_resized = beam.metrics.Metrics.counter(
'data_rekey', 'num_images_resized')
self._num_images_read = beam.metrics.Metrics.counter(
'data_rekey', 'num_images_read')
self._num_images_found = beam.metrics.Metrics.counter(
        'data_rekey', 'num_images_found')
self._num_got_shape = beam.metrics.Metrics.counter(
'data_rekey', 'num_images_got_shape')
self._num_images_found_size = beam.metrics.Metrics.counter(
'data_rekey', 'num_images_found_size')
self._num_examples_cleared = beam.metrics.Metrics.counter(
'data_rekey', 'num_examples_cleared')
self._num_examples_updated = beam.metrics.Metrics.counter(
'data_rekey', 'num_examples_updated')
def process(self, tfrecord_entry):
return self._rekey_examples(tfrecord_entry)
def _largest_size_at_most(self, height, width, largest_side):
"""Computes new shape with the largest side equal to `largest_side`.
Args:
height: an int indicating the current height.
width: an int indicating the current width.
largest_side: A python integer indicating the size of
the largest side after resize.
Returns:
new_height: an int indicating the new height.
new_width: an int indicating the new width.
"""
x_scale = float(largest_side) / float(width)
y_scale = float(largest_side) / float(height)
scale = min(x_scale, y_scale)
new_width = int(width * scale)
new_height = int(height * scale)
return new_height, new_width
def _resize_image(self, input_example):
"""Resizes the image within input_example and updates the height and width.
Args:
input_example: A tf.Example that we want to update to contain a resized
image.
Returns:
input_example: Updated tf.Example.
"""
original_image = copy.deepcopy(
input_example.features.feature['image/encoded'].bytes_list.value[0])
self._num_images_read.inc(1)
height = copy.deepcopy(
input_example.features.feature['image/height'].int64_list.value[0])
width = copy.deepcopy(
input_example.features.feature['image/width'].int64_list.value[0])
self._num_got_shape.inc(1)
new_height, new_width = self._largest_size_at_most(
height, width, self._max_image_dimension)
self._num_images_found_size.inc(1)
encoded_jpg_io = io.BytesIO(original_image)
image = PIL.Image.open(encoded_jpg_io)
resized_image = image.resize((new_width, new_height))
with io.BytesIO() as output:
resized_image.save(output, format='JPEG')
encoded_resized_image = output.getvalue()
self._num_images_resized.inc(1)
del input_example.features.feature['image/encoded'].bytes_list.value[:]
del input_example.features.feature['image/height'].int64_list.value[:]
del input_example.features.feature['image/width'].int64_list.value[:]
self._num_examples_cleared.inc(1)
input_example.features.feature['image/encoded'].bytes_list.value.extend(
[encoded_resized_image])
input_example.features.feature['image/height'].int64_list.value.extend(
[new_height])
input_example.features.feature['image/width'].int64_list.value.extend(
[new_width])
self._num_examples_updated.inc(1)
return input_example
def _rekey_examples(self, tfrecord_entry):
serialized_example = copy.deepcopy(tfrecord_entry)
input_example = tf.train.Example.FromString(serialized_example)
self._num_images_found.inc(1)
if self._reduce_image_size:
input_example = self._resize_image(input_example)
self._num_images_resized.inc(1)
new_key = input_example.features.feature[
self._sequence_key].bytes_list.value[0]
if self._time_horizon:
date_captured = datetime.datetime.strptime(
six.ensure_str(input_example.features.feature[
'image/date_captured'].bytes_list.value[0]), '%Y-%m-%d %H:%M:%S')
year = date_captured.year
month = date_captured.month
day = date_captured.day
week = np.floor(float(day) / float(7))
hour = date_captured.hour
minute = date_captured.minute
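      # For example, with time_horizon='month', an image captured on
      # 2020-01-31 under key b'loc_1' is re-keyed to b'loc_1/2020/1'.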
if self._time_horizon == 'year':
new_key = new_key + six.ensure_binary('/' + str(year))
elif self._time_horizon == 'month':
new_key = new_key + six.ensure_binary(
'/' + str(year) + '/' + str(month))
elif self._time_horizon == 'week':
new_key = new_key + six.ensure_binary(
'/' + str(year) + '/' + str(month) + '/' + str(week))
elif self._time_horizon == 'day':
new_key = new_key + six.ensure_binary(
'/' + str(year) + '/' + str(month) + '/' + str(day))
elif self._time_horizon == 'hour':
new_key = new_key + six.ensure_binary(
'/' + str(year) + '/' + str(month) + '/' + str(day) + '/' + (
str(hour)))
elif self._time_horizon == 'minute':
new_key = new_key + six.ensure_binary(
'/' + str(year) + '/' + str(month) + '/' + str(day) + '/' + (
str(hour) + '/' + str(minute)))
self._num_examples_processed.inc(1)
return [(new_key, input_example)]
class SortGroupedDataFn(beam.DoFn):
"""Sorts data within a keyed group.
This Beam DoFn sorts the grouped list of image examples by frame_num
"""
def __init__(self, sequence_key, sorted_image_ids,
max_num_elements_in_context_features):
"""Initialization function.
Args:
sequence_key: A feature name to use as a key for grouping sequences.
Must point to a key of type bytes_list
sorted_image_ids: Whether the image ids are sortable to use as sorting
tie-breakers
max_num_elements_in_context_features: The maximum number of elements
allowed in the memory bank
"""
self._session = None
self._num_examples_processed = beam.metrics.Metrics.counter(
'sort_group', 'num_groups_sorted')
self._too_many_elements = beam.metrics.Metrics.counter(
'sort_group', 'too_many_elements')
self._split_elements = beam.metrics.Metrics.counter(
'sort_group', 'split_elements')
self._sequence_key = six.ensure_binary(sequence_key)
self._sorted_image_ids = sorted_image_ids
self._max_num_elements_in_context_features = (
max_num_elements_in_context_features)
def process(self, grouped_entry):
return self._sort_image_examples(grouped_entry)
def _sort_image_examples(self, grouped_entry):
key, example_collection = grouped_entry
example_list = list(example_collection)
def get_frame_num(example):
return example.features.feature['image/seq_frame_num'].int64_list.value[0]
def get_date_captured(example):
return datetime.datetime.strptime(
six.ensure_str(
example.features.feature[
'image/date_captured'].bytes_list.value[0]),
'%Y-%m-%d %H:%M:%S')
def get_image_id(example):
return example.features.feature['image/source_id'].bytes_list.value[0]
if self._sequence_key == six.ensure_binary('image/seq_id'):
sorting_fn = get_frame_num
elif self._sequence_key == six.ensure_binary('image/location'):
if self._sorted_image_ids:
sorting_fn = get_image_id
else:
sorting_fn = get_date_captured
sorted_example_list = sorted(example_list, key=sorting_fn)
self._num_examples_processed.inc(1)
if len(sorted_example_list) > self._max_num_elements_in_context_features:
leftovers = sorted_example_list
output_list = []
count = 0
self._too_many_elements.inc(1)
while len(leftovers) > self._max_num_elements_in_context_features:
self._split_elements.inc(1)
new_key = key + six.ensure_binary('_' + str(count))
new_list = leftovers[:self._max_num_elements_in_context_features]
output_list.append((new_key, new_list))
        leftovers = leftovers[self._max_num_elements_in_context_features:]
count += 1
else:
output_list = [(key, sorted_example_list)]
return output_list
def get_sliding_window(example_list, max_clip_length, stride_length):
"""Yields a sliding window over data from example_list.
  Sliding window has width max_clip_length (n) and stride stride_length (m).
s -> (s0,s1,...s[n-1]), (s[m],s[m+1],...,s[m+n]), ...
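  For example, max_clip_length=2 and stride_length=2 turn
  ['a', 'b', 'c', 'd', 'e'] into ('a', 'b'), ('c', 'd'), ('e',).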
Args:
example_list: A list of examples.
max_clip_length: The maximum length of each clip.
stride_length: The stride between each clip.
Yields:
A list of lists of examples, each with length <= max_clip_length
"""
# check if the list is too short to slide over
if len(example_list) < max_clip_length:
yield example_list
else:
starting_values = [i*stride_length for i in
range(len(example_list)) if
len(example_list) > i*stride_length]
for start in starting_values:
result = tuple(itertools.islice(example_list, start,
min(start + max_clip_length,
len(example_list))))
yield result
class GenerateContextFn(beam.DoFn):
"""Generates context data for camera trap images.
This Beam DoFn builds up contextual memory banks from groups of images and
  stores them in the output tf.Example or tf.SequenceExample for each image.
"""
def __init__(self, sequence_key, add_context_features, image_ids_to_keep,
keep_context_features_image_id_list=False,
subsample_context_features_rate=0,
keep_only_positives=False,
context_features_score_threshold=0.7,
keep_only_positives_gt=False,
max_num_elements_in_context_features=5000,
pad_context_features=False,
output_type='tf_example', max_clip_length=None):
"""Initialization function.
Args:
sequence_key: A feature name to use as a key for grouping sequences.
add_context_features: Whether to keep and store the contextual memory
bank.
image_ids_to_keep: A list of image ids to save, to use to build data
subsets for evaluation.
keep_context_features_image_id_list: Whether to save an ordered list of
the ids of the images in the contextual memory bank.
subsample_context_features_rate: What rate to subsample images for the
contextual memory bank.
keep_only_positives: Whether to only keep high scoring
(>context_features_score_threshold) features in the contextual memory
bank.
context_features_score_threshold: What threshold to use for keeping
features.
keep_only_positives_gt: Whether to only keep features from images that
contain objects based on the ground truth (for training).
max_num_elements_in_context_features: the maximum number of elements in
the memory bank
pad_context_features: Whether to pad the context features to a fixed size.
      output_type: What type of output, tf_example or tf_sequence_example
      max_clip_length: The maximum length of a sequence example, before
        splitting it into multiple shorter examples.
"""
self._session = None
self._num_examples_processed = beam.metrics.Metrics.counter(
'sequence_data_generation', 'num_seq_examples_processed')
self._num_keys_processed = beam.metrics.Metrics.counter(
'sequence_data_generation', 'num_keys_processed')
self._sequence_key = sequence_key
self._add_context_features = add_context_features
self._pad_context_features = pad_context_features
self._output_type = output_type
self._max_clip_length = max_clip_length
if six.ensure_str(image_ids_to_keep) == 'All':
self._image_ids_to_keep = None
else:
with tf.io.gfile.GFile(image_ids_to_keep) as f:
self._image_ids_to_keep = json.load(f)
self._keep_context_features_image_id_list = (
keep_context_features_image_id_list)
self._subsample_context_features_rate = subsample_context_features_rate
self._keep_only_positives = keep_only_positives
self._keep_only_positives_gt = keep_only_positives_gt
self._context_features_score_threshold = context_features_score_threshold
self._max_num_elements_in_context_features = (
max_num_elements_in_context_features)
self._images_kept = beam.metrics.Metrics.counter(
'sequence_data_generation', 'images_kept')
self._images_loaded = beam.metrics.Metrics.counter(
'sequence_data_generation', 'images_loaded')
def process(self, grouped_entry):
return self._add_context_to_example(copy.deepcopy(grouped_entry))
def _build_context_features(self, example_list):
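    """Builds the contextual memory bank for a group of examples.
    Returns a tuple of (context_features, feature_length,
    context_features_image_id_list), where context_features is a flat float
    list of the kept embeddings concatenated in order, feature_length is the
    length of a single embedding, and context_features_image_id_list holds the
    source ids of the kept examples. Each kept example records its row index
    in its 'context_features_idx' feature; skipped examples are assigned an
    out-of-range index.
    """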
context_features = []
context_features_image_id_list = []
count = 0
example_embedding = []
for idx, example in enumerate(example_list):
if self._subsample_context_features_rate > 0:
if (idx % self._subsample_context_features_rate) != 0:
example.features.feature[
'context_features_idx'].int64_list.value.append(
self._max_num_elements_in_context_features + 1)
continue
if self._keep_only_positives:
if example.features.feature[
'image/embedding_score'
].float_list.value[0] < self._context_features_score_threshold:
example.features.feature[
'context_features_idx'].int64_list.value.append(
self._max_num_elements_in_context_features + 1)
continue
if self._keep_only_positives_gt:
if len(example.features.feature[
'image/object/bbox/xmin'
].float_list.value) < 1:
example.features.feature[
'context_features_idx'].int64_list.value.append(
self._max_num_elements_in_context_features + 1)
continue
example_embedding = list(example.features.feature[
'image/embedding'].float_list.value)
context_features.extend(example_embedding)
example.features.feature[
'context_features_idx'].int64_list.value.append(count)
count += 1
example_image_id = example.features.feature[
'image/source_id'].bytes_list.value[0]
context_features_image_id_list.append(example_image_id)
if not example_embedding:
example_embedding.append(np.zeros(DEFAULT_FEATURE_LENGTH))
feature_length = DEFAULT_FEATURE_LENGTH
    # If the example_list is not empty and image/embedding_length is in the
    # feature dict, feature_length will be assigned from it. Otherwise, it
    # will keep the default value.
if example_list and (
'image/embedding_length' in example_list[0].features.feature):
feature_length = example_list[0].features.feature[
'image/embedding_length'].int64_list.value[0]
if self._pad_context_features:
while len(context_features_image_id_list) < (
self._max_num_elements_in_context_features):
context_features_image_id_list.append('')
return context_features, feature_length, context_features_image_id_list
def _add_context_to_example(self, grouped_entry):
key, example_collection = grouped_entry
list_of_examples = []
example_list = list(example_collection)
if self._add_context_features:
context_features, feature_length, context_features_image_id_list = (
self._build_context_features(example_list))
if self._image_ids_to_keep is not None:
new_example_list = []
for example in example_list:
im_id = example.features.feature['image/source_id'].bytes_list.value[0]
self._images_loaded.inc(1)
if six.ensure_str(im_id) in self._image_ids_to_keep:
self._images_kept.inc(1)
new_example_list.append(example)
if new_example_list:
example_list = new_example_list
else:
return []
if self._output_type == 'tf_sequence_example':
if self._max_clip_length is not None:
# For now, no overlap
clips = get_sliding_window(
example_list, self._max_clip_length, self._max_clip_length)
else:
clips = [example_list]
for clip_num, clip_list in enumerate(clips):
# initialize sequence example
seq_example = tf.train.SequenceExample()
        video_id = six.ensure_str(key) + '_' + str(clip_num)
seq_example.context.feature['clip/media_id'].bytes_list.value.append(
video_id.encode('utf8'))
seq_example.context.feature['clip/frames'].int64_list.value.append(
len(clip_list))
seq_example.context.feature[
'clip/start/timestamp'].int64_list.value.append(0)
seq_example.context.feature[
'clip/end/timestamp'].int64_list.value.append(len(clip_list))
seq_example.context.feature['image/format'].bytes_list.value.append(
six.ensure_binary('JPG'))
seq_example.context.feature['image/channels'].int64_list.value.append(3)
context_example = clip_list[0]
seq_example.context.feature['image/height'].int64_list.value.append(
context_example.features.feature[
'image/height'].int64_list.value[0])
seq_example.context.feature['image/width'].int64_list.value.append(
context_example.features.feature['image/width'].int64_list.value[0])
seq_example.context.feature[
'image/context_feature_length'].int64_list.value.append(
feature_length)
seq_example.context.feature[
'image/context_features'].float_list.value.extend(
context_features)
if self._keep_context_features_image_id_list:
seq_example.context.feature[
'image/context_features_image_id_list'].bytes_list.value.extend(
context_features_image_id_list)
encoded_image_list = seq_example.feature_lists.feature_list[
'image/encoded']
timestamps_list = seq_example.feature_lists.feature_list[
'image/timestamp']
context_features_idx_list = seq_example.feature_lists.feature_list[
'image/context_features_idx']
date_captured_list = seq_example.feature_lists.feature_list[
'image/date_captured']
unix_time_list = seq_example.feature_lists.feature_list[
'image/unix_time']
location_list = seq_example.feature_lists.feature_list['image/location']
image_ids_list = seq_example.feature_lists.feature_list[
'image/source_id']
gt_xmin_list = seq_example.feature_lists.feature_list[
'region/bbox/xmin']
gt_xmax_list = seq_example.feature_lists.feature_list[
'region/bbox/xmax']
gt_ymin_list = seq_example.feature_lists.feature_list[
'region/bbox/ymin']
gt_ymax_list = seq_example.feature_lists.feature_list[
'region/bbox/ymax']
gt_type_list = seq_example.feature_lists.feature_list[
'region/label/index']
gt_type_string_list = seq_example.feature_lists.feature_list[
'region/label/string']
gt_is_annotated_list = seq_example.feature_lists.feature_list[
'region/is_annotated']
for idx, example in enumerate(clip_list):
encoded_image = encoded_image_list.feature.add()
encoded_image.bytes_list.value.extend(
example.features.feature['image/encoded'].bytes_list.value)
image_id = image_ids_list.feature.add()
image_id.bytes_list.value.append(
example.features.feature['image/source_id'].bytes_list.value[0])
timestamp = timestamps_list.feature.add()
        # The timestamp is currently the example's index in the clip list.
timestamp.int64_list.value.extend([idx])
context_features_idx = context_features_idx_list.feature.add()
context_features_idx.int64_list.value.extend(
example.features.feature['context_features_idx'].int64_list.value)
date_captured = date_captured_list.feature.add()
date_captured.bytes_list.value.extend(
example.features.feature['image/date_captured'].bytes_list.value)
unix_time = unix_time_list.feature.add()
unix_time.float_list.value.extend(
example.features.feature['image/unix_time'].float_list.value)
location = location_list.feature.add()
location.bytes_list.value.extend(
example.features.feature['image/location'].bytes_list.value)
gt_xmin = gt_xmin_list.feature.add()
gt_xmax = gt_xmax_list.feature.add()
gt_ymin = gt_ymin_list.feature.add()
gt_ymax = gt_ymax_list.feature.add()
gt_type = gt_type_list.feature.add()
gt_type_str = gt_type_string_list.feature.add()
gt_is_annotated = gt_is_annotated_list.feature.add()
gt_is_annotated.int64_list.value.append(1)
gt_xmin.float_list.value.extend(
example.features.feature[
'image/object/bbox/xmin'].float_list.value)
gt_xmax.float_list.value.extend(
example.features.feature[
'image/object/bbox/xmax'].float_list.value)
gt_ymin.float_list.value.extend(
example.features.feature[
'image/object/bbox/ymin'].float_list.value)
gt_ymax.float_list.value.extend(
example.features.feature[
'image/object/bbox/ymax'].float_list.value)
gt_type.int64_list.value.extend(
example.features.feature[
'image/object/class/label'].int64_list.value)
gt_type_str.bytes_list.value.extend(
example.features.feature[
'image/object/class/text'].bytes_list.value)
self._num_examples_processed.inc(1)
list_of_examples.append(seq_example)
elif self._output_type == 'tf_example':
for example in example_list:
im_id = example.features.feature['image/source_id'].bytes_list.value[0]
if self._add_context_features:
example.features.feature[
'image/context_features'].float_list.value.extend(
context_features)
example.features.feature[
'image/context_feature_length'].int64_list.value.append(
feature_length)
if self._keep_context_features_image_id_list:
example.features.feature[
'image/context_features_image_id_list'].bytes_list.value.extend(
context_features_image_id_list)
self._num_examples_processed.inc(1)
list_of_examples.append(example)
return list_of_examples
def construct_pipeline(input_tfrecord,
output_tfrecord,
sequence_key,
time_horizon=None,
subsample_context_features_rate=0,
reduce_image_size=True,
max_image_dimension=1024,
add_context_features=True,
sorted_image_ids=True,
image_ids_to_keep='All',
keep_context_features_image_id_list=False,
keep_only_positives=False,
context_features_score_threshold=0.7,
keep_only_positives_gt=False,
max_num_elements_in_context_features=5000,
num_shards=0,
output_type='tf_example',
max_clip_length=None):
"""Returns a beam pipeline to run object detection inference.
Args:
input_tfrecord: An TFRecord of tf.train.Example protos containing images.
output_tfrecord: An TFRecord of tf.train.Example protos that contain images
in the input TFRecord and the detections from the model.
sequence_key: A feature name to use as a key for grouping sequences.
time_horizon: What length of time to use to partition the data when building
the memory banks. Options: `year`, `month`, `week`, `day `, `hour`,
`minute`, None.
subsample_context_features_rate: What rate to subsample images for the
contextual memory bank.
reduce_image_size: Whether to reduce the size of the stored images.
max_image_dimension: The maximum image dimension to use for resizing.
add_context_features: Whether to keep and store the contextual memory bank.
sorted_image_ids: Whether the image ids are sortable, and can be used as
datetime tie-breakers when building memory banks.
image_ids_to_keep: A list of image ids to save, to use to build data subsets
for evaluation.
keep_context_features_image_id_list: Whether to save an ordered list of the
ids of the images in the contextual memory bank.
keep_only_positives: Whether to only keep high scoring
(>context_features_score_threshold) features in the contextual memory
bank.
context_features_score_threshold: What threshold to use for keeping
features.
keep_only_positives_gt: Whether to only keep features from images that
contain objects based on the ground truth (for training).
max_num_elements_in_context_features: the maximum number of elements in the
memory bank
num_shards: The number of output shards.
    output_type: What type of output, tf_example or tf_sequence_example
    max_clip_length: The maximum length of a sequence example, before
      splitting it into multiple shorter examples.
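  Returns:
    A function that takes a Beam pipeline root and adds the pipeline stages.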
"""
def pipeline(root):
if output_type == 'tf_example':
coder = beam.coders.ProtoCoder(tf.train.Example)
elif output_type == 'tf_sequence_example':
coder = beam.coders.ProtoCoder(tf.train.SequenceExample)
else:
raise ValueError('Unsupported output type.')
input_collection = (
root | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
input_tfrecord,
coder=beam.coders.BytesCoder()))
rekey_collection = input_collection | 'RekeyExamples' >> beam.ParDo(
ReKeyDataFn(sequence_key, time_horizon,
reduce_image_size, max_image_dimension))
grouped_collection = (
rekey_collection | 'GroupBySequenceKey' >> beam.GroupByKey())
grouped_collection = (
grouped_collection | 'ReshuffleGroups' >> beam.Reshuffle())
ordered_collection = (
grouped_collection | 'OrderByFrameNumber' >> beam.ParDo(
SortGroupedDataFn(sequence_key, sorted_image_ids,
max_num_elements_in_context_features)))
ordered_collection = (
ordered_collection | 'ReshuffleSortedGroups' >> beam.Reshuffle())
output_collection = (
ordered_collection | 'AddContextToExamples' >> beam.ParDo(
GenerateContextFn(
sequence_key, add_context_features, image_ids_to_keep,
keep_context_features_image_id_list=(
keep_context_features_image_id_list),
subsample_context_features_rate=subsample_context_features_rate,
keep_only_positives=keep_only_positives,
keep_only_positives_gt=keep_only_positives_gt,
context_features_score_threshold=(
context_features_score_threshold),
max_num_elements_in_context_features=(
max_num_elements_in_context_features),
output_type=output_type,
max_clip_length=max_clip_length)))
output_collection = (
output_collection | 'ReshuffleExamples' >> beam.Reshuffle())
_ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
output_tfrecord,
num_shards=num_shards,
coder=coder)
return pipeline
def main(_):
"""Runs the Beam pipeline that builds context features.
Args:
_: unused
"""
# must create before flags are used
runner = runners.DirectRunner()
dirname = os.path.dirname(FLAGS.output_tfrecord)
tf.io.gfile.makedirs(dirname)
runner.run(
construct_pipeline(FLAGS.input_tfrecord,
FLAGS.output_tfrecord,
FLAGS.sequence_key,
FLAGS.time_horizon,
FLAGS.subsample_context_features_rate,
FLAGS.reduce_image_size,
FLAGS.max_image_dimension,
FLAGS.add_context_features,
FLAGS.sorted_image_ids,
FLAGS.image_ids_to_keep,
FLAGS.keep_context_features_image_id_list,
FLAGS.keep_only_positives,
FLAGS.context_features_score_threshold,
FLAGS.keep_only_positives_gt,
FLAGS.max_num_elements_in_context_features,
FLAGS.num_shards,
FLAGS.output_type,
FLAGS.max_clip_length))
if __name__ == '__main__':
flags.mark_flags_as_required([
'input_tfrecord',
'output_tfrecord'
])
app.run(main)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for add_context_to_examples."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import datetime
import os
import tempfile
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
from object_detection.dataset_tools.context_rcnn import add_context_to_examples
from object_detection.utils import tf_version
from apache_beam import runners
@contextlib.contextmanager
def InMemoryTFRecord(entries):
temp = tempfile.NamedTemporaryFile(delete=False)
filename = temp.name
try:
with tf.python_io.TFRecordWriter(filename) as writer:
for value in entries:
writer.write(value)
yield filename
finally:
os.unlink(temp.name)
def BytesFeature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def BytesListFeature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
def Int64Feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def Int64ListFeature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def FloatListFeature(value):
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class GenerateContextDataTest(tf.test.TestCase):
def _create_first_tf_example(self):
with self.test_session():
encoded_image = tf.image.encode_jpeg(
tf.constant(np.ones((4, 4, 3)).astype(np.uint8))).eval()
example = tf.train.Example(features=tf.train.Features(feature={
'image/encoded': BytesFeature(encoded_image),
'image/source_id': BytesFeature(six.ensure_binary('image_id_1')),
'image/height': Int64Feature(4),
'image/width': Int64Feature(4),
'image/object/class/label': Int64ListFeature([5, 5]),
'image/object/class/text': BytesListFeature([six.ensure_binary('hyena'),
six.ensure_binary('hyena')
]),
'image/object/bbox/xmin': FloatListFeature([0.0, 0.1]),
'image/object/bbox/xmax': FloatListFeature([0.2, 0.3]),
'image/object/bbox/ymin': FloatListFeature([0.4, 0.5]),
'image/object/bbox/ymax': FloatListFeature([0.6, 0.7]),
'image/seq_id': BytesFeature(six.ensure_binary('01')),
'image/seq_num_frames': Int64Feature(2),
'image/seq_frame_num': Int64Feature(0),
'image/date_captured': BytesFeature(
six.ensure_binary(str(datetime.datetime(2020, 1, 1, 1, 0, 0)))),
'image/embedding': FloatListFeature([0.1, 0.2, 0.3]),
'image/embedding_score': FloatListFeature([0.9]),
'image/embedding_length': Int64Feature(3)
}))
return example.SerializeToString()
def _create_second_tf_example(self):
with self.test_session():
encoded_image = tf.image.encode_jpeg(
tf.constant(np.ones((4, 4, 3)).astype(np.uint8))).eval()
example = tf.train.Example(features=tf.train.Features(feature={
'image/encoded': BytesFeature(encoded_image),
'image/source_id': BytesFeature(six.ensure_binary('image_id_2')),
'image/height': Int64Feature(4),
'image/width': Int64Feature(4),
'image/object/class/label': Int64ListFeature([5]),
'image/object/class/text': BytesListFeature([six.ensure_binary('hyena')
]),
'image/object/bbox/xmin': FloatListFeature([0.0]),
'image/object/bbox/xmax': FloatListFeature([0.1]),
'image/object/bbox/ymin': FloatListFeature([0.2]),
'image/object/bbox/ymax': FloatListFeature([0.3]),
'image/seq_id': BytesFeature(six.ensure_binary('01')),
'image/seq_num_frames': Int64Feature(2),
'image/seq_frame_num': Int64Feature(1),
'image/date_captured': BytesFeature(
six.ensure_binary(str(datetime.datetime(2020, 1, 1, 1, 1, 0)))),
'image/embedding': FloatListFeature([0.4, 0.5, 0.6]),
'image/embedding_score': FloatListFeature([0.9]),
'image/embedding_length': Int64Feature(3)
}))
return example.SerializeToString()
def assert_expected_examples(self, tf_example_list):
self.assertAllEqual(
{tf_example.features.feature['image/source_id'].bytes_list.value[0]
for tf_example in tf_example_list},
{six.ensure_binary('image_id_1'), six.ensure_binary('image_id_2')})
self.assertAllClose(
tf_example_list[0].features.feature[
'image/context_features'].float_list.value,
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
self.assertAllClose(
tf_example_list[1].features.feature[
'image/context_features'].float_list.value,
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
def assert_expected_sequence_example(self, tf_sequence_example_list):
tf_sequence_example = tf_sequence_example_list[0]
num_frames = 2
self.assertAllEqual(
tf_sequence_example.context.feature[
'clip/media_id'].bytes_list.value[0], six.ensure_binary(
'01_0'))
self.assertAllClose(
tf_sequence_example.context.feature[
'image/context_features'].float_list.value,
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
seq_feature_dict = tf_sequence_example.feature_lists.feature_list
self.assertLen(
seq_feature_dict['image/encoded'].feature[:],
num_frames)
actual_timestamps = [
feature.int64_list.value[0] for feature
in seq_feature_dict['image/timestamp'].feature]
timestamps = [0, 1]
self.assertAllEqual(timestamps, actual_timestamps)
# First image.
self.assertAllClose(
[0.4, 0.5],
seq_feature_dict['region/bbox/ymin'].feature[0].float_list.value[:])
self.assertAllClose(
[0.0, 0.1],
seq_feature_dict['region/bbox/xmin'].feature[0].float_list.value[:])
self.assertAllClose(
[0.6, 0.7],
seq_feature_dict['region/bbox/ymax'].feature[0].float_list.value[:])
self.assertAllClose(
[0.2, 0.3],
seq_feature_dict['region/bbox/xmax'].feature[0].float_list.value[:])
self.assertAllEqual(
[six.ensure_binary('hyena'), six.ensure_binary('hyena')],
seq_feature_dict['region/label/string'].feature[0].bytes_list.value[:])
# Second example.
self.assertAllClose(
[0.2],
seq_feature_dict['region/bbox/ymin'].feature[1].float_list.value[:])
self.assertAllClose(
[0.0],
seq_feature_dict['region/bbox/xmin'].feature[1].float_list.value[:])
self.assertAllClose(
[0.3],
seq_feature_dict['region/bbox/ymax'].feature[1].float_list.value[:])
self.assertAllClose(
[0.1],
seq_feature_dict['region/bbox/xmax'].feature[1].float_list.value[:])
self.assertAllEqual(
[six.ensure_binary('hyena')],
seq_feature_dict['region/label/string'].feature[1].bytes_list.value[:])
def assert_expected_key(self, key):
self.assertAllEqual(key, '01')
def assert_sorted(self, example_collection):
example_list = list(example_collection)
counter = 0
for example in example_list:
frame_num = example.features.feature[
'image/seq_frame_num'].int64_list.value[0]
self.assertGreaterEqual(frame_num, counter)
counter = frame_num
def assert_context(self, example_collection):
example_list = list(example_collection)
for example in example_list:
context = example.features.feature[
'image/context_features'].float_list.value
self.assertAllClose([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], context)
def assert_resized(self, example):
width = example.features.feature['image/width'].int64_list.value[0]
self.assertAllEqual(width, 2)
height = example.features.feature['image/height'].int64_list.value[0]
self.assertAllEqual(height, 2)
def assert_size(self, example):
width = example.features.feature['image/width'].int64_list.value[0]
self.assertAllEqual(width, 4)
height = example.features.feature['image/height'].int64_list.value[0]
self.assertAllEqual(height, 4)
def test_sliding_window(self):
example_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
max_clip_length = 3
stride_length = 3
out_list = [list(i) for i in add_context_to_examples.get_sliding_window(
example_list, max_clip_length, stride_length)]
self.assertAllEqual(out_list, [['a', 'b', 'c'],
['d', 'e', 'f'],
['g']])
def test_rekey_data_fn(self):
sequence_key = 'image/seq_id'
time_horizon = None
reduce_image_size = False
max_dim = None
rekey_fn = add_context_to_examples.ReKeyDataFn(
sequence_key, time_horizon,
reduce_image_size, max_dim)
output = rekey_fn.process(self._create_first_tf_example())
self.assert_expected_key(output[0][0])
self.assert_size(output[0][1])
def test_rekey_data_fn_w_resize(self):
sequence_key = 'image/seq_id'
time_horizon = None
reduce_image_size = True
max_dim = 2
rekey_fn = add_context_to_examples.ReKeyDataFn(
sequence_key, time_horizon,
reduce_image_size, max_dim)
output = rekey_fn.process(self._create_first_tf_example())
self.assert_expected_key(output[0][0])
self.assert_resized(output[0][1])
def test_sort_fn(self):
sequence_key = 'image/seq_id'
sorted_image_ids = False
max_num_elements_in_context_features = 10
sort_fn = add_context_to_examples.SortGroupedDataFn(
sequence_key, sorted_image_ids, max_num_elements_in_context_features)
output = sort_fn.process(
('dummy_key', [tf.train.Example.FromString(
self._create_second_tf_example()),
tf.train.Example.FromString(
self._create_first_tf_example())]))
self.assert_sorted(output[0][1])
def test_add_context_fn(self):
sequence_key = 'image/seq_id'
add_context_features = True
image_ids_to_keep = 'All'
context_fn = add_context_to_examples.GenerateContextFn(
sequence_key, add_context_features, image_ids_to_keep)
output = context_fn.process(
('dummy_key', [tf.train.Example.FromString(
self._create_first_tf_example()),
tf.train.Example.FromString(
self._create_second_tf_example())]))
self.assertEqual(len(output), 2)
self.assert_context(output)
def test_add_context_fn_output_sequence_example(self):
sequence_key = 'image/seq_id'
add_context_features = True
image_ids_to_keep = 'All'
context_fn = add_context_to_examples.GenerateContextFn(
sequence_key, add_context_features, image_ids_to_keep,
output_type='tf_sequence_example')
output = context_fn.process(
('01',
[tf.train.Example.FromString(self._create_first_tf_example()),
tf.train.Example.FromString(self._create_second_tf_example())]))
self.assertEqual(len(output), 1)
self.assert_expected_sequence_example(output)
def test_add_context_fn_output_sequence_example_cliplen(self):
sequence_key = 'image/seq_id'
add_context_features = True
image_ids_to_keep = 'All'
context_fn = add_context_to_examples.GenerateContextFn(
sequence_key, add_context_features, image_ids_to_keep,
output_type='tf_sequence_example', max_clip_length=1)
output = context_fn.process(
('01',
[tf.train.Example.FromString(self._create_first_tf_example()),
tf.train.Example.FromString(self._create_second_tf_example())]))
self.assertEqual(len(output), 2)
def test_beam_pipeline(self):
with InMemoryTFRecord(
[self._create_first_tf_example(),
self._create_second_tf_example()]) as input_tfrecord:
runner = runners.DirectRunner()
temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
sequence_key = six.ensure_binary('image/seq_id')
max_num_elements = 10
num_shards = 1
pipeline = add_context_to_examples.construct_pipeline(
input_tfrecord,
output_tfrecord,
sequence_key,
max_num_elements_in_context_features=max_num_elements,
num_shards=num_shards)
runner.run(pipeline)
filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
actual_output = []
record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
for record in record_iterator:
actual_output.append(record)
self.assertEqual(len(actual_output), 2)
self.assert_expected_examples([tf.train.Example.FromString(
tf_example) for tf_example in actual_output])
def test_beam_pipeline_sequence_example(self):
with InMemoryTFRecord(
[self._create_first_tf_example(),
self._create_second_tf_example()]) as input_tfrecord:
runner = runners.DirectRunner()
temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
sequence_key = six.ensure_binary('image/seq_id')
max_num_elements = 10
num_shards = 1
pipeline = add_context_to_examples.construct_pipeline(
input_tfrecord,
output_tfrecord,
sequence_key,
max_num_elements_in_context_features=max_num_elements,
num_shards=num_shards,
output_type='tf_sequence_example')
runner.run(pipeline)
filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
actual_output = []
record_iterator = tf.python_io.tf_record_iterator(
path=filenames[0])
for record in record_iterator:
actual_output.append(record)
self.assertEqual(len(actual_output), 1)
self.assert_expected_sequence_example(
[tf.train.SequenceExample.FromString(
tf_example) for tf_example in actual_output])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Beam pipeline to create COCO Camera Traps Object Detection TFRecords.
Please note that this tool creates sharded output files.
This tool assumes the input annotations are in the COCO Camera Traps json
format, specified here:
https://github.com/Microsoft/CameraTraps/blob/master/data_management/README.md
Example usage:
python create_cococameratraps_tfexample_main.py \
--alsologtostderr \
--output_tfrecord_prefix="/path/to/output/tfrecord/location/prefix" \
--image_directory="/path/to/image/folder/" \
--input_annotations_file="path/to/annotations.json"
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import hashlib
import io
import json
import logging
import os
from absl import app
from absl import flags
import apache_beam as beam
import numpy as np
import PIL.Image
import tensorflow.compat.v1 as tf
from apache_beam import runners
from object_detection.utils import dataset_util
flags.DEFINE_string('image_directory', None, 'Directory where images are '
'stored')
flags.DEFINE_string('output_tfrecord_prefix', None,
'TFRecord containing images in tf.Example format.')
flags.DEFINE_string('input_annotations_file', None, 'Path to Coco-CameraTraps '
                    'style annotations file')
flags.DEFINE_integer('num_images_per_shard',
200,
'The number of images to be stored in each shard.')
FLAGS = flags.FLAGS
class ParseImage(beam.DoFn):
"""A DoFn that parses a COCO-CameraTraps json and emits TFRecords."""
def __init__(self, image_directory, images, annotations, categories,
keep_bboxes):
"""Initialization function.
Args:
image_directory: Path to image directory
images: list of COCO Camera Traps style image dictionaries
annotations: list of COCO Camera Traps style annotation dictionaries
categories: list of COCO Camera Traps style category dictionaries
keep_bboxes: Whether to keep any bounding boxes that exist in the
annotations
"""
self._image_directory = image_directory
self._image_dict = {im['id']: im for im in images}
self._annotation_dict = {im['id']: [] for im in images}
self._category_dict = {int(cat['id']): cat for cat in categories}
for ann in annotations:
self._annotation_dict[ann['image_id']].append(ann)
self._images = images
self._keep_bboxes = keep_bboxes
self._num_examples_processed = beam.metrics.Metrics.counter(
'cococameratraps_data_generation', 'num_tf_examples_processed')
def process(self, image_id):
"""Builds a tf.Example given an image id.
Args:
image_id: the image id of the associated image
Returns:
List of tf.Examples.
"""
image = self._image_dict[image_id]
annotations = self._annotation_dict[image_id]
image_height = image['height']
image_width = image['width']
filename = image['file_name']
image_id = image['id']
image_location_id = image['location']
image_datetime = str(image['date_captured'])
image_sequence_id = str(image['seq_id'])
image_sequence_num_frames = int(image['seq_num_frames'])
image_sequence_frame_num = int(image['frame_num'])
full_path = os.path.join(self._image_directory, filename)
try:
# Ensure the image exists and is not corrupted
with tf.io.gfile.GFile(full_path, 'rb') as fid:
encoded_jpg = fid.read()
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = PIL.Image.open(encoded_jpg_io)
# Ensure the image can be read by tf
with tf.Graph().as_default():
image = tf.image.decode_jpeg(encoded_jpg, channels=3)
init_op = tf.initialize_all_tables()
with tf.Session() as sess:
sess.run(init_op)
sess.run(image)
except Exception as e: # pylint: disable=broad-except
# The image file is missing or corrupt
tf.logging.error(str(e))
return []
key = hashlib.sha256(encoded_jpg).hexdigest()
feature_dict = {
'image/height':
dataset_util.int64_feature(image_height),
'image/width':
dataset_util.int64_feature(image_width),
'image/filename':
dataset_util.bytes_feature(filename.encode('utf8')),
'image/source_id':
dataset_util.bytes_feature(str(image_id).encode('utf8')),
'image/key/sha256':
dataset_util.bytes_feature(key.encode('utf8')),
'image/encoded':
dataset_util.bytes_feature(encoded_jpg),
'image/format':
dataset_util.bytes_feature('jpeg'.encode('utf8')),
'image/location':
dataset_util.bytes_feature(str(image_location_id).encode('utf8')),
'image/seq_num_frames':
dataset_util.int64_feature(image_sequence_num_frames),
'image/seq_frame_num':
dataset_util.int64_feature(image_sequence_frame_num),
'image/seq_id':
dataset_util.bytes_feature(image_sequence_id.encode('utf8')),
'image/date_captured':
dataset_util.bytes_feature(image_datetime.encode('utf8'))
}
num_annotations_skipped = 0
if annotations:
xmin = []
xmax = []
ymin = []
ymax = []
category_names = []
category_ids = []
area = []
for object_annotations in annotations:
if 'bbox' in object_annotations and self._keep_bboxes:
(x, y, width, height) = tuple(object_annotations['bbox'])
if width <= 0 or height <= 0:
num_annotations_skipped += 1
continue
if x + width > image_width or y + height > image_height:
num_annotations_skipped += 1
continue
xmin.append(float(x) / image_width)
xmax.append(float(x + width) / image_width)
ymin.append(float(y) / image_height)
ymax.append(float(y + height) / image_height)
if 'area' in object_annotations:
area.append(object_annotations['area'])
else:
# approximate area using l*w/2
area.append(width*height/2.0)
category_id = int(object_annotations['category_id'])
category_ids.append(category_id)
category_names.append(
self._category_dict[category_id]['name'].encode('utf8'))
feature_dict.update({
'image/object/bbox/xmin':
dataset_util.float_list_feature(xmin),
'image/object/bbox/xmax':
dataset_util.float_list_feature(xmax),
'image/object/bbox/ymin':
dataset_util.float_list_feature(ymin),
'image/object/bbox/ymax':
dataset_util.float_list_feature(ymax),
'image/object/class/text':
dataset_util.bytes_list_feature(category_names),
'image/object/class/label':
dataset_util.int64_list_feature(category_ids),
'image/object/area':
dataset_util.float_list_feature(area),
})
# For classification, add the first category to image/class/label and
# image/class/text
if not category_ids:
feature_dict.update({
'image/class/label':
dataset_util.int64_list_feature([0]),
'image/class/text':
dataset_util.bytes_list_feature(['empty'.encode('utf8')]),
})
else:
feature_dict.update({
'image/class/label':
dataset_util.int64_list_feature([category_ids[0]]),
'image/class/text':
dataset_util.bytes_list_feature([category_names[0]]),
})
else:
# Add empty class if there are no annotations
feature_dict.update({
'image/class/label':
dataset_util.int64_list_feature([0]),
'image/class/text':
dataset_util.bytes_list_feature(['empty'.encode('utf8')]),
})
example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
self._num_examples_processed.inc(1)
    return [example]
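# Illustrative sketch (paths and variables here are placeholders, not part of
# this tool): the DoFn can also be invoked directly, outside of a Beam
# pipeline, to build the tf.Examples for a single image id:
#
#   parser = ParseImage('/path/to/images', data['images'], data['annotations'],
#                       data['categories'], keep_bboxes=True)
#   examples = parser.process(data['images'][0]['id'])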
def _load_json_data(data_file):
with tf.io.gfile.GFile(data_file, 'r') as fid:
data_dict = json.load(fid)
return data_dict
def create_pipeline(image_directory,
input_annotations_file,
output_tfrecord_prefix=None,
num_images_per_shard=200,
keep_bboxes=True):
"""Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.
Args:
image_directory: Path to image directory
input_annotations_file: Path to a coco-cameratraps annotation file
output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files will
be named {output_tfrecord_prefix}@N.
num_images_per_shard: The number of images to store in each shard
keep_bboxes: Whether to keep any bounding boxes that exist in the json file
Returns:
A Beam pipeline.
"""
logging.info('Reading data from COCO-CameraTraps Dataset.')
data = _load_json_data(input_annotations_file)
num_shards = int(np.ceil(float(len(data['images']))/num_images_per_shard))
def pipeline(root):
"""Builds beam pipeline."""
image_examples = (
root
| ('CreateCollections') >> beam.Create(
[im['id'] for im in data['images']])
| ('ParseImage') >> beam.ParDo(ParseImage(
image_directory, data['images'], data['annotations'],
data['categories'], keep_bboxes=keep_bboxes)))
_ = (image_examples
| ('Reshuffle') >> beam.Reshuffle()
| ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
output_tfrecord_prefix,
num_shards=num_shards,
coder=beam.coders.ProtoCoder(tf.train.Example)))
return pipeline
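# Note: with the default shard naming of beam.io.tfrecordio.WriteToTFRecord
# used above, the output files are written as
# {output_tfrecord_prefix}-XXXXX-of-XXXXX, which is the pattern the unit tests
# glob for.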
def main(_):
"""Runs the Beam pipeline that performs inference.
Args:
_: unused
"""
# must create before flags are used
runner = runners.DirectRunner()
dirname = os.path.dirname(FLAGS.output_tfrecord_prefix)
tf.io.gfile.makedirs(dirname)
runner.run(
create_pipeline(
image_directory=FLAGS.image_directory,
input_annotations_file=FLAGS.input_annotations_file,
output_tfrecord_prefix=FLAGS.output_tfrecord_prefix,
num_images_per_shard=FLAGS.num_images_per_shard))
if __name__ == '__main__':
flags.mark_flags_as_required([
'image_directory',
'input_annotations_file',
'output_tfrecord_prefix'
])
app.run(main)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for create_cococameratraps_tfexample_main."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import json
import os
import tempfile
import unittest
import numpy as np
from PIL import Image
import tensorflow.compat.v1 as tf
from object_detection.dataset_tools.context_rcnn import create_cococameratraps_tfexample_main
from object_detection.utils import tf_version
from apache_beam import runners
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class CreateCOCOCameraTrapsTfexampleTest(tf.test.TestCase):
IMAGE_HEIGHT = 360
IMAGE_WIDTH = 480
def _write_random_images_to_directory(self, directory, num_frames):
for frame_num in range(num_frames):
img = np.random.randint(0, high=256,
size=(self.IMAGE_HEIGHT, self.IMAGE_WIDTH, 3),
dtype=np.uint8)
pil_image = Image.fromarray(img)
fname = 'im_' + str(frame_num) + '.jpg'
pil_image.save(os.path.join(directory, fname), 'JPEG')
def _create_json_file(self, directory, num_frames, keep_bboxes=False):
json_dict = {'images': [], 'annotations': []}
json_dict['categories'] = [{'id': 0, 'name': 'empty'},
{'id': 1, 'name': 'animal'}]
for idx in range(num_frames):
im = {'id': 'im_' + str(idx),
'file_name': 'im_' + str(idx) + '.jpg',
'height': self.IMAGE_HEIGHT,
'width': self.IMAGE_WIDTH,
'seq_id': 'seq_1',
'seq_num_frames': num_frames,
'frame_num': idx,
'location': 'loc_' + str(idx),
'date_captured': str(datetime.datetime.now())
}
json_dict['images'].append(im)
ann = {'id': 'ann' + str(idx),
'image_id': 'im_' + str(idx),
'category_id': 1,
}
if keep_bboxes:
ann['bbox'] = [0.0 * self.IMAGE_WIDTH,
0.1 * self.IMAGE_HEIGHT,
0.5 * self.IMAGE_WIDTH,
0.5 * self.IMAGE_HEIGHT]
json_dict['annotations'].append(ann)
json_path = os.path.join(directory, 'test_file.json')
with tf.io.gfile.GFile(json_path, 'w') as f:
json.dump(json_dict, f)
return json_path
def assert_expected_example_bbox(self, example):
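    # The json bbox above is [0, 0.1*H, 0.5*W, 0.5*H] in [x, y, width, height]
    # pixels, so after normalization by the image size we expect
    # xmin=0.0, ymin=0.1, xmax=0.5 and ymax=0.6.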
self.assertAllClose(
example.features.feature['image/object/bbox/ymin'].float_list.value,
[0.1])
self.assertAllClose(
example.features.feature['image/object/bbox/xmin'].float_list.value,
[0.0])
self.assertAllClose(
example.features.feature['image/object/bbox/ymax'].float_list.value,
[0.6])
self.assertAllClose(
example.features.feature['image/object/bbox/xmax'].float_list.value,
[0.5])
self.assertAllClose(
example.features.feature['image/object/class/label']
.int64_list.value, [1])
self.assertAllEqual(
example.features.feature['image/object/class/text']
.bytes_list.value, ['animal'])
self.assertAllClose(
example.features.feature['image/class/label']
.int64_list.value, [1])
self.assertAllEqual(
example.features.feature['image/class/text']
.bytes_list.value, ['animal'])
# Check other essential attributes.
self.assertAllEqual(
example.features.feature['image/height'].int64_list.value,
[self.IMAGE_HEIGHT])
self.assertAllEqual(
example.features.feature['image/width'].int64_list.value,
[self.IMAGE_WIDTH])
self.assertAllEqual(
example.features.feature['image/source_id'].bytes_list.value,
['im_0'])
self.assertTrue(
example.features.feature['image/encoded'].bytes_list.value)
def assert_expected_example(self, example):
self.assertAllClose(
example.features.feature['image/object/bbox/ymin'].float_list.value,
[])
self.assertAllClose(
example.features.feature['image/object/bbox/xmin'].float_list.value,
[])
self.assertAllClose(
example.features.feature['image/object/bbox/ymax'].float_list.value,
[])
self.assertAllClose(
example.features.feature['image/object/bbox/xmax'].float_list.value,
[])
self.assertAllClose(
example.features.feature['image/object/class/label']
.int64_list.value, [1])
self.assertAllEqual(
example.features.feature['image/object/class/text']
.bytes_list.value, ['animal'])
self.assertAllClose(
example.features.feature['image/class/label']
.int64_list.value, [1])
self.assertAllEqual(
example.features.feature['image/class/text']
.bytes_list.value, ['animal'])
# Check other essential attributes.
self.assertAllEqual(
example.features.feature['image/height'].int64_list.value,
[self.IMAGE_HEIGHT])
self.assertAllEqual(
example.features.feature['image/width'].int64_list.value,
[self.IMAGE_WIDTH])
self.assertAllEqual(
example.features.feature['image/source_id'].bytes_list.value,
['im_0'])
self.assertTrue(
example.features.feature['image/encoded'].bytes_list.value)
def test_beam_pipeline(self):
runner = runners.DirectRunner()
num_frames = 1
temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
json_path = self._create_json_file(temp_dir, num_frames)
output_tfrecord = temp_dir+'/output'
self._write_random_images_to_directory(temp_dir, num_frames)
pipeline = create_cococameratraps_tfexample_main.create_pipeline(
temp_dir, json_path,
output_tfrecord_prefix=output_tfrecord)
runner.run(pipeline)
filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
actual_output = []
record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
for record in record_iterator:
actual_output.append(record)
self.assertEqual(len(actual_output), num_frames)
self.assert_expected_example(tf.train.Example.FromString(
actual_output[0]))
def test_beam_pipeline_bbox(self):
runner = runners.DirectRunner()
num_frames = 1
temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
json_path = self._create_json_file(temp_dir, num_frames, keep_bboxes=True)
output_tfrecord = temp_dir+'/output'
self._write_random_images_to_directory(temp_dir, num_frames)
pipeline = create_cococameratraps_tfexample_main.create_pipeline(
temp_dir, json_path,
output_tfrecord_prefix=output_tfrecord,
keep_bboxes=True)
runner.run(pipeline)
filenames = tf.io.gfile.glob(output_tfrecord+'-?????-of-?????')
actual_output = []
record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
for record in record_iterator:
actual_output.append(record)
self.assertEqual(len(actual_output), num_frames)
self.assert_expected_example_bbox(tf.train.Example.FromString(
actual_output[0]))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A Beam job to generate detection data for camera trap images.
This tool runs inference with an exported Object Detection model in
`saved_model` format and produces raw detection boxes on camera trap images in
tf.Examples, with the assumption that the bounding box class label will match
the image-level class label in the tf.Example.
Steps to generate a detection dataset:
1. Use object_detection/export_inference_graph.py to get a `saved_model` for
inference. The input node must accept a tf.Example proto.
2. Run this tool with `saved_model` from step 1 and a TFRecord of tf.Example
protos containing images for inference.
Example Usage:
--------------
python tensorflow_models/object_detection/export_inference_graph.py \
--alsologtostderr \
--input_type tf_example \
--pipeline_config_path path/to/detection_model.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory
python generate_detection_data.py \
--alsologtostderr \
--input_tfrecord path/to/input_tfrecord@X \
--output_tfrecord path/to/output_tfrecord@X \
--model_dir path/to/exported_model_directory/saved_model
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import threading
from absl import app
from absl import flags
import apache_beam as beam
import tensorflow.compat.v1 as tf
from apache_beam import runners
flags.DEFINE_string('detection_input_tfrecord', None, 'TFRecord containing '
'images in tf.Example format for object detection.')
flags.DEFINE_string('detection_output_tfrecord', None,
'TFRecord containing detections in tf.Example format.')
flags.DEFINE_string('detection_model_dir', None,
                    'Path to directory containing an object detection '
                    'SavedModel.')
flags.DEFINE_float('confidence_threshold', 0.9,
'Min confidence to keep bounding boxes')
flags.DEFINE_integer('num_shards', 0, 'Number of output shards.')
FLAGS = flags.FLAGS
class GenerateDetectionDataFn(beam.DoFn):
"""Generates detection data for camera trap images.
This Beam DoFn performs inference with an object detection `saved_model` and
produces detection boxes for camera trap data, matched to the
object class.
"""
session_lock = threading.Lock()
def __init__(self, model_dir, confidence_threshold):
"""Initialization function.
Args:
      model_dir: A directory containing the saved model.
confidence_threshold: the confidence threshold for boxes to keep
"""
self._model_dir = model_dir
self._confidence_threshold = confidence_threshold
self._session = None
self._num_examples_processed = beam.metrics.Metrics.counter(
'detection_data_generation', 'num_tf_examples_processed')
def start_bundle(self):
self._load_inference_model()
def _load_inference_model(self):
# Because initialization of the tf.Session is expensive we share
# one instance across all threads in the worker. This is possible since
# tf.Session.run() is thread safe.
with self.session_lock:
if self._session is None:
graph = tf.Graph()
self._session = tf.Session(graph=graph)
with graph.as_default():
meta_graph = tf.saved_model.loader.load(
self._session, [tf.saved_model.tag_constants.SERVING],
self._model_dir)
signature = meta_graph.signature_def['serving_default']
input_tensor_name = signature.inputs['inputs'].name
self._input = graph.get_tensor_by_name(input_tensor_name)
self._boxes_node = graph.get_tensor_by_name(
signature.outputs['detection_boxes'].name)
self._scores_node = graph.get_tensor_by_name(
signature.outputs['detection_scores'].name)
self._num_detections_node = graph.get_tensor_by_name(
signature.outputs['num_detections'].name)
def process(self, tfrecord_entry):
return self._run_inference_and_generate_detections(tfrecord_entry)
def _run_inference_and_generate_detections(self, tfrecord_entry):
input_example = tf.train.Example.FromString(tfrecord_entry)
if input_example.features.feature[
'image/object/bbox/ymin'].float_list.value:
# There are already ground truth boxes for this image, just keep them.
return [input_example]
detection_boxes, detection_scores, num_detections = self._session.run(
[self._boxes_node, self._scores_node, self._num_detections_node],
feed_dict={self._input: [tfrecord_entry]})
example = tf.train.Example()
num_detections = int(num_detections[0])
image_class_labels = input_example.features.feature[
'image/object/class/label'].int64_list.value
image_class_texts = input_example.features.feature[
'image/object/class/text'].bytes_list.value
# Ignore any images with multiple classes,
# we can't match the class to the box.
if len(image_class_labels) > 1:
return []
# Don't add boxes for images already labeled empty (for now)
if len(image_class_labels) == 1:
# Add boxes over confidence threshold.
for idx, score in enumerate(detection_scores[0]):
if score >= self._confidence_threshold and idx < num_detections:
example.features.feature[
'image/object/bbox/ymin'].float_list.value.extend([
detection_boxes[0, idx, 0]])
example.features.feature[
'image/object/bbox/xmin'].float_list.value.extend([
detection_boxes[0, idx, 1]])
example.features.feature[
'image/object/bbox/ymax'].float_list.value.extend([
detection_boxes[0, idx, 2]])
example.features.feature[
'image/object/bbox/xmax'].float_list.value.extend([
detection_boxes[0, idx, 3]])
# Add box scores and class texts and labels.
example.features.feature[
'image/object/class/score'].float_list.value.extend(
[score])
example.features.feature[
'image/object/class/label'].int64_list.value.extend(
[image_class_labels[0]])
example.features.feature[
'image/object/class/text'].bytes_list.value.extend(
[image_class_texts[0]])
# Add other essential example attributes
example.features.feature['image/encoded'].bytes_list.value.extend(
input_example.features.feature['image/encoded'].bytes_list.value)
example.features.feature['image/height'].int64_list.value.extend(
input_example.features.feature['image/height'].int64_list.value)
example.features.feature['image/width'].int64_list.value.extend(
input_example.features.feature['image/width'].int64_list.value)
example.features.feature['image/source_id'].bytes_list.value.extend(
input_example.features.feature['image/source_id'].bytes_list.value)
example.features.feature['image/location'].bytes_list.value.extend(
input_example.features.feature['image/location'].bytes_list.value)
example.features.feature['image/date_captured'].bytes_list.value.extend(
input_example.features.feature['image/date_captured'].bytes_list.value)
example.features.feature['image/class/text'].bytes_list.value.extend(
input_example.features.feature['image/class/text'].bytes_list.value)
example.features.feature['image/class/label'].int64_list.value.extend(
input_example.features.feature['image/class/label'].int64_list.value)
example.features.feature['image/seq_id'].bytes_list.value.extend(
input_example.features.feature['image/seq_id'].bytes_list.value)
example.features.feature['image/seq_num_frames'].int64_list.value.extend(
input_example.features.feature['image/seq_num_frames'].int64_list.value)
example.features.feature['image/seq_frame_num'].int64_list.value.extend(
input_example.features.feature['image/seq_frame_num'].int64_list.value)
self._num_examples_processed.inc(1)
return [example]
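# Illustrative sketch (the saved model path is a placeholder): the DoFn can be
# exercised directly, as the unit tests do, without constructing a pipeline:
#
#   fn = GenerateDetectionDataFn('/path/to/saved_model',
#                                confidence_threshold=0.9)
#   fn.start_bundle()
#   output_examples = fn.process(serialized_tf_example)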
def construct_pipeline(input_tfrecord, output_tfrecord, model_dir,
confidence_threshold, num_shards):
"""Returns a Beam pipeline to run object detection inference.
Args:
input_tfrecord: A TFRecord of tf.train.Example protos containing images.
output_tfrecord: A TFRecord of tf.train.Example protos that contain images
in the input TFRecord and the detections from the model.
model_dir: Path to `saved_model` to use for inference.
confidence_threshold: Threshold to use when keeping detection results.
num_shards: The number of output shards.
Returns:
pipeline: A Beam pipeline.
"""
def pipeline(root):
input_collection = (
root | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
input_tfrecord,
coder=beam.coders.BytesCoder()))
output_collection = input_collection | 'RunInference' >> beam.ParDo(
GenerateDetectionDataFn(model_dir, confidence_threshold))
output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
_ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
output_tfrecord,
num_shards=num_shards,
coder=beam.coders.ProtoCoder(tf.train.Example))
return pipeline
def main(_):
"""Runs the Beam pipeline that performs inference.
Args:
_: unused
"""
# must create before flags are used
runner = runners.DirectRunner()
dirname = os.path.dirname(FLAGS.detection_output_tfrecord)
tf.io.gfile.makedirs(dirname)
runner.run(
construct_pipeline(FLAGS.detection_input_tfrecord,
FLAGS.detection_output_tfrecord,
FLAGS.detection_model_dir,
FLAGS.confidence_threshold,
FLAGS.num_shards))
if __name__ == '__main__':
flags.mark_flags_as_required([
'detection_input_tfrecord',
'detection_output_tfrecord',
'detection_model_dir'
])
app.run(main)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for generate_detection_data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import os
import tempfile
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
from object_detection import exporter
from object_detection.builders import model_builder
from object_detection.core import model
from object_detection.dataset_tools.context_rcnn import generate_detection_data
from object_detection.protos import pipeline_pb2
from object_detection.utils import tf_version
from apache_beam import runners
if six.PY2:
import mock # pylint: disable=g-import-not-at-top
else:
mock = unittest.mock
class FakeModel(model.DetectionModel):
"""A Fake Detection model with expected output nodes from post-processing."""
def preprocess(self, inputs):
true_image_shapes = [] # Doesn't matter for the fake model.
return tf.identity(inputs), true_image_shapes
def predict(self, preprocessed_inputs, true_image_shapes):
return {'image': tf.layers.conv2d(preprocessed_inputs, 3, 1)}
def postprocess(self, prediction_dict, true_image_shapes):
with tf.control_dependencies(prediction_dict.values()):
postprocessed_tensors = {
'detection_boxes': tf.constant([[[0.0, 0.1, 0.5, 0.6],
[0.5, 0.5, 0.8, 0.8]]], tf.float32),
'detection_scores': tf.constant([[0.95, 0.6]], tf.float32),
'detection_multiclass_scores': tf.constant([[[0.1, 0.7, 0.2],
[0.3, 0.1, 0.6]]],
tf.float32),
'detection_classes': tf.constant([[0, 1]], tf.float32),
'num_detections': tf.constant([2], tf.float32)
}
return postprocessed_tensors
def restore_map(self, checkpoint_path, fine_tune_checkpoint_type):
pass
def loss(self, prediction_dict, true_image_shapes):
pass
def regularization_losses(self):
pass
def updates(self):
pass
@contextlib.contextmanager
def InMemoryTFRecord(entries):
temp = tempfile.NamedTemporaryFile(delete=False)
filename = temp.name
try:
with tf.python_io.TFRecordWriter(filename) as writer:
for value in entries:
writer.write(value)
yield filename
finally:
os.unlink(filename)
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class GenerateDetectionDataTest(tf.test.TestCase):
def _save_checkpoint_from_mock_model(self, checkpoint_path):
"""A function to save checkpoint from a fake Detection Model.
Args:
checkpoint_path: Path to save checkpoint from Fake model.
"""
g = tf.Graph()
with g.as_default():
mock_model = FakeModel(num_classes=5)
preprocessed_inputs, true_image_shapes = mock_model.preprocess(
tf.placeholder(tf.float32, shape=[None, None, None, 3]))
predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
mock_model.postprocess(predictions, true_image_shapes)
tf.train.get_or_create_global_step()
saver = tf.train.Saver()
init = tf.global_variables_initializer()
with self.test_session(graph=g) as sess:
sess.run(init)
saver.save(sess, checkpoint_path)
def _export_saved_model(self):
tmp_dir = self.get_temp_dir()
checkpoint_path = os.path.join(tmp_dir, 'model.ckpt')
self._save_checkpoint_from_mock_model(checkpoint_path)
output_directory = os.path.join(tmp_dir, 'output')
saved_model_path = os.path.join(output_directory, 'saved_model')
tf.io.gfile.makedirs(output_directory)
with mock.patch.object(
model_builder, 'build', autospec=True) as mock_builder:
mock_builder.return_value = FakeModel(num_classes=5)
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
pipeline_config.eval_config.use_moving_averages = False
detection_model = model_builder.build(pipeline_config.model,
is_training=False)
outputs, placeholder_tensor = exporter.build_detection_graph(
input_type='tf_example',
detection_model=detection_model,
input_shape=None,
output_collection_name='inference_op',
graph_hook_fn=None)
output_node_names = ','.join(outputs.keys())
saver = tf.train.Saver()
input_saver_def = saver.as_saver_def()
frozen_graph_def = exporter.freeze_graph_with_def_protos(
input_graph_def=tf.get_default_graph().as_graph_def(),
input_saver_def=input_saver_def,
input_checkpoint=checkpoint_path,
output_node_names=output_node_names,
restore_op_name='save/restore_all',
filename_tensor_name='save/Const:0',
output_graph='',
clear_devices=True,
initializer_nodes='')
exporter.write_saved_model(
saved_model_path=saved_model_path,
frozen_graph_def=frozen_graph_def,
inputs=placeholder_tensor,
outputs=outputs)
return saved_model_path
def _create_tf_example(self):
with self.test_session():
encoded_image = tf.image.encode_jpeg(
tf.constant(np.ones((4, 6, 3)).astype(np.uint8))).eval()
def BytesFeature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def Int64Feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
example = tf.train.Example(features=tf.train.Features(feature={
'image/encoded': BytesFeature(encoded_image),
'image/source_id': BytesFeature(b'image_id'),
'image/height': Int64Feature(4),
'image/width': Int64Feature(6),
'image/object/class/label': Int64Feature(5),
'image/object/class/text': BytesFeature(b'hyena'),
'image/class/label': Int64Feature(5),
'image/class/text': BytesFeature(b'hyena'),
}))
return example.SerializeToString()
def assert_expected_example(self, example):
self.assertAllClose(
example.features.feature['image/object/bbox/ymin'].float_list.value,
[0.0])
self.assertAllClose(
example.features.feature['image/object/bbox/xmin'].float_list.value,
[0.1])
self.assertAllClose(
example.features.feature['image/object/bbox/ymax'].float_list.value,
[0.5])
self.assertAllClose(
example.features.feature['image/object/bbox/xmax'].float_list.value,
[0.6])
self.assertAllClose(
example.features.feature['image/object/class/score']
.float_list.value, [0.95])
self.assertAllClose(
example.features.feature['image/object/class/label']
.int64_list.value, [5])
self.assertAllEqual(
example.features.feature['image/object/class/text']
.bytes_list.value, [b'hyena'])
self.assertAllClose(
example.features.feature['image/class/label']
.int64_list.value, [5])
self.assertAllEqual(
example.features.feature['image/class/text']
.bytes_list.value, [b'hyena'])
# Check other essential attributes.
self.assertAllEqual(
example.features.feature['image/height'].int64_list.value, [4])
self.assertAllEqual(
example.features.feature['image/width'].int64_list.value, [6])
self.assertAllEqual(
example.features.feature['image/source_id'].bytes_list.value,
[b'image_id'])
self.assertTrue(
example.features.feature['image/encoded'].bytes_list.value)
def test_generate_detection_data_fn(self):
saved_model_path = self._export_saved_model()
confidence_threshold = 0.8
inference_fn = generate_detection_data.GenerateDetectionDataFn(
saved_model_path, confidence_threshold)
inference_fn.start_bundle()
generated_example = self._create_tf_example()
self.assertAllEqual(tf.train.Example.FromString(
generated_example).features.feature['image/object/class/label']
.int64_list.value, [5])
self.assertAllEqual(tf.train.Example.FromString(
generated_example).features.feature['image/object/class/text']
.bytes_list.value, [b'hyena'])
output = inference_fn.process(generated_example)
output_example = output[0]
self.assertAllEqual(
output_example.features.feature['image/object/class/label']
.int64_list.value, [5])
self.assertAllEqual(output_example.features.feature['image/width']
.int64_list.value, [6])
self.assert_expected_example(output_example)
def test_beam_pipeline(self):
with InMemoryTFRecord([self._create_tf_example()]) as input_tfrecord:
runner = runners.DirectRunner()
temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
saved_model_path = self._export_saved_model()
confidence_threshold = 0.8
num_shards = 1
pipeline = generate_detection_data.construct_pipeline(
input_tfrecord, output_tfrecord, saved_model_path,
confidence_threshold, num_shards)
runner.run(pipeline)
filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
actual_output = []
record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
for record in record_iterator:
actual_output.append(record)
self.assertEqual(len(actual_output), 1)
self.assert_expected_example(tf.train.Example.FromString(
actual_output[0]))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A Beam job to generate embedding data for camera trap images.
This tool runs inference with an exported Object Detection model in
`saved_model` format and produces raw embeddings for camera trap data. These
embeddings contain an object-centric feature embedding from Faster R-CNN, an
encoding of the datetime at which the image was taken (each field scaled to
roughly [0, 1]), and the position and size of the object of interest. By
default, only the highest-scoring object embedding is included.
Steps to generate an embedding dataset:
1. Use object_detection/export_inference_graph.py to get a Faster R-CNN
`saved_model` for inference. The input node must accept a tf.Example proto.
2. Run this tool with `saved_model` from step 1 and a TFRecord of tf.Example
protos containing images for inference.
Example Usage:
--------------
python tensorflow_models/object_detection/export_inference_graph.py \
--alsologtostderr \
--input_type tf_example \
--pipeline_config_path path/to/faster_rcnn_model.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory
python generate_embedding_data.py \
--alsologtostderr \
--embedding_input_tfrecord path/to/input_tfrecords* \
--embedding_output_tfrecord path/to/output_tfrecords \
--embedding_model_dir path/to/exported_model_directory/saved_model
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import os
import threading
from absl import app
from absl import flags
import apache_beam as beam
import numpy as np
import six
import tensorflow.compat.v1 as tf
from apache_beam import runners
flags.DEFINE_string('embedding_input_tfrecord', None, 'TFRecord containing '
                    'images in tf.Example format for object detection.')
flags.DEFINE_string('embedding_output_tfrecord', None,
'TFRecord containing embeddings in tf.Example format.')
flags.DEFINE_string('embedding_model_dir', None,
                    'Path to directory containing an object detection '
                    'SavedModel with detection_box_classifier_features in '
                    'the output.')
flags.DEFINE_integer('top_k_embedding_count', 1,
'The number of top k embeddings to add to the memory bank.'
)
flags.DEFINE_integer('bottom_k_embedding_count', 0,
'The number of bottom k embeddings to add to the memory '
'bank.')
flags.DEFINE_integer('num_shards', 0, 'Number of output shards.')
FLAGS = flags.FLAGS
class GenerateEmbeddingDataFn(beam.DoFn):
"""Generates embedding data for camera trap images.
This Beam DoFn performs inference with an object detection `saved_model` and
produces contextual embedding vectors.
"""
session_lock = threading.Lock()
def __init__(self, model_dir, top_k_embedding_count,
bottom_k_embedding_count):
"""Initialization function.
Args:
      model_dir: A directory containing the saved model.
top_k_embedding_count: the number of high-confidence embeddings to store
bottom_k_embedding_count: the number of low-confidence embeddings to store
"""
self._model_dir = model_dir
self._session = None
self._num_examples_processed = beam.metrics.Metrics.counter(
'embedding_data_generation', 'num_tf_examples_processed')
self._top_k_embedding_count = top_k_embedding_count
self._bottom_k_embedding_count = bottom_k_embedding_count
def start_bundle(self):
self._load_inference_model()
def _load_inference_model(self):
# Because initialization of the tf.Session is expensive we share
# one instance across all threads in the worker. This is possible since
# tf.Session.run() is thread safe.
with self.session_lock:
if self._session is None:
graph = tf.Graph()
self._session = tf.Session(graph=graph)
with graph.as_default():
meta_graph = tf.saved_model.loader.load(
self._session, [tf.saved_model.tag_constants.SERVING],
self._model_dir)
signature = meta_graph.signature_def['serving_default']
input_tensor_name = signature.inputs['inputs'].name
detection_features_name = signature.outputs['detection_features'].name
detection_boxes_name = signature.outputs['detection_boxes'].name
num_detections_name = signature.outputs['num_detections'].name
self._input = graph.get_tensor_by_name(input_tensor_name)
self._embedding_node = graph.get_tensor_by_name(detection_features_name)
self._box_node = graph.get_tensor_by_name(detection_boxes_name)
self._scores_node = graph.get_tensor_by_name(
signature.outputs['detection_scores'].name)
self._num_detections = graph.get_tensor_by_name(num_detections_name)
tf.logging.info(signature.outputs['detection_features'].name)
tf.logging.info(signature.outputs['detection_boxes'].name)
tf.logging.info(signature.outputs['num_detections'].name)
def process(self, tfrecord_entry):
return self._run_inference_and_generate_embedding(tfrecord_entry)
def _run_inference_and_generate_embedding(self, tfrecord_entry):
input_example = tf.train.Example.FromString(tfrecord_entry)
# Convert date_captured datetime string to unix time integer and store
def get_date_captured(example):
date_captured = datetime.datetime.strptime(
six.ensure_str(
example.features.feature[
'image/date_captured'].bytes_list.value[0]),
'%Y-%m-%d %H:%M:%S')
return date_captured
try:
date_captured = get_date_captured(input_example)
except Exception: # pylint: disable=broad-except
# we require date_captured to be available for all images
return []
def embed_date_captured(date_captured):
"""Encodes the datetime of the image."""
embedded_date_captured = []
month_max = 12.0
day_max = 31.0
hour_max = 24.0
minute_max = 60.0
min_year = 1990.0
max_year = 2030.0
year = (date_captured.year-min_year)/float(max_year-min_year)
embedded_date_captured.append(year)
month = (date_captured.month-1)/month_max
embedded_date_captured.append(month)
day = (date_captured.day-1)/day_max
embedded_date_captured.append(day)
hour = date_captured.hour/hour_max
embedded_date_captured.append(hour)
minute = date_captured.minute/minute_max
embedded_date_captured.append(minute)
return np.asarray(embedded_date_captured)
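    # Worked example: '2019-10-20 12:30:00' maps to approximately
    # [(2019 - 1990) / 40, 9 / 12, 19 / 31, 12 / 24, 30 / 60]
    # = [0.725, 0.75, 0.613, 0.5, 0.5].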
def embed_position_and_size(box):
"""Encodes the bounding box of the object of interest."""
ymin = box[0]
xmin = box[1]
ymax = box[2]
xmax = box[3]
w = xmax - xmin
h = ymax - ymin
x = xmin + w / 2.0
y = ymin + h / 2.0
return np.asarray([x, y, w, h])
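    # For example, a box [ymin, xmin, ymax, xmax] = [0.0, 0.1, 0.5, 0.6]
    # becomes [center_x, center_y, width, height] = [0.35, 0.25, 0.5, 0.5].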
unix_time = (
(date_captured - datetime.datetime.fromtimestamp(0)).total_seconds())
example = tf.train.Example()
example.features.feature['image/unix_time'].float_list.value.extend(
[unix_time])
(detection_features, detection_boxes, num_detections,
detection_scores) = self._session.run(
[
self._embedding_node, self._box_node, self._num_detections[0],
self._scores_node
],
feed_dict={self._input: [tfrecord_entry]})
num_detections = int(num_detections)
embed_all = []
score_all = []
detection_features = np.asarray(detection_features)
def get_bb_embedding(detection_features, detection_boxes, detection_scores,
index):
embedding = detection_features[0][index]
pooled_embedding = np.mean(np.mean(embedding, axis=1), axis=0)
box = detection_boxes[0][index]
position_embedding = embed_position_and_size(box)
score = detection_scores[0][index]
return np.concatenate((pooled_embedding, position_embedding)), score
temporal_embedding = embed_date_captured(date_captured)
embedding_count = 0
for index in range(min(num_detections, self._top_k_embedding_count)):
bb_embedding, score = get_bb_embedding(
detection_features, detection_boxes, detection_scores, index)
embed_all.extend(bb_embedding)
embed_all.extend(temporal_embedding)
score_all.append(score)
embedding_count += 1
for index in range(
max(0, num_detections - 1),
max(-1, num_detections - 1 - self._bottom_k_embedding_count), -1):
bb_embedding, score = get_bb_embedding(
detection_features, detection_boxes, detection_scores, index)
embed_all.extend(bb_embedding)
embed_all.extend(temporal_embedding)
score_all.append(score)
embedding_count += 1
if embedding_count == 0:
bb_embedding, score = get_bb_embedding(
detection_features, detection_boxes, detection_scores, 0)
embed_all.extend(bb_embedding)
embed_all.extend(temporal_embedding)
score_all.append(score)
# Takes max in case embedding_count is 0.
embedding_length = len(embed_all) // max(1, embedding_count)
embed_all = np.asarray(embed_all)
example.features.feature['image/embedding'].float_list.value.extend(
embed_all)
example.features.feature['image/embedding_score'].float_list.value.extend(
score_all)
example.features.feature['image/embedding_length'].int64_list.value.append(
embedding_length)
example.features.feature['image/embedding_count'].int64_list.value.append(
embedding_count)
# Add other essential example attributes
example.features.feature['image/encoded'].bytes_list.value.extend(
input_example.features.feature['image/encoded'].bytes_list.value)
example.features.feature['image/height'].int64_list.value.extend(
input_example.features.feature['image/height'].int64_list.value)
example.features.feature['image/width'].int64_list.value.extend(
input_example.features.feature['image/width'].int64_list.value)
example.features.feature['image/source_id'].bytes_list.value.extend(
input_example.features.feature['image/source_id'].bytes_list.value)
example.features.feature['image/location'].bytes_list.value.extend(
input_example.features.feature['image/location'].bytes_list.value)
example.features.feature['image/date_captured'].bytes_list.value.extend(
input_example.features.feature['image/date_captured'].bytes_list.value)
example.features.feature['image/class/text'].bytes_list.value.extend(
input_example.features.feature['image/class/text'].bytes_list.value)
example.features.feature['image/class/label'].int64_list.value.extend(
input_example.features.feature['image/class/label'].int64_list.value)
example.features.feature['image/seq_id'].bytes_list.value.extend(
input_example.features.feature['image/seq_id'].bytes_list.value)
example.features.feature['image/seq_num_frames'].int64_list.value.extend(
input_example.features.feature['image/seq_num_frames'].int64_list.value)
example.features.feature['image/seq_frame_num'].int64_list.value.extend(
input_example.features.feature['image/seq_frame_num'].int64_list.value)
example.features.feature['image/object/bbox/ymax'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/ymax'].float_list.value)
example.features.feature['image/object/bbox/ymin'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/ymin'].float_list.value)
example.features.feature['image/object/bbox/xmax'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/xmax'].float_list.value)
example.features.feature['image/object/bbox/xmin'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/xmin'].float_list.value)
example.features.feature[
'image/object/class/score'].float_list.value.extend(
input_example.features.feature[
'image/object/class/score'].float_list.value)
example.features.feature[
'image/object/class/label'].int64_list.value.extend(
input_example.features.feature[
'image/object/class/label'].int64_list.value)
example.features.feature[
'image/object/class/text'].bytes_list.value.extend(
input_example.features.feature[
'image/object/class/text'].bytes_list.value)
self._num_examples_processed.inc(1)
return [example]
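# Illustrative sketch (the saved model path is a placeholder): running the DoFn
# directly on one serialized tf.Example yields an example whose
# 'image/embedding' feature holds embedding_count * embedding_length floats:
#
#   fn = GenerateEmbeddingDataFn('/path/to/saved_model',
#                                top_k_embedding_count=1,
#                                bottom_k_embedding_count=0)
#   fn.start_bundle()
#   embedded_example = fn.process(serialized_tf_example)[0]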
def construct_pipeline(input_tfrecord, output_tfrecord, model_dir,
top_k_embedding_count, bottom_k_embedding_count,
num_shards):
"""Returns a beam pipeline to run object detection inference.
Args:
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
in the input TFRecord and the detections from the model.
model_dir: Path to `saved_model` to use for inference.
top_k_embedding_count: The number of high-confidence embeddings to store.
bottom_k_embedding_count: The number of low-confidence embeddings to store.
num_shards: The number of output shards.
"""
def pipeline(root):
input_collection = (
root | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
input_tfrecord,
coder=beam.coders.BytesCoder()))
output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
bottom_k_embedding_count))
output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
_ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
output_tfrecord,
num_shards=num_shards,
coder=beam.coders.ProtoCoder(tf.train.Example))
return pipeline
def main(_):
"""Runs the Beam pipeline that performs inference.
Args:
_: unused
"""
# must create before flags are used
runner = runners.DirectRunner()
dirname = os.path.dirname(FLAGS.embedding_output_tfrecord)
tf.io.gfile.makedirs(dirname)
runner.run(
construct_pipeline(FLAGS.embedding_input_tfrecord,
FLAGS.embedding_output_tfrecord,
FLAGS.embedding_model_dir, FLAGS.top_k_embedding_count,
FLAGS.bottom_k_embedding_count, FLAGS.num_shards))
if __name__ == '__main__':
flags.mark_flags_as_required([
'embedding_input_tfrecord',
'embedding_output_tfrecord',
'embedding_model_dir'
])
app.run(main)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for generate_embedding_data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import os
import tempfile
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
from object_detection import exporter
from object_detection.builders import model_builder
from object_detection.core import model
from object_detection.dataset_tools.context_rcnn import generate_embedding_data
from object_detection.protos import pipeline_pb2
from object_detection.utils import tf_version
from apache_beam import runners
if six.PY2:
import mock # pylint: disable=g-import-not-at-top
else:
mock = unittest.mock
class FakeModel(model.DetectionModel):
"""A Fake Detection model with expected output nodes from post-processing."""
def preprocess(self, inputs):
true_image_shapes = [] # Doesn't matter for the fake model.
return tf.identity(inputs), true_image_shapes
def predict(self, preprocessed_inputs, true_image_shapes):
return {'image': tf.layers.conv2d(preprocessed_inputs, 3, 1)}
def postprocess(self, prediction_dict, true_image_shapes):
with tf.control_dependencies(prediction_dict.values()):
num_features = 100
feature_dims = 10
classifier_feature = np.ones(
(2, feature_dims, feature_dims, num_features),
dtype=np.float32).tolist()
postprocessed_tensors = {
'detection_boxes': tf.constant([[[0.0, 0.1, 0.5, 0.6],
[0.5, 0.5, 0.8, 0.8]]], tf.float32),
'detection_scores': tf.constant([[0.95, 0.6]], tf.float32),
'detection_multiclass_scores': tf.constant([[[0.1, 0.7, 0.2],
[0.3, 0.1, 0.6]]],
tf.float32),
'detection_classes': tf.constant([[0, 1]], tf.float32),
'num_detections': tf.constant([2], tf.float32),
'detection_features':
tf.constant([classifier_feature],
tf.float32)
}
return postprocessed_tensors
def restore_map(self, checkpoint_path, fine_tune_checkpoint_type):
pass
def loss(self, prediction_dict, true_image_shapes):
pass
def regularization_losses(self):
pass
def updates(self):
pass
@contextlib.contextmanager
def InMemoryTFRecord(entries):
temp = tempfile.NamedTemporaryFile(delete=False)
filename = temp.name
try:
with tf.python_io.TFRecordWriter(filename) as writer:
for value in entries:
writer.write(value)
yield filename
finally:
os.unlink(temp.name)
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class GenerateEmbeddingData(tf.test.TestCase):
def _save_checkpoint_from_mock_model(self, checkpoint_path):
"""A function to save checkpoint from a fake Detection Model.
Args:
checkpoint_path: Path to save checkpoint from Fake model.
"""
g = tf.Graph()
with g.as_default():
mock_model = FakeModel(num_classes=5)
preprocessed_inputs, true_image_shapes = mock_model.preprocess(
tf.placeholder(tf.float32, shape=[None, None, None, 3]))
predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
mock_model.postprocess(predictions, true_image_shapes)
tf.train.get_or_create_global_step()
saver = tf.train.Saver()
init = tf.global_variables_initializer()
with self.test_session(graph=g) as sess:
sess.run(init)
saver.save(sess, checkpoint_path)
def _export_saved_model(self):
tmp_dir = self.get_temp_dir()
checkpoint_path = os.path.join(tmp_dir, 'model.ckpt')
self._save_checkpoint_from_mock_model(checkpoint_path)
output_directory = os.path.join(tmp_dir, 'output')
saved_model_path = os.path.join(output_directory, 'saved_model')
tf.io.gfile.makedirs(output_directory)
with mock.patch.object(
model_builder, 'build', autospec=True) as mock_builder:
mock_builder.return_value = FakeModel(num_classes=5)
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
pipeline_config.eval_config.use_moving_averages = False
detection_model = model_builder.build(pipeline_config.model,
is_training=False)
outputs, placeholder_tensor = exporter.build_detection_graph(
input_type='tf_example',
detection_model=detection_model,
input_shape=None,
output_collection_name='inference_op',
graph_hook_fn=None)
output_node_names = ','.join(outputs.keys())
saver = tf.train.Saver()
input_saver_def = saver.as_saver_def()
frozen_graph_def = exporter.freeze_graph_with_def_protos(
input_graph_def=tf.get_default_graph().as_graph_def(),
input_saver_def=input_saver_def,
input_checkpoint=checkpoint_path,
output_node_names=output_node_names,
restore_op_name='save/restore_all',
filename_tensor_name='save/Const:0',
output_graph='',
clear_devices=True,
initializer_nodes='')
exporter.write_saved_model(
saved_model_path=saved_model_path,
frozen_graph_def=frozen_graph_def,
inputs=placeholder_tensor,
outputs=outputs)
return saved_model_path
def _create_tf_example(self):
with self.test_session():
encoded_image = tf.image.encode_jpeg(
tf.constant(np.ones((4, 4, 3)).astype(np.uint8))).eval()
def BytesFeature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def Int64Feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def FloatFeature(value):
return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': BytesFeature(encoded_image),
'image/source_id': BytesFeature(b'image_id'),
'image/height': Int64Feature(400),
'image/width': Int64Feature(600),
'image/class/label': Int64Feature(5),
'image/class/text': BytesFeature(b'hyena'),
'image/object/bbox/xmin': FloatFeature(0.1),
'image/object/bbox/xmax': FloatFeature(0.6),
'image/object/bbox/ymin': FloatFeature(0.0),
'image/object/bbox/ymax': FloatFeature(0.5),
'image/object/class/score': FloatFeature(0.95),
'image/object/class/label': Int64Feature(5),
'image/object/class/text': BytesFeature(b'hyena'),
'image/date_captured': BytesFeature(b'2019-10-20 12:12:12')
}))
return example.SerializeToString()
def assert_expected_example(self, example, topk=False, botk=False):
# Check embeddings
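    # With the FakeModel above, each stored box contributes 100 pooled feature
    # values + 4 position values + 5 date values = 109 floats, so two stored
    # boxes (top-k or bottom-k of 2) give 218.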
if topk or botk:
self.assertEqual(len(
example.features.feature['image/embedding'].float_list.value),
218)
self.assertAllEqual(
example.features.feature['image/embedding_count'].int64_list.value,
[2])
else:
self.assertEqual(len(
example.features.feature['image/embedding'].float_list.value),
109)
self.assertAllEqual(
example.features.feature['image/embedding_count'].int64_list.value,
[1])
self.assertAllEqual(
example.features.feature['image/embedding_length'].int64_list.value,
[109])
# Check annotations
self.assertAllClose(
example.features.feature['image/object/bbox/ymin'].float_list.value,
[0.0])
self.assertAllClose(
example.features.feature['image/object/bbox/xmin'].float_list.value,
[0.1])
self.assertAllClose(
example.features.feature['image/object/bbox/ymax'].float_list.value,
[0.5])
self.assertAllClose(
example.features.feature['image/object/bbox/xmax'].float_list.value,
[0.6])
self.assertAllClose(
example.features.feature['image/object/class/score']
.float_list.value, [0.95])
self.assertAllClose(
example.features.feature['image/object/class/label']
.int64_list.value, [5])
self.assertAllEqual(
example.features.feature['image/object/class/text']
        .bytes_list.value, [b'hyena'])
self.assertAllClose(
example.features.feature['image/class/label']
.int64_list.value, [5])
self.assertAllEqual(
example.features.feature['image/class/text']
        .bytes_list.value, [b'hyena'])
# Check other essential attributes.
self.assertAllEqual(
example.features.feature['image/height'].int64_list.value, [400])
self.assertAllEqual(
example.features.feature['image/width'].int64_list.value, [600])
self.assertAllEqual(
example.features.feature['image/source_id'].bytes_list.value,
        [b'image_id'])
self.assertTrue(
example.features.feature['image/encoded'].bytes_list.value)
def test_generate_embedding_data_fn(self):
saved_model_path = self._export_saved_model()
top_k_embedding_count = 1
bottom_k_embedding_count = 0
inference_fn = generate_embedding_data.GenerateEmbeddingDataFn(
saved_model_path, top_k_embedding_count, bottom_k_embedding_count)
inference_fn.start_bundle()
generated_example = self._create_tf_example()
self.assertAllEqual(tf.train.Example.FromString(
generated_example).features.feature['image/object/class/label']
.int64_list.value, [5])
self.assertAllEqual(tf.train.Example.FromString(
generated_example).features.feature['image/object/class/text']
        .bytes_list.value, [b'hyena'])
output = inference_fn.process(generated_example)
output_example = output[0]
self.assert_expected_example(output_example)
def test_generate_embedding_data_with_top_k_boxes(self):
saved_model_path = self._export_saved_model()
top_k_embedding_count = 2
bottom_k_embedding_count = 0
inference_fn = generate_embedding_data.GenerateEmbeddingDataFn(
saved_model_path, top_k_embedding_count, bottom_k_embedding_count)
inference_fn.start_bundle()
generated_example = self._create_tf_example()
self.assertAllEqual(
tf.train.Example.FromString(generated_example).features
.feature['image/object/class/label'].int64_list.value, [5])
self.assertAllEqual(
tf.train.Example.FromString(generated_example).features
.feature['image/object/class/text'].bytes_list.value, [b'hyena'])
output = inference_fn.process(generated_example)
output_example = output[0]
self.assert_expected_example(output_example, topk=True)
def test_generate_embedding_data_with_bottom_k_boxes(self):
saved_model_path = self._export_saved_model()
top_k_embedding_count = 0
bottom_k_embedding_count = 2
inference_fn = generate_embedding_data.GenerateEmbeddingDataFn(
saved_model_path, top_k_embedding_count, bottom_k_embedding_count)
inference_fn.start_bundle()
generated_example = self._create_tf_example()
self.assertAllEqual(
tf.train.Example.FromString(generated_example).features
.feature['image/object/class/label'].int64_list.value, [5])
self.assertAllEqual(
tf.train.Example.FromString(generated_example).features
        .feature['image/object/class/text'].bytes_list.value, [b'hyena'])
output = inference_fn.process(generated_example)
output_example = output[0]
self.assert_expected_example(output_example, botk=True)
def test_beam_pipeline(self):
with InMemoryTFRecord([self._create_tf_example()]) as input_tfrecord:
runner = runners.DirectRunner()
temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
saved_model_path = self._export_saved_model()
top_k_embedding_count = 1
bottom_k_embedding_count = 0
num_shards = 1
pipeline = generate_embedding_data.construct_pipeline(
input_tfrecord, output_tfrecord, saved_model_path,
top_k_embedding_count, bottom_k_embedding_count, num_shards)
runner.run(pipeline)
filenames = tf.io.gfile.glob(
output_tfrecord + '-?????-of-?????')
actual_output = []
record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
for record in record_iterator:
actual_output.append(record)
self.assertEqual(len(actual_output), 1)
self.assert_expected_example(tf.train.Example.FromString(
actual_output[0]))


if __name__ == '__main__':
tf.test.main()
......@@ -24,10 +24,18 @@ import six
import tensorflow.compat.v1 as tf
from object_detection.dataset_tools import seq_example_util
from object_detection.utils import tf_version


class SeqExampleUtilTest(tf.test.TestCase):
def materialize_tensors(self, list_of_tensors):
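    """Returns numpy values for the tensors: eagerly in TF2, via a session in TF1."""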
if tf_version.is_tf2():
return [tensor.numpy() for tensor in list_of_tensors]
else:
with self.cached_session() as sess:
return sess.run(list_of_tensors)

  def test_make_unlabeled_example(self):
num_frames = 5
image_height = 100
......@@ -41,8 +49,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
image_source_ids = [str(idx) for idx in range(num_frames)]
images_list = tf.unstack(images, axis=0)
encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
with tf.Session() as sess:
encoded_images = sess.run(encoded_images_list)
encoded_images = self.materialize_tensors(encoded_images_list)
seq_example = seq_example_util.make_sequence_example(
dataset_name=dataset_name,
video_id=video_id,
......@@ -109,8 +116,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
dtype=tf.int32), dtype=tf.uint8)
images_list = tf.unstack(images, axis=0)
encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
with tf.Session() as sess:
encoded_images = sess.run(encoded_images_list)
encoded_images = self.materialize_tensors(encoded_images_list)
timestamps = [100000, 110000]
is_annotated = [1, 0]
bboxes = [
......@@ -208,8 +214,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
dtype=tf.int32), dtype=tf.uint8)
images_list = tf.unstack(images, axis=0)
encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
with tf.Session() as sess:
encoded_images = sess.run(encoded_images_list)
encoded_images = self.materialize_tensors(encoded_images_list)
bboxes = [
np.array([[0., 0., 0.75, 0.75],
[0., 0., 1., 1.]], dtype=np.float32),
......
......@@ -52,6 +52,8 @@ EVAL_METRICS_CLASS_DICT = {
coco_evaluation.CocoKeypointEvaluator,
'coco_mask_metrics':
coco_evaluation.CocoMaskEvaluator,
'coco_panoptic_metrics':
coco_evaluation.CocoPanopticSegmentationEvaluator,
'oid_challenge_detection_metrics':
object_detection_evaluation.OpenImagesDetectionChallengeEvaluator,
'oid_challenge_segmentation_metrics':
......
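
# A minimal sketch, assuming the eval_pb2.EvalConfig proto already used in the
# tests below: the newly registered 'coco_panoptic_metrics' evaluator is
# requested by name through EvalConfig.metrics_set, like any other entry in
# EVAL_METRICS_CLASS_DICT.
from object_detection.protos import eval_pb2

panoptic_eval_config = eval_pb2.EvalConfig()
panoptic_eval_config.metrics_set.extend(['coco_panoptic_metrics'])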
......@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from absl.testing import parameterized
import numpy as np
......@@ -30,6 +31,7 @@ from object_detection.core import standard_fields as fields
from object_detection.metrics import coco_evaluation
from object_detection.protos import eval_pb2
from object_detection.utils import test_case
from object_detection.utils import tf_version


class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
......@@ -127,6 +129,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
{'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
{'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
)
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_for_coco_detections(self, batch_size=1,
max_gt_boxes=None,
scale_to_absolute=False):
......@@ -155,6 +158,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
{'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
{'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
)
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_for_coco_detections_and_masks(
self, batch_size=1, max_gt_boxes=None, scale_to_absolute=False):
eval_config = eval_pb2.EvalConfig()
......@@ -185,6 +189,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
{'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
{'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
)
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_for_coco_detections_and_resized_masks(
self, batch_size=1, max_gt_boxes=None, scale_to_absolute=False):
eval_config = eval_pb2.EvalConfig()
......@@ -210,6 +215,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
self.assertAlmostEqual(1.0, metrics['DetectionBoxes_Precision/mAP'])
self.assertAlmostEqual(1.0, metrics['DetectionMasks_Precision/mAP'])
@unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
def test_get_eval_metric_ops_raises_error_with_unsupported_metric(self):
eval_config = eval_pb2.EvalConfig()
eval_config.metrics_set.extend(['unsupported_metric'])
......@@ -334,63 +340,67 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
dtype=np.float32)
detection_keypoints = np.array([[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]],
dtype=np.float32)
detections = {
detection_fields.detection_boxes:
tf.constant(detection_boxes),
detection_fields.detection_scores:
tf.constant([[1.], [1.]]),
detection_fields.detection_classes:
tf.constant([[1], [2]]),
detection_fields.num_detections:
tf.constant([1, 1]),
detection_fields.detection_keypoints:
tf.tile(
tf.reshape(
tf.constant(detection_keypoints), shape=[1, 1, 3, 2]),
multiples=[2, 1, 1, 1])
}
gt_boxes = detection_boxes
groundtruth = {
input_data_fields.groundtruth_boxes:
tf.constant(gt_boxes),
input_data_fields.groundtruth_classes:
tf.constant([[1.], [1.]]),
input_data_fields.groundtruth_keypoints:
tf.tile(
tf.reshape(
tf.constant(detection_keypoints), shape=[1, 1, 3, 2]),
multiples=[2, 1, 1, 1])
}
image = tf.zeros((2, 100, 100, 3), dtype=tf.float32)
true_image_shapes = tf.constant([[100, 100, 3], [50, 100, 3]])
original_image_spatial_shapes = tf.constant([[200, 200], [150, 300]])
result = eval_util.result_dict_for_batched_example(
image, key, detections, groundtruth,
scale_to_absolute=True,
true_image_shapes=true_image_shapes,
original_image_spatial_shapes=original_image_spatial_shapes,
max_gt_boxes=tf.constant(1))
with self.test_session() as sess:
result = sess.run(result)
self.assertAllEqual(
[[[0., 0., 200., 200.]], [[0.0, 0.0, 150., 150.]]],
result[input_data_fields.groundtruth_boxes])
self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
[[[0., 0.], [150., 150.], [300., 300.]]]],
result[input_data_fields.groundtruth_keypoints])
# Predictions from the model are not scaled.
self.assertAllEqual(
[[[0., 0., 200., 200.]], [[0.0, 0.0, 75., 150.]]],
result[detection_fields.detection_boxes])
self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
[[[0., 0.], [75., 150.], [150., 300.]]]],
result[detection_fields.detection_keypoints])
def graph_fn():
detections = {
detection_fields.detection_boxes:
tf.constant(detection_boxes),
detection_fields.detection_scores:
tf.constant([[1.], [1.]]),
detection_fields.detection_classes:
tf.constant([[1], [2]]),
detection_fields.num_detections:
tf.constant([1, 1]),
detection_fields.detection_keypoints:
tf.tile(
tf.reshape(
tf.constant(detection_keypoints), shape=[1, 1, 3, 2]),
multiples=[2, 1, 1, 1])
}
gt_boxes = detection_boxes
groundtruth = {
input_data_fields.groundtruth_boxes:
tf.constant(gt_boxes),
input_data_fields.groundtruth_classes:
tf.constant([[1.], [1.]]),
input_data_fields.groundtruth_keypoints:
tf.tile(
tf.reshape(
tf.constant(detection_keypoints), shape=[1, 1, 3, 2]),
multiples=[2, 1, 1, 1])
}
image = tf.zeros((2, 100, 100, 3), dtype=tf.float32)
true_image_shapes = tf.constant([[100, 100, 3], [50, 100, 3]])
original_image_spatial_shapes = tf.constant([[200, 200], [150, 300]])
result = eval_util.result_dict_for_batched_example(
image, key, detections, groundtruth,
scale_to_absolute=True,
true_image_shapes=true_image_shapes,
original_image_spatial_shapes=original_image_spatial_shapes,
max_gt_boxes=tf.constant(1))
return (result[input_data_fields.groundtruth_boxes],
result[input_data_fields.groundtruth_keypoints],
result[detection_fields.detection_boxes],
result[detection_fields.detection_keypoints])
(gt_boxes, gt_keypoints, detection_boxes,
detection_keypoints) = self.execute_cpu(graph_fn, [])
self.assertAllEqual(
[[[0., 0., 200., 200.]], [[0.0, 0.0, 150., 150.]]],
gt_boxes)
self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
[[[0., 0.], [150., 150.], [300., 300.]]]],
gt_keypoints)
# Predictions from the model are not scaled.
self.assertAllEqual(
[[[0., 0., 200., 200.]], [[0.0, 0.0, 75., 150.]]],
detection_boxes)
self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
[[[0., 0.], [75., 150.], [150., 300.]]]],
detection_keypoints)


if __name__ == '__main__':
......
......@@ -134,6 +134,30 @@ flags.DEFINE_string('config_override', '',
'text proto to override pipeline_config_path.')
flags.DEFINE_boolean('write_inference_graph', False,
'If true, writes inference graph to disk.')
flags.DEFINE_string('additional_output_tensor_names', None,
                    'Additional Tensors to output, to be specified as a '
                    'comma-separated list of tensor names.')
flags.DEFINE_boolean('use_side_inputs', False,
'If True, uses side inputs as well as image inputs.')
flags.DEFINE_string('side_input_shapes', None,
'If use_side_inputs is True, this explicitly sets '
'the shape of the side input tensors to a fixed size. The '
'dimensions are to be provided as a comma-separated list '
'of integers. A value of -1 can be used for unknown '
'dimensions. A `/` denotes a break, starting the shape of '
'the next side input tensor. This flag is required if '
'using side inputs.')
flags.DEFINE_string('side_input_types', None,
'If use_side_inputs is True, this explicitly sets '
'the type of the side input tensors. The '
'dimensions are to be provided as a comma-separated list '
'of types, each of `string`, `integer`, or `float`. '
'This flag is required if using side inputs.')
flags.DEFINE_string('side_input_names', None,
'If use_side_inputs is True, this explicitly sets '
                    'the names of the side input tensors required by the '
                    'model. The names are to be provided as a comma-separated '
                    'list of strings. This flag is required if using side '
                    'inputs.')
tf.app.flags.mark_flag_as_required('pipeline_config_path')
tf.app.flags.mark_flag_as_required('trained_checkpoint_prefix')
tf.app.flags.mark_flag_as_required('output_directory')
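
# A minimal sketch of the side-input flag format described above, using
# hypothetical flag values; the actual parsing is delegated to
# exporter.parse_side_inputs.
#   --side_input_names=image_scale,camera_id
#   --side_input_types=float,string
#   --side_input_shapes=1,-1/1   (a `/` separates the shape of each side
#                                 input; -1 marks an unknown dimension)
def _split_side_input_shapes(side_input_shapes_flag):
  """Splits a shapes flag such as '1,-1/1' into [[1, -1], [1]]."""
  return [[int(dim) for dim in shape.split(',')]
          for shape in side_input_shapes_flag.split('/')]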
......@@ -152,10 +176,30 @@ def main(_):
]
else:
input_shape = None
if FLAGS.use_side_inputs:
side_input_shapes, side_input_names, side_input_types = (
exporter.parse_side_inputs(
FLAGS.side_input_shapes,
FLAGS.side_input_names,
FLAGS.side_input_types))
else:
side_input_shapes = None
side_input_names = None
side_input_types = None
if FLAGS.additional_output_tensor_names:
additional_output_tensor_names = list(
FLAGS.additional_output_tensor_names.split(','))
else:
additional_output_tensor_names = None
exporter.export_inference_graph(
FLAGS.input_type, pipeline_config, FLAGS.trained_checkpoint_prefix,
FLAGS.output_directory, input_shape=input_shape,
write_inference_graph=FLAGS.write_inference_graph)
write_inference_graph=FLAGS.write_inference_graph,
additional_output_tensor_names=additional_output_tensor_names,
use_side_inputs=FLAGS.use_side_inputs,
side_input_shapes=side_input_shapes,
side_input_names=side_input_names,
side_input_types=side_input_types)


if __name__ == '__main__':
......
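
# An illustrative invocation with hypothetical values (script name and
# --input_type value assumed), showing how the new side-input flags combine
# with the existing required flags:
#
#   python export_inference_graph.py \
#     --input_type=image_tensor \
#     --pipeline_config_path=/path/to/pipeline.config \
#     --trained_checkpoint_prefix=/path/to/model.ckpt \
#     --output_directory=/path/to/exported_model \
#     --use_side_inputs=True \
#     --side_input_names=image_scale \
#     --side_input_types=float \
#     --side_input_shapes=1,-1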