ModelZoo / ResNet50_tensorflow

Commit b0ccdb11, authored Sep 28, 2020 by Shixin Luo

    resolve conflict with master

Parents: e61588cd, 1611a8c5
Showing 20 changed files with 1072 additions and 75 deletions (+1072 −75).
- research/delf/delf/python/delg/r101delg_gldv2clean_config.pbtxt (+22 −0)
- research/delf/delf/python/delg/r50delg_gldv2clean_config.pbtxt (+22 −0)
- research/delf/delf/python/examples/extractor.py (+11 −12)
- research/delf/delf/python/training/README.md (+12 −7)
- research/object_detection/README.md (+1 −1)
- research/object_detection/builders/model_builder.py (+30 −3)
- research/object_detection/core/box_list_ops.py (+4 −1)
- research/object_detection/core/model.py (+18 −1)
- research/object_detection/core/preprocessor.py (+8 −3)
- research/object_detection/core/preprocessor_test.py (+26 −0)
- research/object_detection/core/region_similarity_calculator.py (+34 −0)
- research/object_detection/core/region_similarity_calculator_test.py (+19 −0)
- research/object_detection/core/standard_fields.py (+11 −0)
- research/object_detection/core/target_assigner.py (+291 −1)
- research/object_detection/core/target_assigner_test.py (+210 −0)
- research/object_detection/g3doc/running_on_mobile_tf2.md (+21 −4)
- research/object_detection/g3doc/tf2_detection_zoo.md (+1 −1)
- research/object_detection/meta_architectures/center_net_meta_arch.py (+219 −30)
- research/object_detection/meta_architectures/center_net_meta_arch_tf2_test.py (+108 −7)
- research/object_detection/metrics/coco_evaluation.py (+4 −4)
research/delf/delf/python/delg/r101delg_gldv2clean_config.pbtxt (new file, mode 100644)

```
use_local_features: true
use_global_features: true
model_path: "parameters/r101delg_gldv2clean_20200914"
image_scales: 0.25
image_scales: 0.35355338
image_scales: 0.5
image_scales: 0.70710677
image_scales: 1.0
image_scales: 1.4142135
image_scales: 2.0
delf_local_config {
  use_pca: false
  max_feature_num: 1000
  score_threshold: 357.48
}
delf_global_config {
  use_pca: false
  image_scales_ind: 3
  image_scales_ind: 4
  image_scales_ind: 5
}
max_image_size: 1024
```
research/delf/delf/python/delg/r50delg_gldv2clean_config.pbtxt (new file, mode 100644)

```
use_local_features: true
use_global_features: true
model_path: "parameters/r50delg_gldv2clean_20200914"
image_scales: 0.25
image_scales: 0.35355338
image_scales: 0.5
image_scales: 0.70710677
image_scales: 1.0
image_scales: 1.4142135
image_scales: 2.0
delf_local_config {
  use_pca: false
  max_feature_num: 1000
  score_threshold: 454.6
}
delf_global_config {
  use_pca: false
  image_scales_ind: 3
  image_scales_ind: 4
  image_scales_ind: 5
}
max_image_size: 1024
```
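For reference, configs like these are consumed by the DELF/DELG example scripts as `DelfConfig` text protos. A minimal parsing sketch (assuming the `delf` package is installed and exposes `delf_config_pb2`, as the research code does):

```python
from google.protobuf import text_format
from delf import delf_config_pb2  # assumed module path from the DELF package

# Parse the pbtxt shown above into a DelfConfig proto.
config = delf_config_pb2.DelfConfig()
with open('r50delg_gldv2clean_config.pbtxt') as f:
  text_format.Parse(f.read(), config)

print(config.model_path)                         # parameters/r50delg_gldv2clean_20200914
print(config.delf_local_config.score_threshold)  # 454.6
```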
research/delf/delf/python/examples/extractor.py

...

@@ -180,18 +180,17 @@ def MakeExtractor(config):

```python
  if hasattr(config, 'is_tf2_exported') and config.is_tf2_exported:
    predict = model.signatures['serving_default']
    if config.use_global_features:
      output_dict = predict(
          input_image=image_tensor,
          input_scales=image_scales_tensor,
          input_max_feature_num=max_feature_num_tensor,
          input_abs_thres=score_threshold_tensor,
          input_global_scales_ind=global_scales_ind_tensor)
      output = [
          output_dict['boxes'], output_dict['features'],
          output_dict['scales'], output_dict['scores'],
          output_dict['global_descriptors']
      ]
    elif config.use_local_features:
      output_dict = predict(
          input_image=image_tensor,
          ...
```

(The branch condition was previously `config.use_local_features and
config.use_global_features`; the body of the branch is otherwise unchanged.)

...
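The TF2 branch above relies on SavedModel signatures. As a minimal sketch of the same pattern (the model path comes from the configs above, but the input tensors here are illustrative placeholders, not values from this commit):

```python
import tensorflow as tf

model = tf.saved_model.load('parameters/r50delg_gldv2clean_20200914')
predict = model.signatures['serving_default']

# The exported DELG model is called with the keyword inputs seen in the diff;
# shapes and dtypes below are assumptions for illustration only.
output_dict = predict(
    input_image=tf.zeros([224, 224, 3], dtype=tf.uint8),
    input_scales=tf.constant([0.5, 1.0, 2.0]),
    input_max_feature_num=tf.constant(1000),
    input_abs_thres=tf.constant(454.6),
    input_global_scales_ind=tf.constant([3, 4, 5]))
print(output_dict.keys())  # boxes, features, scales, scores, global_descriptors
```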
research/delf/delf/python/training/README.md

...

@@ -143,6 +143,8 @@

```
curl -Os http://storage.googleapis.com/delf/resnet50_imagenet_weights.tar.gz
tar -xzvf resnet50_imagenet_weights.tar.gz
```

### Training with Local Features

Assuming the TFRecord files were generated in the `gldv2_dataset/tfrecord/`
directory, running the following command should start training a model and
output the results in the `gldv2_training` directory:

...

@@ -156,13 +158,7 @@

```
python3 train.py \
  --logdir=gldv2_training/
```

On a multi-GPU machine, the batch size can be increased to speed up training
using the `--batch_size` parameter. On a machine with 8 Tesla P100 GPUs, you
can set the batch size to `256`:

```
--batch_size=256
```

### Training with Local and Global Features

It is also possible to train the model with an improved global features head
as introduced in the [DELG paper](https://arxiv.org/abs/2001.05027). To do
this, ...

@@ -179,6 +175,15 @@

```
python3 train.py \
  ...
  --delg_global_features
```

### Hyperparameter Guidelines

To improve training convergence, the following hyperparameter values have been
tested and validated on the following infrastructures, with the remaining
`train.py` flags keeping their **default values** (see the sketch after this
section for a combined invocation):

*   8 Tesla P100 GPUs: `--batch_size=256`, `--initial_lr=0.01`
*   4 Tesla P100 GPUs: `--batch_size=128`, `--initial_lr=0.005`

*NOTE*: We are currently working on adding the autoencoder described in the
DELG paper to this codebase; it is not yet implemented here. Stay tuned!

...
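Combining the guidelines above, a training invocation on an 8-GPU machine might look like the following sketch (only flags discussed in this README are shown; the dataset flags elided in the excerpt are still required):

```
python3 train.py \
  --batch_size=256 \
  --initial_lr=0.01 \
  --delg_global_features \
  --logdir=gldv2_training/
```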
research/object_detection/README.md

...

@@ -76,7 +76,7 @@ documentation of the Object Detection API:

### Mobile Inference for TF2 models

TF2 OD API models can now be converted to TensorFlow Lite! Only SSD models
currently supported. See <a href='g3doc/running_on_mobile_tf2.md'>documentation</a>.

(The link previously pointed at `running_on_mobile_tf2.md`.)

**Thanks to contributors**: Sachin Joglekar

...
research/object_detection/builders/model_builder.py

...

@@ -50,6 +50,7 @@ from object_detection.utils import tf_version

```python
if tf_version.is_tf2():
  from object_detection.models import center_net_hourglass_feature_extractor
  from object_detection.models import center_net_mobilenet_v2_feature_extractor
  from object_detection.models import center_net_mobilenet_v2_fpn_feature_extractor
  from object_detection.models import center_net_resnet_feature_extractor
  from object_detection.models import center_net_resnet_v1_fpn_feature_extractor
  from object_detection.models import faster_rcnn_inception_resnet_v2_keras_feature_extractor as frcnn_inc_res_keras
```

...

@@ -140,8 +141,10 @@ if tf_version.is_tf2():

```python
  }

  CENTER_NET_EXTRACTOR_FUNCTION_MAP = {
      'resnet_v2_50': center_net_resnet_feature_extractor.resnet_v2_50,
      'resnet_v2_101': center_net_resnet_feature_extractor.resnet_v2_101,
      'resnet_v1_18_fpn': center_net_resnet_v1_fpn_feature_extractor.resnet_v1_18_fpn,
      'resnet_v1_34_fpn': ...
```

...

@@ -154,6 +157,8 @@ if tf_version.is_tf2():

```python
      'hourglass_104': center_net_hourglass_feature_extractor.hourglass_104,
      'mobilenet_v2': center_net_mobilenet_v2_feature_extractor.mobilenet_v2,
      'mobilenet_v2_fpn': center_net_mobilenet_v2_fpn_feature_extractor.mobilenet_v2_fpn,
  }

  FEATURE_EXTRACTOR_MAPS = [
```

...

@@ -936,6 +941,21 @@ def tracking_proto_to_params(tracking_config):

```python
      task_loss_weight=tracking_config.task_loss_weight)


def temporal_offset_proto_to_params(temporal_offset_config):
  """Converts CenterNet.TemporalOffsetEstimation proto to param-tuple."""
  loss = losses_pb2.Loss()
  # Add dummy classification loss to avoid the loss_builder throwing error.
  # TODO(yuhuic): update the loss builder to take the classification loss
  # directly.
  loss.classification_loss.weighted_sigmoid.CopyFrom(
      losses_pb2.WeightedSigmoidClassificationLoss())
  loss.localization_loss.CopyFrom(temporal_offset_config.localization_loss)
  _, localization_loss, _, _, _, _, _ = losses_builder.build(loss)
  return center_net_meta_arch.TemporalOffsetParams(
      localization_loss=localization_loss,
      task_loss_weight=temporal_offset_config.task_loss_weight)


def _build_center_net_model(center_net_config, is_training, add_summaries):
  """Build a CenterNet detection model.
```

...

@@ -998,6 +1018,11 @@ def _build_center_net_model(center_net_config, is_training, add_summaries):

```python
    track_params = tracking_proto_to_params(
        center_net_config.track_estimation_task)

  temporal_offset_params = None
  if center_net_config.HasField('temporal_offset_task'):
    temporal_offset_params = temporal_offset_proto_to_params(
        center_net_config.temporal_offset_task)

  return center_net_meta_arch.CenterNetMetaArch(
      is_training=is_training,
      add_summaries=add_summaries,
```

...

@@ -1009,7 +1034,9 @@ def _build_center_net_model(center_net_config, is_training, add_summaries):

```python
      keypoint_params_dict=keypoint_params_dict,
      mask_params=mask_params,
      densepose_params=densepose_params,
      track_params=track_params,
      temporal_offset_params=temporal_offset_params,
      use_depthwise=center_net_config.use_depthwise)


def _build_center_net_feature_extractor(
```

(The constructor call previously ended at `track_params=track_params)`.)

...
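Based on the proto fields accessed above (`temporal_offset_task`, `localization_loss`, `task_loss_weight`, `use_depthwise`), enabling the new task from a pipeline config would look roughly like this hypothetical excerpt (field names are inferred from the code, not copied from a shipped config):

```
center_net {
  ...
  temporal_offset_task {
    localization_loss {
      l1_localization_loss {
      }
    }
    task_loss_weight: 1.0
  }
  use_depthwise: false
}
```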
research/object_detection/core/box_list_ops.py

...

@@ -151,7 +151,10 @@ def clip_to_window(boxlist, window, filter_nonoverlapping=True, scope=None):

```python
  with tf.name_scope(scope, 'ClipToWindow'):
    y_min, x_min, y_max, x_max = tf.split(
        value=boxlist.get(), num_or_size_splits=4, axis=1)
    win_y_min = window[0]
    win_x_min = window[1]
    win_y_max = window[2]
    win_x_max = window[3]
    y_min_clipped = tf.maximum(tf.minimum(y_min, win_y_max), win_y_min)
    y_max_clipped = tf.maximum(tf.minimum(y_max, win_y_max), win_y_min)
    x_min_clipped = tf.maximum(tf.minimum(x_min, win_x_max), win_x_min)
```

(Previously the window coordinates were obtained with
`win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)`.)

...
research/object_detection/core/model.py

...

@@ -102,7 +102,8 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):

```python
    Args:
      field: a string key, options are
        fields.BoxListFields.{boxes,classes,masks,keypoints,
        keypoint_visibilities, densepose_*, track_ids,
        temporal_offsets, track_match_flags}
        fields.InputDataFields.is_annotated.

    Returns:
```

...

@@ -304,6 +305,8 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):

```python
                          groundtruth_dp_part_ids_list=None,
                          groundtruth_dp_surface_coords_list=None,
                          groundtruth_track_ids_list=None,
                          groundtruth_temporal_offsets_list=None,
                          groundtruth_track_match_flags_list=None,
                          groundtruth_weights_list=None,
                          groundtruth_confidences_list=None,
                          groundtruth_is_crowd_list=None,
```

...

@@ -345,6 +348,12 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):

```python
        padding.
      groundtruth_track_ids_list: a list of 1-D tf.int32 tensors of shape
        [num_boxes] containing the track IDs of groundtruth objects.
      groundtruth_temporal_offsets_list: a list of 2-D tf.float32 tensors
        of shape [num_boxes, 2] containing the spatial offsets of objects'
        centers compared with the previous frame.
      groundtruth_track_match_flags_list: a list of 1-D tf.float32 tensors
        of shape [num_boxes] containing 0-1 flags that indicate if an object
        has existed in the previous frame.
      groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape
        [num_boxes] containing weights for groundtruth boxes.
      groundtruth_confidences_list: A list of 2-D tf.float32 tensors of shape
```

...

@@ -397,6 +406,14 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):

```python
    if groundtruth_track_ids_list:
      self._groundtruth_lists[
          fields.BoxListFields.track_ids] = groundtruth_track_ids_list
    if groundtruth_temporal_offsets_list:
      self._groundtruth_lists[fields.BoxListFields.temporal_offsets] = (
          groundtruth_temporal_offsets_list)
    if groundtruth_track_match_flags_list:
      self._groundtruth_lists[fields.BoxListFields.track_match_flags] = (
          groundtruth_track_match_flags_list)
    if groundtruth_is_crowd_list:
      self._groundtruth_lists[
          fields.BoxListFields.is_crowd] = groundtruth_is_crowd_list
```

...
...
research/object_detection/core/preprocessor.py
View file @
b0ccdb11
...
...
@@ -4143,6 +4143,7 @@ def random_scale_crop_and_pad_to_square(
label_weights
,
masks
=
None
,
keypoints
=
None
,
label_confidences
=
None
,
scale_min
=
0.1
,
scale_max
=
2.0
,
output_size
=
512
,
...
...
@@ -4176,6 +4177,8 @@ def random_scale_crop_and_pad_to_square(
as the input `image`.
keypoints: (optional) rank 3 float32 tensor with shape [num_instances,
num_keypoints, 2]. The keypoints are in y-x normalized coordinates.
label_confidences: (optional) float32 tensor of shape [num_instance]
representing the confidence for each box.
scale_min: float, the minimum value for the random scale factor.
scale_max: float, the maximum value for the random scale factor.
output_size: int, the desired (square) output image size.
...
...
@@ -4191,9 +4194,8 @@ def random_scale_crop_and_pad_to_square(
label_weights: rank 1 float32 tensor with shape [num_instances].
masks: rank 3 float32 tensor with shape [num_instances, height, width]
containing instance masks.
label_confidences: confidences for retained boxes.
"""
img_shape
=
tf
.
shape
(
image
)
input_height
,
input_width
=
img_shape
[
0
],
img_shape
[
1
]
random_scale
=
tf
.
random_uniform
([],
scale_min
,
scale_max
,
seed
=
seed
)
...
...
@@ -4258,6 +4260,9 @@ def random_scale_crop_and_pad_to_square(
keypoints
,
[
0.0
,
0.0
,
1.0
,
1.0
])
return_values
.
append
(
keypoints
)
if
label_confidences
is
not
None
:
return_values
.
append
(
tf
.
gather
(
label_confidences
,
indices
))
return
return_values
...
...
@@ -4498,7 +4503,7 @@ def get_default_func_arg_map(include_label_weights=True,
fields
.
InputDataFields
.
groundtruth_boxes
,
fields
.
InputDataFields
.
groundtruth_classes
,
groundtruth_label_weights
,
groundtruth_instance_masks
,
groundtruth_keypoints
),
groundtruth_keypoints
,
groundtruth_label_confidences
),
}
return
prep_func_arg_map
...
...
research/object_detection/core/preprocessor_test.py

...

@@ -3931,6 +3931,32 @@ class PreprocessorTest(test_case.TestCase, parameterized.TestCase):

```python
    self.assertAllClose(image[:, :, 0], masks[0, :, :])

  def test_random_scale_crop_and_pad_to_square_handles_confidences(self):

    def graph_fn():
      image = tf.zeros([10, 10, 1])
      boxes = tf.constant([[0, 0, 0.5, 0.5], [0.5, 0.5, 0.75, 0.75]])
      label_weights = tf.constant([1.0, 1.0])
      box_labels = tf.constant([0, 1])
      box_confidences = tf.constant([-1.0, 1.0])

      (_, new_boxes, _, _,
       new_confidences) = preprocessor.random_scale_crop_and_pad_to_square(
           image,
           boxes,
           box_labels,
           label_weights,
           label_confidences=box_confidences,
           scale_min=0.8,
           scale_max=0.9,
           output_size=10)
      return new_boxes, new_confidences

    boxes, confidences = self.execute_cpu(graph_fn, [])
    self.assertLen(boxes, 2)
    self.assertAllEqual(confidences, [-1.0, 1.0])


if __name__ == '__main__':
  tf.test.main()
```
research/object_detection/core/region_similarity_calculator.py

```python
# Lint as: python3
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
...
```

...

@@ -79,6 +80,39 @@ class IouSimilarity(RegionSimilarityCalculator):

```python
    return box_list_ops.iou(boxlist1, boxlist2)


class DETRSimilarity(RegionSimilarityCalculator):
  """Class to compute similarity for the Detection Transformer model.

  This class computes pairwise DETR similarity between two BoxLists using a
  weighted combination of GIOU, classification scores, and the L1 loss.
  """

  def __init__(self, l1_weight=5, giou_weight=2):
    super().__init__()
    self.l1_weight = l1_weight
    self.giou_weight = giou_weight

  def _compare(self, boxlist1, boxlist2):
    """Compute pairwise DETR similarity between the two BoxLists.

    Args:
      boxlist1: BoxList holding N groundtruth boxes.
      boxlist2: BoxList holding M predicted boxes.

    Returns:
      A tensor with shape [N, M] representing pairwise DETR similarity scores.
    """
    groundtruth_labels = boxlist1.get_field(fields.BoxListFields.classes)
    predicted_labels = boxlist2.get_field(fields.BoxListFields.classes)
    classification_scores = tf.matmul(
        groundtruth_labels, predicted_labels, transpose_b=True)
    loss = self.l1_weight * box_list_ops.l1(
        boxlist1, boxlist2) + self.giou_weight * (
            1 - box_list_ops.giou(boxlist1, boxlist2)) - classification_scores
    return -loss


class NegSqDistSimilarity(RegionSimilarityCalculator):
  """Class to compute similarity based on the squared distance metric.
```

...
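Restating `_compare` above as a formula: for groundtruth box $i$ (boxes $b_i$, one-hot label $y_i$) and predicted box $j$ (boxes $\hat{b}_j$, class scores $\hat{p}_j$),

$$
S_{ij} = y_i^\top \hat{p}_j
       \;-\; w_{L1}\,\mathrm{L1}\bigl(b_i, \hat{b}_j\bigr)
       \;-\; w_{\mathrm{GIoU}}\,\bigl(1 - \mathrm{GIoU}(b_i, \hat{b}_j)\bigr),
$$

with defaults $w_{L1}=5$ and $w_{\mathrm{GIoU}}=2$, so a higher similarity means a cheaper match for the Hungarian matcher used by the DETR target assigner later in this commit.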
research/object_detection/core/region_similarity_calculator_test.py

...

@@ -93,6 +93,25 @@ class RegionSimilarityCalculatorTest(test_case.TestCase):

```python
    iou_output = self.execute(graph_fn, [])
    self.assertAllClose(iou_output, exp_output)

  def test_detr_similarity(self):
    def graph_fn():
      corners1 = tf.constant([[5.0, 7.0, 7.0, 9.0]])
      corners2 = tf.constant([[5.0, 7.0, 7.0, 9.0], [5.0, 11.0, 7.0, 13.0]])
      groundtruth_labels = tf.constant([[1.0, 0.0]])
      predicted_labels = tf.constant([[0.0, 1000.0], [1000.0, 0.0]])
      boxes1 = box_list.BoxList(corners1)
      boxes2 = box_list.BoxList(corners2)
      boxes1.add_field(fields.BoxListFields.classes, groundtruth_labels)
      boxes2.add_field(fields.BoxListFields.classes, predicted_labels)
      detr_similarity_calculator = \
          region_similarity_calculator.DETRSimilarity()
      detr_similarity = detr_similarity_calculator.compare(
          boxes1, boxes2, None)
      return detr_similarity

    exp_output = [[0.0, -20 - 8.0 / 3.0 + 1000.0]]
    sim_output = self.execute(graph_fn, [])
    self.assertAllClose(sim_output, exp_output)


if __name__ == '__main__':
  tf.test.main()
```
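Sanity-checking `exp_output`: the first candidate box is identical to the groundtruth, so the L1 term is 0, GIoU is 1, and the class-score term is 0, giving similarity 0. For the second candidate the class score is 1000; the GIoU of the two disjoint boxes is −1/3 (zero intersection, union 8, enclosing hull 12), contributing 2 · (1 + 1/3) = 8/3; and the remaining 20 is consistent with the L1 term being computed on (center, size) box encodings (centers differ by 4 in x, times the default weight 5) — an inference from the expected value rather than from code shown in this excerpt. Hence 1000 − 20 − 8/3.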
research/object_detection/core/standard_fields.py

...

@@ -47,6 +47,10 @@ class InputDataFields(object):

```python
    groundtruth_boxes: coordinates of the ground truth boxes in the image.
    groundtruth_classes: box-level class labels.
    groundtruth_track_ids: box-level track ID labels.
    groundtruth_temporal_offset: box-level temporal offsets, i.e.,
      movement of the box center in adjacent frames.
    groundtruth_track_match_flags: box-level flags indicating if objects
      exist in the previous frame.
    groundtruth_confidences: box-level class confidences. The shape should be
      the same as the shape of groundtruth_classes.
    groundtruth_label_types: box-level label types (e.g. explicit negative).
```

...

@@ -99,6 +103,8 @@ class InputDataFields(object):

```python
  groundtruth_boxes = 'groundtruth_boxes'
  groundtruth_classes = 'groundtruth_classes'
  groundtruth_track_ids = 'groundtruth_track_ids'
  groundtruth_temporal_offset = 'groundtruth_temporal_offset'
  groundtruth_track_match_flags = 'groundtruth_track_match_flags'
  groundtruth_confidences = 'groundtruth_confidences'
  groundtruth_label_types = 'groundtruth_label_types'
  groundtruth_is_crowd = 'groundtruth_is_crowd'
```

...

@@ -170,6 +176,7 @@ class DetectionResultFields(object):

```python
  detection_keypoints = 'detection_keypoints'
  detection_keypoint_scores = 'detection_keypoint_scores'
  detection_embeddings = 'detection_embeddings'
  detection_offsets = 'detection_temporal_offsets'
  num_detections = 'num_detections'
  raw_detection_boxes = 'raw_detection_boxes'
  raw_detection_scores = 'raw_detection_scores'
```

...

@@ -194,6 +201,8 @@ class BoxListFields(object):

```python
    densepose_part_ids: DensePose part ids per bounding box.
    densepose_surface_coords: DensePose surface coordinates per bounding box.
    is_crowd: is_crowd annotation per bounding box.
    temporal_offsets: temporal center offsets per bounding box.
    track_match_flags: match flags per bounding box.
  """
  boxes = 'boxes'
  classes = 'classes'
```

...

@@ -212,6 +221,8 @@ class BoxListFields(object):

```python
  is_crowd = 'is_crowd'
  group_of = 'group_of'
  track_ids = 'track_ids'
  temporal_offsets = 'temporal_offsets'
  track_match_flags = 'track_match_flags'


class PredictionFields(object):
```

...
research/object_detection/core/target_assigner.py

...

@@ -51,6 +51,7 @@ from object_detection.core import matcher as mat

```python
from object_detection.core import region_similarity_calculator as sim_calc
from object_detection.core import standard_fields as fields
from object_detection.matchers import argmax_matcher
from object_detection.matchers import hungarian_matcher
from object_detection.utils import shape_utils
from object_detection.utils import target_assigner_utils as ta_utils
from object_detection.utils import tf_version
```

...

@@ -510,7 +511,8 @@ def batch_assign(target_assigner,

```python
      anchors_batch, gt_box_batch, gt_class_targets_batch, gt_weights_batch):
    (cls_targets, cls_weights,
     reg_targets, reg_weights, match) = target_assigner.assign(
         anchors, gt_boxes, gt_class_targets, unmatched_class_label,
         gt_weights)
    cls_targets_list.append(cls_targets)
    cls_weights_list.append(cls_weights)
    reg_targets_list.append(reg_targets)
```

(Only the line wrapping of the `assign` call changed in this hunk.)

...

@@ -1980,3 +1982,291 @@ class CenterNetCornerOffsetTargetAssigner(object):

```python
    return (tf.stack(corner_targets, axis=0),
            tf.stack(foreground_targets, axis=0))


class CenterNetTemporalOffsetTargetAssigner(object):
  """Wrapper to compute target tensors for the temporal offset task.

  This class has methods that take as input a batch of ground truth tensors
  (in the form of a list) and returns the targets required to train the
  temporal offset task.
  """

  def __init__(self, stride):
    """Initializes the target assigner.

    Args:
      stride: int, the stride of the network in output pixels.
    """
    self._stride = stride

  def assign_temporal_offset_targets(self,
                                     height,
                                     width,
                                     gt_boxes_list,
                                     gt_offsets_list,
                                     gt_match_list,
                                     gt_weights_list=None):
    """Returns the temporal offset targets and their indices.

    For each ground truth box, this function assigns it the corresponding
    temporal offset to train the model.

    Args:
      height: int, height of input to the model. This is used to determine
        the height of the output.
      width: int, width of the input to the model. This is used to determine
        the width of the output.
      gt_boxes_list: A list of float tensors with shape [num_boxes, 4]
        representing the groundtruth detection bounding boxes for each sample
        in the batch. The coordinates are expected in normalized coordinates.
      gt_offsets_list: A list of 2-D tf.float32 tensors of shape
        [num_boxes, 2] containing the spatial offsets of objects' centers
        compared with the previous frame.
      gt_match_list: A list of 1-D tf.float32 tensors of shape [num_boxes]
        containing flags that indicate if an object has existed in the
        previous frame.
      gt_weights_list: A list of tensors with shape [num_boxes] corresponding
        to the weight of each groundtruth detection box.

    Returns:
      batch_indices: an integer tensor of shape [num_boxes, 3] holding the
        indices inside the predicted tensor which should be penalized. The
        first column indicates the index along the batch dimension and the
        second and third columns indicate the index along the y and x
        dimensions respectively.
      batch_temporal_offsets: a float tensor of shape [num_boxes, 2] of the
        expected y and x temporal offset of each object center in the
        output space.
      batch_weights: a float tensor of shape [num_boxes] indicating the
        weight of each prediction.
    """
    if gt_weights_list is None:
      gt_weights_list = [None] * len(gt_boxes_list)

    batch_indices = []
    batch_weights = []
    batch_temporal_offsets = []

    for i, (boxes, offsets, match_flags, weights) in enumerate(zip(
        gt_boxes_list, gt_offsets_list, gt_match_list, gt_weights_list)):
      boxes = box_list.BoxList(boxes)
      boxes = box_list_ops.to_absolute_coordinates(
          boxes, height // self._stride, width // self._stride)
      # Get the box center coordinates. Each returned tensors have the shape
      # of [num_boxes]
      (y_center, x_center, _, _) = boxes.get_center_coordinates_and_sizes()
      num_boxes = tf.shape(x_center)

      # Compute the offsets and indices of the box centers. Shape:
      #   offsets: [num_boxes, 2]
      #   indices: [num_boxes, 2]
      (_, indices) = ta_utils.compute_floor_offsets_with_indices(
          y_source=y_center, x_source=x_center)

      # Assign ones if weights are not provided.
      # if an object is not matched, its weight becomes zero.
      if weights is None:
        weights = tf.ones(num_boxes, dtype=tf.float32)
      weights *= match_flags

      # Shape of [num_boxes, 1] integer tensor filled with current batch
      # index.
      batch_index = i * tf.ones_like(indices[:, 0:1], dtype=tf.int32)
      batch_indices.append(tf.concat([batch_index, indices], axis=1))
      batch_weights.append(weights)
      batch_temporal_offsets.append(offsets)

    batch_indices = tf.concat(batch_indices, axis=0)
    batch_weights = tf.concat(batch_weights, axis=0)
    batch_temporal_offsets = tf.concat(batch_temporal_offsets, axis=0)
    return (batch_indices, batch_temporal_offsets, batch_weights)


class DETRTargetAssigner(object):
  """Target assigner for DETR (https://arxiv.org/abs/2005.12872).

  Detection Transformer (DETR) matches predicted boxes to groundtruth directly
  to determine targets instead of matching anchors to groundtruth. Hence, the
  new target assigner.
  """

  def __init__(self):
    """Construct Object Detection Target Assigner."""
    self._similarity_calc = sim_calc.DETRSimilarity()
    self._matcher = hungarian_matcher.HungarianBipartiteMatcher()

  def batch_assign(self,
                   pred_box_batch,
                   gt_box_batch,
                   pred_class_batch,
                   gt_class_targets_batch,
                   gt_weights_batch=None,
                   unmatched_class_label_batch=None):
    """Batched assignment of classification and regression targets.

    Args:
      pred_box_batch: a tensor of shape [batch_size, num_queries, 4]
        representing predicted bounding boxes.
      gt_box_batch: a tensor of shape [batch_size, num_queries, 4]
        representing groundtruth bounding boxes.
      pred_class_batch: A list of tensors with length batch_size, where each
        tensor has shape [num_queries, num_classes] to be used by certain
        similarity calculators.
      gt_class_targets_batch: a list of tensors with length batch_size, where
        each tensor has shape [num_gt_boxes_i, num_classes] and
        num_gt_boxes_i is the number of boxes in the ith boxlist of
        gt_box_batch.
      gt_weights_batch: A list of 1-D tf.float32 tensors of shape
        [num_boxes] containing weights for groundtruth boxes.
      unmatched_class_label_batch: a float32 tensor with shape
        [d_1, d_2, ..., d_k] which is consistent with the classification
        target for each anchor (and can be empty for scalar targets). This
        shape must thus be compatible with the `gt_class_targets_batch`.

    Returns:
      batch_cls_targets: a tensor with shape [batch_size, num_pred_boxes,
        num_classes],
      batch_cls_weights: a tensor with shape [batch_size, num_pred_boxes,
        num_classes],
      batch_reg_targets: a tensor with shape [batch_size, num_pred_boxes,
        box_code_dimension]
      batch_reg_weights: a tensor with shape [batch_size, num_pred_boxes].
    """
    pred_box_batch = [
        box_list.BoxList(pred_box)
        for pred_box in tf.unstack(pred_box_batch)]
    gt_box_batch = [
        box_list.BoxList(gt_box)
        for gt_box in tf.unstack(gt_box_batch)]

    cls_targets_list = []
    cls_weights_list = []
    reg_targets_list = []
    reg_weights_list = []
    if gt_weights_batch is None:
      gt_weights_batch = [None] * len(gt_class_targets_batch)
    if unmatched_class_label_batch is None:
      unmatched_class_label_batch = [None] * len(gt_class_targets_batch)
    pred_class_batch = tf.unstack(pred_class_batch)
    for (pred_boxes, gt_boxes, pred_class_batch, gt_class_targets,
         gt_weights, unmatched_class_label) in zip(
             pred_box_batch, gt_box_batch, pred_class_batch,
             gt_class_targets_batch, gt_weights_batch,
             unmatched_class_label_batch):
      (cls_targets, cls_weights, reg_targets,
       reg_weights) = self.assign(pred_boxes, gt_boxes, pred_class_batch,
                                  gt_class_targets, gt_weights,
                                  unmatched_class_label)
      cls_targets_list.append(cls_targets)
      cls_weights_list.append(cls_weights)
      reg_targets_list.append(reg_targets)
      reg_weights_list.append(reg_weights)
    batch_cls_targets = tf.stack(cls_targets_list)
    batch_cls_weights = tf.stack(cls_weights_list)
    batch_reg_targets = tf.stack(reg_targets_list)
    batch_reg_weights = tf.stack(reg_weights_list)
    return (batch_cls_targets, batch_cls_weights, batch_reg_targets,
            batch_reg_weights)

  def assign(self,
             pred_boxes,
             gt_boxes,
             pred_classes,
             gt_labels,
             gt_weights=None,
             unmatched_class_label=None):
    """Assign classification and regression targets to each box_pred.

    For a given set of pred_boxes and groundtruth detections, match pred_boxes
    to gt_boxes and assign classification and regression targets to
    each box_pred as well as weights based on the resulting match (specifying,
    e.g., which pred_boxes should not contribute to training loss).

    pred_boxes that are not matched to anything are given a classification
    target of `unmatched_cls_target`.

    Args:
      pred_boxes: a BoxList representing N pred_boxes
      gt_boxes: a BoxList representing M groundtruth boxes
      pred_classes: A tensor with shape [max_num_boxes, num_classes]
        to be used by certain similarity calculators.
      gt_labels: a tensor of shape [M, num_classes]
        with labels for each of the ground_truth boxes. The subshape
        [num_classes] can be empty (corresponding to scalar inputs). When set
        to None, gt_labels assumes a binary problem where all
        ground_truth boxes get a positive label (of 1).
      gt_weights: a float tensor of shape [M] indicating the weight to
        assign to all pred_boxes matched to a particular groundtruth box. The
        weights must be in [0., 1.]. If None, all weights are set to 1.
        Generally no groundtruth boxes with zero weight match to any
        pred_boxes as matchers are aware of groundtruth weights.
        Additionally, `cls_weights` and `reg_weights` are calculated using
        groundtruth weights as an added safety.
      unmatched_class_label: a float32 tensor with shape [d_1, d_2, ..., d_k]
        which is consistent with the classification target for each
        anchor (and can be empty for scalar targets). This shape must thus be
        compatible with the groundtruth labels that are passed to the
        "assign" function (which have shape [num_gt_boxes, d_1, d_2, ...,
        d_k]).

    Returns:
      cls_targets: a float32 tensor with shape [num_pred_boxes, num_classes],
        where the subshape [num_classes] is compatible with gt_labels
        which has shape [num_gt_boxes, num_classes].
      cls_weights: a float32 tensor with shape [num_pred_boxes, num_classes],
        representing weights for each element in cls_targets.
      reg_targets: a float32 tensor with shape [num_pred_boxes,
        box_code_dimension]
      reg_weights: a float32 tensor with shape [num_pred_boxes]
    """
    if not unmatched_class_label:
      unmatched_class_label = tf.constant(
          [1] + [0] * (gt_labels.shape[1] - 1), tf.float32)

    if gt_weights is None:
      num_gt_boxes = gt_boxes.num_boxes_static()
      if not num_gt_boxes:
        num_gt_boxes = gt_boxes.num_boxes()
      gt_weights = tf.ones([num_gt_boxes], dtype=tf.float32)

    gt_boxes.add_field(fields.BoxListFields.classes, gt_labels)
    pred_boxes.add_field(fields.BoxListFields.classes, pred_classes)

    match_quality_matrix = self._similarity_calc.compare(gt_boxes, pred_boxes)
    match = self._matcher.match(match_quality_matrix,
                                valid_rows=tf.greater(gt_weights, 0))

    matched_gt_boxes = match.gather_based_on_match(
        gt_boxes.get(),
        unmatched_value=tf.zeros(4),
        ignored_value=tf.zeros(4))
    matched_gt_boxlist = box_list.BoxList(matched_gt_boxes)
    ty, tx, th, tw = matched_gt_boxlist.get_center_coordinates_and_sizes()
    reg_targets = tf.transpose(tf.stack([ty, tx, th, tw]))
    cls_targets = match.gather_based_on_match(
        gt_labels,
        unmatched_value=unmatched_class_label,
        ignored_value=unmatched_class_label)
    reg_weights = match.gather_based_on_match(
        gt_weights, ignored_value=0., unmatched_value=0.)
    cls_weights = match.gather_based_on_match(
        gt_weights, ignored_value=0., unmatched_value=1)

    # convert cls_weights from per-box_pred to per-class.
    class_label_shape = tf.shape(cls_targets)[1:]
    weights_multiple = tf.concat(
        [tf.constant([1]), class_label_shape], axis=0)
    cls_weights = tf.expand_dims(cls_weights, -1)
    cls_weights = tf.tile(cls_weights, weights_multiple)
    return (cls_targets, cls_weights, reg_targets, reg_weights)
```
research/object_detection/core/target_assigner_test.py

...

@@ -115,6 +115,7 @@ class TargetAssignerTest(test_case.TestCase):

```python
    self.assertEqual(reg_weights_out.dtype, np.float32)

  def test_assign_agnostic_with_keypoints(self):
    def graph_fn(anchor_means, groundtruth_box_corners,
                 groundtruth_keypoints):
      similarity_calc = region_similarity_calculator.IouSimilarity()
```

...

@@ -2290,6 +2291,215 @@ class CornerOffsetTargetAssignerTest(test_case.TestCase):

```python
    self.assertAllClose(foreground, np.zeros((1, 5, 5)))


class CenterNetTemporalOffsetTargetAssigner(test_case.TestCase):

  def setUp(self):
    super(CenterNetTemporalOffsetTargetAssigner, self).setUp()
    self._box_center = [0.0, 0.0, 1.0, 1.0]
    self._box_center_small = [0.25, 0.25, 0.75, 0.75]
    self._box_lower_left = [0.5, 0.0, 1.0, 0.5]
    self._box_center_offset = [0.1, 0.05, 1.0, 1.0]
    self._box_odd_coordinates = [0.1625, 0.2125, 0.5625, 0.9625]
    self._offset_center = [0.5, 0.4]
    self._offset_center_small = [0.1, 0.1]
    self._offset_lower_left = [-0.1, 0.1]
    self._offset_center_offset = [0.4, 0.3]
    self._offset_odd_coord = [0.125, -0.125]

  def test_assign_empty_groundtruths(self):
    """Tests the assign_offset_targets function with empty inputs."""
    def graph_fn():
      box_batch = [
          tf.zeros((0, 4), dtype=tf.float32),
      ]
      offset_batch = [
          tf.zeros((0, 2), dtype=tf.float32),
      ]
      match_flag_batch = [
          tf.zeros((0), dtype=tf.float32),
      ]
      assigner = targetassigner.CenterNetTemporalOffsetTargetAssigner(4)
      indices, temporal_offset, weights = (
          assigner.assign_temporal_offset_targets(
              80, 80, box_batch, offset_batch, match_flag_batch))
      return indices, temporal_offset, weights
    indices, temporal_offset, weights = self.execute(graph_fn, [])
    self.assertEqual(indices.shape, (0, 3))
    self.assertEqual(temporal_offset.shape, (0, 2))
    self.assertEqual(weights.shape, (0,))

  def test_assign_offset_targets(self):
    """Tests the assign_offset_targets function."""
    def graph_fn():
      box_batch = [
          tf.constant([self._box_center, self._box_lower_left]),
          tf.constant([self._box_center_offset]),
          tf.constant([self._box_center_small, self._box_odd_coordinates]),
      ]
      offset_batch = [
          tf.constant([self._offset_center, self._offset_lower_left]),
          tf.constant([self._offset_center_offset]),
          tf.constant([self._offset_center_small, self._offset_odd_coord]),
      ]
      match_flag_batch = [
          tf.constant([1.0, 1.0]),
          tf.constant([1.0]),
          tf.constant([1.0, 1.0]),
      ]
      assigner = targetassigner.CenterNetTemporalOffsetTargetAssigner(4)
      indices, temporal_offset, weights = (
          assigner.assign_temporal_offset_targets(
              80, 80, box_batch, offset_batch, match_flag_batch))
      return indices, temporal_offset, weights
    indices, temporal_offset, weights = self.execute(graph_fn, [])
    self.assertEqual(indices.shape, (5, 3))
    self.assertEqual(temporal_offset.shape, (5, 2))
    self.assertEqual(weights.shape, (5,))
    np.testing.assert_array_equal(
        indices,
        [[0, 10, 10], [0, 15, 5], [1, 11, 10], [2, 10, 10], [2, 7, 11]])
    np.testing.assert_array_almost_equal(
        temporal_offset,
        [[0.5, 0.4], [-0.1, 0.1], [0.4, 0.3], [0.1, 0.1], [0.125, -0.125]])
    np.testing.assert_array_equal(weights, 1)

  def test_assign_offset_targets_with_match_flags(self):
    """Tests the assign_offset_targets function with match flags."""
    def graph_fn():
      box_batch = [
          tf.constant([self._box_center, self._box_lower_left]),
          tf.constant([self._box_center_offset]),
          tf.constant([self._box_center_small, self._box_odd_coordinates]),
      ]
      offset_batch = [
          tf.constant([self._offset_center, self._offset_lower_left]),
          tf.constant([self._offset_center_offset]),
          tf.constant([self._offset_center_small, self._offset_odd_coord]),
      ]
      match_flag_batch = [
          tf.constant([0.0, 1.0]),
          tf.constant([1.0]),
          tf.constant([1.0, 1.0]),
      ]
      cn_assigner = targetassigner.CenterNetTemporalOffsetTargetAssigner(4)
      weights_batch = [
          tf.constant([1.0, 0.0]),
          tf.constant([1.0]),
          tf.constant([1.0, 1.0])
      ]
      indices, temporal_offset, weights = (
          cn_assigner.assign_temporal_offset_targets(
              80, 80, box_batch, offset_batch, match_flag_batch,
              weights_batch))
      return indices, temporal_offset, weights
    indices, temporal_offset, weights = self.execute(graph_fn, [])
    self.assertEqual(indices.shape, (5, 3))
    self.assertEqual(temporal_offset.shape, (5, 2))
    self.assertEqual(weights.shape, (5,))
    np.testing.assert_array_equal(
        indices,
        [[0, 10, 10], [0, 15, 5], [1, 11, 10], [2, 10, 10], [2, 7, 11]])
    np.testing.assert_array_almost_equal(
        temporal_offset,
        [[0.5, 0.4], [-0.1, 0.1], [0.4, 0.3], [0.1, 0.1], [0.125, -0.125]])
    np.testing.assert_array_equal(weights, [0, 0, 1, 1, 1])


class DETRTargetAssignerTest(test_case.TestCase):

  def test_assign_detr(self):
    def graph_fn(pred_corners, groundtruth_box_corners,
                 groundtruth_labels, predicted_labels):
      detr_target_assigner = targetassigner.DETRTargetAssigner()
      pred_boxlist = box_list.BoxList(pred_corners)
      groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners)
      result = detr_target_assigner.assign(
          pred_boxlist, groundtruth_boxlist,
          predicted_labels, groundtruth_labels)
      (cls_targets, cls_weights, reg_targets, reg_weights) = result
      return (cls_targets, cls_weights, reg_targets, reg_weights)

    pred_corners = np.array([[0.25, 0.25, 0.4, 0.2],
                             [0.5, 0.8, 1.0, 0.8],
                             [0.9, 0.5, 0.1, 1.0]], dtype=np.float32)
    groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5],
                                        [0.5, 0.5, 0.9, 0.9]],
                                       dtype=np.float32)
    predicted_labels = np.array([[-3.0, 3.0], [2.0, 9.4], [5.0, 1.0]],
                                dtype=np.float32)
    groundtruth_labels = np.array([[0.0, 1.0], [0.0, 1.0]],
                                  dtype=np.float32)

    exp_cls_targets = [[0, 1], [0, 1], [1, 0]]
    exp_cls_weights = [[1, 1], [1, 1], [1, 1]]
    exp_reg_targets = [[0.25, 0.25, 0.5, 0.5],
                       [0.7, 0.7, 0.4, 0.4],
                       [0, 0, 0, 0]]
    exp_reg_weights = [1, 1, 0]

    (cls_targets_out, cls_weights_out,
     reg_targets_out, reg_weights_out) = self.execute_cpu(
         graph_fn, [pred_corners, groundtruth_box_corners,
                    groundtruth_labels, predicted_labels])

    self.assertAllClose(cls_targets_out, exp_cls_targets)
    self.assertAllClose(cls_weights_out, exp_cls_weights)
    self.assertAllClose(reg_targets_out, exp_reg_targets)
    self.assertAllClose(reg_weights_out, exp_reg_weights)
    self.assertEqual(cls_targets_out.dtype, np.float32)
    self.assertEqual(cls_weights_out.dtype, np.float32)
    self.assertEqual(reg_targets_out.dtype, np.float32)
    self.assertEqual(reg_weights_out.dtype, np.float32)

  def test_batch_assign_detr(self):
    def graph_fn(pred_corners, groundtruth_box_corners,
                 groundtruth_labels, predicted_labels):
      detr_target_assigner = targetassigner.DETRTargetAssigner()
      result = detr_target_assigner.batch_assign(
          pred_corners, groundtruth_box_corners,
          [predicted_labels], [groundtruth_labels])
      (cls_targets, cls_weights, reg_targets, reg_weights) = result
      return (cls_targets, cls_weights, reg_targets, reg_weights)

    pred_corners = np.array([[[0.25, 0.25, 0.4, 0.2],
                              [0.5, 0.8, 1.0, 0.8],
                              [0.9, 0.5, 0.1, 1.0]]], dtype=np.float32)
    groundtruth_box_corners = np.array([[[0.0, 0.0, 0.5, 0.5],
                                         [0.5, 0.5, 0.9, 0.9]]],
                                       dtype=np.float32)
    predicted_labels = np.array([[-3.0, 3.0], [2.0, 9.4], [5.0, 1.0]],
                                dtype=np.float32)
    groundtruth_labels = np.array([[0.0, 1.0], [0.0, 1.0]],
                                  dtype=np.float32)

    exp_cls_targets = [[[0, 1], [0, 1], [1, 0]]]
    exp_cls_weights = [[[1, 1], [1, 1], [1, 1]]]
    exp_reg_targets = [[[0.25, 0.25, 0.5, 0.5],
                        [0.7, 0.7, 0.4, 0.4],
                        [0, 0, 0, 0]]]
    exp_reg_weights = [[1, 1, 0]]

    (cls_targets_out, cls_weights_out,
     reg_targets_out, reg_weights_out) = self.execute_cpu(
         graph_fn, [pred_corners, groundtruth_box_corners,
                    groundtruth_labels, predicted_labels])

    self.assertAllClose(cls_targets_out, exp_cls_targets)
    self.assertAllClose(cls_weights_out, exp_cls_weights)
    self.assertAllClose(reg_targets_out, exp_reg_targets)
    self.assertAllClose(reg_weights_out, exp_reg_weights)
    self.assertEqual(cls_targets_out.dtype, np.float32)
    self.assertEqual(cls_weights_out.dtype, np.float32)
    self.assertEqual(reg_targets_out.dtype, np.float32)
    self.assertEqual(reg_weights_out.dtype, np.float32)


if __name__ == '__main__':
  tf.enable_v2_behavior()
  tf.test.main()
```
research/object_detection/g3doc/running_on_mobile_tf2.md

# Running TF2 Detection API Models on mobile

[](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[](https://github.com/tensorflow/tensorflow/releases/tag/v2.3.0)
[](https://www.python.org/downloads/release/python-360/)

**NOTE:** This support was added *after* TF2.3, so please use the latest
nightly for the TensorFlow Lite Converter for this to work.

[TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/) (TFLite) is
TensorFlow’s lightweight solution for mobile and embedded devices. It enables
on-device machine learning inference with low latency and a small binary size.

...

@@ -54,16 +57,30 @@ python object_detection/export_tflite_graph_tf2.py \

```
  --output_directory path/to/exported_model_directory
```

Use `--help` with the above script to get the full list of supported
parameters. These can fine-tune accuracy and speed for your model.
(The old text contained the typo "aboev", fixed by this commit.)

### Step 2: Convert to TFLite

Use the [TensorFlow Lite Converter](https://www.tensorflow.org/lite/convert)
to convert the `SavedModel` to TFLite. Note that you need to use
`from_saved_model` for TFLite conversion with the Python API.

You can also leverage
[Post-training Quantization](https://www.tensorflow.org/lite/performance/post_training_quantization)
to [optimize performance](https://www.tensorflow.org/lite/performance/model_optimization)
and obtain a smaller model. Note that this is only possible from the *Python
API*. Be sure to use a
[representative dataset](https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization)
and set the following options on the converter:

```
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8,
                                       tf.lite.OpsSet.TFLITE_BUILTINS]
converter.representative_dataset = <...>
```
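Putting the two notes together, an end-to-end conversion sketch (the SavedModel path and the calibration iterable are placeholders, not values from this doc):

```python
import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_saved_model(
    'path/to/exported_model_directory/saved_model')

def representative_dataset():
  # `calibration_images` is a hypothetical iterable of float32 input arrays.
  for image in calibration_images:
    yield [image]

converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8,
                                       tf.lite.OpsSet.TFLITE_BUILTINS]
converter.representative_dataset = representative_dataset

tflite_model = converter.convert()
with open('model.tflite', 'wb') as f:
  f.write(tflite_model)
```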
## Running our model on Android
...
...
research/object_detection/g3doc/tf2_detection_zoo.md

...

@@ -15,7 +15,7 @@ They are also useful for initializing your models when training on novel

datasets. You can try this out on our few-shot training
[colab](../colab_tutorials/eager_few_shot_od_training_tf2_colab.ipynb).

Please look at [this guide](running_on_mobile_tf2.md) for mobile inference.
(The link previously pointed at `../running_on_mobile_tf2.md`.)

<!-- mdlint on -->

...
research/object_detection/meta_architectures/center_net_meta_arch.py

...

@@ -139,7 +139,7 @@ class CenterNetFeatureExtractor(tf.keras.Model):

```python
def make_prediction_net(num_out_channels, kernel_size=3, num_filters=256,
                        bias_fill=None, use_depthwise=False, name=None):
  """Creates a network to predict the given number of output channels.

  This function is intended to make the prediction heads for the CenterNet
```

(The signature previously ended at `bias_fill=None):`.)

@@ -151,12 +151,19 @@ def make_prediction_net(num_out_channels, kernel_size=3, num_filters=256,

```python
    num_filters: The number of filters in the intermediate conv layer.
    bias_fill: If not None, is used to initialize the bias in the final conv
      layer.
    use_depthwise: If true, use SeparableConv2D to construct the Sequential
      layers instead of Conv2D.
    name: Optional name for the prediction net.

  Returns:
    net: A keras module which when called on an input tensor of size
      [batch_size, height, width, num_in_channels] returns an output
      of size [batch_size, height, width, num_out_channels]
  """
  if use_depthwise:
    conv_fn = tf.keras.layers.SeparableConv2D
  else:
    conv_fn = tf.keras.layers.Conv2D
  out_conv = tf.keras.layers.Conv2D(num_out_channels, kernel_size=1)
```

@@ -164,11 +171,10 @@ def make_prediction_net(num_out_channels, kernel_size=3, num_filters=256,

```python
  out_conv.bias_initializer = tf.keras.initializers.constant(bias_fill)

  net = tf.keras.Sequential(
      [conv_fn(num_filters, kernel_size=kernel_size, padding='same'),
       tf.keras.layers.ReLU(),
       out_conv],
      name=name)
  return net
```

(Previously the first layer was a hard-coded `tf.keras.layers.Conv2D` and the
`Sequential` was built without a name.)
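A quick sketch of the new flag in use (the bias value and shapes are illustrative, not taken from this diff):

```python
import tensorflow as tf

head = make_prediction_net(num_out_channels=90,
                           bias_fill=-2.19,  # illustrative heatmap bias
                           use_depthwise=True,
                           name='center_heatmap')
features = tf.zeros([2, 128, 128, 64])  # [batch, height, width, channels]
print(head(features).shape)             # (2, 128, 128, 90)
```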
...
...
@@ -329,6 +335,39 @@ def prediction_tensors_to_boxes(detection_scores, y_indices, x_indices,
return
boxes
,
detection_classes
,
detection_scores
,
num_detections
def
prediction_tensors_to_temporal_offsets
(
y_indices
,
x_indices
,
offset_predictions
):
"""Converts CenterNet temporal offset map predictions to batched format.
This function is similiar to the box offset conversion function, as both
temporal offsets and box offsets are size-2 vectors.
Args:
y_indices: A [batch, num_boxes] int32 tensor with y indices corresponding to
object center locations (expressed in output coordinate frame).
x_indices: A [batch, num_boxes] int32 tensor with x indices corresponding to
object center locations (expressed in output coordinate frame).
offset_predictions: A float tensor of shape [batch_size, height, width, 2]
representing the y and x offsets of a box's center across adjacent frames.
Returns:
offsets: A tensor of shape [batch_size, num_boxes, 2] holding the
the object temporal offsets of (y, x) dimensions.
"""
_
,
_
,
width
,
_
=
_get_shape
(
offset_predictions
,
4
)
peak_spatial_indices
=
flattened_indices_from_row_col_indices
(
y_indices
,
x_indices
,
width
)
y_indices
=
_to_float32
(
y_indices
)
x_indices
=
_to_float32
(
x_indices
)
offsets_flat
=
_flatten_spatial_dimensions
(
offset_predictions
)
offsets
=
tf
.
gather
(
offsets_flat
,
peak_spatial_indices
,
batch_dims
=
1
)
return
offsets
def
prediction_tensors_to_keypoint_candidates
(
keypoint_heatmap_predictions
,
keypoint_heatmap_offsets
,
...
...
@@ -555,15 +594,23 @@ def refine_keypoints(regressed_keypoints, keypoint_candidates, keypoint_scores,
# Pairwise squared distances between regressed keypoints and candidate
# keypoints (for a single keypoint type).
# Shape [batch_size, num_instances,
max_candidates
, num_keypoints].
# Shape [batch_size, num_instances,
1
, num_keypoints
, 2
].
regressed_keypoint_expanded
=
tf
.
expand_dims
(
regressed_keypoints
,
axis
=
2
)
# Shape [batch_size, 1, max_candidates, num_keypoints, 2].
keypoint_candidates_expanded
=
tf
.
expand_dims
(
keypoint_candidates_with_nans
,
axis
=
1
)
sqrd_distances
=
tf
.
math
.
reduce_sum
(
tf
.
math
.
squared_difference
(
regressed_keypoint_expanded
,
keypoint_candidates_expanded
),
axis
=-
1
)
# Use explicit tensor shape broadcasting (since the tensor dimensions are
# expanded to 5D) to make it tf.lite compatible.
regressed_keypoint_expanded
=
tf
.
tile
(
regressed_keypoint_expanded
,
multiples
=
[
1
,
1
,
max_candidates
,
1
,
1
])
keypoint_candidates_expanded
=
tf
.
tile
(
keypoint_candidates_expanded
,
multiples
=
[
1
,
num_instances
,
1
,
1
,
1
])
# Replace tf.math.squared_difference by "-" operator and tf.multiply ops since
# tf.lite convert doesn't support squared_difference with undetermined
# dimension.
diff
=
regressed_keypoint_expanded
-
keypoint_candidates_expanded
sqrd_distances
=
tf
.
math
.
reduce_sum
(
tf
.
multiply
(
diff
,
diff
),
axis
=-
1
)
distances
=
tf
.
math
.
sqrt
(
sqrd_distances
)
# Determine the candidates that have the minimum distance to the regressed
...
...
@@ -935,9 +982,16 @@ def convert_strided_predictions_to_normalized_keypoints(
def
clip_to_window
(
inputs
):
keypoints
,
window
=
inputs
return
keypoint_ops
.
clip_to_window
(
keypoints
,
window
)
# Specify the TensorSpec explicitly in the tf.map_fn to make it tf.lite
# compatible.
kpts_dims
=
_get_shape
(
keypoint_coords_normalized
,
4
)
output_spec
=
tf
.
TensorSpec
(
shape
=
[
kpts_dims
[
1
],
kpts_dims
[
2
],
kpts_dims
[
3
]],
dtype
=
tf
.
float32
)
keypoint_coords_normalized
=
tf
.
map_fn
(
clip_to_window
,
(
keypoint_coords_normalized
,
batch_window
),
dtype
=
tf
.
float32
,
back_prop
=
False
)
dtype
=
tf
.
float32
,
back_prop
=
False
,
fn_output_signature
=
output_spec
)
keypoint_scores
=
tf
.
where
(
valid_indices
,
keypoint_scores
,
tf
.
zeros_like
(
keypoint_scores
))
return
keypoint_coords_normalized
,
keypoint_scores
...
...
@@ -1534,6 +1588,32 @@ class TrackParams(
num_fc_layers
,
classification_loss
,
task_loss_weight
)
class
TemporalOffsetParams
(
collections
.
namedtuple
(
'TemporalOffsetParams'
,
[
'localization_loss'
,
'task_loss_weight'
])):
"""Namedtuple to store temporal offset related parameters."""
__slots__
=
()
def
__new__
(
cls
,
localization_loss
,
task_loss_weight
=
1.0
):
"""Constructor with default values for TrackParams.
Args:
localization_loss: an object_detection.core.losses.Loss object to
compute the loss for the temporal offset in CenterNet.
task_loss_weight: float, the loss weight for the temporal offset
task.
Returns:
An initialized TemporalOffsetParams namedtuple.
"""
return
super
(
TemporalOffsetParams
,
cls
).
__new__
(
cls
,
localization_loss
,
task_loss_weight
)
# The following constants are used to generate the keys of the
# (prediction, loss, target assigner,...) dictionaries used in CenterNetMetaArch
# class.
...
...
@@ -1552,6 +1632,8 @@ DENSEPOSE_REGRESSION = 'densepose/regression'
LOSS_KEY_PREFIX
=
'Loss'
TRACK_TASK
=
'track_task'
TRACK_REID
=
'track/reid'
TEMPORALOFFSET_TASK
=
'temporal_offset_task'
TEMPORAL_OFFSET
=
'track/offset'
def
get_keypoint_name
(
task_name
,
head_name
):
...
...
@@ -1596,7 +1678,9 @@ class CenterNetMetaArch(model.DetectionModel):
keypoint_params_dict
=
None
,
mask_params
=
None
,
densepose_params
=
None
,
track_params
=
None
):
track_params
=
None
,
temporal_offset_params
=
None
,
use_depthwise
=
False
):
"""Initializes a CenterNet model.
Args:
...
...
@@ -1631,6 +1715,10 @@ class CenterNetMetaArch(model.DetectionModel):
track_params: A TrackParams namedtuple. This object
holds the hyper-parameters for tracking. Please see the class
definition for more details.
temporal_offset_params: A TemporalOffsetParams namedtuple. This object
holds the hyper-parameters for offset prediction based tracking.
use_depthwise: If true, all task heads will be constructed using
separable_conv. Otherwise, standard convoltuions will be used.
"""
assert
object_detection_params
or
keypoint_params_dict
# Shorten the name for convenience and better formatting.
...
...
@@ -1651,6 +1739,9 @@ class CenterNetMetaArch(model.DetectionModel):
'be supplied.'
)
self
.
_densepose_params
=
densepose_params
self
.
_track_params
=
track_params
self
.
_temporal_offset_params
=
temporal_offset_params
self
.
_use_depthwise
=
use_depthwise
# Construct the prediction head nets.
self
.
_prediction_head_dict
=
self
.
_construct_prediction_heads
(
...
...
@@ -1695,58 +1786,75 @@ class CenterNetMetaArch(model.DetectionModel):
    """
    prediction_heads = {}
    prediction_heads[OBJECT_CENTER] = [
-       make_prediction_net(num_classes, bias_fill=class_prediction_bias_init)
+       make_prediction_net(num_classes, bias_fill=class_prediction_bias_init,
+                           use_depthwise=self._use_depthwise)
        for _ in range(num_feature_outputs)
    ]
    if self._od_params is not None:
      prediction_heads[BOX_SCALE] = [
-         make_prediction_net(NUM_SIZE_CHANNELS)
+         make_prediction_net(NUM_SIZE_CHANNELS,
+                             use_depthwise=self._use_depthwise)
          for _ in range(num_feature_outputs)
      ]
      prediction_heads[BOX_OFFSET] = [
-         make_prediction_net(NUM_OFFSET_CHANNELS)
+         make_prediction_net(NUM_OFFSET_CHANNELS,
+                             use_depthwise=self._use_depthwise)
          for _ in range(num_feature_outputs)
      ]
    if self._kp_params_dict is not None:
      for task_name, kp_params in self._kp_params_dict.items():
        num_keypoints = len(kp_params.keypoint_indices)
+       # pylint: disable=g-complex-comprehension
        prediction_heads[get_keypoint_name(task_name, KEYPOINT_HEATMAP)] = [
            make_prediction_net(
-               num_keypoints, bias_fill=kp_params.heatmap_bias_init)
+               num_keypoints, bias_fill=kp_params.heatmap_bias_init,
+               use_depthwise=self._use_depthwise)
            for _ in range(num_feature_outputs)
        ]
+       # pylint: enable=g-complex-comprehension
        prediction_heads[get_keypoint_name(task_name, KEYPOINT_REGRESSION)] = [
-           make_prediction_net(NUM_OFFSET_CHANNELS * num_keypoints)
+           make_prediction_net(NUM_OFFSET_CHANNELS * num_keypoints,
+                               use_depthwise=self._use_depthwise)
            for _ in range(num_feature_outputs)
        ]
        if kp_params.per_keypoint_offset:
          prediction_heads[get_keypoint_name(task_name, KEYPOINT_OFFSET)] = [
-             make_prediction_net(NUM_OFFSET_CHANNELS * num_keypoints)
+             make_prediction_net(NUM_OFFSET_CHANNELS * num_keypoints,
+                                 use_depthwise=self._use_depthwise)
              for _ in range(num_feature_outputs)
          ]
        else:
          prediction_heads[get_keypoint_name(task_name, KEYPOINT_OFFSET)] = [
-             make_prediction_net(NUM_OFFSET_CHANNELS)
+             make_prediction_net(NUM_OFFSET_CHANNELS,
+                                 use_depthwise=self._use_depthwise)
              for _ in range(num_feature_outputs)
          ]
+   # pylint: disable=g-complex-comprehension
    if self._mask_params is not None:
      prediction_heads[SEGMENTATION_HEATMAP] = [
-         make_prediction_net(num_classes,
-                             bias_fill=self._mask_params.heatmap_bias_init)
+         make_prediction_net(num_classes,
+                             bias_fill=self._mask_params.heatmap_bias_init,
+                             use_depthwise=self._use_depthwise)
          for _ in range(num_feature_outputs)]
    if self._densepose_params is not None:
      prediction_heads[DENSEPOSE_HEATMAP] = [
-         make_prediction_net(
+         # pylint: disable=g-complex-comprehension
+         make_prediction_net(
              self._densepose_params.num_parts,
-             bias_fill=self._densepose_params.heatmap_bias_init)
+             bias_fill=self._densepose_params.heatmap_bias_init,
+             use_depthwise=self._use_depthwise)
          for _ in range(num_feature_outputs)]
      prediction_heads[DENSEPOSE_REGRESSION] = [
-         make_prediction_net(2 * self._densepose_params.num_parts)
+         make_prediction_net(2 * self._densepose_params.num_parts,
+                             use_depthwise=self._use_depthwise)
          for _ in range(num_feature_outputs)
      ]
+   # pylint: enable=g-complex-comprehension
    if self._track_params is not None:
      prediction_heads[TRACK_REID] = [
-         make_prediction_net(self._track_params.reid_embed_size)
+         make_prediction_net(self._track_params.reid_embed_size,
+                             use_depthwise=self._use_depthwise)
          for _ in range(num_feature_outputs)]
      # Creates a classification network to train object embeddings by learning
...
...
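A note on the `use_depthwise` flag threaded through every head above: it swaps the head's full 3x3 convolution for a depthwise-separable one, cutting parameters and FLOPs for mobile deployment. Below is a minimal sketch of what such a head can look like, assuming a Conv-ReLU-Conv stack; `sketch_prediction_net` is a hypothetical stand-in for illustration, and the real `make_prediction_net` in center_net_meta_arch.py may use a different layer stack.

import tensorflow as tf

def sketch_prediction_net(num_out_channels, bias_fill=None,
                          use_depthwise=False):
  # Hypothetical re-implementation, for illustration only.
  conv = (tf.keras.layers.SeparableConv2D(256, 3, padding='same')
          if use_depthwise
          else tf.keras.layers.Conv2D(256, 3, padding='same'))
  bias_init = ('zeros' if bias_fill is None
               else tf.constant_initializer(bias_fill))
  # 1x1 projection to the requested number of output channels.
  out = tf.keras.layers.Conv2D(num_out_channels, 1,
                               bias_initializer=bias_init)
  return tf.keras.Sequential([conv, tf.keras.layers.ReLU(), out])

head = sketch_prediction_net(7, use_depthwise=True)
print(head(tf.zeros((4, 128, 128, 8))).shape)  # (4, 128, 128, 7)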
@@ -1764,6 +1872,12 @@ class CenterNetMetaArch(model.DetectionModel):
          tf.keras.layers.Dense(self._track_params.num_track_ids,
                                input_shape=(self._track_params.reid_embed_size,)))
+   if self._temporal_offset_params is not None:
+     prediction_heads[TEMPORAL_OFFSET] = [
+         make_prediction_net(NUM_OFFSET_CHANNELS,
+                             use_depthwise=self._use_depthwise)
+         for _ in range(num_feature_outputs)
+     ]
    return prediction_heads

  def _initialize_target_assigners(self, stride, min_box_overlap_iou):
...
...
@@ -1806,6 +1920,9 @@ class CenterNetMetaArch(model.DetectionModel):
      target_assigners[TRACK_TASK] = (
          cn_assigner.CenterNetTrackTargetAssigner(
              stride, self._track_params.num_track_ids))
+   if self._temporal_offset_params is not None:
+     target_assigners[TEMPORALOFFSET_TASK] = (
+         cn_assigner.CenterNetTemporalOffsetTargetAssigner(stride))
    return target_assigners
...
...
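For context on the assigner created above: a stride-based CenterNet target assigner maps each groundtruth box center from input-pixel coordinates to a feature-map cell, and the regression target is written at that cell. A toy illustration of the coordinate mapping, with made-up values:

stride = 4                       # input resolution / feature-map resolution
center_y, center_x = 50.0, 60.0  # groundtruth box center, in input pixels
cell_y, cell_x = int(center_y // stride), int(center_x // stride)
print(cell_y, cell_x)            # 12 15 -> the cell that receives the target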
@@ -2394,6 +2511,54 @@ class CenterNetMetaArch(model.DetectionModel):
    return loss_per_instance

+ def _compute_temporal_offset_loss(self, input_height, input_width,
+                                   prediction_dict):
+   """Computes the temporal offset loss for tracking.
+
+   Args:
+     input_height: An integer scalar tensor representing input image height.
+     input_width: An integer scalar tensor representing input image width.
+     prediction_dict: The dictionary returned from the predict() method.
+
+   Returns:
+     A dictionary with track/temporal_offset losses.
+   """
+   gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
+   gt_offsets_list = self.groundtruth_lists(
+       fields.BoxListFields.temporal_offsets)
+   gt_match_list = self.groundtruth_lists(
+       fields.BoxListFields.track_match_flags)
+   gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
+   num_boxes = tf.cast(
+       get_num_instances_from_weights(gt_weights_list), tf.float32)
+
+   offset_predictions = prediction_dict[TEMPORAL_OFFSET]
+   num_predictions = float(len(offset_predictions))
+
+   assigner = self._target_assigner_dict[TEMPORALOFFSET_TASK]
+   (batch_indices, batch_offset_targets,
+    batch_weights) = assigner.assign_temporal_offset_targets(
+        height=input_height,
+        width=input_width,
+        gt_boxes_list=gt_boxes_list,
+        gt_offsets_list=gt_offsets_list,
+        gt_match_list=gt_match_list,
+        gt_weights_list=gt_weights_list)
+   batch_weights = tf.expand_dims(batch_weights, -1)
+
+   offset_loss_fn = self._temporal_offset_params.localization_loss
+   loss_dict = {}
+   offset_loss = 0
+   for offset_pred in offset_predictions:
+     offset_pred = cn_assigner.get_batch_predictions_from_indices(
+         offset_pred, batch_indices)
+     offset_loss += offset_loss_fn(offset_pred[:, None],
+                                   batch_offset_targets[:, None],
+                                   weights=batch_weights)
+   offset_loss = tf.reduce_sum(offset_loss) / (
+       num_predictions * num_boxes)
+
+   loss_dict[TEMPORAL_OFFSET] = offset_loss
+   return loss_dict

  def preprocess(self, inputs):
    outputs = shape_utils.resize_images_and_return_shapes(
        inputs, self._image_resizer_fn)
...
...
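A quick numeric check of the normalization at the end of `_compute_temporal_offset_loss` above: the per-head losses are summed, then divided by both the number of prediction heads and the number of groundtruth boxes, giving an average loss per box and per head. The values here are assumed, purely for illustration:

import numpy as np

per_head_loss = np.array([0.6, 0.6])  # assumed smooth-L1 sums, one per head
num_predictions = float(len(per_head_loss))
num_boxes = 3.0                       # assumed matched instances in the batch
# Mirrors: tf.reduce_sum(offset_loss) / (num_predictions * num_boxes)
print(per_head_loss.sum() / (num_predictions * num_boxes))  # 0.2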
@@ -2490,6 +2655,7 @@ class CenterNetMetaArch(model.DetectionModel):
          'Loss/densepose/heatmap', (optional)
          'Loss/densepose/regression', (optional)
          'Loss/track/reid'] (optional)
+         'Loss/track/offset'] (optional)
        scalar tensors corresponding to the losses for different tasks. Note the
        $TASK_NAME is provided by the KeypointEstimation namedtuple used to
        differentiate between different keypoint tasks.
...
...
@@ -2567,6 +2733,16 @@ class CenterNetMetaArch(model.DetectionModel):
        track_losses[key] = (
            track_losses[key] * self._track_params.task_loss_weight)
      losses.update(track_losses)
+   if self._temporal_offset_params is not None:
+     offset_losses = self._compute_temporal_offset_loss(
+         input_height=input_height,
+         input_width=input_width,
+         prediction_dict=prediction_dict)
+     for key in offset_losses:
+       offset_losses[key] = (
+           offset_losses[key] * self._temporal_offset_params.task_loss_weight)
+     losses.update(offset_losses)

    # Prepend the LOSS_KEY_PREFIX to the keys in the dictionary such that the
    # losses will be grouped together in Tensorboard.
    return dict([('%s/%s' % (LOSS_KEY_PREFIX, key), val)
...
...
@@ -2683,6 +2859,12 @@ class CenterNetMetaArch(model.DetectionModel):
          fields.DetectionResultFields.detection_embeddings: embeddings})
+   if self._temporal_offset_params:
+     offsets = prediction_tensors_to_temporal_offsets(
+         y_indices, x_indices,
+         prediction_dict[TEMPORAL_OFFSET][-1])
+     postprocess_dict[
+         fields.DetectionResultFields.detection_offsets] = offsets
    return postprocess_dict

  def _postprocess_embeddings(self, prediction_dict, y_indices, x_indices):
...
...
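The postprocessing hook above reads the predicted offset vector at each detected center location. A numpy sketch of that gather, with assumed shapes (a batch of one and a single detection at cell (10, 20)):

import numpy as np

offset_map = np.zeros((1, 128, 128, 2), np.float32)
offset_map[0, 10, 20] = [1, 2]           # predicted (dy, dx) at one cell
y_idx = np.array([[10]])                 # [batch, num_detections]
x_idx = np.array([[20]])
batch = np.arange(1)[:, None]
print(offset_map[batch, y_idx, x_idx])   # [[[1. 2.]]]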
@@ -2753,6 +2935,7 @@ class CenterNetMetaArch(model.DetectionModel):
            get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]

        instance_inds = self._get_instance_indices(
            classes, num_detections, ex_ind, kp_params.class_id)
+       num_ind = _get_shape(instance_inds, 1)

        def true_fn(keypoint_heatmap, keypoint_offsets, keypoint_regression,
...
...
@@ -2787,7 +2970,8 @@ class CenterNetMetaArch(model.DetectionModel):
            true_fn, keypoint_heatmap, keypoint_offsets, keypoint_regression,
            classes, y_indices, x_indices, boxes, instance_inds, ex_ind,
            kp_params)
-       results = tf.cond(tf.size(instance_inds) > 0, true_fn, false_fn)
+       # Use dimension values instead of tf.size for tf.lite compatibility.
+       results = tf.cond(num_ind[0] > 0, true_fn, false_fn)

        kpt_coords_for_class_list.append(results[0])
        kpt_scores_for_class_list.append(results[1])
...
...
@@ -2799,7 +2983,9 @@ class CenterNetMetaArch(model.DetectionModel):
      instance_inds_for_example = tf.concat(instance_inds_for_class_list,
                                            axis=0)
-     if tf.size(instance_inds_for_example) > 0:
+     # Use dimension values instead of tf.size for tf.lite compatibility.
+     num_inds = _get_shape(instance_inds_for_example, 1)
+     if num_inds[0] > 0:
        # Scatter into tensor where instances align with original detection
        # instances. New shape of keypoint coordinates and scores are
        # [1, max_detections, num_total_keypoints, 2] and
...
...
@@ -2839,7 +3025,7 @@ class CenterNetMetaArch(model.DetectionModel):
      class_id: Class id

    Returns:
-     instance_inds: A [num_instances] int tensor where each element indicates
+     instance_inds: A [num_instances] int32 tensor where each element indicates
        the instance location within the `classes` tensor. This is useful to
        associate the refined keypoints with the original detections (i.e.
        boxes)
...
...
@@ -2848,11 +3034,14 @@ class CenterNetMetaArch(model.DetectionModel):
    _, max_detections = shape_utils.combined_static_and_dynamic_shape(classes)
    # Get the detection indices corresponding to the target class.
+   # Call tf.math.equal with matched tensor shape to make it tf.lite
+   # compatible.
    valid_detections_with_kpt_class = tf.math.logical_and(
        tf.range(max_detections) < num_detections[batch_index],
-       classes[0] == class_id)
+       tf.math.equal(classes[0], tf.fill(classes[0].shape, class_id)))
    instance_inds = tf.where(valid_detections_with_kpt_class)[:, 0]
-   return instance_inds
+   # Cast the indices tensor to int32 for tf.lite compatibility.
+   return tf.cast(instance_inds, tf.int32)

  def _postprocess_keypoints_for_class_and_image(
      self, keypoint_heatmap, keypoint_offsets, keypoint_regression, classes,
...
...
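Both changes above target TFLite conversion: the converter handles `tf.math.equal` on same-shaped operands better than Python `==` against a scalar, and downstream ops expect int32 rather than the int64 indices `tf.where` returns. A small sketch of the pattern, assuming a 1-D `classes` row:

import tensorflow as tf

classes_row = tf.constant([1, 3, 1, 2])
class_id = 1
# Same-shaped operands instead of scalar broadcasting:
mask = tf.math.equal(classes_row,
                     tf.fill(tf.shape(classes_row), class_id))
inds = tf.cast(tf.where(mask)[:, 0], tf.int32)  # tf.where yields int64
print(inds.numpy())  # [0 2]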
research/object_detection/meta_architectures/center_net_meta_arch_tf2_test.py  View file @ b0ccdb11
...
...
@@ -35,11 +35,14 @@ from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
-class CenterNetMetaArchPredictionHeadTest(test_case.TestCase):
+class CenterNetMetaArchPredictionHeadTest(
+    test_case.TestCase, parameterized.TestCase):
  """Test CenterNet meta architecture prediction head."""

- def test_prediction_head(self):
-   head = cnma.make_prediction_net(num_out_channels=7)
+ @parameterized.parameters([True, False])
+ def test_prediction_head(self, use_depthwise):
+   head = cnma.make_prediction_net(num_out_channels=7,
+                                   use_depthwise=use_depthwise)
    output = head(np.zeros((4, 128, 128, 8)))

    self.assertEqual((4, 128, 128, 7), output.shape)
...
...
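On the test change above: `absl.testing.parameterized` expands one test method into one case per parameter, so the head is exercised both with and without depthwise convolutions. A minimal standalone sketch of the pattern:

from absl.testing import parameterized
import unittest

class DemoTest(parameterized.TestCase):

  @parameterized.parameters([True, False])
  def test_flag(self, flag):
    # Runs twice: once with flag=True, once with flag=False.
    self.assertIsInstance(flag, bool)

if __name__ == '__main__':
  unittest.main()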
@@ -547,6 +550,53 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
    np.testing.assert_allclose(scores[1][:1], [.9])
    np.testing.assert_allclose(scores[2], [1., .8])

+ def test_offset_prediction(self):
+   class_pred = np.zeros((3, 128, 128, 5), dtype=np.float32)
+   offset_pred = np.zeros((3, 128, 128, 2), dtype=np.float32)
+
+   # Sample 1, 2 boxes
+   class_pred[0, 10, 20] = [0.3, .7, 0.0, 0.0, 0.0]
+   offset_pred[0, 10, 20] = [1, 2]
+   class_pred[0, 50, 60] = [0.55, 0.0, 0.0, 0.0, 0.45]
+   offset_pred[0, 50, 60] = [0, 0]
+
+   # Sample 2, 2 boxes (at same location)
+   class_pred[1, 100, 100] = [0.0, 0.1, 0.9, 0.0, 0.0]
+   offset_pred[1, 100, 100] = [1, 3]
+
+   # Sample 3, 3 boxes
+   class_pred[2, 60, 90] = [0.0, 0.0, 0.0, 0.2, 0.8]
+   offset_pred[2, 60, 90] = [0, 0]
+   class_pred[2, 65, 95] = [0.0, 0.7, 0.3, 0.0, 0.0]
+   offset_pred[2, 65, 95] = [1, 2]
+   class_pred[2, 75, 85] = [1.0, 0.0, 0.0, 0.0, 0.0]
+   offset_pred[2, 75, 85] = [5, 2]
+
+   def graph_fn():
+     class_pred_tensor = tf.constant(class_pred)
+     offset_pred_tensor = tf.constant(offset_pred)
+     _, y_indices, x_indices, _ = (
+         cnma.top_k_feature_map_locations(
+             class_pred_tensor, max_pool_kernel_size=3, k=2))
+     offsets = cnma.prediction_tensors_to_temporal_offsets(
+         y_indices, x_indices, offset_pred_tensor)
+     return offsets
+
+   offsets = self.execute(graph_fn, [])
+
+   np.testing.assert_allclose([[1, 2], [0, 0]], offsets[0])
+   np.testing.assert_allclose([[1, 3], [1, 3]], offsets[1])
+   np.testing.assert_allclose([[5, 2], [0, 0]], offsets[2])

  def test_keypoint_candidate_prediction(self):
    keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
    keypoint_heatmap_np[0, 0, 0, 0] = 1.0
...
...
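The new test drives `top_k_feature_map_locations`, which performs CenterNet-style peak extraction: a max pool suppresses non-maximum neighbors, then top-k over the flattened heatmap picks candidate centers. A rough sketch of the idea, with assumed shapes and k (not the library implementation):

import tensorflow as tf

heat = tf.random.uniform((1, 8, 8, 1))
pooled = tf.nn.max_pool2d(heat, ksize=3, strides=1, padding='SAME')
# Keep only cells that equal their local maximum, zero out the rest.
peaks = tf.where(tf.equal(heat, pooled), heat, tf.zeros_like(heat))
scores, flat_inds = tf.math.top_k(tf.reshape(peaks, (1, -1)), k=2)
y_indices, x_indices = flat_inds // 8, flat_inds % 8  # recover (y, x)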
@@ -1156,6 +1206,13 @@ def get_fake_track_params():
      task_loss_weight=1.0)

+def get_fake_temporal_offset_params():
+  """Returns the fake temporal offset parameter namedtuple."""
+  return cnma.TemporalOffsetParams(
+      localization_loss=losses.WeightedSmoothL1LocalizationLoss(),
+      task_loss_weight=1.0)

def build_center_net_meta_arch(build_resnet=False):
  """Builds the CenterNet meta architecture."""
  if build_resnet:
...
...
@@ -1185,7 +1242,8 @@ def build_center_net_meta_arch(build_resnet=False):
      keypoint_params_dict={_TASK_NAME: get_fake_kp_params()},
      mask_params=get_fake_mask_params(),
      densepose_params=get_fake_densepose_params(),
-     track_params=get_fake_track_params())
+     track_params=get_fake_track_params(),
+     temporal_offset_params=get_fake_temporal_offset_params())

def _logit(p):
...
...
@@ -1284,6 +1342,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
        fake_feature_map)
    self.assertEqual((4, 128, 128, _REID_EMBED_SIZE), output.shape)

+   # "temporal offset" head:
+   output = model._prediction_head_dict[cnma.TEMPORAL_OFFSET][-1](
+       fake_feature_map)
+   self.assertEqual((4, 128, 128, 2), output.shape)

  def test_initialize_target_assigners(self):
    model = build_center_net_meta_arch()
    assigner_dict = model._initialize_target_assigners(
...
...
@@ -1315,6 +1378,10 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    self.assertIsInstance(assigner_dict[cnma.TRACK_TASK],
                          cn_assigner.CenterNetTrackTargetAssigner)
+   # Temporal Offset target assigner:
+   self.assertIsInstance(assigner_dict[cnma.TEMPORALOFFSET_TASK],
+                         cn_assigner.CenterNetTemporalOffsetTargetAssigner)

  def test_predict(self):
    """Test the predict function."""
...
...
@@ -1341,6 +1408,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
                     (2, 32, 32, 2 * _DENSEPOSE_NUM_PARTS))
    self.assertEqual(prediction_dict[cnma.TRACK_REID][0].shape,
                     (2, 32, 32, _REID_EMBED_SIZE))
+   self.assertEqual(prediction_dict[cnma.TEMPORAL_OFFSET][0].shape,
+                    (2, 32, 32, 2))

  def test_loss(self):
    """Test the loss function."""
...
...
@@ -1361,7 +1430,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
        groundtruth_dp_surface_coords_list=groundtruth_dict[
            fields.BoxListFields.densepose_surface_coords],
        groundtruth_track_ids_list=groundtruth_dict[
-           fields.BoxListFields.track_ids])
+           fields.BoxListFields.track_ids],
+       groundtruth_track_match_flags_list=groundtruth_dict[
+           fields.BoxListFields.track_match_flags],
+       groundtruth_temporal_offsets_list=groundtruth_dict[
+           fields.BoxListFields.temporal_offsets])

    kernel_initializer = tf.constant_initializer(
        [[1, 1, 0], [-1000000, -1000000, 1000000]])
...
...
@@ -1413,6 +1486,9 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    self.assertGreater(
        0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX, cnma.TRACK_REID)])
+   self.assertGreater(
+       0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
+                                  cnma.TEMPORAL_OFFSET)])

  @parameterized.parameters(
      {'target_class_id': 1},
...
...
@@ -1463,6 +1539,9 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
                                    dtype=np.float32)
    track_reid_embedding[0, 16, 16, :] = np.ones(embedding_size)

+   temporal_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32)
+   temporal_offsets[..., 1] = 1

    class_center = tf.constant(class_center)
    height_width = tf.constant(height_width)
    offset = tf.constant(offset)
...
...
@@ -1473,6 +1552,7 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
    dp_part_heatmap = tf.constant(dp_part_heatmap, dtype=tf.float32)
    dp_surf_coords = tf.constant(dp_surf_coords, dtype=tf.float32)
    track_reid_embedding = tf.constant(track_reid_embedding, dtype=tf.float32)
+   temporal_offsets = tf.constant(temporal_offsets, dtype=tf.float32)

    prediction_dict = {
        cnma.OBJECT_CENTER: [class_center],
...
...
@@ -1487,7 +1567,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
        cnma.SEGMENTATION_HEATMAP: [segmentation_heatmap],
        cnma.DENSEPOSE_HEATMAP: [dp_part_heatmap],
        cnma.DENSEPOSE_REGRESSION: [dp_surf_coords],
-       cnma.TRACK_REID: [track_reid_embedding]
+       cnma.TRACK_REID: [track_reid_embedding],
+       cnma.TEMPORAL_OFFSET: [temporal_offsets],
    }

    def graph_fn():
...
...
@@ -1519,6 +1600,8 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
                        detections['detection_masks'].shape)
    self.assertAllEqual([1, max_detection, embedding_size],
                        detections['detection_embeddings'].shape)
+   self.assertAllEqual([1, max_detection, 2],
+                       detections['detection_temporal_offsets'].shape)

    # Masks should be empty for everything but the first detection.
    self.assertAllEqual(
...
...
@@ -1632,6 +1715,10 @@ def get_fake_prediction_dict(input_height, input_width, stride):
                                   _REID_EMBED_SIZE), dtype=np.float32)
  track_reid_embedding[0, 2, 4, :] = np.arange(_REID_EMBED_SIZE)

+ temporal_offsets = np.zeros((2, output_height, output_width, 2),
+                             dtype=np.float32)
+ temporal_offsets[0, 2, 4, :] = 5

  prediction_dict = {
      'preprocessed_inputs':
          tf.zeros((2, input_height, input_width, 3)),
...
...
@@ -1674,7 +1761,11 @@ def get_fake_prediction_dict(input_height, input_width, stride):
      cnma.TRACK_REID: [
          tf.constant(track_reid_embedding),
          tf.constant(track_reid_embedding),
-     ]
+     ],
+     cnma.TEMPORAL_OFFSET: [
+         tf.constant(temporal_offsets),
+         tf.constant(temporal_offsets),
+     ],
  }

  return prediction_dict
...
...
@@ -1736,6 +1827,14 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
      tf.constant([2], dtype=tf.int32),
      tf.constant([1], dtype=tf.int32),
  ]
+ temporal_offsets = [
+     tf.constant([[5.0, 5.0]], dtype=tf.float32),
+     tf.constant([[2.0, 3.0]], dtype=tf.float32),
+ ]
+ track_match_flags = [
+     tf.constant([1.0], dtype=tf.float32),
+     tf.constant([1.0], dtype=tf.float32),
+ ]
  groundtruth_dict = {
      fields.BoxListFields.boxes: boxes,
      fields.BoxListFields.weights: weights,
...
...
@@ -1747,6 +1846,8 @@ def get_fake_groundtruth_dict(input_height, input_width, stride):
      fields.BoxListFields.densepose_surface_coords: densepose_surface_coords,
      fields.BoxListFields.track_ids: track_ids,
+     fields.BoxListFields.temporal_offsets: temporal_offsets,
+     fields.BoxListFields.track_match_flags: track_match_flags,
      fields.InputDataFields.groundtruth_labeled_classes: labeled_classes,
  }
  return groundtruth_dict
...
...
research/object_detection/metrics/coco_evaluation.py  View file @ b0ccdb11
...
...
@@ -1286,15 +1286,15 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
    metric_names = ['DetectionMasks_Precision/mAP',
                    'DetectionMasks_Precision/mAP@.50IOU',
                    'DetectionMasks_Precision/mAP@.75IOU',
-                   'DetectionMasks_Precision/mAP (large)',
-                   'DetectionMasks_Precision/mAP (medium)',
                    'DetectionMasks_Precision/mAP (small)',
+                   'DetectionMasks_Precision/mAP (medium)',
+                   'DetectionMasks_Precision/mAP (large)',
                    'DetectionMasks_Recall/AR@1',
                    'DetectionMasks_Recall/AR@10',
                    'DetectionMasks_Recall/AR@100',
-                   'DetectionMasks_Recall/AR@100 (large)',
+                   'DetectionMasks_Recall/AR@100 (small)',
                    'DetectionMasks_Recall/AR@100 (medium)',
-                   'DetectionMasks_Recall/AR@100 (small)']
+                   'DetectionMasks_Recall/AR@100 (large)']
    if self._include_metrics_per_category:
      for category_dict in self._categories:
        metric_names.append('DetectionMasks_PerformanceByCategory/mAP/' +
...
...
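The reordering above matters because these names are paired positionally with pycocotools' `COCOeval.stats` vector, whose size-specific entries come in small/medium/large order; with the old name order, the small and large numbers were reported under each other's labels. The expected pairing, sketched with placeholder values:

stats = list(range(12))  # placeholder for COCOeval.stats
names = ['mAP', 'mAP@.50IOU', 'mAP@.75IOU',
         'mAP (small)', 'mAP (medium)', 'mAP (large)',
         'AR@1', 'AR@10', 'AR@100',
         'AR@100 (small)', 'AR@100 (medium)', 'AR@100 (large)']
metrics = dict(zip(names, stats))  # labels now match the stats order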