Refactor tests for Object Detection API. (#8688)

Internal changes -- PiperOrigin-RevId: 316837667

Refactor tests for Object Detection API. (#8688)
Internal changes -- PiperOrigin-RevId: 316837667
420a7253 · pkulzc · GitHub · d0ef3913 · 420a7253 · 420a7253
Unverified Commit 420a7253 authored Jun 17, 2020 by pkulzc Committed by GitHub Jun 17, 2020
20 changed files
--- a/research/object_detection/dataset_tools/context_rcnn/generate_embedding_data_tf1_test.py
+++ b/research/object_detection/dataset_tools/context_rcnn/generate_embedding_data_tf1_test.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for generate_embedding_data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import contextlib
+import os
+import tempfile
+import unittest
+import numpy as np
+import six
+import tensorflow.compat.v1 as tf
+from object_detection import exporter
+from object_detection.builders import model_builder
+from object_detection.core import model
+from object_detection.dataset_tools.context_rcnn import generate_embedding_data
+from object_detection.protos import pipeline_pb2
+from object_detection.utils import tf_version
+from apache_beam import runners
+
+if six.PY2:
+  import mock  # pylint: disable=g-import-not-at-top
+else:
+  mock = unittest.mock
+
+
+class FakeModel(model.DetectionModel):
+  """Minimal DetectionModel whose post-processing emits fixed detections."""
+
+  def preprocess(self, inputs):
+    """Passes `inputs` through an identity op with empty image shapes."""
+    # Image shapes do not matter for the fake model.
+    return tf.identity(inputs), []
+
+  def predict(self, preprocessed_inputs, true_image_shapes):
+    """Applies a 3-filter, 1x1 convolution and returns it under 'image'."""
+    convolved = tf.layers.conv2d(preprocessed_inputs, filters=3, kernel_size=1)
+    return {'image': convolved}
+
+  def postprocess(self, prediction_dict, true_image_shapes):
+    """Returns hard-coded detection tensors describing two boxes."""
+    num_features = 100
+    feature_dims = 10
+    feature_shape = (2, feature_dims, feature_dims, num_features)
+    classifier_feature = np.ones(feature_shape, dtype=np.float32).tolist()
+    # The constants are created under a control dependency on every tensor in
+    # `prediction_dict`.
+    with tf.control_dependencies(prediction_dict.values()):
+      return {
+          'detection_boxes':
+              tf.constant([[[0.0, 0.1, 0.5, 0.6], [0.5, 0.5, 0.8, 0.8]]],
+                          tf.float32),
+          'detection_scores':
+              tf.constant([[0.95, 0.6]], tf.float32),
+          'detection_multiclass_scores':
+              tf.constant([[[0.1, 0.7, 0.2], [0.3, 0.1, 0.6]]], tf.float32),
+          'detection_classes':
+              tf.constant([[0, 1]], tf.float32),
+          'num_detections':
+              tf.constant([2], tf.float32),
+          'detection_features':
+              tf.constant([classifier_feature], tf.float32),
+      }
+
+  def restore_map(self, checkpoint_path, fine_tune_checkpoint_type):
+    """No-op; returns None."""
+
+  def loss(self, prediction_dict, true_image_shapes):
+    """No-op; returns None."""
+
+  def regularization_losses(self):
+    """No-op; returns None."""
+
+  def updates(self):
+    """No-op; returns None."""
+
+
+@contextlib.contextmanager
+def InMemoryTFRecord(entries):
+  """Writes `entries` to a temporary TFRecord file and yields its path.
+
+  The file is removed when the context exits, including when writing the
+  records or the body of the `with` statement raises.
+
+  Args:
+    entries: Iterable of serialized records, written in iteration order.
+
+  Yields:
+    Path of the temporary TFRecord file.
+  """
+  temp = tempfile.NamedTemporaryFile(delete=False)
+  filename = temp.name
+  # Only the reserved path is needed. Close the handle right away so the
+  # descriptor is not leaked and the writer below can reopen the file by name.
+  temp.close()
+  try:
+    with tf.python_io.TFRecordWriter(filename) as writer:
+      for value in entries:
+        writer.write(value)
+    yield filename
+  finally:
+    os.unlink(filename)
+
+
+@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
+class GenerateEmbeddingData(tf.test.TestCase):
+
+  def _save_checkpoint_from_mock_model(self, checkpoint_path):
+    """A function to save checkpoint from a fake Detection Model.
+
+    Args:
+      checkpoint_path: Path to save checkpoint from Fake model.
+    """
+    # Build everything in a dedicated graph rather than the default one.
+    g = tf.Graph()
+    with g.as_default():
+      mock_model = FakeModel(num_classes=5)
+      # Run the fake model end to end on a float image placeholder with three
+      # channels and otherwise unknown dimensions.
+      preprocessed_inputs, true_image_shapes = mock_model.preprocess(
+          tf.placeholder(tf.float32, shape=[None, None, None, 3]))
+      predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
+      mock_model.postprocess(predictions, true_image_shapes)
+      tf.train.get_or_create_global_step()
+      # The saver and initializer are created after the model ops and the
+      # global step have been added to the graph.
+      saver = tf.train.Saver()
+      init = tf.global_variables_initializer()
+      with self.test_session(graph=g) as sess:
+        # Initialize the variables, then write them to `checkpoint_path`.
+        sess.run(init)
+        saver.save(sess, checkpoint_path)
+
+  # Exports the fake model as a SavedModel under the test's temp directory and
+  # returns the path of the `saved_model` directory that was written.
+  def _export_saved_model(self):
+    tmp_dir = self.get_temp_dir()
+    checkpoint_path = os.path.join(tmp_dir, 'model.ckpt')
+    self._save_checkpoint_from_mock_model(checkpoint_path)
+    output_directory = os.path.join(tmp_dir, 'output')
+    saved_model_path = os.path.join(output_directory, 'saved_model')
+    tf.io.gfile.makedirs(output_directory)
+    # Patch the model builder so that building from the (empty) pipeline config
+    # returns a FakeModel.
+    with mock.patch.object(
+        model_builder, 'build', autospec=True) as mock_builder:
+      mock_builder.return_value = FakeModel(num_classes=5)
+      pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+      pipeline_config.eval_config.use_moving_averages = False
+      detection_model = model_builder.build(pipeline_config.model,
+                                            is_training=False)
+      # Build the inference graph with a `tf_example` input.
+      outputs, placeholder_tensor = exporter.build_detection_graph(
+          input_type='tf_example',
+          detection_model=detection_model,
+          input_shape=None,
+          output_collection_name='inference_op',
+          graph_hook_fn=None)
+      output_node_names = ','.join(outputs.keys())
+      saver = tf.train.Saver()
+      input_saver_def = saver.as_saver_def()
+      # Freeze the default graph using the checkpoint saved above.
+      frozen_graph_def = exporter.freeze_graph_with_def_protos(
+          input_graph_def=tf.get_default_graph().as_graph_def(),
+          input_saver_def=input_saver_def,
+          input_checkpoint=checkpoint_path,
+          output_node_names=output_node_names,
+          restore_op_name='save/restore_all',
+          filename_tensor_name='save/Const:0',
+          output_graph='',
+          clear_devices=True,
+          initializer_nodes='')
+      # Write the frozen graph, with its input and outputs, as a SavedModel.
+      exporter.write_saved_model(
+          saved_model_path=saved_model_path,
+          frozen_graph_def=frozen_graph_def,
+          inputs=placeholder_tensor,
+          outputs=outputs)
+      return saved_model_path
+
+  def _create_tf_example(self):
+    """Returns a serialized tf.train.Example holding one annotated image.
+
+    The encoded image is a JPEG of a 4x4x3 all-ones uint8 array; the remaining
+    features are fixed metadata and a single 'hyena' box annotation.
+    """
+    pixels = np.ones((4, 4, 3)).astype(np.uint8)
+    with self.test_session():
+      encoded_image = tf.image.encode_jpeg(tf.constant(pixels)).eval()
+
+    def _bytes_feature(value):
+      return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+    def _int64_feature(value):
+      return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+
+    def _float_feature(value):
+      return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
+
+    feature = {
+        'image/encoded': _bytes_feature(encoded_image),
+        'image/source_id': _bytes_feature(b'image_id'),
+        'image/height': _int64_feature(400),
+        'image/width': _int64_feature(600),
+        'image/class/label': _int64_feature(5),
+        'image/class/text': _bytes_feature(b'hyena'),
+        'image/object/bbox/xmin': _float_feature(0.1),
+        'image/object/bbox/xmax': _float_feature(0.6),
+        'image/object/bbox/ymin': _float_feature(0.0),
+        'image/object/bbox/ymax': _float_feature(0.5),
+        'image/object/class/score': _float_feature(0.95),
+        'image/object/class/label': _int64_feature(5),
+        'image/object/class/text': _bytes_feature(b'hyena'),
+        'image/date_captured': _bytes_feature(b'2019-10-20 12:12:12')
+    }
+    example = tf.train.Example(features=tf.train.Features(feature=feature))
+    return example.SerializeToString()
+
+  def assert_expected_example(self, example, topk=False, botk=False):
+    """Checks the embedding and annotation features of a generated example.
+
+    Args:
+      example: The tf.train.Example to check.
+      topk: Set by the test that requests two top-k embeddings.
+      botk: Set by the test that requests two bottom-k embeddings.
+    """
+    features = example.features.feature
+    # Each embedding is 109 floats long; the top-k and bottom-k tests expect
+    # two of them stored back to back.
+    embedding_length = 109
+    embedding_count = 2 if topk or botk else 1
+
+    # Check embeddings
+    self.assertEqual(
+        len(features['image/embedding'].float_list.value),
+        embedding_count * embedding_length)
+    self.assertAllEqual(
+        features['image/embedding_count'].int64_list.value, [embedding_count])
+    self.assertAllEqual(
+        features['image/embedding_length'].int64_list.value,
+        [embedding_length])
+
+    # Check annotations
+    self.assertAllClose(
+        features['image/object/bbox/ymin'].float_list.value, [0.0])
+    self.assertAllClose(
+        features['image/object/bbox/xmin'].float_list.value, [0.1])
+    self.assertAllClose(
+        features['image/object/bbox/ymax'].float_list.value, [0.5])
+    self.assertAllClose(
+        features['image/object/bbox/xmax'].float_list.value, [0.6])
+    self.assertAllClose(
+        features['image/object/class/score'].float_list.value, [0.95])
+    self.assertAllClose(
+        features['image/object/class/label'].int64_list.value, [5])
+    # bytes_list values are `bytes` under Python 3, so the expected values are
+    # bytes literals (which equal str literals under Python 2).
+    self.assertAllEqual(
+        features['image/object/class/text'].bytes_list.value, [b'hyena'])
+    self.assertAllClose(
+        features['image/class/label'].int64_list.value, [5])
+    self.assertAllEqual(
+        features['image/class/text'].bytes_list.value, [b'hyena'])
+
+    # Check other essential attributes.
+    self.assertAllEqual(features['image/height'].int64_list.value, [400])
+    self.assertAllEqual(features['image/width'].int64_list.value, [600])
+    self.assertAllEqual(
+        features['image/source_id'].bytes_list.value, [b'image_id'])
+    self.assertTrue(features['image/encoded'].bytes_list.value)
+
+  def test_generate_embedding_data_fn(self):
+    """Runs GenerateEmbeddingDataFn directly with one top-k embedding."""
+    saved_model_path = self._export_saved_model()
+    top_k_embedding_count = 1
+    bottom_k_embedding_count = 0
+    inference_fn = generate_embedding_data.GenerateEmbeddingDataFn(
+        saved_model_path, top_k_embedding_count, bottom_k_embedding_count)
+    inference_fn.start_bundle()
+    generated_example = self._create_tf_example()
+    # Sanity-check the input annotations before running inference.
+    input_features = tf.train.Example.FromString(
+        generated_example).features.feature
+    self.assertAllEqual(
+        input_features['image/object/class/label'].int64_list.value, [5])
+    # bytes_list values are `bytes` under Python 3; compare with a bytes
+    # literal, as the top-k test does.
+    self.assertAllEqual(
+        input_features['image/object/class/text'].bytes_list.value,
+        [b'hyena'])
+    output = inference_fn.process(generated_example)
+    output_example = output[0]
+    self.assert_expected_example(output_example)
+
+  def test_generate_embedding_data_with_top_k_boxes(self):
+    """Requests two top-k embeddings and checks the produced example."""
+    inference_fn = generate_embedding_data.GenerateEmbeddingDataFn(
+        self._export_saved_model(),
+        2,  # top_k_embedding_count
+        0)  # bottom_k_embedding_count
+    inference_fn.start_bundle()
+    serialized_example = self._create_tf_example()
+    # Sanity-check the input annotations before running inference.
+    input_features = tf.train.Example.FromString(
+        serialized_example).features.feature
+    self.assertAllEqual(
+        input_features['image/object/class/label'].int64_list.value, [5])
+    self.assertAllEqual(
+        input_features['image/object/class/text'].bytes_list.value,
+        [b'hyena'])
+    outputs = inference_fn.process(serialized_example)
+    self.assert_expected_example(outputs[0], topk=True)
+
+  def test_generate_embedding_data_with_bottom_k_boxes(self):
+    """Runs GenerateEmbeddingDataFn directly with two bottom-k embeddings."""
+    saved_model_path = self._export_saved_model()
+    top_k_embedding_count = 0
+    bottom_k_embedding_count = 2
+    inference_fn = generate_embedding_data.GenerateEmbeddingDataFn(
+        saved_model_path, top_k_embedding_count, bottom_k_embedding_count)
+    inference_fn.start_bundle()
+    generated_example = self._create_tf_example()
+    # Sanity-check the input annotations before running inference.
+    input_features = tf.train.Example.FromString(
+        generated_example).features.feature
+    self.assertAllEqual(
+        input_features['image/object/class/label'].int64_list.value, [5])
+    # bytes_list values are `bytes` under Python 3; compare with a bytes
+    # literal, as the top-k test does.
+    self.assertAllEqual(
+        input_features['image/object/class/text'].bytes_list.value,
+        [b'hyena'])
+    output = inference_fn.process(generated_example)
+    output_example = output[0]
+    self.assert_expected_example(output_example, botk=True)
+
+  def test_beam_pipeline(self):
+    """Runs the constructed pipeline on one example and checks its output."""
+    serialized_input = self._create_tf_example()
+    with InMemoryTFRecord([serialized_input]) as input_tfrecord:
+      temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
+      output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
+      pipeline = generate_embedding_data.construct_pipeline(
+          input_tfrecord,
+          output_tfrecord,
+          self._export_saved_model(),
+          1,  # top_k_embedding_count
+          0,  # bottom_k_embedding_count
+          1)  # num_shards
+      runners.DirectRunner().run(pipeline)
+      # Output files carry a `-SSSSS-of-NNNNN` suffix; read the first match.
+      shard_paths = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
+      records = list(tf.python_io.tf_record_iterator(path=shard_paths[0]))
+      self.assertEqual(len(records), 1)
+      self.assert_expected_example(tf.train.Example.FromString(records[0]))
+
+
+# Run the tests in this file when it is executed directly as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/object_detection/dataset_tools/seq_example_util_test.py
+++ b/research/object_detection/dataset_tools/seq_example_util_test.py
@@ -24,10 +24,18 @@ import six
 import tensorflow.compat.v1 as tf

 from object_detection.dataset_tools import seq_example_util
+from object_detection.utils import tf_version


 class SeqExampleUtilTest(tf.test.TestCase):

+  def materialize_tensors(self, list_of_tensors):
+    """Evaluates tensors via `.numpy()` under TF2, else in a cached session."""
+    if not tf_version.is_tf2():
+      with self.cached_session() as session:
+        return session.run(list_of_tensors)
+    return [tensor.numpy() for tensor in list_of_tensors]
+
  def test_make_unlabeled_example(self):
    num_frames = 5
    image_height = 100
@@ -41,8 +49,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
    image_source_ids = [str(idx) for idx in range(num_frames)]
    images_list = tf.unstack(images, axis=0)
    encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
-    with tf.Session() as sess:
-      encoded_images = sess.run(encoded_images_list)
+    encoded_images = self.materialize_tensors(encoded_images_list)
    seq_example = seq_example_util.make_sequence_example(
        dataset_name=dataset_name,
        video_id=video_id,
@@ -109,8 +116,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
        dtype=tf.int32), dtype=tf.uint8)
    images_list = tf.unstack(images, axis=0)
    encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
-    with tf.Session() as sess:
-      encoded_images = sess.run(encoded_images_list)
+    encoded_images = self.materialize_tensors(encoded_images_list)
    timestamps = [100000, 110000]
    is_annotated = [1, 0]
    bboxes = [
@@ -208,8 +214,7 @@ class SeqExampleUtilTest(tf.test.TestCase):
        dtype=tf.int32), dtype=tf.uint8)
    images_list = tf.unstack(images, axis=0)
    encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
-    with tf.Session() as sess:
-      encoded_images = sess.run(encoded_images_list)
+    encoded_images = self.materialize_tensors(encoded_images_list)
    bboxes = [
        np.array([[0., 0., 0.75, 0.75],
                  [0., 0., 1., 1.]], dtype=np.float32),

--- a/research/object_detection/eval_util.py
+++ b/research/object_detection/eval_util.py
@@ -52,6 +52,8 @@ EVAL_METRICS_CLASS_DICT = {
        coco_evaluation.CocoKeypointEvaluator,
    'coco_mask_metrics':
        coco_evaluation.CocoMaskEvaluator,
+    'coco_panoptic_metrics':
+        coco_evaluation.CocoPanopticSegmentationEvaluator,
    'oid_challenge_detection_metrics':
        object_detection_evaluation.OpenImagesDetectionChallengeEvaluator,
    'oid_challenge_segmentation_metrics':

--- a/research/object_detection/eval_util_test.py
+++ b/research/object_detection/eval_util_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import unittest
 from absl.testing import parameterized

 import numpy as np
@@ -30,6 +31,7 @@ from object_detection.core import standard_fields as fields
 from object_detection.metrics import coco_evaluation
 from object_detection.protos import eval_pb2
 from object_detection.utils import test_case
+from object_detection.utils import tf_version


 class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
@@ -127,6 +129,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
      {'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
      {'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
  )
+  @unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
  def test_get_eval_metric_ops_for_coco_detections(self, batch_size=1,
                                                   max_gt_boxes=None,
                                                   scale_to_absolute=False):
@@ -155,6 +158,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
      {'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
      {'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
  )
+  @unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
  def test_get_eval_metric_ops_for_coco_detections_and_masks(
      self, batch_size=1, max_gt_boxes=None, scale_to_absolute=False):
    eval_config = eval_pb2.EvalConfig()
@@ -185,6 +189,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
      {'batch_size': 1, 'max_gt_boxes': None, 'scale_to_absolute': False},
      {'batch_size': 8, 'max_gt_boxes': [1], 'scale_to_absolute': False}
  )
+  @unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
  def test_get_eval_metric_ops_for_coco_detections_and_resized_masks(
      self, batch_size=1, max_gt_boxes=None, scale_to_absolute=False):
    eval_config = eval_pb2.EvalConfig()
@@ -210,6 +215,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
      self.assertAlmostEqual(1.0, metrics['DetectionBoxes_Precision/mAP'])
      self.assertAlmostEqual(1.0, metrics['DetectionMasks_Precision/mAP'])

+  @unittest.skipIf(tf_version.is_tf2(), 'Only compatible with TF1.X')
  def test_get_eval_metric_ops_raises_error_with_unsupported_metric(self):
    eval_config = eval_pb2.EvalConfig()
    eval_config.metrics_set.extend(['unsupported_metric'])
@@ -334,6 +340,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
                               dtype=np.float32)
    detection_keypoints = np.array([[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]],
                                   dtype=np.float32)
+    def graph_fn():
      detections = {
          detection_fields.detection_boxes:
              tf.constant(detection_boxes),
@@ -374,23 +381,26 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
          true_image_shapes=true_image_shapes,
          original_image_spatial_shapes=original_image_spatial_shapes,
          max_gt_boxes=tf.constant(1))
-
-    with self.test_session() as sess:
-      result = sess.run(result)
+      return (result[input_data_fields.groundtruth_boxes],
+              result[input_data_fields.groundtruth_keypoints],
+              result[detection_fields.detection_boxes],
+              result[detection_fields.detection_keypoints])
+    (gt_boxes, gt_keypoints, detection_boxes,
+     detection_keypoints) = self.execute_cpu(graph_fn, [])
    self.assertAllEqual(
        [[[0., 0., 200., 200.]], [[0.0, 0.0, 150., 150.]]],
-          result[input_data_fields.groundtruth_boxes])
+        gt_boxes)
    self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
                         [[[0., 0.], [150., 150.], [300., 300.]]]],
-                          result[input_data_fields.groundtruth_keypoints])
+                        gt_keypoints)

    # Predictions from the model are not scaled.
    self.assertAllEqual(
        [[[0., 0., 200., 200.]], [[0.0, 0.0, 75., 150.]]],
-          result[detection_fields.detection_boxes])
+        detection_boxes)
    self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
                         [[[0., 0.], [75., 150.], [150., 300.]]]],
-                          result[detection_fields.detection_keypoints])
+                        detection_keypoints)


 if __name__ == '__main__':

--- a/research/object_detection/export_inference_graph.py
+++ b/research/object_detection/export_inference_graph.py
@@ -134,6 +134,30 @@ flags.DEFINE_string('config_override', '',
                    'text proto to override pipeline_config_path.')
 flags.DEFINE_boolean('write_inference_graph', False,
                     'If true, writes inference graph to disk.')
+flags.DEFINE_string('additional_output_tensor_names', None,
+                    'Additional Tensors to output, to be specified as a comma '
+                    'separated list of tensor names.')
+flags.DEFINE_boolean('use_side_inputs', False,
+                     'If True, uses side inputs as well as image inputs.')
+flags.DEFINE_string('side_input_shapes', None,
+                    'If use_side_inputs is True, this explicitly sets '
+                    'the shape of the side input tensors to a fixed size. The '
+                    'dimensions are to be provided as a comma-separated list '
+                    'of integers. A value of -1 can be used for unknown '
+                    'dimensions. A `/` denotes a break, starting the shape of '
+                    'the next side input tensor. This flag is required if '
+                    'using side inputs.')
+flags.DEFINE_string('side_input_types', None,
+                    'If use_side_inputs is True, this explicitly sets '
+                    'the type of the side input tensors. The '
+                    'types are to be provided as a comma-separated list '
+                    'of types, each of `string`, `int`, or `float`. '
+                    'This flag is required if using side inputs.')
+flags.DEFINE_string('side_input_names', None,
+                    'If use_side_inputs is True, this explicitly sets '
+                    'the names of the side input tensors required by the model '
+                    'assuming the names will be a comma-separated list of '
+                    'strings. This flag is required if using side inputs.')
 tf.app.flags.mark_flag_as_required('pipeline_config_path')
 tf.app.flags.mark_flag_as_required('trained_checkpoint_prefix')
 tf.app.flags.mark_flag_as_required('output_directory')
@@ -152,10 +176,30 @@ def main(_):
    ]
  else:
    input_shape = None
+  if FLAGS.use_side_inputs:
+    side_input_shapes, side_input_names, side_input_types = (
+        exporter.parse_side_inputs(
+            FLAGS.side_input_shapes,
+            FLAGS.side_input_names,
+            FLAGS.side_input_types))
+  else:
+    side_input_shapes = None
+    side_input_names = None
+    side_input_types = None
+  if FLAGS.additional_output_tensor_names:
+    additional_output_tensor_names = list(
+        FLAGS.additional_output_tensor_names.split(','))
+  else:
+    additional_output_tensor_names = None
  exporter.export_inference_graph(
      FLAGS.input_type, pipeline_config, FLAGS.trained_checkpoint_prefix,
      FLAGS.output_directory, input_shape=input_shape,
-      write_inference_graph=FLAGS.write_inference_graph)
+      write_inference_graph=FLAGS.write_inference_graph,
+      additional_output_tensor_names=additional_output_tensor_names,
+      use_side_inputs=FLAGS.use_side_inputs,
+      side_input_shapes=side_input_shapes,
+      side_input_names=side_input_names,
+      side_input_types=side_input_types)


 if __name__ == '__main__':

--- a/research/object_detection/export_tflite_ssd_graph_lib.py
+++ b/research/object_detection/export_tflite_ssd_graph_lib.py
@@ -24,16 +24,19 @@ import tensorflow.compat.v1 as tf
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import saver_pb2
-from tensorflow.tools.graph_transforms import TransformGraph
 from object_detection import exporter
 from object_detection.builders import graph_rewriter_builder
 from object_detection.builders import model_builder
 from object_detection.builders import post_processing_builder
 from object_detection.core import box_list
+from object_detection.utils import tf_version

 _DEFAULT_NUM_CHANNELS = 3
 _DEFAULT_NUM_COORD_BOX = 4

+if tf_version.is_tf1():
+  from tensorflow.tools.graph_transforms import TransformGraph  # pylint: disable=g-import-not-at-top
+

 def get_const_center_size_encoded_anchors(anchors):
  """Exports center-size encoded anchors as a constant tensor.

--- a/research/object_detection/export_tflite_ssd_graph_lib_test.py
+++ b/research/object_detection/export_tflite_ssd_graph_lib_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
+import unittest
 import numpy as np
 import six
 import tensorflow.compat.v1 as tf
@@ -32,6 +33,7 @@ from object_detection.core import model
 from object_detection.protos import graph_rewriter_pb2
 from object_detection.protos import pipeline_pb2
 from object_detection.protos import post_processing_pb2
+from object_detection.utils import tf_version

 # pylint: disable=g-import-not-at-top

@@ -82,6 +84,7 @@ class FakeModel(model.DetectionModel):
    pass


+@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
 class ExportTfliteGraphTest(tf.test.TestCase):

  def _save_checkpoint_from_mock_model(self,

--- a/research/object_detection/exporter.py
+++ b/research/object_detection/exporter.py
@@ -39,6 +39,54 @@ except ImportError:
 freeze_graph_with_def_protos = freeze_graph.freeze_graph_with_def_protos


+def parse_side_inputs(side_input_shapes_string, side_input_names_string,
+                      side_input_types_string):
+  """Parses side input flags.
+
+  Args:
+    side_input_shapes_string: The shape of the side input tensors, provided as a
+      comma-separated list of integers. A value of -1 is used for unknown
+      dimensions. A `/` denotes a break, starting the shape of the next side
+      input tensor.
+    side_input_names_string: The names of the side input tensors, provided as a
+      comma-separated list of strings.
+    side_input_types_string: The type of the side input tensors, provided as a
+      comma-separated list of types, each of `string`, `integer` (or `int`),
+      or `float`.
+
+  Returns:
+    side_input_shapes: A list of shapes, with None for each unknown dimension.
+    side_input_names: A list of strings.
+    side_input_types: A list of tensorflow dtypes.
+
+  Raises:
+    ValueError: If an argument is empty/None or a dimension is not an integer.
+    KeyError: If a requested type is not one of the supported names.
+  """
+  if not side_input_shapes_string:
+    raise ValueError('When using side_inputs, side_input_shapes must be '
+                     'specified in the input flags.')
+  # Strip first so a `-1` written after a space still maps to None.
+  side_input_shapes = [
+      [None if dim.strip() == '-1' else int(dim) for dim in shape.split(',')]
+      for shape in side_input_shapes_string.split('/')
+  ]
+  if not side_input_names_string:
+    raise ValueError('When using side_inputs, side_input_names must be '
+                     'specified in the input flags.')
+  side_input_names = side_input_names_string.split(',')
+  if not side_input_types_string:
+    raise ValueError('When using side_inputs, side_input_types must be '
+                     'specified in the input flags.')
+  # `integer` is the documented spelling; `int` is kept for compatibility.
+  typelookup = {'float': tf.float32, 'int': tf.int32, 'integer': tf.int32,
+                'string': tf.string}
+  side_input_types = [
+      typelookup[name.strip()] for name in side_input_types_string.split(',')
+  ]
+  return side_input_shapes, side_input_names, side_input_types
+
+
 def rewrite_nn_resize_op(is_quantized=False):
  """Replaces a custom nearest-neighbor resize op with the Tensorflow version.

@@ -140,6 +188,14 @@ def _image_tensor_input_placeholder(input_shape=None):
  return input_tensor, input_tensor


+def _side_input_tensor_placeholder(side_input_shape, side_input_name,
+                                   side_input_type):
+  """Creates one side input placeholder and returns it twice, as a pair."""
+  placeholder = tf.placeholder(
+      side_input_type, shape=side_input_shape, name=side_input_name)
+  return placeholder, placeholder
+
+
 def _tf_example_input_placeholder(input_shape=None):
  """Returns input that accepts a batch of strings with tf examples.

@@ -200,7 +256,7 @@ input_placeholder_fn_map = {
    'image_tensor': _image_tensor_input_placeholder,
    'encoded_image_string_tensor':
    _encoded_image_string_tensor_input_placeholder,
-    'tf_example': _tf_example_input_placeholder,
+    'tf_example': _tf_example_input_placeholder
 }


@@ -312,7 +368,7 @@ def write_saved_model(saved_model_path,
  Args:
    saved_model_path: Path to write SavedModel.
    frozen_graph_def: tf.GraphDef holding frozen graph.
-    inputs: The input placeholder tensor.
+    inputs: A tensor dictionary containing the inputs to a DetectionModel.
    outputs: A tensor dictionary containing the outputs of a DetectionModel.
  """
  with tf.Graph().as_default():
@@ -322,8 +378,13 @@ def write_saved_model(saved_model_path,

      builder = tf.saved_model.builder.SavedModelBuilder(saved_model_path)

-      tensor_info_inputs = {
-          'inputs': tf.saved_model.utils.build_tensor_info(inputs)}
+      tensor_info_inputs = {}
+      if isinstance(inputs, dict):
+        for k, v in inputs.items():
+          tensor_info_inputs[k] = tf.saved_model.utils.build_tensor_info(v)
+      else:
+        tensor_info_inputs['inputs'] = tf.saved_model.utils.build_tensor_info(
+            inputs)
      tensor_info_outputs = {}
      for k, v in outputs.items():
        tensor_info_outputs[k] = tf.saved_model.utils.build_tensor_info(v)
@@ -364,11 +425,11 @@ def write_graph_and_checkpoint(inference_graph_def,


 def _get_outputs_from_inputs(input_tensors, detection_model,
-                             output_collection_name):
+                             output_collection_name, **side_inputs):
  inputs = tf.cast(input_tensors, dtype=tf.float32)
  preprocessed_inputs, true_image_shapes = detection_model.preprocess(inputs)
  output_tensors = detection_model.predict(
-      preprocessed_inputs, true_image_shapes)
+      preprocessed_inputs, true_image_shapes, **side_inputs)
  postprocessed_tensors = detection_model.postprocess(
      output_tensors, true_image_shapes)
  return add_output_tensor_nodes(postprocessed_tensors,
@@ -376,32 +437,45 @@ def _get_outputs_from_inputs(input_tensors, detection_model,


 def build_detection_graph(input_type, detection_model, input_shape,
-                          output_collection_name, graph_hook_fn):
+                          output_collection_name, graph_hook_fn,
+                          use_side_inputs=False, side_input_shapes=None,
+                          side_input_names=None, side_input_types=None):
  """Build the detection graph."""
  if input_type not in input_placeholder_fn_map:
    raise ValueError('Unknown input type: {}'.format(input_type))
  placeholder_args = {}
+  side_inputs = {}
  if input_shape is not None:
    if (input_type != 'image_tensor' and
        input_type != 'encoded_image_string_tensor' and
-        input_type != 'tf_example'):
+        input_type != 'tf_example' and
+        input_type != 'tf_sequence_example'):
      raise ValueError('Can only specify input shape for `image_tensor`, '
-                       '`encoded_image_string_tensor`, or `tf_example` '
-                       'inputs.')
+                       '`encoded_image_string_tensor`, `tf_example`, '
+                       ' or `tf_sequence_example` inputs.')
    placeholder_args['input_shape'] = input_shape
  placeholder_tensor, input_tensors = input_placeholder_fn_map[input_type](
      **placeholder_args)
+  placeholder_tensors = {'inputs': placeholder_tensor}
+  if use_side_inputs:
+    for idx, side_input_name in enumerate(side_input_names):
+      side_input_placeholder, side_input = _side_input_tensor_placeholder(
+          side_input_shapes[idx], side_input_name, side_input_types[idx])
+      print(side_input)
+      side_inputs[side_input_name] = side_input
+      placeholder_tensors[side_input_name] = side_input_placeholder
  outputs = _get_outputs_from_inputs(
      input_tensors=input_tensors,
      detection_model=detection_model,
-      output_collection_name=output_collection_name)
+      output_collection_name=output_collection_name,
+      **side_inputs)

  # Add global step to the graph.
  slim.get_or_create_global_step()

  if graph_hook_fn: graph_hook_fn()

-  return outputs, placeholder_tensor
+  return outputs, placeholder_tensors


 def _export_inference_graph(input_type,
@@ -414,7 +488,11 @@ def _export_inference_graph(input_type,
                            output_collection_name='inference_op',
                            graph_hook_fn=None,
                            write_inference_graph=False,
-                            temp_checkpoint_prefix=''):
+                            temp_checkpoint_prefix='',
+                            use_side_inputs=False,
+                            side_input_shapes=None,
+                            side_input_names=None,
+                            side_input_types=None):
  """Export helper."""
  tf.gfile.MakeDirs(output_directory)
  frozen_graph_path = os.path.join(output_directory,
@@ -422,12 +500,16 @@ def _export_inference_graph(input_type,
  saved_model_path = os.path.join(output_directory, 'saved_model')
  model_path = os.path.join(output_directory, 'model.ckpt')

-  outputs, placeholder_tensor = build_detection_graph(
+  outputs, placeholder_tensor_dict = build_detection_graph(
      input_type=input_type,
      detection_model=detection_model,
      input_shape=input_shape,
      output_collection_name=output_collection_name,
-      graph_hook_fn=graph_hook_fn)
+      graph_hook_fn=graph_hook_fn,
+      use_side_inputs=use_side_inputs,
+      side_input_shapes=side_input_shapes,
+      side_input_names=side_input_names,
+      side_input_types=side_input_types)

  profile_inference_graph(tf.get_default_graph())
  saver_kwargs = {}
@@ -464,7 +546,8 @@ def _export_inference_graph(input_type,
      f.write(str(inference_graph_def))

  if additional_output_tensor_names is not None:
-    output_node_names = ','.join(outputs.keys()+additional_output_tensor_names)
+    output_node_names = ','.join(list(outputs.keys())+(
+        additional_output_tensor_names))
  else:
    output_node_names = ','.join(outputs.keys())

@@ -480,7 +563,7 @@ def _export_inference_graph(input_type,
      initializer_nodes='')

  write_saved_model(saved_model_path, frozen_graph_def,
-                    placeholder_tensor, outputs)
+                    placeholder_tensor_dict, outputs)


 def export_inference_graph(input_type,
@@ -490,7 +573,11 @@ def export_inference_graph(input_type,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
-                           write_inference_graph=False):
+                           write_inference_graph=False,
+                           use_side_inputs=False,
+                           side_input_shapes=None,
+                           side_input_names=None,
+                           side_input_types=None):
  """Exports inference graph for the model specified in the pipeline config.

  Args:
@@ -506,6 +593,13 @@ def export_inference_graph(input_type,
    additional_output_tensor_names: list of additional output
      tensors to include in the frozen graph.
    write_inference_graph: If true, writes inference graph to disk.
+    use_side_inputs: If True, the model requires side_inputs.
+    side_input_shapes: List of shapes of the side input tensors,
+      required if use_side_inputs is True.
+    side_input_names: List of names of the side input tensors,
+      required if use_side_inputs is True.
+    side_input_types: List of types of the side input tensors,
+      required if use_side_inputs is True.
  """
  detection_model = model_builder.build(pipeline_config.model,
                                        is_training=False)
@@ -524,7 +618,11 @@ def export_inference_graph(input_type,
      input_shape,
      output_collection_name,
      graph_hook_fn=graph_rewriter_fn,
-      write_inference_graph=write_inference_graph)
+      write_inference_graph=write_inference_graph,
+      use_side_inputs=use_side_inputs,
+      side_input_shapes=side_input_shapes,
+      side_input_names=side_input_names,
+      side_input_types=side_input_types)
  pipeline_config.eval_config.use_moving_averages = False
  config_util.save_pipeline_config(pipeline_config, output_directory)


--- a/research/object_detection/exporter_lib_tf2_test.py
+++ b/research/object_detection/exporter_lib_tf2_test.py
+# Lint as: python2, python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Test for exporter_lib_v2.py."""
+
+from __future__ import division
+import io
+import os
+import unittest
+from absl.testing import parameterized
+import numpy as np
+from PIL import Image
+import six
+
+import tensorflow.compat.v2 as tf
+
+from object_detection import exporter_lib_v2
+from object_detection.builders import model_builder
+from object_detection.core import model
+from object_detection.core import standard_fields as fields
+from object_detection.protos import pipeline_pb2
+from object_detection.utils import dataset_util
+from object_detection.utils import tf_version
+
+if six.PY2:
+  import mock  # pylint: disable=g-importing-member,g-import-not-at-top
+else:
+  from unittest import mock  # pylint: disable=g-importing-member,g-import-not-at-top
+
+
+class FakeModel(model.DetectionModel):
+  """Minimal DetectionModel stand-in used by the exporter tests.
+
+  `predict` applies a single 1x1 convolution whose kernel is initialized to
+  `conv_weight_scalar`; `postprocess` returns fixed detections whose scores
+  are shifted by the sum of that convolution's output.
+  """
+
+  def __init__(self, conv_weight_scalar=1.0):
+    super(FakeModel, self).__init__(num_classes=2)
+    # Constant kernel initializer keeps the layer output predictable, so tests
+    # can tell which weights were restored from a checkpoint.
+    self._conv = tf.keras.layers.Conv2D(
+        filters=1, kernel_size=1, strides=(1, 1), padding='valid',
+        kernel_initializer=tf.keras.initializers.Constant(
+            value=conv_weight_scalar))
+
+  def preprocess(self, inputs):
+    true_image_shapes = []  # Doesn't matter for the fake model.
+    return tf.identity(inputs), true_image_shapes
+
+  def predict(self, preprocessed_inputs, true_image_shapes):
+    return {'image': self._conv(preprocessed_inputs)}
+
+  def postprocess(self, prediction_dict, true_image_shapes):
+    # Summing the conv output ties the returned scores to the conv weights.
+    predict_tensor_sum = tf.reduce_sum(prediction_dict['image'])
+    with tf.control_dependencies(list(prediction_dict.values())):
+      postprocessed_tensors = {
+          'detection_boxes': tf.constant([[[0.0, 0.0, 0.5, 0.5],
+                                           [0.5, 0.5, 0.8, 0.8]],
+                                          [[0.5, 0.5, 1.0, 1.0],
+                                           [0.0, 0.0, 0.0, 0.0]]], tf.float32),
+          'detection_scores': predict_tensor_sum + tf.constant(
+              [[0.7, 0.6], [0.9, 0.0]], tf.float32),
+          'detection_classes': tf.constant([[0, 1],
+                                            [1, 0]], tf.float32),
+          'num_detections': tf.constant([2, 1], tf.float32),
+      }
+    return postprocessed_tensors
+
+  # The remaining methods are no-op stubs; the tests in this file never use
+  # their results.
+  def restore_map(self, checkpoint_path, fine_tune_checkpoint_type):
+    pass
+
+  def loss(self, prediction_dict, true_image_shapes):
+    pass
+
+  def regularization_losses(self):
+    pass
+
+  def updates(self):
+    pass
+
+
+@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
+class ExportInferenceGraphTest(tf.test.TestCase, parameterized.TestCase):
+  """Tests for `exporter_lib_v2.export_inference_graph` using `FakeModel`."""
+
+  def _save_checkpoint_from_mock_model(
+      self, checkpoint_dir, conv_weight_scalar=6.0):
+    """Saves a `FakeModel` checkpoint (number 0) into `checkpoint_dir`.
+
+    Args:
+      checkpoint_dir: Directory the checkpoint is written to.
+      conv_weight_scalar: Value used to initialize the fake model's conv
+        kernel.
+    """
+    mock_model = FakeModel(conv_weight_scalar)
+    # Run the model end to end once before saving it.
+    fake_image = tf.zeros(shape=[1, 10, 10, 3], dtype=tf.float32)
+    preprocessed_inputs, true_image_shapes = mock_model.preprocess(fake_image)
+    predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
+    mock_model.postprocess(predictions, true_image_shapes)
+
+    ckpt = tf.train.Checkpoint(model=mock_model)
+    exported_checkpoint_manager = tf.train.CheckpointManager(
+        ckpt, checkpoint_dir, max_to_keep=1)
+    exported_checkpoint_manager.save(checkpoint_number=0)
+
+  @parameterized.parameters(
+      {'input_type': 'image_tensor'},
+      {'input_type': 'encoded_image_string_tensor'},
+      {'input_type': 'tf_example'},
+  )
+  def test_export_yields_correct_directory_structure(
+      self, input_type='image_tensor'):
+    """Checks that export writes the saved model, checkpoint and config."""
+    tmp_dir = self.get_temp_dir()
+    self._save_checkpoint_from_mock_model(tmp_dir)
+    with mock.patch.object(
+        model_builder, 'build', autospec=True) as mock_builder:
+      mock_builder.return_value = FakeModel()
+      output_directory = os.path.join(tmp_dir, 'output')
+      pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+      exporter_lib_v2.export_inference_graph(
+          input_type=input_type,
+          pipeline_config=pipeline_config,
+          trained_checkpoint_dir=tmp_dir,
+          output_directory=output_directory)
+      self.assertTrue(os.path.exists(os.path.join(
+          output_directory, 'saved_model', 'saved_model.pb')))
+      self.assertTrue(os.path.exists(os.path.join(
+          output_directory, 'saved_model', 'variables', 'variables.index')))
+      self.assertTrue(os.path.exists(os.path.join(
+          output_directory, 'saved_model', 'variables',
+          'variables.data-00000-of-00001')))
+      self.assertTrue(os.path.exists(os.path.join(
+          output_directory, 'checkpoint', 'ckpt-0.index')))
+      self.assertTrue(os.path.exists(os.path.join(
+          output_directory, 'checkpoint', 'ckpt-0.data-00000-of-00001')))
+      self.assertTrue(os.path.exists(os.path.join(
+          output_directory, 'pipeline.config')))
+
+  def get_dummy_input(self, input_type):
+    """Get dummy input for the given input type.
+
+    Args:
+      input_type: One of 'image_tensor', 'float_image_tensor',
+        'encoded_image_string_tensor' or 'tf_example'.
+
+    Returns:
+      A 20x20 3-channel dummy image in the form used for `input_type`: a
+      numpy array for the two image tensor types, otherwise a one-element
+      list holding the encoded image or serialized tf.Example.
+
+    Raises:
+      ValueError: if `input_type` is not one of the types listed above.
+    """
+
+    if input_type == 'image_tensor':
+      return np.zeros(shape=(1, 20, 20, 3), dtype=np.uint8)
+    elif input_type == 'float_image_tensor':
+      return np.zeros(shape=(1, 20, 20, 3), dtype=np.float32)
+    elif input_type == 'encoded_image_string_tensor':
+      image = Image.new('RGB', (20, 20))
+      byte_io = io.BytesIO()
+      image.save(byte_io, 'PNG')
+      return [byte_io.getvalue()]
+    elif input_type == 'tf_example':
+      image_tensor = tf.zeros((20, 20, 3), dtype=tf.uint8)
+      encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).numpy()
+      example = tf.train.Example(
+          features=tf.train.Features(
+              feature={
+                  'image/encoded':
+                  dataset_util.bytes_feature(encoded_jpeg),
+                  'image/format':
+                  dataset_util.bytes_feature(six.b('jpeg')),
+                  'image/source_id':
+                  dataset_util.bytes_feature(six.b('image_id')),
+              })).SerializeToString()
+      return [example]
+    else:
+      # Fail loudly instead of handing `None` to the exported function.
+      raise ValueError('Unknown input type: {}'.format(input_type))
+
+  @parameterized.parameters(
+      {'input_type': 'image_tensor'},
+      {'input_type': 'encoded_image_string_tensor'},
+      {'input_type': 'tf_example'},
+      {'input_type': 'float_image_tensor'},
+  )
+  def test_export_saved_model_and_run_inference(
+      self, input_type='image_tensor'):
+    """Exports a SavedModel, reloads it and checks the detections."""
+    tmp_dir = self.get_temp_dir()
+    self._save_checkpoint_from_mock_model(tmp_dir)
+    with mock.patch.object(
+        model_builder, 'build', autospec=True) as mock_builder:
+      mock_builder.return_value = FakeModel()
+      output_directory = os.path.join(tmp_dir, 'output')
+      pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+      exporter_lib_v2.export_inference_graph(
+          input_type=input_type,
+          pipeline_config=pipeline_config,
+          trained_checkpoint_dir=tmp_dir,
+          output_directory=output_directory)
+
+      saved_model_path = os.path.join(output_directory, 'saved_model')
+      detect_fn = tf.saved_model.load(saved_model_path)
+      image = self.get_dummy_input(input_type)
+      detections = detect_fn(image)
+
+      detection_fields = fields.DetectionResultFields
+      self.assertAllClose(detections[detection_fields.detection_boxes],
+                          [[[0.0, 0.0, 0.5, 0.5],
+                            [0.5, 0.5, 0.8, 0.8]],
+                           [[0.5, 0.5, 1.0, 1.0],
+                            [0.0, 0.0, 0.0, 0.0]]])
+      self.assertAllClose(detections[detection_fields.detection_scores],
+                          [[0.7, 0.6], [0.9, 0.0]])
+      # The exported classes are FakeModel's [[0, 1], [1, 0]] shifted up by 1.
+      self.assertAllClose(detections[detection_fields.detection_classes],
+                          [[1, 2], [2, 1]])
+      self.assertAllClose(detections[detection_fields.num_detections], [2, 1])
+
+  def test_export_checkpoint_and_run_inference_with_image(self):
+    """Restores the exported checkpoint and checks it holds trained weights."""
+    tmp_dir = self.get_temp_dir()
+    self._save_checkpoint_from_mock_model(tmp_dir, conv_weight_scalar=2.0)
+    with mock.patch.object(
+        model_builder, 'build', autospec=True) as mock_builder:
+      mock_builder.return_value = FakeModel()
+      output_directory = os.path.join(tmp_dir, 'output')
+      pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+      exporter_lib_v2.export_inference_graph(
+          input_type='image_tensor',
+          pipeline_config=pipeline_config,
+          trained_checkpoint_dir=tmp_dir,
+          output_directory=output_directory)
+
+      # A fresh FakeModel starts with the default weight of 1.0; only a
+      # successful restore of the exported checkpoint gives it 2.0.
+      mock_model = FakeModel()
+      ckpt = tf.train.Checkpoint(
+          model=mock_model)
+      checkpoint_dir = os.path.join(tmp_dir, 'output', 'checkpoint')
+      manager = tf.train.CheckpointManager(
+          ckpt, checkpoint_dir, max_to_keep=7)
+      ckpt.restore(manager.latest_checkpoint).expect_partial()
+
+      fake_image = tf.ones(shape=[1, 5, 5, 3], dtype=tf.float32)
+      preprocessed_inputs, true_image_shapes = mock_model.preprocess(fake_image)
+      predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
+      detections = mock_model.postprocess(predictions, true_image_shapes)
+
+      # 150 = conv_weight_scalar * height * width * channels = 2 * 5 * 5 * 3.
+      self.assertAllClose(detections['detection_scores'],
+                          [[150 + 0.7, 150 + 0.6], [150 + 0.9, 150 + 0.0]])
+
+
+if __name__ == '__main__':
+  # Switch on TF2 behavior before handing control to the test runner.
+  tf.enable_v2_behavior()
+  tf.test.main()
--- a/research/object_detection/exporter_lib_v2.py
+++ b/research/object_detection/exporter_lib_v2.py
+# Lint as: python2, python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functions to export object detection inference graph."""
+import os
+import tensorflow.compat.v2 as tf
+from object_detection.builders import model_builder
+from object_detection.core import standard_fields as fields
+from object_detection.data_decoders import tf_example_decoder
+from object_detection.utils import config_util
+
+
+def _decode_image(encoded_image_string_tensor):
+  """Decodes an encoded image string into a 3-channel image tensor."""
+  decoded = tf.image.decode_image(encoded_image_string_tensor, channels=3)
+  # Record the static rank and channel count on the decoded tensor.
+  decoded.set_shape((None, None, 3))
+  return decoded
+
+
+def _decode_tf_example(tf_example_string_tensor):
+  """Decodes a serialized tf.Example and returns its image tensor."""
+  decoder = tf_example_decoder.TfExampleDecoder()
+  decoded_fields = decoder.decode(tf_example_string_tensor)
+  return decoded_fields[fields.InputDataFields.image]
+
+
+class DetectionInferenceModule(tf.Module):
+  """Detection Inference Module."""
+
+  def __init__(self, detection_model):
+    """Initializes a module for detection.
+
+    Args:
+      detection_model: The detection model to use for inference.
+    """
+    self._model = detection_model
+
+  def _run_inference_on_images(self, image):
+    """Cast image to float and run inference.
+
+    Args:
+      image: uint8 or float32 Tensor of shape [1, None, None, 3]
+    Returns:
+      Tensor dictionary holding detections, with every value cast to float32
+      and the detection classes shifted up by `label_id_offset`.
+    """
+    # Added to the model's class ids below, so exported ids are one higher
+    # than the ids the model returns.
+    label_id_offset = 1
+
+    image = tf.cast(image, tf.float32)
+    image, shapes = self._model.preprocess(image)
+    prediction_dict = self._model.predict(image, shapes)
+    detections = self._model.postprocess(prediction_dict, shapes)
+    classes_field = fields.DetectionResultFields.detection_classes
+    detections[classes_field] = (
+        tf.cast(detections[classes_field], tf.float32) + label_id_offset)
+
+    # Cast every output to float32; existing keys are only reassigned, so the
+    # dict does not change size while it is being iterated.
+    for key, val in detections.items():
+      detections[key] = tf.cast(val, tf.float32)
+
+    return detections
+
+
+class DetectionFromImageModule(DetectionInferenceModule):
+  """Detection Inference Module for image inputs."""
+
+  # The signature fixes the first dimension to 1 and the last to 3, leaving the
+  # two middle dimensions dynamic; inputs must be uint8.
+  @tf.function(
+      input_signature=[
+          tf.TensorSpec(shape=[1, None, None, 3], dtype=tf.uint8)])
+  def __call__(self, input_tensor):
+    return self._run_inference_on_images(input_tensor)
+
+
+class DetectionFromFloatImageModule(DetectionInferenceModule):
+  """Detection Inference Module for float image inputs."""
+
+  # Same shape constraint as the uint8 image module, but inputs are float32.
+  @tf.function(
+      input_signature=[
+          tf.TensorSpec(shape=[1, None, None, 3], dtype=tf.float32)])
+  def __call__(self, input_tensor):
+    return self._run_inference_on_images(input_tensor)
+
+
+class DetectionFromEncodedImageModule(DetectionInferenceModule):
+  """Detection Inference Module for encoded image string inputs."""
+
+  # The signature accepts exactly one encoded image string (shape [1]).
+  @tf.function(input_signature=[tf.TensorSpec(shape=[1], dtype=tf.string)])
+  def __call__(self, input_tensor):
+    # Decoding is placed on the CPU; `_decode_image` is mapped over the
+    # elements of `input_tensor` and yields uint8 images.
+    with tf.device('cpu:0'):
+      image = tf.map_fn(
+          _decode_image,
+          elems=input_tensor,
+          dtype=tf.uint8,
+          parallel_iterations=32,
+          back_prop=False)
+    return self._run_inference_on_images(image)
+
+
+class DetectionFromTFExampleModule(DetectionInferenceModule):
+  """Detection Inference Module for TF.Example inputs."""
+
+  # The signature accepts exactly one serialized example string (shape [1]).
+  @tf.function(input_signature=[tf.TensorSpec(shape=[1], dtype=tf.string)])
+  def __call__(self, input_tensor):
+    # Decoding is placed on the CPU; `_decode_tf_example` is mapped over the
+    # elements of `input_tensor` and yields the uint8 image of each example.
+    with tf.device('cpu:0'):
+      image = tf.map_fn(
+          _decode_tf_example,
+          elems=input_tensor,
+          dtype=tf.uint8,
+          parallel_iterations=32,
+          back_prop=False)
+    return self._run_inference_on_images(image)
+
+# Maps each supported `input_type` string to the inference module class whose
+# `__call__` signature accepts that kind of input.
+DETECTION_MODULE_MAP = {
+    'image_tensor': DetectionFromImageModule,
+    'encoded_image_string_tensor':
+    DetectionFromEncodedImageModule,
+    'tf_example': DetectionFromTFExampleModule,
+    'float_image_tensor': DetectionFromFloatImageModule
+}
+
+
+def export_inference_graph(input_type,
+                           pipeline_config,
+                           trained_checkpoint_dir,
+                           output_directory):
+  """Exports inference graph for the model specified in the pipeline config.
+
+  This function creates `output_directory` if it does not already exist,
+  which will hold a copy of the pipeline config with filename `pipeline.config`,
+  and two subdirectories named `checkpoint` and `saved_model`
+  (containing the exported checkpoint and SavedModel respectively).
+
+  Args:
+    input_type: Type of input for the graph. Must be a key of
+      `DETECTION_MODULE_MAP`, i.e. one of ['image_tensor',
+      'encoded_image_string_tensor', 'tf_example', 'float_image_tensor'].
+    pipeline_config: pipeline_pb2.TrainEvalPipelineConfig proto.
+    trained_checkpoint_dir: Path to the directory holding the trained
+      checkpoint; the latest checkpoint found there is restored.
+    output_directory: Path to write outputs.
+  Raises:
+    ValueError: if input_type is invalid.
+  """
+  # Reject a bad `input_type` up front, before paying for the model build and
+  # the checkpoint restore.
+  if input_type not in DETECTION_MODULE_MAP:
+    raise ValueError('Unrecognized `input_type`')
+
+  output_checkpoint_directory = os.path.join(output_directory, 'checkpoint')
+  output_saved_model_directory = os.path.join(output_directory, 'saved_model')
+
+  detection_model = model_builder.build(pipeline_config.model,
+                                        is_training=False)
+
+  ckpt = tf.train.Checkpoint(
+      model=detection_model)
+  manager = tf.train.CheckpointManager(
+      ckpt, trained_checkpoint_dir, max_to_keep=1)
+  status = ckpt.restore(manager.latest_checkpoint).expect_partial()
+
+  detection_module = DETECTION_MODULE_MAP[input_type](detection_model)
+  # Getting the concrete function traces the graph and forces variables to
+  # be constructed --- only after this can we save the checkpoint and
+  # saved model.
+  concrete_function = detection_module.__call__.get_concrete_function()
+  status.assert_existing_objects_matched()
+
+  exported_checkpoint_manager = tf.train.CheckpointManager(
+      ckpt, output_checkpoint_directory, max_to_keep=1)
+  exported_checkpoint_manager.save(checkpoint_number=0)
+
+  tf.saved_model.save(detection_module,
+                      output_saved_model_directory,
+                      signatures=concrete_function)
+
+  config_util.save_pipeline_config(pipeline_config, output_directory)
--- a/research/object_detection/exporter_main_v2.py
+++ b/research/object_detection/exporter_main_v2.py
+# Lint as: python2, python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+r"""Tool to export an object detection model for inference.
+
+Prepares an object detection tensorflow graph for inference using model
+configuration and a trained checkpoint. Outputs associated checkpoint files,
+a SavedModel, and a copy of the model config.
+
+The inference graph contains one of four input nodes depending on the user
+specified option.
+  * `image_tensor`: Accepts a uint8 4-D tensor of shape [1, None, None, 3]
+  * `float_image_tensor`: Accepts a float32 4-D tensor of shape
+    [1, None, None, 3]
+  * `encoded_image_string_tensor`: Accepts a 1-D string tensor of shape [1]
+    containing a single encoded PNG or JPEG image.
+  * `tf_example`: Accepts a 1-D string tensor of shape [1] containing a
+    single serialized TFExample proto.
+
+and the following output nodes returned by the model.postprocess(..):
+  * `num_detections`: Outputs float32 tensors of the form [batch]
+      that specifies the number of valid boxes per image in the batch.
+  * `detection_boxes`: Outputs float32 tensors of the form
+      [batch, num_boxes, 4] containing detected boxes.
+  * `detection_scores`: Outputs float32 tensors of the form
+      [batch, num_boxes] containing class scores for the detections.
+  * `detection_classes`: Outputs float32 tensors of the form
+      [batch, num_boxes] containing classes for the detections.
+
+
+Example Usage:
+--------------
+python exporter_main_v2.py \
+    --input_type image_tensor \
+    --pipeline_config_path path/to/ssd_inception_v2.config \
+    --trained_checkpoint_dir path/to/checkpoint \
+    --output_directory path/to/exported_model_directory
+
+The expected output would be in the directory
+path/to/exported_model_directory (which is created if it does not exist)
+holding two subdirectories (corresponding to checkpoint and SavedModel,
+respectively) and a copy of the pipeline config.
+
+Config overrides (see the `config_override` flag) are text protobufs
+(also of type pipeline_pb2.TrainEvalPipelineConfig) which are used to override
+certain fields in the provided pipeline_config_path.  These are useful for
+making small changes to the inference graph that differ from the training or
+eval config.
+
+Example Usage (in which we change the second stage post-processing score
+threshold to be 0.5):
+
+python exporter_main_v2.py \
+    --input_type image_tensor \
+    --pipeline_config_path path/to/ssd_inception_v2.config \
+    --trained_checkpoint_dir path/to/checkpoint \
+    --output_directory path/to/exported_model_directory \
+    --config_override " \
+            model{ \
+              faster_rcnn { \
+                second_stage_post_processing { \
+                  batch_non_max_suppression { \
+                    score_threshold: 0.5 \
+                  } \
+                } \
+              } \
+            }"
+"""
+from absl import app
+from absl import flags
+
+import tensorflow.compat.v2 as tf
+from google.protobuf import text_format
+from object_detection import exporter_lib_v2
+from object_detection.protos import pipeline_pb2
+
+# Enabled at import time, before any flags are defined or parsed.
+tf.enable_v2_behavior()
+
+
+FLAGS = flags.FLAGS
+
+# Command-line flags; `main` passes their values to
+# exporter_lib_v2.export_inference_graph.
+flags.DEFINE_string('input_type', 'image_tensor', 'Type of input node. Can be '
+                    'one of [`image_tensor`, `encoded_image_string_tensor`, '
+                    '`tf_example`, `float_image_tensor`]')
+flags.DEFINE_string('pipeline_config_path', None,
+                    'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
+                    'file.')
+flags.DEFINE_string('trained_checkpoint_dir', None,
+                    'Path to trained checkpoint directory')
+flags.DEFINE_string('output_directory', None, 'Path to write outputs.')
+flags.DEFINE_string('config_override', '',
+                    'pipeline_pb2.TrainEvalPipelineConfig '
+                    'text proto to override pipeline_config_path.')
+
+# These three flags default to None and must be supplied on the command line.
+flags.mark_flag_as_required('pipeline_config_path')
+flags.mark_flag_as_required('trained_checkpoint_dir')
+flags.mark_flag_as_required('output_directory')
+
+
+def main(_):
+  """Reads the pipeline config, merges overrides and exports the model."""
+  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+  with tf.io.gfile.GFile(FLAGS.pipeline_config_path, 'r') as f:
+    text_format.Merge(f.read(), pipeline_config)
+  # The override text proto is merged after the file contents, into the same
+  # message.
+  text_format.Merge(FLAGS.config_override, pipeline_config)
+  exporter_lib_v2.export_inference_graph(
+      FLAGS.input_type, pipeline_config, FLAGS.trained_checkpoint_dir,
+      FLAGS.output_directory)
+
+
+if __name__ == '__main__':
+  # absl parses the command-line flags and then calls `main`.
+  app.run(main)
--- a/research/object_detection/exporter_test.py
+++ b/research/object_detection/exporter_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
+import unittest
 import numpy as np
 import six
 import tensorflow.compat.v1 as tf
@@ -33,12 +34,13 @@ from object_detection.core import model
 from object_detection.protos import graph_rewriter_pb2
 from object_detection.protos import pipeline_pb2
 from object_detection.utils import ops
+from object_detection.utils import tf_version
 from object_detection.utils import variables_helper

 if six.PY2:
  import mock  # pylint: disable=g-import-not-at-top
 else:
-  from unittest import mock  # pylint: disable=g-import-not-at-top
+  mock = unittest.mock  # pylint: disable=g-import-not-at-top, g-importing-member

 # pylint: disable=g-import-not-at-top
 try:
@@ -113,6 +115,7 @@ class FakeModel(model.DetectionModel):
    pass


+@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
 class ExportInferenceGraphTest(tf.test.TestCase):

  def _save_checkpoint_from_mock_model(self,

--- a/research/object_detection/g3doc/context_rcnn.md
+++ b/research/object_detection/g3doc/context_rcnn.md
+# Context R-CNN
+
+Context R-CNN is an object detection model that uses contextual features to
+improve object detection. See https://arxiv.org/abs/1912.03538 for more details.
+
+## Table of Contents
+
+*   [Preparing Context Data for Context R-CNN](#preparing-context-data-for-context-r-cnn)
+    +   [Generating TfRecords from a set of images and a COCO-CameraTraps style
+        JSON](#generating-tfrecords-from-a-set-of-images-and-a-coco-cameratraps-style-json)
+    +   [Generating weakly-supervised bounding box labels for image-labeled data](#generating-weakly-supervised-bounding-box-labels-for-image-labeled-data)
+    +   [Generating and saving contextual features for each image](#generating-and-saving-contextual-features-for-each-image)
+    +   [Building up contextual memory banks and storing them for each context
+        group](#building-up-contextual-memory-banks-and-storing-them-for-each-context-group)
+*   [Training a Context R-CNN Model](#training-a-context-r-cnn-model)
+*   [Exporting a Context R-CNN Model](#exporting-a-context-r-cnn-model)
+
+## Preparing Context Data for Context R-CNN
+
+In this section, we will walk through the process of generating TfRecords with
+contextual features. We focus on building context from object-centric features
+generated with a pre-trained Faster R-CNN model, but you can adapt the provided
+code to use alternative feature extractors.
+
+### Generating TfRecords from a set of images and a COCO-CameraTraps style JSON
+
+If your data is already stored in TfRecords, you can skip this first step.
+
+We assume a COCO-CameraTraps json format, as described on
+[LILA.science](https://github.com/microsoft/CameraTraps/blob/master/data_management/README.md).
+
+COCO-CameraTraps is a format that adds static-camera-specific fields, such as a
+location ID and datetime, to the well-established COCO format. To generate
+appropriate context later on, be sure you have specified each contextual group
+with a different location ID, which in the static camera case would be the ID of
+the camera, as well as the datetime each photo was taken. We assume that empty
+images will be labeled 'empty' with class id 0.
+
+To generate TfRecords from your database and local image folder, run
+
+```
+python object_detection/dataset_tools/context_rcnn/create_cococameratraps_tfexample_main.py \
+  --alsologtostderr \
+  --output_tfrecord_prefix="/path/to/output/tfrecord/location/prefix" \
+  --image_directory="/path/to/image/folder/" \
+  --input_annotations_file="path/to/annotations.json"
+```
+
+### Generating weakly-supervised bounding box labels for image-labeled data
+
+If all your data already has bounding box labels you can skip this step.
+
+Many camera trap datasets do not have bounding box labels, or only have bounding
+box labels for some of the data. We have provided code to add bounding boxes
+from a pretrained model (such as the
+[Microsoft AI for Earth MegaDetector](https://github.com/microsoft/CameraTraps/blob/master/megadetector.md))
+and match the boxes to the image-level class label.
+
+To export your pretrained detection model, run
+
+```
+python object_detection/export_inference_graph.py \
+  --alsologtostderr \
+  --input_type tf_example \
+  --pipeline_config_path path/to/faster_rcnn_model.config \
+  --trained_checkpoint_prefix path/to/model.ckpt \
+  --output_directory path/to/exported_model_directory
+```
+
+To add bounding boxes to your dataset using the above model, run
+
+```
+python object_detection/dataset_tools/context_rcnn/generate_detection_data.py \
+  --alsologtostderr \
+  --input_tfrecord path/to/input_tfrecord@X \
+  --output_tfrecord path/to/output_tfrecord@X \
+  --model_dir path/to/exported_model_directory/saved_model
+```
+
+If an image already has bounding box labels, those labels are left unchanged. If
+an image is labeled 'empty' (class ID 0), we will not generate boxes for that
+image.
+
+### Generating and saving contextual features for each image
+
+We next extract and store features for each image from a pretrained model. This
+model can be the same model as above, or be a class-specific detection model
+trained on data from your classes of interest.
+
+To export your pretrained detection model, run
+
+```
+python object_detection/export_inference_graph.py \
+  --alsologtostderr \
+  --input_type tf_example \
+  --pipeline_config_path path/to/pipeline.config \
+  --trained_checkpoint_prefix path/to/model.ckpt \
+  --output_directory path/to/exported_model_directory \
+  --additional_output_tensor_names detection_features
+```
+
+To generate and save contextual features for your data, run
+
+```
+python object_detection/dataset_tools/context_rcnn/generate_embedding_data.py \
+  --alsologtostderr \
+  --embedding_input_tfrecord path/to/input_tfrecords* \
+  --embedding_output_tfrecord path/to/output_tfrecords \
+  --embedding_model_dir path/to/exported_model_directory/saved_model
+```
+
+### Building up contextual memory banks and storing them for each context group
+
+To build the context features into memory banks, run
+
+```
+python object_detection/dataset_tools/context_rcnn/add_context_to_examples.py \
+  --input_tfrecord path/to/input_tfrecords* \
+  --output_tfrecord path/to/output_tfrecords \
+  --sequence_key image/location \
+  --time_horizon month
+```
+
+For all options, see add_context_to_examples.py. By default, this code builds
+TfSequenceExamples, which are more data efficient (this allows you to store the
+context features once for each context group, as opposed to once per image). If
+you would like to export TfExamples instead, set flag `--output_type
+tf_example`.
+
+If you use TfSequenceExamples, you must be sure to set `input_type:
+TF_SEQUENCE_EXAMPLE` within your Context R-CNN configs for both
+train_input_reader and test_input_reader. See
+`object_detection/test_data/context_rcnn_camera_trap.config`
+for an example.
+
+## Training a Context R-CNN Model
+
+To train a Context R-CNN model, you must first set up your config file. See
+`object_detection/test_data/context_rcnn_camera_trap.config` for an example.
+The important difference between this config and a Faster R-CNN config is the
+inclusion of a `context_config` within the model, which defines the necessary
+Context R-CNN parameters.
+
+```
+context_config {
+  max_num_context_features: 2000
+  context_feature_length: 2057
+}
+```
+
+Once your config file has been updated with your local paths, you can follow
+along with documentation for running [locally](running_locally.md), or
+[on the cloud](running_on_cloud.md).
+
+## Exporting a Context R-CNN Model
+
+Since Context R-CNN takes context features as well as images as input, we have
+to explicitly define the other inputs ("side_inputs") to the model when
+exporting, as below. This example is shown with default context feature shapes.
+
+```
+python object_detection/export_inference_graph.py \
+    --input_type image_tensor \
+    --input_shape 1,-1,-1,3 \
+    --pipeline_config_path /path/to/context_rcnn_model/pipeline.config \
+    --trained_checkpoint_prefix /path/to/context_rcnn_model/model.ckpt \
+    --output_directory /path/to/output_directory \
+    --use_side_inputs True \
+    --side_input_shapes 1,2000,2057/1 \
+    --side_input_names context_features,valid_context_size \
+    --side_input_types float,int
+
+```
--- a/research/object_detection/g3doc/detection_model_zoo.md
+++ b/research/object_detection/g3doc/detection_model_zoo.md
 # Tensorflow detection model zoo

-We provide a collection of detection models pre-trained on the [COCO
-dataset](http://cocodataset.org), the [Kitti dataset](http://www.cvlibs.net/datasets/kitti/),
-the
+We provide a collection of detection models pre-trained on the
+[COCO dataset](http://cocodataset.org), the
+[Kitti dataset](http://www.cvlibs.net/datasets/kitti/), the
 [Open Images dataset](https://storage.googleapis.com/openimages/web/index.html),
-the [AVA v2.1 dataset](https://research.google.com/ava/) and the
-[iNaturalist Species Detection Dataset](https://github.com/visipedia/inat_comp/blob/master/2017/README.md#bounding-boxes).
+the [AVA v2.1 dataset](https://research.google.com/ava/), the
+[iNaturalist Species Detection Dataset](https://github.com/visipedia/inat_comp/blob/master/2017/README.md#bounding-boxes)
+and the
+[Snapshot Serengeti Dataset](http://lila.science/datasets/snapshot-serengeti).
 These models can be useful for out-of-the-box inference if you are interested in
 categories already in those datasets. They are also useful for initializing your
 models when training on novel datasets.
@@ -15,17 +17,17 @@ In the table below, we list each such pre-trained model including:
 *   a model name that corresponds to a config file that was used to train this
    model in the `samples/configs` directory,
 *   a download link to a tar.gz file containing the pre-trained model,
-* model speed --- we report running time in ms per 600x600 image (including all
-  pre and post-processing), but please be
-  aware that these timings depend highly on one's specific hardware
-  configuration (these timings were performed using an Nvidia
-  GeForce GTX TITAN X card) and should be treated more as relative timings in
-  many cases. Also note that desktop GPU timing does not always reflect mobile
-  run time. For example Mobilenet V2 is faster on mobile devices than Mobilenet
-  V1, but is slightly slower on desktop GPU.
-* detector performance on subset of the COCO validation set or Open Images test split as measured by the dataset-specific mAP measure.
-  Here, higher is better, and we only report bounding box mAP rounded to the
-  nearest integer.
+*   model speed --- we report running time in ms per 600x600 image (including
+    all pre and post-processing), but please be aware that these timings depend
+    highly on one's specific hardware configuration (these timings were
+    performed using an Nvidia GeForce GTX TITAN X card) and should be treated
+    more as relative timings in many cases. Also note that desktop GPU timing
+    does not always reflect mobile run time. For example Mobilenet V2 is faster
+    on mobile devices than Mobilenet V1, but is slightly slower on desktop GPU.
+*   detector performance on a subset of the COCO validation set, Open Images
+    test split, iNaturalist test split, or Snapshot Serengeti LILA.science test
+    split, as measured by the dataset-specific mAP measure. Here, higher is
+    better, and we only report bounding box mAP rounded to the nearest integer.
 *   Output types (`Boxes`, and `Masks` if applicable)

 You can un-tar each tar.gz file via, e.g.,:
@@ -53,57 +55,59 @@ Inside the un-tar'ed directory, you will find:

 Some remarks on frozen inference graphs:

-* If you try to evaluate the frozen graph, you may find performance numbers for
-  some of the models to be slightly lower than what we report in the below
-  tables.  This is because we discard detections with scores below a
-  threshold (typically 0.3) when creating the frozen graph.  This corresponds
-  effectively to picking a point on the precision recall curve of
-  a detector (and discarding the part past that point), which negatively impacts
-  standard mAP metrics.
+*   If you try to evaluate the frozen graph, you may find performance numbers
+    for some of the models to be slightly lower than what we report in the below
+    tables. This is because we discard detections with scores below a threshold
+    (typically 0.3) when creating the frozen graph. This corresponds effectively
+    to picking a point on the precision recall curve of a detector (and
+    discarding the part past that point), which negatively impacts standard mAP
+    metrics.
 *   Our frozen inference graphs are generated using the
-  [v1.12.0](https://github.com/tensorflow/tensorflow/tree/v1.12.0)
-  release version of Tensorflow and we do not guarantee that these will work
-  with other versions; this being said, each frozen inference graph can be
+    [v1.12.0](https://github.com/tensorflow/tensorflow/tree/v1.12.0) release
+    version of Tensorflow and we do not guarantee that these will work with
+    other versions; this being said, each frozen inference graph can be
    regenerated using your current version of Tensorflow by re-running the
    [exporter](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/exporting_models.md),
-  pointing it at the model directory as well as the corresponding config file in
+    pointing it at the model directory as well as the corresponding config file
+    in
    [samples/configs](https://github.com/tensorflow/models/tree/master/research/object_detection/samples/configs).

-
 ## COCO-trained models

-| Model name  | Speed (ms) | COCO mAP[^1] | Outputs |
-| ------------ | :--------------: | :--------------: | :-------------: |
-| [ssd_mobilenet_v1_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz) | 30 | 21 | Boxes |
-| [ssd_mobilenet_v1_0.75_depth_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_300x300_coco14_sync_2018_07_03.tar.gz) | 26 | 18 | Boxes |
-| [ssd_mobilenet_v1_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz) | 29 | 18 | Boxes |
-| [ssd_mobilenet_v1_0.75_depth_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_quantized_300x300_coco14_sync_2018_07_18.tar.gz) | 29 | 16 | Boxes |
-| [ssd_mobilenet_v1_ppn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_ppn_shared_box_predictor_300x300_coco14_sync_2018_07_03.tar.gz) | 26 | 20 | Boxes |
-| [ssd_mobilenet_v1_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz) | 56 | 32 | Boxes |
-| [ssd_resnet_50_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz) | 76 | 35 | Boxes |
-| [ssd_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz) | 31 | 22 | Boxes |
-| [ssd_mobilenet_v2_quantized_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz) | 29 | 22 | Boxes |
-| [ssdlite_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobilenet_v2_coco_2018_05_09.tar.gz) | 27 | 22 | Boxes |
-| [ssd_inception_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2018_01_28.tar.gz) | 42 | 24 | Boxes |
-| [faster_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 58 | 28 | Boxes |
-| [faster_rcnn_resnet50_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz) | 89 | 30 | Boxes |
-| [faster_rcnn_resnet50_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_lowproposals_coco_2018_01_28.tar.gz) | 64 |  | Boxes |
-| [rfcn_resnet101_coco](http://download.tensorflow.org/models/object_detection/rfcn_resnet101_coco_2018_01_28.tar.gz)  | 92 | 30 | Boxes |
-| [faster_rcnn_resnet101_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_2018_01_28.tar.gz) | 106 | 32 | Boxes |
-| [faster_rcnn_resnet101_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_lowproposals_coco_2018_01_28.tar.gz) | 82 |  | Boxes |
-| [faster_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 620 | 37 | Boxes |
-| [faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco_2018_01_28.tar.gz) | 241 |  | Boxes |
-| [faster_rcnn_nas](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_coco_2018_01_28.tar.gz) | 1833 | 43 | Boxes |
-| [faster_rcnn_nas_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_lowproposals_coco_2018_01_28.tar.gz) | 540 |  | Boxes |
-| [mask_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 771 | 36 | Masks |
-| [mask_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 79 | 25 | Masks |
-| [mask_rcnn_resnet101_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet101_atrous_coco_2018_01_28.tar.gz) | 470 | 33 | Masks |
-| [mask_rcnn_resnet50_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet50_atrous_coco_2018_01_28.tar.gz) | 343 | 29 | Masks |
-
-Note: The asterisk (☆) at the end of model name indicates that this model supports TPU training.
-
-Note: If you download the tar.gz file of quantized models and un-tar, you will get different set of files - a checkpoint, a config file and tflite frozen graphs (txt/binary).
-
+Model name                                                                                                                                                                                    | Speed (ms) | COCO mAP[^1] | Outputs
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :----------: | :-----:
+[ssd_mobilenet_v1_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz)                                                                       | 30         | 21           | Boxes
+[ssd_mobilenet_v1_0.75_depth_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_300x300_coco14_sync_2018_07_03.tar.gz)                                | 26         | 18           | Boxes
+[ssd_mobilenet_v1_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz)                                  | 29         | 18           | Boxes
+[ssd_mobilenet_v1_0.75_depth_quantized_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_0.75_depth_quantized_300x300_coco14_sync_2018_07_18.tar.gz)            | 29         | 16           | Boxes
+[ssd_mobilenet_v1_ppn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_ppn_shared_box_predictor_300x300_coco14_sync_2018_07_03.tar.gz)                         | 26         | 20           | Boxes
+[ssd_mobilenet_v1_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz)                         | 56         | 32           | Boxes
+[ssd_resnet_50_fpn_coco ☆](http://download.tensorflow.org/models/object_detection/ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz)                             | 76         | 35           | Boxes
+[ssd_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz)                                                                       | 31         | 22           | Boxes
+[ssd_mobilenet_v2_quantized_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz)                                           | 29         | 22           | Boxes
+[ssdlite_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobilenet_v2_coco_2018_05_09.tar.gz)                                                               | 27         | 22           | Boxes
+[ssd_inception_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2018_01_28.tar.gz)                                                                       | 42         | 24           | Boxes
+[faster_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz)                                                       | 58         | 28           | Boxes
+[faster_rcnn_resnet50_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz)                                                               | 89         | 30           | Boxes
+[faster_rcnn_resnet50_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_lowproposals_coco_2018_01_28.tar.gz)                                     | 64         |              | Boxes
+[rfcn_resnet101_coco](http://download.tensorflow.org/models/object_detection/rfcn_resnet101_coco_2018_01_28.tar.gz)                                                                           | 92         | 30           | Boxes
+[faster_rcnn_resnet101_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_2018_01_28.tar.gz)                                                             | 106        | 32           | Boxes
+[faster_rcnn_resnet101_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_lowproposals_coco_2018_01_28.tar.gz)                                   | 82         |              | Boxes
+[faster_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz)                           | 620        | 37           | Boxes
+[faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco_2018_01_28.tar.gz) | 241        |              | Boxes
+[faster_rcnn_nas](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_coco_2018_01_28.tar.gz)                                                                              | 1833       | 43           | Boxes
+[faster_rcnn_nas_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_lowproposals_coco_2018_01_28.tar.gz)                                               | 540        |              | Boxes
+[mask_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz)                               | 771        | 36           | Masks
+[mask_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_v2_coco_2018_01_28.tar.gz)                                                           | 79         | 25           | Masks
+[mask_rcnn_resnet101_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet101_atrous_coco_2018_01_28.tar.gz)                                                   | 470        | 33           | Masks
+[mask_rcnn_resnet50_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet50_atrous_coco_2018_01_28.tar.gz)                                                     | 343        | 29           | Masks
+
+Note: The star (☆) at the end of a model name indicates that this model
+supports TPU training.
+
+Note: If you download the tar.gz file of quantized models and un-tar, you will
+get a different set of files - a checkpoint, a config file and tflite frozen
+graphs (txt/binary).

 ### Mobile models

@@ -115,20 +119,22 @@ Model name
 [ssd_mobilenet_v3_small_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v3_small_coco_2020_01_14.tar.gz)                                       | 43                   | 15.4     | Boxes

 ### Pixel4 Edge TPU models
+
 Model name                                                                                                                                    | Pixel 4 Edge TPU Latency (ms) | COCO mAP (fp32/uint8) | Outputs
----------------------------------------------------------------------------------------------------------------------------------- | :------------------: | :------: | :-----:
+--------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------: | :-------------------: | :-----:
 [ssd_mobiledet_edgetpu_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobiledet_edgetpu_320x320_coco_2020_05_19.tar.gz) | 6.9                           | 25.9/25.6             | Boxes
 [ssd_mobilenet_edgetpu_coco](https://storage.cloud.google.com/mobilenet_edgetpu/checkpoints/ssdlite_mobilenet_edgetpu_coco_quant.tar.gz)      | 6.6                           | -/24.3                | Boxes

 ### Pixel4 DSP models
+
 Model name                                                                                                                            | Pixel 4 DSP Latency (ms) | COCO mAP (fp32/uint8) | Outputs
----------------------------------------------------------------------------------------------------------------------------------- | :------------------: | :------: | :-----:
+------------------------------------------------------------------------------------------------------------------------------------- | :----------------------: | :-------------------: | :-----:
 [ssd_mobiledet_dsp_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobiledet_dsp_320x320_coco_2020_05_19.tar.gz) | 12.3                     | 28.9/28.8             | Boxes

 ## Kitti-trained models

 Model name                                                                                                                          | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
+----------------------------------------------------------------------------------------------------------------------------------- | :--------: | :------------: | :-----:
 [faster_rcnn_resnet101_kitti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_kitti_2018_01_28.tar.gz) | 79         | 87             | Boxes

 ## Open Images-trained models
@@ -140,31 +146,42 @@ Model name
 [facessd_mobilenet_v2_quantized_open_image_v4](http://download.tensorflow.org/models/object_detection/facessd_mobilenet_v2_quantized_320x320_open_image_v4.tar.gz) [^3]                       | 20         | 73 (faces)              | Boxes

 Model name                                                                                                                                                             | Speed (ms) | Open Images mAP@0.5[^4] | Outputs
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :---------------------: | :-----:
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :---------------------: | :-----:
 [faster_rcnn_inception_resnet_v2_atrous_oidv4](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_oid_v4_2018_12_12.tar.gz) | 425        | 54                      | Boxes
 [ssd_mobilenetv2_oidv4](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_oid_v4_2018_12_12.tar.gz)                                              | 89         | 36                      | Boxes
 [ssd_resnet_101_fpn_oidv4](http://download.tensorflow.org/models/object_detection/ssd_resnet101_v1_fpn_shared_box_predictor_oid_512x512_sync_2019_01_20.tar.gz)        | 237        | 38                      | Boxes
+
 ## iNaturalist Species-trained models

 Model name                                                                                                                        | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
+--------------------------------------------------------------------------------------------------------------------------------- | :--------: | :------------: | :-----:
 [faster_rcnn_resnet101_fgvc](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_fgvc_2018_07_19.tar.gz) | 395        | 58             | Boxes
 [faster_rcnn_resnet50_fgvc](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_fgvc_2018_07_19.tar.gz)   | 366        | 55             | Boxes

-
 ## AVA v2.1 trained models

 Model name                                                                                                                                | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
+----------------------------------------------------------------------------------------------------------------------------------------- | :--------: | :------------: | :-----:
 [faster_rcnn_resnet101_ava_v2.1](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_ava_v2.1_2018_04_30.tar.gz) | 93         | 11             | Boxes

-
-[^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval). The COCO mAP numbers here are evaluated on COCO 14 minival set (note that our split is different from COCO 17 Val). A full list of image ids used in our split could be fould [here](https://github.com/tensorflow/models/blob/master/research/object_detection/data/mscoco_minival_ids.txt).
-
-
-[^2]: This is PASCAL mAP with a slightly different way of true positives computation: see [Open Images evaluation protocols](evaluation_protocols.md), oid_V2_detection_metrics.
-
-[^3]: Non-face boxes are dropped during training and non-face groundtruth boxes are ignored when evaluating.
-
-[^4]: This is Open Images Challenge metric: see [Open Images evaluation protocols](evaluation_protocols.md), oid_challenge_detection_metrics.
-
+## Snapshot Serengeti Camera Trap trained models
+
+Model name                                                                                                                                                      | COCO mAP@0.5 | Outputs
+--------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-----:
+[faster_rcnn_resnet101_snapshot_serengeti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_snapshot_serengeti_2020_06_10.tar.gz)   | 38           | Boxes
+[context_rcnn_resnet101_snapshot_serengeti](http://download.tensorflow.org/models/object_detection/context_rcnn_resnet101_snapshot_serengeti_2020_06_10.tar.gz) | 56           | Boxes
+
+[^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval).
+    The COCO mAP numbers here are evaluated on COCO 14 minival set (note that
+    our split is different from COCO 17 Val). A full list of image ids used in
+    our split can be found
+    [here](https://github.com/tensorflow/models/blob/master/research/object_detection/data/mscoco_minival_ids.txt).
+[^2]: This is PASCAL mAP with a slightly different way of true positives
+    computation: see
+    [Open Images evaluation protocols](evaluation_protocols.md),
+    oid_V2_detection_metrics.
+[^3]: Non-face boxes are dropped during training and non-face groundtruth boxes
+    are ignored when evaluating.
+[^4]: This is Open Images Challenge metric: see
+    [Open Images evaluation protocols](evaluation_protocols.md),
+    oid_challenge_detection_metrics.
--- a/research/object_detection/inference/detection_inference_test.py
+++ b/research/object_detection/inference/detection_inference_test.py
@@ -15,7 +15,7 @@
 r"""Tests for detection_inference.py."""

 import os
-
+import unittest
 import numpy as np
 from PIL import Image
 import six
@@ -25,6 +25,7 @@ from google.protobuf import text_format
 from object_detection.core import standard_fields
 from object_detection.inference import detection_inference
 from object_detection.utils import dataset_util
+from object_detection.utils import tf_version


 def get_mock_tfrecord_path():
@@ -74,6 +75,7 @@ def create_mock_graph():
    fl.write(graph_def.SerializeToString())


+@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
 class InferDetectionsTests(tf.test.TestCase):

  def test_simple(self):

--- a/research/object_detection/inputs.py
+++ b/research/object_detection/inputs.py
@@ -64,7 +64,6 @@ def _multiclass_scores_or_one_hot_labels(multiclass_scores,
                      [tf.shape(groundtruth_boxes)[0], num_classes])
  def false_fn():
    return tf.one_hot(groundtruth_classes, num_classes)
-
  return tf.cond(tf.size(multiclass_scores) > 0, true_fn, false_fn)


@@ -1006,14 +1005,21 @@ def get_reduce_to_frame_fn(input_reader_config, is_training):
    `reduce_to_frame_fn` for the dataset builder
  """
  if input_reader_config.input_type != (
-      input_reader_pb2.InputType.TF_SEQUENCE_EXAMPLE):
-    return lambda d: d
+      input_reader_pb2.InputType.Value('TF_SEQUENCE_EXAMPLE')):
+    return lambda dataset, dataset_map_fn, batch_size, config: dataset
  else:
-    def reduce_to_frame(dataset):
+    def reduce_to_frame(dataset, dataset_map_fn, batch_size,
+                        input_reader_config):
      """Returns a function reducing sequence tensors to single frame tensors.

      Args:
        dataset: A tf dataset containing sequence tensors.
+        dataset_map_fn: A function that handles whether to
+          map_with_legacy_function for this dataset
+        batch_size: used if map_with_legacy_function is true to determine
+          num_parallel_calls
+        input_reader_config: used if map_with_legacy_function is true to
+          determine num_parallel_calls

      Returns:
        A tf dataset containing single frame tensors.
@@ -1046,13 +1052,14 @@ def get_reduce_to_frame_fn(input_reader_config, is_training):
              # Copy all context tensors.
              out_tensor_dict[key] = tensor_dict[key]
          return out_tensor_dict
-        dataset = dataset.map(get_single_frame, tf.data.experimental.AUTOTUNE)
+        dataset = dataset_map_fn(dataset, get_single_frame, batch_size,
+                                 input_reader_config)
      else:
-        dataset = dataset.map(util_ops.tile_context_tensors,
-                              tf.data.experimental.AUTOTUNE)
+        dataset = dataset_map_fn(dataset, util_ops.tile_context_tensors,
+                                 batch_size, input_reader_config)
        dataset = dataset.unbatch()
      # Decode frame here as SequenceExample tensors contain encoded images.
-      dataset = dataset.map(util_ops.decode_image,
-                            tf.data.experimental.AUTOTUNE)
+      dataset = dataset_map_fn(dataset, util_ops.decode_image, batch_size,
+                               input_reader_config)
      return dataset
    return reduce_to_frame
--- a/research/object_detection/inputs_test.py
+++ b/research/object_detection/inputs_test.py
@@ -20,10 +20,11 @@ from __future__ import print_function

 import functools
 import os
+import unittest
 from absl import logging
 from absl.testing import parameterized
-
 import numpy as np
+import six
 import tensorflow.compat.v1 as tf

 from object_detection import inputs
@@ -31,6 +32,13 @@ from object_detection.core import preprocessor
 from object_detection.core import standard_fields as fields
 from object_detection.utils import config_util
 from object_detection.utils import test_case
+from object_detection.utils import test_utils
+from object_detection.utils import tf_version
+
+if six.PY2:
+  import mock  # pylint: disable=g-import-not-at-top
+else:
+  from unittest import mock  # pylint: disable=g-import-not-at-top, g-importing-member

 FLAGS = tf.flags.FLAGS

@@ -86,7 +94,8 @@ def _make_initializable_iterator(dataset):
  return iterator


-class InputsTest(test_case.TestCase, parameterized.TestCase):
+@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only tests under TF2.X.')
+class InputFnTest(test_case.TestCase, parameterized.TestCase):

  def test_faster_rcnn_resnet50_train_input(self):
    """Tests the training input function for FasterRcnnResnet50."""
@@ -402,7 +411,7 @@ class InputsTest(test_case.TestCase, parameterized.TestCase):

  def test_ssd_inceptionV2_eval_input_with_additional_channels(
      self, eval_batch_size=1):
-    """Tests the eval input function for SSDInceptionV2 with additional channels.
+    """Tests the eval input function for SSDInceptionV2 with additional channel.

    Args:
      eval_batch_size: Batch size for eval set.
@@ -638,6 +647,7 @@ class DataAugmentationFnTest(test_case.TestCase):
    data_augmentation_fn = functools.partial(
        inputs.augment_input_data,
        data_augmentation_options=data_augmentation_options)
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
@@ -645,17 +655,12 @@ class DataAugmentationFnTest(test_case.TestCase):
              tf.constant(np.array([[.5, .5, 1., 1.]], np.float32))
      }
      augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
-    with self.test_session() as sess:
-      augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
-
-    self.assertAllEqual(
-        augmented_tensor_dict_out[fields.InputDataFields.image].shape,
-        [20, 20, 3]
-    )
-    self.assertAllClose(
-        augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes],
-        [[10, 10, 20, 20]]
-    )
+      return (augmented_tensor_dict[fields.InputDataFields.image],
+              augmented_tensor_dict[fields.InputDataFields.
+                                    groundtruth_boxes])
+    image, groundtruth_boxes = self.execute_cpu(graph_fn, [])
+    self.assertAllEqual(image.shape, [20, 20, 3])
+    self.assertAllClose(groundtruth_boxes, [[10, 10, 20, 20]])

  def test_apply_image_and_box_augmentation_with_scores(self):
    data_augmentation_options = [
@@ -669,6 +674,7 @@ class DataAugmentationFnTest(test_case.TestCase):
    data_augmentation_fn = functools.partial(
        inputs.augment_input_data,
        data_augmentation_options=data_augmentation_options)
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
@@ -680,26 +686,16 @@ class DataAugmentationFnTest(test_case.TestCase):
              tf.constant(np.array([0.8], np.float32)),
      }
      augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
-    with self.test_session() as sess:
-      augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
-
-    self.assertAllEqual(
-        augmented_tensor_dict_out[fields.InputDataFields.image].shape,
-        [20, 20, 3]
-    )
-    self.assertAllClose(
-        augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes],
-        [[10, 10, 20, 20]]
-    )
-    self.assertAllClose(
-        augmented_tensor_dict_out[fields.InputDataFields.groundtruth_classes],
-        [1.0]
-    )
-    self.assertAllClose(
-        augmented_tensor_dict_out[
-            fields.InputDataFields.groundtruth_weights],
-        [0.8]
-    )
+      return (augmented_tensor_dict[fields.InputDataFields.image],
+              augmented_tensor_dict[fields.InputDataFields.groundtruth_boxes],
+              augmented_tensor_dict[fields.InputDataFields.groundtruth_classes],
+              augmented_tensor_dict[fields.InputDataFields.groundtruth_weights])
+    (image, groundtruth_boxes,
+     groundtruth_classes, groundtruth_weights) = self.execute_cpu(graph_fn, [])
+    self.assertAllEqual(image.shape, [20, 20, 3])
+    self.assertAllClose(groundtruth_boxes, [[10, 10, 20, 20]])
+    self.assertAllClose(groundtruth_classes.shape, [1.0])
+    self.assertAllClose(groundtruth_weights, [0.8])

  def test_include_masks_in_data_augmentation(self):
    data_augmentation_options = [
@@ -712,6 +708,7 @@ class DataAugmentationFnTest(test_case.TestCase):
    data_augmentation_fn = functools.partial(
        inputs.augment_input_data,
        data_augmentation_options=data_augmentation_options)
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
@@ -719,14 +716,12 @@ class DataAugmentationFnTest(test_case.TestCase):
              tf.constant(np.zeros([2, 10, 10], np.uint8))
      }
      augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
-    with self.test_session() as sess:
-      augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
-
-    self.assertAllEqual(
-        augmented_tensor_dict_out[fields.InputDataFields.image].shape,
-        [20, 20, 3])
-    self.assertAllEqual(augmented_tensor_dict_out[
-        fields.InputDataFields.groundtruth_instance_masks].shape, [2, 20, 20])
+      return (augmented_tensor_dict[fields.InputDataFields.image],
+              augmented_tensor_dict[fields.InputDataFields.
+                                    groundtruth_instance_masks])
+    image, masks = self.execute_cpu(graph_fn, [])
+    self.assertAllEqual(image.shape, [20, 20, 3])
+    self.assertAllEqual(masks.shape, [2, 20, 20])

  def test_include_keypoints_in_data_augmentation(self):
    data_augmentation_options = [
@@ -740,6 +735,7 @@ class DataAugmentationFnTest(test_case.TestCase):
    data_augmentation_fn = functools.partial(
        inputs.augment_input_data,
        data_augmentation_options=data_augmentation_options)
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(10, 10, 3).astype(np.float32)),
@@ -749,21 +745,14 @@ class DataAugmentationFnTest(test_case.TestCase):
              tf.constant(np.array([[[0.5, 1.0], [0.5, 0.5]]], np.float32))
      }
      augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict)
-    with self.test_session() as sess:
-      augmented_tensor_dict_out = sess.run(augmented_tensor_dict)
-
-    self.assertAllEqual(
-        augmented_tensor_dict_out[fields.InputDataFields.image].shape,
-        [20, 20, 3]
-    )
-    self.assertAllClose(
-        augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes],
-        [[10, 10, 20, 20]]
-    )
-    self.assertAllClose(
-        augmented_tensor_dict_out[fields.InputDataFields.groundtruth_keypoints],
-        [[[10, 20], [10, 10]]]
-    )
+      return (augmented_tensor_dict[fields.InputDataFields.image],
+              augmented_tensor_dict[fields.InputDataFields.groundtruth_boxes],
+              augmented_tensor_dict[fields.InputDataFields.
+                                    groundtruth_keypoints])
+    image, boxes, keypoints = self.execute_cpu(graph_fn, [])
+    self.assertAllEqual(image.shape, [20, 20, 3])
+    self.assertAllClose(boxes, [[10, 10, 20, 20]])
+    self.assertAllClose(keypoints, [[[10, 20], [10, 10]]])


 def _fake_model_preprocessor_fn(image):
@@ -787,13 +776,12 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
  def test_combine_additional_channels_if_present(self):
    image = np.random.rand(4, 4, 3).astype(np.float32)
    additional_channels = np.random.rand(4, 4, 2).astype(np.float32)
+    def graph_fn(image, additional_channels):
      tensor_dict = {
-        fields.InputDataFields.image:
-            tf.constant(image),
-        fields.InputDataFields.image_additional_channels:
-            tf.constant(additional_channels),
+          fields.InputDataFields.image: image,
+          fields.InputDataFields.image_additional_channels: additional_channels,
          fields.InputDataFields.groundtruth_classes:
-            tf.constant(np.array([1, 1], np.int32))
+              tf.constant([1, 1], tf.int32)
      }

      input_transformation_fn = functools.partial(
@@ -801,23 +789,22 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_model_preprocessor_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=1)
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-    self.assertAllEqual(transformed_inputs[fields.InputDataFields.image].dtype,
-                        tf.float32)
-    self.assertAllEqual(transformed_inputs[fields.InputDataFields.image].shape,
-                        [4, 4, 5])
-    self.assertAllClose(transformed_inputs[fields.InputDataFields.image],
-                        np.concatenate((image, additional_channels), axis=2))
+      out_tensors = input_transformation_fn(tensor_dict=tensor_dict)
+      return out_tensors[fields.InputDataFields.image]
+    out_image = self.execute_cpu(graph_fn, [image, additional_channels])
+    self.assertAllEqual(out_image.dtype, tf.float32)
+    self.assertAllEqual(out_image.shape, [4, 4, 5])
+    self.assertAllClose(out_image, np.concatenate((image, additional_channels),
+                                                  axis=2))

  def test_use_multiclass_scores_when_present(self):
-    image = np.random.rand(4, 4, 3).astype(np.float32)
+    def graph_fn():
      tensor_dict = {
-        fields.InputDataFields.image:
-            tf.constant(image),
+          fields.InputDataFields.image: tf.constant(np.random.rand(4, 4, 3).
+                                                    astype(np.float32)),
          fields.InputDataFields.groundtruth_boxes:
-            tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]], np.float32)),
+              tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]],
+                                   np.float32)),
          fields.InputDataFields.multiclass_scores:
              tf.constant(np.array([0.2, 0.3, 0.5, 0.1, 0.6, 0.3], np.float32)),
          fields.InputDataFields.groundtruth_classes:
@@ -829,23 +816,26 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_model_preprocessor_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=3, use_multiclass_scores=True)
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-
+      transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
+      return transformed_inputs[fields.InputDataFields.groundtruth_classes]
+    groundtruth_classes = self.execute_cpu(graph_fn, [])
    self.assertAllClose(
        np.array([[0.2, 0.3, 0.5], [0.1, 0.6, 0.3]], np.float32),
-        transformed_inputs[fields.InputDataFields.groundtruth_classes])
+        groundtruth_classes)

+  @unittest.skipIf(tf_version.is_tf2(), ('Skipping due to different behaviour '
+                                         'in TF 2.X'))
  def test_use_multiclass_scores_when_not_present(self):
-    image = np.random.rand(4, 4, 3).astype(np.float32)
+    def graph_fn():
+      zero_num_elements = tf.random.uniform([], minval=0, maxval=1,
+                                            dtype=tf.int32)
      tensor_dict = {
          fields.InputDataFields.image:
-            tf.constant(image),
+              tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
          fields.InputDataFields.groundtruth_boxes:
-            tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]], np.float32)),
-        fields.InputDataFields.multiclass_scores:
-            tf.placeholder(tf.float32),
+              tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]],
+                                   np.float32)),
+          fields.InputDataFields.multiclass_scores: tf.zeros(zero_num_elements),
          fields.InputDataFields.groundtruth_classes:
              tf.constant(np.array([1, 2], np.int32))
      }
@@ -855,17 +845,13 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_model_preprocessor_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=3, use_multiclass_scores=True)
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict),
-          feed_dict={
-              tensor_dict[fields.InputDataFields.multiclass_scores]:
-                  np.array([], dtype=np.float32)
-          })

+      transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
+      return transformed_inputs[fields.InputDataFields.groundtruth_classes]
+    groundtruth_classes = self.execute_cpu(graph_fn, [])
    self.assertAllClose(
        np.array([[0, 1, 0], [0, 0, 1]], np.float32),
-        transformed_inputs[fields.InputDataFields.groundtruth_classes])
+        groundtruth_classes)

  @parameterized.parameters(
      {'labeled_classes': [1, 2]},
@@ -916,6 +902,7 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
        transformed_inputs[fields.InputDataFields.groundtruth_labeled_classes])

  def test_returns_correct_class_label_encodings(self):
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
@@ -930,18 +917,17 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_model_preprocessor_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes)
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-
-    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_classes],
-        [[0, 0, 1], [1, 0, 0]])
-    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_confidences],
-        [[0, 0, 1], [1, 0, 0]])
+      transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.groundtruth_classes],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_confidences])
+    (groundtruth_classes, groundtruth_confidences) = self.execute_cpu(graph_fn,
+                                                                      [])
+    self.assertAllClose(groundtruth_classes, [[0, 0, 1], [1, 0, 0]])
+    self.assertAllClose(groundtruth_confidences, [[0, 0, 1], [1, 0, 0]])

  def test_returns_correct_labels_with_unrecognized_class(self):
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
@@ -973,46 +959,46 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_model_preprocessor_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes)
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-
-    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_classes],
-        [[0, 0, 1], [1, 0, 0]])
-    self.assertAllEqual(
-        transformed_inputs[fields.InputDataFields.num_groundtruth_boxes], 2)
-    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_area], [.5, .3])
-    self.assertAllEqual(
-        transformed_inputs[fields.InputDataFields.groundtruth_confidences],
-        [[0, 0, 1], [1, 0, 0]])
-    self.assertAllClose(
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.groundtruth_classes],
+              transformed_inputs[fields.InputDataFields.num_groundtruth_boxes],
+              transformed_inputs[fields.InputDataFields.groundtruth_area],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_confidences],
              transformed_inputs[fields.InputDataFields.groundtruth_boxes],
-        [[0, 0, 1, 1], [.5, .5, 1, 1]])
-    self.assertAllClose(
              transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
-        [[[.1, .1]], [[.5, .5]]])
-    self.assertAllEqual(
-        transformed_inputs[
-            fields.InputDataFields.groundtruth_keypoint_visibilities],
-        [[True, True], [True, True]])
-    self.assertAllEqual(
-        transformed_inputs[
-            fields.InputDataFields.groundtruth_instance_masks].shape, [2, 4, 4])
-    self.assertAllEqual(
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_keypoint_visibilities],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_instance_masks],
              transformed_inputs[fields.InputDataFields.groundtruth_is_crowd],
-        [False, False])
-    self.assertAllEqual(
-        transformed_inputs[fields.InputDataFields.groundtruth_difficult],
-        [0, 1])
+              transformed_inputs[fields.InputDataFields.groundtruth_difficult])
+    (groundtruth_classes, num_groundtruth_boxes, groundtruth_area,
+     groundtruth_confidences, groundtruth_boxes, groundtruth_keypoints,
+     groundtruth_keypoint_visibilities, groundtruth_instance_masks,
+     groundtruth_is_crowd, groundtruth_difficult) = self.execute_cpu(graph_fn,
+                                                                     [])
+
+    self.assertAllClose(groundtruth_classes, [[0, 0, 1], [1, 0, 0]])
+    self.assertAllEqual(num_groundtruth_boxes, 2)
+    self.assertAllClose(groundtruth_area, [.5, .3])
+    self.assertAllEqual(groundtruth_confidences, [[0, 0, 1], [1, 0, 0]])
+    self.assertAllClose(groundtruth_boxes, [[0, 0, 1, 1], [.5, .5, 1, 1]])
+    self.assertAllClose(groundtruth_keypoints, [[[.1, .1]], [[.5, .5]]])
+    self.assertAllEqual(groundtruth_keypoint_visibilities,
+                        [[True, True], [True, True]])
+    self.assertAllEqual(groundtruth_instance_masks.shape, [2, 4, 4])
+    self.assertAllEqual(groundtruth_is_crowd, [False, False])
+    self.assertAllEqual(groundtruth_difficult, [0, 1])

  def test_returns_correct_merged_boxes(self):
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
          fields.InputDataFields.groundtruth_boxes:
-            tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]], np.float32)),
+              tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]],
+                                   np.float32)),
          fields.InputDataFields.groundtruth_classes:
              tf.constant(np.array([3, 1], np.int32))
      }
@@ -1024,24 +1010,29 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes,
          merge_multiple_boxes=True)
-
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.groundtruth_boxes],
+              transformed_inputs[fields.InputDataFields.groundtruth_classes],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_confidences],
+              transformed_inputs[fields.InputDataFields.num_groundtruth_boxes])
+    (groundtruth_boxes, groundtruth_classes, groundtruth_confidences,
+     num_groundtruth_boxes) = self.execute_cpu(graph_fn, [])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_boxes],
+        groundtruth_boxes,
        [[.5, .5, 1., 1.]])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_classes],
+        groundtruth_classes,
        [[1, 0, 1]])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_confidences],
+        groundtruth_confidences,
        [[1, 0, 1]])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.num_groundtruth_boxes],
+        num_groundtruth_boxes,
        1)

  def test_returns_correct_groundtruth_confidences_when_input_present(self):
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
@@ -1058,18 +1049,21 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_model_preprocessor_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes)
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.groundtruth_classes],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_confidences])
+    groundtruth_classes, groundtruth_confidences = self.execute_cpu(graph_fn,
+                                                                    [])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_classes],
+        groundtruth_classes,
        [[0, 0, 1], [1, 0, 0]])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_confidences],
+        groundtruth_confidences,
        [[0, 0, 1], [-1, 0, 0]])

  def test_returns_resized_masks(self):
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(4, 4, 3).astype(np.float32)),
@@ -1099,23 +1093,24 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          image_resizer_fn=fake_image_resizer_fn,
          num_classes=num_classes,
          retain_original_image=True)
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-    self.assertAllEqual(transformed_inputs[
-        fields.InputDataFields.original_image].dtype, tf.uint8)
-    self.assertAllEqual(transformed_inputs[
-        fields.InputDataFields.original_image_spatial_shape], [4, 4])
-    self.assertAllEqual(transformed_inputs[
-        fields.InputDataFields.original_image].shape, [8, 8, 3])
-    self.assertAllEqual(transformed_inputs[
-        fields.InputDataFields.groundtruth_instance_masks].shape, [2, 8, 8])
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.original_image],
+              transformed_inputs[fields.InputDataFields.
+                                 original_image_spatial_shape],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_instance_masks])
+    (original_image, original_image_shape,
+     groundtruth_instance_masks) = self.execute_cpu(graph_fn, [])
+    self.assertEqual(original_image.dtype, np.uint8)
+    self.assertAllEqual(original_image_shape, [4, 4])
+    self.assertAllEqual(original_image.shape, [8, 8, 3])
+    self.assertAllEqual(groundtruth_instance_masks.shape, [2, 8, 8])

  def test_applies_model_preprocess_fn_to_image_tensor(self):
    np_image = np.random.randint(256, size=(4, 4, 3))
+    def graph_fn(image):
      tensor_dict = {
-        fields.InputDataFields.image:
-            tf.constant(np_image),
+          fields.InputDataFields.image: image,
          fields.InputDataFields.groundtruth_classes:
              tf.constant(np.array([3, 1], np.int32))
      }
@@ -1129,21 +1124,18 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=fake_model_preprocessor_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes)
-
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-    self.assertAllClose(transformed_inputs[fields.InputDataFields.image],
-                        np_image / 255.)
-    self.assertAllClose(transformed_inputs[fields.InputDataFields.
-                                           true_image_shape],
-                        [4, 4, 3])
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.image],
+              transformed_inputs[fields.InputDataFields.true_image_shape])
+    image, true_image_shape = self.execute_cpu(graph_fn, [np_image])
+    self.assertAllClose(image, np_image / 255.)
+    self.assertAllClose(true_image_shape, [4, 4, 3])

  def test_applies_data_augmentation_fn_to_tensor_dict(self):
    np_image = np.random.randint(256, size=(4, 4, 3))
+    def graph_fn(image):
      tensor_dict = {
-        fields.InputDataFields.image:
-            tf.constant(np_image),
+          fields.InputDataFields.image: image,
          fields.InputDataFields.groundtruth_classes:
              tf.constant(np.array([3, 1], np.int32))
      }
@@ -1158,21 +1150,20 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes,
          data_augmentation_fn=add_one_data_augmentation_fn)
-    with self.test_session() as sess:
-      augmented_tensor_dict = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-
-    self.assertAllEqual(augmented_tensor_dict[fields.InputDataFields.image],
-                        np_image + 1)
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.image],
+              transformed_inputs[fields.InputDataFields.groundtruth_classes])
+    image, groundtruth_classes = self.execute_cpu(graph_fn, [np_image])
+    self.assertAllEqual(image, np_image + 1)
    self.assertAllEqual(
-        augmented_tensor_dict[fields.InputDataFields.groundtruth_classes],
+        groundtruth_classes,
        [[0, 0, 0, 1], [0, 1, 0, 0]])

  def test_applies_data_augmentation_fn_before_model_preprocess_fn(self):
    np_image = np.random.randint(256, size=(4, 4, 3))
+    def graph_fn(image):
      tensor_dict = {
-        fields.InputDataFields.image:
-            tf.constant(np_image),
+          fields.InputDataFields.image: image,
          fields.InputDataFields.groundtruth_classes:
              tf.constant(np.array([3, 1], np.int32))
      }
@@ -1191,15 +1182,13 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes,
          data_augmentation_fn=add_five_to_image_data_augmentation_fn)
-    with self.test_session() as sess:
-      augmented_tensor_dict = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
-
-    self.assertAllEqual(augmented_tensor_dict[fields.InputDataFields.image],
-                        (np_image + 5) * 2)
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return transformed_inputs[fields.InputDataFields.image]
+    image = self.execute_cpu(graph_fn, [np_image])
+    self.assertAllEqual(image, (np_image + 5) * 2)

  def test_resize_with_padding(self):
-
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
@@ -1218,18 +1207,19 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_resize50_preprocess_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes,)
-
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
+      transformed_inputs = input_transformation_fn(tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.groundtruth_boxes],
+              transformed_inputs[fields.InputDataFields.groundtruth_keypoints])
+    groundtruth_boxes, groundtruth_keypoints = self.execute_cpu(graph_fn, [])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_boxes],
+        groundtruth_boxes,
        [[.5, .25, 1., .5], [.0, .0, .5, .25]])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
+        groundtruth_keypoints,
        [[[.1, .1]], [[.3, .2]]])

  def test_groundtruth_keypoint_weights(self):
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
@@ -1253,19 +1243,23 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes,
          keypoint_type_weight=keypoint_type_weight)
+      transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_keypoint_weights])

-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
+    groundtruth_keypoints, groundtruth_keypoint_weights = self.execute_cpu(
+        graph_fn, [])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
+        groundtruth_keypoints,
        [[[0.1, 0.1], [0.3, 0.2]],
         [[0.5, 0.3], [0.7, 0.4]]])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_keypoint_weights],
+        groundtruth_keypoint_weights,
        [[1.0, 0.0], [1.0, 2.0]])

  def test_groundtruth_keypoint_weights_default(self):
+    def graph_fn():
      tensor_dict = {
          fields.InputDataFields.image:
              tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
@@ -1285,16 +1279,18 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
          model_preprocess_fn=_fake_resize50_preprocess_fn,
          image_resizer_fn=_fake_image_resizer_fn,
          num_classes=num_classes)
-
-    with self.test_session() as sess:
-      transformed_inputs = sess.run(
-          input_transformation_fn(tensor_dict=tensor_dict))
+      transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
+      return (transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
+              transformed_inputs[fields.InputDataFields.
+                                 groundtruth_keypoint_weights])
+    groundtruth_keypoints, groundtruth_keypoint_weights = self.execute_cpu(
+        graph_fn, [])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_keypoints],
+        groundtruth_keypoints,
        [[[0.1, 0.1], [0.3, 0.2]],
         [[0.5, 0.3], [0.7, 0.4]]])
    self.assertAllClose(
-        transformed_inputs[fields.InputDataFields.groundtruth_keypoint_weights],
+        groundtruth_keypoint_weights,
        [[1.0, 1.0], [1.0, 1.0]])


@@ -1303,15 +1299,15 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
  def test_pad_images_boxes_and_classes(self):
    input_tensor_dict = {
        fields.InputDataFields.image:
-            tf.placeholder(tf.float32, [None, None, 3]),
+            tf.random.uniform([3, 3, 3]),
        fields.InputDataFields.groundtruth_boxes:
-            tf.placeholder(tf.float32, [None, 4]),
+            tf.random.uniform([2, 4]),
        fields.InputDataFields.groundtruth_classes:
-            tf.placeholder(tf.int32, [None, 3]),
+            tf.random.uniform([2, 3], minval=0, maxval=2, dtype=tf.int32),
        fields.InputDataFields.true_image_shape:
-            tf.placeholder(tf.int32, [3]),
+            tf.constant([3, 3, 3]),
        fields.InputDataFields.original_image_spatial_shape:
-            tf.placeholder(tf.int32, [2])
+            tf.constant([3, 3])
    }
    padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
        tensor_dict=input_tensor_dict,
@@ -1336,69 +1332,35 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
        .shape.as_list(), [3, 3])

  def test_clip_boxes_and_classes(self):
+    def graph_fn():
      input_tensor_dict = {
          fields.InputDataFields.groundtruth_boxes:
-            tf.placeholder(tf.float32, [None, 4]),
+              tf.random.uniform([5, 4]),
          fields.InputDataFields.groundtruth_classes:
-            tf.placeholder(tf.int32, [None, 3]),
+              tf.random.uniform([2, 3], maxval=10, dtype=tf.int32),
          fields.InputDataFields.num_groundtruth_boxes:
-            tf.placeholder(tf.int32, [])
+              tf.constant(5)
      }
      padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
          tensor_dict=input_tensor_dict,
          max_num_boxes=3,
          num_classes=3,
          spatial_image_shape=[5, 6])
-
-    self.assertAllEqual(
-        padded_tensor_dict[fields.InputDataFields.groundtruth_boxes]
-        .shape.as_list(), [3, 4])
-    self.assertAllEqual(
-        padded_tensor_dict[fields.InputDataFields.groundtruth_classes]
-        .shape.as_list(), [3, 3])
-
-    with self.test_session() as sess:
-      out_tensor_dict = sess.run(
-          padded_tensor_dict,
-          feed_dict={
-              input_tensor_dict[fields.InputDataFields.groundtruth_boxes]:
-                  np.random.rand(5, 4),
-              input_tensor_dict[fields.InputDataFields.groundtruth_classes]:
-                  np.random.rand(2, 3),
-              input_tensor_dict[fields.InputDataFields.num_groundtruth_boxes]:
-                  5,
-          })
-
-    self.assertAllEqual(
-        out_tensor_dict[fields.InputDataFields.groundtruth_boxes].shape, [3, 4])
-    self.assertAllEqual(
-        out_tensor_dict[fields.InputDataFields.groundtruth_classes].shape,
-        [3, 3])
-    self.assertEqual(
-        out_tensor_dict[fields.InputDataFields.num_groundtruth_boxes],
-        3)
-
-  def test_do_not_pad_dynamic_images(self):
-    input_tensor_dict = {
-        fields.InputDataFields.image:
-            tf.placeholder(tf.float32, [None, None, 3]),
-    }
-    padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
-        tensor_dict=input_tensor_dict,
-        max_num_boxes=3,
-        num_classes=3,
-        spatial_image_shape=[None, None])
-
-    self.assertAllEqual(
-        padded_tensor_dict[fields.InputDataFields.image].shape.as_list(),
-        [None, None, 3])
+      return (padded_tensor_dict[fields.InputDataFields.groundtruth_boxes],
+              padded_tensor_dict[fields.InputDataFields.groundtruth_classes],
+              padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes])
+    (groundtruth_boxes, groundtruth_classes,
+     num_groundtruth_boxes) = self.execute_cpu(graph_fn, [])
+    self.assertAllEqual(groundtruth_boxes.shape, [3, 4])
+    self.assertAllEqual(groundtruth_classes.shape, [3, 3])
+    self.assertEqual(num_groundtruth_boxes, 3)

  def test_images_and_additional_channels(self):
    input_tensor_dict = {
        fields.InputDataFields.image:
-            tf.placeholder(tf.float32, [None, None, 5]),
+            test_utils.image_with_dynamic_shape(4, 3, 5),
        fields.InputDataFields.image_additional_channels:
-            tf.placeholder(tf.float32, [None, None, 2]),
+            test_utils.image_with_dynamic_shape(4, 3, 2),
    }
    padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
        tensor_dict=input_tensor_dict,
@@ -1418,11 +1380,11 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
  def test_images_and_additional_channels_errors(self):
    input_tensor_dict = {
        fields.InputDataFields.image:
-            tf.placeholder(tf.float32, [None, None, 3]),
+            test_utils.image_with_dynamic_shape(10, 10, 3),
        fields.InputDataFields.image_additional_channels:
-            tf.placeholder(tf.float32, [None, None, 2]),
+            test_utils.image_with_dynamic_shape(10, 10, 2),
        fields.InputDataFields.original_image:
-            tf.placeholder(tf.float32, [None, None, 3]),
+            test_utils.image_with_dynamic_shape(10, 10, 3),
    }
    with self.assertRaises(ValueError):
      _ = inputs.pad_input_data_to_static_shapes(
@@ -1434,7 +1396,7 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
  def test_gray_images(self):
    input_tensor_dict = {
        fields.InputDataFields.image:
-            tf.placeholder(tf.float32, [None, None, 1]),
+            test_utils.image_with_dynamic_shape(4, 4, 1),
    }
    padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
        tensor_dict=input_tensor_dict,
@@ -1449,9 +1411,9 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
  def test_gray_images_and_additional_channels(self):
    input_tensor_dict = {
        fields.InputDataFields.image:
-            tf.placeholder(tf.float32, [None, None, 3]),
+            test_utils.image_with_dynamic_shape(4, 4, 3),
        fields.InputDataFields.image_additional_channels:
-            tf.placeholder(tf.float32, [None, None, 2]),
+            test_utils.image_with_dynamic_shape(4, 4, 2),
    }
    # pad_input_data_to_static_shape assumes that image is already concatenated
    # with additional channels.
@@ -1469,11 +1431,14 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
        .shape.as_list(), [5, 6, 2])

  def test_keypoints(self):
+    keypoints = test_utils.keypoints_with_dynamic_shape(10, 16, 4)
+    visibilities = tf.cast(tf.random.uniform(tf.shape(keypoints)[:-1], minval=0,
+                                             maxval=2, dtype=tf.int32), tf.bool)
    input_tensor_dict = {
        fields.InputDataFields.groundtruth_keypoints:
-            tf.placeholder(tf.float32, [None, 16, 4]),
+            test_utils.keypoints_with_dynamic_shape(10, 16, 4),
        fields.InputDataFields.groundtruth_keypoint_visibilities:
-            tf.placeholder(tf.bool, [None, 16]),
+            visibilities
    }
    padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
        tensor_dict=input_tensor_dict,
@@ -1493,12 +1458,12 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
    context_memory_size = 8
    context_feature_length = 10
    max_num_context_features = 20
+    def graph_fn():
      input_tensor_dict = {
          fields.InputDataFields.context_features:
-            tf.placeholder(tf.float32,
-                           [context_memory_size, context_feature_length]),
+              tf.ones([context_memory_size, context_feature_length]),
          fields.InputDataFields.context_feature_length:
-            tf.placeholder(tf.float32, [])
+              tf.constant(context_feature_length)
      }
      padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
          tensor_dict=input_tensor_dict,
@@ -1512,20 +1477,57 @@ class PadInputDataToStaticShapesFnTest(test_case.TestCase):
          padded_tensor_dict[
              fields.InputDataFields.context_features].shape.as_list(),
          [max_num_context_features, context_feature_length])
+      return padded_tensor_dict[fields.InputDataFields.valid_context_size]

-    with self.test_session() as sess:
-      feed_dict = {
-          input_tensor_dict[fields.InputDataFields.context_features]:
-              np.ones([context_memory_size, context_feature_length],
-                      dtype=np.float32),
-          input_tensor_dict[fields.InputDataFields.context_feature_length]:
-              context_feature_length
+    valid_context_size = self.execute_cpu(graph_fn, [])
+    self.assertEqual(valid_context_size, context_memory_size)
+
+
+class NegativeSizeTest(test_case.TestCase):
+  """Test for inputs and related functions."""
+
+  def test_negative_size_error(self):
+    """Test that error is raised for negative size boxes."""
+
+    def graph_fn():
+      tensors = {
+          fields.InputDataFields.image: tf.zeros((128, 128, 3)),
+          fields.InputDataFields.groundtruth_classes:
+              tf.constant([1, 1], tf.int32),
+          fields.InputDataFields.groundtruth_boxes:
+              tf.constant([[0.5, 0.5, 0.4, 0.5]], tf.float32)
      }
-      padded_tensor_dict_out = sess.run(padded_tensor_dict, feed_dict=feed_dict)
+      tensors = inputs.transform_input_data(
+          tensors, _fake_model_preprocessor_fn, _fake_image_resizer_fn,
+          num_classes=10)
+      return tensors[fields.InputDataFields.groundtruth_boxes]
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      self.execute_cpu(graph_fn, [])
+
+  def test_negative_size_no_assert(self):
+    """Test that negative size boxes are filtered out without assert.
+
+    This test simulates the behaviour when we run on TPU and Assert ops are
+    not supported.
+    """

-    self.assertEqual(
-        padded_tensor_dict_out[fields.InputDataFields.valid_context_size],
-        context_memory_size)
+    tensors = {
+        fields.InputDataFields.image: tf.zeros((128, 128, 3)),
+        fields.InputDataFields.groundtruth_classes:
+            tf.constant([1, 1], tf.int32),
+        fields.InputDataFields.groundtruth_boxes:
+            tf.constant([[0.5, 0.5, 0.4, 0.5], [0.5, 0.5, 0.6, 0.6]],
+                        tf.float32)
+    }
+
+    with mock.patch.object(tf, 'Assert') as tf_assert:
+      tf_assert.return_value = tf.no_op()
+      tensors = inputs.transform_input_data(
+          tensors, _fake_model_preprocessor_fn, _fake_image_resizer_fn,
+          num_classes=10)
+
+      self.assertAllClose(tensors[fields.InputDataFields.groundtruth_boxes],
+                          [[0.5, 0.5, 0.6, 0.6]])


 if __name__ == '__main__':

--- a/research/object_detection/legacy/trainer_test.py
+++ b/research/object_detection/legacy/trainer_test.py
@@ -14,7 +14,7 @@
 # ==============================================================================

 """Tests for object_detection.trainer."""
-
+import unittest
 import tensorflow.compat.v1 as tf
 import tf_slim as slim
 from google.protobuf import text_format
@@ -24,6 +24,7 @@ from object_detection.core import model
 from object_detection.core import standard_fields as fields
 from object_detection.legacy import trainer
 from object_detection.protos import train_pb2
+from object_detection.utils import tf_version


 NUMBER_OF_CLASSES = 2
@@ -197,6 +198,7 @@ class FakeDetectionModel(model.DetectionModel):
    pass


+@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
 class TrainerTest(tf.test.TestCase):

  def test_configure_trainer_and_train_two_steps(self):

--- a/research/object_detection/matchers/bipartite_matcher_test.py
+++ b/research/object_detection/matchers/bipartite_matcher_test.py
@@ -14,14 +14,18 @@
 # ==============================================================================

 """Tests for object_detection.core.bipartite_matcher."""
-
+import unittest
 import numpy as np
 import tensorflow.compat.v1 as tf

-from object_detection.matchers import bipartite_matcher
 from object_detection.utils import test_case
+from object_detection.utils import tf_version
+
+if tf_version.is_tf1():
+  from object_detection.matchers import bipartite_matcher  # pylint: disable=g-import-not-at-top


+@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
 class GreedyBipartiteMatcherTest(test_case.TestCase):

  def test_get_expected_matches_when_all_rows_are_valid(self):

--- a/research/object_detection/meta_architectures/center_net_meta_arch.py
+++ b/research/object_detection/meta_architectures/center_net_meta_arch.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The CenterNet meta architecture as described in the "Objects as Points" paper [1].
+
+[1]: https://arxiv.org/abs/1904.07850
+
+"""
+
+import abc
+import collections
+import functools
+import numpy as np
+import tensorflow.compat.v1 as tf
+import tensorflow.compat.v2 as tf2
+
+from object_detection.core import box_list
+from object_detection.core import box_list_ops
+from object_detection.core import keypoint_ops
+from object_detection.core import model
+from object_detection.core import standard_fields as fields
+from object_detection.core import target_assigner as cn_assigner
+from object_detection.utils import shape_utils
+
+# Number of channels needed to predict size and offsets.
+NUM_OFFSET_CHANNELS = 2
+NUM_SIZE_CHANNELS = 2
+
+# Error range for detecting peaks.
+PEAK_EPSILON = 1e-6
+
+# Constants shared between all keypoint tasks.
+UNMATCHED_KEYPOINT_SCORE = 0.1
+KEYPOINT_CANDIDATE_SEARCH_SCALE = 0.3
+
+
+class CenterNetFeatureExtractor(tf.keras.Model):
+  """Base class for feature extractors for the CenterNet meta architecture.
+
+  Child classes are expected to override the _output_model property which will
+  return 1 or more tensors predicted by the feature extractor.
+
+  """
+  __metaclass__ = abc.ABCMeta
+
+  def __init__(self, name=None, channel_means=(0., 0., 0.),
+               channel_stds=(1., 1., 1.), bgr_ordering=False):
+    """Initializes a CenterNet feature extractor.
+
+    Args:
+      name: str, the name used for the underlying keras model.
+      channel_means: A tuple of floats, denoting the mean of each channel
+        which will be subtracted from it. If None or empty, we use 0s.
+      channel_stds: A tuple of floats, denoting the standard deviation of each
+        channel. Each channel will be divided by its standard deviation value.
+        If None or empty, we use 1s.
+      bgr_ordering: bool, if set will change the channel ordering to be in the
+        [blue, green, red] order.
+    """
+    super(CenterNetFeatureExtractor, self).__init__(name=name)
+
+    if channel_means is None or len(channel_means) == 0:  # pylint:disable=g-explicit-length-test
+      channel_means = [0., 0., 0.]
+
+    if channel_stds is None or len(channel_stds) == 0:  # pylint:disable=g-explicit-length-test
+      channel_stds = [1., 1., 1.]
+
+    self._channel_means = channel_means
+    self._channel_stds = channel_stds
+    self._bgr_ordering = bgr_ordering
+
+  def preprocess(self, inputs):
+    """Converts a batch of unscaled images to a scale suitable for the model.
+
+    This method normalizes the image using the given `channel_means` and
+    `channel_stds` values at initialization time while optionally flipping
+    the channel order if `bgr_ordering` is set.
+
+    Args:
+      inputs: a [batch, height, width, channels] float32 tensor
+
+    Returns:
+      outputs: a [batch, height, width, channels] float32 tensor
+
+    """
+
+    if self._bgr_ordering:
+      red, green, blue = tf.unstack(inputs, axis=3)
+      inputs = tf.stack([blue, green, red], axis=3)
+
+    channel_means = tf.reshape(tf.constant(self._channel_means),
+                               [1, 1, 1, -1])
+    channel_stds = tf.reshape(tf.constant(self._channel_stds),
+                              [1, 1, 1, -1])
+
+    return (inputs - channel_means)/channel_stds
+
+  @property
+  @abc.abstractmethod
+  def out_stride(self):
+    """The stride in the output image of the network."""
+    pass
+
+  @property
+  @abc.abstractmethod
+  def num_feature_outputs(self):
+    """The number of feature outputs returned by the feature extractor."""
+    pass
+
+
+def make_prediction_net(num_out_channels, kernel_size=3, num_filters=256,
+                        bias_fill=None):
+  """Creates a network to predict the given number of output channels.
+
+  This function is intended to make the prediction heads for the CenterNet
+  meta architecture.
+
+  Args:
+    num_out_channels: Number of output channels.
+    kernel_size: The size of the conv kernel in the intermediate layer
+    num_filters: The number of filters in the intermediate conv layer.
+    bias_fill: If not None, is used to initialize the bias in the final conv
+      layer.
+
+  Returns:
+    net: A keras module which when called on an input tensor of size
+      [batch_size, height, width, num_in_channels] returns an output
+      of size [batch_size, height, width, num_out_channels]
+  """
+
+  out_conv = tf.keras.layers.Conv2D(num_out_channels, kernel_size=1)
+
+  if bias_fill is not None:
+    out_conv.bias_initializer = tf.keras.initializers.constant(bias_fill)
+
+  net = tf.keras.Sequential(
+      [tf.keras.layers.Conv2D(num_filters, kernel_size=kernel_size,
+                              padding='same'),
+       tf.keras.layers.ReLU(),
+       out_conv]
+  )
+
+  return net
+
+
+def _to_float32(x):
+  return tf.cast(x, tf.float32)
+
+
+def _get_shape(tensor, num_dims):
+  tf.Assert(tensor.get_shape().ndims == num_dims, [tensor])
+  return shape_utils.combined_static_and_dynamic_shape(tensor)
+
+
+def _flatten_spatial_dimensions(batch_images):
+  batch_size, height, width, channels = _get_shape(batch_images, 4)
+  return tf.reshape(batch_images, [batch_size, height * width,
+                                   channels])
+
+
+def top_k_feature_map_locations(feature_map, max_pool_kernel_size=3, k=100,
+                                per_channel=False):
+  """Returns the top k scores and their locations in a feature map.
+
+  Given a feature map, the top k values (based on activation) are returned. If
+  `per_channel` is True, the top k values **per channel** are returned.
+
+  The `max_pool_kernel_size` argument allows for selecting local peaks in a
+  region. This filtering is done per channel, so nothing prevents two values at
+  the same location from being returned.
+
+  Args:
+    feature_map: [batch, height, width, channels] float32 feature map.
+    max_pool_kernel_size: integer, the max pool kernel size to use to pull off
+      peak score locations in a neighborhood (independently for each channel).
+      For example, to make sure no two neighboring values (in the same channel)
+      are returned, set max_pool_kernel_size=3. If None or 1, will not apply max
+      pooling.
+    k: The number of highest scoring locations to return.
+    per_channel: If True, will return the top k scores and locations per
+      feature map channel. If False, the top k across the entire feature map
+      (height x width x channels) are returned.
+
+  Returns:
+    Tuple of
+    scores: A [batch, N] float32 tensor with scores from the feature map in
+      descending order. If per_channel is False, N = k. Otherwise,
+      N = k * channels, and the first k elements correspond to channel 0, the
+      second k correspond to channel 1, etc.
+    y_indices: A [batch, N] int tensor with y indices of the top k feature map
+      locations. If per_channel is False, N = k. Otherwise,
+      N = k * channels.
+    x_indices: A [batch, N] int tensor with x indices of the top k feature map
+      locations. If per_channel is False, N = k. Otherwise,
+      N = k * channels.
+    channel_indices: A [batch, N] int tensor with channel indices of the top k
+      feature map locations. If per_channel is False, N = k. Otherwise,
+      N = k * channels.
+  """
+  if not max_pool_kernel_size or max_pool_kernel_size == 1:
+    feature_map_peaks = feature_map
+  else:
+    feature_map_max_pool = tf.nn.max_pool(
+        feature_map, ksize=max_pool_kernel_size, strides=1, padding='SAME')
+
+    feature_map_peak_mask = tf.math.abs(
+        feature_map - feature_map_max_pool) < PEAK_EPSILON
+
+    # Zero out everything that is not a peak.
+    feature_map_peaks = (
+        feature_map * _to_float32(feature_map_peak_mask))
+
+  batch_size, _, width, num_channels = _get_shape(feature_map, 4)
+
+  if per_channel:
+    # Perform top k over batch and channels.
+    feature_map_peaks_transposed = tf.transpose(feature_map_peaks,
+                                                perm=[0, 3, 1, 2])
+    feature_map_peaks_transposed = tf.reshape(
+        feature_map_peaks_transposed, [batch_size, num_channels, -1])
+    scores, peak_flat_indices = tf.math.top_k(feature_map_peaks_transposed, k=k)
+    # Convert the indices such that they represent the location in the full
+    # (flattened) feature map of size [batch, height * width * channels].
+    channel_idx = tf.range(num_channels)[tf.newaxis, :, tf.newaxis]
+    peak_flat_indices = num_channels * peak_flat_indices + channel_idx
+    scores = tf.reshape(scores, [batch_size, -1])
+    peak_flat_indices = tf.reshape(peak_flat_indices, [batch_size, -1])
+  else:
+    feature_map_peaks_flat = tf.reshape(feature_map_peaks, [batch_size, -1])
+    scores, peak_flat_indices = tf.math.top_k(feature_map_peaks_flat, k=k)
+
+  # Get x, y and channel indices corresponding to the top indices in the flat
+  # array.
+  y_indices, x_indices, channel_indices = (
+      row_col_channel_indices_from_flattened_indices(
+          peak_flat_indices, width, num_channels))
+  return scores, y_indices, x_indices, channel_indices
+
+
+def prediction_tensors_to_boxes(detection_scores, y_indices, x_indices,
+                                channel_indices, height_width_predictions,
+                                offset_predictions):
+  """Converts CenterNet class-center, offset and size predictions to boxes.
+
+  Args:
+    detection_scores: A [batch, num_boxes] float32 tensor with detection
+      scores in range [0, 1].
+    y_indices: A [batch, num_boxes] int32 tensor with y indices corresponding to
+      object center locations (expressed in output coordinate frame).
+    x_indices: A [batch, num_boxes] int32 tensor with x indices corresponding to
+      object center locations (expressed in output coordinate frame).
+    channel_indices: A [batch, num_boxes] int32 tensor with channel indices
+      corresponding to object classes.
+    height_width_predictions: A float tensor of shape [batch_size, height,
+      width, 2] representing the height and width of a box centered at each
+      pixel.
+    offset_predictions: A float tensor of shape [batch_size, height, width, 2]
+      representing the y and x offsets of a box centered at each pixel. This
+      helps reduce the error from downsampling.
+
+  Returns:
+    detection_boxes: A tensor of shape [batch_size, num_boxes, 4] holding the
+      raw bounding box coordinates of boxes.
+    detection_classes: An integer tensor of shape [batch_size, num_boxes]
+      indicating the predicted class for each box.
+    detection_scores: A float tensor of shape [batch_size, num_boxes] indicating
+      the score for each box.
+    num_detections: An integer tensor of shape [batch_size,] indicating the
+      number of boxes detected for each sample in the batch.
+
+  """
+  _, _, width, _ = _get_shape(height_width_predictions, 4)
+
+  peak_spatial_indices = flattened_indices_from_row_col_indices(
+      y_indices, x_indices, width)
+  y_indices = _to_float32(y_indices)
+  x_indices = _to_float32(x_indices)
+
+  height_width_flat = _flatten_spatial_dimensions(height_width_predictions)
+  offsets_flat = _flatten_spatial_dimensions(offset_predictions)
+
+  height_width = tf.gather(height_width_flat, peak_spatial_indices,
+                           batch_dims=1)
+  offsets = tf.gather(offsets_flat, peak_spatial_indices, batch_dims=1)
+
+  heights, widths = tf.unstack(height_width, axis=2)
+  y_offsets, x_offsets = tf.unstack(offsets, axis=2)
+
+  detection_classes = channel_indices
+
+  num_detections = tf.reduce_sum(tf.to_int32(detection_scores > 0), axis=1)
+
+  boxes = tf.stack([y_indices + y_offsets - heights / 2.0,
+                    x_indices + x_offsets - widths / 2.0,
+                    y_indices + y_offsets + heights / 2.0,
+                    x_indices + x_offsets + widths / 2.0], axis=2)
+
+  return boxes, detection_classes, detection_scores, num_detections
+
+
+def prediction_tensors_to_keypoint_candidates(
+    keypoint_heatmap_predictions,
+    keypoint_heatmap_offsets,
+    keypoint_score_threshold=0.1,
+    max_pool_kernel_size=1,
+    max_candidates=20):
+  """Convert keypoint heatmap predictions and offsets to keypoint candidates.
+
+  Args:
+    keypoint_heatmap_predictions: A float tensor of shape [batch_size, height,
+      width, num_keypoints] representing the per-keypoint heatmaps.
+    keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
+      width, 2] (or [batch_size, height, width, 2 * num_keypoints] if
+      'per_keypoint_offset' is set True) representing the per-keypoint offsets.
+    keypoint_score_threshold: float, the threshold for considering a keypoint
+      a candidate.
+    max_pool_kernel_size: integer, the max pool kernel size to use to pull off
+      peak score locations in a neighborhood. For example, to make sure no two
+      neighboring values for the same keypoint are returned, set
+      max_pool_kernel_size=3. If None or 1, will not apply any local filtering.
+    max_candidates: integer, maximum number of keypoint candidates per
+      keypoint type.
+
+  Returns:
+    keypoint_candidates: A tensor of shape
+      [batch_size, max_candidates, num_keypoints, 2] holding the
+      location of keypoint candidates in [y, x] format (expressed in absolute
+      coordinates in the output coordinate frame).
+    keypoint_scores: A float tensor of shape
+      [batch_size, max_candidates, num_keypoints] with the scores for each
+      keypoint candidate. The scores come directly from the heatmap predictions.
+    num_keypoint_candidates: An integer tensor of shape
+      [batch_size, num_keypoints] with the number of candidates for each
+      keypoint type, as it's possible to filter some candidates due to the score
+      threshold.
+  """
+  batch_size, _, width, num_keypoints = _get_shape(
+      keypoint_heatmap_predictions, 4)
+  # Get x, y and channel indices corresponding to the top indices in the
+  # keypoint heatmap predictions.
+  # Note that the top k candidates are produced for **each keypoint type**.
+  # Might be worth eventually trying top k in the feature map, independent of
+  # the keypoint type.
+  keypoint_scores, y_indices, x_indices, channel_indices = (
+      top_k_feature_map_locations(keypoint_heatmap_predictions,
+                                  max_pool_kernel_size=max_pool_kernel_size,
+                                  k=max_candidates,
+                                  per_channel=True))
+
+  peak_spatial_indices = flattened_indices_from_row_col_indices(
+      y_indices, x_indices, width)
+  y_indices = _to_float32(y_indices)
+  x_indices = _to_float32(x_indices)
+
+  offsets_flat = _flatten_spatial_dimensions(keypoint_heatmap_offsets)
+
+  selected_offsets = tf.gather(offsets_flat, peak_spatial_indices, batch_dims=1)
+  _, num_indices, num_channels = _get_shape(selected_offsets, 3)
+  if num_channels > 2:
+    reshaped_offsets = tf.reshape(selected_offsets,
+                                  [batch_size, num_indices, -1, 2])
+    offsets = tf.gather(reshaped_offsets, channel_indices, batch_dims=2)
+  else:
+    offsets = selected_offsets
+  y_offsets, x_offsets = tf.unstack(offsets, axis=2)
+
+  keypoint_candidates = tf.stack([y_indices + y_offsets,
+                                  x_indices + x_offsets], axis=2)
+  keypoint_candidates = tf.reshape(
+      keypoint_candidates,
+      [batch_size, num_keypoints, max_candidates, 2])
+  keypoint_candidates = tf.transpose(keypoint_candidates, [0, 2, 1, 3])
+  keypoint_scores = tf.reshape(
+      keypoint_scores,
+      [batch_size, num_keypoints, max_candidates])
+  keypoint_scores = tf.transpose(keypoint_scores, [0, 2, 1])
+  num_candidates = tf.reduce_sum(
+      tf.to_int32(keypoint_scores >= keypoint_score_threshold), axis=1)
+
+  return keypoint_candidates, keypoint_scores, num_candidates
+
+
+def regressed_keypoints_at_object_centers(regressed_keypoint_predictions,
+                                          y_indices, x_indices):
+  """Gathers regressed keypoints at object centers in absolute coordinates.
+
+  The network regresses keypoints as offsets relative to each feature map
+  location. This function picks the offsets predicted at the given object
+  centers and adds the center location to every (y, x) pair, so that the
+  result is expressed in absolute coordinates of the output frame.
+
+  Args:
+    regressed_keypoint_predictions: A float tensor of shape
+      [batch_size, height, width, 2 * num_keypoints] holding regressed
+      keypoints. The last dimension is ordered as
+      [y0, x0, y1, x1, ..., y{J-1}, x{J-1}] where J is the number of keypoints.
+    y_indices: A [batch, num_instances] int tensor holding y indices for object
+      centers. These indices correspond to locations in the output feature map.
+    x_indices: A [batch, num_instances] int tensor holding x indices for object
+      centers. These indices correspond to locations in the output feature map.
+
+  Returns:
+    A float tensor of shape [batch_size, num_instances, 2 * num_keypoints]
+    with the keypoints gathered at the provided centers, converted to absolute
+    coordinates in the output coordinate frame.
+  """
+  batch_size, _, width, _ = _get_shape(regressed_keypoint_predictions, 4)
+  center_flat_inds = flattened_indices_from_row_col_indices(
+      y_indices, x_indices, width)
+  _, num_instances = _get_shape(center_flat_inds, 2)
+
+  predictions_flat = _flatten_spatial_dimensions(
+      regressed_keypoint_predictions)
+
+  # Offsets predicted at each center, split into (y, x) pairs.
+  # Shape [batch_size, num_instances, num_keypoints, 2].
+  offsets_yx = tf.reshape(
+      tf.gather(predictions_flat, center_flat_inds, batch_dims=1),
+      [batch_size, num_instances, -1, 2])
+
+  # Center locations as (y, x) pairs with a singleton keypoint axis so they
+  # broadcast over all keypoints. Shape [batch_size, num_instances, 1, 2].
+  centers_yx = tf.expand_dims(
+      tf.stack([_to_float32(y_indices), _to_float32(x_indices)], axis=-1),
+      axis=2)
+
+  absolute_yx = centers_yx + offsets_yx
+  return tf.reshape(absolute_yx, [batch_size, num_instances, -1])
+
+
+def refine_keypoints(regressed_keypoints, keypoint_candidates, keypoint_scores,
+                     num_keypoint_candidates, bboxes=None,
+                     unmatched_keypoint_score=0.1, box_scale=1.2,
+                     candidate_search_scale=0.3,
+                     candidate_ranking_mode='min_distance'):
+  """Refines regressed keypoints by snapping to the nearest candidate keypoints.
+
+  The initial regressed keypoints represent a full set of keypoints regressed
+  from the centers of the objects. The keypoint candidates are estimated
+  independently from heatmaps, and are not associated with any object instances.
+  This function refines the regressed keypoints by "snapping" to the
+  nearest/highest score-distance ratio (depending on the
+  candidate_ranking_mode) candidate of the same keypoint type (e.g. "nose").
+  If no candidates are nearby, the regressed keypoint remains unchanged.
+
+  In order to snap a regressed keypoint to a candidate keypoint, the following
+  must be satisfied:
+  - the candidate keypoint must be of the same type as the regressed keypoint
+  - the candidate keypoint must not lie outside the predicted box (or the
+    box which encloses the regressed keypoints for the instance if `bboxes` is
+    not provided). Note that the box is scaled by `box_scale` in height and
+    width, to provide some margin around the keypoints
+  - the candidate keypoint score must not be lower than
+    `unmatched_keypoint_score`
+  - the distance to the closest candidate keypoint cannot exceed
+    candidate_search_scale * max(height, width), where height and width refer to
+    the (scaled) bounding box for the instance.
+
+  Note that the same candidate keypoint is allowed to snap to regressed
+  keypoints in different instances.
+
+  Args:
+    regressed_keypoints: A float tensor of shape
+      [batch_size, num_instances, num_keypoints, 2] with the initial regressed
+      keypoints.
+    keypoint_candidates: A tensor of shape
+      [batch_size, max_candidates, num_keypoints, 2] holding the location of
+      keypoint candidates in [y, x] format (expressed in absolute coordinates in
+      the output coordinate frame).
+    keypoint_scores: A float tensor of shape
+      [batch_size, max_candidates, num_keypoints] indicating the scores for
+      keypoint candidates.
+    num_keypoint_candidates: An integer tensor of shape
+      [batch_size, num_keypoints] indicating the number of valid candidates for
+      each keypoint type, as there may be padding (dim 1) of
+      `keypoint_candidates` and `keypoint_scores`.
+    bboxes: A tensor of shape [batch_size, num_instances, 4] with predicted
+      bounding boxes for each instance, expressed in the output coordinate
+      frame. If not provided, boxes will be computed from regressed keypoints.
+    unmatched_keypoint_score: float, the default score to use for regressed
+      keypoints that are not successfully snapped to a nearby candidate.
+    box_scale: float, the multiplier to expand the bounding boxes (either the
+      provided boxes or those which tightly cover the regressed keypoints) for
+      an instance. This scale is typically larger than 1.0 when not providing
+      `bboxes`.
+    candidate_search_scale: float, the scale parameter that multiplies the
+      largest dimension of a bounding box. The resulting distance becomes a
+      search radius for candidates in the vicinity of each regressed keypoint.
+    candidate_ranking_mode: A string as one of ['min_distance',
+      'score_distance_ratio'] indicating how to select the candidate. If an
+      invalid value is provided, a ValueError will be raised.
+
+  Returns:
+    A tuple with:
+    refined_keypoints: A float tensor of shape
+      [batch_size, num_instances, num_keypoints, 2] with the final, refined
+      keypoints.
+    refined_scores: A float tensor of shape
+      [batch_size, num_instances, num_keypoints] with scores associated with all
+      instances and keypoints in `refined_keypoints`.
+
+  Raises:
+    ValueError: if provided candidate_ranking_mode is not one of
+      ['min_distance', 'score_distance_ratio']
+  """
+  batch_size, num_instances, num_keypoints, _ = (
+      shape_utils.combined_static_and_dynamic_shape(regressed_keypoints))
+  # Resolve max_candidates the same way as the other dimensions so that the
+  # function also works when the candidate dimension is not statically known.
+  _, max_candidates, _, _ = (
+      shape_utils.combined_static_and_dynamic_shape(keypoint_candidates))
+
+  # Replace all invalid (i.e. padded) keypoint candidates with NaN.
+  # This will prevent them from being considered.
+  range_tiled = tf.tile(
+      tf.reshape(tf.range(max_candidates), [1, max_candidates, 1]),
+      [batch_size, 1, num_keypoints])
+  num_candidates_tiled = tf.tile(tf.expand_dims(num_keypoint_candidates, 1),
+                                 [1, max_candidates, 1])
+  invalid_candidates = range_tiled >= num_candidates_tiled
+  nan_mask = tf.where(
+      invalid_candidates,
+      np.nan * tf.ones_like(invalid_candidates, dtype=tf.float32),
+      tf.ones_like(invalid_candidates, dtype=tf.float32))
+  keypoint_candidates_with_nans = tf.math.multiply(
+      keypoint_candidates, tf.expand_dims(nan_mask, -1))
+
+  # Pairwise distances between regressed keypoints and candidate keypoints
+  # (for a single keypoint type).
+  # Shape [batch_size, num_instances, max_candidates, num_keypoints].
+  regressed_keypoint_expanded = tf.expand_dims(regressed_keypoints,
+                                               axis=2)
+  keypoint_candidates_expanded = tf.expand_dims(
+      keypoint_candidates_with_nans, axis=1)
+  sqrd_distances = tf.math.reduce_sum(
+      tf.math.squared_difference(regressed_keypoint_expanded,
+                                 keypoint_candidates_expanded),
+      axis=-1)
+  distances = tf.math.sqrt(sqrd_distances)
+
+  # Determine the candidates that have the minimum distance to the regressed
+  # keypoints. Shape [batch_size, num_instances, num_keypoints].
+  # NOTE: `min_distances` is the distance to the *nearest* candidate in both
+  # ranking modes; in 'score_distance_ratio' mode the selected candidate may be
+  # a different one, but the search-radius test below still uses this value.
+  min_distances = tf.math.reduce_min(distances, axis=2)
+  if candidate_ranking_mode == 'min_distance':
+    nearby_candidate_inds = tf.math.argmin(distances, axis=2)
+  elif candidate_ranking_mode == 'score_distance_ratio':
+    # tiled_keypoint_scores:
+    # Shape [batch_size, num_instances, max_candidates, num_keypoints].
+    tiled_keypoint_scores = tf.tile(
+        tf.expand_dims(keypoint_scores, axis=1),
+        multiples=[1, num_instances, 1, 1])
+    # The small epsilon avoids a division by zero when a candidate coincides
+    # with the regressed keypoint.
+    ranking_scores = tiled_keypoint_scores / (distances + 1e-6)
+    nearby_candidate_inds = tf.math.argmax(ranking_scores, axis=2)
+  else:
+    raise ValueError('Not recognized candidate_ranking_mode: %s' %
+                     candidate_ranking_mode)
+
+  # Gather the coordinates and scores corresponding to the closest candidates.
+  # Shape of tensors are [batch_size, num_instances, num_keypoints, 2] and
+  # [batch_size, num_instances, num_keypoints], respectively.
+  # The original (NaN-free) candidates are used here so that gathered
+  # coordinates are always finite.
+  nearby_candidate_coords, nearby_candidate_scores = (
+      _gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
+                                    nearby_candidate_inds))
+
+  if bboxes is None:
+    # Create bboxes from regressed keypoints.
+    # Shape [batch_size * num_instances, 4].
+    regressed_keypoints_flattened = tf.reshape(
+        regressed_keypoints, [-1, num_keypoints, 2])
+    bboxes_flattened = keypoint_ops.keypoints_to_enclosing_bounding_boxes(
+        regressed_keypoints_flattened)
+  else:
+    bboxes_flattened = tf.reshape(bboxes, [-1, 4])
+
+  # Scale the bounding boxes.
+  # Shape [batch_size, num_instances, 4].
+  boxlist = box_list.BoxList(bboxes_flattened)
+  boxlist_scaled = box_list_ops.scale_height_width(
+      boxlist, box_scale, box_scale)
+  bboxes_scaled = boxlist_scaled.get()
+  bboxes = tf.reshape(bboxes_scaled, [batch_size, num_instances, 4])
+
+  # Get ymin, xmin, ymax, xmax bounding box coordinates, tiled per keypoint.
+  # Shape [batch_size, num_instances, num_keypoints].
+  bboxes_tiled = tf.tile(tf.expand_dims(bboxes, 2), [1, 1, num_keypoints, 1])
+  ymin, xmin, ymax, xmax = tf.unstack(bboxes_tiled, axis=3)
+
+  # Produce a mask that indicates whether the original regressed keypoint
+  # should be used instead of a candidate keypoint.
+  # Shape [batch_size, num_instances, num_keypoints].
+  search_radius = (
+      tf.math.maximum(ymax - ymin, xmax - xmin) * candidate_search_scale)
+  mask = (tf.cast(nearby_candidate_coords[:, :, :, 0] < ymin, tf.int32) +
+          tf.cast(nearby_candidate_coords[:, :, :, 0] > ymax, tf.int32) +
+          tf.cast(nearby_candidate_coords[:, :, :, 1] < xmin, tf.int32) +
+          tf.cast(nearby_candidate_coords[:, :, :, 1] > xmax, tf.int32) +
+          # Filter out the chosen candidate with score lower than unmatched
+          # keypoint score.
+          tf.cast(nearby_candidate_scores <
+                  unmatched_keypoint_score, tf.int32) +
+          tf.cast(min_distances > search_radius, tf.int32))
+  mask = mask > 0
+
+  # Create refined keypoints where candidate keypoints replace original
+  # regressed keypoints if they are in the vicinity of the regressed keypoints.
+  # Shape [batch_size, num_instances, num_keypoints, 2].
+  refined_keypoints = tf.where(
+      tf.tile(tf.expand_dims(mask, -1), [1, 1, 1, 2]),
+      regressed_keypoints,
+      nearby_candidate_coords)
+
+  # Update keypoints scores. In the case where we use the original regressed
+  # keypoints, we use a default score of `unmatched_keypoint_score`.
+  # Shape [batch_size, num_instances, num_keypoints].
+  refined_scores = tf.where(
+      mask,
+      unmatched_keypoint_score * tf.ones_like(nearby_candidate_scores),
+      nearby_candidate_scores)
+
+  return refined_keypoints, refined_scores
+
+
+def _pad_to_full_keypoint_dim(keypoint_coords, keypoint_scores, keypoint_inds,
+                              num_total_keypoints):
+  """Expands the keypoint axis of a task's outputs to all model keypoints.
+
+  The coordinates and scores of one keypoint task are scattered into tensors
+  whose keypoint dimension has `num_total_keypoints` entries, at the positions
+  given by `keypoint_inds`.
+
+  Args:
+    keypoint_coords: a [batch_size, num_instances, num_keypoints, 2] float32
+      tensor.
+    keypoint_scores: a [batch_size, num_instances, num_keypoints] float32
+      tensor.
+    keypoint_inds: a list of integers that indicate the keypoint indices for
+      this specific keypoint class. These indices are used to scatter into
+      tensors that have a `num_total_keypoints` dimension.
+    num_total_keypoints: The total number of keypoints that this model predicts.
+
+  Returns:
+    A tuple with
+    keypoint_coords_padded: a
+      [batch_size, num_instances, num_total_keypoints, 2] float32 tensor.
+    keypoint_scores_padded: a [batch_size, num_instances, num_total_keypoints]
+      float32 tensor.
+  """
+  batch_size, num_instances, _, _ = (
+      shape_utils.combined_static_and_dynamic_shape(keypoint_coords))
+
+  # tf.scatter_nd indexes the leading dimension here, so the keypoint axis is
+  # moved to the front, scattered, and moved back afterwards.
+  scatter_inds = tf.expand_dims(keypoint_inds, axis=-1)
+
+  coords_kpt_first = tf.transpose(keypoint_coords, [2, 0, 1, 3])
+  coords_full = tf.scatter_nd(
+      indices=scatter_inds,
+      updates=coords_kpt_first,
+      shape=[num_total_keypoints, batch_size, num_instances, 2])
+  keypoint_coords_padded = tf.transpose(coords_full, [1, 2, 0, 3])
+
+  scores_kpt_first = tf.transpose(keypoint_scores, [2, 0, 1])
+  scores_full = tf.scatter_nd(
+      indices=scatter_inds,
+      updates=scores_kpt_first,
+      shape=[num_total_keypoints, batch_size, num_instances])
+  keypoint_scores_padded = tf.transpose(scores_full, [1, 2, 0])
+
+  return keypoint_coords_padded, keypoint_scores_padded
+
+
+def _pad_to_full_instance_dim(keypoint_coords, keypoint_scores, instance_inds,
+                              max_instances):
+  """Expands the instance axis of keypoint outputs to `max_instances`.
+
+  The coordinates and scores are scattered into tensors whose instance
+  dimension has `max_instances` entries, at the positions given by
+  `instance_inds`.
+
+  Args:
+    keypoint_coords: a [batch_size, num_instances, num_keypoints, 2] float32
+      tensor.
+    keypoint_scores: a [batch_size, num_instances, num_keypoints] float32
+      tensor.
+    instance_inds: a list of integers that indicate the instance indices for
+      these keypoints. These indices are used to scatter into tensors
+      that have a `max_instances` dimension.
+    max_instances: The maximum number of instances detected by the model.
+
+  Returns:
+    A tuple with
+    keypoint_coords_padded: a [batch_size, max_instances, num_keypoints, 2]
+      float32 tensor.
+    keypoint_scores_padded: a [batch_size, max_instances, num_keypoints]
+      float32 tensor.
+  """
+  batch_size, _, num_keypoints, _ = (
+      shape_utils.combined_static_and_dynamic_shape(keypoint_coords))
+
+  # tf.scatter_nd indexes the leading dimension here, so the instance axis is
+  # swapped to the front, scattered, and swapped back afterwards.
+  scatter_inds = tf.expand_dims(instance_inds, axis=-1)
+
+  coords_instance_first = tf.transpose(keypoint_coords, [1, 0, 2, 3])
+  coords_full = tf.scatter_nd(
+      indices=scatter_inds,
+      updates=coords_instance_first,
+      shape=[max_instances, batch_size, num_keypoints, 2])
+  keypoint_coords_padded = tf.transpose(coords_full, [1, 0, 2, 3])
+
+  scores_instance_first = tf.transpose(keypoint_scores, [1, 0, 2])
+  scores_full = tf.scatter_nd(
+      indices=scatter_inds,
+      updates=scores_instance_first,
+      shape=[max_instances, batch_size, num_keypoints])
+  keypoint_scores_padded = tf.transpose(scores_full, [1, 0, 2])
+
+  return keypoint_coords_padded, keypoint_scores_padded
+
+
+def _gather_candidates_at_indices(keypoint_candidates, keypoint_scores,
+                                  indices):
+  """Selects candidate coordinates and scores per keypoint type.
+
+  For every batch element and keypoint type, `indices` picks entries along the
+  candidate dimension of `keypoint_candidates` and `keypoint_scores`.
+
+  Args:
+    keypoint_candidates: a float tensor of shape [batch_size, max_candidates,
+      num_keypoints, 2] with candidate coordinates.
+    keypoint_scores: a float tensor of shape [batch_size, max_candidates,
+      num_keypoints] with keypoint scores.
+    indices: an integer tensor of shape [batch_size, num_indices, num_keypoints]
+      with indices.
+
+  Returns:
+    A tuple with
+    gathered_keypoint_candidates: a float tensor of shape [batch_size,
+      num_indices, num_keypoints, 2] with gathered coordinates.
+    gathered_keypoint_scores: a float tensor of shape [batch_size,
+      num_indices, num_keypoints] with gathered scores.
+  """
+  # tf.gather with batch_dims=2 needs batch and keypoint-type as the two
+  # leading axes, so the keypoint axis is moved in front of the candidate axis.
+  inds_kpt_major = tf.transpose(indices, [0, 2, 1])
+
+  coords_kpt_major = tf.transpose(keypoint_candidates, [0, 2, 1, 3])
+  picked_coords = tf.gather(coords_kpt_major, inds_kpt_major, batch_dims=2)
+
+  scores_kpt_major = tf.transpose(keypoint_scores, [0, 2, 1])
+  picked_scores = tf.gather(scores_kpt_major, inds_kpt_major, batch_dims=2)
+
+  # Restore the [batch, num_indices, num_keypoints, ...] layout.
+  gathered_keypoint_candidates = tf.transpose(picked_coords, [0, 2, 1, 3])
+  gathered_keypoint_scores = tf.transpose(picked_scores, [0, 2, 1])
+  return gathered_keypoint_candidates, gathered_keypoint_scores
+
+
+def flattened_indices_from_row_col_indices(row_indices, col_indices, num_cols):
+  """Converts (row, column) indices to indices into a row-major flat array.
+
+  Args:
+    row_indices: The row indices.
+    col_indices: The column indices.
+    num_cols: Number of columns in each row (width).
+
+  Returns:
+    The flattened indices, computed as row_indices * num_cols + col_indices.
+  """
+  row_starts = row_indices * num_cols
+  return row_starts + col_indices
+
+
+def row_col_channel_indices_from_flattened_indices(indices, num_cols,
+                                                   num_channels):
+  """Splits flattened indices into row, column and channel indices.
+
+  The flattened space is assumed to be laid out with the channel varying
+  fastest, then the column, then the row.
+
+  Args:
+    indices: An integer tensor of any shape holding the indices in the flattened
+      space.
+    num_cols: Number of columns in the image (width).
+    num_channels: Number of channels in the image.
+
+  Returns:
+    row_indices: The row indices corresponding to each of the input indices.
+      Same shape as indices.
+    col_indices: The column indices corresponding to each of the input indices.
+      Same shape as indices.
+    channel_indices: The channel indices corresponding to each of the input
+      indices.
+
+  """
+  # Index of the spatial location once the channel is stripped off.
+  spatial_indices = indices // num_channels
+  channel_indices = indices % num_channels
+  row_indices = spatial_indices // num_cols
+  col_indices = spatial_indices % num_cols
+
+  return row_indices, col_indices, channel_indices
+
+
+def get_valid_anchor_weights_in_flattened_image(true_image_shapes, height,
+                                                width):
+  """Builds per-location weights that mask out padded regions of each image.
+
+  This is useful when the loss should only penalize the valid (unpadded) area
+  of each image. The weights are laid out for a loss that is applied after the
+  spatial dimensions have been flattened.
+
+  Args:
+    true_image_shapes: An integer tensor of shape [batch_size, 3] representing
+      the true image shape (without padding) for each sample in the batch.
+    height: height of the prediction from the network.
+    width: width of the prediction from the network.
+
+  Returns:
+    valid_anchor_weights: a float tensor of shape [batch_size, height * width]
+    with 1s in locations where the spatial coordinates fall within the height
+    and width in true_image_shapes.
+  """
+
+  # Flat position of every location, repeated for each image in the batch.
+  flat_positions = tf.reshape(tf.range(height * width), [1, -1])
+  num_images = tf.shape(true_image_shapes)[0]
+  positions_per_image = (
+      tf.ones((num_images, 1), dtype=tf.int32) * flat_positions)
+
+  rows, cols, _ = row_col_channel_indices_from_flattened_indices(
+      positions_per_image, width, 1)
+  rows = _to_float32(rows)
+  cols = _to_float32(cols)
+
+  # Per-image limits with a trailing axis so they broadcast over locations.
+  valid_heights = _to_float32(tf.expand_dims(true_image_shapes[:, 0], 1))
+  valid_widths = _to_float32(tf.expand_dims(true_image_shapes[:, 1], 1))
+
+  inside_image = tf.math.logical_and(cols < valid_widths, rows < valid_heights)
+
+  return _to_float32(inside_image)
+
+
+def convert_strided_predictions_to_normalized_boxes(boxes, stride,
+                                                    true_image_shapes):
+  """Maps boxes from the model's output space to normalized coordinates.
+
+  Each image's boxes are scaled by `stride`, normalized by that image's true
+  height and width, and then clipped to the [0, 1] window, so boxes falling
+  outside the valid image boundary end up on the boundary.
+
+  Args:
+    boxes: A tensor of shape [batch_size, num_boxes, 4] holding the raw
+     coordinates of boxes in the model's output space.
+    stride: The stride in the output space.
+    true_image_shapes: A tensor of shape [batch_size, 3] representing the true
+      shape of the input not considering padding.
+
+  Returns:
+    boxes: A tensor of shape [batch_size, num_boxes, 4] representing the
+      coordinates of the normalized boxes.
+  """
+
+  def _scale_normalize_and_clip(boxlist, true_height, true_width):
+    """Converts the boxes of a single image."""
+    boxlist = box_list_ops.scale(boxlist, stride, stride)
+    boxlist = box_list_ops.to_normalized_coordinates(
+        boxlist, true_height, true_width)
+    return box_list_ops.clip_to_window(
+        boxlist, [0., 0., 1., 1.], filter_nonoverlapping=False)
+
+  true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
+  per_image_inputs = zip(
+      tf.unstack(boxes, axis=0),
+      tf.unstack(true_heights, axis=0),
+      tf.unstack(true_widths, axis=0))
+
+  normalized_boxes = [
+      _scale_normalize_and_clip(
+          box_list.BoxList(image_boxes), image_height, image_width).get()
+      for image_boxes, image_height, image_width in per_image_inputs
+  ]
+
+  return tf.stack(normalized_boxes, axis=0)
+
+
+def convert_strided_predictions_to_normalized_keypoints(
+    keypoint_coords, keypoint_scores, stride, true_image_shapes,
+    clip_out_of_frame_keypoints=False):
+  """Maps keypoints from the model's output space to normalized coordinates.
+
+  With clip_out_of_frame_keypoints=False, keypoints outside the valid image
+  boundary are normalized but left where they are. With
+  clip_out_of_frame_keypoints=True, such keypoints are clipped to the [0, 1]
+  window and their scores are set to 0.0.
+
+  Args:
+    keypoint_coords: A tensor of shape
+      [batch_size, num_instances, num_keypoints, 2] holding the raw coordinates
+      of keypoints in the model's output space.
+    keypoint_scores: A tensor of shape
+      [batch_size, num_instances, num_keypoints] holding the keypoint scores.
+    stride: The stride in the output space.
+    true_image_shapes: A tensor of shape [batch_size, 3] representing the true
+      shape of the input not considering padding.
+    clip_out_of_frame_keypoints: A boolean indicating whether keypoints outside
+      the image boundary should be clipped. If True, keypoint coords will be
+      clipped to image boundary. If False, keypoints are normalized but not
+      filtered based on their location.
+
+  Returns:
+    keypoint_coords_normalized: A tensor of shape
+      [batch_size, num_instances, num_keypoints, 2] representing the coordinates
+      of the normalized keypoints.
+    keypoint_scores: A tensor of shape
+      [batch_size, num_instances, num_keypoints] representing the updated
+      keypoint scores.
+  """
+  batch_size, _, _, _ = (
+      shape_utils.combined_static_and_dynamic_shape(keypoint_coords))
+
+  # Per-image (y, x) factors that take output-space coordinates to coordinates
+  # normalized by the true image size.
+  true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
+  stride_float = float(stride)
+  per_image_scale = tf.stack(
+      [stride_float / tf.cast(true_heights, tf.float32),
+       stride_float / tf.cast(true_widths, tf.float32)],
+      axis=1)
+  keypoint_coords_normalized = keypoint_coords * tf.reshape(
+      per_image_scale, [batch_size, 1, 1, 2])
+
+  if not clip_out_of_frame_keypoints:
+    return keypoint_coords_normalized, keypoint_scores
+
+  # Record which keypoints lie inside the true image before clipping moves the
+  # others onto the boundary.
+  y_normalized = keypoint_coords_normalized[:, :, :, 0]
+  x_normalized = keypoint_coords_normalized[:, :, :, 1]
+  y_in_frame = tf.logical_and(y_normalized >= 0.0, y_normalized <= 1.0)
+  x_in_frame = tf.logical_and(x_normalized >= 0.0, x_normalized <= 1.0)
+  in_frame = tf.logical_and(y_in_frame, x_in_frame)
+
+  unit_windows = tf.tile(
+      tf.constant([[0.0, 0.0, 1.0, 1.0]], dtype=tf.float32),
+      multiples=[batch_size, 1])
+
+  def _clip_single_image(inputs):
+    image_keypoints, image_window = inputs
+    return keypoint_ops.clip_to_window(image_keypoints, image_window)
+
+  keypoint_coords_normalized = tf.map_fn(
+      _clip_single_image, (keypoint_coords_normalized, unit_windows),
+      dtype=tf.float32, back_prop=False)
+  keypoint_scores = tf.where(in_frame, keypoint_scores,
+                             tf.zeros_like(keypoint_scores))
+  return keypoint_coords_normalized, keypoint_scores
+
+
+def convert_strided_predictions_to_instance_masks(
+    boxes, classes, masks, stride, mask_height, mask_width,
+    true_image_shapes, score_threshold=0.5):
+  """Turns full-image class masks into fixed-size per-instance masks.
+
+  For each predicted detection box:
+    * Crop and resize the predicted mask based on the detected bounding box
+      coordinates and class prediction. Uses bilinear resampling.
+    * Binarize the mask using the provided score threshold.
+
+  Args:
+    boxes: A tensor of shape [batch, max_detections, 4] holding the predicted
+      boxes, in normalized coordinates (relative to the true image dimensions).
+    classes: An integer tensor of shape [batch, max_detections] containing the
+      detected class for each box (0-indexed).
+    masks: A [batch, output_height, output_width, num_classes] float32
+      tensor with class probabilities.
+    stride: The stride in the output space.
+    mask_height: The desired resized height for instance masks.
+    mask_width: The desired resized width for instance masks.
+    true_image_shapes: A tensor of shape [batch, 3] representing the true
+      shape of the inputs not considering padding.
+    score_threshold: The threshold at which to convert predicted mask
+       into foreground pixels.
+
+  Returns:
+    A [batch_size, max_detections, mask_height, mask_width] uint8 tensor with
+    predicted foreground mask for each instance. The masks take values in
+    {0, 1}.
+  """
+  _, output_height, output_width, _ = (
+      shape_utils.combined_static_and_dynamic_shape(masks))
+  input_height = stride * output_height
+  input_width = stride * output_width
+
+  def crop_and_threshold_masks(args):
+    """Crops, resizes and binarizes the masks of a single image."""
+    image_boxes, image_classes, image_masks, true_height, true_width = args
+    # Boxes are normalized relative to the true image shape. Rescale them to
+    # be normalized relative to the (possibly padded) input image, since the
+    # masks may still contain padding.
+    scaled_boxlist = box_list_ops.scale(
+        box_list.BoxList(image_boxes),
+        true_height / input_height,
+        true_width / input_width)
+    # Convert masks from [output_height, output_width, num_classes] to
+    # [num_classes, output_height, output_width, 1] so that every class acts
+    # as a separate single-channel image selected through `box_indices`.
+    class_major_masks = tf.expand_dims(
+        tf.transpose(image_masks, perm=[2, 0, 1]), axis=-1)
+    cropped = tf2.image.crop_and_resize(
+        class_major_masks,
+        boxes=scaled_boxlist.get(),
+        box_indices=image_classes,
+        crop_size=[mask_height, mask_width],
+        method='bilinear')
+    is_foreground = tf.math.greater_equal(
+        tf.squeeze(cropped, axis=3), score_threshold)
+    return tf.cast(is_foreground, tf.uint8)
+
+  true_heights, true_widths, _ = tf.unstack(true_image_shapes, axis=1)
+  per_image_masks = shape_utils.static_or_dynamic_map_fn(
+      crop_and_threshold_masks,
+      elems=[boxes, classes, masks, true_heights, true_widths],
+      dtype=tf.uint8,
+      back_prop=False)
+  return tf.stack(per_image_masks, axis=0)
+
+
+class ObjectDetectionParams(
+    collections.namedtuple('ObjectDetectionParams', [
+        'localization_loss', 'scale_loss_weight', 'offset_loss_weight',
+        'task_loss_weight'
+    ])):
+  """Immutable bundle of the object detection task's settings.
+
+  Groups the loss function and the loss weights used by the object detection
+  task. Being a namedtuple, an instance cannot be modified once constructed.
+  See `__new__` for a description of each field.
+  """
+
+  __slots__ = ()
+
+  def __new__(cls,
+              localization_loss,
+              scale_loss_weight,
+              offset_loss_weight,
+              task_loss_weight=1.0):
+    """Creates an ObjectDetectionParams, filling in default values.
+
+    Args:
+      localization_loss: a object_detection.core.losses.Loss object to compute
+        the loss for the center offset and height/width predictions in
+        CenterNet.
+      scale_loss_weight: float, The weight for localizing box size. Note that
+        the scale loss is dependent on the input image size, since we penalize
+        the raw height and width. This constant may need to be adjusted
+        depending on the input size.
+      offset_loss_weight: float, The weight for localizing center offsets.
+      task_loss_weight: float, the weight of the object detection loss.
+
+    Returns:
+      An initialized ObjectDetectionParams namedtuple.
+    """
+    namedtuple_base = super(ObjectDetectionParams, cls)
+    return namedtuple_base.__new__(
+        cls,
+        localization_loss=localization_loss,
+        scale_loss_weight=scale_loss_weight,
+        offset_loss_weight=offset_loss_weight,
+        task_loss_weight=task_loss_weight)
+
+
+class KeypointEstimationParams(
+    collections.namedtuple('KeypointEstimationParams', [
+        'task_name',
+        'class_id',
+        'keypoint_indices',
+        'classification_loss',
+        'localization_loss',
+        'keypoint_labels',
+        'keypoint_std_dev',
+        'keypoint_heatmap_loss_weight',
+        'keypoint_offset_loss_weight',
+        'keypoint_regression_loss_weight',
+        'keypoint_candidate_score_threshold',
+        'heatmap_bias_init',
+        'num_candidates_per_keypoint',
+        'task_loss_weight',
+        'peak_max_pool_kernel_size',
+        'unmatched_keypoint_score',
+        'box_scale',
+        'candidate_search_scale',
+        'candidate_ranking_mode',
+        'offset_peak_radius',
+        'per_keypoint_offset',
+    ])):
+  """Namedtuple to host keypoint estimation related parameters.
+
+  Bundles the hyper-parameters and loss functions required by a single
+  keypoint estimation task. Instances are immutable once constructed. Each
+  field is described in the docstring of the __new__ function.
+  """
+
+  __slots__ = ()
+
+  def __new__(cls,
+              task_name,
+              class_id,
+              keypoint_indices,
+              classification_loss,
+              localization_loss,
+              keypoint_labels=None,
+              keypoint_std_dev=None,
+              keypoint_heatmap_loss_weight=1.0,
+              keypoint_offset_loss_weight=1.0,
+              keypoint_regression_loss_weight=1.0,
+              keypoint_candidate_score_threshold=0.1,
+              heatmap_bias_init=-2.19,
+              num_candidates_per_keypoint=100,
+              task_loss_weight=1.0,
+              peak_max_pool_kernel_size=3,
+              unmatched_keypoint_score=0.1,
+              box_scale=1.2,
+              candidate_search_scale=0.3,
+              candidate_ranking_mode='min_distance',
+              offset_peak_radius=0,
+              per_keypoint_offset=False):
+    """Creates a KeypointEstimationParams, filling in default values.
+
+    Args:
+      task_name: string, name of the task described by this namedtuple. It
+        should uniquely identify the task.
+      class_id: int, ID of the object class whose keypoints are considered in
+        this task (e.g. the "human" class for human pose estimation). The ID
+        is 0-based: class 0 is the first non-background object class.
+      keypoint_indices: A list of integers with the indices of the keypoints
+        handled by this task. They select the relevant subset of keypoints
+        from gt_keypoints.
+      classification_loss: an object_detection.core.losses.Loss object used to
+        compute the loss for the class predictions in CenterNet.
+      localization_loss: an object_detection.core.losses.Loss object used to
+        compute the loss for the center offset and height/width predictions in
+        CenterNet.
+      keypoint_labels: A list of strings with the label text of each keypoint,
+        e.g. "nose", "left_shoulder". Its length should equal the length of
+        keypoint_indices.
+      keypoint_std_dev: A list of floats giving the standard deviation of the
+        Gaussian kernel used to generate the keypoint heatmap, which allows a
+        different kernel size for each keypoint class.
+      keypoint_heatmap_loss_weight: float, weight of the keypoint heatmap
+        loss.
+      keypoint_offset_loss_weight: float, weight of the keypoint offset loss.
+      keypoint_regression_loss_weight: float, weight of the keypoint
+        regression loss. The loss penalizes raw height and width and thus
+        depends on the input image size, so this constant may need adjusting
+        for the input size.
+      keypoint_candidate_score_threshold: float, heatmap score threshold above
+        which a keypoint becomes a valid candidate.
+      heatmap_bias_init: float, initial bias value in the convolutional kernel
+        of the class prediction head. If None, the bias is initialized with
+        zeros.
+      num_candidates_per_keypoint: Maximum number of candidates to retrieve
+        for each keypoint.
+      task_loss_weight: float, weight of the keypoint estimation loss.
+      peak_max_pool_kernel_size: Max pool kernel size used to pull off peak
+        score locations in a neighborhood (independently for each keypoint
+        type).
+      unmatched_keypoint_score: Default score for regressed keypoints that are
+        not successfully snapped to a nearby candidate.
+      box_scale: Multiplier used to expand the bounding boxes (either the
+        provided boxes or those tightly covering the regressed keypoints).
+      candidate_search_scale: Scale that multiplies the largest dimension of a
+        bounding box; the resulting distance is the search radius for
+        candidates around each regressed keypoint.
+      candidate_ranking_mode: One of ['min_distance', 'score_distance_ratio'],
+        selecting how the keypoint candidate is chosen.
+      offset_peak_radius: Radius (in output pixels) around the groundtruth
+        heatmap peak in which offset targets are assigned. With 0 the offset
+        target is only assigned at the heatmap peak (same behavior as the
+        original paper).
+      per_keypoint_offset: A bool indicating whether offsets are assigned for
+        each keypoint channel separately. If False, the output offset target
+        has shape [batch_size, out_height, out_width, 2] (same behavior as the
+        original paper). If True, it has shape
+        [batch_size, out_height, out_width, 2 * num_keypoints] (recommended
+        when offset_peak_radius is not zero).
+
+    Returns:
+      An initialized KeypointEstimationParams namedtuple.
+    """
+    return super(KeypointEstimationParams, cls).__new__(
+        cls,
+        task_name=task_name,
+        class_id=class_id,
+        keypoint_indices=keypoint_indices,
+        classification_loss=classification_loss,
+        localization_loss=localization_loss,
+        keypoint_labels=keypoint_labels,
+        keypoint_std_dev=keypoint_std_dev,
+        keypoint_heatmap_loss_weight=keypoint_heatmap_loss_weight,
+        keypoint_offset_loss_weight=keypoint_offset_loss_weight,
+        keypoint_regression_loss_weight=keypoint_regression_loss_weight,
+        keypoint_candidate_score_threshold=keypoint_candidate_score_threshold,
+        heatmap_bias_init=heatmap_bias_init,
+        num_candidates_per_keypoint=num_candidates_per_keypoint,
+        task_loss_weight=task_loss_weight,
+        peak_max_pool_kernel_size=peak_max_pool_kernel_size,
+        unmatched_keypoint_score=unmatched_keypoint_score,
+        box_scale=box_scale,
+        candidate_search_scale=candidate_search_scale,
+        candidate_ranking_mode=candidate_ranking_mode,
+        offset_peak_radius=offset_peak_radius,
+        per_keypoint_offset=per_keypoint_offset)
+
+
+class ObjectCenterParams(
+    collections.namedtuple('ObjectCenterParams', [
+        'classification_loss', 'object_center_loss_weight', 'heatmap_bias_init',
+        'min_box_overlap_iou', 'max_box_predictions', 'use_only_known_classes'
+    ])):
+  """Namedtuple to store object center prediction related parameters.
+
+  Note that the constructor keyword `use_labeled_classes` is stored in the
+  namedtuple field `use_only_known_classes`. A read-only `use_labeled_classes`
+  property exposes the same value, so it can be read under either name.
+  """
+
+  __slots__ = ()
+
+  def __new__(cls,
+              classification_loss,
+              object_center_loss_weight,
+              heatmap_bias_init=-2.19,
+              min_box_overlap_iou=0.7,
+              max_box_predictions=100,
+              use_labeled_classes=False):
+    """Constructor with default values for ObjectCenterParams.
+
+    Args:
+      classification_loss: an object_detection.core.losses.Loss object to
+        compute the loss for the class predictions in CenterNet.
+      object_center_loss_weight: float, The weight for the object center loss.
+      heatmap_bias_init: float, the initial value of bias in the convolutional
+        kernel of the object center prediction head. If set to None, the bias is
+        initialized with zeros.
+      min_box_overlap_iou: float, the minimum IOU overlap that predicted boxes
+        need have with groundtruth boxes to not be penalized. This is used for
+        computing the class specific center heatmaps.
+      max_box_predictions: int, the maximum number of boxes to predict.
+      use_labeled_classes: boolean, whether to compute the loss only for the
+        labeled classes. Stored in the `use_only_known_classes` field.
+
+    Returns:
+      An initialized ObjectCenterParams namedtuple.
+    """
+    return super(ObjectCenterParams,
+                 cls).__new__(cls, classification_loss,
+                              object_center_loss_weight, heatmap_bias_init,
+                              min_box_overlap_iou, max_box_predictions,
+                              use_labeled_classes)
+
+  @property
+  def use_labeled_classes(self):
+    """Alias of `use_only_known_classes` matching the constructor keyword."""
+    return self.use_only_known_classes
+
+
+class MaskParams(
+    collections.namedtuple('MaskParams', [
+        'classification_loss',
+        'task_loss_weight',
+        'mask_height',
+        'mask_width',
+        'score_threshold',
+        'heatmap_bias_init',
+    ])):
+  """Immutable namedtuple holding the mask prediction parameters."""
+
+  __slots__ = ()
+
+  def __new__(cls,
+              classification_loss,
+              task_loss_weight=1.0,
+              mask_height=256,
+              mask_width=256,
+              score_threshold=0.5,
+              heatmap_bias_init=-2.19):
+    """Creates a MaskParams, filling in default values.
+
+    Args:
+      classification_loss: an object_detection.core.losses.Loss object used to
+        compute the loss for the semantic segmentation predictions in
+        CenterNet.
+      task_loss_weight: float, loss weight of the segmentation task.
+      mask_height: Height of the resized instance segmentation mask.
+      mask_width: Width of the resized instance segmentation mask.
+      score_threshold: Threshold at which predicted mask probabilities (after
+        passing through sigmoid) are converted into foreground pixels.
+      heatmap_bias_init: float, initial bias value in the convolutional kernel
+        of the semantic segmentation prediction head. If None, the bias is
+        initialized with zeros.
+
+    Returns:
+      An initialized MaskParams namedtuple.
+    """
+    return super(MaskParams, cls).__new__(
+        cls,
+        classification_loss=classification_loss,
+        task_loss_weight=task_loss_weight,
+        mask_height=mask_height,
+        mask_width=mask_width,
+        score_threshold=score_threshold,
+        heatmap_bias_init=heatmap_bias_init)
+
+
+# The following constants are used to generate the keys of the
+# (prediction, loss, target assigner,...) dictionaries used in CenterNetMetaArch
+# class.
+# Key of the box size/offset target assigner.
+DETECTION_TASK = 'detection_task'
+# Key of the object center prediction heads and of their target assigner.
+OBJECT_CENTER = 'object_center'
+# Keys of the box size and box offset prediction heads / weighted losses.
+BOX_SCALE = 'box/scale'
+BOX_OFFSET = 'box/offset'
+# Keypoint head names; these are combined with a task name through
+# get_keypoint_name() to form the per-task dictionary keys.
+KEYPOINT_REGRESSION = 'keypoint/regression'
+KEYPOINT_HEATMAP = 'keypoint/heatmap'
+KEYPOINT_OFFSET = 'keypoint/offset'
+# Key of the mask target assigner.
+SEGMENTATION_TASK = 'segmentation_task'
+# Key of the segmentation prediction heads.
+SEGMENTATION_HEATMAP = 'segmentation/heatmap'
+LOSS_KEY_PREFIX = 'Loss'
+
+
+def get_keypoint_name(task_name, head_name):
+  """Returns the dictionary key '<task_name>/<head_name>' for a keypoint head.
+
+  Args:
+    task_name: Name of the keypoint task.
+    head_name: Name of the head, e.g. KEYPOINT_HEATMAP.
+
+  Returns:
+    A string joining both names with a '/'.
+  """
+  name_parts = (task_name, head_name)
+  return '%s/%s' % name_parts
+
+
+def get_num_instances_from_weights(groundtruth_weights_list):
+  """Counts the instances/boxes with non-zero weight across a batch.
+
+  Args:
+    groundtruth_weights_list: A list of float tensors with shape
+      [max_num_instances]. A non-zero value marks an actual instance in the
+      image, while 0.0 marks padding added to reach max_num_instances. Each
+      list element corresponds to one image of the batch.
+
+  Returns:
+    A scalar integer tensor indicating how many instances/boxes are in the
+    images of the batch. Since the value is usually used to normalize the
+    loss, it is clamped to a minimum of 1 to avoid weird behavior.
+  """
+  nonzero_counts = []
+  for weights in groundtruth_weights_list:
+    nonzero_counts.append(tf.math.count_nonzero(weights))
+  # Never return less than 1 so the result is safe to divide by.
+  return tf.maximum(tf.reduce_sum(nonzero_counts), 1)
+
+
+class CenterNetMetaArch(model.DetectionModel):
+  """The CenterNet meta architecture [1].
+
+  [1]: https://arxiv.org/abs/1904.07850
+  """
+
+  def __init__(self,
+               is_training,
+               add_summaries,
+               num_classes,
+               feature_extractor,
+               image_resizer_fn,
+               object_center_params,
+               object_detection_params=None,
+               keypoint_params_dict=None,
+               mask_params=None):
+    """Initializes a CenterNet model.
+
+    Stores the configuration, then builds the prediction heads and the target
+    assigners for every task whose parameters were provided.
+
+    Args:
+      is_training: Set to True if this model is being built for training.
+      add_summaries: Whether to add tf summaries in the model. Not referenced
+        inside this constructor.
+      num_classes: int, The number of classes that the model should predict.
+      feature_extractor: A CenterNetFeatureExtractor to use to extract features
+        from an image. Its `num_feature_outputs` and `out_stride` attributes
+        are read here.
+      image_resizer_fn: a callable for image resizing.  This callable always
+        takes a rank-3 image tensor (corresponding to a single image) and
+        returns a rank-3 image tensor, possibly with new spatial dimensions and
+        a 1-D tensor of shape [3] indicating shape of true image within the
+        resized image tensor as the resized image tensor could be padded. See
+        builders/image_resizer_builder.py.
+      object_center_params: An ObjectCenterParams namedtuple. This object holds
+        the hyper-parameters for object center prediction. This is required by
+        either object detection or keypoint estimation tasks.
+      object_detection_params: An ObjectDetectionParams namedtuple. This object
+        holds the hyper-parameters necessary for object detection. Please see
+        the class definition for more details.
+      keypoint_params_dict: A dictionary that maps from task name to the
+        corresponding KeypointEstimationParams namedtuple. This object holds the
+        hyper-parameters necessary for multiple keypoint estimations. Please
+        see the class definition for more details.
+      mask_params: A MaskParams namedtuple. This object
+        holds the hyper-parameters for segmentation. Please see the class
+        definition for more details.
+
+    Raises:
+      AssertionError: if both object_detection_params and keypoint_params_dict
+        are falsy. Passing only mask_params does not satisfy this check.
+    """
+    # At least one of the detection / keypoint tasks must be configured.
+    assert object_detection_params or keypoint_params_dict
+    self._is_training = is_training
+    # The Objects as Points paper attaches loss functions to multiple
+    # (`num_feature_outputs`) feature maps in the backbone. E.g.
+    # for the hourglass backbone, `num_feature_outputs` is 2.
+    self._feature_extractor = feature_extractor
+    self._num_feature_outputs = feature_extractor.num_feature_outputs
+    self._stride = self._feature_extractor.out_stride
+    self._image_resizer_fn = image_resizer_fn
+    self._center_params = object_center_params
+    self._od_params = object_detection_params
+    self._kp_params_dict = keypoint_params_dict
+    self._mask_params = mask_params
+
+    # Construct the prediction head nets. This reads the task parameters
+    # stored on `self` above, so it must run after those assignments.
+    self._prediction_head_dict = self._construct_prediction_heads(
+        num_classes,
+        self._num_feature_outputs,
+        class_prediction_bias_init=self._center_params.heatmap_bias_init)
+    # Initialize the target assigners (also driven by the stored task
+    # parameters).
+    self._target_assigner_dict = self._initialize_target_assigners(
+        stride=self._stride,
+        min_box_overlap_iou=self._center_params.min_box_overlap_iou)
+
+    # Will be used in VOD single_frame_meta_arch for tensor reshape.
+    self._batched_prediction_tensor_names = []
+
+    super(CenterNetMetaArch, self).__init__(num_classes)
+
+  @property
+  def batched_prediction_tensor_names(self):
+    """Returns the stored names of the batched prediction tensors.
+
+    Raises:
+      RuntimeError: if the stored list of names is still empty.
+    """
+    tensor_names = self._batched_prediction_tensor_names
+    if tensor_names:
+      return tensor_names
+    raise RuntimeError('Must call predict() method to get batched prediction '
+                       'tensor names.')
+
+  def _construct_prediction_heads(self, num_classes, num_feature_outputs,
+                                  class_prediction_bias_init):
+    """Constructs the prediction heads based on the specific parameters.
+
+    One prediction net is created per feature output for each enabled head:
+    the object center head always, the box scale/offset heads when object
+    detection parameters are set, the keypoint heatmap/regression/offset heads
+    for every keypoint task, and the segmentation head when mask parameters
+    are set.
+
+    Args:
+      num_classes: An integer indicating how many classes in total to predict.
+      num_feature_outputs: An integer indicating how many feature outputs to use
+        for calculating the loss. The Objects as Points paper attaches loss
+        functions to multiple (`num_feature_outputs`) feature maps in the
+        backbone. E.g. for the hourglass backbone, `num_feature_outputs` is 2.
+      class_prediction_bias_init: float, the initial value of bias in the
+        convolutional kernel of the object center prediction head. If set to
+        None, the bias is initialized with zeros. The keypoint heatmap and
+        segmentation heads take their bias from their own parameter
+        namedtuples (`heatmap_bias_init`).
+
+    Returns:
+      A dictionary of keras modules generated by calling make_prediction_net
+      function. Each value is a list with `num_feature_outputs` entries.
+    """
+
+    def _make_heads(num_channels, **net_kwargs):
+      """Builds `num_feature_outputs` independent nets for a single head."""
+      return [
+          make_prediction_net(num_channels, **net_kwargs)
+          for _ in range(num_feature_outputs)
+      ]
+
+    # NOTE: heads are created in a fixed order (center, box, keypoints,
+    # segmentation); keep it stable when editing this function.
+    prediction_heads = {}
+    prediction_heads[OBJECT_CENTER] = _make_heads(
+        num_classes, bias_fill=class_prediction_bias_init)
+    if self._od_params is not None:
+      prediction_heads[BOX_SCALE] = _make_heads(NUM_SIZE_CHANNELS)
+      prediction_heads[BOX_OFFSET] = _make_heads(NUM_OFFSET_CHANNELS)
+    if self._kp_params_dict is not None:
+      for task_name, kp_params in self._kp_params_dict.items():
+        num_keypoints = len(kp_params.keypoint_indices)
+        prediction_heads[get_keypoint_name(
+            task_name, KEYPOINT_HEATMAP)] = _make_heads(
+                num_keypoints, bias_fill=kp_params.heatmap_bias_init)
+        prediction_heads[get_keypoint_name(
+            task_name, KEYPOINT_REGRESSION)] = _make_heads(
+                NUM_OFFSET_CHANNELS * num_keypoints)
+        # A per-keypoint offset head predicts one offset per keypoint channel;
+        # otherwise a single offset shared by all keypoints is predicted.
+        if kp_params.per_keypoint_offset:
+          num_offset_channels = NUM_OFFSET_CHANNELS * num_keypoints
+        else:
+          num_offset_channels = NUM_OFFSET_CHANNELS
+        prediction_heads[get_keypoint_name(
+            task_name, KEYPOINT_OFFSET)] = _make_heads(num_offset_channels)
+    if self._mask_params is not None:
+      # Use the bias configured for the segmentation head itself rather than
+      # the object center bias, so MaskParams.heatmap_bias_init takes effect.
+      prediction_heads[SEGMENTATION_HEATMAP] = _make_heads(
+          num_classes, bias_fill=self._mask_params.heatmap_bias_init)
+    return prediction_heads
+
+  def _initialize_target_assigners(self, stride, min_box_overlap_iou):
+    """Builds the target assigner of every enabled task.
+
+    Args:
+      stride: An integer indicating the stride of the image.
+      min_box_overlap_iou: float, the minimum IOU overlap that predicted boxes
+        need to have with groundtruth boxes to not be penalized. It is used
+        when computing the class specific center heatmaps.
+
+    Returns:
+      A dictionary mapping a task key (OBJECT_CENTER, DETECTION_TASK, each
+      keypoint task name, SEGMENTATION_TASK) to its initialized target
+      assigner.
+    """
+    # The object center assigner is always needed.
+    assigners = {
+        OBJECT_CENTER:
+            cn_assigner.CenterNetCenterHeatmapTargetAssigner(
+                stride, min_box_overlap_iou),
+    }
+
+    if self._od_params is not None:
+      box_assigner = cn_assigner.CenterNetBoxTargetAssigner(stride)
+      assigners[DETECTION_TASK] = box_assigner
+
+    if self._kp_params_dict is not None:
+      # One assigner per keypoint task, keyed by the task name.
+      for task_name, kp_params in self._kp_params_dict.items():
+        keypoint_assigner = cn_assigner.CenterNetKeypointTargetAssigner(
+            stride=stride,
+            class_id=kp_params.class_id,
+            keypoint_indices=kp_params.keypoint_indices,
+            keypoint_std_dev=kp_params.keypoint_std_dev,
+            peak_radius=kp_params.offset_peak_radius,
+            per_keypoint_offset=kp_params.per_keypoint_offset)
+        assigners[task_name] = keypoint_assigner
+
+    if self._mask_params is not None:
+      mask_assigner = cn_assigner.CenterNetMaskTargetAssigner(stride)
+      assigners[SEGMENTATION_TASK] = mask_assigner
+
+    return assigners
+
+  def _compute_object_center_loss(self, input_height, input_width,
+                                  object_center_predictions, per_pixel_weights):
+    """Returns the object center heatmap loss averaged per instance.
+
+    The groundtruth boxes, classes and weights registered on the model are
+    converted into center heatmap targets. Every prediction head is scored
+    against these targets and the summed loss is divided by the number of
+    heads times the number of groundtruth instances.
+
+    Args:
+      input_height: An integer scalar tensor representing input image height.
+      input_width: An integer scalar tensor representing input image width.
+      object_center_predictions: A list of float tensors of shape [batch_size,
+        out_height, out_width, num_classes] representing the object center
+        feature maps.
+      per_pixel_weights: A float tensor of shape [batch_size,
+        out_height * out_width, 1] with 1s in locations where the spatial
+        coordinates fall within the height and width in true_image_shapes.
+
+    Returns:
+      A float scalar tensor representing the object center loss per instance.
+    """
+    boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
+    classes_list = self.groundtruth_lists(fields.BoxListFields.classes)
+    weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
+
+    if self._center_params.use_only_known_classes:
+      # Scale the per-pixel weights by the per-image labeled-class vector,
+      # reshaped to [batch, 1, last_dim] so it broadcasts over the flattened
+      # spatial dimension.
+      labeled_classes = tf.stack(
+          self.groundtruth_lists(
+              fields.InputDataFields.groundtruth_labeled_classes),
+          axis=0)
+      labeled_shape = tf.shape(labeled_classes)
+      labeled_classes = tf.reshape(
+          labeled_classes, [labeled_shape[0], 1, labeled_shape[-1]])
+      per_pixel_weights = per_pixel_weights * labeled_classes
+
+    # Turn the groundtruth into center heatmap targets.
+    center_assigner = self._target_assigner_dict[OBJECT_CENTER]
+    center_targets = center_assigner.assign_center_targets_from_boxes(
+        height=input_height,
+        width=input_width,
+        gt_boxes_list=boxes_list,
+        gt_classes_list=classes_list,
+        gt_weights_list=weights_list)
+    flat_targets = _flatten_spatial_dimensions(center_targets)
+    num_boxes = _to_float32(get_num_instances_from_weights(weights_list))
+
+    center_loss_fn = self._center_params.classification_loss
+    total_loss = 0.0
+    # Accumulate the loss of each feature output head.
+    for head_prediction in object_center_predictions:
+      flat_prediction = _flatten_spatial_dimensions(head_prediction)
+      total_loss += center_loss_fn(
+          flat_prediction, flat_targets, weights=per_pixel_weights)
+
+    summed_loss = tf.reduce_sum(total_loss)
+    num_heads = float(len(object_center_predictions))
+    return summed_loss / (num_heads * num_boxes)
+
+  def _compute_object_detection_losses(self, input_height, input_width,
+                                       prediction_dict, per_pixel_weights):
+    """Returns the weighted box scale and offset losses.
+
+    Delegates the loss computation to _compute_box_scale_and_offset_loss and
+    multiplies each result by the matching weight of the object detection
+    parameters.
+
+    Args:
+      input_height: An integer scalar tensor representing input image height.
+      input_width: An integer scalar tensor representing input image width.
+      prediction_dict: A dictionary holding predicted tensors output by
+        "predict" function. See "predict" function for more detailed
+        description. The BOX_SCALE and BOX_OFFSET entries are read.
+      per_pixel_weights: A float tensor of shape [batch_size,
+        out_height * out_width, 1] with 1s in locations where the spatial
+        coordinates fall within the height and width in true_image_shapes.
+        Not used by this function.
+
+    Returns:
+      A dictionary of scalar float tensors representing the weighted losses for
+      object detection task:
+         BOX_SCALE: the weighted scale (height/width) loss.
+         BOX_OFFSET: the weighted object offset loss.
+    """
+    scale_predictions = prediction_dict[BOX_SCALE]
+    offset_predictions = prediction_dict[BOX_OFFSET]
+    scale_loss, offset_loss = self._compute_box_scale_and_offset_loss(
+        input_height=input_height,
+        input_width=input_width,
+        scale_predictions=scale_predictions,
+        offset_predictions=offset_predictions)
+
+    od_params = self._od_params
+    return {
+        BOX_SCALE: od_params.scale_loss_weight * scale_loss,
+        BOX_OFFSET: od_params.offset_loss_weight * offset_loss,
+    }
+
+  def _compute_box_scale_and_offset_loss(self, input_height, input_width,
+                                         scale_predictions, offset_predictions):
+    """Computes the scale and offset losses of the object detection task.
+
+    Args:
+      input_height: An integer scalar tensor representing input image height.
+      input_width: An integer scalar tensor representing input image width.
+      scale_predictions: A list of float tensors of shape [batch_size,
+        out_height, out_width, 2] representing the prediction heads of the model
+        for object scale (i.e height and width).
+      offset_predictions: A list of float tensors of shape [batch_size,
+        out_height, out_width, 2] representing the prediction heads of the model
+        for object offset.
+
+    Returns:
+      A tuple of two losses:
+        scale_loss: A float scalar tensor representing the object height/width
+          loss normalized by total number of boxes.
+        offset_loss: A float scalar tensor representing the object offset loss
+          normalized by total number of boxes
+    """
+    # TODO(vighneshb) Explore a size invariant version of scale loss.
+    boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
+    weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
+    num_boxes = _to_float32(get_num_instances_from_weights(weights_list))
+    num_heads = float(len(scale_predictions))
+
+    box_assigner = self._target_assigner_dict[DETECTION_TASK]
+    targets = box_assigner.assign_size_and_offset_targets(
+        height=input_height,
+        width=input_width,
+        gt_boxes_list=boxes_list,
+        gt_weights_list=weights_list)
+    indices, size_targets, offset_targets, target_weights = targets
+    target_weights = tf.expand_dims(target_weights, -1)
+
+    loss_fn = self._od_params.localization_loss
+    scale_total = 0
+    offset_total = 0
+    # Scale and offset heads are paired per feature output.
+    for scale_head, offset_head in zip(scale_predictions, offset_predictions):
+      # Scale loss at the target locations.
+      scale_at_targets = cn_assigner.get_batch_predictions_from_indices(
+          scale_head, indices)
+      scale_total += loss_fn(
+          scale_at_targets, size_targets, weights=target_weights)
+      # Offset loss at the same locations.
+      offset_at_targets = cn_assigner.get_batch_predictions_from_indices(
+          offset_head, indices)
+      offset_total += loss_fn(
+          offset_at_targets, offset_targets, weights=target_weights)
+
+    # Normalize both losses by (number of heads * number of boxes).
+    scale_loss = tf.reduce_sum(scale_total) / (num_heads * num_boxes)
+    offset_loss = tf.reduce_sum(offset_total) / (num_heads * num_boxes)
+    return scale_loss, offset_loss
+
+  def _compute_keypoint_estimation_losses(self, task_name, input_height,
+                                          input_width, prediction_dict,
+                                          per_pixel_weights):
+    """Computes the weighted keypoint losses of one keypoint task.
+
+    Args:
+      task_name: A string, the name of the keypoint task. It selects the
+        parameters in the keypoint params dictionary and the prediction keys.
+      input_height: An integer scalar tensor representing input image height.
+      input_width: An integer scalar tensor representing input image width.
+      prediction_dict: A dictionary holding the predicted tensors. The
+        heatmap, offset and regression entries of this task are read.
+      per_pixel_weights: Per-pixel weights forwarded to the heatmap loss.
+
+    Returns:
+      A dictionary mapping the heatmap, offset and regression keys of the task
+      to the corresponding loss multiplied by its weight from the task
+      parameters.
+    """
+    params = self._kp_params_dict[task_name]
+    heatmap_key = get_keypoint_name(task_name, KEYPOINT_HEATMAP)
+    offset_key = get_keypoint_name(task_name, KEYPOINT_OFFSET)
+    regression_key = get_keypoint_name(task_name, KEYPOINT_REGRESSION)
+
+    # Arguments shared by the three loss computations.
+    common_kwargs = dict(
+        input_height=input_height,
+        input_width=input_width,
+        task_name=task_name)
+
+    heatmap_loss = self._compute_kp_heatmap_loss(
+        heatmap_predictions=prediction_dict[heatmap_key],
+        classification_loss_fn=params.classification_loss,
+        per_pixel_weights=per_pixel_weights,
+        **common_kwargs)
+    offset_loss = self._compute_kp_offset_loss(
+        offset_predictions=prediction_dict[offset_key],
+        localization_loss_fn=params.localization_loss,
+        **common_kwargs)
+    regression_loss = self._compute_kp_regression_loss(
+        regression_predictions=prediction_dict[regression_key],
+        localization_loss_fn=params.localization_loss,
+        **common_kwargs)
+
+    return {
+        heatmap_key: params.keypoint_heatmap_loss_weight * heatmap_loss,
+        offset_key: params.keypoint_offset_loss_weight * offset_loss,
+        regression_key:
+            params.keypoint_regression_loss_weight * regression_loss,
+    }
+
+  def _compute_kp_heatmap_loss(self, input_height, input_width, task_name,
+                               heatmap_predictions, classification_loss_fn,
+                               per_pixel_weights):
+    """Returns the keypoint heatmap loss normalized by number of instances.
+
+    Args:
+      input_height: An integer scalar tensor representing input image height.
+      input_width: An integer scalar tensor representing input image width.
+      task_name: A string representing the name of the keypoint task.
+      heatmap_predictions: A list of float tensors of shape [batch_size,
+        out_height, out_width, num_keypoints] representing the prediction heads
+        of the model for keypoint heatmap.
+      classification_loss_fn: An object_detection.core.losses.Loss object to
+        compute the loss for the class predictions in CenterNet.
+      per_pixel_weights: A float tensor of shape [batch_size,
+        out_height * out_width, 1] with 1s in locations where the spatial
+        coordinates fall within the height and width in true_image_shapes.
+
+    Returns:
+      loss: A float scalar tensor representing the object keypoint heatmap loss
+        normalized by number of instances.
+    """
+    keypoints_list = self.groundtruth_lists(fields.BoxListFields.keypoints)
+    classes_list = self.groundtruth_lists(fields.BoxListFields.classes)
+    weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
+    boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
+
+    keypoint_assigner = self._target_assigner_dict[task_name]
+    targets = keypoint_assigner.assign_keypoint_heatmap_targets(
+        height=input_height,
+        width=input_width,
+        gt_keypoints_list=keypoints_list,
+        gt_weights_list=weights_list,
+        gt_classes_list=classes_list,
+        gt_boxes_list=boxes_list)
+    heatmap_targets, instances_per_kp_type, valid_mask = targets
+
+    flat_valid_mask = _flatten_spatial_dimensions(
+        tf.expand_dims(valid_mask, axis=-1))
+    flat_targets = _flatten_spatial_dimensions(heatmap_targets)
+    # The total keypoint count (summed over keypoint types) normalizes the
+    # loss. It is clamped to at least 1 to avoid a weird loss value when the
+    # image batch contains no keypoint.
+    total_keypoints = tf.cast(
+        tf.reduce_sum(instances_per_kp_type), dtype=tf.float32)
+    num_instances = tf.maximum(total_keypoints, 1.0)
+
+    total_loss = 0.0
+    # Accumulate the loss of each feature output head.
+    for head_prediction in heatmap_predictions:
+      flat_prediction = _flatten_spatial_dimensions(head_prediction)
+      # Run the loss with unit weights and apply the real weights afterwards
+      # to have full control over them.
+      raw_loss = classification_loss_fn(
+          flat_prediction,
+          flat_targets,
+          weights=tf.ones_like(per_pixel_weights))
+      total_loss += raw_loss * per_pixel_weights * flat_valid_mask
+
+    summed_loss = tf.reduce_sum(total_loss)
+    num_heads = float(len(heatmap_predictions))
+    return summed_loss / (num_heads * num_instances)
+
+  def _compute_kp_offset_loss(self, input_height, input_width, task_name,
+                              offset_predictions, localization_loss_fn):
+    """Returns the keypoint offset loss normalized by total keypoints.
+
+    Args:
+      input_height: An integer scalar tensor representing input image height.
+      input_width: An integer scalar tensor representing input image width.
+      task_name: A string representing the name of the keypoint task.
+      offset_predictions: A list of float tensors of shape [batch_size,
+        out_height, out_width, 2] representing the prediction heads of the model
+        for keypoint offset. Predictions with more than 2 channels are
+        reshaped to [batch_size, out_height, out_width, -1, 2] first.
+      localization_loss_fn: An object_detection.core.losses.Loss object to
+        compute the loss for the keypoint offset predictions in CenterNet.
+
+    Returns:
+      loss: A float scalar tensor representing the keypoint offset loss
+        normalized by number of total keypoints.
+    """
+    keypoints_list = self.groundtruth_lists(fields.BoxListFields.keypoints)
+    classes_list = self.groundtruth_lists(fields.BoxListFields.classes)
+    weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
+
+    keypoint_assigner = self._target_assigner_dict[task_name]
+    targets = keypoint_assigner.assign_keypoints_offset_targets(
+        height=input_height,
+        width=input_width,
+        gt_keypoints_list=keypoints_list,
+        gt_weights_list=weights_list,
+        gt_classes_list=classes_list)
+    indices, offset_targets, target_weights = targets
+
+    total_loss = 0.0
+    for head_prediction in offset_predictions:
+      batch_size, out_height, out_width, channels = _get_shape(
+          head_prediction, 4)
+      if channels > 2:
+        # More than one (2-channel) offset per location: split the channel
+        # dimension into [num_offsets, 2].
+        head_prediction = tf.reshape(
+            head_prediction,
+            shape=[batch_size, out_height, out_width, -1, 2])
+      offsets_at_targets = cn_assigner.get_batch_predictions_from_indices(
+          head_prediction, indices)
+      # The dimensions passed are not as per the doc string but the loss
+      # still computes the correct value.
+      unit_weights = tf.expand_dims(tf.ones_like(target_weights), -1)
+      raw_loss = localization_loss_fn(
+          offsets_at_targets, offset_targets, weights=unit_weights)
+      # Apply the weights after the loss function to have full control over
+      # them.
+      total_loss += target_weights * tf.reduce_sum(raw_loss, axis=1)
+
+    summed_loss = tf.reduce_sum(total_loss)
+    # Clamp the weight sum to at least 1.0 before dividing.
+    weight_sum = tf.maximum(tf.reduce_sum(target_weights), 1.0)
+    return summed_loss / (float(len(offset_predictions)) * weight_sum)
+
+  def _compute_kp_regression_loss(self, input_height, input_width, task_name,
+                                  regression_predictions, localization_loss_fn):
+    """Computes the keypoint regression loss of the keypoint estimation task.
+
+    Args:
+      input_height: An integer scalar tensor representing input image height.
+      input_width: An integer scalar tensor representing input image width.
+      task_name: A string representing the name of the keypoint task.
+      regression_predictions: A list of float tensors of shape [batch_size,
+        out_height, out_width, 2 * num_keypoints] representing the prediction
+        heads of the model for keypoint regression offset.
+      localization_loss_fn: An object_detection.core.losses.Loss object to
+        compute the loss for the keypoint regression offset predictions in
+        CenterNet.
+
+    Returns:
+      loss: A float scalar tensor representing the keypoint regression offset
+        loss normalized by number of total keypoints.
+    """
+    gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
+    gt_keypoints_list = self.groundtruth_lists(fields.BoxListFields.keypoints)
+    gt_classes_list = self.groundtruth_lists(fields.BoxListFields.classes)
+    gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
+    # keypoint regression offset loss.
+    assigner = self._target_assigner_dict[task_name]
+    (batch_indices, batch_regression_offsets,
+     batch_weights) = assigner.assign_joint_regression_targets(
+         height=input_height,
+         width=input_width,
+         gt_keypoints_list=gt_keypoints_list,
+         gt_classes_list=gt_classes_list,
+         gt_weights_list=gt_weights_list,
+         gt_boxes_list=gt_boxes_list)
+
+    loss = 0.0
+    for prediction in regression_predictions:
+      batch_size, out_height, out_width, _ = _get_shape(prediction, 4)
+      reshaped_prediction = tf.reshape(
+          prediction, shape=[batch_size, out_height, out_width, -1, 2])
+      reg_prediction = cn_assigner.get_batch_predictions_from_indices(
+          reshaped_prediction, batch_indices)
+      unweighted_loss = localization_loss_fn(
+          reg_prediction,
+          batch_regression_offsets,
+          weights=tf.expand_dims(tf.ones_like(batch_weights), -1))
+      # Apply the weights after the loss function to have full control over it.
+      loss += batch_weights * tf.reduce_sum(unweighted_loss, axis=1)
+
+    loss = tf.reduce_sum(loss) / (
+        float(len(regression_predictions)) *
+        tf.maximum(tf.reduce_sum(batch_weights), 1.0))
+    return loss
+
+  def _compute_segmentation_losses(self, prediction_dict, per_pixel_weights):
+    """Computes all the losses associated with segmentation.
+
+    Args:
+      prediction_dict: The dictionary returned from the predict() method.
+      per_pixel_weights: A float tensor of shape [batch_size,
+        out_height * out_width, 1] with 1s in locations where the spatial
+        coordinates fall within the height and width in true_image_shapes.
+
+    Returns:
+      A dictionary with segmentation losses.
+    """
+    segmentation_heatmap = prediction_dict[SEGMENTATION_HEATMAP]
+    mask_loss = self._compute_mask_loss(
+        segmentation_heatmap, per_pixel_weights)
+    losses = {
+        SEGMENTATION_HEATMAP: mask_loss
+    }
+    return losses
+
+  def _compute_mask_loss(self, segmentation_predictions,
+                         per_pixel_weights):
+    """Computes the mask loss.
+
+    Args:
+      segmentation_predictions: A list of float32 tensors of shape [batch_size,
+        out_height, out_width, num_classes].
+      per_pixel_weights: A float tensor of shape [batch_size,
+        out_height * out_width, 1] with 1s in locations where the spatial
+        coordinates fall within the height and width in true_image_shapes.
+
+    Returns:
+      A float scalar tensor representing the mask loss.
+    """
+    gt_masks_list = self.groundtruth_lists(fields.BoxListFields.masks)
+    gt_classes_list = self.groundtruth_lists(fields.BoxListFields.classes)
+
+    # Convert the groundtruth to targets.
+    assigner = self._target_assigner_dict[SEGMENTATION_TASK]
+    heatmap_targets = assigner.assign_segmentation_targets(
+        gt_masks_list=gt_masks_list,
+        gt_classes_list=gt_classes_list)
+
+    flattened_heatmap_targets = _flatten_spatial_dimensions(heatmap_targets)
+
+    loss = 0.0
+    mask_loss_fn = self._mask_params.classification_loss
+    total_pixels_in_loss = tf.reduce_sum(per_pixel_weights)
+
+    # Loop through each feature output head.
+    for pred in segmentation_predictions:
+      pred = _flatten_spatial_dimensions(pred)
+      loss += mask_loss_fn(
+          pred, flattened_heatmap_targets, weights=per_pixel_weights)
+    # TODO(ronnyvotel): Consider other ways to normalize loss.
+    total_loss = tf.reduce_sum(loss) / (
+        float(len(segmentation_predictions)) * total_pixels_in_loss)
+    return total_loss
+
+  def preprocess(self, inputs):
+    outputs = shape_utils.resize_images_and_return_shapes(
+        inputs, self._image_resizer_fn)
+    resized_inputs, true_image_shapes = outputs
+
+    return (self._feature_extractor.preprocess(resized_inputs),
+            true_image_shapes)
+
+  def predict(self, preprocessed_inputs, _):
+    """Predicts CenterNet prediction tensors given an input batch.
+
+    Feature extractors are free to produce predictions from multiple feature
+    maps and therefore we return a dictionary mapping strings to lists.
+    E.g. the hourglass backbone produces two feature maps.
+
+    Args:
+      preprocessed_inputs: a [batch, height, width, channels] float32 tensor
+        representing a batch of images.
+
+    Returns:
+      prediction_dict: a dictionary holding predicted tensors with
+        'preprocessed_inputs' - The input image after being resized and
+          preprocessed by the feature extractor.
+        'object_center' - A list of size num_feature_outputs containing
+          float tensors of size [batch_size, output_height, output_width,
+          num_classes] representing the predicted object center heatmap logits.
+        'box/scale' - [optional] A list of size num_feature_outputs holding
+          float tensors of size [batch_size, output_height, output_width, 2]
+          representing the predicted box height and width at each output
+          location. This field exists only when object detection task is
+          specified.
+        'box/offset' - [optional] A list of size num_feature_outputs holding
+          float tensors of size [batch_size, output_height, output_width, 2]
+          representing the predicted y and x offsets at each output location.
+        '$TASK_NAME/keypoint_heatmap' - [optional]  A list of size
+          num_feature_outputs holding float tensors of size [batch_size,
+          output_height, output_width, num_keypoints] representing the predicted
+          keypoint heatmap logits.
+        '$TASK_NAME/keypoint_offset' - [optional] A list of size
+          num_feature_outputs holding float tensors of size [batch_size,
+          output_height, output_width, 2] representing the predicted keypoint
+          offsets at each output location.
+        '$TASK_NAME/keypoint_regression' - [optional] A list of size
+          num_feature_outputs holding float tensors of size [batch_size,
+          output_height, output_width, 2 * num_keypoints] representing the
+          predicted keypoint regression at each output location.
+        'segmentation/heatmap' - [optional] A list of size num_feature_outputs
+          holding float tensors of size [batch_size, output_height,
+          output_width, num_classes] representing the mask logits.
+        Note the $TASK_NAME is provided by the KeypointEstimation namedtuple
+        used to differentiate between different keypoint tasks.
+    """
+    features_list = self._feature_extractor(preprocessed_inputs)
+
+    predictions = {}
+    for head_name, heads in self._prediction_head_dict.items():
+      predictions[head_name] = [
+          head(feature) for (feature, head) in zip(features_list, heads)
+      ]
+    predictions['preprocessed_inputs'] = preprocessed_inputs
+
+    self._batched_prediction_tensor_names = predictions.keys()
+    return predictions
+
+  def loss(self, prediction_dict, true_image_shapes, scope=None):
+    """Computes scalar loss tensors with respect to provided groundtruth.
+
+    This function implements the various CenterNet losses.
+
+    Args:
+      prediction_dict: a dictionary holding predicted tensors returned by
+        "predict" function.
+      true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
+        the form [height, width, channels] indicating the shapes of true images
+        in the resized images, as resized images can be padded with zeros.
+      scope: Optional scope name.
+
+    Returns:
+      A dictionary mapping the keys ['Loss/object_center', 'Loss/box/scale',
+        'Loss/box/offset', 'Loss/$TASK_NAME/keypoint/heatmap',
+        'Loss/$TASK_NAME/keypoint/offset',
+        'Loss/$TASK_NAME/keypoint/regression', 'Loss/segmentation/heatmap'] to
+        scalar tensors corresponding to the losses for different tasks. Note the
+        $TASK_NAME is provided by the KeypointEstimation namedtuple used to
+        differentiate between different keypoint tasks.
+    """
+
+    _, input_height, input_width, _ = _get_shape(
+        prediction_dict['preprocessed_inputs'], 4)
+
+    output_height, output_width = (input_height // self._stride,
+                                   input_width // self._stride)
+
+    # TODO(vighneshb) Explore whether using floor here is safe.
+    output_true_image_shapes = tf.ceil(
+        tf.to_float(true_image_shapes) / self._stride)
+    valid_anchor_weights = get_valid_anchor_weights_in_flattened_image(
+        output_true_image_shapes, output_height, output_width)
+    valid_anchor_weights = tf.expand_dims(valid_anchor_weights, 2)
+
+    object_center_loss = self._compute_object_center_loss(
+        object_center_predictions=prediction_dict[OBJECT_CENTER],
+        input_height=input_height,
+        input_width=input_width,
+        per_pixel_weights=valid_anchor_weights)
+    losses = {
+        OBJECT_CENTER:
+            self._center_params.object_center_loss_weight * object_center_loss
+    }
+    if self._od_params is not None:
+      od_losses = self._compute_object_detection_losses(
+          input_height=input_height,
+          input_width=input_width,
+          prediction_dict=prediction_dict,
+          per_pixel_weights=valid_anchor_weights)
+      for key in od_losses:
+        od_losses[key] = od_losses[key] * self._od_params.task_loss_weight
+      losses.update(od_losses)
+
+    if self._kp_params_dict is not None:
+      for task_name, params in self._kp_params_dict.items():
+        kp_losses = self._compute_keypoint_estimation_losses(
+            task_name=task_name,
+            input_height=input_height,
+            input_width=input_width,
+            prediction_dict=prediction_dict,
+            per_pixel_weights=valid_anchor_weights)
+        for key in kp_losses:
+          kp_losses[key] = kp_losses[key] * params.task_loss_weight
+        losses.update(kp_losses)
+
+    if self._mask_params is not None:
+      seg_losses = self._compute_segmentation_losses(
+          prediction_dict=prediction_dict,
+          per_pixel_weights=valid_anchor_weights)
+      for key in seg_losses:
+        seg_losses[key] = seg_losses[key] * self._mask_params.task_loss_weight
+      losses.update(seg_losses)
+
+    # Prepend the LOSS_KEY_PREFIX to the keys in the dictionary such that the
+    # losses will be grouped together in Tensorboard.
+    return dict([('%s/%s' % (LOSS_KEY_PREFIX, key), val)
+                 for key, val in losses.items()])
+
+  def postprocess(self, prediction_dict, true_image_shapes, **params):
+    """Produces boxes given a prediction dict returned by predict().
+
+    Although predict returns a list of tensors, only the last tensor in
+    each list is used for making box predictions.
+
+    Args:
+      prediction_dict: a dictionary holding predicted tensors from "predict"
+        function.
+      true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
+        the form [height, width, channels] indicating the shapes of true images
+        in the resized images, as resized images can be padded with zeros.
+      **params: Currently ignored.
+
+    Returns:
+      detections: a dictionary containing the following fields
+        detection_boxes - A tensor of shape [batch, max_detections, 4]
+          holding the predicted boxes.
+        detection_scores: A tensor of shape [batch, max_detections] holding
+          the predicted score for each box.
+        detection_classes: An integer tensor of shape [batch, max_detections]
+          containing the detected class for each box.
+        num_detections: An integer tensor of shape [batch] containing the
+          number of detected boxes for each sample in the batch.
+        detection_keypoints: (Optional) A float tensor of shape [batch,
+          max_detections, num_keypoints, 2] with normalized keypoints. Any
+          invalid keypoints have their coordinates and scores set to 0.0.
+        detection_keypoint_scores: (Optional) A float tensor of shape [batch,
+          max_detection, num_keypoints] with scores for each keypoint.
+        detection_masks: (Optional) An int tensor of shape [batch,
+          max_detections, mask_height, mask_width] with binarized masks for each
+          detection.
+    """
+    object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
+    # Get x, y and channel indices corresponding to the top indices in the class
+    # center predictions.
+    detection_scores, y_indices, x_indices, channel_indices = (
+        top_k_feature_map_locations(
+            object_center_prob, max_pool_kernel_size=3,
+            k=self._center_params.max_box_predictions))
+
+    boxes_strided, classes, scores, num_detections = (
+        prediction_tensors_to_boxes(
+            detection_scores, y_indices, x_indices, channel_indices,
+            prediction_dict[BOX_SCALE][-1], prediction_dict[BOX_OFFSET][-1]))
+
+    boxes = convert_strided_predictions_to_normalized_boxes(
+        boxes_strided, self._stride, true_image_shapes)
+
+    postprocess_dict = {
+        fields.DetectionResultFields.detection_boxes: boxes,
+        fields.DetectionResultFields.detection_scores: scores,
+        fields.DetectionResultFields.detection_classes: classes,
+        fields.DetectionResultFields.num_detections: num_detections,
+    }
+
+    if self._kp_params_dict:
+      keypoints, keypoint_scores = self._postprocess_keypoints(
+          prediction_dict, classes, y_indices, x_indices,
+          boxes_strided, num_detections)
+      keypoints, keypoint_scores = (
+          convert_strided_predictions_to_normalized_keypoints(
+              keypoints, keypoint_scores, self._stride, true_image_shapes,
+              clip_out_of_frame_keypoints=True))
+      postprocess_dict.update({
+          fields.DetectionResultFields.detection_keypoints: keypoints,
+          fields.DetectionResultFields.detection_keypoint_scores:
+              keypoint_scores
+      })
+
+    if self._mask_params:
+      masks = tf.nn.sigmoid(prediction_dict[SEGMENTATION_HEATMAP][-1])
+      instance_masks = convert_strided_predictions_to_instance_masks(
+          boxes, classes, masks, self._stride, self._mask_params.mask_height,
+          self._mask_params.mask_width, true_image_shapes,
+          self._mask_params.score_threshold)
+      postprocess_dict.update({
+          fields.DetectionResultFields.detection_masks:
+              instance_masks
+      })
+    return postprocess_dict
+
+  def _postprocess_keypoints(self, prediction_dict, classes, y_indices,
+                             x_indices, boxes, num_detections):
+    """Performs postprocessing on keypoint predictions.
+
+    Args:
+      prediction_dict: a dictionary holding predicted tensors, returned from the
+        predict() method. This dictionary should contain keypoint prediction
+        feature maps for each keypoint task.
+      classes: A [batch_size, max_detections] int tensor with class indices for
+        all detected objects.
+      y_indices: A [batch_size, max_detections] int tensor with y indices for
+        all object centers.
+      x_indices: A [batch_size, max_detections] int tensor with x indices for
+        all object centers.
+      boxes: A [batch_size, max_detections, 4] float32 tensor with bounding
+        boxes in (un-normalized) output space.
+      num_detections: A [batch_size] int tensor with the number of valid
+        detections for each image.
+
+    Returns:
+      A tuple of
+      keypoints: a [batch_size, max_detection, num_total_keypoints, 2] float32
+        tensor with keypoints in the output (strided) coordinate frame.
+      keypoint_scores: a [batch_size, max_detections, num_total_keypoints]
+        float32 tensor with keypoint scores.
+    """
+    total_num_keypoints = sum(len(kp_dict.keypoint_indices) for kp_dict
+                              in self._kp_params_dict.values())
+    batch_size, max_detections, _ = _get_shape(boxes, 3)
+    kpt_coords_for_example_list = []
+    kpt_scores_for_example_list = []
+    for ex_ind in range(batch_size):
+      kpt_coords_for_class_list = []
+      kpt_scores_for_class_list = []
+      instance_inds_for_class_list = []
+      for task_name, kp_params in self._kp_params_dict.items():
+        keypoint_heatmap = prediction_dict[
+            get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]
+        keypoint_offsets = prediction_dict[
+            get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
+        keypoint_regression = prediction_dict[
+            get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
+        instance_inds = self._get_instance_indices(
+            classes, num_detections, ex_ind, kp_params.class_id)
+
+        def true_fn(
+            keypoint_heatmap, keypoint_offsets, keypoint_regression,
+            classes, y_indices, x_indices, boxes, instance_inds,
+            ex_ind, kp_params):
+          """Logics to execute when instance_inds is not an empty set."""
+          # Postprocess keypoints and scores for class and single image. Shapes
+          # are [1, num_instances_i, num_keypoints_i, 2] and
+          # [1, num_instances_i, num_keypoints_i], respectively. Note that
+          # num_instances_i and num_keypoints_i refers to the number of
+          # instances and keypoints for class i, respectively.
+          kpt_coords_for_class, kpt_scores_for_class = (
+              self._postprocess_keypoints_for_class_and_image(
+                  keypoint_heatmap, keypoint_offsets, keypoint_regression,
+                  classes, y_indices, x_indices, boxes, instance_inds,
+                  ex_ind, kp_params))
+          # Expand keypoint dimension (with padding) so that coordinates and
+          # scores have shape [1, num_instances_i, num_total_keypoints, 2] and
+          # [1, num_instances_i, num_total_keypoints], respectively.
+          kpts_coords_for_class_padded, kpt_scores_for_class_padded = (
+              _pad_to_full_keypoint_dim(
+                  kpt_coords_for_class, kpt_scores_for_class,
+                  kp_params.keypoint_indices, total_num_keypoints))
+          return kpts_coords_for_class_padded, kpt_scores_for_class_padded
+
+        def false_fn():
+          """Logics to execute when the instance_inds is an empty set."""
+          return (tf.zeros([1, 0, total_num_keypoints, 2], dtype=tf.float32),
+                  tf.zeros([1, 0, total_num_keypoints], dtype=tf.float32))
+
+        true_fn = functools.partial(
+            true_fn, keypoint_heatmap, keypoint_offsets, keypoint_regression,
+            classes, y_indices, x_indices, boxes, instance_inds, ex_ind,
+            kp_params)
+        results = tf.cond(tf.size(instance_inds) > 0, true_fn, false_fn)
+
+        kpt_coords_for_class_list.append(results[0])
+        kpt_scores_for_class_list.append(results[1])
+        instance_inds_for_class_list.append(instance_inds)
+
+      # Concatenate all keypoints across all classes (single example).
+      kpt_coords_for_example = tf.concat(kpt_coords_for_class_list, axis=1)
+      kpt_scores_for_example = tf.concat(kpt_scores_for_class_list, axis=1)
+      instance_inds_for_example = tf.concat(instance_inds_for_class_list,
+                                            axis=0)
+
+      if tf.size(instance_inds_for_example) > 0:
+        # Scatter into tensor where instances align with original detection
+        # instances. New shape of keypoint coordinates and scores are
+        # [1, max_detections, num_total_keypoints, 2] and
+        # [1, max_detections, num_total_keypoints], respectively.
+        kpt_coords_for_example_all_det, kpt_scores_for_example_all_det = (
+            _pad_to_full_instance_dim(
+                kpt_coords_for_example, kpt_scores_for_example,
+                instance_inds_for_example,
+                self._center_params.max_box_predictions))
+      else:
+        kpt_coords_for_example_all_det = tf.zeros(
+            [1, max_detections, total_num_keypoints, 2], dtype=tf.float32)
+        kpt_scores_for_example_all_det = tf.zeros(
+            [1, max_detections, total_num_keypoints], dtype=tf.float32)
+
+      kpt_coords_for_example_list.append(kpt_coords_for_example_all_det)
+      kpt_scores_for_example_list.append(kpt_scores_for_example_all_det)
+
+    # Concatenate all keypoints and scores from all examples in the batch.
+    # Shapes are [batch_size, max_detections, num_total_keypoints, 2] and
+    # [batch_size, max_detections, num_total_keypoints], respectively.
+    keypoints = tf.concat(kpt_coords_for_example_list, axis=0)
+    keypoint_scores = tf.concat(kpt_scores_for_example_list, axis=0)
+
+    return keypoints, keypoint_scores
+
+  def _get_instance_indices(self, classes, num_detections, batch_index,
+                            class_id):
+    """Gets the instance indices that match the target class ID.
+
+    Args:
+      classes: A [batch_size, max_detections] int tensor with class indices for
+        all detected objects.
+      num_detections: A [batch_size] int tensor with the number of valid
+        detections for each image.
+      batch_index: An integer specifying the index for an example in the batch.
+      class_id: Class id
+
+    Returns:
+      instance_inds: A [num_instances] int tensor where each element indicates
+        the instance location within the `classes` tensor. This is useful to
+        associate the refined keypoints with the original detections (i.e.
+        boxes)
+    """
+    classes = classes[batch_index:batch_index+1, ...]
+    _, max_detections = shape_utils.combined_static_and_dynamic_shape(
+        classes)
+    # Get the detection indices corresponding to the target class.
+    valid_detections_with_kpt_class = tf.math.logical_and(
+        tf.range(max_detections) < num_detections[batch_index],
+        classes[0] == class_id)
+    instance_inds = tf.where(valid_detections_with_kpt_class)[:, 0]
+    return instance_inds
+
+  def _postprocess_keypoints_for_class_and_image(
+      self, keypoint_heatmap, keypoint_offsets, keypoint_regression, classes,
+      y_indices, x_indices, boxes, indices_with_kpt_class, batch_index,
+      kp_params):
+    """Postprocess keypoints for a single image and class.
+
+    This function performs the following postprocessing operations on a single
+    image and single keypoint class:
+    - Converts keypoints scores to range [0, 1] with sigmoid.
+    - Determines the detections that correspond to the specified keypoint class.
+    - Gathers the regressed keypoints at the detection (i.e. box) centers.
+    - Gathers keypoint candidates from the keypoint heatmaps.
+    - Snaps regressed keypoints to nearby keypoint candidates.
+
+    Args:
+      keypoint_heatmap: A [batch_size, height, width, num_keypoints] float32
+        tensor with keypoint heatmaps.
+      keypoint_offsets: A [batch_size, height, width, 2] float32 tensor with
+        local offsets to keypoint centers.
+      keypoint_regression: A [batch_size, height, width, 2 * num_keypoints]
+        float32 tensor with regressed offsets to all keypoints.
+      classes: A [batch_size, max_detections] int tensor with class indices for
+        all detected objects.
+      y_indices: A [batch_size, max_detections] int tensor with y indices for
+        all object centers.
+      x_indices: A [batch_size, max_detections] int tensor with x indices for
+        all object centers.
+      boxes: A [batch_size, max_detections, 4] float32 tensor with detected
+        boxes in the output (strided) frame.
+      indices_with_kpt_class: A [num_instances] int tensor where each element
+        indicates the instance location within the `classes` tensor. This is
+        useful to associate the refined keypoints with the original detections
+        (i.e. boxes)
+      batch_index: An integer specifying the index for an example in the batch.
+      kp_params: A `KeypointEstimationParams` object with parameters for a
+        single keypoint class.
+
+    Returns:
+      A tuple of
+      refined_keypoints: A [1, num_instances, num_keypoints, 2] float32 tensor
+        with refined keypoints for a single class in a single image, expressed
+        in the output (strided) coordinate frame. Note that `num_instances` is a
+        dynamic dimension, and corresponds to the number of valid detections
+        for the specific class.
+      refined_scores: A [1, num_instances, num_keypoints] float32 tensor with
+        keypoint scores.
+    """
+    keypoint_indices = kp_params.keypoint_indices
+    num_keypoints = len(keypoint_indices)
+
+    keypoint_heatmap = tf.nn.sigmoid(
+        keypoint_heatmap[batch_index:batch_index+1, ...])
+    keypoint_offsets = keypoint_offsets[batch_index:batch_index+1, ...]
+    keypoint_regression = keypoint_regression[batch_index:batch_index+1, ...]
+    y_indices = y_indices[batch_index:batch_index+1, ...]
+    x_indices = x_indices[batch_index:batch_index+1, ...]
+
+    # Gather the feature map locations corresponding to the object class.
+    y_indices_for_kpt_class = tf.gather(y_indices, indices_with_kpt_class,
+                                        axis=1)
+    x_indices_for_kpt_class = tf.gather(x_indices, indices_with_kpt_class,
+                                        axis=1)
+    boxes_for_kpt_class = tf.gather(boxes, indices_with_kpt_class, axis=1)
+
+    # Gather the regressed keypoints. Final tensor has shape
+    # [1, num_instances, num_keypoints, 2].
+    regressed_keypoints_for_objects = regressed_keypoints_at_object_centers(
+        keypoint_regression, y_indices_for_kpt_class, x_indices_for_kpt_class)
+    regressed_keypoints_for_objects = tf.reshape(
+        regressed_keypoints_for_objects, [1, -1, num_keypoints, 2])
+
+    # Get the candidate keypoints and scores.
+    # The shape of keypoint_candidates and keypoint_scores is:
+    # [1, num_candidates_per_keypoint, num_keypoints, 2] and
+    #  [1, num_candidates_per_keypoint, num_keypoints], respectively.
+    keypoint_candidates, keypoint_scores, num_keypoint_candidates = (
+        prediction_tensors_to_keypoint_candidates(
+            keypoint_heatmap, keypoint_offsets,
+            keypoint_score_threshold=(
+                kp_params.keypoint_candidate_score_threshold),
+            max_pool_kernel_size=kp_params.peak_max_pool_kernel_size,
+            max_candidates=kp_params.num_candidates_per_keypoint))
+
+    # Get the refined keypoints and scores, of shape
+    # [1, num_instances, num_keypoints, 2] and
+    # [1, num_instances, num_keypoints], respectively.
+    refined_keypoints, refined_scores = refine_keypoints(
+        regressed_keypoints_for_objects, keypoint_candidates, keypoint_scores,
+        num_keypoint_candidates, bboxes=boxes_for_kpt_class,
+        unmatched_keypoint_score=kp_params.unmatched_keypoint_score,
+        box_scale=kp_params.box_scale,
+        candidate_search_scale=kp_params.candidate_search_scale,
+        candidate_ranking_mode=kp_params.candidate_ranking_mode)
+
+    return refined_keypoints, refined_scores
+
+  def regularization_losses(self):
+    return []
+
+  def restore_map(self, fine_tune_checkpoint_type='classification',
+                  load_all_detection_checkpoint_vars=False):
+
+    if fine_tune_checkpoint_type == 'classification':
+      return {'feature_extractor': self._feature_extractor.get_base_model()}
+
+    if fine_tune_checkpoint_type == 'detection':
+      return {'feature_extractor': self._feature_extractor.get_model()}
+
+    else:
+      raise ValueError('Unknown fine tune checkpoint type - {}'.format(
+          fine_tune_checkpoint_type))
+
+  def updates(self):
+    raise RuntimeError('This model is intended to be used with model_lib_v2 '
+                       'which does not support updates()')