Unverified Commit 420a7253, authored by pkulzc and committed by GitHub

Refactor tests for Object Detection API. (#8688)

Internal changes

--

PiperOrigin-RevId: 316837667
parent d0ef3913
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the CenterNet Meta architecture code."""
from __future__ import division
import functools
import unittest
from absl.testing import parameterized
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.core import losses
from object_detection.core import preprocessor
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner as cn_assigner
from object_detection.meta_architectures import center_net_meta_arch as cnma
from object_detection.models import center_net_resnet_feature_extractor
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchPredictionHeadTest(test_case.TestCase):
"""Test CenterNet meta architecture prediction head."""
def test_prediction_head(self):
head = cnma.make_prediction_net(num_out_channels=7)
output = head(np.zeros((4, 128, 128, 8)))
self.assertEqual((4, 128, 128, 7), output.shape)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
"""Test for CenterNet meta architecture related functions."""
def test_row_col_indices_from_flattened_indices(self):
"""Tests that the computation of row, col, channel indices is correct."""
r_grid, c_grid, ch_grid = (np.zeros((5, 4, 3), dtype=np.int32),
np.zeros((5, 4, 3), dtype=np.int32),
np.zeros((5, 4, 3), dtype=np.int32))
r_grid[..., 0] = r_grid[..., 1] = r_grid[..., 2] = np.array(
[[0, 0, 0, 0],
[1, 1, 1, 1],
[2, 2, 2, 2],
[3, 3, 3, 3],
[4, 4, 4, 4]]
)
c_grid[..., 0] = c_grid[..., 1] = c_grid[..., 2] = np.array(
[[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3]]
)
for i in range(3):
ch_grid[..., i] = i
indices = np.arange(60)
ri, ci, chi = cnma.row_col_channel_indices_from_flattened_indices(
indices, 4, 3)
np.testing.assert_array_equal(ri, r_grid.flatten())
np.testing.assert_array_equal(ci, c_grid.flatten())
np.testing.assert_array_equal(chi, ch_grid.flatten())
def test_flattened_indices_from_row_col_indices(self):
r = np.array(
[[0, 0, 0, 0],
[1, 1, 1, 1],
[2, 2, 2, 2]]
)
c = np.array(
[[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3]]
)
idx = cnma.flattened_indices_from_row_col_indices(r, c, 4)
np.testing.assert_array_equal(np.arange(12), idx.flatten())
def test_get_valid_anchor_weights_in_flattened_image(self):
"""Tests that the anchor weights are valid upon flattening out."""
valid_weights = np.zeros((2, 5, 5), dtype=np.float32)
valid_weights[0, :3, :4] = 1.0
valid_weights[1, :2, :2] = 1.0
def graph_fn():
true_image_shapes = tf.constant([[3, 4], [2, 2]])
w = cnma.get_valid_anchor_weights_in_flattened_image(
true_image_shapes, 5, 5)
return w
w = self.execute(graph_fn, [])
np.testing.assert_allclose(w, valid_weights.reshape(2, -1))
self.assertEqual((2, 25), w.shape)
def test_convert_strided_predictions_to_normalized_boxes(self):
"""Tests that boxes have correct coordinates in normalized input space."""
def graph_fn():
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes[0] = [[10, 20, 30, 40], [20, 30, 50, 100], [50, 60, 100, 180]]
boxes[1] = [[-5, -5, 5, 5], [45, 60, 110, 120], [150, 150, 200, 250]]
true_image_shapes = tf.constant([[100, 90, 3], [150, 150, 3]])
clipped_boxes = (
cnma.convert_strided_predictions_to_normalized_boxes(
boxes, 2, true_image_shapes))
return clipped_boxes
clipped_boxes = self.execute(graph_fn, [])
expected_boxes = np.zeros((2, 3, 4), dtype=np.float32)
expected_boxes[0] = [[0.2, 4./9, 0.6, 8./9], [0.4, 2./3, 1, 1],
[1, 1, 1, 1]]
expected_boxes[1] = [[0., 0, 1./15, 1./15], [3./5, 4./5, 1, 1],
[1, 1, 1, 1]]
np.testing.assert_allclose(expected_boxes, clipped_boxes)
@parameterized.parameters(
{'clip_to_window': True},
{'clip_to_window': False}
)
def test_convert_strided_predictions_to_normalized_keypoints(
self, clip_to_window):
"""Tests that keypoints have correct coordinates in normalized coords."""
keypoint_coords_np = np.array(
[
# Example 0.
[
[[-10., 8.], [60., 22.], [60., 120.]],
[[20., 20.], [0., 0.], [0., 0.]],
],
# Example 1.
[
[[40., 50.], [20., 160.], [200., 150.]],
[[10., 0.], [40., 10.], [0., 0.]],
],
], dtype=np.float32)
keypoint_scores_np = np.array(
[
# Example 0.
[
[1.0, 0.9, 0.2],
[0.7, 0.0, 0.0],
],
# Example 1.
[
[1.0, 1.0, 0.2],
[0.7, 0.6, 0.0],
],
], dtype=np.float32)
def graph_fn():
keypoint_coords = tf.constant(keypoint_coords_np, dtype=tf.float32)
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
true_image_shapes = tf.constant([[320, 400, 3], [640, 640, 3]])
stride = 4
keypoint_coords_out, keypoint_scores_out = (
cnma.convert_strided_predictions_to_normalized_keypoints(
keypoint_coords, keypoint_scores, stride, true_image_shapes,
clip_to_window))
return keypoint_coords_out, keypoint_scores_out
keypoint_coords_out, keypoint_scores_out = self.execute(graph_fn, [])
if clip_to_window:
expected_keypoint_coords_np = np.array(
[
# Example 0.
[
[[0.0, 0.08], [0.75, 0.22], [0.75, 1.0]],
[[0.25, 0.2], [0., 0.], [0.0, 0.0]],
],
# Example 1.
[
[[0.25, 0.3125], [0.125, 1.0], [1.0, 0.9375]],
[[0.0625, 0.], [0.25, 0.0625], [0., 0.]],
],
], dtype=np.float32)
expected_keypoint_scores_np = np.array(
[
# Example 0.
[
[0.0, 0.9, 0.0],
[0.7, 0.0, 0.0],
],
# Example 1.
[
[1.0, 1.0, 0.0],
[0.7, 0.6, 0.0],
],
], dtype=np.float32)
else:
expected_keypoint_coords_np = np.array(
[
# Example 0.
[
[[-0.125, 0.08], [0.75, 0.22], [0.75, 1.2]],
[[0.25, 0.2], [0., 0.], [0., 0.]],
],
# Example 1.
[
[[0.25, 0.3125], [0.125, 1.0], [1.25, 0.9375]],
[[0.0625, 0.], [0.25, 0.0625], [0., 0.]],
],
], dtype=np.float32)
expected_keypoint_scores_np = np.array(
[
# Example 0.
[
[1.0, 0.9, 0.2],
[0.7, 0.0, 0.0],
],
# Example 1.
[
[1.0, 1.0, 0.2],
[0.7, 0.6, 0.0],
],
], dtype=np.float32)
np.testing.assert_allclose(expected_keypoint_coords_np, keypoint_coords_out)
np.testing.assert_allclose(expected_keypoint_scores_np, keypoint_scores_out)
def test_convert_strided_predictions_to_instance_masks(self):
def graph_fn():
boxes = tf.constant(
[
[[0.5, 0.5, 1.0, 1.0],
[0.0, 0.5, 0.5, 1.0],
[0.0, 0.0, 0.0, 0.0]],
], tf.float32)
classes = tf.constant(
[
[0, 1, 0],
], tf.int32)
masks_np = np.zeros((1, 4, 4, 2), dtype=np.float32)
masks_np[0, :, 2:, 0] = 1 # Class 0.
masks_np[0, :, :3, 1] = 1 # Class 1.
masks = tf.constant(masks_np)
true_image_shapes = tf.constant([[6, 8, 3]])
instance_masks = cnma.convert_strided_predictions_to_instance_masks(
boxes, classes, masks, stride=2, mask_height=2, mask_width=2,
true_image_shapes=true_image_shapes)
return instance_masks
instance_masks = self.execute_cpu(graph_fn, [])
expected_instance_masks = np.array(
[
[
# Mask 0 (class 0).
[[1, 1],
[1, 1]],
# Mask 1 (class 1).
[[1, 0],
[1, 0]],
# Mask 2 (class 0).
[[0, 0],
[0, 0]],
]
])
np.testing.assert_array_equal(expected_instance_masks, instance_masks)
def test_top_k_feature_map_locations(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 1] = 1.0
feature_map_np[0, 2, 1, 1] = 0.9  # Gets filtered due to max pool.
feature_map_np[0, 0, 1, 0] = 0.7
feature_map_np[0, 2, 2, 0] = 0.5
feature_map_np[0, 2, 2, 1] = -0.3
feature_map_np[1, 2, 1, 1] = 0.7
feature_map_np[1, 1, 0, 0] = 0.4
feature_map_np[1, 1, 2, 0] = 0.1
def graph_fn():
feature_map = tf.constant(feature_map_np)
scores, y_inds, x_inds, channel_inds = (
cnma.top_k_feature_map_locations(
feature_map, max_pool_kernel_size=3, k=3))
return scores, y_inds, x_inds, channel_inds
scores, y_inds, x_inds, channel_inds = self.execute(graph_fn, [])
np.testing.assert_allclose([1.0, 0.7, 0.5], scores[0])
np.testing.assert_array_equal([2, 0, 2], y_inds[0])
np.testing.assert_array_equal([0, 1, 2], x_inds[0])
np.testing.assert_array_equal([1, 0, 0], channel_inds[0])
np.testing.assert_allclose([0.7, 0.4, 0.1], scores[1])
np.testing.assert_array_equal([2, 1, 1], y_inds[1])
np.testing.assert_array_equal([1, 0, 2], x_inds[1])
np.testing.assert_array_equal([1, 0, 0], channel_inds[1])
def test_top_k_feature_map_locations_no_pooling(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 1] = 1.0
feature_map_np[0, 2, 1, 1] = 0.9
feature_map_np[0, 0, 1, 0] = 0.7
feature_map_np[0, 2, 2, 0] = 0.5
feature_map_np[0, 2, 2, 1] = -0.3
feature_map_np[1, 2, 1, 1] = 0.7
feature_map_np[1, 1, 0, 0] = 0.4
feature_map_np[1, 1, 2, 0] = 0.1
def graph_fn():
feature_map = tf.constant(feature_map_np)
scores, y_inds, x_inds, channel_inds = (
cnma.top_k_feature_map_locations(
feature_map, max_pool_kernel_size=1, k=3))
return scores, y_inds, x_inds, channel_inds
scores, y_inds, x_inds, channel_inds = self.execute(graph_fn, [])
np.testing.assert_allclose([1.0, 0.9, 0.7], scores[0])
np.testing.assert_array_equal([2, 2, 0], y_inds[0])
np.testing.assert_array_equal([0, 1, 1], x_inds[0])
np.testing.assert_array_equal([1, 1, 0], channel_inds[0])
np.testing.assert_allclose([0.7, 0.4, 0.1], scores[1])
np.testing.assert_array_equal([2, 1, 1], y_inds[1])
np.testing.assert_array_equal([1, 0, 2], x_inds[1])
np.testing.assert_array_equal([1, 0, 0], channel_inds[1])
def test_top_k_feature_map_locations_per_channel(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 0] = 1.0 # Selected.
feature_map_np[0, 2, 1, 0] = 0.9  # Gets filtered due to max pool.
feature_map_np[0, 0, 1, 0] = 0.7 # Selected.
feature_map_np[0, 2, 2, 1] = 0.5 # Selected.
feature_map_np[0, 0, 0, 1] = 0.3 # Selected.
feature_map_np[1, 2, 1, 0] = 0.7 # Selected.
feature_map_np[1, 1, 0, 0] = 0.4  # Gets filtered due to max pool.
feature_map_np[1, 1, 2, 0] = 0.3  # Gets filtered due to max pool.
feature_map_np[1, 1, 0, 1] = 0.8 # Selected.
feature_map_np[1, 1, 2, 1] = 0.3 # Selected.
def graph_fn():
feature_map = tf.constant(feature_map_np)
scores, y_inds, x_inds, channel_inds = (
cnma.top_k_feature_map_locations(
feature_map, max_pool_kernel_size=3, k=2, per_channel=True))
return scores, y_inds, x_inds, channel_inds
scores, y_inds, x_inds, channel_inds = self.execute(graph_fn, [])
np.testing.assert_allclose([1.0, 0.7, 0.5, 0.3], scores[0])
np.testing.assert_array_equal([2, 0, 2, 0], y_inds[0])
np.testing.assert_array_equal([0, 1, 2, 0], x_inds[0])
np.testing.assert_array_equal([0, 0, 1, 1], channel_inds[0])
np.testing.assert_allclose([0.7, 0.0, 0.8, 0.3], scores[1])
np.testing.assert_array_equal([2, 0, 1, 1], y_inds[1])
np.testing.assert_array_equal([1, 0, 0, 2], x_inds[1])
np.testing.assert_array_equal([0, 0, 1, 1], channel_inds[1])
def test_box_prediction(self):
class_pred = np.zeros((3, 128, 128, 5), dtype=np.float32)
hw_pred = np.zeros((3, 128, 128, 2), dtype=np.float32)
offset_pred = np.zeros((3, 128, 128, 2), dtype=np.float32)
# Sample 1, 2 boxes
class_pred[0, 10, 20] = [0.3, .7, 0.0, 0.0, 0.0]
hw_pred[0, 10, 20] = [40, 60]
offset_pred[0, 10, 20] = [1, 2]
class_pred[0, 50, 60] = [0.55, 0.0, 0.0, 0.0, 0.45]
hw_pred[0, 50, 60] = [50, 50]
offset_pred[0, 50, 60] = [0, 0]
# Sample 2, 2 boxes (at same location)
class_pred[1, 100, 100] = [0.0, 0.1, 0.9, 0.0, 0.0]
hw_pred[1, 100, 100] = [10, 10]
offset_pred[1, 100, 100] = [1, 3]
# Sample 3, 3 boxes
class_pred[2, 60, 90] = [0.0, 0.0, 0.0, 0.2, 0.8]
hw_pred[2, 60, 90] = [40, 30]
offset_pred[2, 60, 90] = [0, 0]
class_pred[2, 65, 95] = [0.0, 0.7, 0.3, 0.0, 0.0]
hw_pred[2, 65, 95] = [20, 20]
offset_pred[2, 65, 95] = [1, 2]
class_pred[2, 75, 85] = [1.0, 0.0, 0.0, 0.0, 0.0]
hw_pred[2, 75, 85] = [21, 25]
offset_pred[2, 75, 85] = [5, 2]
def graph_fn():
class_pred_tensor = tf.constant(class_pred)
hw_pred_tensor = tf.constant(hw_pred)
offset_pred_tensor = tf.constant(offset_pred)
detection_scores, y_indices, x_indices, channel_indices = (
cnma.top_k_feature_map_locations(
class_pred_tensor, max_pool_kernel_size=3, k=2))
boxes, classes, scores, num_dets = cnma.prediction_tensors_to_boxes(
detection_scores, y_indices, x_indices, channel_indices,
hw_pred_tensor, offset_pred_tensor)
return boxes, classes, scores, num_dets
boxes, classes, scores, num_dets = self.execute(graph_fn, [])
np.testing.assert_array_equal(num_dets, [2, 2, 2])
np.testing.assert_allclose(
[[-9, -8, 31, 52], [25, 35, 75, 85]], boxes[0])
np.testing.assert_allclose(
[[96, 98, 106, 108], [96, 98, 106, 108]], boxes[1])
np.testing.assert_allclose(
[[69.5, 74.5, 90.5, 99.5], [40, 75, 80, 105]], boxes[2])
np.testing.assert_array_equal(classes[0], [1, 0])
np.testing.assert_array_equal(classes[1], [2, 1])
np.testing.assert_array_equal(classes[2], [0, 4])
np.testing.assert_allclose(scores[0], [.7, .55])
np.testing.assert_allclose(scores[1][:1], [.9])
np.testing.assert_allclose(scores[2], [1., .8])
def test_keypoint_candidate_prediction(self):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0
keypoint_heatmap_np[0, 2, 1, 0] = 0.7
keypoint_heatmap_np[0, 1, 1, 0] = 0.6
keypoint_heatmap_np[0, 0, 2, 1] = 0.7
keypoint_heatmap_np[0, 1, 1, 1] = 0.3 # Filtered by low score.
keypoint_heatmap_np[0, 2, 2, 1] = 0.2
keypoint_heatmap_np[1, 1, 0, 0] = 0.6
keypoint_heatmap_np[1, 2, 1, 0] = 0.5
keypoint_heatmap_np[1, 0, 0, 0] = 0.4
keypoint_heatmap_np[1, 0, 0, 1] = 1.0
keypoint_heatmap_np[1, 0, 1, 1] = 0.9
keypoint_heatmap_np[1, 2, 0, 1] = 0.8
keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25]
keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5]
keypoint_heatmap_offsets_np[0, 1, 1] = [0.0, 0.0]
keypoint_heatmap_offsets_np[0, 0, 2] = [1.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 2] = [1.0, 1.0]
keypoint_heatmap_offsets_np[1, 1, 0] = [0.25, 0.5]
keypoint_heatmap_offsets_np[1, 2, 1] = [0.5, 0.0]
keypoint_heatmap_offsets_np[1, 0, 0] = [0.0, -0.5]
keypoint_heatmap_offsets_np[1, 0, 1] = [0.5, -0.5]
keypoint_heatmap_offsets_np[1, 2, 0] = [-1.0, -0.5]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_heatmap_offsets = tf.constant(
keypoint_heatmap_offsets_np, dtype=tf.float32)
keypoint_cands, keypoint_scores, num_keypoint_candidates = (
cnma.prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_heatmap_offsets,
keypoint_score_threshold=0.5,
max_pool_kernel_size=1,
max_candidates=2))
return keypoint_cands, keypoint_scores, num_keypoint_candidates
(keypoint_cands, keypoint_scores,
num_keypoint_candidates) = self.execute(graph_fn, [])
expected_keypoint_candidates = [
[ # Example 0.
[[0.5, 0.25], [1.0, 2.0]], # Keypoint 1.
[[1.75, 1.5], [1.0, 1.0]], # Keypoint 2.
],
[ # Example 1.
[[1.25, 0.5], [0.0, -0.5]], # Keypoint 1.
[[2.5, 1.0], [0.5, 0.5]], # Keypoint 2.
],
]
expected_keypoint_scores = [
[ # Example 0.
[1.0, 0.7], # Keypoint 1.
[0.7, 0.3], # Keypoint 2.
],
[ # Example 1.
[0.6, 1.0], # Keypoint 1.
[0.5, 0.9], # Keypoint 2.
],
]
expected_num_keypoint_candidates = [
[2, 1],
[2, 2]
]
np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands)
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
np.testing.assert_array_equal(expected_num_keypoint_candidates,
num_keypoint_candidates)
def test_keypoint_candidate_prediction_per_keypoints(self):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0
keypoint_heatmap_np[0, 2, 1, 0] = 0.7
keypoint_heatmap_np[0, 1, 1, 0] = 0.6
keypoint_heatmap_np[0, 0, 2, 1] = 0.7
keypoint_heatmap_np[0, 1, 1, 1] = 0.3 # Filtered by low score.
keypoint_heatmap_np[0, 2, 2, 1] = 0.2
keypoint_heatmap_np[1, 1, 0, 0] = 0.6
keypoint_heatmap_np[1, 2, 1, 0] = 0.5
keypoint_heatmap_np[1, 0, 0, 0] = 0.4
keypoint_heatmap_np[1, 0, 0, 1] = 1.0
keypoint_heatmap_np[1, 0, 1, 1] = 0.9
keypoint_heatmap_np[1, 2, 0, 1] = 0.8
keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 4), dtype=np.float32)
keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 1, 1] = [0.0, 0.0, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 0, 2] = [0.0, 0.0, 1.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 2] = [0.0, 0.0, 1.0, 1.0]
keypoint_heatmap_offsets_np[1, 1, 0] = [0.25, 0.5, 0.0, 0.0]
keypoint_heatmap_offsets_np[1, 2, 1] = [0.5, 0.0, 0.0, 0.0]
keypoint_heatmap_offsets_np[1, 0, 0] = [0.0, 0.0, 0.0, -0.5]
keypoint_heatmap_offsets_np[1, 0, 1] = [0.0, 0.0, 0.5, -0.5]
keypoint_heatmap_offsets_np[1, 2, 0] = [0.0, 0.0, -1.0, -0.5]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_heatmap_offsets = tf.constant(
keypoint_heatmap_offsets_np, dtype=tf.float32)
keypoint_cands, keypoint_scores, num_keypoint_candidates = (
cnma.prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_heatmap_offsets,
keypoint_score_threshold=0.5,
max_pool_kernel_size=1,
max_candidates=2))
return keypoint_cands, keypoint_scores, num_keypoint_candidates
(keypoint_cands, keypoint_scores,
num_keypoint_candidates) = self.execute(graph_fn, [])
expected_keypoint_candidates = [
[ # Example 0.
[[0.5, 0.25], [1.0, 2.0]], # Candidate 1 of keypoint 1, 2.
[[1.75, 1.5], [1.0, 1.0]], # Candidate 2 of keypoint 1, 2.
],
[ # Example 1.
[[1.25, 0.5], [0.0, -0.5]], # Candidate 1 of keypoint 1, 2.
[[2.5, 1.0], [0.5, 0.5]], # Candidate 2 of keypoint 1, 2.
],
]
expected_keypoint_scores = [
[ # Example 0.
[1.0, 0.7], # Candidate 1 scores of keypoint 1, 2.
[0.7, 0.3], # Candidate 2 scores of keypoint 1, 2.
],
[ # Example 1.
[0.6, 1.0], # Candidate 1 scores of keypoint 1, 2.
[0.5, 0.9], # Candidate 2 scores of keypoint 1, 2.
],
]
expected_num_keypoint_candidates = [
[2, 1],
[2, 2]
]
np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands)
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
np.testing.assert_array_equal(expected_num_keypoint_candidates,
num_keypoint_candidates)
def test_regressed_keypoints_at_object_centers(self):
batch_size = 2
num_keypoints = 5
num_instances = 6
regressed_keypoint_feature_map_np = np.random.randn(
batch_size, 10, 10, 2 * num_keypoints).astype(np.float32)
y_indices = np.random.choice(10, (batch_size, num_instances))
x_indices = np.random.choice(10, (batch_size, num_instances))
offsets = np.stack([y_indices, x_indices], axis=2).astype(np.float32)
def graph_fn():
regressed_keypoint_feature_map = tf.constant(
regressed_keypoint_feature_map_np, dtype=tf.float32)
gathered_regressed_keypoints = (
cnma.regressed_keypoints_at_object_centers(
regressed_keypoint_feature_map,
tf.constant(y_indices, dtype=tf.int32),
tf.constant(x_indices, dtype=tf.int32)))
return gathered_regressed_keypoints
gathered_regressed_keypoints = self.execute(graph_fn, [])
expected_gathered_keypoints_0 = regressed_keypoint_feature_map_np[
0, y_indices[0], x_indices[0], :]
expected_gathered_keypoints_1 = regressed_keypoint_feature_map_np[
1, y_indices[1], x_indices[1], :]
expected_gathered_keypoints = np.stack([
expected_gathered_keypoints_0,
expected_gathered_keypoints_1], axis=0)
expected_gathered_keypoints = np.reshape(
expected_gathered_keypoints,
[batch_size, num_instances, num_keypoints, 2])
expected_gathered_keypoints += np.expand_dims(offsets, axis=2)
expected_gathered_keypoints = np.reshape(
expected_gathered_keypoints,
[batch_size, num_instances, -1])
np.testing.assert_allclose(expected_gathered_keypoints,
gathered_regressed_keypoints)
@parameterized.parameters(
{'candidate_ranking_mode': 'min_distance'},
{'candidate_ranking_mode': 'score_distance_ratio'},
)
def test_refine_keypoints(self, candidate_ranking_mode):
regressed_keypoints_np = np.array(
[
# Example 0.
[
[[2.0, 2.0], [6.0, 10.0], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [5.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 2.0], [0.0, 0.0], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
keypoint_candidates_np = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [4.0, 7.0]], # Candidate 0.
[[1.0, 8.0], [0.0, 0.0], [2.0, 2.0]], # Candidate 1.
[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]], # Candidate 2.
],
# Example 1.
[
[[6.0, 1.5], [0.1, 0.4], [0.0, 0.0]], # Candidate 0.
[[1.0, 4.0], [0.0, 0.3], [0.0, 0.0]], # Candidate 1.
[[0.0, 0.0], [0.1, 0.3], [0.0, 0.0]], # Candidate 2.
]
], dtype=np.float32)
keypoint_scores_np = np.array(
[
# Example 0.
[
[0.8, 0.9, 1.0], # Candidate 0.
[0.6, 0.1, 0.9], # Candidate 1.
[0.0, 0.0, 0.0], # Candidate 2.
],
# Example 1.
[
[0.7, 0.3, 0.0], # Candidate 0.
[0.6, 0.1, 0.0], # Candidate 1.
[0.0, 0.28, 0.0], # Candidate 2.
]
], dtype=np.float32)
num_keypoints_candidates_np = np.array(
[
# Example 0.
[2, 2, 2],
# Example 1.
[2, 3, 0],
], dtype=np.int32)
unmatched_keypoint_score = 0.1
def graph_fn():
regressed_keypoints = tf.constant(
regressed_keypoints_np, dtype=tf.float32)
keypoint_candidates = tf.constant(
keypoint_candidates_np, dtype=tf.float32)
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
num_keypoint_candidates = tf.constant(num_keypoints_candidates_np,
dtype=tf.int32)
refined_keypoints, refined_scores = cnma.refine_keypoints(
regressed_keypoints, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=None,
unmatched_keypoint_score=unmatched_keypoint_score,
box_scale=1.2, candidate_search_scale=0.3,
candidate_ranking_mode=candidate_ranking_mode)
return refined_keypoints, refined_scores
refined_keypoints, refined_scores = self.execute(graph_fn, [])
if candidate_ranking_mode == 'min_distance':
expected_refined_keypoints = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [4.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 1.5], [0.0, 0.3], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
expected_refined_scores = np.array(
[
# Example 0.
[
[0.8, 0.9, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
unmatched_keypoint_score, 1.0],
],
# Example 1.
[
[0.7, 0.1, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
0.1, unmatched_keypoint_score],
],
], dtype=np.float32)
else:
expected_refined_keypoints = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [4.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 1.5], [0.1, 0.3], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
expected_refined_scores = np.array(
[
# Example 0.
[
[0.8, 0.9, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
unmatched_keypoint_score, 1.0],
],
# Example 1.
[
[0.7, 0.28, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
0.1, unmatched_keypoint_score],
],
], dtype=np.float32)
np.testing.assert_allclose(expected_refined_keypoints, refined_keypoints)
np.testing.assert_allclose(expected_refined_scores, refined_scores)
def test_refine_keypoints_with_bboxes(self):
regressed_keypoints_np = np.array(
[
# Example 0.
[
[[2.0, 2.0], [6.0, 10.0], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [5.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 2.0], [0.0, 0.0], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
keypoint_candidates_np = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [4.0, 7.0]], # Candidate 0.
[[1.0, 8.0], [0.0, 0.0], [2.0, 2.0]], # Candidate 1.
],
# Example 1.
[
[[6.0, 1.5], [5.0, 5.0], [0.0, 0.0]], # Candidate 0.
[[1.0, 4.0], [0.0, 0.3], [0.0, 0.0]], # Candidate 1.
]
], dtype=np.float32)
keypoint_scores_np = np.array(
[
# Example 0.
[
[0.8, 0.9, 1.0], # Candidate 0.
[0.6, 0.1, 0.9], # Candidate 1.
],
# Example 1.
[
[0.7, 0.4, 0.0], # Candidate 0.
[0.6, 0.1, 0.0], # Candidate 1.
]
], dtype=np.float32)
num_keypoints_candidates_np = np.array(
[
# Example 0.
[2, 2, 2],
# Example 1.
[2, 2, 0],
], dtype=np.int32)
bboxes_np = np.array(
[
# Example 0.
[
[2.0, 2.0, 14.0, 10.0], # Instance 0.
[0.0, 3.0, 5.0, 7.0], # Instance 1.
],
# Example 1.
[
[0.0, 0.0, 6.0, 2.0], # Instance 0.
[5.0, 1.4, 9.0, 5.0], # Instance 1.
],
], dtype=np.float32)
unmatched_keypoint_score = 0.1
def graph_fn():
regressed_keypoints = tf.constant(
regressed_keypoints_np, dtype=tf.float32)
keypoint_candidates = tf.constant(
keypoint_candidates_np, dtype=tf.float32)
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
num_keypoint_candidates = tf.constant(num_keypoints_candidates_np,
dtype=tf.int32)
bboxes = tf.constant(bboxes_np, dtype=tf.float32)
refined_keypoints, refined_scores = cnma.refine_keypoints(
regressed_keypoints, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=bboxes,
unmatched_keypoint_score=unmatched_keypoint_score,
box_scale=1.0, candidate_search_scale=0.3)
return refined_keypoints, refined_scores
refined_keypoints, refined_scores = self.execute(graph_fn, [])
expected_refined_keypoints = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.0], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [4.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 1.5], [0.0, 0.3], [0.1, 0.1]], # Instance 0.
[[6.0, 1.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
expected_refined_scores = np.array(
[
# Example 0.
[
[0.8, unmatched_keypoint_score, # Instance 0.
unmatched_keypoint_score],
[unmatched_keypoint_score, # Instance 1.
unmatched_keypoint_score, 1.0],
],
# Example 1.
[
[0.7, 0.1, unmatched_keypoint_score], # Instance 0.
[0.7, 0.4, unmatched_keypoint_score], # Instance 1.
],
], dtype=np.float32)
np.testing.assert_allclose(expected_refined_keypoints, refined_keypoints)
np.testing.assert_allclose(expected_refined_scores, refined_scores)
def test_pad_to_full_keypoint_dim(self):
batch_size = 4
num_instances = 8
num_keypoints = 2
keypoint_inds = [1, 3]
num_total_keypoints = 5
kpt_coords_np = np.random.randn(batch_size, num_instances, num_keypoints, 2)
kpt_scores_np = np.random.randn(batch_size, num_instances, num_keypoints)
def graph_fn():
kpt_coords = tf.constant(kpt_coords_np)
kpt_scores = tf.constant(kpt_scores_np)
kpt_coords_padded, kpt_scores_padded = (
cnma._pad_to_full_keypoint_dim(
kpt_coords, kpt_scores, keypoint_inds, num_total_keypoints))
return kpt_coords_padded, kpt_scores_padded
kpt_coords_padded, kpt_scores_padded = self.execute(graph_fn, [])
self.assertAllEqual([batch_size, num_instances, num_total_keypoints, 2],
kpt_coords_padded.shape)
self.assertAllEqual([batch_size, num_instances, num_total_keypoints],
kpt_scores_padded.shape)
for i, kpt_ind in enumerate(keypoint_inds):
np.testing.assert_allclose(kpt_coords_np[:, :, i, :],
kpt_coords_padded[:, :, kpt_ind, :])
np.testing.assert_allclose(kpt_scores_np[:, :, i],
kpt_scores_padded[:, :, kpt_ind])
def test_pad_to_full_instance_dim(self):
batch_size = 4
max_instances = 8
num_keypoints = 6
num_instances = 2
instance_inds = [1, 3]
kpt_coords_np = np.random.randn(batch_size, num_instances, num_keypoints, 2)
kpt_scores_np = np.random.randn(batch_size, num_instances, num_keypoints)
def graph_fn():
kpt_coords = tf.constant(kpt_coords_np)
kpt_scores = tf.constant(kpt_scores_np)
kpt_coords_padded, kpt_scores_padded = (
cnma._pad_to_full_instance_dim(
kpt_coords, kpt_scores, instance_inds, max_instances))
return kpt_coords_padded, kpt_scores_padded
kpt_coords_padded, kpt_scores_padded = self.execute(graph_fn, [])
self.assertAllEqual([batch_size, max_instances, num_keypoints, 2],
kpt_coords_padded.shape)
self.assertAllEqual([batch_size, max_instances, num_keypoints],
kpt_scores_padded.shape)
for i, inst_ind in enumerate(instance_inds):
np.testing.assert_allclose(kpt_coords_np[:, i, :, :],
kpt_coords_padded[:, inst_ind, :, :])
np.testing.assert_allclose(kpt_scores_np[:, i, :],
kpt_scores_padded[:, inst_ind, :])
# Common parameters for setting up testing examples across tests.
_NUM_CLASSES = 10
_KEYPOINT_INDICES = [0, 1, 2, 3]
_NUM_KEYPOINTS = len(_KEYPOINT_INDICES)
_TASK_NAME = 'human_pose'
def get_fake_center_params():
"""Returns the fake object center parameter namedtuple."""
return cnma.ObjectCenterParams(
classification_loss=losses.WeightedSigmoidClassificationLoss(),
object_center_loss_weight=1.0,
min_box_overlap_iou=1.0,
max_box_predictions=5,
use_labeled_classes=False)
def get_fake_od_params():
"""Returns the fake object detection parameter namedtuple."""
return cnma.ObjectDetectionParams(
localization_loss=losses.L1LocalizationLoss(),
offset_loss_weight=1.0,
scale_loss_weight=0.1)
def get_fake_kp_params():
"""Returns the fake keypoint estimation parameter namedtuple."""
return cnma.KeypointEstimationParams(
task_name=_TASK_NAME,
class_id=1,
keypoint_indices=_KEYPOINT_INDICES,
keypoint_std_dev=[0.00001] * len(_KEYPOINT_INDICES),
classification_loss=losses.WeightedSigmoidClassificationLoss(),
localization_loss=losses.L1LocalizationLoss(),
keypoint_candidate_score_threshold=0.1)
def get_fake_mask_params():
"""Returns the fake mask estimation parameter namedtuple."""
return cnma.MaskParams(
classification_loss=losses.WeightedSoftmaxClassificationLoss(),
task_loss_weight=1.0,
mask_height=4,
mask_width=4)
def build_center_net_meta_arch(build_resnet=False):
"""Builds the CenterNet meta architecture."""
if build_resnet:
feature_extractor = (
center_net_resnet_feature_extractor.CenterNetResnetFeatureExtractor(
'resnet_v2_101'))
else:
feature_extractor = DummyFeatureExtractor(
channel_means=(1.0, 2.0, 3.0),
channel_stds=(10., 20., 30.),
bgr_ordering=False,
num_feature_outputs=2,
stride=4)
image_resizer_fn = functools.partial(
preprocessor.resize_to_range,
min_dimension=128,
max_dimension=128,
pad_to_max_dimension=True)
return cnma.CenterNetMetaArch(
is_training=True,
add_summaries=False,
num_classes=_NUM_CLASSES,
feature_extractor=feature_extractor,
image_resizer_fn=image_resizer_fn,
object_center_params=get_fake_center_params(),
object_detection_params=get_fake_od_params(),
keypoint_params_dict={_TASK_NAME: get_fake_kp_params()},
mask_params=get_fake_mask_params())
def _logit(p):
return np.log(
(p + np.finfo(np.float32).eps) / (1 - p + np.finfo(np.float32).eps))
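# Note: _logit is the inverse of the sigmoid; it converts the probabilities
# used in these tests into logits, with a small epsilon added for numerical
# stability. For example, _logit(0.5) is approximately 0.0 and _logit(0.75)
# is approximately log(3).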
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchLibTest(test_case.TestCase):
"""Test for CenterNet meta architecture related functions."""
def test_get_keypoint_name(self):
self.assertEqual('human_pose/keypoint_offset',
cnma.get_keypoint_name('human_pose', 'keypoint_offset'))
def test_get_num_instances_from_weights(self):
weight1 = tf.constant([0.0, 0.0, 0.0], dtype=tf.float32)
weight2 = tf.constant([0.5, 0.9, 0.0], dtype=tf.float32)
weight3 = tf.constant([0.0, 0.0, 1.0], dtype=tf.float32)
def graph_fn_1():
# Total of three elements with non-zero values.
num_instances = cnma.get_num_instances_from_weights(
[weight1, weight2, weight3])
return num_instances
num_instances = self.execute(graph_fn_1, [])
self.assertAlmostEqual(3, num_instances)
# No non-zero value in the weights. Return minimum value: 1.
def graph_fn_2():
# No non-zero values in the weights, so the minimum of 1 is returned.
num_instances = cnma.get_num_instances_from_weights([weight1, weight1])
return num_instances
num_instances = self.execute(graph_fn_2, [])
self.assertAlmostEqual(1, num_instances)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
"""Tests for the CenterNet meta architecture."""
def test_construct_prediction_heads(self):
model = build_center_net_meta_arch()
fake_feature_map = np.zeros((4, 128, 128, 8))
# Check the dictionary contains expected keys and corresponding heads with
# correct dimensions.
# "object center" head:
output = model._prediction_head_dict[cnma.OBJECT_CENTER][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_CLASSES), output.shape)
# "object scale" (height/width) head:
output = model._prediction_head_dict[cnma.BOX_SCALE][-1](fake_feature_map)
self.assertEqual((4, 128, 128, 2), output.shape)
# "object offset" head:
output = model._prediction_head_dict[cnma.BOX_OFFSET][-1](fake_feature_map)
self.assertEqual((4, 128, 128, 2), output.shape)
# "keypoint offset" head:
output = model._prediction_head_dict[
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET)][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, 2), output.shape)
# "keypoint heatmap" head:
output = model._prediction_head_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_HEATMAP)][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_KEYPOINTS), output.shape)
# "keypoint regression" head:
output = model._prediction_head_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_REGRESSION)][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, 2 * _NUM_KEYPOINTS), output.shape)
# "mask" head:
output = model._prediction_head_dict[cnma.SEGMENTATION_HEATMAP][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_CLASSES), output.shape)
def test_initialize_target_assigners(self):
model = build_center_net_meta_arch()
assigner_dict = model._initialize_target_assigners(
stride=2,
min_box_overlap_iou=0.7)
# Check whether the corresponding target assigner class is initialized.
# object center target assigner:
self.assertIsInstance(assigner_dict[cnma.OBJECT_CENTER],
cn_assigner.CenterNetCenterHeatmapTargetAssigner)
# object detection target assigner:
self.assertIsInstance(assigner_dict[cnma.DETECTION_TASK],
cn_assigner.CenterNetBoxTargetAssigner)
# keypoint estimation target assigner:
self.assertIsInstance(assigner_dict[_TASK_NAME],
cn_assigner.CenterNetKeypointTargetAssigner)
# mask estimation target assigner:
self.assertIsInstance(assigner_dict[cnma.SEGMENTATION_TASK],
cn_assigner.CenterNetMaskTargetAssigner)
def test_predict(self):
"""Test the predict function."""
model = build_center_net_meta_arch()
def graph_fn():
prediction_dict = model.predict(tf.zeros([2, 128, 128, 3]), None)
return prediction_dict
prediction_dict = self.execute(graph_fn, [])
self.assertEqual(prediction_dict['preprocessed_inputs'].shape,
(2, 128, 128, 3))
self.assertEqual(prediction_dict[cnma.OBJECT_CENTER][0].shape,
(2, 32, 32, _NUM_CLASSES))
self.assertEqual(prediction_dict[cnma.BOX_SCALE][0].shape,
(2, 32, 32, 2))
self.assertEqual(prediction_dict[cnma.BOX_OFFSET][0].shape,
(2, 32, 32, 2))
self.assertEqual(prediction_dict[cnma.SEGMENTATION_HEATMAP][0].shape,
(2, 32, 32, _NUM_CLASSES))
def test_loss(self):
"""Test the loss function."""
groundtruth_dict = get_fake_groundtruth_dict(16, 32, 4)
model = build_center_net_meta_arch()
model.provide_groundtruth(
groundtruth_boxes_list=groundtruth_dict[fields.BoxListFields.boxes],
groundtruth_weights_list=groundtruth_dict[fields.BoxListFields.weights],
groundtruth_classes_list=groundtruth_dict[fields.BoxListFields.classes],
groundtruth_keypoints_list=groundtruth_dict[
fields.BoxListFields.keypoints],
groundtruth_masks_list=groundtruth_dict[
fields.BoxListFields.masks])
prediction_dict = get_fake_prediction_dict(
input_height=16, input_width=32, stride=4)
def graph_fn():
loss_dict = model.loss(prediction_dict,
tf.constant([[16, 24, 3], [16, 24, 3]]))
return loss_dict
loss_dict = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX, cnma.OBJECT_CENTER)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX, cnma.BOX_SCALE)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX, cnma.BOX_OFFSET)])
self.assertGreater(
0.01,
loss_dict['%s/%s' %
(cnma.LOSS_KEY_PREFIX,
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP))])
self.assertGreater(
0.01,
loss_dict['%s/%s' %
(cnma.LOSS_KEY_PREFIX,
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET))])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_REGRESSION))])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.SEGMENTATION_HEATMAP)])
@parameterized.parameters(
{'target_class_id': 1},
{'target_class_id': 2},
)
def test_postprocess(self, target_class_id):
"""Test the postprocess function."""
model = build_center_net_meta_arch()
max_detection = model._center_params.max_box_predictions
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 10), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32)
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(10)
class_probs[target_class_id] = _logit(0.75)
class_center[0, 16, 16] = class_probs
height_width[0, 16, 16] = [5, 10]
offset[0, 16, 16] = [.25, .5]
keypoint_regression[0, 16, 16] = [
-1., -1.,
-1., 1.,
1., -1.,
1., 1.]
keypoint_heatmaps[0, 14, 14, 0] = _logit(0.9)
keypoint_heatmaps[0, 14, 18, 1] = _logit(0.9)
keypoint_heatmaps[0, 18, 14, 2] = _logit(0.9)
keypoint_heatmaps[0, 18, 18, 3] = _logit(0.05) # Note the low score.
segmentation_heatmap = np.zeros((1, 32, 32, 10), dtype=np.float32)
segmentation_heatmap[:, 14:18, 14:18, target_class_id] = 1.0
segmentation_heatmap = _logit(segmentation_heatmap)
class_center = tf.constant(class_center)
height_width = tf.constant(height_width)
offset = tf.constant(offset)
keypoint_heatmaps = tf.constant(keypoint_heatmaps, dtype=tf.float32)
keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
segmentation_heatmap = tf.constant(segmentation_heatmap, dtype=tf.float32)
prediction_dict = {
cnma.OBJECT_CENTER: [class_center],
cnma.BOX_SCALE: [height_width],
cnma.BOX_OFFSET: [offset],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP):
[keypoint_heatmaps],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET):
[keypoint_offsets],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION):
[keypoint_regression],
cnma.SEGMENTATION_HEATMAP: [segmentation_heatmap],
}
def graph_fn():
detections = model.postprocess(prediction_dict,
tf.constant([[128, 128, 3]]))
return detections
detections = self.execute_cpu(graph_fn, [])
self.assertAllClose(detections['detection_boxes'][0, 0],
np.array([55, 46, 75, 86]) / 128.0)
self.assertAllClose(detections['detection_scores'][0],
[.75, .5, .5, .5, .5])
self.assertEqual(detections['detection_classes'][0, 0], target_class_id)
self.assertEqual(detections['num_detections'], [5])
self.assertAllEqual([1, max_detection, num_keypoints, 2],
detections['detection_keypoints'].shape)
self.assertAllEqual([1, max_detection, num_keypoints],
detections['detection_keypoint_scores'].shape)
self.assertAllEqual([1, max_detection, 4, 4],
detections['detection_masks'].shape)
# There should be some section of the first mask (corresponding to the only
# detection) with non-zero mask values.
self.assertGreater(np.sum(detections['detection_masks'][0, 0, :, :] > 0), 0)
self.assertAllEqual(
detections['detection_masks'][0, 1:, :, :],
np.zeros_like(detections['detection_masks'][0, 1:, :, :]))
if target_class_id == 1:
expected_kpts_for_obj_0 = np.array(
[[14., 14.], [14., 18.], [18., 14.], [17., 17.]]) / 32.
expected_kpt_scores_for_obj_0 = np.array(
[0.9, 0.9, 0.9, cnma.UNMATCHED_KEYPOINT_SCORE])
np.testing.assert_allclose(detections['detection_keypoints'][0][0],
expected_kpts_for_obj_0, rtol=1e-6)
np.testing.assert_allclose(detections['detection_keypoint_scores'][0][0],
expected_kpt_scores_for_obj_0, rtol=1e-6)
else:
# All keypoint outputs should be zeros.
np.testing.assert_allclose(
detections['detection_keypoints'][0][0],
np.zeros([num_keypoints, 2], np.float32),
rtol=1e-6)
np.testing.assert_allclose(
detections['detection_keypoint_scores'][0][0],
np.zeros([num_keypoints], np.float32),
rtol=1e-6)
def test_get_instance_indices(self):
classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
num_detections = tf.constant([1, 3], dtype=tf.int32)
batch_index = 1
class_id = 2
model = build_center_net_meta_arch()
valid_indices = model._get_instance_indices(
classes, num_detections, batch_index, class_id)
self.assertAllEqual(valid_indices.numpy(), [0, 2])
def get_fake_prediction_dict(input_height, input_width, stride):
"""Prepares the fake prediction dictionary."""
output_height = input_height // stride
output_width = input_width // stride
object_center = np.zeros((2, output_height, output_width, _NUM_CLASSES),
dtype=np.float32)
# Box center:
# y: floor((0.54 + 0.56) / 2 * 4) = 2,
# x: floor((0.54 + 0.56) / 2 * 8) = 4
object_center[0, 2, 4, 1] = 1.0
object_center = _logit(object_center)
# Box size:
# height: (0.56 - 0.54) * 4 = 0.08
# width: (0.56 - 0.54) * 8 = 0.16
object_scale = np.zeros((2, output_height, output_width, 2), dtype=np.float32)
object_scale[0, 2, 4] = 0.08, 0.16
# Box center offset coordinate (0.55, 0.55):
# y-offset: 0.55 * 4 - 2 = 0.2
# x-offset: 0.55 * 8 - 4 = 0.4
object_offset = np.zeros((2, output_height, output_width, 2),
dtype=np.float32)
object_offset[0, 2, 4] = 0.2, 0.4
keypoint_heatmap = np.zeros((2, output_height, output_width, _NUM_KEYPOINTS),
dtype=np.float32)
keypoint_heatmap[0, 2, 4, 1] = 1.0
keypoint_heatmap[0, 2, 4, 3] = 1.0
keypoint_heatmap = _logit(keypoint_heatmap)
keypoint_offset = np.zeros((2, output_height, output_width, 2),
dtype=np.float32)
keypoint_offset[0, 2, 4] = 0.2, 0.4
keypoint_regression = np.zeros(
(2, output_height, output_width, 2 * _NUM_KEYPOINTS), dtype=np.float32)
keypoint_regression[0, 2, 4] = 0.0, 0.0, 0.2, 0.4, 0.0, 0.0, 0.2, 0.4
mask_heatmap = np.zeros((2, output_height, output_width, _NUM_CLASSES),
dtype=np.float32)
mask_heatmap[0, 2, 4, 1] = 1.0
mask_heatmap = _logit(mask_heatmap)
prediction_dict = {
'preprocessed_inputs':
tf.zeros((2, input_height, input_width, 3)),
cnma.OBJECT_CENTER: [
tf.constant(object_center),
tf.constant(object_center)
],
cnma.BOX_SCALE: [
tf.constant(object_scale),
tf.constant(object_scale)
],
cnma.BOX_OFFSET: [
tf.constant(object_offset),
tf.constant(object_offset)
],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP): [
tf.constant(keypoint_heatmap),
tf.constant(keypoint_heatmap)
],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET): [
tf.constant(keypoint_offset),
tf.constant(keypoint_offset)
],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION): [
tf.constant(keypoint_regression),
tf.constant(keypoint_regression)
],
cnma.SEGMENTATION_HEATMAP: [
tf.constant(mask_heatmap),
tf.constant(mask_heatmap)
]
}
return prediction_dict
def get_fake_groundtruth_dict(input_height, input_width, stride):
"""Prepares the fake groundtruth dictionary."""
# A small box with center at (0.55, 0.55).
boxes = [
tf.constant([[0.54, 0.54, 0.56, 0.56]]),
tf.constant([[0.0, 0.0, 0.5, 0.5]]),
]
classes = [
tf.one_hot([1], depth=_NUM_CLASSES),
tf.one_hot([0], depth=_NUM_CLASSES),
]
weights = [
tf.constant([1.]),
tf.constant([0.]),
]
keypoints = [
tf.tile(
tf.expand_dims(
tf.constant([[float('nan'), 0.55,
float('nan'), 0.55, 0.55, 0.0]]),
axis=2),
multiples=[1, 1, 2]),
tf.tile(
tf.expand_dims(
tf.constant([[float('nan'), 0.55,
float('nan'), 0.55, 0.55, 0.0]]),
axis=2),
multiples=[1, 1, 2]),
]
labeled_classes = [
tf.one_hot([1], depth=_NUM_CLASSES) + tf.one_hot([2], depth=_NUM_CLASSES),
tf.one_hot([0], depth=_NUM_CLASSES) + tf.one_hot([1], depth=_NUM_CLASSES),
]
mask = np.zeros((1, input_height, input_width), dtype=np.float32)
mask[0, 8:8+stride, 16:16+stride] = 1
masks = [
tf.constant(mask),
tf.zeros_like(mask),
]
groundtruth_dict = {
fields.BoxListFields.boxes: boxes,
fields.BoxListFields.weights: weights,
fields.BoxListFields.classes: classes,
fields.BoxListFields.keypoints: keypoints,
fields.BoxListFields.masks: masks,
fields.InputDataFields.groundtruth_labeled_classes: labeled_classes,
}
return groundtruth_dict
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaComputeLossTest(test_case.TestCase):
"""Test for CenterNet loss compuation related functions."""
def setUp(self):
self.model = build_center_net_meta_arch()
self.classification_loss_fn = self.model._center_params.classification_loss
self.localization_loss_fn = self.model._od_params.localization_loss
self.true_image_shapes = tf.constant([[16, 24, 3], [16, 24, 3]])
self.input_height = 16
self.input_width = 32
self.stride = 4
self.per_pixel_weights = self.get_per_pixel_weights(self.true_image_shapes,
self.input_height,
self.input_width,
self.stride)
self.prediction_dict = get_fake_prediction_dict(self.input_height,
self.input_width,
self.stride)
self.model._groundtruth_lists = get_fake_groundtruth_dict(
self.input_height, self.input_width, self.stride)
super(CenterNetMetaComputeLossTest, self).setUp()
def get_per_pixel_weights(self, true_image_shapes, input_height, input_width,
stride):
output_height, output_width = (input_height // stride,
input_width // stride)
# TODO(vighneshb) Explore whether using floor here is safe.
output_true_image_shapes = tf.ceil(tf.to_float(true_image_shapes) / stride)
per_pixel_weights = cnma.get_valid_anchor_weights_in_flattened_image(
output_true_image_shapes, output_height, output_width)
per_pixel_weights = tf.expand_dims(per_pixel_weights, 2)
return per_pixel_weights
def test_compute_object_center_loss(self):
def graph_fn():
loss = self.model._compute_object_center_loss(
object_center_predictions=self.prediction_dict[cnma.OBJECT_CENTER],
input_height=self.input_height,
input_width=self.input_width,
per_pixel_weights=self.per_pixel_weights)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
default_value = self.model._center_params.use_only_known_classes
self.model._center_params = (
self.model._center_params._replace(use_only_known_classes=True))
loss = self.model._compute_object_center_loss(
object_center_predictions=self.prediction_dict[cnma.OBJECT_CENTER],
input_height=self.input_height,
input_width=self.input_width,
per_pixel_weights=self.per_pixel_weights)
self.model._center_params = (
self.model._center_params._replace(
use_only_known_classes=default_value))
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
def test_compute_box_scale_and_offset_loss(self):
def graph_fn():
scale_loss, offset_loss = self.model._compute_box_scale_and_offset_loss(
scale_predictions=self.prediction_dict[cnma.BOX_SCALE],
offset_predictions=self.prediction_dict[cnma.BOX_OFFSET],
input_height=self.input_height,
input_width=self.input_width)
return scale_loss, offset_loss
scale_loss, offset_loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, scale_loss)
self.assertGreater(0.01, offset_loss)
def test_compute_kp_heatmap_loss(self):
def graph_fn():
loss = self.model._compute_kp_heatmap_loss(
input_height=self.input_height,
input_width=self.input_width,
task_name=_TASK_NAME,
heatmap_predictions=self.prediction_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_HEATMAP)],
classification_loss_fn=self.classification_loss_fn,
per_pixel_weights=self.per_pixel_weights)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
def test_compute_kp_offset_loss(self):
def graph_fn():
loss = self.model._compute_kp_offset_loss(
input_height=self.input_height,
input_width=self.input_width,
task_name=_TASK_NAME,
offset_predictions=self.prediction_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_OFFSET)],
localization_loss_fn=self.localization_loss_fn)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
def test_compute_kp_regression_loss(self):
def graph_fn():
loss = self.model._compute_kp_regression_loss(
input_height=self.input_height,
input_width=self.input_width,
task_name=_TASK_NAME,
regression_predictions=self.prediction_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_REGRESSION,)],
localization_loss_fn=self.localization_loss_fn)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchRestoreTest(test_case.TestCase):
def test_restore_map_resnet(self):
"""Test restore map for a resnet backbone."""
model = build_center_net_meta_arch(build_resnet=True)
restore_map = model.restore_map('classification')
self.assertIsInstance(restore_map['feature_extractor'], tf.keras.Model)
class DummyFeatureExtractor(cnma.CenterNetFeatureExtractor):
def __init__(self,
channel_means,
channel_stds,
bgr_ordering,
num_feature_outputs,
stride):
self._num_feature_outputs = num_feature_outputs
self._stride = stride
super(DummyFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
def predict(self):
pass
def loss(self):
pass
def postprocess(self):
pass
def restore_map(self):
pass
def call(self, inputs):
batch_size, input_height, input_width, _ = inputs.shape
fake_output = tf.ones([
batch_size, input_height // self._stride, input_width // self._stride,
64
], dtype=tf.float32)
return [fake_output] * self._num_feature_outputs
@property
def out_stride(self):
return self._stride
@property
def num_feature_outputs(self):
return self._num_feature_outputs
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetFeatureExtractorTest(test_case.TestCase):
"""Test the base feature extractor class."""
def test_preprocess(self):
feature_extractor = DummyFeatureExtractor(
channel_means=(1.0, 2.0, 3.0),
channel_stds=(10., 20., 30.), bgr_ordering=False,
num_feature_outputs=2, stride=4)
img = np.zeros((2, 32, 32, 3))
img[:, :, :] = 11, 22, 33
def graph_fn():
output = feature_extractor.preprocess(img)
return output
output = self.execute(graph_fn, [])
self.assertAlmostEqual(output.sum(), 2 * 32 * 32 * 3)
def test_bgr_ordering(self):
feature_extractor = DummyFeatureExtractor(
channel_means=(0.0, 0.0, 0.0),
channel_stds=(1., 1., 1.), bgr_ordering=True,
num_feature_outputs=2, stride=4)
img = np.zeros((2, 32, 32, 3), dtype=np.float32)
img[:, :, :] = 1, 2, 3
def graph_fn():
output = feature_extractor.preprocess(img)
return output
output = self.execute(graph_fn, [])
self.assertAllClose(output[..., 2], 1 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 1], 2 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 0], 3 * np.ones((2, 32, 32)))
def test_default_ordering(self):
feature_extractor = DummyFeatureExtractor(
channel_means=(0.0, 0.0, 0.0),
channel_stds=(1., 1., 1.), bgr_ordering=False,
num_feature_outputs=2, stride=4)
img = np.zeros((2, 32, 32, 3), dtype=np.float32)
img[:, :, :] = 1, 2, 3
def graph_fn():
output = feature_extractor.preprocess(img)
return output
output = self.execute(graph_fn, [])
self.assertAllClose(output[..., 0], 1 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 1], 2 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 2], 3 * np.ones((2, 32, 32)))
if __name__ == '__main__':
tf.enable_v2_behavior()
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library functions for ContextRCNN."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v1 as tf
import tf_slim as slim
# The negative value used in padding the invalid weights.
_NEGATIVE_PADDING_VALUE = -100000
def filter_weight_value(weights, values, valid_mask):
"""Filters weights and values based on valid_mask.
_NEGATIVE_PADDING_VALUE is added to the invalid elements of weights so that
they do not contribute to the softmax. The invalid elements of values are set
to 0.
Args:
weights: A float Tensor of shape [batch_size, input_size, context_size].
values: A float Tensor of shape [batch_size, context_size,
projected_dimension].
valid_mask: A boolean Tensor of shape [batch_size, context_size]. True means
valid and False means invalid.
Returns:
weights: A float Tensor of shape [batch_size, input_size, context_size].
values: A float Tensor of shape [batch_size, context_size,
projected_dimension].
Raises:
ValueError: If the shapes of the inputs do not match.
"""
w_batch_size, _, w_context_size = weights.shape
v_batch_size, v_context_size, _ = values.shape
m_batch_size, m_context_size = valid_mask.shape
if w_batch_size != v_batch_size or v_batch_size != m_batch_size:
raise ValueError("Please make sure the first dimension of the input"
" tensors are the same.")
if w_context_size != v_context_size:
raise ValueError("Please make sure the third dimension of weights matches"
" the second dimension of values.")
if w_context_size != m_context_size:
raise ValueError("Please make sure the third dimension of the weights"
" matches the second dimension of the valid_mask.")
valid_mask = valid_mask[..., tf.newaxis]
# Force the invalid weights to be very negative so they won't contribute to
# the softmax.
weights += tf.transpose(
tf.cast(tf.math.logical_not(valid_mask), weights.dtype) *
_NEGATIVE_PADDING_VALUE,
perm=[0, 2, 1])
# Force the invalid values to be 0.
values *= tf.cast(valid_mask, values.dtype)
return weights, values
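# A minimal sketch of the effect, assuming a batch of 1 with input_size=1 and
# context_size=2 where only the first context entry is valid:
#   weights = [[[0.2, 0.3]]], values = [[[1., 2.], [3., 4.]]],
#   valid_mask = [[True, False]]
#   filter_weight_value(...) returns weights ~= [[[0.2, -99999.7]]] (the
#   invalid column is pushed to a large negative value so its softmax weight
#   is ~0) and values = [[[1., 2.], [0., 0.]]] (the invalid row is zeroed).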
def compute_valid_mask(num_valid_elements, num_elements):
"""Computes mask of valid entries within padded context feature.
Args:
num_valid_elements: An int32 Tensor of shape [batch_size].
num_elements: An int32 Tensor.
Returns:
A boolean Tensor of the shape [batch_size, num_elements]. True means
valid and False means invalid.
"""
batch_size = num_valid_elements.shape[0]
element_idxs = tf.range(num_elements, dtype=tf.int32)
batch_element_idxs = tf.tile(element_idxs[tf.newaxis, ...], [batch_size, 1])
num_valid_elements = num_valid_elements[..., tf.newaxis]
valid_mask = tf.less(batch_element_idxs, num_valid_elements)
return valid_mask
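# A minimal usage sketch: with num_valid_elements = [1, 3] and num_elements = 4,
# compute_valid_mask returns
#   [[True, False, False, False],
#    [True, True,  True,  False]].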
def project_features(features, projection_dimension, is_training, normalize):
"""Projects features to another feature space.
Args:
features: A float Tensor of shape [batch_size, features_size,
num_features].
projection_dimension: An int32 Tensor.
is_training: A boolean Tensor (affecting batch normalization).
normalize: A boolean Tensor. If true, the output features will be l2
normalized on the last dimension.
Returns:
A float Tensor of shape [batch, features_size, projection_dimension].
"""
# TODO(guanhangwu) Figure out a better way of specifying the batch norm
# params.
batch_norm_params = {
"is_training": is_training,
"decay": 0.97,
"epsilon": 0.001,
"center": True,
"scale": True
}
batch_size, _, num_features = features.shape
features = tf.reshape(features, [-1, num_features])
projected_features = slim.fully_connected(
features,
num_outputs=projection_dimension,
activation_fn=tf.nn.relu6,
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params)
projected_features = tf.reshape(projected_features,
[batch_size, -1, projection_dimension])
if normalize:
projected_features = tf.math.l2_normalize(projected_features, axis=-1)
return projected_features
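# Shape flow of project_features: [batch_size, features_size, num_features] is
# reshaped to [batch_size * features_size, num_features], passed through a
# fully connected layer (ReLU6 plus batch norm) with projection_dimension
# units, and reshaped back to [batch_size, features_size,
# projection_dimension], optionally followed by L2 normalization over the
# last dimension.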
def attention_block(input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask,
is_training):
"""Generic attention block.
Args:
input_features: A float Tensor of shape [batch_size, input_size,
num_input_features].
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
bottleneck_dimension: An int32 Tensor representing the bottleneck dimension
for intermediate projections.
output_dimension: An int32 Tensor representing the last dimension of the
output feature.
attention_temperature: A float Tensor. It controls the temperature of the
softmax for weights calculation. The formula for calculation as follows:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
valid_mask: A boolean Tensor of shape [batch_size, context_size].
is_training: A boolean Tensor (affecting batch normalization).
Returns:
A float Tensor of shape [batch_size, input_size, output_dimension].
"""
with tf.variable_scope("AttentionBlock"):
queries = project_features(
input_features, bottleneck_dimension, is_training, normalize=True)
keys = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
values = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
weights = tf.matmul(queries, keys, transpose_b=True)
weights, values = filter_weight_value(weights, values, valid_mask)
weights = tf.nn.softmax(weights / attention_temperature)
features = tf.matmul(weights, values)
output_features = project_features(
features, output_dimension, is_training, normalize=False)
return output_features
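# Illustrative sketch, not part of the original library: a pure-numpy look at
# the attention_temperature term used above. Dividing the logits by a larger
# temperature flattens the softmax weights, while a smaller temperature
# sharpens them. The logit values below are hypothetical.
def _temperature_softmax_example(temperature=1.0):
  """Returns softmax(logits / temperature) for a toy logit vector."""
  import numpy as np  # Local import; only needed for this illustration.
  logits = np.array([2.0, 1.0, 0.1])
  scaled = logits / temperature
  exps = np.exp(scaled - scaled.max())  # Subtract the max for stability.
  return exps / exps.sum()
# _temperature_softmax_example(1.0)  ~ [0.66, 0.24, 0.10] (peaked)
# _temperature_softmax_example(10.0) ~ [0.37, 0.33, 0.30] (nearly uniform)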
def compute_box_context_attention(box_features, context_features,
valid_context_size, bottleneck_dimension,
attention_temperature, is_training):
"""Computes the attention feature from the context given a batch of box.
Args:
box_features: A float Tensor of shape [batch_size, max_num_proposals,
height, width, channels]. It is pooled features from first stage
proposals.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
    valid_context_size: An int32 Tensor of shape [batch_size].
    bottleneck_dimension: An int32 Tensor representing the bottleneck dimension
      for intermediate projections.
    attention_temperature: A float Tensor. It controls the temperature of the
      softmax for the weights calculation. The formula is:
      weights = exp(weights / temperature) / sum(exp(weights / temperature))
is_training: A boolean Tensor (affecting batch normalization).
Returns:
A float Tensor of shape [batch_size, max_num_proposals, 1, 1, channels].
"""
_, context_size, _ = context_features.shape
valid_mask = compute_valid_mask(valid_context_size, context_size)
channels = box_features.shape[-1]
# Average pools over height and width dimension so that the shape of
# box_features becomes [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])
output_features = attention_block(box_features, context_features,
bottleneck_dimension, channels.value,
attention_temperature, valid_mask,
is_training)
# Expands the dimension back to match with the original feature map.
output_features = output_features[:, :, tf.newaxis, tf.newaxis, :]
return output_features
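# Illustrative shape walk-through, not part of the original library, of
# compute_box_context_attention with hypothetical sizes:
#   box_features     [2, 8, 7, 7, 64]  --mean pool over H, W-->  [2, 8, 64]
#   context_features [2, 20, 10] with valid_context_size [2]
#   attention_block(..., output_dimension=channels=64)      -->  [2, 8, 64]
#   expand dims                                             -->  [2, 8, 1, 1, 64]
# so the result can be broadcast back onto the per-proposal feature map.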
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for context_rcnn_lib."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from absl.testing import parameterized
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.utils import test_case
from object_detection.utils import tf_version
_NEGATIVE_PADDING_VALUE = -100000
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
tf.test.TestCase):
"""Tests for the functions in context_rcnn_lib."""
def test_compute_valid_mask(self):
num_elements = tf.constant(3, tf.int32)
    num_valid_elements = tf.constant((1, 2), tf.int32)
    valid_mask = context_rcnn_lib.compute_valid_mask(num_valid_elements,
                                                     num_elements)
expected_valid_mask = tf.constant([[1, 0, 0], [1, 1, 0]], tf.float32)
self.assertAllEqual(valid_mask, expected_valid_mask)
def test_filter_weight_value(self):
weights = tf.ones((2, 3, 2), tf.float32) * 4
values = tf.ones((2, 2, 4), tf.float32)
valid_mask = tf.constant([[True, True], [True, False]], tf.bool)
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant([[[4, 4], [4, 4], [4, 4]],
[[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[1, 1, 1, 1], [0, 0, 0, 0]]])
self.assertAllEqual(filtered_weights, expected_weights)
self.assertAllEqual(filtered_values, expected_values)
# Changes the valid_mask so the results will be different.
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant(
[[[4, 4], [4, 4], [4, 4]],
[[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[0, 0, 0, 0], [0, 0, 0, 0]]])
self.assertAllEqual(filtered_weights, expected_weights)
self.assertAllEqual(filtered_values, expected_values)
@parameterized.parameters((2, True, True), (2, False, True),
(10, True, False), (10, False, False))
def test_project_features(self, projection_dimension, is_training, normalize):
features = tf.ones([2, 3, 4], tf.float32)
projected_features = context_rcnn_lib.project_features(
features,
projection_dimension,
is_training=is_training,
normalize=normalize)
# Makes sure the shape is correct.
self.assertAllEqual(projected_features.shape, [2, 3, projection_dimension])
@parameterized.parameters(
(2, 10, 1),
(3, 10, 2),
(4, 20, 3),
(5, 20, 4),
(7, 20, 5),
)
def test_attention_block(self, bottleneck_dimension, output_dimension,
attention_temperature):
input_features = tf.ones([2, 3, 4], tf.float32)
context_features = tf.ones([2, 2, 3], tf.float32)
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
is_training = False
output_features = context_rcnn_lib.attention_block(
input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask, is_training)
# Makes sure the shape is correct.
self.assertAllEqual(output_features.shape, [2, 3, output_dimension])
@parameterized.parameters(True, False)
def test_compute_box_context_attention(self, is_training):
box_features = tf.ones([2, 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib.compute_box_context_attention(
box_features, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Context R-CNN meta-architecture definition.
This adds the ability to attend to contextual features within the Faster R-CNN
object detection framework to improve object detection performance.
See https://arxiv.org/abs/1912.03538 for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.meta_architectures import faster_rcnn_meta_arch
class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
"""Context R-CNN Meta-architecture definition."""
def __init__(self,
is_training,
num_classes,
image_resizer_fn,
feature_extractor,
number_of_stages,
first_stage_anchor_generator,
first_stage_target_assigner,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope_fn,
first_stage_box_predictor_kernel_size,
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
crop_and_resize_fn,
initial_crop_size,
maxpool_kernel_size,
maxpool_stride,
second_stage_target_assigner,
second_stage_mask_rcnn_box_predictor,
second_stage_batch_size,
second_stage_sampler,
second_stage_non_max_suppression_fn,
second_stage_score_conversion_fn,
second_stage_localization_loss_weight,
second_stage_classification_loss_weight,
second_stage_classification_loss,
second_stage_mask_prediction_loss_weight=1.0,
hard_example_miner=None,
parallel_iterations=16,
add_summaries=True,
clip_anchors_to_image=False,
use_static_shapes=False,
resize_masks=True,
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False,
attention_bottleneck_dimension=None,
attention_temperature=None):
"""ContextRCNNMetaArch Constructor.
Args:
is_training: A boolean indicating whether the training version of the
computation graph should be constructed.
num_classes: Number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
image_resizer_fn: A callable for image resizing. This callable
takes a rank-3 image tensor of shape [height, width, channels]
(corresponding to a single image), an optional rank-3 instance mask
tensor of shape [num_masks, height, width] and returns a resized rank-3
image tensor, a resized mask tensor if one was provided in the input. In
addition this callable must also return a 1-D tensor of the form
[height, width, channels] containing the size of the true image, as the
image resizer can perform zero padding. See protos/image_resizer.proto.
feature_extractor: A FasterRCNNFeatureExtractor object.
      number_of_stages: An integer taking values in {1, 2, 3}. If
1, the function will construct only the Region Proposal Network (RPN)
part of the model. If 2, the function will perform box refinement and
other auxiliary predictions all in the second stage. If 3, it will
extract features from refined boxes and perform the auxiliary
predictions on the non-maximum suppressed refined boxes.
If is_training is true and the value of number_of_stages is 3, it is
reduced to 2 since all the model heads are trained in parallel in second
stage during training.
first_stage_anchor_generator: An anchor_generator.AnchorGenerator object
(note that currently we only support
grid_anchor_generator.GridAnchorGenerator objects)
first_stage_target_assigner: Target assigner to use for first stage of
Faster R-CNN (RPN).
first_stage_atrous_rate: A single integer indicating the atrous rate for
the single convolution op which is applied to the `rpn_features_to_crop`
tensor to obtain a tensor to be used for box prediction. Some feature
extractors optionally allow for producing feature maps computed at
denser resolutions. The atrous rate is used to compensate for the
denser feature maps by using an effectively larger receptive field.
(This should typically be set to 1).
first_stage_box_predictor_arg_scope_fn: Either a
Keras layer hyperparams object or a function to construct tf-slim
arg_scope for conv2d, separable_conv2d and fully_connected ops. Used
for the RPN box predictor. If it is a keras hyperparams object the
RPN box predictor will be a Keras model. If it is a function to
construct an arg scope it will be a tf-slim box predictor.
first_stage_box_predictor_kernel_size: Kernel size to use for the
convolution op just prior to RPN box predictions.
first_stage_box_predictor_depth: Output depth for the convolution op
just prior to RPN box predictions.
first_stage_minibatch_size: The "batch size" to use for computing the
objectness and location loss of the region proposal network. This
"batch size" refers to the number of anchors selected as contributing
to the loss function for any given image within the image batch and is
only called "batch_size" due to terminology from the Faster R-CNN paper.
first_stage_sampler: Sampler to use for first stage loss (RPN loss).
first_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
callable that takes `boxes`, `scores` and optional `clip_window`(with
all other inputs already set) and returns a dictionary containing
tensors with keys: `detection_boxes`, `detection_scores`,
`detection_classes`, `num_detections`. This is used to perform non max
suppression on the boxes predicted by the Region Proposal Network
(RPN).
See `post_processing.batch_multiclass_non_max_suppression` for the type
and shape of these tensors.
first_stage_max_proposals: Maximum number of boxes to retain after
performing Non-Max Suppression (NMS) on the boxes predicted by the
Region Proposal Network (RPN).
first_stage_localization_loss_weight: A float
first_stage_objectness_loss_weight: A float
crop_and_resize_fn: A differentiable resampler to use for cropping RPN
proposal features.
initial_crop_size: A single integer indicating the output size
(width and height are set to be the same) of the initial bilinear
interpolation based cropping during ROI pooling.
maxpool_kernel_size: A single integer indicating the kernel size of the
max pool op on the cropped feature map during ROI pooling.
maxpool_stride: A single integer indicating the stride of the max pool
op on the cropped feature map during ROI pooling.
second_stage_target_assigner: Target assigner to use for second stage of
Faster R-CNN. If the model is configured with multiple prediction heads,
this target assigner is used to generate targets for all heads (with the
correct `unmatched_class_label`).
second_stage_mask_rcnn_box_predictor: Mask R-CNN box predictor to use for
the second stage.
second_stage_batch_size: The batch size used for computing the
classification and refined location loss of the box classifier. This
"batch size" refers to the number of proposals selected as contributing
to the loss function for any given image within the image batch and is
only called "batch_size" due to terminology from the Faster R-CNN paper.
second_stage_sampler: Sampler to use for second stage loss (box
classifier loss).
second_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
callable that takes `boxes`, `scores`, optional `clip_window` and
optional (kwarg) `mask` inputs (with all other inputs already set)
and returns a dictionary containing tensors with keys:
`detection_boxes`, `detection_scores`, `detection_classes`,
`num_detections`, and (optionally) `detection_masks`. See
`post_processing.batch_multiclass_non_max_suppression` for the type and
shape of these tensors.
second_stage_score_conversion_fn: Callable elementwise nonlinearity
(that takes tensors as inputs and returns tensors). This is usually
used to convert logits to probabilities.
second_stage_localization_loss_weight: A float indicating the scale factor
for second stage localization loss.
second_stage_classification_loss_weight: A float indicating the scale
factor for second stage classification loss.
second_stage_classification_loss: Classification loss used by the second
stage classifier. Either losses.WeightedSigmoidClassificationLoss or
losses.WeightedSoftmaxClassificationLoss.
second_stage_mask_prediction_loss_weight: A float indicating the scale
factor for second stage mask prediction loss. This is applicable only if
second stage box predictor is configured to predict masks.
hard_example_miner: A losses.HardExampleMiner object (can be None).
parallel_iterations: (Optional) The number of iterations allowed to run
in parallel for calls to tf.map_fn.
add_summaries: boolean (default: True) controlling whether summary ops
should be added to tensorflow graph.
clip_anchors_to_image: Normally, anchors generated for a given image size
are pruned during training if they lie outside the image window. This
option clips the anchors to be within the image instead of pruning.
use_static_shapes: If True, uses implementation of ops with static shape
guarantees.
      resize_masks: Indicates whether the masks present in the groundtruth
        should be resized in the model with `image_resizer_fn`.
freeze_batchnorm: Whether to freeze batch norm parameters in the first
stage box predictor during training or not. When training with a small
batch size (e.g. 1), it is desirable to freeze batch norm update and
use pretrained batch norm params.
return_raw_detections_during_predict: Whether to return raw detection
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
      output_final_box_features: Whether to output final box features. If true,
        it crops the feature map based on the final box predictions and returns
        it in the dict as detection_features.
attention_bottleneck_dimension: A single integer. The bottleneck feature
dimension of the attention block.
attention_temperature: A single float. The attention temperature.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
training time.
ValueError: If first_stage_anchor_generator is not of type
grid_anchor_generator.GridAnchorGenerator.
"""
super(ContextRCNNMetaArch, self).__init__(
is_training,
num_classes,
image_resizer_fn,
feature_extractor,
number_of_stages,
first_stage_anchor_generator,
first_stage_target_assigner,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope_fn,
first_stage_box_predictor_kernel_size,
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
crop_and_resize_fn,
initial_crop_size,
maxpool_kernel_size,
maxpool_stride,
second_stage_target_assigner,
second_stage_mask_rcnn_box_predictor,
second_stage_batch_size,
second_stage_sampler,
second_stage_non_max_suppression_fn,
second_stage_score_conversion_fn,
second_stage_localization_loss_weight,
second_stage_classification_loss_weight,
second_stage_classification_loss,
second_stage_mask_prediction_loss_weight=(
second_stage_mask_prediction_loss_weight),
hard_example_miner=hard_example_miner,
parallel_iterations=parallel_iterations,
add_summaries=add_summaries,
clip_anchors_to_image=clip_anchors_to_image,
use_static_shapes=use_static_shapes,
resize_masks=resize_masks,
freeze_batchnorm=freeze_batchnorm,
return_raw_detections_during_predict=(
return_raw_detections_during_predict),
output_final_box_features=output_final_box_features)
self._context_feature_extract_fn = functools.partial(
context_rcnn_lib.compute_box_context_attention,
bottleneck_dimension=attention_bottleneck_dimension,
attention_temperature=attention_temperature,
is_training=is_training)
@staticmethod
def get_side_inputs(features):
"""Overrides the get_side_inputs function in the base class.
This function returns context_features and valid_context_size, which will be
used in the _compute_second_stage_input_feature_maps function.
Args:
features: A dictionary of tensors.
Returns:
      A dictionary of tensors containing context_features and
      valid_context_size.
Raises:
ValueError: If context_features or valid_context_size is not in the
features.
"""
if (fields.InputDataFields.context_features not in features or
fields.InputDataFields.valid_context_size not in features):
raise ValueError(
"Please make sure context_features and valid_context_size are in the "
"features")
return {
fields.InputDataFields.context_features:
features[fields.InputDataFields.context_features],
fields.InputDataFields.valid_context_size:
features[fields.InputDataFields.valid_context_size]
}
def _compute_second_stage_input_feature_maps(self, features_to_crop,
proposal_boxes_normalized,
context_features,
valid_context_size):
"""Crops to a set of proposals from the feature map for a batch of images.
This function overrides the one in the FasterRCNNMetaArch. Aside from
cropping and resizing the feature maps, which is done in the parent class,
it adds context attention features to the box features.
Args:
features_to_crop: A float32 Tensor with shape [batch_size, height, width,
depth]
proposal_boxes_normalized: A float32 Tensor with shape [batch_size,
num_proposals, box_code_size] containing proposal boxes in normalized
coordinates.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
      valid_context_size: An int32 Tensor of shape [batch_size].
Returns:
      A float32 Tensor with shape [K, new_height, new_width, depth], where
      K = batch_size * num_proposals.
"""
box_features = self._crop_and_resize_fn(
features_to_crop, proposal_boxes_normalized,
[self._initial_crop_size, self._initial_crop_size])
attention_features = self._context_feature_extract_fn(
box_features=box_features,
context_features=context_features,
valid_context_size=valid_context_size)
    # Adds the attention features to the box features.
box_features += attention_features
flattened_feature_maps = self._flatten_first_two_dimensions(box_features)
return self._maxpool_layer(flattened_feature_maps)
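# Illustrative sketch, not part of the original class: the attention features
# returned above have shape [batch_size, max_num_proposals, 1, 1, channels],
# so the `+=` relies on broadcasting against the cropped box features of shape
# [batch_size, max_num_proposals, crop_size, crop_size, channels]. A toy numpy
# check of that broadcast (all sizes are hypothetical):
def _attention_broadcast_example():
  """Shows the broadcast add of attention features onto box features."""
  import numpy as np  # Local import; only needed for this illustration.
  box_features = np.ones((2, 3, 4, 4, 8), dtype=np.float32)
  attention_features = np.full((2, 3, 1, 1, 8), 0.5, dtype=np.float32)
  combined = box_features + attention_features  # Broadcasts over crop H and W.
  assert combined.shape == (2, 3, 4, 4, 8)
  return combined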
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for google3.third_party.tensorflow_models.object_detection.meta_architectures.context_meta_arch."""
"""Tests for object_detection.meta_architectures.context_meta_arch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import unittest
from absl.testing import parameterized
import mock
import tensorflow.compat.v1 as tf
......@@ -109,6 +109,7 @@ class FakeFasterRCNNKerasFeatureExtractor(
])
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
def _get_model(self, box_predictor, **common_kwargs):
......
......@@ -18,9 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import tensorflow.compat.v1 as tf
from object_detection.core import standard_fields
from object_detection.metrics import calibration_evaluation
from object_detection.utils import tf_version
def _get_categories_list():
......@@ -36,6 +38,7 @@ def _get_categories_list():
}]
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class CalibrationDetectionEvaluationTest(tf.test.TestCase):
def _get_ece(self, ece_op, update_op):
......
......@@ -18,11 +18,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.metrics import calibration_metrics
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class CalibrationLibTest(tf.test.TestCase):
@staticmethod
......
......@@ -24,6 +24,7 @@ import tensorflow.compat.v1 as tf
from object_detection.core import standard_fields
from object_detection.metrics import coco_tools
from object_detection.utils import json_utils
from object_detection.utils import np_mask_ops
from object_detection.utils import object_detection_evaluation
......@@ -1263,3 +1264,535 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
eval_metric_ops[metric_name] = (tf.py_func(
value_func_factory(metric_name), [], np.float32), update_op)
return eval_metric_ops
class CocoPanopticSegmentationEvaluator(
object_detection_evaluation.DetectionEvaluator):
"""Class to evaluate PQ (panoptic quality) metric on COCO dataset.
More details about this metric: https://arxiv.org/pdf/1801.00868.pdf.
"""
def __init__(self,
categories,
include_metrics_per_category=False,
iou_threshold=0.5,
ioa_threshold=0.5):
"""Constructor.
Args:
categories: A list of dicts, each of which has the following keys -
'id': (required) an integer id uniquely identifying this category.
'name': (required) string representing category name e.g., 'cat', 'dog'.
include_metrics_per_category: If True, include metrics for each category.
iou_threshold: intersection-over-union threshold for mask matching (with
normal groundtruths).
ioa_threshold: intersection-over-area threshold for mask matching with
"is_crowd" groundtruths.
"""
super(CocoPanopticSegmentationEvaluator, self).__init__(categories)
self._groundtruth_masks = {}
self._groundtruth_class_labels = {}
self._groundtruth_is_crowd = {}
self._predicted_masks = {}
self._predicted_class_labels = {}
self._include_metrics_per_category = include_metrics_per_category
self._iou_threshold = iou_threshold
self._ioa_threshold = ioa_threshold
def clear(self):
"""Clears the state to prepare for a fresh evaluation."""
self._groundtruth_masks.clear()
self._groundtruth_class_labels.clear()
self._groundtruth_is_crowd.clear()
self._predicted_masks.clear()
self._predicted_class_labels.clear()
def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
"""Adds groundtruth for a single image to be used for evaluation.
If the image has already been added, a warning is logged, and groundtruth is
ignored.
Args:
image_id: A unique string/integer identifier for the image.
groundtruth_dict: A dictionary containing -
InputDataFields.groundtruth_classes: integer numpy array of shape
[num_masks] containing 1-indexed groundtruth classes for the mask.
InputDataFields.groundtruth_instance_masks: uint8 numpy array of shape
[num_masks, image_height, image_width] containing groundtruth masks.
The elements of the array must be in {0, 1}.
InputDataFields.groundtruth_is_crowd (optional): integer numpy array of
shape [num_boxes] containing iscrowd flag for groundtruth boxes.
"""
if image_id in self._groundtruth_masks:
tf.logging.warning(
'Ignoring groundtruth with image %s, since it has already been '
'added to the ground truth database.', image_id)
return
self._groundtruth_masks[image_id] = groundtruth_dict[
standard_fields.InputDataFields.groundtruth_instance_masks]
self._groundtruth_class_labels[image_id] = groundtruth_dict[
standard_fields.InputDataFields.groundtruth_classes]
groundtruth_is_crowd = groundtruth_dict.get(
standard_fields.InputDataFields.groundtruth_is_crowd)
# Drop groundtruth_is_crowd if empty tensor.
if groundtruth_is_crowd is not None and not groundtruth_is_crowd.size > 0:
groundtruth_is_crowd = None
if groundtruth_is_crowd is not None:
self._groundtruth_is_crowd[image_id] = groundtruth_is_crowd
def add_single_detected_image_info(self, image_id, detections_dict):
"""Adds detections for a single image to be used for evaluation.
If a detection has already been added for this image id, a warning is
logged, and the detection is skipped.
Args:
image_id: A unique string/integer identifier for the image.
detections_dict: A dictionary containing -
DetectionResultFields.detection_classes: integer numpy array of shape
[num_masks] containing 1-indexed detection classes for the masks.
DetectionResultFields.detection_masks: optional uint8 numpy array of
shape [num_masks, image_height, image_width] containing instance
masks. The elements of the array must be in {0, 1}.
Raises:
      ValueError: If the detection and groundtruth mask shapes don't match.
"""
if image_id not in self._groundtruth_masks:
raise ValueError('Missing groundtruth for image id: {}'.format(image_id))
detection_masks = detections_dict[
standard_fields.DetectionResultFields.detection_masks]
self._predicted_masks[image_id] = detection_masks
self._predicted_class_labels[image_id] = detections_dict[
standard_fields.DetectionResultFields.detection_classes]
groundtruth_mask_shape = self._groundtruth_masks[image_id].shape
if groundtruth_mask_shape[1:] != detection_masks.shape[1:]:
raise ValueError("The shape of results doesn't match groundtruth.")
def evaluate(self):
"""Evaluates the detection masks and returns a dictionary of coco metrics.
Returns:
A dictionary holding -
1. summary_metric:
'PanopticQuality@%.2fIOU': mean panoptic quality averaged over classes at
the required IOU.
'SegmentationQuality@%.2fIOU': mean segmentation quality averaged over
classes at the required IOU.
'RecognitionQuality@%.2fIOU': mean recognition quality averaged over
classes at the required IOU.
'NumValidClasses': number of valid classes. A valid class should have at
least one normal (is_crowd=0) groundtruth mask or one predicted mask.
'NumTotalClasses': number of total classes.
2. per_category_pq: if include_metrics_per_category is True, category
specific results with keys of the form:
'PanopticQuality@%.2fIOU_ByCategory/category'.
"""
# Evaluate and accumulate the iou/tp/fp/fn.
sum_tp_iou, sum_num_tp, sum_num_fp, sum_num_fn = self._evaluate_all_masks()
# Compute PQ metric for each category and average over all classes.
mask_metrics = self._compute_panoptic_metrics(sum_tp_iou, sum_num_tp,
sum_num_fp, sum_num_fn)
return mask_metrics
def get_estimator_eval_metric_ops(self, eval_dict):
"""Returns a dictionary of eval metric ops.
Note that once value_op is called, the detections and groundtruth added via
update_op are cleared.
Args:
eval_dict: A dictionary that holds tensors for evaluating object detection
performance. For single-image evaluation, this dictionary may be
        produced from eval_util.result_dict_for_single_example(). For
        multi-image evaluation, `eval_dict` should contain the fields
'num_gt_masks_per_image' and 'num_det_masks_per_image' to properly unpad
the tensors from the batch.
Returns:
a dictionary of metric names to tuple of value_op and update_op that can
be used as eval metric ops in tf.estimator.EstimatorSpec. Note that all
update ops must be run together and similarly all value ops must be run
together to guarantee correct behaviour.
"""
def update_op(image_id_batched, groundtruth_classes_batched,
groundtruth_instance_masks_batched,
groundtruth_is_crowd_batched, num_gt_masks_per_image,
detection_classes_batched, detection_masks_batched,
num_det_masks_per_image):
"""Update op for metrics."""
for (image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_mask, detection_classes,
detection_masks, num_det_mask) in zip(
image_id_batched, groundtruth_classes_batched,
groundtruth_instance_masks_batched, groundtruth_is_crowd_batched,
num_gt_masks_per_image, detection_classes_batched,
detection_masks_batched, num_det_masks_per_image):
self.add_single_ground_truth_image_info(
image_id, {
'groundtruth_classes':
groundtruth_classes[:num_gt_mask],
'groundtruth_instance_masks':
groundtruth_instance_masks[:num_gt_mask],
'groundtruth_is_crowd':
groundtruth_is_crowd[:num_gt_mask]
})
self.add_single_detected_image_info(
image_id, {
'detection_classes': detection_classes[:num_det_mask],
'detection_masks': detection_masks[:num_det_mask]
})
# Unpack items from the evaluation dictionary.
(image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_masks_per_image, detection_classes,
detection_masks, num_det_masks_per_image
) = self._unpack_evaluation_dictionary_items(eval_dict)
update_op = tf.py_func(update_op, [
image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_masks_per_image, detection_classes,
detection_masks, num_det_masks_per_image
], [])
metric_names = [
'PanopticQuality@%.2fIOU' % self._iou_threshold,
'SegmentationQuality@%.2fIOU' % self._iou_threshold,
'RecognitionQuality@%.2fIOU' % self._iou_threshold
]
if self._include_metrics_per_category:
for category_dict in self._categories:
metric_names.append('PanopticQuality@%.2fIOU_ByCategory/%s' %
(self._iou_threshold, category_dict['name']))
def first_value_func():
self._metrics = self.evaluate()
self.clear()
return np.float32(self._metrics[metric_names[0]])
def value_func_factory(metric_name):
def value_func():
return np.float32(self._metrics[metric_name])
return value_func
# Ensure that the metrics are only evaluated once.
first_value_op = tf.py_func(first_value_func, [], tf.float32)
eval_metric_ops = {metric_names[0]: (first_value_op, update_op)}
with tf.control_dependencies([first_value_op]):
for metric_name in metric_names[1:]:
eval_metric_ops[metric_name] = (tf.py_func(
value_func_factory(metric_name), [], np.float32), update_op)
return eval_metric_ops
def _evaluate_all_masks(self):
"""Evaluate all masks and compute sum iou/TP/FP/FN."""
sum_num_tp = {category['id']: 0 for category in self._categories}
sum_num_fp = sum_num_tp.copy()
sum_num_fn = sum_num_tp.copy()
sum_tp_iou = sum_num_tp.copy()
for image_id in self._groundtruth_class_labels:
# Separate normal and is_crowd groundtruth
crowd_gt_indices = self._groundtruth_is_crowd.get(image_id)
(normal_gt_masks, normal_gt_classes, crowd_gt_masks,
crowd_gt_classes) = self._separate_normal_and_crowd_labels(
crowd_gt_indices, self._groundtruth_masks[image_id],
self._groundtruth_class_labels[image_id])
# Mask matching to normal GT.
predicted_masks = self._predicted_masks[image_id]
predicted_class_labels = self._predicted_class_labels[image_id]
(overlaps, pred_matched,
gt_matched) = self._match_predictions_to_groundtruths(
predicted_masks,
predicted_class_labels,
normal_gt_masks,
normal_gt_classes,
self._iou_threshold,
is_crowd=False,
with_replacement=False)
# Accumulate true positives.
for (class_id, is_matched, overlap) in zip(predicted_class_labels,
pred_matched, overlaps):
if is_matched:
sum_num_tp[class_id] += 1
sum_tp_iou[class_id] += overlap
# Accumulate false negatives.
for (class_id, is_matched) in zip(normal_gt_classes, gt_matched):
if not is_matched:
sum_num_fn[class_id] += 1
# Match remaining predictions to crowd gt.
remained_pred_indices = np.logical_not(pred_matched)
remained_pred_masks = predicted_masks[remained_pred_indices, :, :]
remained_pred_classes = predicted_class_labels[remained_pred_indices]
_, pred_matched, _ = self._match_predictions_to_groundtruths(
remained_pred_masks,
remained_pred_classes,
crowd_gt_masks,
crowd_gt_classes,
self._ioa_threshold,
is_crowd=True,
with_replacement=True)
# Accumulate false positives
for (class_id, is_matched) in zip(remained_pred_classes, pred_matched):
if not is_matched:
sum_num_fp[class_id] += 1
return sum_tp_iou, sum_num_tp, sum_num_fp, sum_num_fn
def _compute_panoptic_metrics(self, sum_tp_iou, sum_num_tp, sum_num_fp,
sum_num_fn):
"""Compute PQ metric for each category and average over all classes.
Args:
sum_tp_iou: dict, summed true positive intersection-over-union (IoU) for
each class, keyed by class_id.
sum_num_tp: the total number of true positives for each class, keyed by
class_id.
sum_num_fp: the total number of false positives for each class, keyed by
class_id.
sum_num_fn: the total number of false negatives for each class, keyed by
class_id.
Returns:
mask_metrics: a dictionary containing averaged metrics over all classes,
and per-category metrics if required.
"""
mask_metrics = {}
sum_pq = 0
sum_sq = 0
sum_rq = 0
num_valid_classes = 0
for category in self._categories:
class_id = category['id']
(panoptic_quality, segmentation_quality,
recognition_quality) = self._compute_panoptic_metrics_single_class(
sum_tp_iou[class_id], sum_num_tp[class_id], sum_num_fp[class_id],
sum_num_fn[class_id])
if panoptic_quality is not None:
sum_pq += panoptic_quality
sum_sq += segmentation_quality
sum_rq += recognition_quality
num_valid_classes += 1
if self._include_metrics_per_category:
mask_metrics['PanopticQuality@%.2fIOU_ByCategory/%s' %
(self._iou_threshold,
category['name'])] = panoptic_quality
mask_metrics['PanopticQuality@%.2fIOU' %
self._iou_threshold] = sum_pq / num_valid_classes
mask_metrics['SegmentationQuality@%.2fIOU' %
self._iou_threshold] = sum_sq / num_valid_classes
mask_metrics['RecognitionQuality@%.2fIOU' %
self._iou_threshold] = sum_rq / num_valid_classes
mask_metrics['NumValidClasses'] = num_valid_classes
mask_metrics['NumTotalClasses'] = len(self._categories)
return mask_metrics
def _compute_panoptic_metrics_single_class(self, sum_tp_iou, num_tp, num_fp,
num_fn):
"""Compute panoptic metrics: panoptic/segmentation/recognition quality.
More computation details in https://arxiv.org/pdf/1801.00868.pdf.
Args:
sum_tp_iou: summed true positive intersection-over-union (IoU) for a
specific class.
num_tp: the total number of true positives for a specific class.
num_fp: the total number of false positives for a specific class.
num_fn: the total number of false negatives for a specific class.
Returns:
panoptic_quality: sum_tp_iou / (num_tp + 0.5*num_fp + 0.5*num_fn).
segmentation_quality: sum_tp_iou / num_tp.
recognition_quality: num_tp / (num_tp + 0.5*num_fp + 0.5*num_fn).
"""
denominator = num_tp + 0.5 * num_fp + 0.5 * num_fn
# Calculate metric only if there is at least one GT or one prediction.
if denominator > 0:
recognition_quality = num_tp / denominator
if num_tp > 0:
segmentation_quality = sum_tp_iou / num_tp
else:
# If there is no TP for this category.
segmentation_quality = 0
panoptic_quality = segmentation_quality * recognition_quality
return panoptic_quality, segmentation_quality, recognition_quality
else:
return None, None, None
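  # Illustrative worked example (not part of the original evaluator) of the
  # formulas above for one class, with hypothetical counts num_tp=2, num_fp=1,
  # num_fn=1 and sum_tp_iou=1.5:
  #   denominator          = 2 + 0.5 * 1 + 0.5 * 1 = 3.0
  #   recognition_quality  = 2 / 3.0   ~ 0.667
  #   segmentation_quality = 1.5 / 2   = 0.75
  #   panoptic_quality     = 0.75 * 0.667 ~ 0.5
  # i.e. PQ = SQ * RQ = sum_tp_iou / (num_tp + 0.5 * num_fp + 0.5 * num_fn).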
def _separate_normal_and_crowd_labels(self, crowd_gt_indices,
groundtruth_masks, groundtruth_classes):
"""Separate normal and crowd groundtruth class_labels and masks.
Args:
crowd_gt_indices: None or array of shape [num_groundtruths]. If None, all
groundtruths are treated as normal ones.
groundtruth_masks: array of shape [num_groundtruths, height, width].
groundtruth_classes: array of shape [num_groundtruths].
Returns:
normal_gt_masks: array of shape [num_normal_groundtruths, height, width].
normal_gt_classes: array of shape [num_normal_groundtruths].
crowd_gt_masks: array of shape [num_crowd_groundtruths, height, width].
crowd_gt_classes: array of shape [num_crowd_groundtruths].
Raises:
      ValueError: If the number of groundtruth classes doesn't match the
        number of groundtruth masks, or if the shape of crowd_gt_indices
        doesn't match the number of groundtruth masks.
"""
if groundtruth_masks.shape[0] != groundtruth_classes.shape[0]:
raise ValueError(
"The number of masks doesn't match the number of labels.")
if crowd_gt_indices is None:
# All gts are treated as normal
      crowd_gt_indices = np.zeros(groundtruth_classes.shape, dtype=np.bool)
else:
if groundtruth_masks.shape[0] != crowd_gt_indices.shape[0]:
raise ValueError(
"The number of masks doesn't match the number of is_crowd labels.")
crowd_gt_indices = crowd_gt_indices.astype(np.bool)
normal_gt_indices = np.logical_not(crowd_gt_indices)
if normal_gt_indices.size:
normal_gt_masks = groundtruth_masks[normal_gt_indices, :, :]
normal_gt_classes = groundtruth_classes[normal_gt_indices]
crowd_gt_masks = groundtruth_masks[crowd_gt_indices, :, :]
crowd_gt_classes = groundtruth_classes[crowd_gt_indices]
else:
# No groundtruths available, groundtruth_masks.shape = (0, h, w)
normal_gt_masks = groundtruth_masks
normal_gt_classes = groundtruth_classes
crowd_gt_masks = groundtruth_masks
crowd_gt_classes = groundtruth_classes
return normal_gt_masks, normal_gt_classes, crowd_gt_masks, crowd_gt_classes
def _match_predictions_to_groundtruths(self,
predicted_masks,
predicted_classes,
groundtruth_masks,
groundtruth_classes,
matching_threshold,
is_crowd=False,
with_replacement=False):
"""Match the predicted masks to groundtruths.
Args:
predicted_masks: array of shape [num_predictions, height, width].
predicted_classes: array of shape [num_predictions].
groundtruth_masks: array of shape [num_groundtruths, height, width].
groundtruth_classes: array of shape [num_groundtruths].
      matching_threshold: if the overlap between a prediction and a groundtruth
        is larger than this threshold, the prediction is a true positive.
is_crowd: whether the groundtruths are crowd annotation or not. If True,
use intersection over area (IoA) as the overlapping metric; otherwise
use intersection over union (IoU).
      with_replacement: whether a groundtruth can be matched to multiple
        predictions. Only 1-1 matching is allowed for normal groundtruths;
        for crowd groundtruths, 1-to-many matching must be allowed.
Returns:
      best_overlaps: array of shape [num_predictions]. Values represent the
        IoU or IoA with the best matched groundtruth.
pred_matched: array of shape [num_predictions]. Boolean value representing
whether the ith prediction is matched to a groundtruth.
gt_matched: array of shape [num_groundtruth]. Boolean value representing
whether the ith groundtruth is matched to a prediction.
Raises:
ValueError: if the shape of groundtruth/predicted masks doesn't match
groundtruth/predicted classes.
"""
if groundtruth_masks.shape[0] != groundtruth_classes.shape[0]:
raise ValueError(
"The number of GT masks doesn't match the number of labels.")
if predicted_masks.shape[0] != predicted_classes.shape[0]:
raise ValueError(
"The number of predicted masks doesn't match the number of labels.")
gt_matched = np.zeros(groundtruth_classes.shape, dtype=np.bool)
pred_matched = np.zeros(predicted_classes.shape, dtype=np.bool)
best_overlaps = np.zeros(predicted_classes.shape)
for pid in range(predicted_classes.shape[0]):
best_overlap = 0
matched_gt_id = -1
for gid in range(groundtruth_classes.shape[0]):
if predicted_classes[pid] == groundtruth_classes[gid]:
if (not with_replacement) and gt_matched[gid]:
continue
if not is_crowd:
overlap = np_mask_ops.iou(predicted_masks[pid:pid + 1],
groundtruth_masks[gid:gid + 1])[0, 0]
else:
overlap = np_mask_ops.ioa(groundtruth_masks[gid:gid + 1],
predicted_masks[pid:pid + 1])[0, 0]
if overlap >= matching_threshold and overlap > best_overlap:
matched_gt_id = gid
best_overlap = overlap
if matched_gt_id >= 0:
gt_matched[matched_gt_id] = True
pred_matched[pid] = True
best_overlaps[pid] = best_overlap
return best_overlaps, pred_matched, gt_matched
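  # Illustrative worked example (not part of the original evaluator) of the
  # overlap measures used above. Suppose a predicted mask covers 80 pixels, a
  # groundtruth mask covers 100 pixels, and they share 60 pixels:
  #   IoU (normal groundtruth) = 60 / (80 + 100 - 60) = 0.5
  #   IoA (crowd groundtruth)  = 60 / 80 = 0.75, i.e. intersection over the
  #     prediction's area, so a small prediction lying inside a large crowd
  #     region can still be matched even when its IoU with that region is low.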
def _unpack_evaluation_dictionary_items(self, eval_dict):
"""Unpack items from the evaluation dictionary."""
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
image_id = eval_dict[input_data_fields.key]
groundtruth_classes = eval_dict[input_data_fields.groundtruth_classes]
groundtruth_instance_masks = eval_dict[
input_data_fields.groundtruth_instance_masks]
groundtruth_is_crowd = eval_dict.get(input_data_fields.groundtruth_is_crowd,
None)
num_gt_masks_per_image = eval_dict.get(
input_data_fields.num_groundtruth_boxes, None)
detection_classes = eval_dict[detection_fields.detection_classes]
detection_masks = eval_dict[detection_fields.detection_masks]
num_det_masks_per_image = eval_dict.get(detection_fields.num_detections,
None)
if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
if not image_id.shape.as_list():
# Apply a batch dimension to all tensors.
image_id = tf.expand_dims(image_id, 0)
groundtruth_classes = tf.expand_dims(groundtruth_classes, 0)
groundtruth_instance_masks = tf.expand_dims(groundtruth_instance_masks, 0)
groundtruth_is_crowd = tf.expand_dims(groundtruth_is_crowd, 0)
detection_classes = tf.expand_dims(detection_classes, 0)
detection_masks = tf.expand_dims(detection_masks, 0)
if num_gt_masks_per_image is None:
num_gt_masks_per_image = tf.shape(groundtruth_classes)[1:2]
else:
num_gt_masks_per_image = tf.expand_dims(num_gt_masks_per_image, 0)
if num_det_masks_per_image is None:
num_det_masks_per_image = tf.shape(detection_classes)[1:2]
else:
num_det_masks_per_image = tf.expand_dims(num_det_masks_per_image, 0)
else:
if num_gt_masks_per_image is None:
num_gt_masks_per_image = tf.tile(
tf.shape(groundtruth_classes)[1:2],
multiples=tf.shape(groundtruth_classes)[0:1])
if num_det_masks_per_image is None:
num_det_masks_per_image = tf.tile(
tf.shape(detection_classes)[1:2],
multiples=tf.shape(detection_classes)[0:1])
return (image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_masks_per_image, detection_classes,
detection_masks, num_det_masks_per_image)
......@@ -18,10 +18,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.core import standard_fields
from object_detection.metrics import coco_evaluation
from object_detection.utils import tf_version
def _get_categories_list():
......@@ -250,6 +252,7 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
})
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoEvaluationPyFuncTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
......@@ -926,6 +929,7 @@ class CocoKeypointEvaluationTest(tf.test.TestCase):
-1.0)
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoKeypointEvaluationPyFuncTest(tf.test.TestCase):
def testGetOneMAPWithMatchingKeypoints(self):
......@@ -1438,6 +1442,7 @@ class CocoMaskEvaluationTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_masks_list)
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
def testAddEvalDict(self):
......@@ -1716,5 +1721,221 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_masks_list)
def _get_panoptic_test_data():
  # image1 contains 3 people in gt (2 normal annotations and 1 "is_crowd"
  # annotation), and 3 people in the prediction.
gt_masks1 = np.zeros((3, 50, 50), dtype=np.uint8)
result_masks1 = np.zeros((3, 50, 50), dtype=np.uint8)
gt_masks1[0, 10:20, 20:30] = 1
result_masks1[0, 10:18, 20:30] = 1
gt_masks1[1, 25:30, 25:35] = 1
result_masks1[1, 18:25, 25:30] = 1
gt_masks1[2, 40:50, 40:50] = 1
result_masks1[2, 47:50, 47:50] = 1
gt_class1 = np.array([1, 1, 1])
gt_is_crowd1 = np.array([0, 0, 1])
result_class1 = np.array([1, 1, 1])
  # image2 contains 1 dog and 1 cat in gt, while the prediction contains 1
  # person and 1 dog.
gt_masks2 = np.zeros((2, 30, 40), dtype=np.uint8)
result_masks2 = np.zeros((2, 30, 40), dtype=np.uint8)
gt_masks2[0, 5:15, 20:35] = 1
gt_masks2[1, 20:30, 0:10] = 1
result_masks2[0, 20:25, 10:15] = 1
result_masks2[1, 6:15, 15:35] = 1
gt_class2 = np.array([2, 3])
gt_is_crowd2 = np.array([0, 0])
result_class2 = np.array([1, 2])
gt_class = [gt_class1, gt_class2]
gt_masks = [gt_masks1, gt_masks2]
gt_is_crowd = [gt_is_crowd1, gt_is_crowd2]
result_class = [result_class1, result_class2]
result_masks = [result_masks1, result_masks2]
return gt_class, gt_masks, gt_is_crowd, result_class, result_masks
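# Illustrative derivation (not part of the original tests) of the expected
# values asserted below, from the masks defined in _get_panoptic_test_data:
# - person (class 1): in image1, prediction 0 overlaps gt 0 with IoU
#   80 / 100 = 0.8 (true positive); prediction 1 misses gt 1 entirely (false
#   positive, and gt 1 becomes a false negative); prediction 2 lies inside the
#   "is_crowd" region (IoA 1.0), so it is ignored. In image2 the person
#   prediction has no person groundtruth, adding one more false positive.
#   Hence PQ = 0.8 / (1 + 0.5 * 2 + 0.5 * 1) = 0.32, SQ = 0.8, RQ = 0.4.
# - dog (class 2): a single match with IoU 135 / 195 and no FP/FN, so
#   PQ = SQ = 135 / 195 and RQ = 1.
# - cat (class 3): one unmatched groundtruth, so PQ = SQ = RQ = 0.
# Averaging over the 3 valid classes gives the expected values below, e.g.
# PanopticQuality@0.50IOU = (0.32 + 135.0 / 195 + 0) / 3.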
class CocoPanopticEvaluationTest(tf.test.TestCase):
def test_panoptic_quality(self):
pq_evaluator = coco_evaluation.CocoPanopticSegmentationEvaluator(
_get_categories_list(), include_metrics_per_category=True)
(gt_class, gt_masks, gt_is_crowd, result_class,
result_masks) = _get_panoptic_test_data()
for i in range(2):
pq_evaluator.add_single_ground_truth_image_info(
image_id='image%d' % i,
groundtruth_dict={
standard_fields.InputDataFields.groundtruth_classes:
gt_class[i],
standard_fields.InputDataFields.groundtruth_instance_masks:
gt_masks[i],
standard_fields.InputDataFields.groundtruth_is_crowd:
gt_is_crowd[i]
})
pq_evaluator.add_single_detected_image_info(
image_id='image%d' % i,
detections_dict={
standard_fields.DetectionResultFields.detection_classes:
result_class[i],
standard_fields.DetectionResultFields.detection_masks:
result_masks[i]
})
metrics = pq_evaluator.evaluate()
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU_ByCategory/person'],
0.32)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU_ByCategory/dog'],
135.0 / 195)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU_ByCategory/cat'], 0)
self.assertAlmostEqual(metrics['SegmentationQuality@0.50IOU'],
(0.8 + 135.0 / 195) / 3)
self.assertAlmostEqual(metrics['RecognitionQuality@0.50IOU'], (0.4 + 1) / 3)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU'],
(0.32 + 135.0 / 195) / 3)
self.assertEqual(metrics['NumValidClasses'], 3)
self.assertEqual(metrics['NumTotalClasses'], 3)
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoPanopticEvaluationPyFuncTest(tf.test.TestCase):
def testPanopticQualityNoBatch(self):
pq_evaluator = coco_evaluation.CocoPanopticSegmentationEvaluator(
_get_categories_list(), include_metrics_per_category=True)
image_id = tf.placeholder(tf.string, shape=())
groundtruth_classes = tf.placeholder(tf.int32, shape=(None))
groundtruth_masks = tf.placeholder(tf.uint8, shape=(None, None, None))
groundtruth_is_crowd = tf.placeholder(tf.int32, shape=(None))
detection_classes = tf.placeholder(tf.int32, shape=(None))
detection_masks = tf.placeholder(tf.uint8, shape=(None, None, None))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_classes: groundtruth_classes,
input_data_fields.groundtruth_instance_masks: groundtruth_masks,
input_data_fields.groundtruth_is_crowd: groundtruth_is_crowd,
detection_fields.detection_classes: detection_classes,
detection_fields.detection_masks: detection_masks,
}
eval_metric_ops = pq_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['PanopticQuality@0.50IOU']
(gt_class, gt_masks, gt_is_crowd, result_class,
result_masks) = _get_panoptic_test_data()
with self.test_session() as sess:
for i in range(2):
sess.run(
update_op,
feed_dict={
image_id: 'image%d' % i,
groundtruth_classes: gt_class[i],
groundtruth_masks: gt_masks[i],
groundtruth_is_crowd: gt_is_crowd[i],
detection_classes: result_class[i],
detection_masks: result_masks[i]
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.items():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU'],
(0.32 + 135.0 / 195) / 3)
def testPanopticQualityBatched(self):
pq_evaluator = coco_evaluation.CocoPanopticSegmentationEvaluator(
_get_categories_list(), include_metrics_per_category=True)
batch_size = 2
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_classes = tf.placeholder(tf.int32, shape=(batch_size, None))
groundtruth_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
groundtruth_is_crowd = tf.placeholder(tf.int32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.int32, shape=(batch_size, None))
detection_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
num_gt_masks_per_image = tf.placeholder(tf.int32, shape=(batch_size))
num_det_masks_per_image = tf.placeholder(tf.int32, shape=(batch_size))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_classes: groundtruth_classes,
input_data_fields.groundtruth_instance_masks: groundtruth_masks,
input_data_fields.groundtruth_is_crowd: groundtruth_is_crowd,
input_data_fields.num_groundtruth_boxes: num_gt_masks_per_image,
detection_fields.detection_classes: detection_classes,
detection_fields.detection_masks: detection_masks,
detection_fields.num_detections: num_det_masks_per_image,
}
eval_metric_ops = pq_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['PanopticQuality@0.50IOU']
(gt_class, gt_masks, gt_is_crowd, result_class,
result_masks) = _get_panoptic_test_data()
with self.test_session() as sess:
sess.run(
update_op,
feed_dict={
image_id: ['image0', 'image1'],
groundtruth_classes:
np.stack([
gt_class[0],
np.pad(gt_class[1], (0, 1), mode='constant')
],
axis=0),
groundtruth_masks:
np.stack([
np.pad(
gt_masks[0], ((0, 0), (0, 10), (0, 10)),
mode='constant'),
np.pad(
gt_masks[1], ((0, 1), (0, 30), (0, 20)),
mode='constant'),
],
axis=0),
groundtruth_is_crowd:
np.stack([
gt_is_crowd[0],
np.pad(gt_is_crowd[1], (0, 1), mode='constant')
],
axis=0),
num_gt_masks_per_image: np.array([3, 2]),
detection_classes:
np.stack([
result_class[0],
np.pad(result_class[1], (0, 1), mode='constant')
],
axis=0),
detection_masks:
np.stack([
np.pad(
result_masks[0], ((0, 0), (0, 10), (0, 10)),
mode='constant'),
np.pad(
result_masks[1], ((0, 1), (0, 30), (0, 20)),
mode='constant'),
],
axis=0),
num_det_masks_per_image: np.array([3, 2]),
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.items():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU'],
(0.32 + 135.0 / 195) / 3)
if __name__ == '__main__':
tf.test.main()
......@@ -52,6 +52,7 @@ from pycocotools import coco
from pycocotools import cocoeval
from pycocotools import mask
import six
from six.moves import range
from six.moves import zip
import tensorflow.compat.v1 as tf
......@@ -353,7 +354,9 @@ def _RleCompress(masks):
Returns:
A pycocotools Run-length encoding of the mask.
"""
return mask.encode(np.asfortranarray(masks))
rle = mask.encode(np.asfortranarray(masks))
rle['counts'] = six.ensure_str(rle['counts'])
return rle
def ExportSingleImageGroundtruthToCoco(image_id,
......
......@@ -36,8 +36,8 @@ import os
import re
import tensorflow.compat.v1 as tf
from object_detection import eval_util
from object_detection.core import standard_fields
from object_detection.legacy import evaluator
from object_detection.metrics import tf_example_parser
from object_detection.utils import config_util
from object_detection.utils import label_map_util
......@@ -94,7 +94,7 @@ def read_data_and_evaluate(input_config, eval_config):
categories = label_map_util.create_categories_from_labelmap(
input_config.label_map_path)
object_detection_evaluators = evaluator.get_evaluators(
object_detection_evaluators = eval_util.get_evaluators(
eval_config, categories)
# Support a single evaluator
object_detection_evaluator = object_detection_evaluators[0]
......
......@@ -20,19 +20,17 @@ from __future__ import print_function
import functools
import os
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.contrib.tpu.python.tpu import tpu_config
from tensorflow.contrib.tpu.python.tpu import tpu_estimator
from object_detection import inputs
from object_detection import model_hparams
from object_detection import model_lib
from object_detection.builders import model_builder
from object_detection.core import standard_fields as fields
from object_detection.utils import config_util
from object_detection.utils import tf_version
# Model for test. Options are:
......@@ -122,6 +120,7 @@ def _make_initializable_iterator(dataset):
return iterator
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ModelLibTest(tf.test.TestCase):
@classmethod
......@@ -337,8 +336,7 @@ class ModelLibTest(tf.test.TestCase):
def test_create_tpu_estimator_and_inputs(self):
"""Tests that number of train/eval defaults to config values."""
run_config = tpu_config.RunConfig()
run_config = tf.estimator.tpu.RunConfig()
hparams = model_hparams.create_hparams(
hparams_overrides='load_pretrained=false')
pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
......@@ -352,7 +350,7 @@ class ModelLibTest(tf.test.TestCase):
estimator = train_and_eval_dict['estimator']
train_steps = train_and_eval_dict['train_steps']
self.assertIsInstance(estimator, tpu_estimator.TPUEstimator)
self.assertIsInstance(estimator, tf.estimator.tpu.TPUEstimator)
self.assertEqual(20, train_steps)
def test_create_train_and_eval_specs(self):
......@@ -406,6 +404,7 @@ class ModelLibTest(tf.test.TestCase):
self.assertEqual(None, experiment.eval_steps)
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class UnbatchTensorsTest(tf.test.TestCase):
def test_unbatch_without_unpadding(self):
......
......@@ -20,7 +20,7 @@ from __future__ import print_function
import os
import tempfile
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
......@@ -32,6 +32,7 @@ from object_detection.builders import model_builder
from object_detection.core import model
from object_detection.protos import train_pb2
from object_detection.utils import config_util
from object_detection.utils import tf_version
if six.PY2:
import mock # pylint: disable=g-importing-member,g-import-not-at-top
......@@ -72,6 +73,7 @@ def _get_config_kwarg_overrides():
}
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class ModelLibTest(tf.test.TestCase):
@classmethod
......@@ -139,6 +141,7 @@ class SimpleModel(model.DetectionModel):
return []
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class ModelCheckpointTest(tf.test.TestCase):
"""Test for model checkpoint related functionality."""
......@@ -171,6 +174,7 @@ class IncompatibleModel(SimpleModel):
return {'weight': self.weight}
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CheckpointV2Test(tf.test.TestCase):
def setUp(self):
......
......@@ -358,7 +358,7 @@ def load_fine_tune_checkpoint(
ckpt.restore(checkpoint_path).assert_existing_objects_matched()
def _get_filepath(strategy, filepath):
def get_filepath(strategy, filepath):
"""Get appropriate filepath for worker.
Args:
......@@ -377,7 +377,7 @@ def _get_filepath(strategy, filepath):
return os.path.join(filepath, 'temp_worker_{:03d}'.format(task_id))
def _clean_temporary_directories(strategy, filepath):
def clean_temporary_directories(strategy, filepath):
"""Temporary directory clean up for MultiWorker Mirrored Strategy.
This is needed for all non-chief workers.
......@@ -539,8 +539,8 @@ def train_loop(
## Train the model
# Get the appropriate filepath (temporary or not) based on whether the worker
# is the chief.
summary_writer_filepath = _get_filepath(strategy,
os.path.join(model_dir, 'train'))
summary_writer_filepath = get_filepath(strategy,
os.path.join(model_dir, 'train'))
summary_writer = tf.compat.v2.summary.create_file_writer(
summary_writer_filepath)
......@@ -567,7 +567,7 @@ def train_loop(
ckpt = tf.compat.v2.train.Checkpoint(
step=global_step, model=detection_model, optimizer=optimizer)
manager_dir = _get_filepath(strategy, model_dir)
manager_dir = get_filepath(strategy, model_dir)
if not strategy.extended.should_checkpoint:
checkpoint_max_to_keep = 1
manager = tf.compat.v2.train.CheckpointManager(
......@@ -615,6 +615,10 @@ def train_loop(
return _sample_and_train(strategy, train_step_fn, data_iterator)
train_input_iter = iter(train_input)
if int(global_step.value()) == 0:
manager.save()
checkpointed_step = int(global_step.value())
logged_step = global_step.value()
......@@ -646,8 +650,8 @@ def train_loop(
# Remove the checkpoint directories of the non-chief workers that
# MultiWorkerMirroredStrategy forces us to save during sync distributed
# training.
_clean_temporary_directories(strategy, manager_dir)
_clean_temporary_directories(strategy, summary_writer_filepath)
clean_temporary_directories(strategy, manager_dir)
clean_temporary_directories(strategy, summary_writer_filepath)
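A minimal sketch (hypothetical helper name and arguments) of the per-worker filepath pattern that the now-public get_filepath/clean_temporary_directories helpers implement: the chief writes checkpoints and summaries directly to model_dir, while non-chief workers under MultiWorkerMirroredStrategy write to disposable temp_worker_* subdirectories that are removed at the end of training.

import os

def get_filepath_sketch(is_chief, task_id, filepath):
  # Chief keeps the real directory; other workers get a disposable one,
  # mirroring the 'temp_worker_{:03d}' path in the hunk above.
  if is_chief:
    return filepath
  return os.path.join(filepath, 'temp_worker_{:03d}'.format(task_id))

print(get_filepath_sketch(True, 0, '/tmp/model_dir'))   # /tmp/model_dir
print(get_filepath_sketch(False, 2, '/tmp/model_dir'))  # /tmp/model_dir/temp_worker_002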
def eager_eval_loop(
......
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Creates and runs TF2 object detection models.
##################################
NOTE: This module has not been fully tested; please bear with us while we iron
out the kinks.
##################################
When a TPU device is available, this binary uses TPUStrategy. Otherwise, it uses
GPUs with MirroredStrategy/MultiWorkerMirroredStrategy.
For local training/evaluation run:
PIPELINE_CONFIG_PATH=path/to/pipeline.config
MODEL_DIR=/tmp/model_outputs
NUM_TRAIN_STEPS=10000
SAMPLE_1_OF_N_EVAL_EXAMPLES=1
python model_main_tf2.py -- \
--model_dir=$MODEL_DIR --num_train_steps=$NUM_TRAIN_STEPS \
--sample_1_of_n_eval_examples=$SAMPLE_1_OF_N_EVAL_EXAMPLES \
--pipeline_config_path=$PIPELINE_CONFIG_PATH \
--alsologtostderr
"""
from absl import flags
import tensorflow.compat.v2 as tf
from object_detection import model_hparams
from object_detection import model_lib_v2
flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config '
'file.')
flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
flags.DEFINE_bool('eval_on_train_data', False, 'Enable evaluating on train '
'data (only supported in distributed training).')
flags.DEFINE_integer('sample_1_of_n_eval_examples', None, 'Will sample one of '
'every n eval input examples, where n is provided.')
flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
'one of every n train input examples for evaluation, '
'where n is provided. This is only used if '
'`eval_on_train_data` is True.')
flags.DEFINE_string(
'hparams_overrides', None, 'Hyperparameter overrides, '
'represented as a string containing comma-separated '
'hparam_name=value pairs.')
flags.DEFINE_string(
'model_dir', None, 'Path to output model directory '
'where event and checkpoint files will be written.')
flags.DEFINE_string(
'checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
'`checkpoint_dir` is provided, this binary operates in eval-only mode, '
'writing resulting metrics to `model_dir`.')
flags.DEFINE_integer('eval_timeout', 3600, 'Number of seconds to wait for an '
'evaluation checkpoint before exiting.')
flags.DEFINE_bool('use_tpu', False, 'Whether the job is executing on a TPU.')
flags.DEFINE_integer(
'num_workers', 1, 'When num_workers > 1, training uses '
'MultiWorkerMirroredStrategy. When num_workers = 1 it uses '
'MirroredStrategy.')
FLAGS = flags.FLAGS
def main(unused_argv):
flags.mark_flag_as_required('model_dir')
flags.mark_flag_as_required('pipeline_config_path')
tf.config.set_soft_device_placement(True)
if FLAGS.checkpoint_dir:
model_lib_v2.eval_continuously(
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path,
model_dir=FLAGS.model_dir,
train_steps=FLAGS.num_train_steps,
sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
sample_1_of_n_eval_on_train_examples=(
FLAGS.sample_1_of_n_eval_on_train_examples),
checkpoint_dir=FLAGS.checkpoint_dir,
wait_interval=300, timeout=FLAGS.eval_timeout)
else:
if tf.config.get_visible_devices('TPU'):
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
elif FLAGS.num_workers > 1:
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
else:
strategy = tf.compat.v2.distribute.MirroredStrategy()
with strategy.scope():
model_lib_v2.train_loop(
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path,
model_dir=FLAGS.model_dir,
train_steps=FLAGS.num_train_steps,
use_tpu=FLAGS.use_tpu)
if __name__ == '__main__':
tf.compat.v1.app.run()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Hourglass[1] feature extractor for CenterNet[2] meta architecture.
[1]: https://arxiv.org/abs/1603.06937
[2]: https://arxiv.org/abs/1904.07850
"""
from object_detection.meta_architectures import center_net_meta_arch
from object_detection.models.keras_models import hourglass_network
class CenterNetHourglassFeatureExtractor(
center_net_meta_arch.CenterNetFeatureExtractor):
"""The hourglass feature extractor for CenterNet.
This class is a thin wrapper around the hourglass network, along with some
preprocessing methods inherited from the base class.
"""
def __init__(self, hourglass_net, channel_means=(0., 0., 0.),
channel_stds=(1., 1., 1.), bgr_ordering=False):
"""Intializes the feature extractor.
Args:
hourglass_net: The underlying hourglass network to use.
channel_means: A tuple of floats, denoting the mean of each channel
which will be subtracted from it.
channel_stds: A tuple of floats, denoting the standard deviation of each
channel. Each channel will be divided by its standard deviation value.
bgr_ordering: bool, if set will change the channel ordering to be in the
[blue, green, red] order.
"""
super(CenterNetHourglassFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
self._network = hourglass_net
def call(self, inputs):
return self._network(inputs)
@property
def out_stride(self):
"""The stride in the output image of the network."""
return 4
@property
def num_feature_outputs(self):
"""Ther number of feature outputs returned by the feature extractor."""
return self._network.num_hourglasses
def get_model(self):
return self._network
def hourglass_104(channel_means, channel_stds, bgr_ordering):
"""The Hourglass-104 backbone for CenterNet."""
network = hourglass_network.hourglass_104()
return CenterNetHourglassFeatureExtractor(
network, channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
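A minimal usage sketch (not part of this file) of the hourglass_104 builder above; constructing the full Hourglass-104 backbone is heavy, which is why the test below instantiates a much smaller HourglassNetwork instead.

from object_detection.models import center_net_hourglass_feature_extractor as hourglass_fe

extractor = hourglass_fe.hourglass_104(
    channel_means=(0., 0., 0.), channel_stds=(1., 1., 1.), bgr_ordering=False)
assert extractor.out_stride == 4
# num_feature_outputs mirrors the number of hourglass stacks in the backbone.
assert extractor.num_feature_outputs == extractor.get_model().num_hourglasses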
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Testing hourglass feature extractor for CenterNet."""
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.models import center_net_hourglass_feature_extractor as hourglass
from object_detection.models.keras_models import hourglass_network
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetHourglassFeatureExtractorTest(test_case.TestCase):
def test_center_net_hourglass_feature_extractor(self):
net = hourglass_network.HourglassNetwork(
num_stages=4, blocks_per_stage=[2, 3, 4, 5, 6],
channel_dims=[4, 6, 8, 10, 12, 14], num_hourglasses=2)
model = hourglass.CenterNetHourglassFeatureExtractor(net)
def graph_fn():
return model(tf.zeros((2, 64, 64, 3), dtype=np.float32))
outputs = self.execute(graph_fn, [])
self.assertEqual(outputs[0].shape, (2, 16, 16, 6))
self.assertEqual(outputs[1].shape, (2, 16, 16, 6))
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resnetv2 based feature extractors for CenterNet[1] meta architecture.
[1]: https://arxiv.org/abs/1904.07850
"""
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures.center_net_meta_arch import CenterNetFeatureExtractor
class CenterNetResnetFeatureExtractor(CenterNetFeatureExtractor):
"""Resnet v2 base feature extractor for the CenterNet model."""
def __init__(self, resnet_type, channel_means=(0., 0., 0.),
channel_stds=(1., 1., 1.), bgr_ordering=False):
"""Initializes the feature extractor with a specific ResNet architecture.
Args:
resnet_type: A string specifying which kind of ResNet to use. Currently
only `resnet_v2_50` and `resnet_v2_101` are supported.
channel_means: A tuple of floats, denoting the mean of each channel
which will be subtracted from it.
channel_stds: A tuple of floats, denoting the standard deviation of each
channel. Each channel will be divided by its standard deviation value.
bgr_ordering: bool, if set will change the channel ordering to be in the
[blue, green, red] order.
"""
super(CenterNetResnetFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
if resnet_type == 'resnet_v2_101':
self._base_model = tf.keras.applications.ResNet101V2(weights=None)
output_layer = 'conv5_block3_out'
elif resnet_type == 'resnet_v2_50':
self._base_model = tf.keras.applications.ResNet50V2(weights=None)
output_layer = 'conv5_block3_out'
else:
raise ValueError('Unknown Resnet Model {}'.format(resnet_type))
output_layer = self._base_model.get_layer(output_layer)
self._resnet_model = tf.keras.models.Model(inputs=self._base_model.input,
outputs=output_layer.output)
resnet_output = self._resnet_model(self._base_model.input)
for num_filters in [256, 128, 64]:
# TODO(vighneshb) This section has a few differences from the paper
# Figure out how much of a performance impact they have.
# 1. We use a simple convolution instead of a deformable convolution
conv = tf.keras.layers.Conv2D(filters=num_filters, kernel_size=3,
strides=1, padding='same')
resnet_output = conv(resnet_output)
resnet_output = tf.keras.layers.BatchNormalization()(resnet_output)
resnet_output = tf.keras.layers.ReLU()(resnet_output)
# 2. We use the default initialization for the convolution layers
# instead of initializing it to do bilinear upsampling.
conv_transpose = tf.keras.layers.Conv2DTranspose(filters=num_filters,
kernel_size=3, strides=2,
padding='same')
resnet_output = conv_transpose(resnet_output)
resnet_output = tf.keras.layers.BatchNormalization()(resnet_output)
resnet_output = tf.keras.layers.ReLU()(resnet_output)
self._feature_extractor_model = tf.keras.models.Model(
inputs=self._base_model.input, outputs=resnet_output)
def preprocess(self, resized_inputs):
"""Preprocess input images for the ResNet model.
This scales images in the range [0, 255] to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float32 tensor.
Returns:
outputs: a [batch, height, width, channels] float32 tensor.
"""
resized_inputs = super(CenterNetResnetFeatureExtractor, self).preprocess(
resized_inputs)
return tf.keras.applications.resnet_v2.preprocess_input(resized_inputs)
def load_feature_extractor_weights(self, path):
self._base_model.load_weights(path)
def get_base_model(self):
"""Get base resnet model for inspection and testing."""
return self._base_model
def call(self, inputs):
"""Returns image features extracted by the backbone.
Args:
inputs: An image tensor of shape [batch_size, input_height,
input_width, 3]
Returns:
features_list: A list of length 1 containing a tensor of shape
[batch_size, input_height // 4, input_width // 4, 64] containing
the features extracted by the ResNet.
"""
return [self._feature_extractor_model(inputs)]
@property
def num_feature_outputs(self):
return 1
@property
def out_stride(self):
return 4
def resnet_v2_101(channel_means, channel_stds, bgr_ordering):
"""The ResNet v2 101 feature extractor."""
return CenterNetResnetFeatureExtractor(
resnet_type='resnet_v2_101',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering
)
def resnet_v2_50(channel_means, channel_stds, bgr_ordering):
"""The ResNet v2 50 feature extractor."""
return CenterNetResnetFeatureExtractor(
resnet_type='resnet_v2_50',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
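A minimal arithmetic sketch (hypothetical sizes, not part of this file) of why out_stride is 4 and why the tests below expect 56x56 feature maps from 224x224 inputs: the ResNet conv5 output sits at stride 32, and the three stride-2 transpose convolutions above upsample it by a factor of 8.

input_size = 224
backbone_stride = 32                  # stride of conv5_block3_out
upsampling = 2 ** 3                   # three stride-2 Conv2DTranspose layers
out_stride = backbone_stride // upsampling
feature_size = input_size // out_stride
assert (out_stride, feature_size) == (4, 56)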
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Testing ResNet v2 models for the CenterNet meta architecture."""
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.models import center_net_resnet_feature_extractor
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetResnetFeatureExtractorTest(test_case.TestCase):
def test_output_size(self):
"""Verify that shape of features returned by the backbone is correct."""
model = center_net_resnet_feature_extractor.\
CenterNetResnetFeatureExtractor('resnet_v2_101')
def graph_fn():
img = np.zeros((8, 224, 224, 3), dtype=np.float32)
processed_img = model.preprocess(img)
return model(processed_img)
outputs = self.execute(graph_fn, [])
self.assertEqual(outputs.shape, (8, 56, 56, 64))
def test_output_size_resnet50(self):
"""Verify that shape of features returned by the backbone is correct."""
model = center_net_resnet_feature_extractor.\
CenterNetResnetFeatureExtractor('resnet_v2_50')
def graph_fn():
img = np.zeros((8, 224, 224, 3), dtype=np.float32)
processed_img = model.preprocess(img)
return model(processed_img)
outputs = self.execute(graph_fn, [])
self.assertEqual(outputs.shape, (8, 56, 56, 64))
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resnetv1 FPN [1] based feature extractors for CenterNet[2] meta architecture.
[1]: https://arxiv.org/abs/1612.03144.
[2]: https://arxiv.org/abs/1904.07850.
"""
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures.center_net_meta_arch import CenterNetFeatureExtractor
_RESNET_MODEL_OUTPUT_LAYERS = {
'resnet_v1_50': ['conv2_block3_out', 'conv3_block4_out',
'conv4_block6_out', 'conv5_block3_out'],
'resnet_v1_101': ['conv2_block3_out', 'conv3_block4_out',
'conv4_block23_out', 'conv5_block3_out'],
}
class CenterNetResnetV1FpnFeatureExtractor(CenterNetFeatureExtractor):
"""Resnet v1 FPN base feature extractor for the CenterNet model.
This feature extractor uses residual skip connections and nearest neighbor
upsampling to produce an output feature map of stride 4, which has precise
localization information along with strong semantic information from the top
of the net. This design does not exactly follow the original FPN design,
specifically:
- Since only one output map is necessary for heatmap prediction (stride 4
output), the top-down feature maps can have different numbers of channels.
Specifically, the top down feature maps have the following sizes:
[h/4, w/4, 64], [h/8, w/8, 128], [h/16, w/16, 256], [h/32, w/32, 256].
- No additional coarse features are used after conv5_x.
"""
def __init__(self, resnet_type, channel_means=(0., 0., 0.),
channel_stds=(1., 1., 1.), bgr_ordering=False):
"""Initializes the feature extractor with a specific ResNet architecture.
Args:
resnet_type: A string specifying which kind of ResNet to use. Currently
only `resnet_v1_50` and `resnet_v1_101` are supported.
channel_means: A tuple of floats, denoting the mean of each channel
which will be subtracted from it.
channel_stds: A tuple of floats, denoting the standard deviation of each
channel. Each channel will be divided by its standard deviation value.
bgr_ordering: bool, if set will change the channel ordering to be in the
[blue, green, red] order.
"""
super(CenterNetResnetV1FpnFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
if resnet_type == 'resnet_v1_50':
self._base_model = tf.keras.applications.ResNet50(weights=None)
elif resnet_type == 'resnet_v1_101':
self._base_model = tf.keras.applications.ResNet101(weights=None)
else:
raise ValueError('Unknown Resnet Model {}'.format(resnet_type))
output_layers = _RESNET_MODEL_OUTPUT_LAYERS[resnet_type]
outputs = [self._base_model.get_layer(output_layer_name).output
for output_layer_name in output_layers]
self._resnet_model = tf.keras.models.Model(inputs=self._base_model.input,
outputs=outputs)
resnet_outputs = self._resnet_model(self._base_model.input)
# Construct the top-down feature maps.
top_layer = resnet_outputs[-1]
residual_op = tf.keras.layers.Conv2D(filters=256, kernel_size=1,
strides=1, padding='same')
top_down = residual_op(top_layer)
num_filters_list = [256, 128, 64]
for i, num_filters in enumerate(num_filters_list):
level_ind = 2 - i
# Upsample.
upsample_op = tf.keras.layers.UpSampling2D(2, interpolation='nearest')
top_down = upsample_op(top_down)
# Residual (skip-connection) from bottom-up pathway.
residual_op = tf.keras.layers.Conv2D(filters=num_filters, kernel_size=1,
strides=1, padding='same')
residual = residual_op(resnet_outputs[level_ind])
# Merge.
top_down = top_down + residual
next_num_filters = num_filters_list[i+1] if i + 1 <= 2 else 64
conv = tf.keras.layers.Conv2D(filters=next_num_filters,
kernel_size=3, strides=1, padding='same')
top_down = conv(top_down)
top_down = tf.keras.layers.BatchNormalization()(top_down)
top_down = tf.keras.layers.ReLU()(top_down)
self._feature_extractor_model = tf.keras.models.Model(
inputs=self._base_model.input, outputs=top_down)
def preprocess(self, resized_inputs):
"""Preprocess input images for the ResNet model.
This scales images in the range [0, 255] to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float32 tensor.
Returns:
outputs: a [batch, height, width, channels] float32 tensor.
"""
resized_inputs = super(
CenterNetResnetV1FpnFeatureExtractor, self).preprocess(resized_inputs)
return tf.keras.applications.resnet.preprocess_input(resized_inputs)
def load_feature_extractor_weights(self, path):
self._base_model.load_weights(path)
def get_base_model(self):
"""Get base resnet model for inspection and testing."""
return self._base_model
def call(self, inputs):
"""Returns image features extracted by the backbone.
Args:
inputs: An image tensor of shape [batch_size, input_height,
input_width, 3]
Returns:
features_list: A list of length 1 containing a tensor of shape
[batch_size, input_height // 4, input_width // 4, 64] containing
the features extracted by the ResNet.
"""
return [self._feature_extractor_model(inputs)]
@property
def num_feature_outputs(self):
return 1
@property
def out_stride(self):
return 4
def resnet_v1_101_fpn(channel_means, channel_stds, bgr_ordering):
"""The ResNet v1 101 FPN feature extractor."""
return CenterNetResnetV1FpnFeatureExtractor(
resnet_type='resnet_v1_101',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering
)
def resnet_v1_50_fpn(channel_means, channel_stds, bgr_ordering):
"""The ResNet v1 50 FPN feature extractor."""
return CenterNetResnetV1FpnFeatureExtractor(
resnet_type='resnet_v1_50',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
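To make the geometry described in the class docstring concrete, here is a minimal sketch (hypothetical 256x256 input, not part of this file) that walks the top-down loop above and records the feature-map size and channel count at each merge point, ending at the stride-4, 64-channel map returned by call().

h = w = 256
stride = 32                                   # conv5 output of the ResNet
levels = [(h // stride, w // stride, 256)]    # top of the net, after the 1x1 conv
for num_filters in [256, 128, 64]:
  stride //= 2                                # nearest-neighbour 2x upsampling
  # The lateral 1x1 conv brings the skip connection to num_filters channels
  # before the merge; the following 3x3 conv carries it to the next level.
  levels.append((h // stride, w // stride, num_filters))
print(levels)  # [(8, 8, 256), (16, 16, 256), (32, 32, 128), (64, 64, 64)]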