Unverified Commit ca552843 authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-segmentation

parents 7e2f7a35 6b90e134
...@@ -16,9 +16,8 @@ ...@@ -16,9 +16,8 @@
import orbit import orbit
import tensorflow as tf import tensorflow as tf
from official.modeling import grad_utils
from official.modeling import performance from official.modeling import performance
from official.staging.training import grad_utils
from official.utils.flags import core as flags_core from official.utils.flags import core as flags_core
from official.vision.image_classification.resnet import common from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import imagenet_preprocessing from official.vision.image_classification.resnet import imagenet_preprocessing
......
...@@ -21,9 +21,11 @@ import tensorflow as tf ...@@ -21,9 +21,11 @@ import tensorflow as tf
class SpatialPyramidPooling(tf.keras.layers.Layer): class SpatialPyramidPooling(tf.keras.layers.Layer):
"""Implements the Atrous Spatial Pyramid Pooling. """Implements the Atrous Spatial Pyramid Pooling.
Reference: References:
[Rethinking Atrous Convolution for Semantic Image Segmentation]( [Rethinking Atrous Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1706.05587.pdf) https://arxiv.org/pdf/1706.05587.pdf)
[Encoder-Decoder with Atrous Separable Convolution for Semantic Image
Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
""" """
def __init__( def __init__(
...@@ -39,6 +41,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -39,6 +41,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
kernel_initializer='glorot_uniform', kernel_initializer='glorot_uniform',
kernel_regularizer=None, kernel_regularizer=None,
interpolation='bilinear', interpolation='bilinear',
use_depthwise_convolution=False,
**kwargs): **kwargs):
"""Initializes `SpatialPyramidPooling`. """Initializes `SpatialPyramidPooling`.
...@@ -60,6 +63,10 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -60,6 +63,10 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
kernel_regularizer: Kernel regularizer for conv layers. Defaults to None. kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
interpolation: The interpolation method for upsampling. Defaults to interpolation: The interpolation method for upsampling. Defaults to
`bilinear`. `bilinear`.
use_depthwise_convolution: Allows spatial pooling to use separable
depthwise convolutions. [Encoder-Decoder with Atrous Separable
Convolution for Semantic Image Segmentation](
https://arxiv.org/pdf/1802.02611.pdf)
**kwargs: Other keyword arguments for the layer. **kwargs: Other keyword arguments for the layer.
""" """
super(SpatialPyramidPooling, self).__init__(**kwargs) super(SpatialPyramidPooling, self).__init__(**kwargs)
...@@ -76,6 +83,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -76,6 +83,7 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
self.interpolation = interpolation self.interpolation = interpolation
self.input_spec = tf.keras.layers.InputSpec(ndim=4) self.input_spec = tf.keras.layers.InputSpec(ndim=4)
self.pool_kernel_size = pool_kernel_size self.pool_kernel_size = pool_kernel_size
self.use_depthwise_convolution = use_depthwise_convolution
def build(self, input_shape): def build(self, input_shape):
height = input_shape[1] height = input_shape[1]
...@@ -109,9 +117,20 @@ class SpatialPyramidPooling(tf.keras.layers.Layer): ...@@ -109,9 +117,20 @@ class SpatialPyramidPooling(tf.keras.layers.Layer):
self.aspp_layers.append(conv_sequential) self.aspp_layers.append(conv_sequential)
for dilation_rate in self.dilation_rates: for dilation_rate in self.dilation_rates:
conv_sequential = tf.keras.Sequential([ leading_layers = []
kernel_size = (3, 3)
if self.use_depthwise_convolution:
leading_layers += [
tf.keras.layers.DepthwiseConv2D(
depth_multiplier=1, kernel_size=kernel_size,
padding='same', depthwise_regularizer=self.kernel_regularizer,
depthwise_initializer=self.kernel_initializer,
dilation_rate=dilation_rate, use_bias=False)
]
kernel_size = (1, 1)
conv_sequential = tf.keras.Sequential(leading_layers + [
tf.keras.layers.Conv2D( tf.keras.layers.Conv2D(
filters=self.output_channels, kernel_size=(3, 3), filters=self.output_channels, kernel_size=kernel_size,
padding='same', kernel_regularizer=self.kernel_regularizer, padding='same', kernel_regularizer=self.kernel_regularizer,
kernel_initializer=self.kernel_initializer, kernel_initializer=self.kernel_initializer,
dilation_rate=dilation_rate, use_bias=False), dilation_rate=dilation_rate, use_bias=False),
......
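For readers skimming this hunk: below is a minimal, standalone sketch (not the library code) of the ASPP branch that gets built when use_depthwise_convolution=True. A dilated 3x3 depthwise convolution gathers spatial context and a 1x1 pointwise convolution mixes channels, approximating the original dilated 3x3 Conv2D at lower cost; batch norm, activation, and regularizers are omitted here for brevity.

import tensorflow as tf

def separable_atrous_branch(output_channels, dilation_rate):
  # Depthwise 3x3 with dilation, followed by a 1x1 pointwise projection.
  return tf.keras.Sequential([
      tf.keras.layers.DepthwiseConv2D(
          kernel_size=(3, 3), depth_multiplier=1, padding='same',
          dilation_rate=dilation_rate, use_bias=False),
      tf.keras.layers.Conv2D(
          filters=output_channels, kernel_size=(1, 1), padding='same',
          use_bias=False),
  ])

branch = separable_atrous_branch(output_channels=256, dilation_rate=6)
print(branch(tf.zeros([1, 33, 33, 64])).shape)  # (1, 33, 33, 256)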
...@@ -91,10 +91,10 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator): ...@@ -91,10 +91,10 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator):
super().__init__(eval_dataset=dataset, options=options) super().__init__(eval_dataset=dataset, options=options)
def eval_begin(self): def eval_begin(self):
return tf.constant((0.0,)) return {"logits": tf.constant((0.0,))}
def eval_reduce(self, state, step_outputs): def eval_reduce(self, state, step_outputs):
state = tf.concat([state, step_outputs], 0) state["logits"] = tf.concat([state["logits"], step_outputs], 0)
return state return state
def eval_step(self, iterator): def eval_step(self, iterator):
...@@ -107,7 +107,7 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator): ...@@ -107,7 +107,7 @@ class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator):
self.strategy.run(replica_step, args=(next(iterator),))) self.strategy.run(replica_step, args=(next(iterator),)))
def eval_end(self, outputs): def eval_end(self, outputs):
return tf.reduce_sum(outputs) return tf.reduce_sum(outputs["logits"])
class StandardRunnerTest(parameterized.TestCase): class StandardRunnerTest(parameterized.TestCase):
......
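A tiny standalone sketch (plain TF, no Orbit) of the dict-shaped evaluation state this test now exercises: eval_begin seeds the state, eval_reduce concatenates each step's outputs, and eval_end reduces them.

import tensorflow as tf

state = {"logits": tf.constant([0.0])}                                # eval_begin
for step_outputs in [tf.constant([1.0, 2.0]), tf.constant([3.0])]:
  state["logits"] = tf.concat([state["logits"], step_outputs], 0)     # eval_reduce
print(tf.reduce_sum(state["logits"]).numpy())                         # eval_end -> 6.0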
...@@ -159,6 +159,21 @@ def create_tf_while_loop_fn_with_state(step_fn): ...@@ -159,6 +159,21 @@ def create_tf_while_loop_fn_with_state(step_fn):
"`num_steps` should be a `tf.Tensor`. Passing a Python value can " "`num_steps` should be a `tf.Tensor`. Passing a Python value can "
"cause unnecessary retracing when wrapped by `tf.function`.") "cause unnecessary retracing when wrapped by `tf.function`.")
def _get_relaxed_tensor_shape(t):
"""Returns a `TensorShape` with all `None` dimensions."""
if not tf.is_tensor(t):
return None
shape = t.shape
if shape.rank is not None and shape.rank > 0:
return tf.TensorShape([None] * shape.rank)
return shape
def _get_relaxed_shape_structure(s):
"""Returns the relaxed shape of the input nested structure `s`."""
return tf.nest.pack_sequence_as(
s, [_get_relaxed_tensor_shape(t) for t in tf.nest.flatten(s)])
for _ in tf.range(num_steps): for _ in tf.range(num_steps):
# Clear out the outer name scope so the ops created inside `tf.while_loop` # Clear out the outer name scope so the ops created inside `tf.while_loop`
# don't get "while/" as name prefix. # don't get "while/" as name prefix.
...@@ -167,9 +182,7 @@ def create_tf_while_loop_fn_with_state(step_fn): ...@@ -167,9 +182,7 @@ def create_tf_while_loop_fn_with_state(step_fn):
# across iterations. This is useful to aggregate outputs from each step # across iterations. This is useful to aggregate outputs from each step
# and concat to `state`. # and concat to `state`.
tf.autograph.experimental.set_loop_options( tf.autograph.experimental.set_loop_options(
shape_invariants=[(t, tf.TensorShape([None] * t.shape.rank)) shape_invariants=[(state, _get_relaxed_shape_structure(state))])
for t in tf.nest.flatten(state)
if tf.is_tensor(t)])
outputs = step_fn(iterator) outputs = step_fn(iterator)
state = reduce_fn(state, outputs) state = reduce_fn(state, outputs)
return state return state
......
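A minimal sketch (not taken from the diff) of why the relaxed shape invariants are needed: the aggregation state grows along axis 0 on every iteration, so inside the tf.while_loop its static shape has to be declared with None dimensions, which is exactly what _get_relaxed_shape_structure computes for an arbitrary nested state.

import tensorflow as tf

@tf.function
def accumulate(num_steps):
  state = tf.zeros([0], dtype=tf.float32)
  for i in tf.range(num_steps):
    # Without this, AutoGraph would require `state` to keep a fixed shape
    # across loop iterations.
    tf.autograph.experimental.set_loop_options(
        shape_invariants=[(state, tf.TensorShape([None]))])
    state = tf.concat([state, tf.expand_dims(tf.cast(i, tf.float32), 0)], axis=0)
  return state

print(accumulate(tf.constant(3)).numpy())  # [0. 1. 2.]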
...@@ -46,7 +46,7 @@ def char_accuracy(predictions, targets, rej_char, streaming=False): ...@@ -46,7 +46,7 @@ def char_accuracy(predictions, targets, rej_char, streaming=False):
correct_chars, weights), axis=1), correct_chars, weights), axis=1),
tf.reduce_sum(input_tensor=weights, axis=1)) tf.reduce_sum(input_tensor=weights, axis=1))
if streaming: if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example) return tf.metrics.mean(accuracy_per_example)
else: else:
return tf.reduce_mean(input_tensor=accuracy_per_example) return tf.reduce_mean(input_tensor=accuracy_per_example)
...@@ -87,6 +87,6 @@ def sequence_accuracy(predictions, targets, rej_char, streaming=False): ...@@ -87,6 +87,6 @@ def sequence_accuracy(predictions, targets, rej_char, streaming=False):
accuracy_per_example = tf.cast( accuracy_per_example = tf.cast(
tf.equal(correct_chars_counts, target_chars_counts), dtype=tf.float32) tf.equal(correct_chars_counts, target_chars_counts), dtype=tf.float32)
if streaming: if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example) return tf.metrics.mean(accuracy_per_example)
else: else:
return tf.reduce_mean(input_tensor=accuracy_per_example) return tf.reduce_mean(input_tensor=accuracy_per_example)
...@@ -44,20 +44,24 @@ def log(msg): ...@@ -44,20 +44,24 @@ def log(msg):
class YAMNet(tf.Module): class YAMNet(tf.Module):
"''A TF2 Module wrapper around YAMNet.""" """A TF2 Module wrapper around YAMNet."""
def __init__(self, weights_path, params): def __init__(self, weights_path, params):
super().__init__() super().__init__()
self._yamnet = yamnet.yamnet_frames_model(params) self._yamnet = yamnet.yamnet_frames_model(params)
self._yamnet.load_weights(weights_path) self._yamnet.load_weights(weights_path)
self._class_map_asset = tf.saved_model.Asset('yamnet_class_map.csv') self._class_map_asset = tf.saved_model.Asset('yamnet_class_map.csv')
@tf.function @tf.function(input_signature=[])
def class_map_path(self): def class_map_path(self):
return self._class_map_asset.asset_path return self._class_map_asset.asset_path
@tf.function(input_signature=(tf.TensorSpec(shape=[None], dtype=tf.float32),)) @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)])
def __call__(self, waveform): def __call__(self, waveform):
return self._yamnet(waveform) predictions, embeddings, log_mel_spectrogram = self._yamnet(waveform)
return {'predictions': predictions,
'embeddings': embeddings,
'log_mel_spectrogram': log_mel_spectrogram}
def check_model(model_fn, class_map_path, params): def check_model(model_fn, class_map_path, params):
...@@ -65,7 +69,10 @@ def check_model(model_fn, class_map_path, params): ...@@ -65,7 +69,10 @@ def check_model(model_fn, class_map_path, params):
"""Applies yamnet_test's sanity checks to an instance of YAMNet.""" """Applies yamnet_test's sanity checks to an instance of YAMNet."""
def clip_test(waveform, expected_class_name, top_n=10): def clip_test(waveform, expected_class_name, top_n=10):
predictions, embeddings, log_mel_spectrogram = model_fn(waveform) results = model_fn(waveform=waveform)
predictions = results['predictions']
embeddings = results['embeddings']
log_mel_spectrogram = results['log_mel_spectrogram']
clip_predictions = np.mean(predictions, axis=0) clip_predictions = np.mean(predictions, axis=0)
top_n_indices = np.argsort(clip_predictions)[-top_n:] top_n_indices = np.argsort(clip_predictions)[-top_n:]
top_n_scores = clip_predictions[top_n_indices] top_n_scores = clip_predictions[top_n_indices]
...@@ -106,7 +113,9 @@ def make_tf2_export(weights_path, export_dir): ...@@ -106,7 +113,9 @@ def make_tf2_export(weights_path, export_dir):
# Make TF2 SavedModel export. # Make TF2 SavedModel export.
log('Making TF2 SavedModel export ...') log('Making TF2 SavedModel export ...')
tf.saved_model.save(yamnet, export_dir) tf.saved_model.save(
yamnet, export_dir,
signatures={'serving_default': yamnet.__call__.get_concrete_function()})
log('Done') log('Done')
# Check export with TF-Hub in TF2. # Check export with TF-Hub in TF2.
...@@ -143,7 +152,9 @@ def make_tflite_export(weights_path, export_dir): ...@@ -143,7 +152,9 @@ def make_tflite_export(weights_path, export_dir):
log('Making TF-Lite SavedModel export ...') log('Making TF-Lite SavedModel export ...')
saved_model_dir = os.path.join(export_dir, 'saved_model') saved_model_dir = os.path.join(export_dir, 'saved_model')
os.makedirs(saved_model_dir) os.makedirs(saved_model_dir)
tf.saved_model.save(yamnet, saved_model_dir) tf.saved_model.save(
yamnet, saved_model_dir,
signatures={'serving_default': yamnet.__call__.get_concrete_function()})
log('Done') log('Done')
# Check that the export can be loaded and works. # Check that the export can be loaded and works.
...@@ -154,7 +165,8 @@ def make_tflite_export(weights_path, export_dir): ...@@ -154,7 +165,8 @@ def make_tflite_export(weights_path, export_dir):
# Make a TF-Lite model from the SavedModel. # Make a TF-Lite model from the SavedModel.
log('Making TF-Lite model ...') log('Making TF-Lite model ...')
tflite_converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir) tflite_converter = tf.lite.TFLiteConverter.from_saved_model(
saved_model_dir, signature_keys=['serving_default'])
tflite_model = tflite_converter.convert() tflite_model = tflite_converter.convert()
tflite_model_path = os.path.join(export_dir, 'yamnet.tflite') tflite_model_path = os.path.join(export_dir, 'yamnet.tflite')
with open(tflite_model_path, 'wb') as f: with open(tflite_model_path, 'wb') as f:
...@@ -164,19 +176,8 @@ def make_tflite_export(weights_path, export_dir): ...@@ -164,19 +176,8 @@ def make_tflite_export(weights_path, export_dir):
# Check the TF-Lite export. # Check the TF-Lite export.
log('Checking TF-Lite model ...') log('Checking TF-Lite model ...')
interpreter = tf.lite.Interpreter(tflite_model_path) interpreter = tf.lite.Interpreter(tflite_model_path)
audio_input_index = interpreter.get_input_details()[0]['index'] runner = interpreter.get_signature_runner('serving_default')
scores_output_index = interpreter.get_output_details()[0]['index'] check_model(runner, 'yamnet_class_map.csv', params)
embeddings_output_index = interpreter.get_output_details()[1]['index']
spectrogram_output_index = interpreter.get_output_details()[2]['index']
def run_model(waveform):
interpreter.resize_tensor_input(audio_input_index, [len(waveform)], strict=True)
interpreter.allocate_tensors()
interpreter.set_tensor(audio_input_index, waveform)
interpreter.invoke()
return (interpreter.get_tensor(scores_output_index),
interpreter.get_tensor(embeddings_output_index),
interpreter.get_tensor(spectrogram_output_index))
check_model(run_model, 'yamnet_class_map.csv', params)
log('Done') log('Done')
return saved_model_dir return saved_model_dir
......
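For context, this is roughly how the exported TF-Lite model is now driven through its named signature instead of raw tensor indices. The 'waveform' argument name and the output keys follow the export code above; the file name and the 16 kHz input length are assumptions for illustration.

import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter('yamnet.tflite')
runner = interpreter.get_signature_runner('serving_default')
waveform = np.zeros(3 * 16000, dtype=np.float32)  # 3 seconds of silence (assumed 16 kHz)
outputs = runner(waveform=waveform)
print(outputs['predictions'].shape, outputs['embeddings'].shape)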
# DeepLab: Deep Labelling for Semantic Image Segmentation # DeepLab: Deep Labelling for Semantic Image Segmentation
**To new and existing DeepLab users**: We have released a unified codebase for
dense pixel labeling tasks in TensorFlow 2 at https://github.com/google-research/deeplab2.
Please consider switching to the newer codebase for better support.
DeepLab is a state-of-art deep learning model for semantic image segmentation, DeepLab is a state-of-art deep learning model for semantic image segmentation,
where the goal is to assign semantic labels (e.g., person, dog, cat and so on) where the goal is to assign semantic labels (e.g., person, dog, cat and so on)
to every pixel in the input image. Current implementation includes the following to every pixel in the input image. Current implementation includes the following
......
...@@ -263,7 +263,8 @@ def _build_classification_loss(loss_config): ...@@ -263,7 +263,8 @@ def _build_classification_loss(loss_config):
elif loss_type == 'weighted_dice_classification_loss': elif loss_type == 'weighted_dice_classification_loss':
config = loss_config.weighted_dice_classification_loss config = loss_config.weighted_dice_classification_loss
return losses.WeightedDiceClassificationLoss( return losses.WeightedDiceClassificationLoss(
squared_normalization=config.squared_normalization) squared_normalization=config.squared_normalization,
is_prediction_probability=config.is_prediction_probability)
else: else:
raise ValueError('Empty loss config.') raise ValueError('Empty loss config.')
...@@ -916,7 +916,9 @@ def keypoint_proto_to_params(kp_config, keypoint_map_dict): ...@@ -916,7 +916,9 @@ def keypoint_proto_to_params(kp_config, keypoint_map_dict):
regress_head_kernel_sizes=regress_head_kernel_sizes, regress_head_kernel_sizes=regress_head_kernel_sizes,
score_distance_multiplier=kp_config.score_distance_multiplier, score_distance_multiplier=kp_config.score_distance_multiplier,
std_dev_multiplier=kp_config.std_dev_multiplier, std_dev_multiplier=kp_config.std_dev_multiplier,
rescoring_threshold=kp_config.rescoring_threshold) rescoring_threshold=kp_config.rescoring_threshold,
gaussian_denom_ratio=kp_config.gaussian_denom_ratio,
argmax_postprocessing=kp_config.argmax_postprocessing)
def object_detection_proto_to_params(od_config): def object_detection_proto_to_params(od_config):
...@@ -981,7 +983,8 @@ def object_center_proto_to_params(oc_config): ...@@ -981,7 +983,8 @@ def object_center_proto_to_params(oc_config):
use_labeled_classes=oc_config.use_labeled_classes, use_labeled_classes=oc_config.use_labeled_classes,
keypoint_weights_for_center=keypoint_weights_for_center, keypoint_weights_for_center=keypoint_weights_for_center,
center_head_num_filters=center_head_num_filters, center_head_num_filters=center_head_num_filters,
center_head_kernel_sizes=center_head_kernel_sizes) center_head_kernel_sizes=center_head_kernel_sizes,
peak_max_pool_kernel_size=oc_config.peak_max_pool_kernel_size)
def mask_proto_to_params(mask_config): def mask_proto_to_params(mask_config):
......
...@@ -126,6 +126,8 @@ class ModelBuilderTF2Test( ...@@ -126,6 +126,8 @@ class ModelBuilderTF2Test(
score_distance_multiplier: 11.0 score_distance_multiplier: 11.0
std_dev_multiplier: 2.8 std_dev_multiplier: 2.8
rescoring_threshold: 0.5 rescoring_threshold: 0.5
gaussian_denom_ratio: 0.3
argmax_postprocessing: True
""" """
if customize_head_params: if customize_head_params:
task_proto_txt += """ task_proto_txt += """
...@@ -158,6 +160,7 @@ class ModelBuilderTF2Test( ...@@ -158,6 +160,7 @@ class ModelBuilderTF2Test(
beta: 4.0 beta: 4.0
} }
} }
peak_max_pool_kernel_size: 5
""" """
if customize_head_params: if customize_head_params:
proto_txt += """ proto_txt += """
...@@ -319,6 +322,7 @@ class ModelBuilderTF2Test( ...@@ -319,6 +322,7 @@ class ModelBuilderTF2Test(
else: else:
self.assertEqual(model._center_params.center_head_num_filters, [256]) self.assertEqual(model._center_params.center_head_num_filters, [256])
self.assertEqual(model._center_params.center_head_kernel_sizes, [3]) self.assertEqual(model._center_params.center_head_kernel_sizes, [3])
self.assertEqual(model._center_params.peak_max_pool_kernel_size, 5)
# Check object detection related parameters. # Check object detection related parameters.
self.assertAlmostEqual(model._od_params.offset_loss_weight, 0.1) self.assertAlmostEqual(model._od_params.offset_loss_weight, 0.1)
...@@ -376,6 +380,8 @@ class ModelBuilderTF2Test( ...@@ -376,6 +380,8 @@ class ModelBuilderTF2Test(
self.assertEqual(kp_params.heatmap_head_kernel_sizes, [3]) self.assertEqual(kp_params.heatmap_head_kernel_sizes, [3])
self.assertEqual(kp_params.offset_head_num_filters, [256]) self.assertEqual(kp_params.offset_head_num_filters, [256])
self.assertEqual(kp_params.offset_head_kernel_sizes, [3]) self.assertEqual(kp_params.offset_head_kernel_sizes, [3])
self.assertAlmostEqual(kp_params.gaussian_denom_ratio, 0.3)
self.assertEqual(kp_params.argmax_postprocessing, True)
# Check mask related parameters. # Check mask related parameters.
self.assertAlmostEqual(model._mask_params.task_loss_weight, 0.7) self.assertAlmostEqual(model._mask_params.task_loss_weight, 0.7)
......
...@@ -244,10 +244,10 @@ ...@@ -244,10 +244,10 @@
"\r\n", "\r\n",
" interpreter.invoke()\r\n", " interpreter.invoke()\r\n",
"\r\n", "\r\n",
" boxes = interpreter.get_tensor(output_details[0]['index'])\r\n", " scores = interpreter.get_tensor(output_details[0]['index'])\r\n",
" classes = interpreter.get_tensor(output_details[1]['index'])\r\n", " boxes = interpreter.get_tensor(output_details[1]['index'])\r\n",
" scores = interpreter.get_tensor(output_details[2]['index'])\r\n", " num_detections = interpreter.get_tensor(output_details[2]['index'])\r\n",
" num_detections = interpreter.get_tensor(output_details[3]['index'])\r\n", " classes = interpreter.get_tensor(output_details[3]['index'])\r\n",
"\r\n", "\r\n",
" if include_keypoint:\r\n", " if include_keypoint:\r\n",
" kpts = interpreter.get_tensor(output_details[4]['index'])\r\n", " kpts = interpreter.get_tensor(output_details[4]['index'])\r\n",
...@@ -759,4 +759,4 @@ ...@@ -759,4 +759,4 @@
] ]
} }
] ]
} }
\ No newline at end of file
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "qENhcLrkK9hX"
},
"source": [
"# Generate SSD anchor box aspect ratios using k-means clustering\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KD164da8WQ0U"
},
"source": [
"Many object detection models use anchor boxes as a region-sampling strategy, so that during training, the model learns to match one of several pre-defined anchor boxes to the ground truth bounding boxes. To optimize the accuracy and efficiency of your object detection model, it's helpful if you tune these anchor boxes to fit your model dataset, because the configuration files that comes with TensorFlow's trained checkpoints include aspect ratios that are intended to cover a very broad set of objects.\n",
"\n",
"So in this notebook tutorial, you'll learn how to discover a set of aspect ratios that are custom-fit for your dataset, as discovered through k-means clustering of all the ground-truth bounding-box ratios.\n",
"\n",
"For demonstration purpsoses, we're using a subset of the [PETS dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/) (cats and dogs), which matches some other model training tutorials out there (such as [this one for the Edge TPU](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb#scrollTo=LvEMJSafnyEC)), but you can use this script with a different dataset, and we'll show how to tune it to meet your model's goals, including how to optimize speed over accuracy or accuracy over speed.\n",
"\n",
"The result of this notebook is a new [pipeline `.config` file](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md) that you can copy into your model training script. With the new customized anchor box configuration, you should observe a faster training pipeline and slightly improved model accuracy.\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cNBjMwIvCrhf"
},
"source": [
"## Get the required libraries"
]
},
{
"cell_type": "code",
"metadata": {
"id": "hCQlBGJkZTR2"
},
"source": [
"import tensorflow as tf"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "aw-Ba-5RUhMs"
},
"source": [
"# Install the tensorflow Object Detection API...\n",
"# If you're running this offline, you also might need to install the protobuf-compiler:\n",
"# apt-get install protobuf-compiler\n",
"\n",
"! git clone -n https://github.com/tensorflow/models.git\n",
"%cd models\n",
"!git checkout 461b3587ef38b42cda151fa3b7d37706d77e4244\n",
"%cd research\n",
"! protoc object_detection/protos/*.proto --python_out=.\n",
"\n",
"# Install TensorFlow Object Detection API\n",
"%cp object_detection/packages/tf2/setup.py .\n",
"! python -m pip install --upgrade pip\n",
"! python -m pip install --use-feature=2020-resolver .\n",
"\n",
"# Test the installation\n",
"! python object_detection/builders/model_builder_tf2_test.py"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "InjvvtaMECr9"
},
"source": [
"## Prepare the dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T62-oddjEH8r"
},
"source": [
"Although this notebook does not perform model training, you need to use the same dataset here that you'll use when training the model.\n",
"\n",
"To find the best anchor box ratios, you should use all of your training dataset (or as much of it as is reasonable). That's because, as mentioned in the introduction, you want to measure the precise variety of images that you expect your model to encounter—anything less and the anchor boxes might not cover the variety of objects you model encounters, so it might have weak accuracy. (Whereas the alternative, in which the ratios are based on data that is beyond the scope of your model's application, usually creates an inefficient model that can also have weaker accuracy.)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "sKYfhq7CKZ4B"
},
"source": [
"%mkdir /content/dataset\n",
"%cd /content/dataset\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz\n",
"! tar zxf images.tar.gz\n",
"! tar zxf annotations.tar.gz"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "44vtL0nsAqXg"
},
"source": [
"In this case, we want to reduce the PETS dataset to match the collection of cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)):\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "8gcUoBU2K_s7"
},
"source": [
"! cp /content/dataset/annotations/list.txt /content/dataset/annotations/list_petsdataset.txt\n",
"! cp /content/dataset/annotations/trainval.txt /content/dataset/annotations/trainval_petsdataset.txt\n",
"! cp /content/dataset/annotations/test.txt /content/dataset/annotations/test_petsdataset.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/list_petsdataset.txt > /content/dataset/annotations/list.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/list_petsdataset.txt >> /content/dataset/annotations/list.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/trainval_petsdataset.txt > /content/dataset/annotations/trainval.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/trainval_petsdataset.txt >> /content/dataset/annotations/trainval.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/test_petsdataset.txt > /content/dataset/annotations/test.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/test_petsdataset.txt >> /content/dataset/annotations/test.txt"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Cs_71ZXMOctb"
},
"source": [
"## Find the aspect ratios using k-means"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R3k5WrMYHPyL"
},
"source": [
"We are trying to find a group of aspect ratios that overlap the majority of object shapes in the dataset. We do that by finding common clusters of bounding boxes of the dataset, using the k-means clustering algorithm to find centroids of these clusters.\n",
"\n",
"To help with this, we need to calculate following:\n",
"\n",
"+ The k-means cluster centroids of the given bounding boxes\n",
"(see the `kmeans_aspect_ratios()` function below).\n",
"\n",
"+ The average intersection of bounding boxes with given aspect ratios.\n",
"(see the `average_iou()` function below).\n",
"This does not affect the outcome of the final box ratios, but serves as a useful metric for you to decide whether the selected boxes are effective and whether you want to try with more/fewer aspect ratios. (We'll discuss this score more below.)\n",
"\n",
"**NOTE:**\n",
"The term \"centroid\" used here refers to the center of the k-means cluster (the boxes (height,width) vector)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "vCB8Dfs0Xlyv"
},
"source": [
"import sys\n",
"import glob\n",
"import numpy as np\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"def xml_to_boxes(path, classes, rescale_width=None, rescale_height=None):\n",
" \"\"\"Extracts bounding-box widths and heights from ground-truth dataset.\n",
"\n",
" Args:\n",
" path : Path to .xml annotation files for your dataset.\n",
" classes : List of classes that are part of dataset.\n",
" rescale_width : Scaling factor to rescale width of bounding box.\n",
" rescale_height : Scaling factor to rescale height of bounding box.\n",
"\n",
" Returns:\n",
" bboxes : A numpy array with pairs of box dimensions as [width, height].\n",
" \"\"\"\n",
"\n",
" xml_list = []\n",
" for clss in classes:\n",
" for xml_file in glob.glob(path + '/'+clss+'*'):\n",
" if xml_file.endswith('.xml'):\n",
" tree = ET.parse(xml_file)\n",
" root = tree.getroot()\n",
" for member in root.findall('object'):\n",
" bndbox = member.find('bndbox')\n",
" bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
" bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
" if rescale_width and rescale_height:\n",
" size = root.find('size')\n",
" bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
" bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
"\n",
" xml_list.append([bbox_width, bbox_height])\n",
" else:\n",
" continue\n",
" bboxes = np.array(xml_list)\n",
" return bboxes\n",
"\n",
"\n",
"def average_iou(bboxes, anchors):\n",
" \"\"\"Calculates the Intersection over Union (IoU) between bounding boxes and\n",
" anchors.\n",
"\n",
" Args:\n",
" bboxes : Array of bounding boxes in [width, height] format.\n",
" anchors : Array of aspect ratios [n, 2] format.\n",
"\n",
" Returns:\n",
" avg_iou_perc : A Float value, average of IOU scores from each aspect ratio\n",
" \"\"\"\n",
" intersection_width = np.minimum(anchors[:, [0]], bboxes[:, 0]).T\n",
" intersection_height = np.minimum(anchors[:, [1]], bboxes[:, 1]).T\n",
"\n",
" if np.any(intersection_width == 0) or np.any(intersection_height == 0):\n",
" raise ValueError(\"Some boxes have zero size.\")\n",
"\n",
" intersection_area = intersection_width * intersection_height\n",
" boxes_area = np.prod(bboxes, axis=1, keepdims=True)\n",
" anchors_area = np.prod(anchors, axis=1, keepdims=True).T\n",
" union_area = boxes_area + anchors_area - intersection_area\n",
" avg_iou_perc = np.mean(np.max(intersection_area / union_area, axis=1)) * 100\n",
"\n",
" return avg_iou_perc\n",
"\n",
"def kmeans_aspect_ratios(bboxes, kmeans_max_iter, num_aspect_ratios):\n",
" \"\"\"Calculate the centroid of bounding boxes clusters using Kmeans algorithm.\n",
"\n",
" Args:\n",
" bboxes : Array of bounding boxes in [width, height] format.\n",
" kmeans_max_iter : Maximum number of iterations to find centroids.\n",
" num_aspect_ratios : Number of centroids to optimize kmeans.\n",
"\n",
" Returns:\n",
" aspect_ratios : Centroids of cluster (optmised for dataset).\n",
" avg_iou_prec : Average score of bboxes intersecting with new aspect ratios.\n",
" \"\"\"\n",
"\n",
" assert len(bboxes), \"You must provide bounding boxes\"\n",
"\n",
" normalized_bboxes = bboxes / np.sqrt(bboxes.prod(axis=1, keepdims=True))\n",
"\n",
" # Using kmeans to find centroids of the width/height clusters\n",
" kmeans = KMeans(\n",
" init='random', n_clusters=num_aspect_ratios,random_state=0, max_iter=kmeans_max_iter)\n",
" kmeans.fit(X=normalized_bboxes)\n",
" ar = kmeans.cluster_centers_\n",
"\n",
" assert len(ar), \"Unable to find k-means centroid, try increasing kmeans_max_iter.\"\n",
"\n",
" avg_iou_perc = average_iou(normalized_bboxes, ar)\n",
"\n",
" if not np.isfinite(avg_iou_perc):\n",
" sys.exit(\"Failed to get aspect ratios due to numerical errors in k-means\")\n",
"\n",
" aspect_ratios = [w/h for w,h in ar]\n",
"\n",
" return aspect_ratios, avg_iou_perc"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "eU2SuLvu55Ds"
},
"source": [
"In the next code block, we'll call the above functions to discover the ideal anchor box aspect ratios.\n",
"\n",
"You can tune the parameters below to suit your performance objectives.\n",
"\n",
"Most importantly, you should consider the number of aspect ratios you want to generate. At opposite ends of the decision spectrum, there are two objectives you might seek:\n",
"\n",
"1. **Low accuracy and fast inference**: Try 2-3 aspect ratios. \n",
" * This is if your application is okay with accuracy or confidence scores around/below 80%.\n",
" * The average IOU score (from `avg_iou_perc`) will be around 70-85.\n",
" * This reduces the model's overall computations during inference, which makes inference faster.\n",
"\n",
"2. **High accuracy and slow inference**: Try 5-6 aspect ratios.\n",
" * This is if your application requires accuracy or confidence scores around 95%.\n",
" * The average IOU score (from `avg_iou_perc`) should be over 95.\n",
" * This increases the model's overall computations during inference, which makes inference slower.\n",
"\n",
"The initial configuration below aims somewhere in between: it searches for 4 aspect ratios.\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "cNw-vX3nfl1g"
},
"source": [
"classes = ['Abyssinian','american_bulldog']\n",
"xml_path = '/content/dataset/annotations/xmls'\n",
"\n",
"# Tune this based on your accuracy/speed goals as described above\n",
"num_aspect_ratios = 4 # can be [2,3,4,5,6]\n",
"\n",
"# Tune the iterations based on the size and distribution of your dataset\n",
"# You can check avg_iou_prec every 100 iterations to see how centroids converge\n",
"kmeans_max_iter = 500\n",
"\n",
"# These should match the training pipeline config ('fixed_shape_resizer' param)\n",
"width = 320\n",
"height = 320\n",
"\n",
"# Get the ground-truth bounding boxes for our dataset\n",
"bboxes = xml_to_boxes(path=xml_path, classes=classes,\n",
" rescale_width=width, rescale_height=height)\n",
"\n",
"aspect_ratios, avg_iou_perc = kmeans_aspect_ratios(\n",
" bboxes=bboxes,\n",
" kmeans_max_iter=kmeans_max_iter,\n",
" num_aspect_ratios=num_aspect_ratios)\n",
"\n",
"aspect_ratios = sorted(aspect_ratios)\n",
"\n",
"print('Aspect ratios generated:', [round(ar,2) for ar in aspect_ratios])\n",
"print('Average IOU with anchors:', avg_iou_perc)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0xHqOpuxgmD0"
},
"source": [
"## Generate a new pipeline config file"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZB6jqVT6gpmT"
},
"source": [
"That's it. Now we just need the `.config` file your model started with, and we'll merge the new `ssd_anchor_generator` properties into it."
]
},
{
"cell_type": "code",
"metadata": {
"id": "AlMffd3rgKW2"
},
"source": [
"import tensorflow as tf\n",
"from google.protobuf import text_format\n",
"from object_detection.protos import pipeline_pb2\n",
"\n",
"pipeline = pipeline_pb2.TrainEvalPipelineConfig()\n",
"config_path = '/content/models/research/object_detection/samples/configs/ssdlite_mobiledet_edgetpu_320x320_coco_sync_4x4.config'\n",
"pipeline_save = '/content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config'\n",
"with tf.io.gfile.GFile(config_path, \"r\") as f:\n",
" proto_str = f.read()\n",
" text_format.Merge(proto_str, pipeline)\n",
"pipeline.model.ssd.num_classes = 2\n",
"while pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios:\n",
" pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios.pop()\n",
"\n",
"for i in range(len(aspect_ratios)):\n",
" pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios.append(aspect_ratios[i])\n",
"\n",
"config_text = text_format.MessageToString(pipeline)\n",
"with tf.io.gfile.GFile(pipeline_save, \"wb\") as f:\n",
" f.write(config_text)\n",
"# Check for updated aspect ratios in the config\n",
"!cat /content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "3kzWdu7ai1om"
},
"source": [
"## Summary and next steps"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FltDhShbi06h"
},
"source": [
"If you look at the new `.config` file printed above, you'll find the `anchor_generator` specification, which includes the new `aspect_ratio` values that we generated with the k-means code above.\n",
"\n",
"The original config file ([`ssdlite_mobiledet_edgetpu_320x320_coco_sync_4x4.config`](https://github.com/tensorflow/models/blob/master/research/object_detection/samples/configs/ssd_mobilenet_v1_pets.config)) did have some default anchor box aspect ratios already, but we've replaced those with values that are optimized for our dataset. These new anchor boxes should improve the model accuracy (compared to the default anchors) and speed up the training process.\n",
"\n",
"If you want to use this configuration to train a model, then check out this tutorial to [retrain MobileDet for the Coral Edge TPU](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb), which uses this exact cats/dogs dataset. Just copy the `.config` file printed above and add it to that training notebook. (Or download the file from the **Files** panel on the left side of the Colab UI: it's called `ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config`.)\n",
"\n",
"For more information about the pipeline configuration file, read [Configuring the Object Detection Training Pipeline](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md).\n",
"\n",
"### About anchor scales...\n",
"\n",
"This notebook is focused on anchor box aspect ratios because that's often the most difficult to tune for each dataset. But you should also consider different configurations for the anchor box scales, which specify the number of different anchor box sizes and their min/max sizes—which affects how well your model detects objects of varying sizes.\n",
"\n",
"Tuning the anchor scales is much easier to do by hand, by estimating the min/max sizes you expect the model to encounter in your application environment. Just like when choosing the number of aspect ratios above, the number of different box sizes also affects your model accuracy and speed (using more box scales is more accurate, but also slower).\n",
"\n",
"You can also read more about anchor scales in [Configuring the Object Detection Training Pipeline](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md).\n",
"\n"
]
}
]
}
\ No newline at end of file
...@@ -286,15 +286,19 @@ class WeightedDiceClassificationLoss(Loss): ...@@ -286,15 +286,19 @@ class WeightedDiceClassificationLoss(Loss):
""" """
def __init__(self, squared_normalization): def __init__(self, squared_normalization, is_prediction_probability=False):
"""Initializes the loss object. """Initializes the loss object.
Args: Args:
squared_normalization: boolean, if set, we square the probabilities in the squared_normalization: boolean, if set, we square the probabilities in the
denominator term used for normalization. denominator term used for normalization.
is_prediction_probability: boolean, whether or not the input
prediction_tensor represents a probability. If false, it is
first converted to a probability by applying sigmoid.
""" """
self._squared_normalization = squared_normalization self._squared_normalization = squared_normalization
self.is_prediction_probability = is_prediction_probability
super(WeightedDiceClassificationLoss, self).__init__() super(WeightedDiceClassificationLoss, self).__init__()
def _compute_loss(self, def _compute_loss(self,
...@@ -332,7 +336,10 @@ class WeightedDiceClassificationLoss(Loss): ...@@ -332,7 +336,10 @@ class WeightedDiceClassificationLoss(Loss):
tf.shape(prediction_tensor)[2]), tf.shape(prediction_tensor)[2]),
[1, 1, -1]) [1, 1, -1])
prob_tensor = tf.nn.sigmoid(prediction_tensor) if self.is_prediction_probability:
prob_tensor = prediction_tensor
else:
prob_tensor = tf.nn.sigmoid(prediction_tensor)
if self._squared_normalization: if self._squared_normalization:
prob_tensor = tf.pow(prob_tensor, 2) prob_tensor = tf.pow(prob_tensor, 2)
......
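A tiny sketch of the branch this hunk introduces: when is_prediction_probability is set the tensor is used as-is, otherwise logits are first pushed through a sigmoid. The helper name here is illustrative, not the loss class itself.

import tensorflow as tf

def to_probabilities(prediction_tensor, is_prediction_probability=False):
  if is_prediction_probability:
    return prediction_tensor               # already in [0, 1], use directly
  return tf.nn.sigmoid(prediction_tensor)  # convert logits to probabilities

print(to_probabilities(tf.constant([0.0, 2.0])).numpy())         # sigmoid applied
print(to_probabilities(tf.constant([0.25, 0.9]), True).numpy())  # passed through unchanged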
...@@ -388,6 +388,28 @@ def _clip_window_prune_boxes(sorted_boxes, clip_window, pad_to_max_output_size, ...@@ -388,6 +388,28 @@ def _clip_window_prune_boxes(sorted_boxes, clip_window, pad_to_max_output_size,
return sorted_boxes, num_valid_nms_boxes_cumulative return sorted_boxes, num_valid_nms_boxes_cumulative
def _clip_boxes(boxes, clip_window):
"""Clips boxes to the given window.
Args:
boxes: A [batch, num_boxes, 4] float32 tensor containing box coordinates in
[ymin, xmin, ymax, xmax] form.
clip_window: A [batch, 4] float32 tensor with the top-left and bottom-right
coordinates of the window in [ymin, xmin, ymax, xmax] form.
Returns:
A [batch, num_boxes, 4] float32 tensor containing boxes clipped to the given
window.
"""
ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
clipped_ymin = tf.maximum(ymin, clip_window[:, 0, tf.newaxis])
clipped_xmin = tf.maximum(xmin, clip_window[:, 1, tf.newaxis])
clipped_ymax = tf.minimum(ymax, clip_window[:, 2, tf.newaxis])
clipped_xmax = tf.minimum(xmax, clip_window[:, 3, tf.newaxis])
return tf.stack([clipped_ymin, clipped_xmin, clipped_ymax, clipped_xmax],
axis=-1)
class NullContextmanager(object): class NullContextmanager(object):
def __enter__(self): def __enter__(self):
...@@ -985,10 +1007,10 @@ def batch_multiclass_non_max_suppression(boxes, ...@@ -985,10 +1007,10 @@ def batch_multiclass_non_max_suppression(boxes,
raise ValueError('Soft NMS is not supported by combined_nms.') raise ValueError('Soft NMS is not supported by combined_nms.')
if use_class_agnostic_nms: if use_class_agnostic_nms:
raise ValueError('class-agnostic NMS is not supported by combined_nms.') raise ValueError('class-agnostic NMS is not supported by combined_nms.')
if clip_window is not None: if clip_window is None:
tf.logging.warning( tf.logging.warning(
'clip_window is not supported by combined_nms unless it is' 'A default clip window of [0. 0. 1. 1.] will be applied for the '
' [0. 0. 1. 1.] for each image.') 'boxes.')
if additional_fields is not None: if additional_fields is not None:
tf.logging.warning('additional_fields is not supported by combined_nms.') tf.logging.warning('additional_fields is not supported by combined_nms.')
if parallel_iterations != 32: if parallel_iterations != 32:
...@@ -1007,7 +1029,14 @@ def batch_multiclass_non_max_suppression(boxes, ...@@ -1007,7 +1029,14 @@ def batch_multiclass_non_max_suppression(boxes,
max_total_size=max_total_size, max_total_size=max_total_size,
iou_threshold=iou_thresh, iou_threshold=iou_thresh,
score_threshold=score_thresh, score_threshold=score_thresh,
clip_boxes=(True if clip_window is None else False),
pad_per_class=use_static_shapes) pad_per_class=use_static_shapes)
if clip_window is not None:
if clip_window.shape.ndims == 1:
boxes_shape = boxes.shape
batch_size = shape_utils.get_dim_as_int(boxes_shape[0])
clip_window = tf.tile(clip_window[tf.newaxis, :], [batch_size, 1])
batch_nmsed_boxes = _clip_boxes(batch_nmsed_boxes, clip_window)
# Not supported by combined_non_max_suppression. # Not supported by combined_non_max_suppression.
batch_nmsed_masks = None batch_nmsed_masks = None
# Not supported by combined_non_max_suppression. # Not supported by combined_non_max_suppression.
......
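A worked standalone example of the clipping path added above: a rank-1 clip window is tiled to [batch, 4] and each box coordinate is clamped to it, mirroring what _clip_boxes does after combined NMS.

import tensorflow as tf

boxes = tf.constant([[[-0.1, 0.2, 0.5, 1.3],
                      [0.3, 0.4, 0.9, 0.8]]])    # [1, 2, 4] in [ymin, xmin, ymax, xmax]
clip_window = tf.constant([0.0, 0.0, 1.0, 1.0])  # shared by every image in the batch
clip_window = tf.tile(clip_window[tf.newaxis, :], [tf.shape(boxes)[0], 1])
ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
clipped = tf.stack([tf.maximum(ymin, clip_window[:, 0, tf.newaxis]),
                    tf.maximum(xmin, clip_window[:, 1, tf.newaxis]),
                    tf.minimum(ymax, clip_window[:, 2, tf.newaxis]),
                    tf.minimum(xmax, clip_window[:, 3, tf.newaxis])], axis=-1)
print(clipped.numpy()[0, 0])  # [0.  0.2 0.5 1. ]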
...@@ -961,7 +961,8 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -961,7 +961,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
width, width,
gt_boxes_list, gt_boxes_list,
gt_classes_list, gt_classes_list,
gt_weights_list=None): gt_weights_list=None,
maximum_normalized_coordinate=1.1):
"""Computes the object center heatmap target. """Computes the object center heatmap target.
Args: Args:
...@@ -977,6 +978,9 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -977,6 +978,9 @@ class CenterNetCenterHeatmapTargetAssigner(object):
in the gt_boxes_list. in the gt_boxes_list.
gt_weights_list: A list of float tensors with shape [num_boxes] gt_weights_list: A list of float tensors with shape [num_boxes]
representing the weight of each groundtruth detection box. representing the weight of each groundtruth detection box.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns: Returns:
heatmap: A Tensor of size [batch_size, output_height, output_width, heatmap: A Tensor of size [batch_size, output_height, output_width,
...@@ -1002,7 +1006,8 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -1002,7 +1006,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
boxes = box_list_ops.to_absolute_coordinates( boxes = box_list_ops.to_absolute_coordinates(
boxes, boxes,
tf.maximum(height // self._stride, 1), tf.maximum(height // self._stride, 1),
tf.maximum(width // self._stride, 1)) tf.maximum(width // self._stride, 1),
maximum_normalized_coordinate=maximum_normalized_coordinate)
# Get the box center coordinates. Each returned tensors have the shape of # Get the box center coordinates. Each returned tensors have the shape of
# [num_instances] # [num_instances]
(y_center, x_center, boxes_height, (y_center, x_center, boxes_height,
......
...@@ -782,6 +782,269 @@ def prediction_to_single_instance_keypoints( ...@@ -782,6 +782,269 @@ def prediction_to_single_instance_keypoints(
return keypoint_candidates, keypoint_scores, None return keypoint_candidates, keypoint_scores, None
def _gaussian_weighted_map_const_multi(
y_grid, x_grid, heatmap, points_y, points_x, boxes,
gaussian_denom_ratio):
"""Rescores heatmap using the distance information.
The function is called when the candidate_ranking_mode in the
KeypointEstimationParams is set to be 'gaussian_weighted_const'. The
keypoint candidates are ranked using the formula:
heatmap_score * exp((-distances^2) / (gaussian_denom))
where 'gaussian_denom' is determined by:
min(output_feature_height, output_feature_width) * gaussian_denom_ratio
and the 'distances' are the distances between the grid coordinates and the target
points.
Note that the postfix 'const' refers to the fact that the denominator is a
constant given the input image size, not scaled by the size of each of the
instances.
Args:
y_grid: A float tensor with shape [height, width] representing the
y-coordinate of each pixel grid.
x_grid: A float tensor with shape [height, width] representing the
x-coordinate of each pixel grid.
heatmap: A float tensor with shape [batch_size, height, width,
num_keypoints] representing the heatmap to be rescored.
points_y: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the y coordinates of the target points for
each channel.
points_x: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the x coordinates of the target points for
each channel.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
gaussian_denom_ratio: A constant used in the above formula that determines
the denominator of the Gaussian kernel.
Returns:
A float tensor with shape [batch_size, height, width, channel] representing
the rescored heatmap.
"""
batch_size, num_instances, _ = _get_shape(boxes, 3)
_, height, width, num_keypoints = _get_shape(heatmap, 4)
# [batch_size, height, width, num_instances, num_keypoints].
# Note that we intentionally avoid using tf.newaxis as TfLite converter
# doesn't like it.
y_diff = (
tf.reshape(y_grid, [1, height, width, 1, 1]) -
tf.reshape(points_y, [batch_size, 1, 1, num_instances, num_keypoints]))
x_diff = (
tf.reshape(x_grid, [1, height, width, 1, 1]) -
tf.reshape(points_x, [batch_size, 1, 1, num_instances, num_keypoints]))
distance_square = y_diff**2 + x_diff**2
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=2)
# Make the mask with all 1.0 in the box regions.
# Shape: [batch_size, height, width, num_instances]
in_boxes = tf.math.logical_and(
tf.math.logical_and(
tf.reshape(y_grid, [1, height, width, 1]) >= tf.reshape(
y_min, [batch_size, 1, 1, num_instances]),
tf.reshape(y_grid, [1, height, width, 1]) < tf.reshape(
y_max, [batch_size, 1, 1, num_instances])),
tf.math.logical_and(
tf.reshape(x_grid, [1, height, width, 1]) >= tf.reshape(
x_min, [batch_size, 1, 1, num_instances]),
tf.reshape(x_grid, [1, height, width, 1]) < tf.reshape(
x_max, [batch_size, 1, 1, num_instances])))
in_boxes = tf.cast(in_boxes, dtype=tf.float32)
gaussian_denom = tf.cast(
tf.minimum(height, width), dtype=tf.float32) * gaussian_denom_ratio
# shape: [batch_size, height, width, num_instances, num_keypoints]
gaussian_map = tf.exp((-1 * distance_square) / gaussian_denom)
return tf.expand_dims(
heatmap, axis=3) * gaussian_map * tf.reshape(
in_boxes, [batch_size, height, width, num_instances, 1])
def prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap_predictions,
keypoint_heatmap_offsets,
keypoint_score_heatmap=None):
"""Converts keypoint heatmap predictions and offsets to keypoint candidates.
This function is similar to the 'prediction_tensors_to_single_instance_kpts'
function except that the input keypoint_heatmap_predictions is prepared to
have an additional 'num_instances' dimension for multi-instance prediction.
Args:
keypoint_heatmap_predictions: A float tensor of shape [batch_size, height,
width, num_instances, num_keypoints] representing the per-keypoint and
per-instance heatmaps which is used for finding the best keypoint
candidate locations.
keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
width, 2 * num_keypoints] representing the per-keypoint offsets.
keypoint_score_heatmap: (optional) A float tensor of shape [batch_size,
height, width, num_keypoints] representing the heatmap
which is used for reporting the confidence scores. If not provided, then
the values in the keypoint_heatmap_predictions will be used.
Returns:
keypoint_candidates: A tensor of shape
[batch_size, max_candidates, num_keypoints, 2] holding the
location of keypoint candidates in [y, x] format (expressed in absolute
coordinates in the output coordinate frame).
keypoint_scores: A float tensor of shape
[batch_size, max_candidates, num_keypoints] with the scores for each
keypoint candidate. The scores come directly from the heatmap predictions.
"""
batch_size, height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 5)
# [batch_size, height * width, num_instances * num_keypoints].
feature_map_flattened = tf.reshape(
keypoint_heatmap_predictions,
[batch_size, -1, num_instances * num_keypoints])
# [batch_size, num_instances * num_keypoints].
peak_flat_indices = tf.math.argmax(
feature_map_flattened, axis=1, output_type=tf.dtypes.int32)
# Get x and y indices corresponding to the top indices in the flat array.
y_indices, x_indices = (
row_col_indices_from_flattened_indices(peak_flat_indices, width))
# [batch_size * num_instances * num_keypoints].
y_indices = tf.reshape(y_indices, [-1])
x_indices = tf.reshape(x_indices, [-1])
# Prepare the indices to gather the offsets from the keypoint_heatmap_offsets.
batch_idx = _multi_range(
limit=batch_size, value_repetitions=num_keypoints * num_instances)
kpts_idx = _multi_range(
limit=num_keypoints, value_repetitions=1,
range_repetitions=batch_size * num_instances)
combined_indices = tf.stack([
batch_idx,
y_indices,
x_indices,
kpts_idx
], axis=1)
keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, 2])
# Retrieve the keypoint offsets: shape:
# [batch_size * num_instance * num_keypoints, 2].
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
keypoint_candidates = tf.stack([
tf.cast(y_indices, dtype=tf.float32) + tf.expand_dims(y_offsets, axis=0),
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2)
keypoint_candidates = tf.reshape(
keypoint_candidates, [batch_size, num_instances, num_keypoints, 2])
if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd(
tf.reduce_max(keypoint_heatmap_predictions, axis=3), combined_indices)
else:
keypoint_scores = tf.gather_nd(keypoint_score_heatmap, combined_indices)
return keypoint_candidates, tf.reshape(
keypoint_scores, [batch_size, num_instances, num_keypoints])
def prediction_to_keypoints_argmax(
prediction_dict,
object_y_indices,
object_x_indices,
boxes,
task_name,
kp_params):
"""Postprocess function to predict multi instance keypoints with argmax op.
This is a different implementation of the original keypoint postprocessing
function such that it avoids using topk op (replaced by argmax) as it runs
much slower in the browser.
Args:
prediction_dict: a dictionary holding predicted tensors, returned from the
predict() method. This dictionary should contain keypoint prediction
feature maps for each keypoint task.
object_y_indices: A float tensor of shape [batch_size, max_instances]
representing the location indices of the object centers.
object_x_indices: A float tensor of shape [batch_size, max_instances]
representing the location indices of the object centers.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
task_name: string, the name of the task this namedtuple corresponds to.
Note that it should be a unique identifier of the task.
kp_params: A `KeypointEstimationParams` object with parameters for a single
keypoint class.
Returns:
A tuple of two tensors:
keypoint_candidates: A float tensor with shape [batch_size,
num_instances, num_keypoints, 2] representing the yx-coordinates of
the keypoints in the output feature map space.
keypoint_scores: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the keypoint prediction scores.
Raises:
ValueError: if the candidate_ranking_mode is not supported.
"""
keypoint_heatmap = tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1])
keypoint_offset = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
keypoint_regression = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
batch_size, height, width, num_keypoints = _get_shape(keypoint_heatmap, 4)
# Create the y,x grids: [height, width]
(y_grid, x_grid) = ta_utils.image_shape_to_grids(height, width)
# Prepare the indices to retrieve the information from object centers.
num_instances = _get_shape(object_y_indices, 2)[1]
combined_obj_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_instances),
tf.reshape(object_y_indices, [-1]),
tf.reshape(object_x_indices, [-1])
], axis=1)
# Select the regression vectors from the object center.
selected_regression_flat = tf.gather_nd(
keypoint_regression, combined_obj_indices)
selected_regression = tf.reshape(
selected_regression_flat, [batch_size, num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=3)
# shape: [batch_size, num_instances, num_keypoints].
y_regressed = tf.cast(
tf.reshape(object_y_indices, [batch_size, num_instances, 1]),
dtype=tf.float32) + y_reg
x_regressed = tf.cast(
tf.reshape(object_x_indices, [batch_size, num_instances, 1]),
dtype=tf.float32) + x_reg
if kp_params.candidate_ranking_mode == 'gaussian_weighted_const':
rescored_heatmap = _gaussian_weighted_map_const_multi(
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed, boxes,
kp_params.gaussian_denom_ratio)
# shape: [batch_size, height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=3)
else:
raise ValueError(
'Unsupported ranking mode in the multipose no topk method: %s' %
kp_params.candidate_ranking_mode)
(keypoint_candidates,
keypoint_scores) = prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap_predictions=rescored_heatmap,
keypoint_heatmap_offsets=keypoint_offset,
keypoint_score_heatmap=keypoint_score_heatmap)
return keypoint_candidates, keypoint_scores
def regressed_keypoints_at_object_centers(regressed_keypoint_predictions, def regressed_keypoints_at_object_centers(regressed_keypoint_predictions,
y_indices, x_indices): y_indices, x_indices):
"""Returns the regressed keypoints at specified object centers. """Returns the regressed keypoints at specified object centers.
...@@ -1533,15 +1796,9 @@ def convert_strided_predictions_to_normalized_keypoints( ...@@ -1533,15 +1796,9 @@ def convert_strided_predictions_to_normalized_keypoints(
keypoints, window = inputs keypoints, window = inputs
return keypoint_ops.clip_to_window(keypoints, window) return keypoint_ops.clip_to_window(keypoints, window)
# Specify the TensorSpec explicitly in the tf.map_fn to make it tf.lite keypoint_coords_normalized = shape_utils.static_or_dynamic_map_fn(
# compatible. clip_to_window, [keypoint_coords_normalized, batch_window],
kpts_dims = _get_shape(keypoint_coords_normalized, 4) dtype=tf.float32, back_prop=False)
output_spec = tf.TensorSpec(
shape=[kpts_dims[1], kpts_dims[2], kpts_dims[3]], dtype=tf.float32)
keypoint_coords_normalized = tf.map_fn(
clip_to_window, (keypoint_coords_normalized, batch_window),
dtype=tf.float32, back_prop=False,
fn_output_signature=output_spec)
keypoint_scores = tf.where(valid_indices, keypoint_scores, keypoint_scores = tf.where(valid_indices, keypoint_scores,
tf.zeros_like(keypoint_scores)) tf.zeros_like(keypoint_scores))
return keypoint_coords_normalized, keypoint_scores return keypoint_coords_normalized, keypoint_scores
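The change above pins the per-element output shape of tf.map_fn with an explicit tf.TensorSpec, which is what keeps the clipping op convertible to TFLite. A self-contained sketch of the same idiom, using an illustrative clip function rather than the module's keypoint_ops.clip_to_window:

import tensorflow as tf

def clip_to_unit_window(inputs):
  keypoints, _window = inputs  # stand-in: clip (y, x) pairs to [0, 1]
  return tf.clip_by_value(keypoints, 0.0, 1.0)

keypoints = tf.random.uniform([2, 5, 17, 2]) * 1.2  # [batch, instances, kpts, 2]
windows = tf.tile(tf.constant([[0.0, 0.0, 1.0, 1.0]]), [2, 1])

# Declaring the element shape up front avoids dynamic shapes in the converter.
output_spec = tf.TensorSpec(shape=[5, 17, 2], dtype=tf.float32)
clipped = tf.map_fn(clip_to_unit_window, (keypoints, windows),
                    fn_output_signature=output_spec)  # [2, 5, 17, 2]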
...@@ -1900,7 +2157,8 @@ class KeypointEstimationParams( ...@@ -1900,7 +2157,8 @@ class KeypointEstimationParams(
'heatmap_head_kernel_sizes', 'offset_head_num_filters', 'heatmap_head_kernel_sizes', 'offset_head_num_filters',
'offset_head_kernel_sizes', 'regress_head_num_filters', 'offset_head_kernel_sizes', 'regress_head_num_filters',
'regress_head_kernel_sizes', 'score_distance_multiplier', 'regress_head_kernel_sizes', 'score_distance_multiplier',
'std_dev_multiplier', 'rescoring_threshold' 'std_dev_multiplier', 'rescoring_threshold', 'gaussian_denom_ratio',
'argmax_postprocessing'
])): ])):
"""Namedtuple to host object detection related parameters. """Namedtuple to host object detection related parameters.
...@@ -1948,7 +2206,9 @@ class KeypointEstimationParams( ...@@ -1948,7 +2206,9 @@ class KeypointEstimationParams(
regress_head_kernel_sizes=(3), regress_head_kernel_sizes=(3),
score_distance_multiplier=0.1, score_distance_multiplier=0.1,
std_dev_multiplier=1.0, std_dev_multiplier=1.0,
rescoring_threshold=0.0): rescoring_threshold=0.0,
argmax_postprocessing=False,
gaussian_denom_ratio=0.1):
"""Constructor with default values for KeypointEstimationParams. """Constructor with default values for KeypointEstimationParams.
Args: Args:
...@@ -2049,6 +2309,12 @@ class KeypointEstimationParams( ...@@ -2049,6 +2309,12 @@ class KeypointEstimationParams(
True. The detection score of an instance is set to be the average over True. The detection score of an instance is set to be the average over
the scores of the keypoints which their scores higher than the the scores of the keypoints which their scores higher than the
threshold. threshold.
argmax_postprocessing: Whether to use the keypoint postprocessing logic
that replaces the topk op with argmax. Usually used when exporting the
model for predicting keypoints of multiple instances in the browser.
gaussian_denom_ratio: The ratio multiplied by the image size to determine
the denominator of the Gaussian formula. Only applicable when
candidate_ranking_mode is set to 'gaussian_weighted_const'.
Returns: Returns:
An initialized KeypointEstimationParams namedtuple. An initialized KeypointEstimationParams namedtuple.
...@@ -2067,7 +2333,8 @@ class KeypointEstimationParams( ...@@ -2067,7 +2333,8 @@ class KeypointEstimationParams(
heatmap_head_num_filters, heatmap_head_kernel_sizes, heatmap_head_num_filters, heatmap_head_kernel_sizes,
offset_head_num_filters, offset_head_kernel_sizes, offset_head_num_filters, offset_head_kernel_sizes,
regress_head_num_filters, regress_head_kernel_sizes, regress_head_num_filters, regress_head_kernel_sizes,
score_distance_multiplier, std_dev_multiplier, rescoring_threshold) score_distance_multiplier, std_dev_multiplier, rescoring_threshold,
argmax_postprocessing, gaussian_denom_ratio)
class ObjectCenterParams( class ObjectCenterParams(
...@@ -2075,7 +2342,7 @@ class ObjectCenterParams( ...@@ -2075,7 +2342,7 @@ class ObjectCenterParams(
'classification_loss', 'object_center_loss_weight', 'heatmap_bias_init', 'classification_loss', 'object_center_loss_weight', 'heatmap_bias_init',
'min_box_overlap_iou', 'max_box_predictions', 'use_labeled_classes', 'min_box_overlap_iou', 'max_box_predictions', 'use_labeled_classes',
'keypoint_weights_for_center', 'center_head_num_filters', 'keypoint_weights_for_center', 'center_head_num_filters',
'center_head_kernel_sizes' 'center_head_kernel_sizes', 'peak_max_pool_kernel_size'
])): ])):
"""Namedtuple to store object center prediction related parameters.""" """Namedtuple to store object center prediction related parameters."""
...@@ -2090,7 +2357,8 @@ class ObjectCenterParams( ...@@ -2090,7 +2357,8 @@ class ObjectCenterParams(
use_labeled_classes=False, use_labeled_classes=False,
keypoint_weights_for_center=None, keypoint_weights_for_center=None,
center_head_num_filters=(256), center_head_num_filters=(256),
center_head_kernel_sizes=(3)): center_head_kernel_sizes=(3),
peak_max_pool_kernel_size=3):
"""Constructor with default values for ObjectCenterParams. """Constructor with default values for ObjectCenterParams.
Args: Args:
...@@ -2115,6 +2383,8 @@ class ObjectCenterParams( ...@@ -2115,6 +2383,8 @@ class ObjectCenterParams(
by the object center prediction head. by the object center prediction head.
center_head_kernel_sizes: kernel size of the convolutional layers used center_head_kernel_sizes: kernel size of the convolutional layers used
by the object center prediction head. by the object center prediction head.
peak_max_pool_kernel_size: Max pool kernel size to use to pull off peak
score locations in a neighborhood for the object detection heatmap.
Returns: Returns:
An initialized ObjectCenterParams namedtuple. An initialized ObjectCenterParams namedtuple.
""" """
...@@ -2123,7 +2393,8 @@ class ObjectCenterParams( ...@@ -2123,7 +2393,8 @@ class ObjectCenterParams(
object_center_loss_weight, heatmap_bias_init, object_center_loss_weight, heatmap_bias_init,
min_box_overlap_iou, max_box_predictions, min_box_overlap_iou, max_box_predictions,
use_labeled_classes, keypoint_weights_for_center, use_labeled_classes, keypoint_weights_for_center,
center_head_num_filters, center_head_kernel_sizes) center_head_num_filters, center_head_kernel_sizes,
peak_max_pool_kernel_size)
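peak_max_pool_kernel_size feeds top_k_feature_map_locations, which keeps only heatmap cells that equal the maximum of their local window before taking the top-k scores. A hedged sketch of that peak-picking idea (illustrative only, not the library function itself):

import tensorflow as tf

def local_peaks(heatmap, kernel_size=3):
  # Zero out every location that is not the maximum of its k x k neighborhood.
  pooled = tf.nn.max_pool2d(heatmap, ksize=kernel_size, strides=1, padding='SAME')
  return tf.where(tf.equal(heatmap, pooled), heatmap, tf.zeros_like(heatmap))

heatmap = tf.random.uniform([1, 32, 32, 1])
peaks = local_peaks(heatmap, kernel_size=3)
scores, flat_indices = tf.math.top_k(tf.reshape(peaks, [1, -1]), k=5)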
class MaskParams( class MaskParams(
...@@ -2627,16 +2898,12 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2627,16 +2898,12 @@ class CenterNetMetaArch(model.DetectionModel):
self.track_reid_classification_net = tf.keras.Sequential() self.track_reid_classification_net = tf.keras.Sequential()
for _ in range(self._track_params.num_fc_layers - 1): for _ in range(self._track_params.num_fc_layers - 1):
self.track_reid_classification_net.add( self.track_reid_classification_net.add(
tf.keras.layers.Dense(self._track_params.reid_embed_size, tf.keras.layers.Dense(self._track_params.reid_embed_size))
input_shape=(
self._track_params.reid_embed_size,)))
self.track_reid_classification_net.add( self.track_reid_classification_net.add(
tf.keras.layers.BatchNormalization()) tf.keras.layers.BatchNormalization())
self.track_reid_classification_net.add(tf.keras.layers.ReLU()) self.track_reid_classification_net.add(tf.keras.layers.ReLU())
self.track_reid_classification_net.add( self.track_reid_classification_net.add(
tf.keras.layers.Dense(self._track_params.num_track_ids, tf.keras.layers.Dense(self._track_params.num_track_ids))
input_shape=(
self._track_params.reid_embed_size,)))
if self._temporal_offset_params is not None: if self._temporal_offset_params is not None:
prediction_heads[TEMPORAL_OFFSET] = self._make_prediction_net_list( prediction_heads[TEMPORAL_OFFSET] = self._make_prediction_net_list(
num_feature_outputs, NUM_OFFSET_CHANNELS, name='temporal_offset', num_feature_outputs, NUM_OFFSET_CHANNELS, name='temporal_offset',
...@@ -2714,7 +2981,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2714,7 +2981,8 @@ class CenterNetMetaArch(model.DetectionModel):
return target_assigners return target_assigners
def _compute_object_center_loss(self, input_height, input_width, def _compute_object_center_loss(self, input_height, input_width,
object_center_predictions, per_pixel_weights): object_center_predictions, per_pixel_weights,
maximum_normalized_coordinate=1.1):
"""Computes the object center loss. """Computes the object center loss.
Args: Args:
...@@ -2726,6 +2994,9 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2726,6 +2994,9 @@ class CenterNetMetaArch(model.DetectionModel):
per_pixel_weights: A float tensor of shape [batch_size, per_pixel_weights: A float tensor of shape [batch_size,
out_height * out_width, 1] with 1s in locations where the spatial out_height * out_width, 1] with 1s in locations where the spatial
coordinates fall within the height and width in true_image_shapes. coordinates fall within the height and width in true_image_shapes.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns: Returns:
A float scalar tensor representing the object center loss per instance. A float scalar tensor representing the object center loss per instance.
...@@ -2752,7 +3023,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2752,7 +3023,8 @@ class CenterNetMetaArch(model.DetectionModel):
width=input_width, width=input_width,
gt_classes_list=gt_classes_list, gt_classes_list=gt_classes_list,
gt_keypoints_list=gt_keypoints_list, gt_keypoints_list=gt_keypoints_list,
gt_weights_list=gt_weights_list) gt_weights_list=gt_weights_list,
maximum_normalized_coordinate=maximum_normalized_coordinate)
else: else:
gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes) gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
heatmap_targets = assigner.assign_center_targets_from_boxes( heatmap_targets = assigner.assign_center_targets_from_boxes(
...@@ -2760,7 +3032,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -2760,7 +3032,8 @@ class CenterNetMetaArch(model.DetectionModel):
width=input_width, width=input_width,
gt_boxes_list=gt_boxes_list, gt_boxes_list=gt_boxes_list,
gt_classes_list=gt_classes_list, gt_classes_list=gt_classes_list,
gt_weights_list=gt_weights_list) gt_weights_list=gt_weights_list,
maximum_normalized_coordinate=maximum_normalized_coordinate)
flattened_heatmap_targets = _flatten_spatial_dimensions(heatmap_targets) flattened_heatmap_targets = _flatten_spatial_dimensions(heatmap_targets)
num_boxes = _to_float32(get_num_instances_from_weights(gt_weights_list)) num_boxes = _to_float32(get_num_instances_from_weights(gt_weights_list))
...@@ -3577,7 +3850,9 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3577,7 +3850,9 @@ class CenterNetMetaArch(model.DetectionModel):
self._batched_prediction_tensor_names = predictions.keys() self._batched_prediction_tensor_names = predictions.keys()
return predictions return predictions
def loss(self, prediction_dict, true_image_shapes, scope=None): def loss(
self, prediction_dict, true_image_shapes, scope=None,
maximum_normalized_coordinate=1.1):
"""Computes scalar loss tensors with respect to provided groundtruth. """Computes scalar loss tensors with respect to provided groundtruth.
This function implements the various CenterNet losses. This function implements the various CenterNet losses.
...@@ -3589,6 +3864,9 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3589,6 +3864,9 @@ class CenterNetMetaArch(model.DetectionModel):
the form [height, width, channels] indicating the shapes of true images the form [height, width, channels] indicating the shapes of true images
in the resized images, as resized images can be padded with zeros. in the resized images, as resized images can be padded with zeros.
scope: Optional scope name. scope: Optional scope name.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns: Returns:
A dictionary mapping the keys [ A dictionary mapping the keys [
...@@ -3616,7 +3894,7 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3616,7 +3894,7 @@ class CenterNetMetaArch(model.DetectionModel):
# TODO(vighneshb) Explore whether using floor here is safe. # TODO(vighneshb) Explore whether using floor here is safe.
output_true_image_shapes = tf.ceil( output_true_image_shapes = tf.ceil(
tf.to_float(true_image_shapes) / self._stride) tf.cast(true_image_shapes, tf.float32) / self._stride)
valid_anchor_weights = get_valid_anchor_weights_in_flattened_image( valid_anchor_weights = get_valid_anchor_weights_in_flattened_image(
output_true_image_shapes, output_height, output_width) output_true_image_shapes, output_height, output_width)
valid_anchor_weights = tf.expand_dims(valid_anchor_weights, 2) valid_anchor_weights = tf.expand_dims(valid_anchor_weights, 2)
...@@ -3625,7 +3903,8 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3625,7 +3903,8 @@ class CenterNetMetaArch(model.DetectionModel):
object_center_predictions=prediction_dict[OBJECT_CENTER], object_center_predictions=prediction_dict[OBJECT_CENTER],
input_height=input_height, input_height=input_height,
input_width=input_width, input_width=input_width,
per_pixel_weights=valid_anchor_weights) per_pixel_weights=valid_anchor_weights,
maximum_normalized_coordinate=maximum_normalized_coordinate)
losses = { losses = {
OBJECT_CENTER: OBJECT_CENTER:
self._center_params.object_center_loss_weight * object_center_loss self._center_params.object_center_loss_weight * object_center_loss
...@@ -3742,21 +4021,32 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3742,21 +4021,32 @@ class CenterNetMetaArch(model.DetectionModel):
""" """
object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1]) object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
# Mask object centers by true_image_shape. [batch, h, w, 1] if true_image_shapes is None:
object_center_mask = mask_from_true_image_shape( # If true_image_shapes is not provided, we assume the whole image is valid
_get_shape(object_center_prob, 4), true_image_shapes) # and infer the true_image_shapes from the object_center_prob shape.
object_center_prob *= object_center_mask batch_size, strided_height, strided_width, _ = _get_shape(
object_center_prob, 4)
true_image_shapes = tf.stack(
[strided_height * self._stride, strided_width * self._stride,
tf.constant(len(self._feature_extractor._channel_means))]) # pylint: disable=protected-access
true_image_shapes = tf.stack([true_image_shapes] * batch_size, axis=0)
else:
# Mask object centers by true_image_shape. [batch, h, w, 1]
object_center_mask = mask_from_true_image_shape(
_get_shape(object_center_prob, 4), true_image_shapes)
object_center_prob *= object_center_mask
# Get x, y and channel indices corresponding to the top indices in the class # Get x, y and channel indices corresponding to the top indices in the class
# center predictions. # center predictions.
detection_scores, y_indices, x_indices, channel_indices = ( detection_scores, y_indices, x_indices, channel_indices = (
top_k_feature_map_locations( top_k_feature_map_locations(
object_center_prob, max_pool_kernel_size=3, object_center_prob,
max_pool_kernel_size=self._center_params.peak_max_pool_kernel_size,
k=self._center_params.max_box_predictions)) k=self._center_params.max_box_predictions))
multiclass_scores = tf.gather_nd( multiclass_scores = tf.gather_nd(
object_center_prob, tf.stack([y_indices, x_indices], -1), batch_dims=1) object_center_prob, tf.stack([y_indices, x_indices], -1), batch_dims=1)
num_detections = tf.reduce_sum(
num_detections = tf.reduce_sum(tf.to_int32(detection_scores > 0), axis=1) tf.cast(detection_scores > 0, tf.int32), axis=1)
postprocess_dict = { postprocess_dict = {
fields.DetectionResultFields.detection_scores: detection_scores, fields.DetectionResultFields.detection_scores: detection_scores,
fields.DetectionResultFields.detection_multiclass_scores: fields.DetectionResultFields.detection_multiclass_scores:
...@@ -3786,10 +4076,22 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -3786,10 +4076,22 @@ class CenterNetMetaArch(model.DetectionModel):
# the ops that are supported by tf.lite on GPU. # the ops that are supported by tf.lite on GPU.
clip_keypoints = self._should_clip_keypoints() clip_keypoints = self._should_clip_keypoints()
if len(self._kp_params_dict) == 1 and self._num_classes == 1: if len(self._kp_params_dict) == 1 and self._num_classes == 1:
(keypoints, keypoint_scores, task_name, kp_params = next(iter(self._kp_params_dict.items()))
keypoint_depths) = self._postprocess_keypoints_single_class( keypoint_depths = None
prediction_dict, channel_indices, y_indices, x_indices, if kp_params.argmax_postprocessing:
boxes_strided, num_detections) keypoints, keypoint_scores = (
prediction_to_keypoints_argmax(
prediction_dict,
object_y_indices=y_indices,
object_x_indices=x_indices,
boxes=boxes_strided,
task_name=task_name,
kp_params=kp_params))
else:
(keypoints, keypoint_scores,
keypoint_depths) = self._postprocess_keypoints_single_class(
prediction_dict, channel_indices, y_indices, x_indices,
boxes_strided, num_detections)
keypoints, keypoint_scores = ( keypoints, keypoint_scores = (
convert_strided_predictions_to_normalized_keypoints( convert_strided_predictions_to_normalized_keypoints(
keypoints, keypoint_scores, self._stride, true_image_shapes, keypoints, keypoint_scores, self._stride, true_image_shapes,
...@@ -4073,9 +4375,13 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -4073,9 +4375,13 @@ class CenterNetMetaArch(model.DetectionModel):
kpt_coords_for_example_list = [] kpt_coords_for_example_list = []
kpt_scores_for_example_list = [] kpt_scores_for_example_list = []
for ex_ind in range(batch_size): for ex_ind in range(batch_size):
kpt_coords_for_class_list = [] # The tensors that host the keypoint coordinates and scores for all
kpt_scores_for_class_list = [] # instances and all keypoints. They will be updated by scatter_nd_add for
instance_inds_for_class_list = [] # each keypoint task.
kpt_coords_for_example_all_det = tf.zeros(
[max_detections, total_num_keypoints, 2])
kpt_scores_for_example_all_det = tf.zeros(
[max_detections, total_num_keypoints])
for task_name, kp_params in self._kp_params_dict.items(): for task_name, kp_params in self._kp_params_dict.items():
keypoint_heatmap = prediction_dict[ keypoint_heatmap = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1] get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]
...@@ -4085,77 +4391,62 @@ class CenterNetMetaArch(model.DetectionModel): ...@@ -4085,77 +4391,62 @@ class CenterNetMetaArch(model.DetectionModel):
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1] get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
instance_inds = self._get_instance_indices( instance_inds = self._get_instance_indices(
classes, num_detections, ex_ind, kp_params.class_id) classes, num_detections, ex_ind, kp_params.class_id)
num_ind = _get_shape(instance_inds, 1)
# Gather the feature map locations corresponding to the object class.
def true_fn(keypoint_heatmap, keypoint_offsets, keypoint_regression, y_indices_for_kpt_class = tf.gather(y_indices, instance_inds, axis=1)
classes, y_indices, x_indices, boxes, instance_inds, ex_ind, x_indices_for_kpt_class = tf.gather(x_indices, instance_inds, axis=1)
kp_params): if boxes is None:
"""Logics to execute when instance_inds is not an empty set.""" boxes_for_kpt_class = None
# Gather the feature map locations corresponding to the object class. else:
y_indices_for_kpt_class = tf.gather(y_indices, instance_inds, axis=1) boxes_for_kpt_class = tf.gather(boxes, instance_inds, axis=1)
x_indices_for_kpt_class = tf.gather(x_indices, instance_inds, axis=1)
if boxes is None: # Postprocess keypoints and scores for class and single image. Shapes
boxes_for_kpt_class = None # are [1, num_instances_i, num_keypoints_i, 2] and
else: # [1, num_instances_i, num_keypoints_i], respectively. Note that
boxes_for_kpt_class = tf.gather(boxes, instance_inds, axis=1) # num_instances_i and num_keypoints_i refers to the number of
# instances and keypoints for class i, respectively.
# Postprocess keypoints and scores for class and single image. Shapes (kpt_coords_for_class, kpt_scores_for_class, _) = (
# are [1, num_instances_i, num_keypoints_i, 2] and self._postprocess_keypoints_for_class_and_image(
# [1, num_instances_i, num_keypoints_i], respectively. Note that keypoint_heatmap,
# num_instances_i and num_keypoints_i refers to the number of keypoint_offsets,
# instances and keypoints for class i, respectively. keypoint_regression,
(kpt_coords_for_class, kpt_scores_for_class, _) = ( classes,
self._postprocess_keypoints_for_class_and_image( y_indices_for_kpt_class,
keypoint_heatmap, x_indices_for_kpt_class,
keypoint_offsets, boxes_for_kpt_class,
keypoint_regression, ex_ind,
classes, kp_params,
y_indices_for_kpt_class, ))
x_indices_for_kpt_class,
boxes_for_kpt_class, # Prepare the indices for scatter_nd. The resulting combined_inds has
ex_ind, # the shape of [num_instances_i * num_keypoints_i, 2], where the first
kp_params, # column corresponds to the instance IDs and the second column
)) # corresponds to the keypoint IDs.
kpt_inds = tf.constant(kp_params.keypoint_indices, dtype=tf.int32)
# Expand keypoint dimension (with padding) so that coordinates and kpt_inds = tf.expand_dims(kpt_inds, axis=0)
# scores have shape [1, num_instances_i, num_total_keypoints, 2] and instance_inds_expand = tf.expand_dims(instance_inds, axis=-1)
# [1, num_instances_i, num_total_keypoints], respectively. kpt_inds_expand = kpt_inds * tf.ones_like(instance_inds_expand)
kpts_coords_for_class_padded, kpt_scores_for_class_padded = ( instance_inds_expand = instance_inds_expand * tf.ones_like(kpt_inds)
_pad_to_full_keypoint_dim(kpt_coords_for_class, combined_inds = tf.stack(
kpt_scores_for_class, [instance_inds_expand, kpt_inds_expand], axis=2)
kp_params.keypoint_indices, combined_inds = tf.reshape(combined_inds, [-1, 2])
total_num_keypoints))
return kpts_coords_for_class_padded, kpt_scores_for_class_padded # Reshape the keypoint coordinates/scores to [num_instances_i *
# num_keypoints_i, 2]/[num_instances_i * num_keypoints_i] to be used
def false_fn(): # by scatter_nd_add.
"""Logics to execute when the instance_inds is an empty set.""" kpt_coords_for_class = tf.reshape(kpt_coords_for_class, [-1, 2])
return (tf.zeros([1, 0, total_num_keypoints, 2], dtype=tf.float32), kpt_scores_for_class = tf.reshape(kpt_scores_for_class, [-1])
tf.zeros([1, 0, total_num_keypoints], dtype=tf.float32)) kpt_coords_for_example_all_det = tf.tensor_scatter_nd_add(
kpt_coords_for_example_all_det,
true_fn = functools.partial( combined_inds, kpt_coords_for_class)
true_fn, keypoint_heatmap, keypoint_offsets, keypoint_regression, kpt_scores_for_example_all_det = tf.tensor_scatter_nd_add(
classes, y_indices, x_indices, boxes, instance_inds, ex_ind, kpt_scores_for_example_all_det,
kp_params) combined_inds, kpt_scores_for_class)
# Use dimension values instead of tf.size for tf.lite compatibility.
results = tf.cond(num_ind[0] > 0, true_fn, false_fn) kpt_coords_for_example_list.append(
tf.expand_dims(kpt_coords_for_example_all_det, axis=0))
kpt_coords_for_class_list.append(results[0]) kpt_scores_for_example_list.append(
kpt_scores_for_class_list.append(results[1]) tf.expand_dims(kpt_scores_for_example_all_det, axis=0))
instance_inds_for_class_list.append(instance_inds)
# Concatenate all keypoints across all classes (single example).
kpt_coords_for_example = tf.concat(kpt_coords_for_class_list, axis=1)
kpt_scores_for_example = tf.concat(kpt_scores_for_class_list, axis=1)
instance_inds_for_example = tf.concat(instance_inds_for_class_list,
axis=0)
(kpt_coords_for_example_all_det,
kpt_scores_for_example_all_det) = self._scatter_keypoints_to_batch(
num_ind, kpt_coords_for_example, kpt_scores_for_example,
instance_inds_for_example, max_detections, total_num_keypoints)
kpt_coords_for_example_list.append(kpt_coords_for_example_all_det)
kpt_scores_for_example_list.append(kpt_scores_for_example_all_det)
# Concatenate all keypoints and scores from all examples in the batch. # Concatenate all keypoints and scores from all examples in the batch.
# Shapes are [batch_size, max_detections, num_total_keypoints, 2] and # Shapes are [batch_size, max_detections, num_total_keypoints, 2] and
......
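The loop above now accumulates every task's keypoints into fixed-size tensors with tf.tensor_scatter_nd_add instead of concatenating per-class lists inside tf.cond branches. A small standalone sketch of that scatter pattern (toy sizes; tf.meshgrid is used here in place of the broadcasted ones_like construction):

import tensorflow as tf

max_detections, total_num_keypoints = 4, 5
scores_all = tf.zeros([max_detections, total_num_keypoints])

instance_inds = tf.constant([0, 2])  # detections belonging to this task's class
kpt_inds = tf.constant([1, 3])       # keypoint slots owned by this task

# Every (instance, keypoint) pair: shape [num_instances * num_task_kpts, 2].
ii, kk = tf.meshgrid(instance_inds, kpt_inds, indexing='ij')
combined_inds = tf.stack([tf.reshape(ii, [-1]), tf.reshape(kk, [-1])], axis=1)

task_scores = tf.constant([[0.9, 0.8], [0.7, 0.6]])  # [instances, task kpts]
scores_all = tf.tensor_scatter_nd_add(
    scores_all, combined_inds, tf.reshape(task_scores, [-1]))
# Rows 0 and 2 now hold the scores in columns 1 and 3; all other entries stay 0.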
...@@ -807,6 +807,77 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase): ...@@ -807,6 +807,77 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands) np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands)
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores) np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
@parameterized.parameters({'provide_keypoint_score': True},
{'provide_keypoint_score': False})
def test_prediction_to_multi_instance_keypoints(self, provide_keypoint_score):
image_size = (9, 9)
keypoint_heatmap_np = np.zeros((1, image_size[0], image_size[1], 3, 4),
dtype=np.float32)
# Instance 0.
keypoint_heatmap_np[0, 1, 1, 0, 0] = 0.9
keypoint_heatmap_np[0, 1, 7, 0, 1] = 0.9
keypoint_heatmap_np[0, 7, 1, 0, 2] = 0.9
keypoint_heatmap_np[0, 7, 7, 0, 3] = 0.9
# Instance 1.
keypoint_heatmap_np[0, 2, 2, 1, 0] = 0.8
keypoint_heatmap_np[0, 2, 8, 1, 1] = 0.8
keypoint_heatmap_np[0, 8, 2, 1, 2] = 0.8
keypoint_heatmap_np[0, 8, 8, 1, 3] = 0.8
keypoint_offset_np = np.zeros((1, image_size[0], image_size[1], 8),
dtype=np.float32)
keypoint_offset_np[0, 1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[0, 7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[0, 2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[0, 8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_offset = tf.constant(keypoint_offset_np, dtype=tf.float32)
if provide_keypoint_score:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset,
tf.reduce_max(keypoint_heatmap, axis=3)))
else:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset))
return keypoint_cands, keypoint_scores
(keypoint_cands, keypoint_scores) = self.execute(graph_fn, [])
expected_keypoint_candidates_0 = [
[1.5, 1.5], # top-left
[1.5, 6.5], # top-right
[6.5, 1.5], # bottom-left
[6.5, 6.5], # bottom-right
]
expected_keypoint_scores_0 = [0.9, 0.9, 0.9, 0.9]
expected_keypoint_candidates_1 = [
[2.3, 2.3], # top-left
[2.3, 7.7], # top-right
[7.7, 2.3], # bottom-left
[7.7, 7.7], # bottom-right
]
expected_keypoint_scores_1 = [0.8, 0.8, 0.8, 0.8]
np.testing.assert_allclose(
expected_keypoint_candidates_0, keypoint_cands[0, 0, :, :])
np.testing.assert_allclose(
expected_keypoint_candidates_1, keypoint_cands[0, 1, :, :])
np.testing.assert_allclose(
expected_keypoint_scores_0, keypoint_scores[0, 0, :])
np.testing.assert_allclose(
expected_keypoint_scores_1, keypoint_scores[0, 1, :])
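# In other words, the expected candidates are the integer heatmap peaks shifted
# by their offsets: peak (1, 1) + offset (0.5, 0.5) -> (1.5, 1.5) for instance 0,
# and peak (2, 2) + offset (0.3, 0.3) -> (2.3, 2.3) for instance 1.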
def test_keypoint_candidate_prediction_per_keypoints(self): def test_keypoint_candidate_prediction_per_keypoints(self):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32) keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0 keypoint_heatmap_np[0, 0, 0, 0] = 1.0
...@@ -1644,7 +1715,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100, ...@@ -1644,7 +1715,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100,
predict_depth=False, predict_depth=False,
per_keypoint_depth=False, per_keypoint_depth=False,
peak_radius=0, peak_radius=0,
candidate_ranking_mode='min_distance'): candidate_ranking_mode='min_distance',
argmax_postprocessing=False):
"""Returns the fake keypoint estimation parameter namedtuple.""" """Returns the fake keypoint estimation parameter namedtuple."""
return cnma.KeypointEstimationParams( return cnma.KeypointEstimationParams(
task_name=_TASK_NAME, task_name=_TASK_NAME,
...@@ -1660,7 +1732,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100, ...@@ -1660,7 +1732,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100,
predict_depth=predict_depth, predict_depth=predict_depth,
per_keypoint_depth=per_keypoint_depth, per_keypoint_depth=per_keypoint_depth,
offset_peak_radius=peak_radius, offset_peak_radius=peak_radius,
candidate_ranking_mode=candidate_ranking_mode) candidate_ranking_mode=candidate_ranking_mode,
argmax_postprocessing=argmax_postprocessing)
def get_fake_mask_params(): def get_fake_mask_params():
...@@ -1715,7 +1788,8 @@ def build_center_net_meta_arch(build_resnet=False, ...@@ -1715,7 +1788,8 @@ def build_center_net_meta_arch(build_resnet=False,
per_keypoint_depth=False, per_keypoint_depth=False,
peak_radius=0, peak_radius=0,
keypoint_only=False, keypoint_only=False,
candidate_ranking_mode='min_distance'): candidate_ranking_mode='min_distance',
argmax_postprocessing=False):
"""Builds the CenterNet meta architecture.""" """Builds the CenterNet meta architecture."""
if build_resnet: if build_resnet:
feature_extractor = ( feature_extractor = (
...@@ -1762,7 +1836,8 @@ def build_center_net_meta_arch(build_resnet=False, ...@@ -1762,7 +1836,8 @@ def build_center_net_meta_arch(build_resnet=False,
get_fake_kp_params(num_candidates_per_keypoint, get_fake_kp_params(num_candidates_per_keypoint,
per_keypoint_offset, predict_depth, per_keypoint_offset, predict_depth,
per_keypoint_depth, peak_radius, per_keypoint_depth, peak_radius,
candidate_ranking_mode) candidate_ranking_mode,
argmax_postprocessing)
}, },
non_max_suppression_fn=non_max_suppression_fn) non_max_suppression_fn=non_max_suppression_fn)
elif detection_only: elif detection_only:
...@@ -1790,7 +1865,8 @@ def build_center_net_meta_arch(build_resnet=False, ...@@ -1790,7 +1865,8 @@ def build_center_net_meta_arch(build_resnet=False,
get_fake_kp_params(num_candidates_per_keypoint, get_fake_kp_params(num_candidates_per_keypoint,
per_keypoint_offset, predict_depth, per_keypoint_offset, predict_depth,
per_keypoint_depth, peak_radius, per_keypoint_depth, peak_radius,
candidate_ranking_mode) candidate_ranking_mode,
argmax_postprocessing)
}, },
non_max_suppression_fn=non_max_suppression_fn) non_max_suppression_fn=non_max_suppression_fn)
else: else:
...@@ -2056,10 +2132,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2056,10 +2132,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
cnma.TEMPORAL_OFFSET)]) cnma.TEMPORAL_OFFSET)])
@parameterized.parameters( @parameterized.parameters(
{'target_class_id': 1}, {'target_class_id': 1, 'with_true_image_shape': True},
{'target_class_id': 2}, {'target_class_id': 2, 'with_true_image_shape': True},
{'target_class_id': 1, 'with_true_image_shape': False},
) )
def test_postprocess(self, target_class_id): def test_postprocess(self, target_class_id, with_true_image_shape):
"""Test the postprocess function.""" """Test the postprocess function."""
model = build_center_net_meta_arch() model = build_center_net_meta_arch()
max_detection = model._center_params.max_box_predictions max_detection = model._center_params.max_box_predictions
...@@ -2140,8 +2217,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2140,8 +2217,11 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
} }
def graph_fn(): def graph_fn():
detections = model.postprocess(prediction_dict, if with_true_image_shape:
tf.constant([[128, 128, 3]])) detections = model.postprocess(prediction_dict,
tf.constant([[128, 128, 3]]))
else:
detections = model.postprocess(prediction_dict, None)
return detections return detections
detections = self.execute_cpu(graph_fn, []) detections = self.execute_cpu(graph_fn, [])
...@@ -2320,17 +2400,32 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2320,17 +2400,32 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllClose(expected_multiclass_scores, self.assertAllClose(expected_multiclass_scores,
detections['detection_multiclass_scores'][0][0]) detections['detection_multiclass_scores'][0][0])
def test_postprocess_single_class(self): @parameterized.parameters(
{
'candidate_ranking_mode': 'min_distance',
'argmax_postprocessing': False
},
{
'candidate_ranking_mode': 'gaussian_weighted_const',
'argmax_postprocessing': True
})
def test_postprocess_single_class(self, candidate_ranking_mode,
argmax_postprocessing):
"""Test the postprocess function.""" """Test the postprocess function."""
model = build_center_net_meta_arch(num_classes=1) model = build_center_net_meta_arch(
num_classes=1, max_box_predictions=5, per_keypoint_offset=True,
candidate_ranking_mode=candidate_ranking_mode,
argmax_postprocessing=argmax_postprocessing)
max_detection = model._center_params.max_box_predictions max_detection = model._center_params.max_box_predictions
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices) num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 1), dtype=np.float32) class_center = np.zeros((1, 32, 32, 1), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32) height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32) offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32) keypoint_heatmaps = np.ones(
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32) (1, 32, 32, num_keypoints), dtype=np.float32) * _logit(0.01)
keypoint_offsets = np.zeros(
(1, 32, 32, num_keypoints * 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2) keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(1) class_probs = np.zeros(1)
...@@ -2383,6 +2478,9 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase): ...@@ -2383,6 +2478,9 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertEqual(detections['num_detections'], [5]) self.assertEqual(detections['num_detections'], [5])
self.assertAllEqual([1, max_detection, num_keypoints, 2], self.assertAllEqual([1, max_detection, num_keypoints, 2],
detections['detection_keypoints'].shape) detections['detection_keypoints'].shape)
self.assertAllClose(
[[0.4375, 0.4375], [0.4375, 0.5625], [0.5625, 0.4375]],
detections['detection_keypoints'][0, 0, 0:3, :])
self.assertAllEqual([1, max_detection, num_keypoints], self.assertAllEqual([1, max_detection, num_keypoints],
detections['detection_keypoint_scores'].shape) detections['detection_keypoint_scores'].shape)
......
...@@ -36,7 +36,8 @@ class DeepMACParams( ...@@ -36,7 +36,8 @@ class DeepMACParams(
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples', 'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels', 'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size', 'predict_full_resolution_masks', 'postprocess_crop_size',
'max_roi_jitter_ratio', 'roi_jitter_mode', 'box_consistency_loss_weight' 'max_roi_jitter_ratio', 'roi_jitter_mode',
'box_consistency_loss_weight',
])): ])):
"""Class holding the DeepMAC network configutration.""" """Class holding the DeepMAC network configutration."""
...@@ -125,6 +126,9 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None): ...@@ -125,6 +126,9 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None):
raise ValueError('Mask size must be set.') raise ValueError('Mask size must be set.')
return FullyConnectedMaskHead(num_init_channels, mask_size) return FullyConnectedMaskHead(num_init_channels, mask_size)
elif name == 'embedding_projection':
return tf.keras.layers.Lambda(lambda x: x)
elif name.startswith('resnet'): elif name.startswith('resnet'):
return ResNetMaskNetwork(name, num_init_channels) return ResNetMaskNetwork(name, num_init_channels)
...@@ -262,6 +266,24 @@ def fill_boxes(boxes, height, width): ...@@ -262,6 +266,24 @@ def fill_boxes(boxes, height, width):
return tf.cast(filled_boxes, tf.float32) return tf.cast(filled_boxes, tf.float32)
def embedding_projection(x, y):
"""Compute dot product between two given embeddings.
Args:
x: [num_instances, height, width, dimension] float tensor input.
y: [num_instances, height, width, dimension] or
[num_instances, 1, 1, dimension] float tensor input. When the height
and width dimensions are 1, TF will broadcast it.
Returns:
dist: A float tensor of shape [num_instances, height, width, 1] containing
the per-pixel embedding projection (dot product).
"""
dot = tf.reduce_sum(x * y, axis=3, keepdims=True)
return dot
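Because y may have shape [num_instances, 1, 1, dimension], the multiply inside embedding_projection broadcasts a single per-instance embedding across every pixel before the channel-wise sum. A quick usage sketch under that assumption:

import tensorflow as tf

pixel_emb = tf.random.normal([3, 16, 16, 8])    # per-pixel embeddings
instance_emb = tf.random.normal([3, 1, 1, 8])   # one embedding per instance

# Broadcasts instance_emb over height/width, then sums over the channel axis.
dot = tf.reduce_sum(pixel_emb * instance_emb, axis=3, keepdims=True)  # [3, 16, 16, 1]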
class ResNetMaskNetwork(tf.keras.layers.Layer): class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks.""" """A small wrapper around ResNet blocks to predict masks."""
...@@ -341,6 +363,92 @@ class FullyConnectedMaskHead(tf.keras.layers.Layer): ...@@ -341,6 +363,92 @@ class FullyConnectedMaskHead(tf.keras.layers.Layer):
[num_instances, self.mask_size, self.mask_size, 1]) [num_instances, self.mask_size, self.mask_size, 1])
class DenseResidualBlock(tf.keras.layers.Layer):
"""Residual block for 1D inputs.
This class implements the pre-activation version of the ResNet block.
"""
def __init__(self, hidden_size, use_shortcut_linear):
"""Residual Block for 1D inputs.
Args:
hidden_size: size of the hidden layer.
use_shortcut_linear: bool, whether or not to use a linear layer for
shortcut.
"""
super(DenseResidualBlock, self).__init__()
self.bn_0 = tf.keras.layers.experimental.SyncBatchNormalization(axis=-1)
self.bn_1 = tf.keras.layers.experimental.SyncBatchNormalization(axis=-1)
self.fc_0 = tf.keras.layers.Dense(
hidden_size, activation=None)
self.fc_1 = tf.keras.layers.Dense(
hidden_size, activation=None, kernel_initializer='zeros')
self.activation = tf.keras.layers.Activation('relu')
if use_shortcut_linear:
self.shortcut = tf.keras.layers.Dense(
hidden_size, activation=None, use_bias=False)
else:
self.shortcut = tf.keras.layers.Lambda(lambda x: x)
def __call__(self, inputs):
"""Layer's forward pass.
Args:
inputs: input tensor.
Returns:
Tensor after residual block w/ CondBatchNorm.
"""
out = self.fc_0(self.activation(self.bn_0(inputs)))
residual_inp = self.fc_1(self.activation(self.bn_1(out)))
skip = self.shortcut(inputs)
return residual_inp + skip
class DenseResNet(tf.keras.layers.Layer):
"""Resnet with dense layers."""
def __init__(self, num_layers, hidden_size, output_size):
"""Resnet with dense layers.
Args:
num_layers: int, the number of layers.
hidden_size: size of the hidden layer.
output_size: size of the output.
"""
super(DenseResNet, self).__init__()
self.input_proj = DenseResidualBlock(hidden_size, use_shortcut_linear=True)
if num_layers < 4:
raise ValueError(
'Cannot construct a DenseResNet with less than 4 layers')
num_blocks = (num_layers - 2) // 2
if ((num_blocks * 2) + 2) != num_layers:
raise ValueError(('DenseResNet depth has to be of the form (2n + 2). '
f'Found {num_layers}'))
self._num_blocks = num_blocks
blocks = [DenseResidualBlock(hidden_size, use_shortcut_linear=False)
for _ in range(num_blocks)]
self.resnet = tf.keras.Sequential(blocks)
self.out_conv = tf.keras.layers.Dense(output_size)
def __call__(self, inputs):
net = self.input_proj(inputs)
return self.out_conv(self.resnet(net))
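DenseResNet only accepts depths of the form 2n + 2 with num_layers >= 4 (so 4, 6, 8, ...); anything else raises a ValueError. A brief usage sketch, assuming the import path used by the tests below:

import tensorflow as tf
from object_detection.meta_architectures import deepmac_meta_arch  # assumed path

net = deepmac_meta_arch.DenseResNet(num_layers=8, hidden_size=16, output_size=4)
out = net(tf.zeros([2, 24]))  # any trailing feature size; blocks project to hidden_size
print(out.shape)              # (2, 4)
# deepmac_meta_arch.DenseResNet(7, 16, 4) would raise: depth has to be (2n + 2).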
class MaskHeadNetwork(tf.keras.layers.Layer): class MaskHeadNetwork(tf.keras.layers.Layer):
"""Mask head class for DeepMAC.""" """Mask head class for DeepMAC."""
...@@ -366,8 +474,18 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -366,8 +474,18 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
network_type, num_init_channels, mask_size) network_type, num_init_channels, mask_size)
self._use_instance_embedding = use_instance_embedding self._use_instance_embedding = use_instance_embedding
self.project_out = tf.keras.layers.Conv2D( self._network_type = network_type
filters=1, kernel_size=1, activation=None)
if (self._use_instance_embedding and
(self._network_type == 'embedding_projection')):
raise ValueError(('Cannot feed instance embedding to mask head when '
'computing embedding projection.'))
if network_type == 'embedding_projection':
self.project_out = tf.keras.layers.Lambda(lambda x: x)
else:
self.project_out = tf.keras.layers.Conv2D(
filters=1, kernel_size=1, activation=None)
def __call__(self, instance_embedding, pixel_embedding, training): def __call__(self, instance_embedding, pixel_embedding, training):
"""Returns mask logits given object center and spatial embeddings. """Returns mask logits given object center and spatial embeddings.
...@@ -388,10 +506,9 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -388,10 +506,9 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
height = tf.shape(pixel_embedding)[1] height = tf.shape(pixel_embedding)[1]
width = tf.shape(pixel_embedding)[2] width = tf.shape(pixel_embedding)[2]
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
instance_embedding = tf.tile(instance_embedding, [1, height, width, 1])
if self._use_instance_embedding: if self._use_instance_embedding:
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
instance_embedding = tf.tile(instance_embedding, [1, height, width, 1])
inputs = tf.concat([pixel_embedding, instance_embedding], axis=3) inputs = tf.concat([pixel_embedding, instance_embedding], axis=3)
else: else:
inputs = pixel_embedding inputs = pixel_embedding
...@@ -400,6 +517,10 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -400,6 +517,10 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
if isinstance(out, list): if isinstance(out, list):
out = out[-1] out = out[-1]
if self._network_type == 'embedding_projection':
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
out = embedding_projection(instance_embedding, out)
if out.shape[-1] > 1: if out.shape[-1] > 1:
out = self.project_out(out) out = self.project_out(out)
...@@ -466,6 +587,21 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -466,6 +587,21 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
if self._deepmac_params.mask_num_subsamples > 0: if self._deepmac_params.mask_num_subsamples > 0:
raise ValueError('Subsampling masks is currently not supported.') raise ValueError('Subsampling masks is currently not supported.')
if self._deepmac_params.network_type == 'embedding_projection':
if self._deepmac_params.use_xy:
raise ValueError(
'Cannot use x/y coordinates when using embedding projection.')
pixel_embedding_dim = self._deepmac_params.pixel_embedding_dim
dim = self._deepmac_params.dim
if dim != pixel_embedding_dim:
raise ValueError(
'When using embedding projection mask head, '
f'pixel_embedding_dim({pixel_embedding_dim}) '
f'must be same as dim({dim}).')
loss = self._deepmac_params.classification_loss
super(DeepMACMetaArch, self).__init__( super(DeepMACMetaArch, self).__init__(
is_training=is_training, add_summaries=add_summaries, is_training=is_training, add_summaries=add_summaries,
num_classes=num_classes, feature_extractor=feature_extractor, num_classes=num_classes, feature_extractor=feature_extractor,
......
...@@ -61,7 +61,10 @@ class MockMaskNet(tf.keras.layers.Layer): ...@@ -61,7 +61,10 @@ class MockMaskNet(tf.keras.layers.Layer):
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
mask_num_subsamples=-1): use_instance_embedding=True, mask_num_subsamples=-1,
network_type='hourglass10', use_xy=True,
pixel_embedding_dim=2,
dice_loss_prediction_probability=False):
"""Builds the DeepMAC meta architecture.""" """Builds the DeepMAC meta architecture."""
feature_extractor = DummyFeatureExtractor( feature_extractor = DummyFeatureExtractor(
...@@ -84,7 +87,9 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -84,7 +87,9 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
use_labeled_classes=False) use_labeled_classes=False)
if use_dice_loss: if use_dice_loss:
classification_loss = losses.WeightedDiceClassificationLoss(False) classification_loss = losses.WeightedDiceClassificationLoss(
squared_normalization=False,
is_prediction_probability=dice_loss_prediction_probability)
else: else:
classification_loss = losses.WeightedSigmoidClassificationLoss() classification_loss = losses.WeightedSigmoidClassificationLoss()
...@@ -92,13 +97,13 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -92,13 +97,13 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
classification_loss=classification_loss, classification_loss=classification_loss,
dim=8, dim=8,
task_loss_weight=1.0, task_loss_weight=1.0,
pixel_embedding_dim=2, pixel_embedding_dim=pixel_embedding_dim,
allowed_masked_classes_ids=[], allowed_masked_classes_ids=[],
mask_size=16, mask_size=16,
mask_num_subsamples=mask_num_subsamples, mask_num_subsamples=mask_num_subsamples,
use_xy=True, use_xy=use_xy,
network_type='hourglass10', network_type=network_type,
use_instance_embedding=True, use_instance_embedding=use_instance_embedding,
num_init_channels=8, num_init_channels=8,
predict_full_resolution_masks=predict_full_resolution_masks, predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128, postprocess_crop_size=128,
...@@ -125,7 +130,7 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -125,7 +130,7 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACUtilsTest(tf.test.TestCase): class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
def test_subsample_trivial(self): def test_subsample_trivial(self):
"""Test subsampling masks.""" """Test subsampling masks."""
...@@ -169,12 +174,41 @@ class DeepMACUtilsTest(tf.test.TestCase): ...@@ -169,12 +174,41 @@ class DeepMACUtilsTest(tf.test.TestCase):
features, boxes, 32) features, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32, 7)) self.assertEqual(output.shape, (5, 32, 32, 7))
def test_embedding_projection_prob_shape(self):
dist = deepmac_meta_arch.embedding_projection(
tf.ones((4, 32, 32, 8)), tf.zeros((4, 32, 32, 8)))
self.assertEqual(dist.shape, (4, 32, 32, 1))
@parameterized.parameters([1e-20, 1e20])
def test_embedding_projection_value(self, value):
dist = deepmac_meta_arch.embedding_projection(
tf.zeros((1, 1, 1, 8)), value + tf.zeros((1, 1, 1, 8))).numpy()
max_float = np.finfo(dist.dtype).max
self.assertLess(dist.max(), max_float)
self.assertGreater(dist.max(), -max_float)
@parameterized.named_parameters(
[('no_conv_shortcut', False),
('conv_shortcut', True)]
)
def test_res_dense_block(self, conv_shortcut):
net = deepmac_meta_arch.DenseResidualBlock(32, conv_shortcut)
out = net(tf.zeros((2, 32)))
self.assertEqual(out.shape, (2, 32))
@parameterized.parameters(
[4, 8, 20]
)
def test_dense_resnet(self, num_layers):
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') net = deepmac_meta_arch.DenseResNet(num_layers, 16, 8)
class DeepMACMetaArchTest(tf.test.TestCase): out = net(tf.zeros((2, 24)))
self.assertEqual(out.shape, (2, 8))
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch() @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
def test_mask_network(self): def test_mask_network(self):
net = deepmac_meta_arch.MaskHeadNetwork('hourglass10', 8) net = deepmac_meta_arch.MaskHeadNetwork('hourglass10', 8)
...@@ -203,6 +237,38 @@ class DeepMACMetaArchTest(tf.test.TestCase): ...@@ -203,6 +237,38 @@ class DeepMACMetaArchTest(tf.test.TestCase):
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True) out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32)) self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_embedding_projection_zero(self):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=8,
use_instance_embedding=False)
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 7)), tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
def test_mask_network_embedding_projection_small(self):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=-1,
use_instance_embedding=False)
call_func = tf.function(net.__call__)
out = call_func(1e6 + tf.zeros((2, 7)),
tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch()
def test_get_mask_head_input(self): def test_get_mask_head_input(self):
boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]], boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
...@@ -349,6 +415,36 @@ class DeepMACMetaArchTest(tf.test.TestCase): ...@@ -349,6 +415,36 @@ class DeepMACMetaArchTest(tf.test.TestCase):
prob = tf.nn.sigmoid(0.9).numpy() prob = tf.nn.sigmoid(0.9).numpy()
self.assertAllClose(masks, prob * np.ones((2, 3, 16, 16))) self.assertAllClose(masks, prob * np.ones((2, 3, 16, 16)))
def test_postprocess_emb_proj(self):
model = build_meta_arch(network_type='embedding_projection',
use_instance_embedding=False,
use_xy=False, pixel_embedding_dim=8,
use_dice_loss=True,
dice_loss_prediction_probability=True)
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes[:, :, [0, 2]] = 0.0
boxes[:, :, [1, 3]] = 8.0
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
self.assertEqual(masks.shape, (2, 3, 16, 16))
def test_postprocess_emb_proj_fullres(self):
model = build_meta_arch(network_type='embedding_projection',
predict_full_resolution_masks=True,
use_instance_embedding=False,
pixel_embedding_dim=8, use_xy=False,
use_dice_loss=True)
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
self.assertEqual(masks.shape, (2, 3, 128, 128))
def test_postprocess_no_crop_resize_shape(self): def test_postprocess_no_crop_resize_shape(self):
model = build_meta_arch(predict_full_resolution_masks=True) model = build_meta_arch(predict_full_resolution_masks=True)
...@@ -494,7 +590,7 @@ class FullyConnectedMaskHeadTest(tf.test.TestCase): ...@@ -494,7 +590,7 @@ class FullyConnectedMaskHeadTest(tf.test.TestCase):
class ResNetMaskHeadTest(tf.test.TestCase, parameterized.TestCase): class ResNetMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(['resnet4', 'resnet8', 'resnet20']) @parameterized.parameters(['resnet4', 'resnet8', 'resnet20'])
def test_pass(self, name): def test_forward(self, name):
net = deepmac_meta_arch.ResNetMaskNetwork(name, 8) net = deepmac_meta_arch.ResNetMaskNetwork(name, 8)
out = net(tf.zeros((3, 32, 32, 16))) out = net(tf.zeros((3, 32, 32, 16)))
self.assertEqual(out.shape[:3], (3, 32, 32)) self.assertEqual(out.shape[:3], (3, 32, 32))
......
...@@ -21,11 +21,7 @@ REQUIRED_PACKAGES = [ ...@@ -21,11 +21,7 @@ REQUIRED_PACKAGES = [
'lvis', 'lvis',
'scipy', 'scipy',
'pandas', 'pandas',
# tensorflow 2.5.0 requires grpcio~=1.34.0. 'tf-models-official>=2.5.1',
# tf-models-official (which requires google-could-bigquery) ends
# up installing the latest grpcio which causes problems later.
'google-cloud-bigquery==1.21.0',
'tf-models-official',
] ]
setup( setup(
......