Unverified commit 1f8b5b27, authored by Simon Geisler and committed by GitHub

Merge branch 'master' into master

parents 0eeeaf98 8fcf177e
......@@ -311,8 +311,7 @@ def resnet50(num_classes,
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(
num_classes,
kernel_initializer=tf.compat.v1.keras.initializers.random_normal(
stddev=0.01),
kernel_initializer=tf.initializers.random_normal(stddev=0.01),
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='fc1000')(
......
......@@ -16,9 +16,8 @@
import orbit
import tensorflow as tf
from official.modeling import grad_utils
from official.modeling import performance
from official.staging.training import grad_utils
from official.utils.flags import core as flags_core
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import imagenet_preprocessing
......
......@@ -46,7 +46,7 @@ def char_accuracy(predictions, targets, rej_char, streaming=False):
correct_chars, weights), axis=1),
tf.reduce_sum(input_tensor=weights, axis=1))
if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example)
return tf.metrics.mean(accuracy_per_example)
else:
return tf.reduce_mean(input_tensor=accuracy_per_example)
......@@ -87,6 +87,6 @@ def sequence_accuracy(predictions, targets, rej_char, streaming=False):
accuracy_per_example = tf.cast(
tf.equal(correct_chars_counts, target_chars_counts), dtype=tf.float32)
if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example)
return tf.metrics.mean(accuracy_per_example)
else:
return tf.reduce_mean(input_tensor=accuracy_per_example)
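# Hedged aside (illustration only, not part of this diff): tf.contrib.metrics.
# streaming_mean above is a TF1-style streaming metric. Under TF2 eager
# execution, the same running mean over per-example accuracies could be kept
# with a stateful Keras metric.
import tensorflow as tf

streaming_accuracy = tf.keras.metrics.Mean()
streaming_accuracy.update_state([0.5, 1.0, 0.0])  # accumulate a batch of per-example accuracies
print(float(streaming_accuracy.result()))         # 0.5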
# DeepLab: Deep Labelling for Semantic Image Segmentation
**To new and existing DeepLab users**: We have released a unified codebase for
dense pixel labeling tasks in TensorFlow 2 at https://github.com/google-research/deeplab2.
Please consider switching to the newer codebase for better support.
DeepLab is a state-of-the-art deep learning model for semantic image segmentation,
where the goal is to assign semantic labels (e.g., person, dog, cat and so on)
to every pixel in the input image. The current implementation includes the following
......
......@@ -263,7 +263,8 @@ def _build_classification_loss(loss_config):
elif loss_type == 'weighted_dice_classification_loss':
config = loss_config.weighted_dice_classification_loss
return losses.WeightedDiceClassificationLoss(
squared_normalization=config.squared_normalization)
squared_normalization=config.squared_normalization,
is_prediction_probability=config.is_prediction_probability)
else:
raise ValueError('Empty loss config.')
......@@ -916,7 +916,9 @@ def keypoint_proto_to_params(kp_config, keypoint_map_dict):
regress_head_kernel_sizes=regress_head_kernel_sizes,
score_distance_multiplier=kp_config.score_distance_multiplier,
std_dev_multiplier=kp_config.std_dev_multiplier,
rescoring_threshold=kp_config.rescoring_threshold)
rescoring_threshold=kp_config.rescoring_threshold,
gaussian_denom_ratio=kp_config.gaussian_denom_ratio,
argmax_postprocessing=kp_config.argmax_postprocessing)
def object_detection_proto_to_params(od_config):
......@@ -981,7 +983,8 @@ def object_center_proto_to_params(oc_config):
use_labeled_classes=oc_config.use_labeled_classes,
keypoint_weights_for_center=keypoint_weights_for_center,
center_head_num_filters=center_head_num_filters,
center_head_kernel_sizes=center_head_kernel_sizes)
center_head_kernel_sizes=center_head_kernel_sizes,
peak_max_pool_kernel_size=oc_config.peak_max_pool_kernel_size)
def mask_proto_to_params(mask_config):
......
......@@ -126,6 +126,8 @@ class ModelBuilderTF2Test(
score_distance_multiplier: 11.0
std_dev_multiplier: 2.8
rescoring_threshold: 0.5
gaussian_denom_ratio: 0.3
argmax_postprocessing: True
"""
if customize_head_params:
task_proto_txt += """
......@@ -158,6 +160,7 @@ class ModelBuilderTF2Test(
beta: 4.0
}
}
peak_max_pool_kernel_size: 5
"""
if customize_head_params:
proto_txt += """
......@@ -319,6 +322,7 @@ class ModelBuilderTF2Test(
else:
self.assertEqual(model._center_params.center_head_num_filters, [256])
self.assertEqual(model._center_params.center_head_kernel_sizes, [3])
self.assertEqual(model._center_params.peak_max_pool_kernel_size, 5)
# Check object detection related parameters.
self.assertAlmostEqual(model._od_params.offset_loss_weight, 0.1)
......@@ -376,6 +380,8 @@ class ModelBuilderTF2Test(
self.assertEqual(kp_params.heatmap_head_kernel_sizes, [3])
self.assertEqual(kp_params.offset_head_num_filters, [256])
self.assertEqual(kp_params.offset_head_kernel_sizes, [3])
self.assertAlmostEqual(kp_params.gaussian_denom_ratio, 0.3)
self.assertEqual(kp_params.argmax_postprocessing, True)
# Check mask related parameters.
self.assertAlmostEqual(model._mask_params.task_loss_weight, 0.7)
......
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "qENhcLrkK9hX"
},
"source": [
"# Generate SSD anchor box aspect ratios using k-means clustering\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KD164da8WQ0U"
},
"source": [
"Many object detection models use anchor boxes as a region-sampling strategy, so that during training, the model learns to match one of several pre-defined anchor boxes to the ground truth bounding boxes. To optimize the accuracy and efficiency of your object detection model, it's helpful if you tune these anchor boxes to fit your model dataset, because the configuration files that comes with TensorFlow's trained checkpoints include aspect ratios that are intended to cover a very broad set of objects.\n",
"\n",
"So in this notebook tutorial, you'll learn how to discover a set of aspect ratios that are custom-fit for your dataset, as discovered through k-means clustering of all the ground-truth bounding-box ratios.\n",
"\n",
"For demonstration purpsoses, we're using a subset of the [PETS dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/) (cats and dogs), which matches some other model training tutorials out there (such as [this one for the Edge TPU](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb#scrollTo=LvEMJSafnyEC)), but you can use this script with a different dataset, and we'll show how to tune it to meet your model's goals, including how to optimize speed over accuracy or accuracy over speed.\n",
"\n",
"The result of this notebook is a new [pipeline `.config` file](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md) that you can copy into your model training script. With the new customized anchor box configuration, you should observe a faster training pipeline and slightly improved model accuracy.\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cNBjMwIvCrhf"
},
"source": [
"## Get the required libraries"
]
},
{
"cell_type": "code",
"metadata": {
"id": "hCQlBGJkZTR2"
},
"source": [
"import tensorflow as tf"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "aw-Ba-5RUhMs"
},
"source": [
"# Install the tensorflow Object Detection API...\n",
"# If you're running this offline, you also might need to install the protobuf-compiler:\n",
"# apt-get install protobuf-compiler\n",
"\n",
"! git clone -n https://github.com/tensorflow/models.git\n",
"%cd models\n",
"!git checkout 461b3587ef38b42cda151fa3b7d37706d77e4244\n",
"%cd research\n",
"! protoc object_detection/protos/*.proto --python_out=.\n",
"\n",
"# Install TensorFlow Object Detection API\n",
"%cp object_detection/packages/tf2/setup.py .\n",
"! python -m pip install --upgrade pip\n",
"! python -m pip install --use-feature=2020-resolver .\n",
"\n",
"# Test the installation\n",
"! python object_detection/builders/model_builder_tf2_test.py"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "InjvvtaMECr9"
},
"source": [
"## Prepare the dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T62-oddjEH8r"
},
"source": [
"Although this notebook does not perform model training, you need to use the same dataset here that you'll use when training the model.\n",
"\n",
"To find the best anchor box ratios, you should use all of your training dataset (or as much of it as is reasonable). That's because, as mentioned in the introduction, you want to measure the precise variety of images that you expect your model to encounter—anything less and the anchor boxes might not cover the variety of objects you model encounters, so it might have weak accuracy. (Whereas the alternative, in which the ratios are based on data that is beyond the scope of your model's application, usually creates an inefficient model that can also have weaker accuracy.)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "sKYfhq7CKZ4B"
},
"source": [
"%mkdir /content/dataset\n",
"%cd /content/dataset\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz\n",
"! tar zxf images.tar.gz\n",
"! tar zxf annotations.tar.gz"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "44vtL0nsAqXg"
},
"source": [
"In this case, we want to reduce the PETS dataset to match the collection of cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)):\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "8gcUoBU2K_s7"
},
"source": [
"! cp /content/dataset/annotations/list.txt /content/dataset/annotations/list_petsdataset.txt\n",
"! cp /content/dataset/annotations/trainval.txt /content/dataset/annotations/trainval_petsdataset.txt\n",
"! cp /content/dataset/annotations/test.txt /content/dataset/annotations/test_petsdataset.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/list_petsdataset.txt > /content/dataset/annotations/list.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/list_petsdataset.txt >> /content/dataset/annotations/list.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/trainval_petsdataset.txt > /content/dataset/annotations/trainval.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/trainval_petsdataset.txt >> /content/dataset/annotations/trainval.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/test_petsdataset.txt > /content/dataset/annotations/test.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/test_petsdataset.txt >> /content/dataset/annotations/test.txt"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Cs_71ZXMOctb"
},
"source": [
"## Find the aspect ratios using k-means"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R3k5WrMYHPyL"
},
"source": [
"We are trying to find a group of aspect ratios that overlap the majority of object shapes in the dataset. We do that by finding common clusters of bounding boxes of the dataset, using the k-means clustering algorithm to find centroids of these clusters.\n",
"\n",
"To help with this, we need to calculate following:\n",
"\n",
"+ The k-means cluster centroids of the given bounding boxes\n",
"(see the `kmeans_aspect_ratios()` function below).\n",
"\n",
"+ The average intersection of bounding boxes with given aspect ratios.\n",
"(see the `average_iou()` function below).\n",
"This does not affect the outcome of the final box ratios, but serves as a useful metric for you to decide whether the selected boxes are effective and whether you want to try with more/fewer aspect ratios. (We'll discuss this score more below.)\n",
"\n",
"**NOTE:**\n",
"The term \"centroid\" used here refers to the center of the k-means cluster (the boxes (height,width) vector)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "vCB8Dfs0Xlyv"
},
"source": [
"import sys\n",
"import glob\n",
"import numpy as np\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"def xml_to_boxes(path, classes, rescale_width=None, rescale_height=None):\n",
" \"\"\"Extracts bounding-box widths and heights from ground-truth dataset.\n",
"\n",
" Args:\n",
" path : Path to .xml annotation files for your dataset.\n",
" classes : List of classes that are part of dataset.\n",
" rescale_width : Scaling factor to rescale width of bounding box.\n",
" rescale_height : Scaling factor to rescale height of bounding box.\n",
"\n",
" Returns:\n",
" bboxes : A numpy array with pairs of box dimensions as [width, height].\n",
" \"\"\"\n",
"\n",
" xml_list = []\n",
" for clss in classes:\n",
" for xml_file in glob.glob(path + '/'+clss+'*'):\n",
" if xml_file.endswith('.xml'):\n",
" tree = ET.parse(xml_file)\n",
" root = tree.getroot()\n",
" for member in root.findall('object'):\n",
" bndbox = member.find('bndbox')\n",
" bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
" bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
" if rescale_width and rescale_height:\n",
" size = root.find('size')\n",
" bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
" bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
"\n",
" xml_list.append([bbox_width, bbox_height])\n",
" else:\n",
" continue\n",
" bboxes = np.array(xml_list)\n",
" return bboxes\n",
"\n",
"\n",
"def average_iou(bboxes, anchors):\n",
" \"\"\"Calculates the Intersection over Union (IoU) between bounding boxes and\n",
" anchors.\n",
"\n",
" Args:\n",
" bboxes : Array of bounding boxes in [width, height] format.\n",
" anchors : Array of aspect ratios [n, 2] format.\n",
"\n",
" Returns:\n",
" avg_iou_perc : A Float value, average of IOU scores from each aspect ratio\n",
" \"\"\"\n",
" intersection_width = np.minimum(anchors[:, [0]], bboxes[:, 0]).T\n",
" intersection_height = np.minimum(anchors[:, [1]], bboxes[:, 1]).T\n",
"\n",
" if np.any(intersection_width == 0) or np.any(intersection_height == 0):\n",
" raise ValueError(\"Some boxes have zero size.\")\n",
"\n",
" intersection_area = intersection_width * intersection_height\n",
" boxes_area = np.prod(bboxes, axis=1, keepdims=True)\n",
" anchors_area = np.prod(anchors, axis=1, keepdims=True).T\n",
" union_area = boxes_area + anchors_area - intersection_area\n",
" avg_iou_perc = np.mean(np.max(intersection_area / union_area, axis=1)) * 100\n",
"\n",
" return avg_iou_perc\n",
"\n",
"def kmeans_aspect_ratios(bboxes, kmeans_max_iter, num_aspect_ratios):\n",
" \"\"\"Calculate the centroid of bounding boxes clusters using Kmeans algorithm.\n",
"\n",
" Args:\n",
" bboxes : Array of bounding boxes in [width, height] format.\n",
" kmeans_max_iter : Maximum number of iterations to find centroids.\n",
" num_aspect_ratios : Number of centroids to optimize kmeans.\n",
"\n",
" Returns:\n",
" aspect_ratios : Centroids of cluster (optmised for dataset).\n",
" avg_iou_prec : Average score of bboxes intersecting with new aspect ratios.\n",
" \"\"\"\n",
"\n",
" assert len(bboxes), \"You must provide bounding boxes\"\n",
"\n",
" normalized_bboxes = bboxes / np.sqrt(bboxes.prod(axis=1, keepdims=True))\n",
"\n",
" # Using kmeans to find centroids of the width/height clusters\n",
" kmeans = KMeans(\n",
" init='random', n_clusters=num_aspect_ratios,random_state=0, max_iter=kmeans_max_iter)\n",
" kmeans.fit(X=normalized_bboxes)\n",
" ar = kmeans.cluster_centers_\n",
"\n",
" assert len(ar), \"Unable to find k-means centroid, try increasing kmeans_max_iter.\"\n",
"\n",
" avg_iou_perc = average_iou(normalized_bboxes, ar)\n",
"\n",
" if not np.isfinite(avg_iou_perc):\n",
" sys.exit(\"Failed to get aspect ratios due to numerical errors in k-means\")\n",
"\n",
" aspect_ratios = [w/h for w,h in ar]\n",
"\n",
" return aspect_ratios, avg_iou_perc"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "eU2SuLvu55Ds"
},
"source": [
"In the next code block, we'll call the above functions to discover the ideal anchor box aspect ratios.\n",
"\n",
"You can tune the parameters below to suit your performance objectives.\n",
"\n",
"Most importantly, you should consider the number of aspect ratios you want to generate. At opposite ends of the decision spectrum, there are two objectives you might seek:\n",
"\n",
"1. **Low accuracy and fast inference**: Try 2-3 aspect ratios. \n",
" * This is if your application is okay with accuracy or confidence scores around/below 80%.\n",
" * The average IOU score (from `avg_iou_perc`) will be around 70-85.\n",
" * This reduces the model's overall computations during inference, which makes inference faster.\n",
"\n",
"2. **High accuracy and slow inference**: Try 5-6 aspect ratios.\n",
" * This is if your application requires accuracy or confidence scores around 95%.\n",
" * The average IOU score (from `avg_iou_perc`) should be over 95.\n",
" * This increases the model's overall computations during inference, which makes inference slower.\n",
"\n",
"The initial configuration below aims somewhere in between: it searches for 4 aspect ratios.\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "cNw-vX3nfl1g"
},
"source": [
"classes = ['Abyssinian','american_bulldog']\n",
"xml_path = '/content/dataset/annotations/xmls'\n",
"\n",
"# Tune this based on your accuracy/speed goals as described above\n",
"num_aspect_ratios = 4 # can be [2,3,4,5,6]\n",
"\n",
"# Tune the iterations based on the size and distribution of your dataset\n",
"# You can check avg_iou_prec every 100 iterations to see how centroids converge\n",
"kmeans_max_iter = 500\n",
"\n",
"# These should match the training pipeline config ('fixed_shape_resizer' param)\n",
"width = 320\n",
"height = 320\n",
"\n",
"# Get the ground-truth bounding boxes for our dataset\n",
"bboxes = xml_to_boxes(path=xml_path, classes=classes,\n",
" rescale_width=width, rescale_height=height)\n",
"\n",
"aspect_ratios, avg_iou_perc = kmeans_aspect_ratios(\n",
" bboxes=bboxes,\n",
" kmeans_max_iter=kmeans_max_iter,\n",
" num_aspect_ratios=num_aspect_ratios)\n",
"\n",
"aspect_ratios = sorted(aspect_ratios)\n",
"\n",
"print('Aspect ratios generated:', [round(ar,2) for ar in aspect_ratios])\n",
"print('Average IOU with anchors:', avg_iou_perc)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0xHqOpuxgmD0"
},
"source": [
"## Generate a new pipeline config file"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZB6jqVT6gpmT"
},
"source": [
"That's it. Now we just need the `.config` file your model started with, and we'll merge the new `ssd_anchor_generator` properties into it."
]
},
{
"cell_type": "code",
"metadata": {
"id": "AlMffd3rgKW2"
},
"source": [
"import tensorflow as tf\n",
"from google.protobuf import text_format\n",
"from object_detection.protos import pipeline_pb2\n",
"\n",
"pipeline = pipeline_pb2.TrainEvalPipelineConfig()\n",
"config_path = '/content/models/research/object_detection/samples/configs/ssdlite_mobiledet_edgetpu_320x320_coco_sync_4x4.config'\n",
"pipeline_save = '/content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config'\n",
"with tf.io.gfile.GFile(config_path, \"r\") as f:\n",
" proto_str = f.read()\n",
" text_format.Merge(proto_str, pipeline)\n",
"pipeline.model.ssd.num_classes = 2\n",
"while pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios:\n",
" pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios.pop()\n",
"\n",
"for i in range(len(aspect_ratios)):\n",
" pipeline.model.ssd.anchor_generator.ssd_anchor_generator.aspect_ratios.append(aspect_ratios[i])\n",
"\n",
"config_text = text_format.MessageToString(pipeline)\n",
"with tf.io.gfile.GFile(pipeline_save, \"wb\") as f:\n",
" f.write(config_text)\n",
"# Check for updated aspect ratios in the config\n",
"!cat /content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "3kzWdu7ai1om"
},
"source": [
"## Summary and next steps"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FltDhShbi06h"
},
"source": [
"If you look at the new `.config` file printed above, you'll find the `anchor_generator` specification, which includes the new `aspect_ratio` values that we generated with the k-means code above.\n",
"\n",
"The original config file ([`ssdlite_mobiledet_edgetpu_320x320_coco_sync_4x4.config`](https://github.com/tensorflow/models/blob/master/research/object_detection/samples/configs/ssd_mobilenet_v1_pets.config)) did have some default anchor box aspect ratios already, but we've replaced those with values that are optimized for our dataset. These new anchor boxes should improve the model accuracy (compared to the default anchors) and speed up the training process.\n",
"\n",
"If you want to use this configuration to train a model, then check out this tutorial to [retrain MobileDet for the Coral Edge TPU](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb), which uses this exact cats/dogs dataset. Just copy the `.config` file printed above and add it to that training notebook. (Or download the file from the **Files** panel on the left side of the Colab UI: it's called `ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config`.)\n",
"\n",
"For more information about the pipeline configuration file, read [Configuring the Object Detection Training Pipeline](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md).\n",
"\n",
"### About anchor scales...\n",
"\n",
"This notebook is focused on anchor box aspect ratios because that's often the most difficult to tune for each dataset. But you should also consider different configurations for the anchor box scales, which specify the number of different anchor box sizes and their min/max sizes—which affects how well your model detects objects of varying sizes.\n",
"\n",
"Tuning the anchor scales is much easier to do by hand, by estimating the min/max sizes you expect the model to encounter in your application environment. Just like when choosing the number of aspect ratios above, the number of different box sizes also affects your model accuracy and speed (using more box scales is more accurate, but also slower).\n",
"\n",
"You can also read more about anchor scales in [Configuring the Object Detection Training Pipeline](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/configuring_jobs.md).\n",
"\n"
]
}
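,
{
"cell_type": "code",
"metadata": {},
"source": [
"# Optional, hedged sketch: besides the aspect ratios above, you can also adjust\n",
"# the anchor scales by hand in the same pipeline proto. The min_scale/max_scale\n",
"# field names below are assumed from the ssd_anchor_generator proto; verify them\n",
"# against your installed object_detection protos before relying on this.\n",
"pipeline.model.ssd.anchor_generator.ssd_anchor_generator.min_scale = 0.2\n",
"pipeline.model.ssd.anchor_generator.ssd_anchor_generator.max_scale = 0.95\n",
"print(pipeline.model.ssd.anchor_generator.ssd_anchor_generator)"
],
"execution_count": null,
"outputs": []
}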
]
}
\ No newline at end of file
......@@ -286,15 +286,19 @@ class WeightedDiceClassificationLoss(Loss):
"""
def __init__(self, squared_normalization):
def __init__(self, squared_normalization, is_prediction_probability=False):
"""Initializes the loss object.
Args:
squared_normalization: boolean, if set, we square the probabilities in the
denominator term used for normalization.
is_prediction_probability: boolean, whether or not the input
prediction_tensor represents a probability. If false, it is
first converted to a probability by applying sigmoid.
"""
self._squared_normalization = squared_normalization
self.is_prediction_probability = is_prediction_probability
super(WeightedDiceClassificationLoss, self).__init__()
def _compute_loss(self,
......@@ -332,6 +336,9 @@ class WeightedDiceClassificationLoss(Loss):
tf.shape(prediction_tensor)[2]),
[1, 1, -1])
if self.is_prediction_probability:
prob_tensor = prediction_tensor
else:
prob_tensor = tf.nn.sigmoid(prediction_tensor)
if self._squared_normalization:
......
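# Hedged illustration (toy example, not the repo's code) of what the new
# is_prediction_probability flag changes inside the dice loss: when the inputs
# are already probabilities the sigmoid is skipped, otherwise it is applied.
import tensorflow as tf

def _to_probabilities(prediction_tensor, is_prediction_probability=False):
  """Mirrors the prob_tensor selection shown in the diff above."""
  if is_prediction_probability:
    return prediction_tensor
  return tf.nn.sigmoid(prediction_tensor)

logits = tf.constant([[[2.0, -2.0]]])                  # [batch, num_pixels, num_classes]
print(_to_probabilities(logits))                       # sigmoid applied to logits
print(_to_probabilities(tf.nn.sigmoid(logits), True))  # probabilities passed through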
......@@ -388,6 +388,28 @@ def _clip_window_prune_boxes(sorted_boxes, clip_window, pad_to_max_output_size,
return sorted_boxes, num_valid_nms_boxes_cumulative
def _clip_boxes(boxes, clip_window):
"""Clips boxes to the given window.
Args:
boxes: A [batch, num_boxes, 4] float32 tensor containing box coordinates in
[ymin, xmin, ymax, xmax] form.
clip_window: A [batch, 4] float32 tensor with the top-left and bottom-right
coordinates of the window in [ymin, xmin, ymax, xmax] form.
Returns:
A [batch, num_boxes, 4] float32 tensor containing boxes clipped to the given
window.
"""
ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
clipped_ymin = tf.maximum(ymin, clip_window[:, 0, tf.newaxis])
clipped_xmin = tf.maximum(xmin, clip_window[:, 1, tf.newaxis])
clipped_ymax = tf.minimum(ymax, clip_window[:, 2, tf.newaxis])
clipped_xmax = tf.minimum(xmax, clip_window[:, 3, tf.newaxis])
return tf.stack([clipped_ymin, clipped_xmin, clipped_ymax, clipped_xmax],
axis=-1)
class NullContextmanager(object):
def __enter__(self):
......@@ -985,10 +1007,10 @@ def batch_multiclass_non_max_suppression(boxes,
raise ValueError('Soft NMS is not supported by combined_nms.')
if use_class_agnostic_nms:
raise ValueError('class-agnostic NMS is not supported by combined_nms.')
if clip_window is not None:
if clip_window is None:
tf.logging.warning(
'clip_window is not supported by combined_nms unless it is'
' [0. 0. 1. 1.] for each image.')
'A default clip window of [0. 0. 1. 1.] will be applied for the '
'boxes.')
if additional_fields is not None:
tf.logging.warning('additional_fields is not supported by combined_nms.')
if parallel_iterations != 32:
......@@ -1007,7 +1029,14 @@ def batch_multiclass_non_max_suppression(boxes,
max_total_size=max_total_size,
iou_threshold=iou_thresh,
score_threshold=score_thresh,
clip_boxes=(True if clip_window is None else False),
pad_per_class=use_static_shapes)
if clip_window is not None:
if clip_window.shape.ndims == 1:
boxes_shape = boxes.shape
batch_size = shape_utils.get_dim_as_int(boxes_shape[0])
clip_window = tf.tile(clip_window[tf.newaxis, :], [batch_size, 1])
batch_nmsed_boxes = _clip_boxes(batch_nmsed_boxes, clip_window)
# Not supported by combined_non_max_suppression.
batch_nmsed_masks = None
# Not supported by combined_non_max_suppression.
......
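# Hedged, self-contained sketch (toy values, not the repo's code) of the new
# post-combined-NMS clipping path shown above: a rank-1 clip_window is tiled to
# [batch, 4] and each [ymin, xmin, ymax, xmax] box is clamped to it.
import tensorflow as tf

boxes = tf.constant([[[-0.1, 0.2, 0.5, 1.3],
                      [0.3, 0.4, 0.9, 0.8]]])    # [batch=1, num_boxes=2, 4]
clip_window = tf.constant([0.0, 0.0, 1.0, 1.0])  # rank-1 window
clip_window = tf.tile(clip_window[tf.newaxis, :], [boxes.shape[0], 1])
ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
clipped = tf.stack([tf.maximum(ymin, clip_window[:, 0, tf.newaxis]),
                    tf.maximum(xmin, clip_window[:, 1, tf.newaxis]),
                    tf.minimum(ymax, clip_window[:, 2, tf.newaxis]),
                    tf.minimum(xmax, clip_window[:, 3, tf.newaxis])], axis=-1)
print(clipped[0, 0].numpy())                     # [0.  0.2 0.5 1. ]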
......@@ -961,7 +961,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
width,
gt_boxes_list,
gt_classes_list,
gt_weights_list=None):
gt_weights_list=None,
maximum_normalized_coordinate=1.1):
"""Computes the object center heatmap target.
Args:
......@@ -977,6 +978,9 @@ class CenterNetCenterHeatmapTargetAssigner(object):
in the gt_boxes_list.
gt_weights_list: A list of float tensors with shape [num_boxes]
representing the weight of each groundtruth detection box.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns:
heatmap: A Tensor of size [batch_size, output_height, output_width,
......@@ -1002,7 +1006,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
boxes = box_list_ops.to_absolute_coordinates(
boxes,
tf.maximum(height // self._stride, 1),
tf.maximum(width // self._stride, 1))
tf.maximum(width // self._stride, 1),
maximum_normalized_coordinate=maximum_normalized_coordinate)
# Get the box center coordinates. Each returned tensors have the shape of
# [num_instances]
(y_center, x_center, boxes_height,
......
......@@ -782,6 +782,269 @@ def prediction_to_single_instance_keypoints(
return keypoint_candidates, keypoint_scores, None
def _gaussian_weighted_map_const_multi(
y_grid, x_grid, heatmap, points_y, points_x, boxes,
gaussian_denom_ratio):
"""Rescores heatmap using the distance information.
The function is called when the candidate_ranking_mode in the
KeypointEstimationParams is set to be 'gaussian_weighted_const'. The
keypoint candidates are ranked using the formula:
heatmap_score * exp((-distances^2) / gaussian_denom)
where 'gaussian_denom' is determined by:
min(output_feature_height, output_feature_width) * gaussian_denom_ratio
and 'distances' are the distances between the grid coordinates and the target
points.
Note that the postfix 'const' refers to the fact that the denominator is a
constant given the input image size, not scaled by the size of each of the
instances.
Args:
y_grid: A float tensor with shape [height, width] representing the
y-coordinate of each pixel grid.
x_grid: A float tensor with shape [height, width] representing the
x-coordinate of each pixel grid.
heatmap: A float tensor with shape [batch_size, height, width,
num_keypoints] representing the heatmap to be rescored.
points_y: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the y coordinates of the target points for
each channel.
points_x: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the x coordinates of the target points for
each channel.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
gaussian_denom_ratio: A constant used in the above formula that determines
the denominator of the Gaussian kernel.
Returns:
A float tensor with shape [batch_size, height, width, channel] representing
the rescored heatmap.
"""
batch_size, num_instances, _ = _get_shape(boxes, 3)
_, height, width, num_keypoints = _get_shape(heatmap, 4)
# [batch_size, height, width, num_instances, num_keypoints].
# Note that we intentionally avoid using tf.newaxis as TfLite converter
# doesn't like it.
y_diff = (
tf.reshape(y_grid, [1, height, width, 1, 1]) -
tf.reshape(points_y, [batch_size, 1, 1, num_instances, num_keypoints]))
x_diff = (
tf.reshape(x_grid, [1, height, width, 1, 1]) -
tf.reshape(points_x, [batch_size, 1, 1, num_instances, num_keypoints]))
distance_square = y_diff**2 + x_diff**2
y_min, x_min, y_max, x_max = tf.split(boxes, 4, axis=2)
# Make the mask with all 1.0 in the box regions.
# Shape: [batch_size, height, width, num_instances]
in_boxes = tf.math.logical_and(
tf.math.logical_and(
tf.reshape(y_grid, [1, height, width, 1]) >= tf.reshape(
y_min, [batch_size, 1, 1, num_instances]),
tf.reshape(y_grid, [1, height, width, 1]) < tf.reshape(
y_max, [batch_size, 1, 1, num_instances])),
tf.math.logical_and(
tf.reshape(x_grid, [1, height, width, 1]) >= tf.reshape(
x_min, [batch_size, 1, 1, num_instances]),
tf.reshape(x_grid, [1, height, width, 1]) < tf.reshape(
x_max, [batch_size, 1, 1, num_instances])))
in_boxes = tf.cast(in_boxes, dtype=tf.float32)
gaussian_denom = tf.cast(
tf.minimum(height, width), dtype=tf.float32) * gaussian_denom_ratio
# shape: [batch_size, height, width, num_instances, num_keypoints]
gaussian_map = tf.exp((-1 * distance_square) / gaussian_denom)
return tf.expand_dims(
heatmap, axis=3) * gaussian_map * tf.reshape(
in_boxes, [batch_size, height, width, num_instances, 1])
def prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap_predictions,
keypoint_heatmap_offsets,
keypoint_score_heatmap=None):
"""Converts keypoint heatmap predictions and offsets to keypoint candidates.
This function is similar to the 'prediction_tensors_to_single_instance_kpts'
function except that the input keypoint_heatmap_predictions is prepared to
have an additional 'num_instances' dimension for multi-instance prediction.
Args:
keypoint_heatmap_predictions: A float tensor of shape [batch_size, height,
width, num_instances, num_keypoints] representing the per-keypoint and
per-instance heatmaps which is used for finding the best keypoint
candidate locations.
keypoint_heatmap_offsets: A float tensor of shape [batch_size, height,
width, 2 * num_keypoints] representing the per-keypoint offsets.
keypoint_score_heatmap: (optional) A float tensor of shape [batch_size,
height, width, num_keypoints] representing the heatmap
which is used for reporting the confidence scores. If not provided, then
the values in the keypoint_heatmap_predictions will be used.
Returns:
keypoint_candidates: A float tensor of shape
[batch_size, num_instances, num_keypoints, 2] holding the
location of keypoint candidates in [y, x] format (expressed in absolute
coordinates in the output coordinate frame).
keypoint_scores: A float tensor of shape
[batch_size, num_instances, num_keypoints] with the scores for each
keypoint candidate. The scores come directly from the heatmap predictions.
"""
batch_size, height, width, num_instances, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 5)
# [batch_size, height * width, num_instances * num_keypoints].
feature_map_flattened = tf.reshape(
keypoint_heatmap_predictions,
[batch_size, -1, num_instances * num_keypoints])
# [batch_size, num_instances * num_keypoints].
peak_flat_indices = tf.math.argmax(
feature_map_flattened, axis=1, output_type=tf.dtypes.int32)
# Get x and y indices corresponding to the top indices in the flat array.
y_indices, x_indices = (
row_col_indices_from_flattened_indices(peak_flat_indices, width))
# [batch_size * num_instances * num_keypoints].
y_indices = tf.reshape(y_indices, [-1])
x_indices = tf.reshape(x_indices, [-1])
# Prepare the indices to gather the offsets from the keypoint_heatmap_offsets.
batch_idx = _multi_range(
limit=batch_size, value_repetitions=num_keypoints * num_instances)
kpts_idx = _multi_range(
limit=num_keypoints, value_repetitions=1,
range_repetitions=batch_size * num_instances)
combined_indices = tf.stack([
batch_idx,
y_indices,
x_indices,
kpts_idx
], axis=1)
keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, 2])
# Retrieve the keypoint offsets: shape:
# [batch_size * num_instance * num_keypoints, 2].
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
keypoint_candidates = tf.stack([
tf.cast(y_indices, dtype=tf.float32) + tf.expand_dims(y_offsets, axis=0),
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2)
keypoint_candidates = tf.reshape(
keypoint_candidates, [batch_size, num_instances, num_keypoints, 2])
if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd(
tf.reduce_max(keypoint_heatmap_predictions, axis=3), combined_indices)
else:
keypoint_scores = tf.gather_nd(keypoint_score_heatmap, combined_indices)
return keypoint_candidates, tf.reshape(
keypoint_scores, [batch_size, num_instances, num_keypoints])
def prediction_to_keypoints_argmax(
prediction_dict,
object_y_indices,
object_x_indices,
boxes,
task_name,
kp_params):
"""Postprocess function to predict multi instance keypoints with argmax op.
This is a different implementation of the original keypoint postprocessing
function such that it avoids using topk op (replaced by argmax) as it runs
much slower in the browser.
Args:
prediction_dict: a dictionary holding predicted tensors, returned from the
predict() method. This dictionary should contain keypoint prediction
feature maps for each keypoint task.
object_y_indices: A float tensor of shape [batch_size, max_instances]
representing the location indices of the object centers.
object_x_indices: A float tensor of shape [batch_size, max_instances]
representing the location indices of the object centers.
boxes: A tensor of shape [batch_size, num_instances, 4] with predicted
bounding boxes for each instance, expressed in the output coordinate
frame.
task_name: string, the name of the task this namedtuple corresponds to.
Note that it should be a unique identifier of the task.
kp_params: A `KeypointEstimationParams` object with parameters for a single
keypoint class.
Returns:
A tuple of two tensors:
keypoint_candidates: A float tensor with shape [batch_size,
num_instances, num_keypoints, 2] representing the yx-coordinates of
the keypoints in the output feature map space.
keypoint_scores: A float tensor with shape [batch_size, num_instances,
num_keypoints] representing the keypoint prediction scores.
Raises:
ValueError: if the candidate_ranking_mode is not supported.
"""
keypoint_heatmap = tf.nn.sigmoid(prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1])
keypoint_offset = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_OFFSET)][-1]
keypoint_regression = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
batch_size, height, width, num_keypoints = _get_shape(keypoint_heatmap, 4)
# Create the y,x grids: [height, width]
(y_grid, x_grid) = ta_utils.image_shape_to_grids(height, width)
# Prepare the indices to retrieve the information from object centers.
num_instances = _get_shape(object_y_indices, 2)[1]
combined_obj_indices = tf.stack([
_multi_range(batch_size, value_repetitions=num_instances),
tf.reshape(object_y_indices, [-1]),
tf.reshape(object_x_indices, [-1])
], axis=1)
# Select the regression vectors from the object center.
selected_regression_flat = tf.gather_nd(
keypoint_regression, combined_obj_indices)
selected_regression = tf.reshape(
selected_regression_flat, [batch_size, num_instances, num_keypoints, 2])
(y_reg, x_reg) = tf.unstack(selected_regression, axis=3)
# shape: [batch_size, num_instances, num_keypoints].
y_regressed = tf.cast(
tf.reshape(object_y_indices, [batch_size, num_instances, 1]),
dtype=tf.float32) + y_reg
x_regressed = tf.cast(
tf.reshape(object_x_indices, [batch_size, num_instances, 1]),
dtype=tf.float32) + x_reg
if kp_params.candidate_ranking_mode == 'gaussian_weighted_const':
rescored_heatmap = _gaussian_weighted_map_const_multi(
y_grid, x_grid, keypoint_heatmap, y_regressed, x_regressed, boxes,
kp_params.gaussian_denom_ratio)
# shape: [batch_size, height, width, num_keypoints].
keypoint_score_heatmap = tf.math.reduce_max(rescored_heatmap, axis=3)
else:
raise ValueError(
'Unsupported ranking mode in the multipose no topk method: %s' %
kp_params.candidate_ranking_mode)
(keypoint_candidates,
keypoint_scores) = prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap_predictions=rescored_heatmap,
keypoint_heatmap_offsets=keypoint_offset,
keypoint_score_heatmap=keypoint_score_heatmap)
return keypoint_candidates, keypoint_scores
def regressed_keypoints_at_object_centers(regressed_keypoint_predictions,
y_indices, x_indices):
"""Returns the regressed keypoints at specified object centers.
......@@ -1533,15 +1796,9 @@ def convert_strided_predictions_to_normalized_keypoints(
keypoints, window = inputs
return keypoint_ops.clip_to_window(keypoints, window)
# Specify the TensorSpec explicitly in the tf.map_fn to make it tf.lite
# compatible.
kpts_dims = _get_shape(keypoint_coords_normalized, 4)
output_spec = tf.TensorSpec(
shape=[kpts_dims[1], kpts_dims[2], kpts_dims[3]], dtype=tf.float32)
keypoint_coords_normalized = tf.map_fn(
clip_to_window, (keypoint_coords_normalized, batch_window),
dtype=tf.float32, back_prop=False,
fn_output_signature=output_spec)
keypoint_coords_normalized = shape_utils.static_or_dynamic_map_fn(
clip_to_window, [keypoint_coords_normalized, batch_window],
dtype=tf.float32, back_prop=False)
keypoint_scores = tf.where(valid_indices, keypoint_scores,
tf.zeros_like(keypoint_scores))
return keypoint_coords_normalized, keypoint_scores
......@@ -1900,7 +2157,8 @@ class KeypointEstimationParams(
'heatmap_head_kernel_sizes', 'offset_head_num_filters',
'offset_head_kernel_sizes', 'regress_head_num_filters',
'regress_head_kernel_sizes', 'score_distance_multiplier',
'std_dev_multiplier', 'rescoring_threshold'
'std_dev_multiplier', 'rescoring_threshold', 'gaussian_denom_ratio',
'argmax_postprocessing'
])):
"""Namedtuple to host object detection related parameters.
......@@ -1948,7 +2206,9 @@ class KeypointEstimationParams(
regress_head_kernel_sizes=(3),
score_distance_multiplier=0.1,
std_dev_multiplier=1.0,
rescoring_threshold=0.0):
rescoring_threshold=0.0,
argmax_postprocessing=False,
gaussian_denom_ratio=0.1):
"""Constructor with default values for KeypointEstimationParams.
Args:
......@@ -2049,6 +2309,12 @@ class KeypointEstimationParams(
True. The detection score of an instance is set to be the average over
the scores of the keypoints whose scores are higher than the
threshold.
argmax_postprocessing: Whether to use the keypoint postprocessing logic
that replaces the topk op with argmax. Usually used when exporting the
model for predicting keypoints of multiple instances in the browser.
gaussian_denom_ratio: The ratio used to multiply the image size to
determine the denominator of the Gaussian formula. Only applicable when
the candidate_ranking_mode is set to be 'gaussian_weighted_const'.
Returns:
An initialized KeypointEstimationParams namedtuple.
......@@ -2067,7 +2333,8 @@ class KeypointEstimationParams(
heatmap_head_num_filters, heatmap_head_kernel_sizes,
offset_head_num_filters, offset_head_kernel_sizes,
regress_head_num_filters, regress_head_kernel_sizes,
score_distance_multiplier, std_dev_multiplier, rescoring_threshold)
score_distance_multiplier, std_dev_multiplier, rescoring_threshold,
argmax_postprocessing, gaussian_denom_ratio)
class ObjectCenterParams(
......@@ -2075,7 +2342,7 @@ class ObjectCenterParams(
'classification_loss', 'object_center_loss_weight', 'heatmap_bias_init',
'min_box_overlap_iou', 'max_box_predictions', 'use_labeled_classes',
'keypoint_weights_for_center', 'center_head_num_filters',
'center_head_kernel_sizes'
'center_head_kernel_sizes', 'peak_max_pool_kernel_size'
])):
"""Namedtuple to store object center prediction related parameters."""
......@@ -2090,7 +2357,8 @@ class ObjectCenterParams(
use_labeled_classes=False,
keypoint_weights_for_center=None,
center_head_num_filters=(256),
center_head_kernel_sizes=(3)):
center_head_kernel_sizes=(3),
peak_max_pool_kernel_size=3):
"""Constructor with default values for ObjectCenterParams.
Args:
......@@ -2115,6 +2383,8 @@ class ObjectCenterParams(
by the object center prediction head.
center_head_kernel_sizes: kernel size of the convolutional layers used
by the object center prediction head.
peak_max_pool_kernel_size: Max pool kernel size to use to pull off peak
score locations in a neighborhood for the object detection heatmap.
Returns:
An initialized ObjectCenterParams namedtuple.
"""
......@@ -2123,7 +2393,8 @@ class ObjectCenterParams(
object_center_loss_weight, heatmap_bias_init,
min_box_overlap_iou, max_box_predictions,
use_labeled_classes, keypoint_weights_for_center,
center_head_num_filters, center_head_kernel_sizes)
center_head_num_filters, center_head_kernel_sizes,
peak_max_pool_kernel_size)
class MaskParams(
......@@ -2627,16 +2898,12 @@ class CenterNetMetaArch(model.DetectionModel):
self.track_reid_classification_net = tf.keras.Sequential()
for _ in range(self._track_params.num_fc_layers - 1):
self.track_reid_classification_net.add(
tf.keras.layers.Dense(self._track_params.reid_embed_size,
input_shape=(
self._track_params.reid_embed_size,)))
tf.keras.layers.Dense(self._track_params.reid_embed_size))
self.track_reid_classification_net.add(
tf.keras.layers.BatchNormalization())
self.track_reid_classification_net.add(tf.keras.layers.ReLU())
self.track_reid_classification_net.add(
tf.keras.layers.Dense(self._track_params.num_track_ids,
input_shape=(
self._track_params.reid_embed_size,)))
tf.keras.layers.Dense(self._track_params.num_track_ids))
if self._temporal_offset_params is not None:
prediction_heads[TEMPORAL_OFFSET] = self._make_prediction_net_list(
num_feature_outputs, NUM_OFFSET_CHANNELS, name='temporal_offset',
......@@ -2714,7 +2981,8 @@ class CenterNetMetaArch(model.DetectionModel):
return target_assigners
def _compute_object_center_loss(self, input_height, input_width,
object_center_predictions, per_pixel_weights):
object_center_predictions, per_pixel_weights,
maximum_normalized_coordinate=1.1):
"""Computes the object center loss.
Args:
......@@ -2726,6 +2994,9 @@ class CenterNetMetaArch(model.DetectionModel):
per_pixel_weights: A float tensor of shape [batch_size,
out_height * out_width, 1] with 1s in locations where the spatial
coordinates fall within the height and width in true_image_shapes.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns:
A float scalar tensor representing the object center loss per instance.
......@@ -2752,7 +3023,8 @@ class CenterNetMetaArch(model.DetectionModel):
width=input_width,
gt_classes_list=gt_classes_list,
gt_keypoints_list=gt_keypoints_list,
gt_weights_list=gt_weights_list)
gt_weights_list=gt_weights_list,
maximum_normalized_coordinate=maximum_normalized_coordinate)
else:
gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
heatmap_targets = assigner.assign_center_targets_from_boxes(
......@@ -2760,7 +3032,8 @@ class CenterNetMetaArch(model.DetectionModel):
width=input_width,
gt_boxes_list=gt_boxes_list,
gt_classes_list=gt_classes_list,
gt_weights_list=gt_weights_list)
gt_weights_list=gt_weights_list,
maximum_normalized_coordinate=maximum_normalized_coordinate)
flattened_heatmap_targets = _flatten_spatial_dimensions(heatmap_targets)
num_boxes = _to_float32(get_num_instances_from_weights(gt_weights_list))
......@@ -3577,7 +3850,9 @@ class CenterNetMetaArch(model.DetectionModel):
self._batched_prediction_tensor_names = predictions.keys()
return predictions
def loss(self, prediction_dict, true_image_shapes, scope=None):
def loss(
self, prediction_dict, true_image_shapes, scope=None,
maximum_normalized_coordinate=1.1):
"""Computes scalar loss tensors with respect to provided groundtruth.
This function implements the various CenterNet losses.
......@@ -3589,6 +3864,9 @@ class CenterNetMetaArch(model.DetectionModel):
the form [height, width, channels] indicating the shapes of true images
in the resized images, as resized images can be padded with zeros.
scope: Optional scope name.
maximum_normalized_coordinate: Maximum coordinate value to be considered
as normalized; defaults to 1.1. This is used to check bounds when
converting normalized coordinates to absolute coordinates.
Returns:
A dictionary mapping the keys [
......@@ -3616,7 +3894,7 @@ class CenterNetMetaArch(model.DetectionModel):
# TODO(vighneshb) Explore whether using floor here is safe.
output_true_image_shapes = tf.ceil(
tf.to_float(true_image_shapes) / self._stride)
tf.cast(true_image_shapes, tf.float32) / self._stride)
valid_anchor_weights = get_valid_anchor_weights_in_flattened_image(
output_true_image_shapes, output_height, output_width)
valid_anchor_weights = tf.expand_dims(valid_anchor_weights, 2)
......@@ -3625,7 +3903,8 @@ class CenterNetMetaArch(model.DetectionModel):
object_center_predictions=prediction_dict[OBJECT_CENTER],
input_height=input_height,
input_width=input_width,
per_pixel_weights=valid_anchor_weights)
per_pixel_weights=valid_anchor_weights,
maximum_normalized_coordinate=maximum_normalized_coordinate)
losses = {
OBJECT_CENTER:
self._center_params.object_center_loss_weight * object_center_loss
......@@ -3761,12 +4040,13 @@ class CenterNetMetaArch(model.DetectionModel):
# center predictions.
detection_scores, y_indices, x_indices, channel_indices = (
top_k_feature_map_locations(
object_center_prob, max_pool_kernel_size=3,
object_center_prob,
max_pool_kernel_size=self._center_params.peak_max_pool_kernel_size,
k=self._center_params.max_box_predictions))
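# Descriptive aside (paraphrasing what top_k_feature_map_locations typically
# does rather than adding new logic): peak_max_pool_kernel_size controls local
# peak suppression, roughly
#   pooled = tf.nn.max_pool2d(object_center_prob, ksize=kernel, strides=1,
#                             padding='SAME')
#   peaks = object_center_prob * tf.cast(
#       tf.equal(object_center_prob, pooled), tf.float32)
# so only locations that are maxima of their kernel-sized neighborhood survive
# before the top-k selection.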
multiclass_scores = tf.gather_nd(
object_center_prob, tf.stack([y_indices, x_indices], -1), batch_dims=1)
num_detections = tf.reduce_sum(tf.to_int32(detection_scores > 0), axis=1)
num_detections = tf.reduce_sum(
tf.cast(detection_scores > 0, tf.int32), axis=1)
postprocess_dict = {
fields.DetectionResultFields.detection_scores: detection_scores,
fields.DetectionResultFields.detection_multiclass_scores:
......@@ -3796,6 +4076,18 @@ class CenterNetMetaArch(model.DetectionModel):
# the ops that are supported by tf.lite on GPU.
clip_keypoints = self._should_clip_keypoints()
if len(self._kp_params_dict) == 1 and self._num_classes == 1:
task_name, kp_params = next(iter(self._kp_params_dict.items()))
keypoint_depths = None
if kp_params.argmax_postprocessing:
keypoints, keypoint_scores = (
prediction_to_keypoints_argmax(
prediction_dict,
object_y_indices=y_indices,
object_x_indices=x_indices,
boxes=boxes_strided,
task_name=task_name,
kp_params=kp_params))
else:
(keypoints, keypoint_scores,
keypoint_depths) = self._postprocess_keypoints_single_class(
prediction_dict, channel_indices, y_indices, x_indices,
......@@ -4083,9 +4375,13 @@ class CenterNetMetaArch(model.DetectionModel):
kpt_coords_for_example_list = []
kpt_scores_for_example_list = []
for ex_ind in range(batch_size):
kpt_coords_for_class_list = []
kpt_scores_for_class_list = []
instance_inds_for_class_list = []
# The tensors that host the keypoint coordinates and scores for all
# instances and all keypoints. They will be updated by scatter_nd_add for
# each keypoint task.
kpt_coords_for_example_all_det = tf.zeros(
[max_detections, total_num_keypoints, 2])
kpt_scores_for_example_all_det = tf.zeros(
[max_detections, total_num_keypoints])
for task_name, kp_params in self._kp_params_dict.items():
keypoint_heatmap = prediction_dict[
get_keypoint_name(task_name, KEYPOINT_HEATMAP)][-1]
......@@ -4095,12 +4391,7 @@ class CenterNetMetaArch(model.DetectionModel):
get_keypoint_name(task_name, KEYPOINT_REGRESSION)][-1]
instance_inds = self._get_instance_indices(
classes, num_detections, ex_ind, kp_params.class_id)
num_ind = _get_shape(instance_inds, 1)
def true_fn(keypoint_heatmap, keypoint_offsets, keypoint_regression,
classes, y_indices, x_indices, boxes, instance_inds, ex_ind,
kp_params):
"""Logics to execute when instance_inds is not an empty set."""
# Gather the feature map locations corresponding to the object class.
y_indices_for_kpt_class = tf.gather(y_indices, instance_inds, axis=1)
x_indices_for_kpt_class = tf.gather(x_indices, instance_inds, axis=1)
......@@ -4127,45 +4418,35 @@ class CenterNetMetaArch(model.DetectionModel):
kp_params,
))
# Expand keypoint dimension (with padding) so that coordinates and
# scores have shape [1, num_instances_i, num_total_keypoints, 2] and
# [1, num_instances_i, num_total_keypoints], respectively.
kpts_coords_for_class_padded, kpt_scores_for_class_padded = (
_pad_to_full_keypoint_dim(kpt_coords_for_class,
kpt_scores_for_class,
kp_params.keypoint_indices,
total_num_keypoints))
return kpts_coords_for_class_padded, kpt_scores_for_class_padded
def false_fn():
"""Logics to execute when the instance_inds is an empty set."""
return (tf.zeros([1, 0, total_num_keypoints, 2], dtype=tf.float32),
tf.zeros([1, 0, total_num_keypoints], dtype=tf.float32))
true_fn = functools.partial(
true_fn, keypoint_heatmap, keypoint_offsets, keypoint_regression,
classes, y_indices, x_indices, boxes, instance_inds, ex_ind,
kp_params)
# Use dimension values instead of tf.size for tf.lite compatibility.
results = tf.cond(num_ind[0] > 0, true_fn, false_fn)
kpt_coords_for_class_list.append(results[0])
kpt_scores_for_class_list.append(results[1])
instance_inds_for_class_list.append(instance_inds)
# Concatenate all keypoints across all classes (single example).
kpt_coords_for_example = tf.concat(kpt_coords_for_class_list, axis=1)
kpt_scores_for_example = tf.concat(kpt_scores_for_class_list, axis=1)
instance_inds_for_example = tf.concat(instance_inds_for_class_list,
axis=0)
(kpt_coords_for_example_all_det,
kpt_scores_for_example_all_det) = self._scatter_keypoints_to_batch(
num_ind, kpt_coords_for_example, kpt_scores_for_example,
instance_inds_for_example, max_detections, total_num_keypoints)
kpt_coords_for_example_list.append(kpt_coords_for_example_all_det)
kpt_scores_for_example_list.append(kpt_scores_for_example_all_det)
# Prepare the indices for scatter_nd. The resulting combined_inds has
# the shape of [num_instances_i * num_keypoints_i, 2], where the first
# column corresponds to the instance IDs and the second column
# corresponds to the keypoint IDs.
kpt_inds = tf.constant(kp_params.keypoint_indices, dtype=tf.int32)
kpt_inds = tf.expand_dims(kpt_inds, axis=0)
instance_inds_expand = tf.expand_dims(instance_inds, axis=-1)
kpt_inds_expand = kpt_inds * tf.ones_like(instance_inds_expand)
instance_inds_expand = instance_inds_expand * tf.ones_like(kpt_inds)
combined_inds = tf.stack(
[instance_inds_expand, kpt_inds_expand], axis=2)
combined_inds = tf.reshape(combined_inds, [-1, 2])
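# For example (descriptive comment): with instance_inds = [2, 5] and
# kp_params.keypoint_indices = [0, 1, 3], combined_inds becomes
# [[2, 0], [2, 1], [2, 3], [5, 0], [5, 1], [5, 3]], i.e. one
# (instance id, keypoint id) row per scattered value.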
# Reshape the keypoint coordinates/scores to [num_instances_i *
# num_keypoints_i, 2]/[num_instances_i * num_keypoints_i] to be used
# by scatter_nd_add.
kpt_coords_for_class = tf.reshape(kpt_coords_for_class, [-1, 2])
kpt_scores_for_class = tf.reshape(kpt_scores_for_class, [-1])
kpt_coords_for_example_all_det = tf.tensor_scatter_nd_add(
kpt_coords_for_example_all_det,
combined_inds, kpt_coords_for_class)
kpt_scores_for_example_all_det = tf.tensor_scatter_nd_add(
kpt_scores_for_example_all_det,
combined_inds, kpt_scores_for_class)
kpt_coords_for_example_list.append(
tf.expand_dims(kpt_coords_for_example_all_det, axis=0))
kpt_scores_for_example_list.append(
tf.expand_dims(kpt_scores_for_example_all_det, axis=0))
# Concatenate all keypoints and scores from all examples in the batch.
# Shapes are [batch_size, max_detections, num_total_keypoints, 2] and
......
......@@ -807,6 +807,77 @@ class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands)
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
@parameterized.parameters({'provide_keypoint_score': True},
{'provide_keypoint_score': False})
def test_prediction_to_multi_instance_keypoints(self, provide_keypoint_score):
image_size = (9, 9)
keypoint_heatmap_np = np.zeros((1, image_size[0], image_size[1], 3, 4),
dtype=np.float32)
# Instance 0.
keypoint_heatmap_np[0, 1, 1, 0, 0] = 0.9
keypoint_heatmap_np[0, 1, 7, 0, 1] = 0.9
keypoint_heatmap_np[0, 7, 1, 0, 2] = 0.9
keypoint_heatmap_np[0, 7, 7, 0, 3] = 0.9
# Instance 1.
keypoint_heatmap_np[0, 2, 2, 1, 0] = 0.8
keypoint_heatmap_np[0, 2, 8, 1, 1] = 0.8
keypoint_heatmap_np[0, 8, 2, 1, 2] = 0.8
keypoint_heatmap_np[0, 8, 8, 1, 3] = 0.8
keypoint_offset_np = np.zeros((1, image_size[0], image_size[1], 8),
dtype=np.float32)
keypoint_offset_np[0, 1, 1] = [0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 1, 7] = [0.0, 0.0, 0.5, -0.5, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 7, 1] = [0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 0.0, 0.0]
keypoint_offset_np[0, 7, 7] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, -0.5]
keypoint_offset_np[0, 2, 2] = [0.3, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 2, 8] = [0.0, 0.0, 0.3, -0.3, 0.0, 0.0, 0.0, 0.0]
keypoint_offset_np[0, 8, 2] = [0.0, 0.0, 0.0, 0.0, -0.3, 0.3, 0.0, 0.0]
keypoint_offset_np[0, 8, 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3, -0.3]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_offset = tf.constant(keypoint_offset_np, dtype=tf.float32)
if provide_keypoint_score:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset,
tf.reduce_max(keypoint_heatmap, axis=3)))
else:
(keypoint_cands, keypoint_scores) = (
cnma.prediction_tensors_to_multi_instance_kpts(
keypoint_heatmap,
keypoint_offset))
return keypoint_cands, keypoint_scores
(keypoint_cands, keypoint_scores) = self.execute(graph_fn, [])
expected_keypoint_candidates_0 = [
[1.5, 1.5], # top-left
[1.5, 6.5], # top-right
[6.5, 1.5], # bottom-left
[6.5, 6.5], # bottom-right
]
expected_keypoint_scores_0 = [0.9, 0.9, 0.9, 0.9]
expected_keypoint_candidates_1 = [
[2.3, 2.3], # top-left
[2.3, 7.7], # top-right
[7.7, 2.3], # bottom-left
[7.7, 7.7], # bottom-right
]
expected_keypoint_scores_1 = [0.8, 0.8, 0.8, 0.8]
np.testing.assert_allclose(
expected_keypoint_candidates_0, keypoint_cands[0, 0, :, :])
np.testing.assert_allclose(
expected_keypoint_candidates_1, keypoint_cands[0, 1, :, :])
np.testing.assert_allclose(
expected_keypoint_scores_0, keypoint_scores[0, 0, :])
np.testing.assert_allclose(
expected_keypoint_scores_1, keypoint_scores[0, 1, :])
def test_keypoint_candidate_prediction_per_keypoints(self):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0
......@@ -1644,7 +1715,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100,
predict_depth=False,
per_keypoint_depth=False,
peak_radius=0,
candidate_ranking_mode='min_distance'):
candidate_ranking_mode='min_distance',
argmax_postprocessing=False):
"""Returns the fake keypoint estimation parameter namedtuple."""
return cnma.KeypointEstimationParams(
task_name=_TASK_NAME,
......@@ -1660,7 +1732,8 @@ def get_fake_kp_params(num_candidates_per_keypoint=100,
predict_depth=predict_depth,
per_keypoint_depth=per_keypoint_depth,
offset_peak_radius=peak_radius,
candidate_ranking_mode=candidate_ranking_mode)
candidate_ranking_mode=candidate_ranking_mode,
argmax_postprocessing=argmax_postprocessing)
def get_fake_mask_params():
......@@ -1715,7 +1788,8 @@ def build_center_net_meta_arch(build_resnet=False,
per_keypoint_depth=False,
peak_radius=0,
keypoint_only=False,
candidate_ranking_mode='min_distance'):
candidate_ranking_mode='min_distance',
argmax_postprocessing=False):
"""Builds the CenterNet meta architecture."""
if build_resnet:
feature_extractor = (
......@@ -1762,7 +1836,8 @@ def build_center_net_meta_arch(build_resnet=False,
get_fake_kp_params(num_candidates_per_keypoint,
per_keypoint_offset, predict_depth,
per_keypoint_depth, peak_radius,
candidate_ranking_mode)
candidate_ranking_mode,
argmax_postprocessing)
},
non_max_suppression_fn=non_max_suppression_fn)
elif detection_only:
......@@ -1790,7 +1865,8 @@ def build_center_net_meta_arch(build_resnet=False,
get_fake_kp_params(num_candidates_per_keypoint,
per_keypoint_offset, predict_depth,
per_keypoint_depth, peak_radius,
candidate_ranking_mode)
candidate_ranking_mode,
argmax_postprocessing)
},
non_max_suppression_fn=non_max_suppression_fn)
else:
......@@ -2324,17 +2400,32 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllClose(expected_multiclass_scores,
detections['detection_multiclass_scores'][0][0])
def test_postprocess_single_class(self):
@parameterized.parameters(
{
'candidate_ranking_mode': 'min_distance',
'argmax_postprocessing': False
},
{
'candidate_ranking_mode': 'gaussian_weighted_const',
'argmax_postprocessing': True
})
def test_postprocess_single_class(self, candidate_ranking_mode,
argmax_postprocessing):
"""Test the postprocess function."""
model = build_center_net_meta_arch(num_classes=1)
model = build_center_net_meta_arch(
num_classes=1, max_box_predictions=5, per_keypoint_offset=True,
candidate_ranking_mode=candidate_ranking_mode,
argmax_postprocessing=argmax_postprocessing)
max_detection = model._center_params.max_box_predictions
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 1), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32)
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.ones(
(1, 32, 32, num_keypoints), dtype=np.float32) * _logit(0.01)
keypoint_offsets = np.zeros(
(1, 32, 32, num_keypoints * 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(1)
......@@ -2387,6 +2478,9 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertEqual(detections['num_detections'], [5])
self.assertAllEqual([1, max_detection, num_keypoints, 2],
detections['detection_keypoints'].shape)
self.assertAllClose(
[[0.4375, 0.4375], [0.4375, 0.5625], [0.5625, 0.4375]],
detections['detection_keypoints'][0, 0, 0:3, :])
self.assertAllEqual([1, max_detection, num_keypoints],
detections['detection_keypoint_scores'].shape)
......
......@@ -36,7 +36,8 @@ class DeepMACParams(
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size',
'max_roi_jitter_ratio', 'roi_jitter_mode', 'box_consistency_loss_weight'
'max_roi_jitter_ratio', 'roi_jitter_mode',
'box_consistency_loss_weight',
])):
"""Class holding the DeepMAC network configutration."""
......@@ -125,6 +126,9 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None):
raise ValueError('Mask size must be set.')
return FullyConnectedMaskHead(num_init_channels, mask_size)
elif name == 'embedding_distance_probability':
return tf.keras.layers.Lambda(lambda x: x)
elif name.startswith('resnet'):
return ResNetMaskNetwork(name, num_init_channels)
......@@ -262,6 +266,25 @@ def fill_boxes(boxes, height, width):
return tf.cast(filled_boxes, tf.float32)
def embedding_distance_to_probability(x, y):
"""Compute probability based on pixel-wise embedding distance.
Args:
x: [num_instances, height, width, dimension] float tensor input.
y: [num_instances, height, width, dimension] or
[num_instances, 1, 1, dimension] float tensor input. When the height
and width dimensions are 1, TF will broadcast it.
Returns:
dist: [num_instances, height, width, 1] A float tensor containing the
per-pixel probability. Pixels whose embeddings are close in
Euclidean distance get a probability close to 1.
"""
diff = x - y
squared_dist = tf.reduce_sum(diff * diff, axis=3, keepdims=True)
return tf.exp(-squared_dist)
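# Illustrative example (hypothetical values, for intuition only): identical
# embeddings give a squared distance of 0 and hence a probability of
# exp(0) = 1.0, while e.g. x = tf.ones((1, 1, 1, 8)) and
# y = tf.zeros((1, 1, 1, 8)) give a squared distance of 8 and a probability
# of exp(-8) ~= 3.4e-4.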
class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks."""
......@@ -366,6 +389,16 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
network_type, num_init_channels, mask_size)
self._use_instance_embedding = use_instance_embedding
self._network_type = network_type
if (self._use_instance_embedding and
(self._network_type == 'embedding_distance_probability')):
raise ValueError(('Cannot feed instance embedding to mask head when '
'computing distance from instance embedding.'))
if network_type == 'embedding_distance_probability':
self.project_out = tf.keras.layers.Lambda(lambda x: x)
else:
self.project_out = tf.keras.layers.Conv2D(
filters=1, kernel_size=1, activation=None)
......@@ -388,10 +421,9 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
height = tf.shape(pixel_embedding)[1]
width = tf.shape(pixel_embedding)[2]
if self._use_instance_embedding:
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
instance_embedding = tf.tile(instance_embedding, [1, height, width, 1])
if self._use_instance_embedding:
inputs = tf.concat([pixel_embedding, instance_embedding], axis=3)
else:
inputs = pixel_embedding
......@@ -400,6 +432,10 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
if isinstance(out, list):
out = out[-1]
if self._network_type == 'embedding_distance_probability':
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
out = embedding_distance_to_probability(instance_embedding, out)
if out.shape[-1] > 1:
out = self.project_out(out)
......@@ -466,6 +502,25 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
if self._deepmac_params.mask_num_subsamples > 0:
raise ValueError('Subsampling masks is currently not supported.')
if self._deepmac_params.network_type == 'embedding_distance_probability':
if self._deepmac_params.use_xy:
raise ValueError(
'Cannot use x/y coordinates when using embedding distance.')
pixel_embedding_dim = self._deepmac_params.pixel_embedding_dim
dim = self._deepmac_params.dim
if dim != pixel_embedding_dim:
raise ValueError(
'When using embedding distance mask head, '
f'pixel_embedding_dim({pixel_embedding_dim}) '
f'must be same as dim({dim}).')
loss = self._deepmac_params.classification_loss
if ((not isinstance(loss, losses.WeightedDiceClassificationLoss))
or (not loss.is_prediction_probability)):
raise ValueError('Only dice loss with is_prediction_probability=true '
'is supported with embedding distance mask head.')
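# A configuration that satisfies these checks (illustrative, mirroring the
# unit tests below): network_type='embedding_distance_probability',
# use_xy=False, use_instance_embedding=False, dim == pixel_embedding_dim
# (e.g. both set to 8), and classification_loss =
#     losses.WeightedDiceClassificationLoss(
#         squared_normalization=False, is_prediction_probability=True).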
super(DeepMACMetaArch, self).__init__(
is_training=is_training, add_summaries=add_summaries,
num_classes=num_classes, feature_extractor=feature_extractor,
......@@ -909,6 +964,9 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
mask_logits = crop_masks_within_boxes(
mask_logits, boxes, self._deepmac_params.postprocess_crop_size)
if self._deepmac_params.network_type == 'embedding_distance_probability':
masks_prob = mask_logits
else:
masks_prob = tf.nn.sigmoid(mask_logits)
return masks_prob
......
......@@ -61,7 +61,10 @@ class MockMaskNet(tf.keras.layers.Layer):
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
mask_num_subsamples=-1):
use_instance_embedding=True, mask_num_subsamples=-1,
network_type='hourglass10', use_xy=True,
pixel_embedding_dim=2,
dice_loss_prediction_probability=False):
"""Builds the DeepMAC meta architecture."""
feature_extractor = DummyFeatureExtractor(
......@@ -84,7 +87,9 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
use_labeled_classes=False)
if use_dice_loss:
classification_loss = losses.WeightedDiceClassificationLoss(False)
classification_loss = losses.WeightedDiceClassificationLoss(
squared_normalization=False,
is_prediction_probability=dice_loss_prediction_probability)
else:
classification_loss = losses.WeightedSigmoidClassificationLoss()
......@@ -92,13 +97,13 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
classification_loss=classification_loss,
dim=8,
task_loss_weight=1.0,
pixel_embedding_dim=2,
pixel_embedding_dim=pixel_embedding_dim,
allowed_masked_classes_ids=[],
mask_size=16,
mask_num_subsamples=mask_num_subsamples,
use_xy=True,
network_type='hourglass10',
use_instance_embedding=True,
use_xy=use_xy,
network_type=network_type,
use_instance_embedding=use_instance_embedding,
num_init_channels=8,
predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128,
......@@ -125,7 +130,7 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACUtilsTest(tf.test.TestCase):
class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
def test_subsample_trivial(self):
"""Test subsampling masks."""
......@@ -169,12 +174,22 @@ class DeepMACUtilsTest(tf.test.TestCase):
features, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32, 7))
def test_embedding_distance_prob_shape(self):
dist = deepmac_meta_arch.embedding_distance_to_probability(
tf.ones((4, 32, 32, 8)), tf.zeros((4, 32, 32, 8)))
self.assertEqual(dist.shape, (4, 32, 32, 1))
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase):
@parameterized.parameters([1e-20, 1e20])
def test_embedding_distance_prob_value(self, value):
dist = deepmac_meta_arch.embedding_distance_to_probability(
tf.zeros((1, 1, 1, 8)), value + tf.zeros((1, 1, 1, 8))).numpy()
max_float = np.finfo(dist.dtype).max
self.assertLess(dist.max(), max_float)
self.assertGreater(dist.max(), -max_float)
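# Note: these assertions only check numerical stability. With extreme
# embedding magnitudes the float32 squared distance may overflow to infinity
# (giving a probability of exp(-inf) = 0) or shrink toward zero (giving a
# probability near 1), but the result stays finite either way.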
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch()
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMaskHeadTest(tf.test.TestCase):
def test_mask_network(self):
net = deepmac_meta_arch.MaskHeadNetwork('hourglass10', 8)
......@@ -203,6 +218,38 @@ class DeepMACMetaArchTest(tf.test.TestCase):
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_embedding_distance_zero_dist(self):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_distance_probability', num_init_channels=8,
use_instance_embedding=False)
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 7)), tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
def test_mask_network_embedding_distance_small_dist(self):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_distance_probability', num_init_channels=-1,
use_instance_embedding=False)
call_func = tf.function(net.__call__)
out = call_func(1e6 + tf.zeros((2, 7)),
tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch()
def test_get_mask_head_input(self):
boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
......@@ -349,6 +396,37 @@ class DeepMACMetaArchTest(tf.test.TestCase):
prob = tf.nn.sigmoid(0.9).numpy()
self.assertAllClose(masks, prob * np.ones((2, 3, 16, 16)))
def test_postprocess_emb_dist(self):
model = build_meta_arch(network_type='embedding_distance_probability',
use_instance_embedding=False,
use_xy=False, pixel_embedding_dim=8,
use_dice_loss=True,
dice_loss_prediction_probability=True)
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes[:, :, [0, 2]] = 0.0
boxes[:, :, [1, 3]] = 8.0
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
self.assertEqual(masks.shape, (2, 3, 16, 16))
def test_postprocess_emb_dist_fullres(self):
model = build_meta_arch(network_type='embedding_distance_probability',
predict_full_resolution_masks=True,
use_instance_embedding=False,
pixel_embedding_dim=8, use_xy=False,
use_dice_loss=True,
dice_loss_prediction_probability=True)
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
self.assertEqual(masks.shape, (2, 3, 128, 128))
def test_postprocess_no_crop_resize_shape(self):
model = build_meta_arch(predict_full_resolution_masks=True)
......@@ -494,7 +572,7 @@ class FullyConnectedMaskHeadTest(tf.test.TestCase):
class ResNetMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(['resnet4', 'resnet8', 'resnet20'])
def test_pass(self, name):
def test_forward(self, name):
net = deepmac_meta_arch.ResNetMaskNetwork(name, 8)
out = net(tf.zeros((3, 32, 32, 16)))
self.assertEqual(out.shape[:3], (3, 32, 32))
......
......@@ -111,6 +111,10 @@ message CenterNet {
// Parameters to determine the architecture of the object center prediction
// head.
optional PredictionHeadParams center_head_params = 8;
// Max pool kernel size used to extract peak score locations from a local
// neighborhood of the object detection heatmap.
optional int32 peak_max_pool_kernel_size = 9 [default = 3];
}
optional ObjectCenterParams object_center_params = 5;
......@@ -266,6 +270,16 @@ message CenterNet {
// with scores higher than the threshold.
optional float rescoring_threshold = 30 [default = 0.0];
// The ratio by which the output feature map size is multiplied to determine
// the denominator of the Gaussian formula. Only applicable when the
// candidate_ranking_mode is set to 'gaussian_weighted_const'.
optional float gaussian_denom_ratio = 31 [default = 0.1];
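// Illustrative example (assuming the denominator is simply the output
// feature map size multiplied by this ratio, as described above): a 128x128
// output feature map with the default ratio of 0.1 yields a denominator of
// roughly 128 * 0.1 = 12.8.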
// Whether to use the keypoint postprocessing logic that replaces the top-k
// op with argmax. Typically used when exporting the model for predicting
// keypoints of multiple instances in the browser.
optional bool argmax_postprocessing = 32 [default = false];
// Parameters to determine the architecture of the keypoint heatmap
// prediction head.
optional PredictionHeadParams heatmap_head_params = 25;
......
......@@ -231,6 +231,10 @@ message WeightedDiceClassificationLoss {
// If set, we square the probabilities in the denominator term used for
// normalization.
optional bool squared_normalization = 1 [default=false];
// Whether or not the input prediction to the loss function is a
// probability. If not, the input is interpreted as logits.
optional bool is_prediction_probability = 2 [default=false];
}
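// Example config (illustrative; the enclosing field name is assumed to be
// weighted_dice_classification_loss):
//   weighted_dice_classification_loss {
//     squared_normalization: false
//     is_prediction_probability: true
//   }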
......@@ -948,7 +948,8 @@ def merge_boxes_with_multiple_labels(boxes,
def nearest_neighbor_upsampling(input_tensor, scale=None, height_scale=None,
width_scale=None):
width_scale=None,
name='nearest_neighbor_upsampling'):
"""Nearest neighbor upsampling implementation.
Nearest neighbor upsampling function that maps input tensor with shape
......@@ -965,6 +966,7 @@ def nearest_neighbor_upsampling(input_tensor, scale=None, height_scale=None,
option when provided overrides `scale` option.
width_scale: An integer multiple to scale the width of input image. This
option when provided overrides `scale` option.
name: A name for the operation (optional).
Returns:
data_up: A float32 tensor of size
[batch, height_in*scale, width_in*scale, channels].
......@@ -976,13 +978,13 @@ def nearest_neighbor_upsampling(input_tensor, scale=None, height_scale=None,
if not scale and (height_scale is None or width_scale is None):
raise ValueError('Provide either `scale` or `height_scale` and'
' `width_scale`.')
with tf.name_scope('nearest_neighbor_upsampling'):
with tf.name_scope(name):
h_scale = scale if height_scale is None else height_scale
w_scale = scale if width_scale is None else width_scale
(batch_size, height, width,
channels) = shape_utils.combined_static_and_dynamic_shape(input_tensor)
output_tensor = tf.stack([input_tensor] * w_scale, axis=3)
output_tensor = tf.stack([output_tensor] * h_scale, axis=2)
output_tensor = tf.stack([input_tensor] * w_scale, axis=3, name='w_stack')
output_tensor = tf.stack([output_tensor] * h_scale, axis=2, name='h_stack')
return tf.reshape(output_tensor,
[batch_size, height * h_scale, width * w_scale, channels])
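# Illustrative usage (hypothetical shapes): an input of shape [2, 4, 4, 3]
# with scale=2 produces an output of shape [2, 8, 8, 3] in which every input
# pixel is repeated over a 2x2 block, e.g.
#   up = nearest_neighbor_upsampling(tf.ones((2, 4, 4, 3)), scale=2)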
......
......@@ -21,6 +21,11 @@ ArXiv: [https://arxiv.org/pdf/1802.05522.pdf](https://arxiv.org/pdf/1802.05522.p
<a href="https://sites.google.com/view/vid2depth"><img src='https://storage.googleapis.com/vid2depth/media/approach.png' width=400></a>
</p>
## Update: TF2 version.
Please see [https://github.com/IAMAl/vid2depth_tf2](https://github.com/IAMAl/vid2depth_tf2)
for a TF2 implementation of vid2depth.
## 1. Installation
### Requirements
......@@ -36,10 +41,6 @@ pip install scipy
pip install tensorflow
```
#### For building the ICP op (work in progress)
* Bazel: https://bazel.build/
### Download vid2depth
```shell
......@@ -60,11 +61,27 @@ unzip "*.zip"
### Download Cityscapes dataset (110GB) (optional)
You will need to register in order to download the data. Download the following files:
You will need to register in order to download the data. Download the following
files:
* leftImg8bit_sequence_trainvaltest.zip
* camera_trainvaltest.zip
### Download Bike dataset (34GB) (optional)
Please see [https://research.google/tools/datasets/bike-video/](https://research.google/tools/datasets/bike-video/)
for info on the bike video dataset.
Special thanks to [Guangming Wang](https://guangmingw.github.io/) for helping us
restore this dataset after it was accidentally deleted.
```shell
mkdir -p ~/vid2depth/bike-uncompressed
cd ~/vid2depth/bike-uncompressed
wget https://storage.googleapis.com/vid2depth/dataset/BikeVideoDataset.tar
tar xvf BikeVideoDataset.tar
```
## 3. Inference
### Download trained model
......@@ -113,23 +130,28 @@ python dataset/gen_data.py \
--seq_length 3
```
### Compile the ICP op (work in progress)
The ICP op depends on multiple software packages (TensorFlow, Point Cloud
Library, FLANN, Boost, HDF5). The Bazel build system requires individual BUILD
files for each of these packages. We have included a partial implementation of
these BUILD files inside the third_party directory. But they are not ready for
compiling the op. If you manage to build the op, please let us know so we can
include your contribution.
### Prepare Bike training sequences (optional)
```shell
# Prepare training sequences.
cd tensorflow/models/research/vid2depth
bazel build ops:pcl_demo # Build test program using PCL only.
bazel build ops:icp_op.so
python dataset/gen_data.py \
--dataset_name bike \
--dataset_dir ~/vid2depth/bike-uncompressed \
--data_dir ~/vid2depth/data/bike \
--seq_length 3
```
For the time being, it is possible to run inference on the pre-trained model
and to run training without the ICP loss.
### Compile the ICP op
The pre-trained model was trained with the ICP loss. It is possible to run
inference on this pre-trained model without compiling the ICP op. It is also
possible to train a new model from scratch without compiling the ICP op by
setting the ICP loss weight to zero.
If you would like to compile the op and run a new training job using it, please
use the CMakeLists.txt file at
[https://github.com/IAMAl/vid2depth_tf2/tree/master/ops](https://github.com/IAMAl/vid2depth_tf2/tree/master/ops).
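As a rough sketch of training without the ICP loss on the bike sequences
prepared above (the `--icp_weight` and `--checkpoint_dir` flag names are
assumptions based on the training script and may differ in your checkout; see
the full command under "Run training" below):

```shell
cd tensorflow/models/research/vid2depth
python train.py \
  --data_dir ~/vid2depth/data/bike \
  --seq_length 3 \
  --icp_weight 0 \
  --checkpoint_dir ~/vid2depth/checkpoints
```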
### Run training
......