Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

09d9656f · Srihari Humbarwadi · GitHub · ac671306 · 49a5706c · 09d9656f
Unverified Commit 09d9656f authored Jan 13, 2022 by Srihari Humbarwadi Committed by GitHub Jan 13, 2022
20 changed files
--- a/official/vision/beta/tasks/semantic_segmentation.py
+++ b/official/vision/beta/tasks/semantic_segmentation.py
@@ -135,7 +135,15 @@ class SemanticSegmentationTask(base_task.Task):
        use_groundtruth_dimension=loss_params.use_groundtruth_dimension,
        top_k_percent_pixels=loss_params.top_k_percent_pixels)
-    total_loss = segmentation_loss_fn(model_outputs, labels['masks'])
+    total_loss = segmentation_loss_fn(model_outputs['logits'], labels['masks'])
+    if 'mask_scores' in model_outputs:
+      mask_scoring_loss_fn = segmentation_losses.MaskScoringLoss(
+          loss_params.ignore_label)
+      total_loss += mask_scoring_loss_fn(
+          model_outputs['mask_scores'],
+          model_outputs['logits'],
+          labels['masks'])
    if aux_losses:
      total_loss += tf.add_n(aux_losses)
@@ -144,6 +152,28 @@ class SemanticSegmentationTask(base_task.Task):
    return total_loss
+  def process_metrics(self, metrics, labels, model_outputs, **kwargs):
+    """Process and update metrics.
+    Called when using custom training loop API.
+    Args:
+      metrics: an iterable of metric objects. The return of function
+        self.build_metrics.
+      labels: a nested structure of tensors. `labels['masks']` is read when
+        updating the 'mask_scores_mse' metric; every other metric receives
+        `labels` unchanged.
+      model_outputs: a dict of model outputs. `model_outputs['logits']` is
+        always read; `model_outputs['mask_scores']` is read only for the
+        'mask_scores_mse' metric.
+      **kwargs: other args.
+    """
+    for metric in metrics:
+      # Compare metric names by value. `is` tests object identity, which only
+      # matches when both strings happen to be interned, so the mask-score
+      # branch could be skipped silently.
+      if metric.name == 'mask_scores_mse':
+        # The regression target is derived from the predicted logits and the
+        # groundtruth masks, honoring the configured ignore label.
+        actual_mask_scores = segmentation_losses.get_actual_mask_scores(
+            model_outputs['logits'], labels['masks'],
+            self.task_config.losses.ignore_label)
+        metric.update_state(actual_mask_scores, model_outputs['mask_scores'])
+      else:
+        metric.update_state(labels, model_outputs['logits'])
  def build_metrics(self, training: bool = True):
    """Gets streaming metrics for training/validation."""
    metrics = []
@@ -153,6 +183,9 @@ class SemanticSegmentationTask(base_task.Task):
          num_classes=self.task_config.model.num_classes,
          rescale_predictions=False,
          dtype=tf.float32))
+      if self.task_config.model.get('mask_scoring_head'):
+        metrics.append(
+            tf.keras.metrics.MeanSquaredError(name='mask_scores_mse'))
    else:
      self.iou_metric = segmentation_metrics.PerClassIoU(
          name='per_class_iou',
@@ -160,6 +193,11 @@ class SemanticSegmentationTask(base_task.Task):
          rescale_predictions=not self.task_config.validation_data
          .resize_eval_groundtruth,
          dtype=tf.float32)
+      if self.task_config.validation_data.resize_eval_groundtruth and self.task_config.model.get('mask_scoring_head'):  # pylint: disable=line-too-long
+        # Mask scores metric can only be computed if labels are scaled to match
+        # predicted mask scores.
+        metrics.append(
+            tf.keras.metrics.MeanSquaredError(name='mask_scores_mse'))
      # Update state on CPU if TPUStrategy due to dynamic resizing.
      self._process_iou_metric_on_cpu = isinstance(
@@ -194,6 +232,8 @@ class SemanticSegmentationTask(base_task.Task):
    num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
    with tf.GradientTape() as tape:
      outputs = model(features, training=True)
+      if isinstance(outputs, tf.Tensor):
+        outputs = {'logits': outputs}
      # Casting output layer as float32 is necessary when mixed_precision is
      # mixed_float16 or mixed_bfloat16 to ensure output is casted as float32.
      outputs = tf.nest.map_structure(
@@ -249,6 +289,8 @@ class SemanticSegmentationTask(base_task.Task):
          features, input_partition_dims)
    outputs = self.inference_step(features, model)
+    if isinstance(outputs, tf.Tensor):
+      outputs = {'logits': outputs}
    outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
    if self.task_config.validation_data.resize_eval_groundtruth:
@@ -260,9 +302,9 @@ class SemanticSegmentationTask(base_task.Task):
    logs = {self.loss: loss}
    if self._process_iou_metric_on_cpu:
-      logs.update({self.iou_metric.name: (labels, outputs)})
+      logs.update({self.iou_metric.name: (labels, outputs['logits'])})
    else:
-      self.iou_metric.update_state(labels, outputs)
+      self.iou_metric.update_state(labels, outputs['logits'])
    if metrics:
      self.process_metrics(metrics, labels, outputs)

--- a/official/vision/detection/README.md
+++ b/official/vision/detection/README.md
 # Object Detection Models on TensorFlow 2
-**WARNING**: This repository will be deprecated and replaced by the solid
+This repository is deprecated and replaced by the solid
-implementations inside vision/beta/.
+implementations inside vision/beta/. All the content has been moved to
+[official/legacy/detection](https://github.com/tensorflow/models/tree/master/official/legacy/detection).
-## Prerequsite
-To get started, download the code from TensorFlow models GitHub repository or
-use the pre-installed Google Cloud VM.
-```bash
-git clone https://github.com/tensorflow/models.git
-```
-Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also here are
-a few package you need to install to get started:
-```bash
-sudo apt-get install -y python-tk && \
-pip3 install -r ~/models/official/requirements.txt
-```
-## Train RetinaNet on TPU
-### Train a vanilla ResNet-50 based RetinaNet.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
-TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
-EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
-VAL_JSON_FILE="<path to the validation annotation JSON file>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu="${TPU_NAME?}" \
-  --model_dir="${MODEL_DIR?}" \
-  --mode=train \
-  --params_override="{ type: retinanet, train: { checkpoint: { path: ${RESNET_CHECKPOINT?}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
-```
-The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
-Note: The ResNet implementation under
-[detection/](https://github.com/tensorflow/models/tree/master/official/vision/detection)
-is currently different from the one under
-[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
-so the checkpoints are not compatible.
-We will unify the implementation soon.
-### Train a SpineNet-49 based RetinaNet.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
-EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
-VAL_JSON_FILE="<path to the validation annotation JSON file>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu="${TPU_NAME?}" \
-  --model_dir="${MODEL_DIR?}" \
-  --mode=train \
-  --params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
-```
-### Train a custom RetinaNet using the config file.
-First, create a YAML config file, e.g. *my_retinanet.yaml*. This file specifies
-the parameters to be overridden, which should at least include the following
-fields.
-```YAML
-# my_retinanet.yaml
-type: 'retinanet'
-train:
-  train_file_pattern: <path to the TFRecord training data>
-eval:
-  eval_file_pattern: <path to the TFRecord validation data>
-  val_json_file: <path to the validation annotation JSON file>
-```
-Once the YAML config file is created, you can launch the training using the
-following command.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu="${TPU_NAME?}" \
-  --model_dir="${MODEL_DIR?}" \
-  --mode=train \
-  --config_file="my_retinanet.yaml"
-```
-## Train RetinaNet on GPU
-Training on GPU is similar to that on TPU. The major change is the strategy
-type (use "[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)" for multiple GPU and
-"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)" for single GPU).
-Multi-GPUs example (assuming there are 8GPU connected to the host):
-```bash
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=mirrored \
-  --num_gpus=8 \
-  --model_dir="${MODEL_DIR?}" \
-  --mode=train \
-  --config_file="my_retinanet.yaml"
-```
-```bash
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=one_device \
-  --num_gpus=1 \
-  --model_dir="${MODEL_DIR?}" \
-  --mode=train \
-  --config_file="my_retinanet.yaml"
-```
-An example with inline configuration (YAML or JSON format):
-```
-python3 ~/models/official/vision/detection/main.py \
-  --model_dir=<model folder> \
-  --strategy_type=one_device \
-  --num_gpus=1 \
-  --mode=train \
-  --params_override="eval:
- eval_file_pattern: <Eval TFRecord file pattern>
- batch_size: 8
- val_json_file: <COCO format groundtruth JSON file>
-predict:
- predict_batch_size: 8
-architecture:
- use_bfloat16: False
-train:
- total_steps: 1
- batch_size: 8
- train_file_pattern: <Eval TFRecord file pattern>
-use_tpu: False
-"
-```
---
-## Train Mask R-CNN on TPU
-### Train a vanilla ResNet-50 based Mask R-CNN.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
-TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
-EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
-VAL_JSON_FILE="<path to the validation annotation JSON file>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu=${TPU_NAME} \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=mask_rcnn \
-  --params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
-```
-The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
-Note: The ResNet implementation under
-[detection/](https://github.com/tensorflow/models/tree/master/official/vision/detection)
-is currently different from the one under
-[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
-so the checkpoints are not compatible.
-We will unify the implementation soon.
-### Train a SpineNet-49 based Mask R-CNN.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
-EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
-VAL_JSON_FILE="<path to the validation annotation JSON file>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu="${TPU_NAME?}" \
-  --model_dir="${MODEL_DIR?}" \
-  --mode=train \
-  --model=mask_rcnn \
-  --params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
-```
-### Train a custom Mask R-CNN using the config file.
-First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
-This file specifies the parameters to be overridden,
-which should at least include the following fields.
-```YAML
-# my_maskrcnn.yaml
-train:
-  train_file_pattern: <path to the TFRecord training data>
-eval:
-  eval_file_pattern: <path to the TFRecord validation data>
-  val_json_file: <path to the validation annotation JSON file>
-```
-Once the YAML config file is created, you can launch the training using the
-following command.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu=${TPU_NAME} \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=mask_rcnn \
-  --config_file="my_maskrcnn.yaml"
-```
-## Train Mask R-CNN on GPU
-Training on GPU is similar to that on TPU. The major change is the strategy type
-(use
-"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
-for multiple GPU and
-"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
-for single GPU).
-Multi-GPUs example (assuming there are 8GPU connected to the host):
-```bash
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=mirrored \
-  --num_gpus=8 \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=mask_rcnn \
-  --config_file="my_maskrcnn.yaml"
-```
-```bash
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=one_device \
-  --num_gpus=1 \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=mask_rcnn \
-  --config_file="my_maskrcnn.yaml"
-```
-An example with inline configuration (YAML or JSON format):
-```
-python3 ~/models/official/vision/detection/main.py \
-  --model_dir=<model folder> \
-  --strategy_type=one_device \
-  --num_gpus=1 \
-  --mode=train \
-  --model=mask_rcnn \
-  --params_override="eval:
- eval_file_pattern: <Eval TFRecord file pattern>
- batch_size: 8
- val_json_file: <COCO format groundtruth JSON file>
-predict:
- predict_batch_size: 8
-architecture:
- use_bfloat16: False
-train:
- total_steps: 1000
- batch_size: 8
- train_file_pattern: <Eval TFRecord file pattern>
-use_tpu: False
-"
-```
-## Train ShapeMask on TPU
-### Train a ResNet-50 based ShapeMask.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
-TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
-EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
-VAL_JSON_FILE="<path to the validation annotation JSON file>"
-SHAPE_PRIOR_PATH="<path to shape priors>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu=${TPU_NAME} \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=shapemask \
-  --params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } shapemask_head: {use_category_for_mask: true, shape_prior_path: ${SHAPE_PRIOR_PATH}} }"
-```
-The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
-The shape priors can be downloaded [here]
-(https://storage.googleapis.com/cloud-tpu-checkpoints/shapemask/kmeans_class_priors_91x20x32x32.npy)
-### Train a custom ShapeMask using the config file.
-First, create a YAML config file, e.g. *my_shapemask.yaml*.
-This file specifies the parameters to be overridden:
-```YAML
-# my_shapemask.yaml
-train:
-  train_file_pattern: <path to the TFRecord training data>
-  total_steps: <total steps to train>
-  batch_size: <training batch size>
-eval:
-  eval_file_pattern: <path to the TFRecord validation data>
-  val_json_file: <path to the validation annotation JSON file>
-  batch_size: <evaluation batch size>
-shapemask_head:
-  shape_prior_path: <path to shape priors>
-```
-Once the YAML config file is created, you can launch the training using the
-following command.
-```bash
-TPU_NAME="<your GCP TPU name>"
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=tpu \
-  --tpu=${TPU_NAME} \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=shapemask \
-  --config_file="my_shapemask.yaml"
-```
-## Train ShapeMask on GPU
-Training on GPU is similar to that on TPU. The major change is the strategy type
-(use
-"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
-for multiple GPU and
-"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
-for single GPU).
-Multi-GPUs example (assuming there are 8GPU connected to the host):
-```bash
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=mirrored \
-  --num_gpus=8 \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=shapemask \
-  --config_file="my_shapemask.yaml"
-```
-A single GPU example
-```bash
-MODEL_DIR="<path to the directory to store model files>"
-python3 ~/models/official/vision/detection/main.py \
-  --strategy_type=one_device \
-  --num_gpus=1 \
-  --model_dir=${MODEL_DIR} \
-  --mode=train \
-  --model=shapemask \
-  --config_file="my_shapemask.yaml"
-```
-An example with inline configuration (YAML or JSON format):
-```
-python3 ~/models/official/vision/detection/main.py \
-  --model_dir=<model folder> \
-  --strategy_type=one_device \
-  --num_gpus=1 \
-  --mode=train \
-  --model=shapemask \
-  --params_override="eval:
- eval_file_pattern: <Eval TFRecord file pattern>
- batch_size: 8
- val_json_file: <COCO format groundtruth JSON file>
-train:
- total_steps: 1000
- batch_size: 8
- train_file_pattern: <Eval TFRecord file pattern>
-use_tpu: False
-"
-```
-### Run the evaluation (after training)
-```
-python3 /usr/share/models/official/vision/detection/main.py \
-   --strategy_type=tpu \
-   --tpu=${TPU_NAME} \
-   --model_dir=${MODEL_DIR} \
-   --mode=eval \
-   --model=shapemask \
-   --params_override="{eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN}, eval_samples: 5000 } }"
-```
-`MODEL_DIR` needs to point to the trained path of ShapeMask model.
-Change `strategy_type=mirrored` and `num_gpus=1` to run on a GPU.
-Note: The JSON groundtruth file is useful for [COCO dataset](http://cocodataset.org/#home) and can be
-downloaded from the [COCO website](http://cocodataset.org/#download). For custom dataset, it is unncessary because the groundtruth can be included in the TFRecord files.
-## References
-1.  [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
-    Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Dollár. IEEE
-    International Conference on Computer Vision (ICCV), 2017.
--- a/official/vision/detection/__init__.py
+++ b/official/vision/detection/__init__.py
@@ -12,3 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Deprecating the vision/detection folder."""
+raise ImportError('This module has been moved to official/legacy/detection')
--- a/official/vision/image_classification/README.md
+++ b/official/vision/image_classification/README.md
-# Image Classification
+This repository is deprecated and replaced by the solid
+implementations inside vision/beta/. All the content has been moved to
-**Warning:** the features in the `image_classification/` folder have been fully
+[official/legacy/image_classification](https://github.com/tensorflow/models/tree/master/official/legacy/image_classification).
-intergrated into vision/beta. Please use the [new code base](../beta/README.md).
-This folder contains TF 2.0 model examples for image classification:
-* [MNIST](#mnist)
-* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras
-compile/fit methods for image classification models, including:
-  * ResNet
-  * EfficientNet[^1]
-[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet).
-For more information about other types of models, please refer to this
-[README file](../../README.md).
-## Before you begin
-Please make sure that you have the latest version of TensorFlow
-installed and
-[add the models folder to your Python path](/official/#running-the-models).
-### ImageNet preparation
-#### Using TFDS
-`classifier_trainer.py` supports ImageNet with
-[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).
-Please see the following [example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
-for more information on how to use TFDS to download and prepare datasets, and
-specifically the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
-for manual download instructions.
-#### Legacy TFRecords
-Download the ImageNet dataset and convert it to TFRecord format.
-The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
-and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
-provide a few options.
-Note that the legacy ResNet runners, e.g. [resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py)
-require TFRecords whereas `classifier_trainer.py` can use both by setting the
-builder to 'records' or 'tfds' in the configurations.
-### Running on Cloud TPUs
-Note: These models will **not** work with TPUs on Colab.
-You can train image classification models on Cloud TPUs using
-[tf.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf.distribute.TPUStrategy?version=nightly).
-If you are not familiar with Cloud TPUs, it is strongly recommended that you go
-through the
-[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
-create a TPU and GCE VM.
-### Running on multiple GPU hosts
-You can also train these models on multiple hosts, each with GPUs, using
-[tf.distribute.experimental.MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).
-The easiest way to run multi-host benchmarks is to set the
-[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
-appropriately at each host.  e.g., to run using `MultiWorkerMirroredStrategy` on
-2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
-host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
-"index": i}`.  `MultiWorkerMirroredStrategy` will automatically use all the
-available GPUs at each host.
-## MNIST
-To download the data and run the MNIST sample model locally for the first time,
-run one of the following command:
-```bash
-python3 mnist_main.py \
-  --model_dir=$MODEL_DIR \
-  --data_dir=$DATA_DIR \
-  --train_epochs=10 \
-  --distribution_strategy=one_device \
-  --num_gpus=$NUM_GPUS \
-  --download
-```
-To train the model on a Cloud TPU, run the following command:
-```bash
-python3 mnist_main.py \
-  --tpu=$TPU_NAME \
-  --model_dir=$MODEL_DIR \
-  --data_dir=$DATA_DIR \
-  --train_epochs=10 \
-  --distribution_strategy=tpu \
-  --download
-```
-Note: the `--download` flag is only required the first time you run the model.
-## Classifier Trainer
-The classifier trainer is a unified framework for running image classification
-models using Keras's compile/fit methods. Experiments should be provided in the
-form of YAML files, some examples are included within the configs/examples
-folder. Please see [configs/examples](./configs/examples) for more example
-configurations.
-The provided configuration files use a per replica batch size and is scaled
-by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
-the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
-would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
-be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
-### ResNet50
-#### On GPU:
-```bash
-python3 classifier_trainer.py \
-  --mode=train_and_eval \
-  --model_type=resnet \
-  --dataset=imagenet \
-  --model_dir=$MODEL_DIR \
-  --data_dir=$DATA_DIR \
-  --config_file=configs/examples/resnet/imagenet/gpu.yaml \
-  --params_override='runtime.num_gpus=$NUM_GPUS'
-```
-To train on multiple hosts, each with GPUs attached using
-[MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy)
-please update `runtime` section in gpu.yaml
-(or override using `--params_override`) with:
-```YAML
-# gpu.yaml
-runtime:
-  distribution_strategy: 'multi_worker_mirrored'
-  worker_hosts: '$HOST1:port,$HOST2:port'
-  num_gpus: $NUM_GPUS
-  task_index: 0
-```
-By having `task_index: 0` on the first host and `task_index: 1` on the second
-and so on. `$HOST1` and `$HOST2` are the IP addresses of the hosts, and `port`
-can be chosen any free port on the hosts. Only the first host will write
-TensorBoard Summaries and save checkpoints.
-#### On TPU:
-```bash
-python3 classifier_trainer.py \
-  --mode=train_and_eval \
-  --model_type=resnet \
-  --dataset=imagenet \
-  --tpu=$TPU_NAME \
-  --model_dir=$MODEL_DIR \
-  --data_dir=$DATA_DIR \
-  --config_file=configs/examples/resnet/imagenet/tpu.yaml
-```
-### EfficientNet
-**Note: EfficientNet development is a work in progress.**
-#### On GPU:
-```bash
-python3 classifier_trainer.py \
-  --mode=train_and_eval \
-  --model_type=efficientnet \
-  --dataset=imagenet \
-  --model_dir=$MODEL_DIR \
-  --data_dir=$DATA_DIR \
-  --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
-  --params_override='runtime.num_gpus=$NUM_GPUS'
-```
-#### On TPU:
-```bash
-python3 classifier_trainer.py \
-  --mode=train_and_eval \
-  --model_type=efficientnet \
-  --dataset=imagenet \
-  --tpu=$TPU_NAME \
-  --model_dir=$MODEL_DIR \
-  --data_dir=$DATA_DIR \
-  --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
-```
-Note that the number of GPU devices can be overridden in the command line using
-`--params_overrides`. The TPU does not need this override as the device is fixed
-by providing the TPU address or name with the `--tpu` flag.
--- a/official/vision/image_classification/__init__.py
+++ b/official/vision/image_classification/__init__.py
@@ -12,3 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Deprecating the vision/image_classification folder."""
+raise ImportError(
+    'This module has been moved to official/legacy/image_classification')
--- a/orbit/utils/loop_fns.py
+++ b/orbit/utils/loop_fns.py
@@ -14,6 +14,7 @@
 """Utilities for creating loop functions."""
+from absl import logging
 from orbit.utils import tpu_summaries
 import tensorflow as tf
@@ -65,8 +66,8 @@ def create_loop_fn(step_fn):
      The final state returned by `reduce_fn`, or `None` if `state` and
      `reduce_fn` are not provided.
    """
+    step = 0
    try:
-      step = 0
      # To make sure the OutOfRangeError exception can be handled well under
      # async remote eager, we need to wrap the loop body in `async_scope`.
      with tf.experimental.async_scope():
@@ -77,6 +78,7 @@ def create_loop_fn(step_fn):
          step += 1
        return state
    except (StopIteration, tf.errors.OutOfRangeError):
+      logging.info("The dataset iterator is exhausted after %d steps.", step)
      tf.experimental.async_clear_error()
      return state

--- a/research/delf/delf/python/training/global_features/README.md
+++ b/research/delf/delf/python/training/global_features/README.md
+## Global features: CNN Image Retrieval
+This Python toolbox implements the training and testing of the approach described in the papers:
+[![Paper](http://img.shields.io/badge/paper-arXiv.1711.02512-B3181B.svg)](https://arxiv.org/abs/1711.02512)
+```
+"Fine-tuning CNN Image Retrieval with No Human Annotation",  
+Radenović F., Tolias G., Chum O.,
+TPAMI 2018 
+```
+[![Paper](http://img.shields.io/badge/paper-arXiv.1604.02426-B3181B.svg)](http://arxiv.org/abs/1604.02426)
+```
+"CNN Image Retrieval Learns from BoW: Unsupervised Fine-Tuning with Hard Examples",  
+Radenović F., Tolias G., Chum O.,
+ECCV 2016
+```
+Fine-tuned CNNs are used for global feature extraction with the goal of using
+those for image retrieval. The networks are trained on the <i>SfM120k</i>
+landmark images dataset.
+<img src="http://cmp.felk.cvut.cz/cnnimageretrieval/img/cnnimageretrieval_network_medium.png" width="100%"/>
+When initializing the network, one of the popular pre-trained architectures
+ for classification tasks (such as ResNet or VGG) is used as the network’s
+  backbone. The
+fully connected layers of such architectures are discarded, resulting in a fully
+convolutional backbone. Then, given an input image of the size [W × H × C],
+where C is the number of channels, W and H are image width and height,
+respectively; the output is a tensor X with dimensions [W' × H' × K], where
+K is the number of feature maps in the last layer. Tensor X
+can be considered as a set of the input image’s deep local features. For
+deep convolutional features, the simple aggregation approach based on global
+pooling arguably provides the best results. This method is fast, has a small
+number of parameters, and a low risk of overfitting. Keeping this in mind,
+we convert local features to a global descriptor vector using one of the
+retrieval system’s global poolings (MAC, SPoC, or GeM). After this stage,
+the feature vector is made up of the maximum activation per feature map
+with dimensionality equal to K. The final output dimensionality for the most
+common networks varies from 512 to 2048, making this image representation
+relatively compact.
+Vectors that have been pooled are subsequently L2-normalized. The obtained
+ representation is then optionally passed through the fully connected
+layers before being subjected to a
+new L2 re-normalization. The finally produced image representation allows
+comparing the resemblance of two images by simply using their inner product.
+### Install DELF library
+To be able to use this code, please follow
+[these instructions](../../../../INSTALL_INSTRUCTIONS.md) to properly install
+the DELF library.
+### Usage
+<details>
+  <summary><b>Training</b></summary><br/>
+  Navigate (```cd```) to the folder ```DELF_ROOT/delf/python/training/global_features```.
+  Example training script is located in ```DELF_ROOT/delf/python/training/global_features/train.py```.
+  ```
+  python3 train.py [--arch ARCH] [--batch_size N] [--data_root PATH]
+          [--debug] [--directory PATH] [--epochs N] [--gpu_id ID] 
+          [--image_size SIZE] [--launch_tensorboard] [--loss LOSS] 
+          [--loss_margin LM] [--lr LR] [--momentum M] [--multiscale SCALES]
+          [--neg_num N] [--optimizer OPTIMIZER] [--pool POOL] [--pool_size N]
+          [--pretrained] [--precompute_whitening DATASET] [--resume]
+          [--query_size N] [--test_datasets DATASET] [--test_freq N]
+          [--test_whiten] [--training_dataset DATASET] [--update_every N]
+          [--validation_type TYPE] [--weight_decay N] [--whitening]
+  ```
+  For detailed explanation of the options run:
+  ```
+  python3 train.py --helpfull
+  ```
+  Standard training of our models was run with the following parameters:
+  ```
+python3 train.py \
+--directory="DESTINATION_PATH" \
+--gpu_id='0' \
+--data_root="TRAINING_DATA_DIRECTORY" \
+--training_dataset='retrieval-SfM-120k' \
+--test_datasets='roxford5k,rparis6k' \
+--arch='ResNet101' \
+--pool='gem' \
+--whitening=True \
+--debug=True \
+--loss='triplet' \
+--loss_margin=0.85 \
+--optimizer='adam' \
+--lr=5e-7 --neg_num=3 --query_size=2000 \
+--pool_size=20000 --batch_size=5 \
+--image_size=1024 --epochs=100 --test_freq=5 \
+--multiscale='[1, 2**(1/2), 1/2**(1/2)]'
+```
+  **Note**: Data and networks used for training and testing are automatically downloaded when using the example training
+   script (```DELF_ROOT/delf/python/training/global_features/train.py```).
+</details>
+<details>
+<summary><b>Training logic flow</b></summary><br/>
+**Initialization phase**
+1. Checking if required datasets are downloaded and automatically download them (both test and train/val) if they are 
+not present in the data folder.
+1. Setting up the logging and creating a logging/checkpoint directory.
+1. Initialize model according to the user-provided parameters (architecture
+/pooling/whitening/pretrained etc.).
+1. Defining loss (contrastive/triplet) according to the user parameters.
+1. Defining optimizer (Adam/SGD with learning rate/weight decay/momentum) according to the user parameters.
+1. Initializing CheckpointManager and resuming from the latest checkpoint if the resume flag is set.
+1. Launching Tensorboard if the flag is set.
+1. Initializing training (and validation, if required) datasets.
+1. Freezing the BatchNorm weights update: since training is done for one image at a time, the statistics would not be per batch, hence we choose freezing (i.e., using the pretrained ImageNet statistics).
+1. Evaluating the network performance before training (on the test datasets).
+**Training phase**
+The main training loop (for the required number of epochs):
+1. Finding the hard negative pairs in the dataset (using the forward pass through the model)
+1. Creating the training dataset from a generator which changes every epoch. Each
+ element in the dataset consists of 1 x Positive image, 1 x Query image,
+ N x Hard negative images (N is specified by the `neg_num` flag), and an array
+ specifying the Positive (-1), Query (0), Negative (1) images.
+1. Performing one training step and calculating the final epoch loss.
+1. If validation is required, finding hard negatives in the validation set,
+ which has the same structure as the training set. Performing one validation
+ step and calculating the loss.
+1. Evaluating on the test datasets every `test_freq` epochs.
+1. Saving checkpoint (optimizer and the model weights).
+</details>
+## Exporting the Trained Model
+Assuming the training output (the TensorFlow checkpoint) is located in the
+`--directory` path, the following command exports the model:
+```
+python3 model/export_CNN_global.py \
+        [--ckpt_path PATH] [--export_path PATH] [--input_scales_list LIST]
+        [--multi_scale_pool_type TYPE] [--normalize_global_descriptor BOOL] 
+        [--arch ARCHITECTURE] [--pool POOLING] [--whitening BOOL]
+```
+*NOTE:* Path to the checkpoint must include .h5 file.
+## Testing the trained model
+After the trained model has been exported, it can be used to extract global
+features similarly as for the DELG model. Please follow 
+[these instructions](https://github.com/tensorflow/models/tree/master/research/delf/delf/python/training#testing-the-trained-model).
+After training with the standard training setup for 100 epochs, the
+following results are obtained on the Roxford and RParis datasets under
+single-scale evaluation:
+```
+>> roxford5k: mAP E: 74.88, M: 58.28, H: 30.4
+>> roxford5k: mP@k[1, 5, 10] E: [89.71 84.8  79.07],
+                             M: [91.43 84.67 78.24],
+                             H: [68.57 53.29 43.29]
+>> rparis6k: mAP E: 89.21, M: 73.69, H: 49.1
+>> rparis6k: mP@k[1, 5, 10] E: [98.57 97.43 95.57],
+                            M: [98.57 99.14 98.14],
+                            H: [94.29 90.   87.29]
+```
\ No newline at end of file
--- a/research/delf/delf/python/training/model/export_CNN_global.py
+++ b/research/delf/delf/python/training/model/export_CNN_global.py
+# Lint as: python3
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Export global CNN feature tensorflow inference model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+from absl import app
+from absl import flags
+import tensorflow as tf
+from delf.python.training.model import global_model
+from delf.python.training.model import export_model_utils
+FLAGS = flags.FLAGS
+flags.DEFINE_string('ckpt_path', None, help='Path to saved checkpoint.')
+flags.DEFINE_string('export_path', None,
+                    help='Path where model will be exported.')
+flags.DEFINE_list(
+        'input_scales_list', None,
+        'Optional input image scales to use. If None (default), an input '
+        'end-point '
+        '"input_scales" is added for the exported model. If not None, the '
+        'specified list of floats will be hard-coded as the desired input '
+        'scales.')
+flags.DEFINE_enum(
+        'multi_scale_pool_type', 'None', ['None', 'average', 'sum'],
+        "If 'None' (default), the model is exported with an output end-point "
+        "'global_descriptors', where the global descriptor for each scale is "
+        "returned separately. If not 'None', the global descriptor of each "
+        "scale is"
+        ' pooled and a 1D global descriptor is returned, with output end-point '
+        "'global_descriptor'.")
+flags.DEFINE_boolean('normalize_global_descriptor', False,
+                     'If True, L2-normalizes global descriptor.')
+# Network architecture and initialization options.
+flags.DEFINE_string('arch', 'ResNet101',
+                    'model architecture (default: ResNet101)')
+flags.DEFINE_string('pool', 'gem', 'pooling options (default: gem)')
+flags.DEFINE_boolean('whitening', False,
+                     'train model with learnable whitening (linear layer) '
+                     'after the pooling')
+def _NormalizeImages(images, *args, **kwargs):
+  """Normalize pixel values in image.
+  Applies the Keras ImageNet preprocessing in 'caffe' mode to `images`.
+  Args:
+    images: `Tensor`, images to normalize.
+    *args: Ignored.
+    **kwargs: Ignored. `ExtractGlobalFeatures` calls the normalization function
+      with the keyword arguments `pixel_value_offset` and `pixel_value_scale`,
+      which this preprocessing does not use.
+  Returns:
+    normalized_images: `Tensor`, normalized images.
+  """
+  del args, kwargs  # Unused by the 'caffe' preprocessing.
+  # `preprocess_input` is documented to take floating point input, while the
+  # exported signature feeds uint8 images, so cast first.
+  images = tf.cast(images, tf.float32)
+  # The preprocessed tensor is the return value of `preprocess_input`; the
+  # input tensor itself is left unchanged, so the result must be returned.
+  return tf.keras.applications.imagenet_utils.preprocess_input(
+      images, mode='caffe')
+class _ExtractModule(tf.Module):
+  """Helper module to build and save global feature model."""
+  def __init__(self,
+               multi_scale_pool_type='None',
+               normalize_global_descriptor=False,
+               input_scales_tensor=None):
+    """Initialization of global feature model.
+    The network itself is built from the `arch`, `pool` and `whitening` flags.
+    Args:
+      multi_scale_pool_type: Type of multi-scale pooling to perform.
+      normalize_global_descriptor: Whether to L2-normalize global
+        descriptor.
+      input_scales_tensor: If None, the exported function to be used
+        should be ExtractFeatures, where an input end-point "input_scales" is
+        added for the exported model. If not None, the specified 1D tensor of
+        floats will be hard-coded as the desired input scales, in conjunction
+        with ExtractFeaturesFixedScales.
+    """
+    self._multi_scale_pool_type = multi_scale_pool_type
+    self._normalize_global_descriptor = normalize_global_descriptor
+    if input_scales_tensor is None:
+      self._input_scales_tensor = []
+    else:
+      self._input_scales_tensor = input_scales_tensor
+    self._model = global_model.GlobalFeatureNet(
+        FLAGS.arch, FLAGS.pool, FLAGS.whitening, pretrained=False)
+  def LoadWeights(self, checkpoint_path):
+    """Loads the weights stored at `checkpoint_path` into the wrapped model.
+    Args:
+      checkpoint_path: Path passed to the model's `load_weights`.
+    """
+    self._model.load_weights(checkpoint_path)
+  @tf.function(input_signature=[
+      tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8,
+                    name='input_image'),
+      tf.TensorSpec(shape=[None], dtype=tf.float32, name='input_scales'),
+      tf.TensorSpec(shape=[None], dtype=tf.int32,
+                    name='input_global_scales_ind')
+  ])
+  def ExtractFeatures(self, input_image, input_scales,
+                      input_global_scales_ind):
+    """Extracts global descriptors for `input_image`.
+    Args:
+      input_image: uint8 `Tensor` of shape [None, None, 3].
+      input_scales: 1D float32 `Tensor`, forwarded to `ExtractGlobalFeatures`
+        as the image scales.
+      input_global_scales_ind: 1D int32 `Tensor`, forwarded to
+        `ExtractGlobalFeatures` as the global scale indices.
+    Returns:
+      Dict with the single key 'global_descriptors' holding the extracted
+      features.
+    """
+    extracted_features = export_model_utils.ExtractGlobalFeatures(
+        input_image,
+        input_scales,
+        input_global_scales_ind,
+        lambda x: self._model(x, training=False),
+        multi_scale_pool_type=self._multi_scale_pool_type,
+        normalize_global_descriptor=self._normalize_global_descriptor,
+        # Pass the function itself; calling it here with no arguments would
+        # raise a TypeError when the function is traced.
+        normalization_function=_NormalizeImages)
+    named_output_tensors = {}
+    named_output_tensors['global_descriptors'] = tf.identity(
+        extracted_features, name='global_descriptors')
+    return named_output_tensors
+  @tf.function(input_signature=[
+      tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8, name='input_image')
+  ])
+  def ExtractFeaturesFixedScales(self, input_image):
+    """Extracts global descriptors using the scales given at construction.
+    Args:
+      input_image: uint8 `Tensor` of shape [None, None, 3].
+    Returns:
+      The output of `ExtractFeatures` called with the stored input scales and
+      the indices of all of those scales.
+    """
+    return self.ExtractFeatures(input_image, self._input_scales_tensor,
+                                tf.range(tf.size(self._input_scales_tensor)))
+def main(argv):
+  """Builds the global feature model, loads its weights and exports it.
+  Args:
+    argv: Command-line arguments left after flag parsing; only the program
+      name is expected.
+  Raises:
+    app.UsageError: If extra positional arguments are given, or if
+      `--ckpt_path` or `--export_path` is not set.
+    ValueError: If `--export_path` already exists.
+  """
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+  # Both flags default to None. Report a missing flag explicitly instead of
+  # failing later, e.g. with a TypeError from `os.path.exists(None)`.
+  if FLAGS.ckpt_path is None:
+    raise app.UsageError('Flag --ckpt_path must be specified.')
+  if FLAGS.export_path is None:
+    raise app.UsageError('Flag --export_path must be specified.')
+  export_path = FLAGS.export_path
+  if os.path.exists(export_path):
+    raise ValueError('export_path %s already exists.' % export_path)
+  if FLAGS.input_scales_list is None:
+    input_scales_tensor = None
+  else:
+    input_scales_tensor = tf.constant(
+        [float(s) for s in FLAGS.input_scales_list],
+        dtype=tf.float32,
+        shape=[len(FLAGS.input_scales_list)],
+        name='input_scales')
+  module = _ExtractModule(FLAGS.multi_scale_pool_type,
+                          FLAGS.normalize_global_descriptor,
+                          input_scales_tensor)
+  # Load the weights.
+  checkpoint_path = FLAGS.ckpt_path
+  module.LoadWeights(checkpoint_path)
+  print('Checkpoint loaded from ', checkpoint_path)
+  # Save the module. Without hard-coded scales the served function takes the
+  # scales as inputs; otherwise the fixed-scale variant is served.
+  if FLAGS.input_scales_list is None:
+    served_function = module.ExtractFeatures
+  else:
+    served_function = module.ExtractFeaturesFixedScales
+  tf.saved_model.save(
+      module, export_path, signatures={'serving_default': served_function})
+if __name__ == '__main__':
+  app.run(main)
--- a/research/delf/delf/python/training/model/export_model_utils.py
+++ b/research/delf/delf/python/training/model/export_model_utils.py
@@ -183,7 +183,8 @@ def ExtractGlobalFeatures(image,
                          global_scales_ind,
                          model_fn,
                          multi_scale_pool_type='None',
-                          normalize_global_descriptor=False):
+                          normalize_global_descriptor=False,
+                          normalization_function=gld.NormalizeImages):
  """Extract global features for input image.
  Args:
@@ -201,6 +202,7 @@ def ExtractGlobalFeatures(image,
      and a 1D global descriptor is returned.
    normalize_global_descriptor: If True, output global descriptors are
      L2-normalized.
+    normalization_function: Function used for normalization.
  Returns:
    global_descriptors: If `multi_scale_pool_type` is 'None', returns a [S, D]
@@ -213,7 +215,7 @@ def ExtractGlobalFeatures(image,
  """
  original_image_shape_float = tf.gather(
      tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])
-  image_tensor = gld.NormalizeImages(
+  image_tensor = normalization_function(
      image, pixel_value_offset=128.0, pixel_value_scale=128.0)
  image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')

--- a/research/object_detection/colab_tutorials/generate_ssd_anchor_box_aspect_ratios_using_k_means_clustering.ipynb
+++ b/research/object_detection/colab_tutorials/generate_ssd_anchor_box_aspect_ratios_using_k_means_clustering.ipynb
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
  "cells": [
    {
      "cell_type": "markdown",
@@ -55,20 +39,22 @@
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "hCQlBGJkZTR2"
      },
+      "outputs": [],
      "source": [
        "import tensorflow as tf"
-      ],
+      ]
-      "execution_count": null,
-      "outputs": []
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "aw-Ba-5RUhMs"
      },
+      "outputs": [],
      "source": [
        "# Install the tensorflow Object Detection API...\n",
        "# If you're running this offline, you also might need to install the protobuf-compiler:\n",
@@ -87,9 +73,7 @@
        "\n",
        "# Test the installation\n",
        "! python object_detection/builders/model_builder_tf2_test.py"
-      ],
+      ]
-      "execution_count": null,
-      "outputs": []
    },
    {
      "cell_type": "markdown",
@@ -113,19 +97,21 @@
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "sKYfhq7CKZ4B"
      },
+      "outputs": [],
      "source": [
        "%mkdir /content/dataset\n",
        "%cd /content/dataset\n",
        "! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz\n",
        "! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz\n",
        "! tar zxf images.tar.gz\n",
-        "! tar zxf annotations.tar.gz"
+        "! tar zxf annotations.tar.gz\n",
-      ],
+        "\n",
-      "execution_count": null,
+        "XML_PATH = '/content/dataset/annotations/xmls'"
-      "outputs": []
+      ]
    },
    {
      "cell_type": "markdown",
@@ -133,28 +119,53 @@
        "id": "44vtL0nsAqXg"
      },
      "source": [
-        "In this case, we want to reduce the PETS dataset to match the collection of cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)):\n",
+        "Because the following k-means script will process all XML annotations, we want to reduce the PETS dataset to include only the cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)). So we delete all annotation files that are **not** Abyssinian or American bulldog:\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
-        "id": "8gcUoBU2K_s7"
+        "id": "ih48zFbl6jM7"
      },
+      "outputs": [],
      "source": [
-        "! cp /content/dataset/annotations/list.txt /content/dataset/annotations/list_petsdataset.txt\n",
+        "! (cd /content/dataset/annotations/xmls/ \u0026\u0026 \\\n",
-        "! cp /content/dataset/annotations/trainval.txt /content/dataset/annotations/trainval_petsdataset.txt\n",
+        "  find . ! \\( -name 'Abyssinian*' -o -name 'american_bulldog*' \\) -type f -exec rm -f {} \\; )"
-        "! cp /content/dataset/annotations/test.txt /content/dataset/annotations/test_petsdataset.txt\n",
+      ]
-        "! grep \"Abyssinian\" /content/dataset/annotations/list_petsdataset.txt >  /content/dataset/annotations/list.txt\n",
+    },
-        "! grep \"american_bulldog\" /content/dataset/annotations/list_petsdataset.txt >> /content/dataset/annotations/list.txt\n",
+    {
-        "! grep \"Abyssinian\" /content/dataset/annotations/trainval_petsdataset.txt > /content/dataset/annotations/trainval.txt\n",
+      "cell_type": "markdown",
-        "! grep \"american_bulldog\" /content/dataset/annotations/trainval_petsdataset.txt >> /content/dataset/annotations/trainval.txt\n",
+      "metadata": {
-        "! grep \"Abyssinian\" /content/dataset/annotations/test_petsdataset.txt > /content/dataset/annotations/test.txt\n",
+        "id": "KG8uraCK-RSM"
-        "! grep \"american_bulldog\" /content/dataset/annotations/test_petsdataset.txt >> /content/dataset/annotations/test.txt"
+      },
-      ],
+      "source": [
+        "### Upload your own dataset"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "m0bh_iKD-Xz4"
+      },
+      "source": [
+        "To generate the anchor box ratios for your own dataset, upload a ZIP file with your annotation files (click the **Files** tab on the left, and drag-drop your ZIP file there), and then uncomment the following code to unzip it and specify the path to the directory with your annotation files:"
+      ]
+    },
+    {
+      "cell_type": "code",
      "execution_count": null,
-      "outputs": []
+      "metadata": {
+        "id": "M0j_vWDR3WkK"
+      },
+      "outputs": [],
+      "source": [
+        "# %cd /content/\n",
+        "# !unzip dataset.zip\n",
+        "\n",
+        "# XML_PATH = '/content/dataset/annotations/xmls'"
+      ]
    },
    {
      "cell_type": "markdown",
@@ -188,23 +199,24 @@
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "vCB8Dfs0Xlyv"
      },
+      "outputs": [],
      "source": [
        "import sys\n",
-        "import glob\n",
+        "import os\n",
        "import numpy as np\n",
        "import xml.etree.ElementTree as ET\n",
        "\n",
        "from sklearn.cluster import KMeans\n",
        "\n",
-        "def xml_to_boxes(path, classes, rescale_width=None, rescale_height=None):\n",
+        "def xml_to_boxes(path, rescale_width=None, rescale_height=None):\n",
        "  \"\"\"Extracts bounding-box widths and heights from ground-truth dataset.\n",
        "\n",
        "  Args:\n",
        "  path : Path to .xml annotation files for your dataset.\n",
-        "  classes : List of classes that are part of dataset.\n",
        "  rescale_width : Scaling factor to rescale width of bounding box.\n",
        "  rescale_height : Scaling factor to rescale height of bounding box.\n",
        "\n",
@@ -213,23 +225,20 @@
        "  \"\"\"\n",
        "\n",
        "  xml_list = []\n",
-        "  for clss in classes:\n",
+        "  filenames = os.listdir(os.path.join(path))\n",
-        "    for xml_file in glob.glob(path + '/'+clss+'*'):\n",
+        "  filenames = [os.path.join(path, f) for f in filenames if (f.endswith('.xml'))]\n",
-        "      if xml_file.endswith('.xml'):\n",
+        "  for xml_file in filenames:\n",
-        "        tree = ET.parse(xml_file)\n",
+        "    tree = ET.parse(xml_file)\n",
-        "        root = tree.getroot()\n",
+        "    root = tree.getroot()\n",
-        "        for member in root.findall('object'):\n",
+        "    for member in root.findall('object'):\n",
-        "          bndbox = member.find('bndbox')\n",
+        "      bndbox = member.find('bndbox')\n",
-        "          bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
+        "      bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
-        "          bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
+        "      bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
-        "          if rescale_width and rescale_height:\n",
+        "      if rescale_width and rescale_height:\n",
-        "            size = root.find('size')\n",
+        "        size = root.find('size')\n",
-        "            bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
+        "        bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
-        "            bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
+        "        bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
-        "\n",
+        "      xml_list.append([bbox_width, bbox_height])\n",
-        "          xml_list.append([bbox_width, bbox_height])\n",
-        "      else:\n",
-        "        continue\n",
        "  bboxes = np.array(xml_list)\n",
        "  return bboxes\n",
        "\n",
@@ -275,10 +284,10 @@
        "  assert len(bboxes), \"You must provide bounding boxes\"\n",
        "\n",
        "  normalized_bboxes = bboxes / np.sqrt(bboxes.prod(axis=1, keepdims=True))\n",
-        "\n",
+        "  \n",
-        "   # Using kmeans to find centroids of the width/height clusters\n",
+        "  # Using kmeans to find centroids of the width/height clusters\n",
        "  kmeans = KMeans(\n",
-        "      init='random', n_clusters=num_aspect_ratios,random_state=0, max_iter=kmeans_max_iter)\n",
+        "      init='random', n_clusters=num_aspect_ratios, random_state=0, max_iter=kmeans_max_iter)\n",
        "  kmeans.fit(X=normalized_bboxes)\n",
        "  ar = kmeans.cluster_centers_\n",
        "\n",
@@ -292,9 +301,7 @@
        "  aspect_ratios = [w/h for w,h in ar]\n",
        "\n",
        "  return aspect_ratios, avg_iou_perc"
-      ],
+      ]
-      "execution_count": null,
-      "outputs": []
    },
    {
      "cell_type": "markdown",
@@ -323,13 +330,12 @@
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "cNw-vX3nfl1g"
      },
+      "outputs": [],
      "source": [
-        "classes  = ['Abyssinian','american_bulldog']\n",
-        "xml_path = '/content/dataset/annotations/xmls'\n",
-        "\n",
        "# Tune this based on your accuracy/speed goals as described above\n",
        "num_aspect_ratios = 4 # can be [2,3,4,5,6]\n",
        "\n",
@@ -342,8 +348,7 @@
        "height = 320\n",
        "\n",
        "# Get the ground-truth bounding boxes for our dataset\n",
-        "bboxes = xml_to_boxes(path=xml_path, classes=classes,\n",
+        "bboxes = xml_to_boxes(path=XML_PATH, rescale_width=width, rescale_height=height)\n",
-        "                      rescale_width=width, rescale_height=height)\n",
        "\n",
        "aspect_ratios, avg_iou_perc =  kmeans_aspect_ratios(\n",
        "                                      bboxes=bboxes,\n",
@@ -354,9 +359,7 @@
        "\n",
        "print('Aspect ratios generated:', [round(ar,2) for ar in aspect_ratios])\n",
        "print('Average IOU with anchors:', avg_iou_perc)"
-      ],
+      ]
-      "execution_count": null,
-      "outputs": []
    },
    {
      "cell_type": "markdown",
@@ -378,9 +381,11 @@
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "AlMffd3rgKW2"
      },
+      "outputs": [],
      "source": [
        "import tensorflow as tf\n",
        "from google.protobuf import text_format\n",
@@ -404,9 +409,7 @@
        "    f.write(config_text)\n",
        "# Check for updated aspect ratios in the config\n",
        "!cat /content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config"
-      ],
+      ]
-      "execution_count": null,
-      "outputs": []
    },
    {
      "cell_type": "markdown",
@@ -441,5 +444,22 @@
        "\n"
      ]
    }
-  ]
+  ],
-}
+  "metadata": {
\ No newline at end of file
+    "colab": {
+      "collapsed_sections": [],
+      "name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
+      "provenance": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/research/object_detection/core/model.py
+++ b/research/object_detection/core/model.py
@@ -89,6 +89,7 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
    """
    self._num_classes = num_classes
    self._groundtruth_lists = {}
+    self._training_step = None
    super(DetectionModel, self).__init__()
@@ -132,6 +133,13 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
    """
    return field in self._groundtruth_lists
+  @property
+  def training_step(self):
+    """Returns the training step stored on the model.
+    Raises:
+      ValueError: If no training step has been stored (the value is None).
+    """
+    if self._training_step is None:
+      raise ValueError('Training step was not provided to the model.')
+    return self._training_step
  @staticmethod
  def get_side_inputs(features):
    """Get side inputs from input features.
@@ -318,7 +326,9 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
      groundtruth_verified_neg_classes=None,
      groundtruth_not_exhaustive_classes=None,
      groundtruth_keypoint_depths_list=None,
-      groundtruth_keypoint_depth_weights_list=None):
+      groundtruth_keypoint_depth_weights_list=None,
+      groundtruth_image_classes=None,
+      training_step=None):
    """Provide groundtruth tensors.
    Args:
@@ -389,6 +399,11 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
      groundtruth_keypoint_depth_weights_list: a list of 2-D tf.float32 tensors
        of shape [num_boxes, num_keypoints] containing the weights of the
        relative depths.
+      groundtruth_image_classes: A list of 1-D tf.float32 tensors of shape
+        [num_classes], containing label indices encoded as k-hot of the classes
+        that are present or not present in the image.
+      training_step: An integer denoting the current training step. This is
+        useful when models want to anneal loss terms.
    """
    self._groundtruth_lists[fields.BoxListFields.boxes] = groundtruth_boxes_list
    self._groundtruth_lists[
@@ -463,11 +478,17 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
      self._groundtruth_lists[
          fields.InputDataFields
          .groundtruth_verified_neg_classes] = groundtruth_verified_neg_classes
+    if groundtruth_image_classes:
+      self._groundtruth_lists[
+          fields.InputDataFields
+          .groundtruth_image_classes] = groundtruth_image_classes
    if groundtruth_not_exhaustive_classes:
      self._groundtruth_lists[
          fields.InputDataFields
          .groundtruth_not_exhaustive_classes] = (
              groundtruth_not_exhaustive_classes)
+    if training_step is not None:
+      self._training_step = training_step
  @abc.abstractmethod
  def regularization_losses(self):

--- a/research/object_detection/core/target_assigner.py
+++ b/research/object_detection/core/target_assigner.py
@@ -925,7 +925,9 @@ class CenterNetCenterHeatmapTargetAssigner(object):
               compute_heatmap_sparse=False,
               keypoint_class_id=None,
               keypoint_indices=None,
-               keypoint_weights_for_center=None):
+               keypoint_weights_for_center=None,
+               box_heatmap_type='adaptive_gaussian',
+               heatmap_exponent=1.0):
    """Initializes the target assigner.
    Args:
@@ -947,6 +949,17 @@ class CenterNetCenterHeatmapTargetAssigner(object):
        the number of keypoints. The object center is calculated by the weighted
        mean of the keypoint locations. If not provided, the object center is
        determined by the center of the bounding box (default behavior).
+       box_heatmap_type: str, the algorithm used to compute the box heatmap,
+         used when calling the assign_center_targets_from_boxes method.
+         Options are:
+         'adaptive_gaussian': A box-size adaptive Gaussian from the original
+           paper[1].
+         'iou': IOU based heatmap target where each point is assigned an IOU
+           based on its location, assuming that it produced a box centered at
+           that point with the correct size.
+       heatmap_exponent: float, The generated heatmap is exponentiated with
+         this number. A number > 1 will result in the heatmap being more peaky
+         and a number < 1 will cause the heatmap to be more spread out.
    """
    self._stride = stride
@@ -955,6 +968,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
    self._keypoint_class_id = keypoint_class_id
    self._keypoint_indices = keypoint_indices
    self._keypoint_weights_for_center = keypoint_weights_for_center
+    self._box_heatmap_type = box_heatmap_type
+    self._heatmap_exponent = heatmap_exponent
  def assign_center_targets_from_boxes(self,
                                       height,
@@ -1018,19 +1033,29 @@ class CenterNetCenterHeatmapTargetAssigner(object):
                                             self._min_overlap)
      # Apply the Gaussian kernel to the center coordinates. Returned heatmap
      # has shape of [out_height, out_width, num_classes]
-      heatmap = ta_utils.coordinates_to_heatmap(
-          y_grid=y_grid,
+      if self._box_heatmap_type == 'adaptive_gaussian':
-          x_grid=x_grid,
+        heatmap = ta_utils.coordinates_to_heatmap(
-          y_coordinates=y_center,
+            y_grid=y_grid,
-          x_coordinates=x_center,
+            x_grid=x_grid,
-          sigma=sigma,
+            y_coordinates=y_center,
-          channel_onehot=class_targets,
+            x_coordinates=x_center,
-          channel_weights=weights,
+            sigma=sigma,
-          sparse=self._compute_heatmap_sparse)
+            channel_onehot=class_targets,
+            channel_weights=weights,
+            sparse=self._compute_heatmap_sparse)
+      elif self._box_heatmap_type == 'iou':
+        heatmap = ta_utils.coordinates_to_iou(y_grid, x_grid, boxes,
+                                              class_targets, weights)
+      else:
+        raise ValueError(f'Unknown heatmap type - {self._box_heatmap_type}')
      heatmaps.append(heatmap)
    # Return the stacked heatmaps over the batch.
-    return tf.stack(heatmaps, axis=0)
+    stacked_heatmaps = tf.stack(heatmaps, axis=0)
+    return (tf.pow(stacked_heatmaps, self._heatmap_exponent) if
+            self._heatmap_exponent != 1.0 else stacked_heatmaps)
  def assign_center_targets_from_keypoints(self,
                                           height,

--- a/research/object_detection/core/target_assigner_test.py
+++ b/research/object_detection/core/target_assigner_test.py
@@ -1678,6 +1678,66 @@ class CenterNetBoxTargetAssignerTest(test_case.TestCase):
    np.testing.assert_array_equal(preds, [[1, 2], [3, 4], [5, 6], [7, 8]])
+class CenterNetIOUTargetAssignerTest(test_case.TestCase):
+  def setUp(self):
+    super(CenterNetIOUTargetAssignerTest, self).setUp()
+    self._box_center = [0.0, 0.0, 1.0, 1.0]
+    self._box_center_small = [0.25, 0.25, 0.75, 0.75]
+    self._box_lower_left = [0.5, 0.0, 1.0, 0.5]
+    self._box_center_offset = [0.1, 0.05, 1.0, 1.0]
+    self._box_odd_coordinates = [0.1625, 0.2125, 0.5625, 0.9625]
+  def test_center_location(self):
+    """Test that the centers are at the correct location."""
+    def graph_fn():
+      box_batch = [tf.constant([self._box_center, self._box_lower_left]),
+                   tf.constant([self._box_lower_left, self._box_center])]
+      classes = [
+          tf.one_hot([0, 1], depth=4),
+          tf.one_hot([2, 2], depth=4)
+      ]
+      assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
+          4, box_heatmap_type='iou')
+      targets = assigner.assign_center_targets_from_boxes(
+          80, 80, box_batch, classes)
+      return targets
+    targets = self.execute(graph_fn, [])
+    self.assertEqual((10, 10), _array_argmax(targets[0, :, :, 0]))
+    self.assertAlmostEqual(1.0, targets[0, 10, 10, 0])
+    self.assertEqual((15, 5), _array_argmax(targets[0, :, :, 1]))
+    self.assertAlmostEqual(1.0, targets[0, 15, 5, 1])
+    self.assertAlmostEqual(1.0, targets[1, 15, 5, 2])
+    self.assertAlmostEqual(1.0, targets[1, 10, 10, 2])
+    self.assertAlmostEqual(0.0, targets[1, 0, 19, 1])
+  def test_exponent(self):
+    """Test that heatmap_exponent is applied to the heatmap targets."""
+    def graph_fn():
+      box_batch = [tf.constant([self._box_center, self._box_lower_left]),
+                   tf.constant([self._box_lower_left, self._box_center])]
+      classes = [
+          tf.one_hot([0], depth=2),
+      ]
+      assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
+          1, box_heatmap_type='iou')
+      targets = assigner.assign_center_targets_from_boxes(
+          4, 4, box_batch, classes)
+      assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
+          1, box_heatmap_type='iou', heatmap_exponent=0.5)
+      targets_pow = assigner.assign_center_targets_from_boxes(
+          4, 4, box_batch, classes)
+      return targets, targets_pow
+    targets, targets_pow = self.execute(graph_fn, [])
+    self.assertLess(targets[0, 2, 3, 0], 1.0)
+    self.assertLess(targets_pow[0, 2, 3, 0], 1.0)
+    self.assertAlmostEqual(targets[0, 2, 3, 0], targets_pow[0, 2, 3, 0] ** 2)
 class CenterNetKeypointTargetAssignerTest(test_case.TestCase):
  def test_keypoint_heatmap_targets(self):

--- a/research/object_detection/g3doc/running_on_mobile_tensorflowlite.md
+++ b/research/object_detection/g3doc/running_on_mobile_tensorflowlite.md
@@ -10,12 +10,12 @@ devices. It enables on-device machine learning inference with low latency and a
 small binary size. TensorFlow Lite uses many techniques for this such as
 quantized kernels that allow smaller and faster (fixed-point math) models.
-For this section, you will need to build [TensorFlow from
+For this section, you will need to build
-source](https://www.tensorflow.org/install/install_sources) to get the
+[TensorFlow from source](https://www.tensorflow.org/install/install_sources) to
-TensorFlow Lite support for the SSD model. At this time only SSD models are supported.
+get the TensorFlow Lite support for the SSD model. At this time only SSD models
-Models like faster_rcnn are not supported at this time. You will also need to install the
+are supported. Models like faster_rcnn are not supported at this time. You will
-[bazel build
+also need to install the
-tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel).
+[bazel build tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel).
 To make these commands easier to run, let’s set up some environment variables:
@@ -96,7 +96,17 @@ bazel run -c opt tensorflow/lite/python:tflite_convert -- \
 --allow_custom_ops
 ```
-# Running our model on Android
+## Adding Metadata to the model
+To make it easier to use tflite models on mobile, you will need to add
+[metadata](https://www.tensorflow.org/lite/convert/metadata) to your model and
+also
+[pack](https://www.tensorflow.org/lite/convert/metadata#pack_metadata_and_associated_files_into_the_model)
+the associated labels file to it.
+If you need more information, this process is also explained in the
+[Metadata writer Object detectors documentation](https://www.tensorflow.org/lite/convert/metadata_writer_tutorial#object_detectors).
+## Running our model on Android
 To run our TensorFlow Lite model on device, we will use Android Studio to build
 and run the TensorFlow Lite detection example with the new model. The example is
@@ -119,8 +129,8 @@ cp /tmp/tflite/detect.tflite \
  $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
 ```
-You will also need to copy your new labelmap labelmap.txt to the assets
+It's important to notice that the labels file should be packed in the model (as
-directory.
+mentioned previously).
 We will now edit the gradle build file to use these assets. First, open the
 `build.gradle` file
@@ -128,17 +138,15 @@ We will now edit the gradle build file to use these assets. First, open the
 out the model download script to avoid your assets being overwritten: `// apply
 from:'download_model.gradle'`
-If your model is named `detect.tflite`, and your labels file `labelmap.txt`, the
+If your model is named `detect.tflite`, the example will use it automatically as
-example will use them automatically as long as they've been properly copied into
+long as it has been properly copied into the base assets directory. If you need
-the base assets directory. If you need to use a custom path or filename, open up
+to use a custom path or filename, open up the
-the
 $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
-file in a text editor and find the definition of TF_OD_API_LABELS_FILE. Update
+file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Note that
-this path to point to your new label map file:
+if your model is quantized, the flag TF_OD_API_IS_QUANTIZED is set to true, and
-"labels_list.txt". Note that if your model is quantized,
+if your model is floating point, the flag TF_OD_API_IS_QUANTIZED is set to
-the flag TF_OD_API_IS_QUANTIZED is set to true, and if your model is floating
+false. This new section of DetectorActivity.java should now look as follows for
-point, the flag TF_OD_API_IS_QUANTIZED is set to false. This new section of
+a quantized model:
-DetectorActivity.java should now look as follows for a quantized model:
 ```shell
  private static final boolean TF_OD_API_IS_QUANTIZED = true;

--- a/research/object_detection/g3doc/running_on_mobile_tf2.md
+++ b/research/object_detection/g3doc/running_on_mobile_tf2.md
@@ -92,27 +92,15 @@ converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8,
 converter.representative_dataset = <...>
 ```
-### Step 3: Add Metadata
+### Step 3: add Metadata to the model
-The model needs to be packed with
+To make it easier to use tflite models on mobile, you will need to add
-[TFLite Metadata](https://www.tensorflow.org/lite/convert/metadata) to enable
+[metadata](https://www.tensorflow.org/lite/convert/metadata) to your model and
-easy integration into mobile apps using the
+also
-[TFLite Task Library](https://www.tensorflow.org/lite/inference_with_metadata/task_library/object_detector).
+[pack](https://www.tensorflow.org/lite/convert/metadata#pack_metadata_and_associated_files_into_the_model)
-This metadata helps the inference code perform the correct pre & post processing
+the associated labels file to it.
-as required by the model. Use the following code to create the metadata.
+If you need more information, this process is also explained in the
+[Image classification sample](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/metadata).
-```python
-from tflite_support.metadata_writers import object_detector
-from tflite_support.metadata_writers import writer_utils
-writer = object_detector.MetadataWriter.create_for_inference(
-    writer_utils.load_file(_TFLITE_MODEL_PATH), input_norm_mean=[0],
-    input_norm_std=[255], label_file_paths=[_TFLITE_LABEL_PATH])
-writer_utils.save_file(writer.populate(), _TFLITE_MODEL_WITH_METADATA_PATH)
-```
-See the TFLite Metadata Writer API [documentation](https://www.tensorflow.org/lite/convert/metadata_writer_tutorial#object_detectors)
-for more details.
 ## Running our model on Android
@@ -142,9 +130,9 @@ the
 that support API >= 21. Additional details are available on the
 [TensorFlow Lite example page](https://github.com/tensorflow/examples/tree/master/lite/examples/object_detection/android).
-Next we need to point the app to our new detect.tflite file and give it the
+Next we need to point the app to our new detect.tflite file. Specifically, we
-names of our new labels. Specifically, we will copy our TensorFlow Lite
+will copy our TensorFlow Lite flatbuffer to the app assets directory with the
-model with metadata to the app assets directory with the following command:
+following command:
 ```shell
 mkdir $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
@@ -152,21 +140,30 @@ cp /tmp/tflite/detect.tflite \
  $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
 ```
+It's important to notice that the labels file should be packed in the model (as
+mentioned in Step 3).
 We will now edit the gradle build file to use these assets. First, open the
 `build.gradle` file
 `$TF_EXAMPLES/lite/examples/object_detection/android/app/build.gradle`. Comment
-out the model download script to avoid your assets being overwritten:
+out the model download script to avoid your assets being overwritten: `// apply
+from:'download_model.gradle'`
-```shell
-// apply from:'download_model.gradle'
-```
 If your model is named `detect.tflite`, the example will use it automatically as
 long as it has been properly copied into the base assets directory. If you need
 to use a custom path or filename, open up the
 $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
-file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Update
+file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Note that
-this path to point to your new model file.
+if your model is quantized, the flag TF_OD_API_IS_QUANTIZED is set to true, and
+if your model is floating point, the flag TF_OD_API_IS_QUANTIZED is set to
+false. This new section of DetectorActivity.java should now look as follows for
+a quantized model:
+```shell
+  private static final boolean TF_OD_API_IS_QUANTIZED = true;
+  private static final String TF_OD_API_MODEL_FILE = "detect.tflite";
+  private static final String TF_OD_API_LABELS_FILE = "labels_list.txt";
+```
 Once you’ve copied the TensorFlow Lite model and edited the gradle build script
 to not use the downloaded assets, you can build and deploy the app using the

--- a/research/object_detection/inputs.py
+++ b/research/object_detection/inputs.py
@@ -668,7 +668,8 @@ def _get_labels_dict(input_dict):
      fields.InputDataFields.groundtruth_dp_surface_coords,
      fields.InputDataFields.groundtruth_track_ids,
      fields.InputDataFields.groundtruth_verified_neg_classes,
-      fields.InputDataFields.groundtruth_not_exhaustive_classes
+      fields.InputDataFields.groundtruth_not_exhaustive_classes,
+      fields.InputDataFields.groundtruth_image_classes,
  ]
  for key in optional_label_keys:

--- a/research/object_detection/meta_architectures/deepmac_meta_arch.py
+++ b/research/object_detection/meta_architectures/deepmac_meta_arch.py
@@ -12,12 +12,12 @@ import tensorflow as tf
 from object_detection.builders import losses_builder
 from object_detection.core import box_list
 from object_detection.core import box_list_ops
-from object_detection.core import losses
 from object_detection.core import preprocessor
 from object_detection.core import standard_fields as fields
 from object_detection.meta_architectures import center_net_meta_arch
 from object_detection.models.keras_models import hourglass_network
 from object_detection.models.keras_models import resnet_v1
+from object_detection.protos import center_net_pb2
 from object_detection.protos import losses_pb2
 from object_detection.protos import preprocessor_pb2
 from object_detection.utils import shape_utils
@@ -38,46 +38,26 @@ NEIGHBORS_2D = [[-1, -1], [-1, 0], [-1, 1],
                [0, -1], [0, 1],
                [1, -1], [1, 0], [1, 1]]
 WEAK_LOSSES = [DEEP_MASK_BOX_CONSISTENCY, DEEP_MASK_COLOR_CONSISTENCY]
+MASK_LOSSES = WEAK_LOSSES + [DEEP_MASK_ESTIMATION]
-class DeepMACParams(
+DeepMACParams = collections.namedtuple('DeepMACParams', [
-    collections.namedtuple('DeepMACParams', [
        'classification_loss', 'dim', 'task_loss_weight', 'pixel_embedding_dim',
        'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
        'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
        'predict_full_resolution_masks', 'postprocess_crop_size',
        'max_roi_jitter_ratio', 'roi_jitter_mode',
        'box_consistency_loss_weight', 'color_consistency_threshold',
-        'color_consistency_dilation', 'color_consistency_loss_weight'
+        'color_consistency_dilation', 'color_consistency_loss_weight',
-    ])):
+        'box_consistency_loss_normalize', 'box_consistency_tightness',
-  """Class holding the DeepMAC network configutration."""
+        'color_consistency_warmup_steps', 'color_consistency_warmup_start'
+    ])
-  __slots__ = ()
-  def __new__(cls, classification_loss, dim, task_loss_weight,
+def _get_loss_weight(loss_name, config):
-              pixel_embedding_dim, allowed_masked_classes_ids, mask_size,
+  if loss_name == DEEP_MASK_ESTIMATION:
-              mask_num_subsamples, use_xy, network_type, use_instance_embedding,
+    return config.task_loss_weight
-              num_init_channels, predict_full_resolution_masks,
+  elif loss_name == DEEP_MASK_COLOR_CONSISTENCY:
-              postprocess_crop_size, max_roi_jitter_ratio,
-              roi_jitter_mode, box_consistency_loss_weight,
-              color_consistency_threshold, color_consistency_dilation,
-              color_consistency_loss_weight):
-    return super(DeepMACParams,
-                 cls).__new__(cls, classification_loss, dim,
-                              task_loss_weight, pixel_embedding_dim,
-                              allowed_masked_classes_ids, mask_size,
-                              mask_num_subsamples, use_xy, network_type,
-                              use_instance_embedding, num_init_channels,
-                              predict_full_resolution_masks,
-                              postprocess_crop_size, max_roi_jitter_ratio,
-                              roi_jitter_mode, box_consistency_loss_weight,
-                              color_consistency_threshold,
-                              color_consistency_dilation,
-                              color_consistency_loss_weight)
-def _get_weak_loss_weight(loss_name, config):
-  if loss_name == DEEP_MASK_COLOR_CONSISTENCY:
    return config.color_consistency_loss_weight
  elif loss_name == DEEP_MASK_BOX_CONSISTENCY:
    return config.box_consistency_loss_weight
@@ -151,7 +131,7 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None):
      raise ValueError('Mask size must be set.')
    return FullyConnectedMaskHead(num_init_channels, mask_size)
-  elif name == 'embedding_projection':
+  elif _is_mask_head_param_free(name):
    return tf.keras.layers.Lambda(lambda x: x)
  elif name.startswith('resnet'):
@@ -395,6 +375,94 @@ def dilated_cross_same_mask_label(instance_masks, dilation=2):
  return tf.transpose(same_mask_prob, (0, 3, 1, 2))
+def _per_pixel_single_conv(input_tensor, params, channels):
+  """Convolve the given input with the given params.
+  Args:
+    input_tensor: A [num_instances, height, width, channels] shaped
+      float tensor.
+    params: A [num_instances, num_params] shaped float tensor.
+    channels: int, number of channels in the convolution.
+  Returns:
+    output: A float tensor of shape [num_instances, height, width, channels]
+  """
+  input_channels = input_tensor.get_shape().as_list()[3]
+  weights = params[:, :(input_channels * channels)]
+  biases = params[:, (input_channels * channels):]
+  num_instances = tf.shape(params)[0]
+  weights = tf.reshape(weights, (num_instances, input_channels, channels))
+  output = (input_tensor[:, :, tf.newaxis, :] @
+            weights[:, tf.newaxis, tf.newaxis, :, :])
+  output = output[:, :, 0, :, :]
+  output = output + biases[:, tf.newaxis, tf.newaxis, :]
+  return output
+def per_pixel_conditional_conv(input_tensor, parameters, channels, depth):
+  """Use parameters to perform per-pixel convolutions with the given depth [1].
+  [1]: https://arxiv.org/abs/2003.05664
+  Args:
+    input_tensor: float tensor of shape [num_instances, height,
+      width, input_channels]
+    parameters: A [num_instances, num_params] float tensor. If num_params
+      is incompatible with the given channels and depth, a ValueError will
+      be raised.
+    channels: int, the number of channels in the convolution.
+    depth: int, the number of layers of convolutions to perform.
+  Returns:
+    output: A [num_instances, height, width, 1] tensor with the conditional
+      conv applied according to each instance's parameters.
+  """
+  input_channels = input_tensor.get_shape().as_list()[3]
+  num_params = parameters.get_shape().as_list()[1]
+  input_convs = 1 if depth > 1 else 0
+  intermediate_convs = depth - 2 if depth >= 2 else 0
+  expected_weights = ((input_channels * channels * input_convs) +
+                      (channels * channels * intermediate_convs) +
+                      channels)  # final conv
+  expected_biases = (channels * (depth - 1)) + 1
+  if depth == 1:
+    if input_channels != channels:
+      raise ValueError(
+          'When depth=1, input_channels({}) should be equal to'.format(
+              input_channels) + ' channels({})'.format(channels))
+  if num_params != (expected_weights + expected_biases):
+    raise ValueError('Expected {} parameters at depth {}, but got {}'.format(
+        expected_weights + expected_biases, depth, num_params))
+  start = 0
+  output = input_tensor
+  for i in range(depth):
+    is_last_layer = i == (depth - 1)
+    if is_last_layer:
+      channels = 1
+    num_params_single_conv = channels * input_channels + channels
+    params = parameters[:, start:start + num_params_single_conv]
+    start += num_params_single_conv
+    output = _per_pixel_single_conv(output, params, channels)
+    if not is_last_layer:
+      output = tf.nn.relu(output)
+    input_channels = channels
+  return output
 class ResNetMaskNetwork(tf.keras.layers.Layer):
  """A small wrapper around ResNet blocks to predict masks."""
@@ -560,6 +628,16 @@ class DenseResNet(tf.keras.layers.Layer):
    return self.out_conv(self.resnet(net))
+def _is_mask_head_param_free(name):
+  # Mask heads which don't have parameters of their own and instead rely
+  # on the instance embedding.
+  if name == 'embedding_projection' or name.startswith('cond_inst'):
+    return True
+  return False
 class MaskHeadNetwork(tf.keras.layers.Layer):
  """Mask head class for DeepMAC."""
@@ -586,13 +664,14 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
    self._use_instance_embedding = use_instance_embedding
    self._network_type = network_type
+    self._num_init_channels = num_init_channels
    if (self._use_instance_embedding and
-        (self._network_type == 'embedding_projection')):
+        (_is_mask_head_param_free(network_type))):
      raise ValueError(('Cannot feed instance embedding to mask head when '
-                        'computing embedding projection.'))
+                        'mask-head has no parameters.'))
-    if network_type == 'embedding_projection':
+    if _is_mask_head_param_free(network_type):
      self.project_out = tf.keras.layers.Lambda(lambda x: x)
    else:
      self.project_out = tf.keras.layers.Conv2D(
@@ -632,6 +711,11 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
      instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
      out = embedding_projection(instance_embedding, out)
+    elif self._network_type.startswith('cond_inst'):
+      depth = int(self._network_type.lstrip('cond_inst'))
+      out = per_pixel_conditional_conv(out, instance_embedding,
+                                       self._num_init_channels, depth)
    if out.shape[-1] > 1:
      out = self.project_out(out)
@@ -651,6 +735,9 @@ def deepmac_proto_to_params(deepmac_config):
  jitter_mode = preprocessor_pb2.RandomJitterBoxes.JitterMode.Name(
      deepmac_config.jitter_mode).lower()
+  box_consistency_loss_normalize = center_net_pb2.LossNormalize.Name(
+      deepmac_config.box_consistency_loss_normalize).lower()
  return DeepMACParams(
      dim=deepmac_config.dim,
      classification_loss=classification_loss,
@@ -671,7 +758,14 @@ def deepmac_proto_to_params(deepmac_config):
      box_consistency_loss_weight=deepmac_config.box_consistency_loss_weight,
      color_consistency_threshold=deepmac_config.color_consistency_threshold,
      color_consistency_dilation=deepmac_config.color_consistency_dilation,
-      color_consistency_loss_weight=deepmac_config.color_consistency_loss_weight
+      color_consistency_loss_weight=
+      deepmac_config.color_consistency_loss_weight,
+      box_consistency_loss_normalize=box_consistency_loss_normalize,
+      box_consistency_tightness=deepmac_config.box_consistency_tightness,
+      color_consistency_warmup_steps=
+      deepmac_config.color_consistency_warmup_steps,
+      color_consistency_warmup_start=
+      deepmac_config.color_consistency_warmup_start
  )
@@ -868,6 +962,60 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
    return resize_instance_masks(logits, (height, width))
+  def _aggregate_classification_loss(self, loss, gt, pred, method):
+    """Aggregates loss at a per-instance level.
+    When this function is used with mask-heads, num_classes is usually 1.
+    Args:
+      loss: A [num_instances, num_pixels, num_classes] or
+        [num_instances, num_classes] tensor. If the tensor is of rank 2, i.e.,
+        of the form [num_instances, num_classes], we will assume that the
+        number of pixels have already been normalized.
+      gt: A [num_instances, num_pixels, num_classes] float tensor of
+        groundtruths.
+      pred: A [num_instances, num_pixels, num_classes] float tensor of
+        predictions.
+      method: A string naming the aggregation method. One of:
+        'normalize_auto': When `loss` is rank 2, aggregates by sum.
+          Otherwise, aggregates by mean.
+        'normalize_groundtruth_count': Aggregates the loss by computing sum
+          and dividing by the number of positive (1) groundtruth pixels.
+        'normalize_balanced': Normalizes each pixel by the number of positive
+          or negative pixels depending on the groundtruth.
+    Returns:
+      per_instance_loss: A [num_instances] float tensor.
+    """
+    rank = len(loss.get_shape().as_list())
+    if rank == 2:
+      axes = [1]
+    else:
+      axes = [1, 2]
+    if method == 'normalize_auto':
+      normalization = 1.0
+      if rank == 2:
+        return tf.reduce_sum(loss, axis=axes)
+      else:
+        return tf.reduce_mean(loss, axis=axes)
+    elif method == 'normalize_groundtruth_count':
+      normalization = tf.reduce_sum(gt, axis=axes)
+      return tf.reduce_sum(loss, axis=axes) / normalization
+    elif method == 'normalize_balanced':
+      if rank != 3:
+        raise ValueError('Cannot apply normalized_balanced aggregation '
+                         f'to loss of rank {rank}')
+      normalization = (
+          (gt * tf.reduce_sum(gt, keepdims=True, axis=axes)) +
+          (1 - gt) * tf.reduce_sum(1 - gt, keepdims=True, axis=axes))
+      return tf.reduce_sum(loss / normalization, axis=axes)
+    else:
+      raise ValueError('Unknown loss aggregation - {}'.format(method))
  def _compute_per_instance_mask_prediction_loss(
      self, boxes, mask_logits, mask_gt):
    """Compute the per-instance mask loss.
@@ -891,14 +1039,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
        target_tensor=mask_gt,
        weights=tf.ones_like(mask_logits))
-    # TODO(vighneshb) Make this configurable via config.
+    return self._aggregate_classification_loss(
-    # Skip normalization for dice loss because the denominator term already
+        loss, mask_gt, mask_logits, 'normalize_auto')
-    # does normalization.
-    if isinstance(self._deepmac_params.classification_loss,
-                  losses.WeightedDiceClassificationLoss):
-      return tf.reduce_sum(loss, axis=1)
-    else:
-      return tf.reduce_mean(loss, axis=[1, 2])
  def _compute_per_instance_box_consistency_loss(
      self, boxes_gt, boxes_for_crop, mask_logits):
@@ -930,23 +1072,30 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
    loss = 0.0
    for axis in [1, 2]:
-      pred_max = tf.reduce_max(pred_crop, axis=axis)[:, :, tf.newaxis]
+      if self._deepmac_params.box_consistency_tightness:
+        pred_max_raw = tf.reduce_max(pred_crop, axis=axis)
+        pred_max_within_box = tf.reduce_max(pred_crop * gt_crop, axis=axis)
+        box_1d = tf.reduce_max(gt_crop, axis=axis)
+        pred_max = ((box_1d * pred_max_within_box) +
+                    ((1 - box_1d) * pred_max_raw))
+      else:
+        pred_max = tf.reduce_max(pred_crop, axis=axis)
+      pred_max = pred_max[:, :, tf.newaxis]
      gt_max = tf.reduce_max(gt_crop, axis=axis)[:, :, tf.newaxis]
-      axis_loss = self._deepmac_params.classification_loss(
+      raw_loss = self._deepmac_params.classification_loss(
          prediction_tensor=pred_max,
          target_tensor=gt_max,
          weights=tf.ones_like(pred_max))
-      loss += axis_loss
+      loss += self._aggregate_classification_loss(
-    # Skip normalization for dice loss because the denominator term already
+          raw_loss, gt_max, pred_max,
-    # does normalization.
+          self._deepmac_params.box_consistency_loss_normalize)
-    # TODO(vighneshb) Make this configurable via config.
-    if isinstance(self._deepmac_params.classification_loss,
+    return loss
-                  losses.WeightedDiceClassificationLoss):
-      return tf.reduce_sum(loss, axis=1)
-    else:
-      return tf.reduce_mean(loss, axis=[1, 2])
  def _compute_per_instance_color_consistency_loss(
      self, boxes, preprocessed_image, mask_logits):
@@ -995,6 +1144,17 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
    num_box_pixels = tf.maximum(1.0, tf.reduce_sum(box_mask, axis=[1, 2]))
    loss = loss / num_box_pixels
+    if ((self._deepmac_params.color_consistency_warmup_steps > 0) and
+        self._is_training):
+      training_step = tf.cast(self.training_step, tf.float32)
+      warmup_steps = tf.cast(
+          self._deepmac_params.color_consistency_warmup_steps, tf.float32)
+      start_step = tf.cast(
+          self._deepmac_params.color_consistency_warmup_start, tf.float32)
+      warmup_weight = (training_step - start_step) / warmup_steps
+      warmup_weight = tf.clip_by_value(warmup_weight, 0.0, 1.0)
+      loss *= warmup_weight
    return loss
  def _compute_per_instance_deepmac_losses(
@@ -1084,11 +1244,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
    allowed_masked_classes_ids = (
        self._deepmac_params.allowed_masked_classes_ids)
-    loss_dict = {
+    loss_dict = {}
-        DEEP_MASK_ESTIMATION: 0.0,
+    for loss_name in MASK_LOSSES:
-    }
-    for loss_name in WEAK_LOSSES:
      loss_dict[loss_name] = 0.0
    prediction_shape = tf.shape(prediction_dict[INSTANCE_EMBEDDING][0])
@@ -1148,13 +1305,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
      mask_loss_dict = self._compute_instance_masks_loss(
          prediction_dict=prediction_dict)
-      losses_dict[LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION] = (
+      for loss_name in MASK_LOSSES:
-          self._deepmac_params.task_loss_weight * mask_loss_dict[
+        loss_weight = _get_loss_weight(loss_name, self._deepmac_params)
-              DEEP_MASK_ESTIMATION]
-      )
-      for loss_name in WEAK_LOSSES:
-        loss_weight = _get_weak_loss_weight(loss_name, self._deepmac_params)
        if loss_weight > 0.0:
          losses_dict[LOSS_KEY_PREFIX + '/' + loss_name] = (
              loss_weight * mask_loss_dict[loss_name])

--- a/research/object_detection/meta_architectures/deepmac_meta_arch_test.py
+++ b/research/object_detection/meta_architectures/deepmac_meta_arch_test.py
 """Tests for google3.third_party.tensorflow_models.object_detection.meta_architectures.deepmac_meta_arch."""
 import functools
+import random
 import unittest
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf
+from google.protobuf import text_format
 from object_detection.core import losses
 from object_detection.core import preprocessor
 from object_detection.meta_architectures import center_net_meta_arch
 from object_detection.meta_architectures import deepmac_meta_arch
+from object_detection.protos import center_net_pb2
 from object_detection.utils import tf_version
+# Text-format DeepMACMaskEstimation message used by test_proto_parse, which
+# parses it, converts it with deepmac_proto_to_params and checks `dim` (153)
+# and `box_consistency_loss_normalize` ('normalize_auto') on the result.
+DEEPMAC_PROTO_TEXT = """
+  dim: 153
+  task_loss_weight: 5.0
+  pixel_embedding_dim: 8
+  use_xy: false
+  use_instance_embedding: false
+  network_type: "cond_inst3"
+  num_init_channels: 8
+  classification_loss {
+    weighted_dice_classification_loss {
+      squared_normalization: false
+      is_prediction_probability: false
+    }
+  }
+  jitter_mode: EXPAND_SYMMETRIC_XY
+  max_roi_jitter_ratio: 0.0
+  predict_full_resolution_masks: true
+  allowed_masked_classes_ids: [99]
+  box_consistency_loss_weight: 1.0
+  color_consistency_loss_weight: 1.0
+  color_consistency_threshold: 0.1
+  box_consistency_tightness: false
+  box_consistency_loss_normalize: NORMALIZE_AUTO
+  color_consistency_warmup_steps: 20
+  color_consistency_warmup_start: 10
+"""
 class DummyFeatureExtractor(center_net_meta_arch.CenterNetFeatureExtractor):
  def __init__(self,
@@ -60,14 +93,37 @@ class MockMaskNet(tf.keras.layers.Layer):
    return tf.zeros_like(pixel_embedding[:, :, :, 0]) + 0.9
-def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
+def build_meta_arch(**override_params):
-                    use_instance_embedding=True, mask_num_subsamples=-1,
-                    network_type='hourglass10', use_xy=True,
-                    pixel_embedding_dim=2,
-                    dice_loss_prediction_probability=False,
-                    color_consistency_threshold=0.5):
  """Builds the DeepMAC meta architecture."""
+  params = dict(
+      predict_full_resolution_masks=False,
+      use_instance_embedding=True,
+      mask_num_subsamples=-1,
+      network_type='hourglass10',
+      use_xy=True,
+      pixel_embedding_dim=2,
+      dice_loss_prediction_probability=False,
+      color_consistency_threshold=0.5,
+      use_dice_loss=False,
+      box_consistency_loss_normalize='normalize_auto',
+      box_consistency_tightness=False,
+      task_loss_weight=1.0,
+      color_consistency_loss_weight=1.0,
+      box_consistency_loss_weight=1.0,
+      num_init_channels=8,
+      dim=8,
+      allowed_masked_classes_ids=[],
+      mask_size=16,
+      postprocess_crop_size=128,
+      max_roi_jitter_ratio=0.0,
+      roi_jitter_mode='random',
+      color_consistency_dilation=2,
+      color_consistency_warmup_steps=0,
+      color_consistency_warmup_start=0)
+  params.update(override_params)
  feature_extractor = DummyFeatureExtractor(
      channel_means=(1.0, 2.0, 3.0),
      channel_stds=(10., 20., 30.),
@@ -87,33 +143,18 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
      max_box_predictions=5,
      use_labeled_classes=False)
+  use_dice_loss = params.pop('use_dice_loss')
+  dice_loss_prediction_prob = params.pop('dice_loss_prediction_probability')
  if use_dice_loss:
    classification_loss = losses.WeightedDiceClassificationLoss(
        squared_normalization=False,
-        is_prediction_probability=dice_loss_prediction_probability)
+        is_prediction_probability=dice_loss_prediction_prob)
  else:
    classification_loss = losses.WeightedSigmoidClassificationLoss()
  deepmac_params = deepmac_meta_arch.DeepMACParams(
      classification_loss=classification_loss,
-      dim=8,
+      **params
-      task_loss_weight=1.0,
-      pixel_embedding_dim=pixel_embedding_dim,
-      allowed_masked_classes_ids=[],
-      mask_size=16,
-      mask_num_subsamples=mask_num_subsamples,
-      use_xy=use_xy,
-      network_type=network_type,
-      use_instance_embedding=use_instance_embedding,
-      num_init_channels=8,
-      predict_full_resolution_masks=predict_full_resolution_masks,
-      postprocess_crop_size=128,
-      max_roi_jitter_ratio=0.0,
-      roi_jitter_mode='random',
-      box_consistency_loss_weight=1.0,
-      color_consistency_threshold=color_consistency_threshold,
-      color_consistency_dilation=2,
-      color_consistency_loss_weight=1.0
  )
  object_detection_params = center_net_meta_arch.ObjectDetectionParams(
@@ -136,6 +177,15 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
 @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
 class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
+  def test_proto_parse(self):
+    """Parses DEEPMAC_PROTO_TEXT and checks the converted DeepMACParams."""
+    # text_format.Parse fills the message in place and returns that message.
+    mask_estimation_proto = text_format.Parse(
+        DEEPMAC_PROTO_TEXT, center_net_pb2.CenterNet().DeepMACMaskEstimation())
+    deepmac_params = deepmac_meta_arch.deepmac_proto_to_params(
+        mask_estimation_proto)
+    self.assertIsInstance(deepmac_params, deepmac_meta_arch.DeepMACParams)
+    self.assertEqual(deepmac_params.dim, 153)
+    self.assertEqual(
+        deepmac_params.box_consistency_loss_normalize, 'normalize_auto')
  def test_subsample_trivial(self):
    """Test subsampling masks."""
@@ -280,18 +330,126 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
    self.assertAllClose(np.ones((8, 5, 5)), output[:, 1, :, :])
    self.assertAllClose([1, 0, 0, 0, 0, 0, 0, 1], output[:, 0, 2, 2])
+  def test_per_pixel_single_conv_multiple_instance(self):
+    """Checks the output shape of _per_pixel_single_conv for 5 instances."""
+    features = tf.zeros((5, 32, 32, 7))
+    # NOTE(review): 7*8 + 8 presumably is 7x8 weights plus 8 biases per
+    # instance; confirm against _per_pixel_single_conv.
+    conv_params = tf.zeros((5, 7*8 + 8))
+    result = deepmac_meta_arch._per_pixel_single_conv(
+        features, conv_params, 8)
+    self.assertEqual(result.shape, (5, 32, 32, 8))
+  def test_per_pixel_conditional_conv_error(self):
+    """Expects ValueError for a 2-wide embedding with channels 8, depth 3."""
+    features = tf.zeros((10, 32, 32, 8))
+    bad_embedding = tf.zeros((10, 2))
+    with self.assertRaises(ValueError):
+      deepmac_meta_arch.per_pixel_conditional_conv(
+          features, bad_embedding, 8, 3)
+  def test_per_pixel_conditional_conv_error_tf_func(self):
+    """Expects ValueError when the conv is called through tf.function."""
+    traced_conv = tf.function(deepmac_meta_arch.per_pixel_conditional_conv)
+    features = tf.zeros((10, 32, 32, 8))
+    bad_embedding = tf.zeros((10, 2))
+    # Only the call itself sits inside the assertRaises scope.
+    with self.assertRaises(ValueError):
+      traced_conv(features, bad_embedding, 8, 3)
+  def test_per_pixel_conditional_conv_depth1_error(self):
+    """Expects ValueError for depth 1 with 7 input channels and channels 99."""
+    # NOTE(review): presumably rejected because channels (99) differs from the
+    # 7 input channels at depth 1; confirm against per_pixel_conditional_conv.
+    features = tf.zeros((10, 32, 32, 7))
+    embedding = tf.zeros((10, 8))
+    with self.assertRaises(ValueError):
+      deepmac_meta_arch.per_pixel_conditional_conv(features, embedding, 99, 1)
+  # NOTE(review): each instance_embedding_dim looks like the total number of
+  # weights and biases of the conv stack, e.g. (10*8 + 8) + (8*8 + 8) + (8 + 1)
+  # = 169 for 10 input channels, 8 channels, depth 3. Confirm against
+  # per_pixel_conditional_conv.
+  @parameterized.parameters([
+      dict(num_input_channels=7, instance_embedding_dim=8,
+           channels=7, depth=1),
+      dict(num_input_channels=7, instance_embedding_dim=82,
+           channels=9, depth=2),
+      # From https://arxiv.org/abs/2003.05664
+      dict(num_input_channels=10, instance_embedding_dim=169,
+           channels=8, depth=3),
+      dict(num_input_channels=8, instance_embedding_dim=433,
+           channels=16, depth=3),
+      dict(num_input_channels=8, instance_embedding_dim=1377,
+           channels=32, depth=3),
+      dict(num_input_channels=8, instance_embedding_dim=4801,
+           channels=64, depth=3),
+  ])
+  def test_per_pixel_conditional_conv_shape(
+      self, num_input_channels, instance_embedding_dim, channels, depth):
+    """Checks that the conditional conv yields one output channel per pixel."""
+    pixel_features = tf.zeros((10, 32, 32, num_input_channels))
+    embedding = tf.zeros((10, instance_embedding_dim))
+    result = deepmac_meta_arch.per_pixel_conditional_conv(
+        pixel_features, embedding, channels, depth)
+    self.assertEqual(result.shape, (10, 32, 32, 1))
+  def test_per_pixel_conditional_conv_value_depth1(self):
+    """Checks the numeric output of a depth-1 conditional conv on one pixel."""
+    pixel = tf.reshape(tf.constant(np.array([1, 2, 3])), (1, 1, 1, 3))
+    embedding = tf.reshape(
+        tf.constant(np.array([1, 10, 100, 1000])), (1, 4))
+    result = deepmac_meta_arch.per_pixel_conditional_conv(
+        pixel, embedding, channels=3, depth=1)
+    # NOTE(review): presumably 1*1 + 2*10 + 3*100 + 1000 (bias) = 1321; confirm
+    # the weight/bias layout against per_pixel_conditional_conv.
+    self.assertAllClose(np.reshape(np.array([1321]), (1, 1, 1, 1)), result)
+  def test_per_pixel_conditional_conv_value_depth2_single(self):
+    """Checks the numeric output of a depth-2 conv on a 1-channel pixel."""
+    pixel = tf.reshape(tf.constant(np.array([2])), (1, 1, 1, 1))
+    embedding = tf.reshape(tf.constant(np.array([-2, 3, 100, 5])), (1, 4))
+    result = deepmac_meta_arch.per_pixel_conditional_conv(
+        pixel, embedding, channels=1, depth=2)
+    # NOTE(review): presumably max(2 * -2 + 3, 0) * 100 + 5 = 5, i.e. a ReLU
+    # between the two layers; confirm against per_pixel_conditional_conv.
+    self.assertAllClose(np.reshape(np.array([5]), (1, 1, 1, 1)), result)
+  def test_per_pixel_conditional_conv_value_depth2_identity(self):
+    """Checks the numeric output of a depth-2 conv on a 2-channel pixel."""
+    pixel = tf.reshape(tf.constant(np.array([1, 2])), (1, 1, 1, 2))
+    embedding = tf.reshape(
+        tf.constant(np.array([1, 0, 0, 1, 1, -3, 5, 100, -9])), (1, 9))
+    result = deepmac_meta_arch.per_pixel_conditional_conv(
+        pixel, embedding, channels=2, depth=2)
+    # NOTE(review): presumably an identity 2x2 layer with bias [1, -3] gives
+    # [2, -1], a ReLU gives [2, 0], then 2*5 + 0*100 - 9 = 1; confirm the
+    # parameter layout against per_pixel_conditional_conv.
+    self.assertAllClose(np.reshape(np.array([1]), (1, 1, 1, 1)), result)
 @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
 class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
-  @parameterized.parameters(
-      ['hourglass10', 'hourglass20', 'resnet4'])
-  def test_mask_network(self, head_type):
-    net = deepmac_meta_arch.MaskHeadNetwork(head_type, 8)
-    out = net(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
-    self.assertEqual(out.shape, (2, 32, 32))
  def test_mask_network_params_resnet4(self):
    net = deepmac_meta_arch.MaskHeadNetwork('resnet4', num_init_channels=8)
    _ = net(tf.zeros((2, 16)), tf.zeros((2, 32, 32, 16)), training=True)
@@ -301,39 +459,93 @@ class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
    self.assertEqual(trainable_params.numpy(), 8665)
-  def test_mask_network_resnet_tf_function(self):
+  def test_mask_network_embedding_projection_small(self):
-    net = deepmac_meta_arch.MaskHeadNetwork('resnet8')
-    call_func = tf.function(net.__call__)
-    out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
-    self.assertEqual(out.shape, (2, 32, 32))
-  def test_mask_network_embedding_projection_zero(self):
    net = deepmac_meta_arch.MaskHeadNetwork(
-        'embedding_projection', num_init_channels=8,
+        'embedding_projection', num_init_channels=-1,
        use_instance_embedding=False)
    call_func = tf.function(net.__call__)
-    out = call_func(tf.zeros((2, 7)), tf.zeros((2, 32, 32, 7)), training=True)
+    out = call_func(1e6 + tf.zeros((2, 7)),
+                    tf.zeros((2, 32, 32, 7)), training=True)
    self.assertEqual(out.shape, (2, 32, 32))
    self.assertAllGreater(out.numpy(), -np.inf)
    self.assertAllLess(out.numpy(), np.inf)
-  def test_mask_network_embedding_projection_small(self):
+  @parameterized.parameters([
+      {
+          'mask_net': 'resnet4',
+          'mask_net_channels': 8,
+          'instance_embedding_dim': 4,
+          'input_channels': 16,
+          'use_instance_embedding': False
+      },
+      {
+          'mask_net': 'hourglass10',
+          'mask_net_channels': 8,
+          'instance_embedding_dim': 4,
+          'input_channels': 16,
+          'use_instance_embedding': False
+      },
+      {
+          'mask_net': 'hourglass20',
+          'mask_net_channels': 8,
+          'instance_embedding_dim': 4,
+          'input_channels': 16,
+          'use_instance_embedding': False
+      },
+      {
+          'mask_net': 'cond_inst3',
+          'mask_net_channels': 8,
+          'instance_embedding_dim': 153,
+          'input_channels': 8,
+          'use_instance_embedding': False
+      },
+      {
+          'mask_net': 'cond_inst3',
+          'mask_net_channels': 8,
+          'instance_embedding_dim': 169,
+          'input_channels': 10,
+          'use_instance_embedding': False
+      },
+      {
+          'mask_net': 'cond_inst1',
+          'mask_net_channels': 8,
+          'instance_embedding_dim': 9,
+          'input_channels': 8,
+          'use_instance_embedding': False
+      },
+      {
+          'mask_net': 'cond_inst2',
+          'mask_net_channels': 8,
+          'instance_embedding_dim': 81,
+          'input_channels': 8,
+          'use_instance_embedding': False
+      },
+  ])
+  def test_mask_network(self, mask_net, mask_net_channels,
+                        instance_embedding_dim, input_channels,
+                        use_instance_embedding):
    net = deepmac_meta_arch.MaskHeadNetwork(
-        'embedding_projection', num_init_channels=-1,
+        mask_net, num_init_channels=mask_net_channels,
-        use_instance_embedding=False)
+        use_instance_embedding=use_instance_embedding)
    call_func = tf.function(net.__call__)
-    out = call_func(1e6 + tf.zeros((2, 7)),
+    out = call_func(tf.zeros((2, instance_embedding_dim)),
-                    tf.zeros((2, 32, 32, 7)), training=True)
+                    tf.zeros((2, 32, 32, input_channels)), training=True)
    self.assertEqual(out.shape, (2, 32, 32))
    self.assertAllGreater(out.numpy(), -np.inf)
    self.assertAllLess(out.numpy(), np.inf)
+    out = call_func(tf.zeros((2, instance_embedding_dim)),
+                    tf.zeros((2, 32, 32, input_channels)), training=True)
+    self.assertEqual(out.shape, (2, 32, 32))
+    out = call_func(tf.zeros((0, instance_embedding_dim)),
+                    tf.zeros((0, 32, 32, input_channels)), training=True)
+    self.assertEqual(out.shape, (0, 32, 32))
 @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
 class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
@@ -619,8 +831,85 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
    xloss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.constant([1.0] * 16),
        logits=[1.0] * 12 + [0.0] * 4)
+    yloss_mean = tf.reduce_mean(yloss)
+    xloss_mean = tf.reduce_mean(xloss)
+    self.assertAllClose(loss, [yloss_mean + xloss_mean])
+  def test_box_consistency_loss_with_tightness(self):
+    """Expects zero loss when box_consistency_tightness is enabled."""
+    boxes_gt = tf.constant([[0., 0., 0.49, 0.49]])
+    boxes_jittered = None
+    # Logits are -1e10 everywhere except the top-left 4x4 corner (+1e10).
+    mask_prediction = np.zeros((1, 8, 8)).astype(np.float32) - 1e10
+    mask_prediction[0, :4, :4] = 1e10
-    self.assertAllClose(loss, [tf.reduce_mean(yloss + xloss).numpy()])
+    model = build_meta_arch(box_consistency_tightness=True,
+                            predict_full_resolution_masks=True)
+    loss = model._compute_per_instance_box_consistency_loss(
+        boxes_gt, boxes_jittered, tf.constant(mask_prediction))
+    # NOTE(review): presumably the saturated 4x4 region matches the box
+    # [0, 0, 0.49, 0.49] on the 8x8 grid exactly, hence zero loss; confirm.
+    self.assertAllClose(loss, [0.0])
+  def test_box_consistency_loss_gt_count(self):
+    """Checks box consistency loss with 'normalize_groundtruth_count'.
+
+    The expected value for each instance is the summed sigmoid cross entropy
+    along one axis, divided by 32.0 (instance 0) or 16.0 (instance 1), and
+    counted twice because the x and y terms are identical here.
+    """
+    gt_boxes = tf.constant([
+        [0., 0., 1.0, 1.0],
+        [0., 0., 0.49, 0.49]])
+    jittered_boxes = None
+    mask_logits = np.zeros((2, 32, 32)).astype(np.float32)
+    mask_logits[0, :16, :16] = 1.0
+    mask_logits[1, :8, :8] = 1.0
+    model = build_meta_arch(
+        box_consistency_loss_normalize='normalize_groundtruth_count',
+        predict_full_resolution_masks=True)
+    compute_loss = tf.function(
+        model._compute_per_instance_box_consistency_loss)
+    loss = compute_loss(gt_boxes, jittered_boxes, tf.constant(mask_logits))
+    # Instance 0: 32 positive labels per axis, 16 of them predicted with logit 1.
+    axis_loss_0 = tf.reduce_sum(
+        tf.nn.sigmoid_cross_entropy_with_logits(
+            labels=tf.constant([1.0] * 32),
+            logits=[1.0] * 16 + [0.0] * 16) / 32.0)
+    self.assertAllClose(loss[0], axis_loss_0 + axis_loss_0)
+    # Instance 1: 16 positive labels per axis, 8 of them predicted with logit 1.
+    axis_loss_1 = tf.reduce_sum(
+        tf.nn.sigmoid_cross_entropy_with_logits(
+            labels=tf.constant([1.0] * 16 + [0.0] * 16),
+            logits=[1.0] * 8 + [0.0] * 24) / 16.0)
+    self.assertAllClose(loss[1], axis_loss_1 + axis_loss_1)
+  def test_box_consistency_loss_balanced(self):
+    """Checks box consistency loss with 'normalize_balanced'.
+
+    All 32x32 logits are 1.0. The expected per-axis term is the summed sigmoid
+    cross entropy over 16 negative and 16 positive labels divided by 16.0,
+    counted once for each of the two axes.
+    """
+    gt_boxes = tf.constant([
+        [0., 0., 0.49, 0.49]])
+    jittered_boxes = None
+    mask_logits = np.ones((1, 32, 32)).astype(np.float32)
+    model = build_meta_arch(
+        box_consistency_loss_normalize='normalize_balanced',
+        predict_full_resolution_masks=True)
+    compute_loss = tf.function(
+        model._compute_per_instance_box_consistency_loss)
+    loss = compute_loss(gt_boxes, jittered_boxes, tf.constant(mask_logits))
+    per_pixel_loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        labels=[0.] * 16 + [1.0] * 16,
+        logits=[1.0] * 32)
+    per_axis_loss = tf.reduce_sum(per_pixel_loss) / 16.0
+    self.assertAllClose(loss[0], per_axis_loss + per_axis_loss)
  def test_box_consistency_dice_loss(self):
@@ -701,34 +990,145 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
    loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
    self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
-    for weak_loss in deepmac_meta_arch.WEAK_LOSSES:
+    for weak_loss in deepmac_meta_arch.MASK_LOSSES:
      if weak_loss == deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY:
        continue
      self.assertGreater(loss['Loss/' + weak_loss], 0.0,
                         '{} was <= 0'.format(weak_loss))
-  def test_loss_keys_full_res(self):
+  def test_loss_weight_response(self):
-    model = build_meta_arch(use_dice_loss=True,
+    model = build_meta_arch(
-                            predict_full_resolution_masks=True)
+        use_dice_loss=True,
+        predict_full_resolution_masks=True,
+        network_type='cond_inst1',
+        dim=9,
+        pixel_embedding_dim=8,
+        use_instance_embedding=False,
+        use_xy=False)
+    num_stages = 1
    prediction = {
        'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
-        'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 17))] * 2,
+        'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
-        'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 19))] * 2,
+        'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
-        'object_center': [tf.random.normal((1, 8, 8, 6))] * 2,
+        'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
-        'box/offset': [tf.random.normal((1, 8, 8, 2))] * 2,
+        'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
-        'box/scale': [tf.random.normal((1, 8, 8, 2))] * 2
+        'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
    }
+    boxes = [tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)]
+    classes = [tf.one_hot([1, 0, 1, 1, 1], depth=6)]
+    weights = [tf.ones(5)]
+    masks = [tf.ones((5, 32, 32))]
    model.provide_groundtruth(
-        groundtruth_boxes_list=[tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)],
+        groundtruth_boxes_list=boxes,
-        groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)],
+        groundtruth_classes_list=classes,
-        groundtruth_weights_list=[tf.ones(5)],
+        groundtruth_weights_list=weights,
-        groundtruth_masks_list=[tf.ones((5, 32, 32))])
+        groundtruth_masks_list=masks)
    loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
    self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
-    for weak_loss in deepmac_meta_arch.WEAK_LOSSES:
+    for mask_loss in deepmac_meta_arch.MASK_LOSSES:
-      self.assertGreater(loss['Loss/' + weak_loss], 0.0,
+      self.assertGreater(loss['Loss/' + mask_loss], 0.0,
-                         '{} was <= 0'.format(weak_loss))
+                         '{} was <= 0'.format(mask_loss))
+    rng = random.Random(0)
+    loss_weights = {
+        deepmac_meta_arch.DEEP_MASK_ESTIMATION: rng.uniform(1, 5),
+        deepmac_meta_arch.DEEP_MASK_BOX_CONSISTENCY: rng.uniform(1, 5),
+        deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY: rng.uniform(1, 5)
+    }
+    weighted_model = build_meta_arch(
+        use_dice_loss=True,
+        predict_full_resolution_masks=True,
+        network_type='cond_inst1',
+        dim=9,
+        pixel_embedding_dim=8,
+        use_instance_embedding=False,
+        use_xy=False,
+        task_loss_weight=loss_weights[deepmac_meta_arch.DEEP_MASK_ESTIMATION],
+        box_consistency_loss_weight=(
+            loss_weights[deepmac_meta_arch.DEEP_MASK_BOX_CONSISTENCY]),
+        color_consistency_loss_weight=(
+            loss_weights[deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY]))
+    weighted_model.provide_groundtruth(
+        groundtruth_boxes_list=boxes,
+        groundtruth_classes_list=classes,
+        groundtruth_weights_list=weights,
+        groundtruth_masks_list=masks)
+    weighted_loss = weighted_model.loss(prediction, tf.constant([[32, 32, 3]]))
+    for mask_loss in deepmac_meta_arch.MASK_LOSSES:
+      loss_key = 'Loss/' + mask_loss
+      self.assertAllEqual(
+          weighted_loss[loss_key], loss[loss_key] * loss_weights[mask_loss],
+          f'{mask_loss} did not respond to change in weight.')
+  def test_color_consistency_warmup(self):
+    """Checks that the color consistency loss ramps up with training_step.
+
+    The model is built with color_consistency_warmup_start=10 and
+    color_consistency_warmup_steps=10. The test asserts the loss is 0 at step
+    5, half of its step-20 value at step 15, and equal at steps 20 and 100.
+    """
+    model = build_meta_arch(
+        use_dice_loss=True,
+        predict_full_resolution_masks=True,
+        network_type='cond_inst1',
+        dim=9,
+        pixel_embedding_dim=8,
+        use_instance_embedding=False,
+        use_xy=False,
+        color_consistency_warmup_steps=10,
+        color_consistency_warmup_start=10)
+    num_stages = 1
+    prediction = {
+        'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
+        'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
+        'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
+        'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
+        'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
+        'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
+    }
+    # The same groundtruth is provided at every step; only training_step varies.
+    groundtruth = dict(
+        groundtruth_boxes_list=[tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)],
+        groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)],
+        groundtruth_weights_list=[tf.ones(5)],
+        groundtruth_masks_list=[tf.ones((5, 32, 32))])
+    loss_key = 'Loss/' + deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY
+    color_loss = {}
+    for step in (5, 15, 20, 100):
+      model.provide_groundtruth(training_step=step, **groundtruth)
+      losses_at_step = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
+      color_loss[step] = losses_at_step[loss_key].numpy()
+    self.assertAlmostEqual(color_loss[5], 0.0)
+    self.assertAlmostEqual(color_loss[15], color_loss[20] / 2.0)
+    self.assertAlmostEqual(color_loss[20], color_loss[100])
 @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')

--- a/research/object_detection/model_lib.py
+++ b/research/object_detection/model_lib.py
@@ -114,6 +114,10 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
      'groundtruth_not_exhaustive_classes': [batch_size, num_classes] K-hot
        representation of 1-indexed classes which don't have all of their
        instances marked exhaustively.
+      'input_data_fields.groundtruth_image_classes': integer representation of
+        the classes that were sent for verification for a given image. Note that
+        this field does not support batching as the number of classes can be
+        variable.
    class_agnostic: Boolean indicating whether detections are class agnostic.
  """
  input_data_fields = fields.InputDataFields()
@@ -136,6 +140,18 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
      input_data_fields.groundtruth_classes: groundtruth_classes
  }
+  if detection_model.groundtruth_has_field(
+      input_data_fields.groundtruth_image_classes):
+    groundtruth_image_classes_k_hot = tf.stack(
+        detection_model.groundtruth_lists(
+            input_data_fields.groundtruth_image_classes))
+    # We do not add label_id_offset here because it was not added when encoding
+    # groundtruth_image_classes.
+    groundtruth_image_classes = tf.expand_dims(
+        tf.where(groundtruth_image_classes_k_hot > 0)[:, 1], 0)
+    groundtruth[
+        input_data_fields.groundtruth_image_classes] = groundtruth_image_classes
  if detection_model.groundtruth_has_field(fields.BoxListFields.masks):
    groundtruth[input_data_fields.groundtruth_instance_masks] = tf.stack(
        detection_model.groundtruth_lists(fields.BoxListFields.masks))
@@ -303,7 +319,7 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
  return unbatched_tensor_dict
-def provide_groundtruth(model, labels):
+def provide_groundtruth(model, labels, training_step=None):
  """Provides the labels to a model as groundtruth.
  This helper function extracts the corresponding boxes, classes,
@@ -313,6 +329,8 @@ def provide_groundtruth(model, labels):
  Args:
    model: The detection model to provide groundtruth to.
    labels: The labels for the training or evaluation inputs.
+    training_step: int, optional. The training step for the model. Useful
+      for models which want to anneal loss weights.
  """
  gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
  gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
@@ -382,6 +400,10 @@ def provide_groundtruth(model, labels):
  if fields.InputDataFields.groundtruth_not_exhaustive_classes in labels:
    gt_not_exhaustive_classes = labels[
        fields.InputDataFields.groundtruth_not_exhaustive_classes]
+  groundtruth_image_classes = None
+  if fields.InputDataFields.groundtruth_image_classes in labels:
+    groundtruth_image_classes = labels[
+        fields.InputDataFields.groundtruth_image_classes]
  model.provide_groundtruth(
      groundtruth_boxes_list=gt_boxes_list,
      groundtruth_classes_list=gt_classes_list,
@@ -402,7 +424,9 @@ def provide_groundtruth(model, labels):
      groundtruth_verified_neg_classes=gt_verified_neg_classes,
      groundtruth_not_exhaustive_classes=gt_not_exhaustive_classes,
      groundtruth_keypoint_depths_list=gt_keypoint_depths_list,
-      groundtruth_keypoint_depth_weights_list=gt_keypoint_depth_weights_list)
+      groundtruth_keypoint_depth_weights_list=gt_keypoint_depth_weights_list,
+      groundtruth_image_classes=groundtruth_image_classes,
+      training_step=training_step)
 def create_model_fn(detection_model_fn, configs, hparams=None, use_tpu=False,

--- a/research/object_detection/model_lib_v2.py
+++ b/research/object_detection/model_lib_v2.py
@@ -51,7 +51,7 @@ RESTORE_MAP_ERROR_TEMPLATE = (
 def _compute_losses_and_predictions_dicts(
-    model, features, labels,
+    model, features, labels, training_step=None,
    add_regularization_loss=True):
  """Computes the losses dict and predictions dict for a model on inputs.
@@ -107,6 +107,7 @@ def _compute_losses_and_predictions_dicts(
          float32 tensor containing keypoint depths information.
        labels[fields.InputDataFields.groundtruth_keypoint_depth_weights] is a
          float32 tensor containing the weights of the keypoint depth feature.
+    training_step: int, the current training step.
    add_regularization_loss: Whether or not to include the model's
      regularization loss in the losses dictionary.
@@ -116,7 +117,7 @@ def _compute_losses_and_predictions_dicts(
    `model.predict`.
  """
-  model_lib.provide_groundtruth(model, labels)
+  model_lib.provide_groundtruth(model, labels, training_step=training_step)
  preprocessed_images = features[fields.InputDataFields.image]
  prediction_dict = model.predict(
@@ -166,7 +167,8 @@ def _ensure_model_is_built(model, input_dataset, unpad_groundtruth_tensors):
    labels = model_lib.unstack_batch(
        labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
-    return _compute_losses_and_predictions_dicts(model, features, labels)
+    return _compute_losses_and_predictions_dicts(model, features, labels,
+                                                 training_step=0)
  strategy = tf.compat.v2.distribute.get_strategy()
  if hasattr(tf.distribute.Strategy, 'run'):
@@ -208,6 +210,7 @@ def eager_train_step(detection_model,
                     labels,
                     unpad_groundtruth_tensors,
                     optimizer,
+                     training_step,
                     add_regularization_loss=True,
                     clip_gradients_value=None,
                     num_replicas=1.0):
@@ -280,6 +283,7 @@ def eager_train_step(detection_model,
          float32 tensor containing the weights of the keypoint depth feature.
    unpad_groundtruth_tensors: A parameter passed to unstack_batch.
    optimizer: The training optimizer that will update the variables.
+    training_step: int, the training step number.
    add_regularization_loss: Whether or not to include the model's
      regularization loss in the losses dictionary.
    clip_gradients_value: If this is present, clip the gradients global norm
@@ -302,7 +306,9 @@ def eager_train_step(detection_model,
  with tf.GradientTape() as tape:
    losses_dict, _ = _compute_losses_and_predictions_dicts(
-        detection_model, features, labels, add_regularization_loss)
+        detection_model, features, labels,
+        training_step=training_step,
+        add_regularization_loss=add_regularization_loss)
    losses_dict = normalize_dict(losses_dict, num_replicas)
@@ -632,6 +638,7 @@ def train_loop(
              labels,
              unpad_groundtruth_tensors,
              optimizer,
+              training_step=global_step,
              add_regularization_loss=add_regularization_loss,
              clip_gradients_value=clip_gradients_value,
              num_replicas=strategy.num_replicas_in_sync)
@@ -901,7 +908,8 @@ def eager_eval_loop(
        labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
    losses_dict, prediction_dict = _compute_losses_and_predictions_dicts(
-        detection_model, features, labels, add_regularization_loss)
+        detection_model, features, labels, training_step=None,
+        add_regularization_loss=add_regularization_loss)
    prediction_dict = detection_model.postprocess(
        prediction_dict, features[fields.InputDataFields.true_image_shape])
    eval_features = {