Unverified Commit 5ffcc5b6 authored by Anirudh Vegesana, committed by GitHub

Merge branch 'purdue-yolo' into detection_generator_pr

parents 0b81a843 76e0c014
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Sfm120k dataset module."""
import tensorflow as tf
from delf.python.datasets.sfm120k import sfm120k
class Sfm120kTest(tf.test.TestCase):
"""Tests for Sfm120k dataset module."""
def testId2Filename(self):
"""Tests conversion of image id to full path mapping."""
image_id = "29fdc243aeb939388cfdf2d081dc080e"
prefix = "train/retrieval-SfM-120k/ims/"
path = sfm120k.id2filename(image_id, prefix)
expected_path = "train/retrieval-SfM-120k/ims/0e/08/dc" \
"/29fdc243aeb939388cfdf2d081dc080e"
self.assertEqual(path, expected_path)
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tuple dataset module.
Based on the ECCV16 work of Radenovic et al.: CNN Image Retrieval Learns from BoW.
For more information refer to https://arxiv.org/abs/1604.02426.
"""
import os
import pickle
import numpy as np
import tensorflow as tf
from delf.python.datasets import utils as image_loading_utils
from delf.python.training import global_features_utils
from delf.python.training.model import global_model
class TuplesDataset():
"""Data loader that loads training and validation tuples.
After initialization, the function create_epoch_tuples() should be called to
create the dataset tuples. After that, the dataset can be iterated through
using the next() function.
Tuples are based on Radenovic et al. ECCV16 work: CNN image retrieval
learns from BoW. For more information refer to
https://arxiv.org/abs/1604.02426.
"""
def __init__(self, name, mode, data_root, imsize=None, num_negatives=5,
num_queries=2000, pool_size=20000,
loader=image_loading_utils.default_loader, ims_root=None):
"""TuplesDataset object initialization.
Args:
name: String, dataset name, e.g. 'retrieval-sfm-120k'.
mode: 'train' or 'val' for training and validation parts of dataset.
data_root: Path to the root directory of the dataset.
imsize: Integer, defines the maximum size of the longer image side.
num_negatives: Integer, number of negative images for a query image in a
training tuple.
num_queries: Integer, number of query images to be processed in one epoch.
pool_size: Integer, size of the negative image pool from which the
hard-negative images are re-mined.
loader: Callable, a function to load an image given its path.
ims_root: String, image root directory.
Raises:
ValueError: If mode is not either 'train' or 'val'.
"""
if mode not in ['train', 'val']:
raise ValueError(
"`mode` argument should be either 'train' or 'val', passed as a "
"String.")
# Loading db.
db_filename = os.path.join(data_root, '{}.pkl'.format(name))
with tf.io.gfile.GFile(db_filename, 'rb') as f:
db = pickle.load(f)[mode]
# Initializing tuples dataset.
self._ims_root = data_root if ims_root is None else ims_root
self._name = name
self._mode = mode
self._imsize = imsize
self._clusters = db['cluster']
self._query_pool = db['qidxs']
self._positive_pool = db['pidxs']
if not hasattr(self, 'images'):
self.images = db['ids']
# Size of training subset for an epoch.
self._num_negatives = num_negatives
self._num_queries = min(num_queries, len(self._query_pool))
self._pool_size = min(pool_size, len(self.images))
self._qidxs = None
self._pidxs = None
self._nidxs = None
self._loader = loader
self._print_freq = 10
# Indexer for the iterator.
self._n = 0
def __iter__(self):
"""Function for making TupleDataset an iterator.
Returns:
iter: The iterator object itself (TupleDataset).
"""
return self
def __next__(self):
"""Function for making TupleDataset an iterator.
Returns:
next: The next item in the sequence (next dataset image tuple).
"""
if self._n < len(self._qidxs):
result = self.__getitem__(self._n)
self._n += 1
return result
else:
raise StopIteration
def _img_names_to_full_path(self, image_list):
"""Converts list of image names to the list of full paths to the images.
Args:
image_list: Image names, either a list or a single image path.
Returns:
image_full_paths: List of full paths to the images.
"""
if not isinstance(image_list, list):
return os.path.join(self._ims_root, image_list)
return [os.path.join(self._ims_root, img_name) for img_name in image_list]
def __getitem__(self, index):
"""Called to load an image tuple at the given `index`.
Args:
index: Integer, index.
Returns:
output: Tuple [q,p,n1,...,nN, target], loaded 'train'/'val' tuple at
index of qidxs. `q` is the query image tensor, `p` is the
corresponding positive image tensor, `n1`,...,`nN` are the negatives
associated with the query. `target` is a tensor (with the shape [2+N])
of integer labels corresponding to the tuple list: query (-1),
positive (1), negative (0).
Raises:
ValueError: Raised if the query indexes list `qidxs` is empty.
"""
if self.__len__() == 0:
raise ValueError(
"List `qidxs` is empty. Run `dataset.create_epoch_tuples(net)` "
"method to create subset for `train`/`val`.")
output = []
# Query image.
output.append(self._loader(
self._img_names_to_full_path(self.images[self._qidxs[index]]),
self._imsize))
# Positive image.
output.append(self._loader(
self._img_names_to_full_path(self.images[self._pidxs[index]]),
self._imsize))
# Negative images.
for nidx in self._nidxs[index]:
output.append(self._loader(
self._img_names_to_full_path(self.images[nidx]),
self._imsize))
# Labels for the query (-1), positive (1), negative (0) images in the tuple.
target = tf.convert_to_tensor([-1, 1] + [0] * self._num_negatives)
output.append(target)
return tuple(output)
def __len__(self):
"""Called to implement the built-in function len().
Returns:
len: Integer, number of query images.
"""
if self._qidxs is None:
return 0
return len(self._qidxs)
def __repr__(self):
"""Metadata for the TupleDataset.
Returns:
meta: String, containing TupleDataset meta.
"""
fmt_str = self.__class__.__name__ + '\n'
fmt_str += '\tName and mode: {} {}\n'.format(self._name, self._mode)
fmt_str += '\tNumber of images: {}\n'.format(len(self.images))
fmt_str += '\tNumber of training tuples: {}\n'.format(len(self._query_pool))
fmt_str += '\tNumber of negatives per tuple: {}\n'.format(
self._num_negatives)
fmt_str += '\tNumber of tuples processed in an epoch: {}\n'.format(
self._num_queries)
fmt_str += '\tPool size for negative remining: {}\n'.format(self._pool_size)
return fmt_str
def create_epoch_tuples(self, net):
"""Creates epoch tuples with the hard-negative re-mining.
Negative examples are selected from clusters different from the cluster
of the query image, as the clusters are ideally non-overlapping. For
every query image we choose hard negatives, that is, non-matching images
with the most similar descriptors. Hard negatives depend on the current
CNN parameters. K-nearest neighbors from all non-matching images are
selected. Query images are selected randomly. Positive examples are
fixed for the related query image during the whole training process.
Args:
net: Model, network to be used for negative re-mining.
Raises:
ValueError: If the pool_size is smaller than the number of negative
images per tuple.
Returns:
avg_l2: Float, average negative L2-distance.
"""
self._n = 0
if self._pool_size < self._num_negatives:
raise ValueError("Unable to create epoch tuples. Negative pool_size "
"should be larger than the number of negative images "
"per tuple.")
global_features_utils.debug_and_log(
'>> Creating tuples for an epoch of {}-{}...'.format(self._name,
self._mode),
True)
global_features_utils.debug_and_log(">> Used network: ", True)
global_features_utils.debug_and_log(net.meta_repr(), True)
## Selecting queries.
# Draw `num_queries` random queries for the tuples.
idx_list = np.arange(len(self._query_pool))
np.random.shuffle(idx_list)
idxs2query_pool = idx_list[:self._num_queries]
self._qidxs = [self._query_pool[i] for i in idxs2query_pool]
## Selecting positive pairs.
# Positive examples are fixed for each query during the whole training
# process.
self._pidxs = [self._positive_pool[i] for i in idxs2query_pool]
## Selecting negative pairs.
# If `num_negatives` is 0, create dummy nidxs.
# Useful when only positives are used for training.
if self._num_negatives == 0:
self._nidxs = [[] for _ in range(len(self._qidxs))]
return 0
# Draw pool_size random images for the pool of negative images.
neg_idx_list = np.arange(len(self.images))
np.random.shuffle(neg_idx_list)
neg_images_idxs = neg_idx_list[:self._pool_size]
global_features_utils.debug_and_log(
'>> Extracting descriptors for query images...', debug=True)
img_list = self._img_names_to_full_path([self.images[i] for i in
self._qidxs])
qvecs = global_model.extract_global_descriptors_from_list(
net,
images=img_list,
image_size=self._imsize,
print_freq=self._print_freq)
global_features_utils.debug_and_log(
'>> Extracting descriptors for negative pool...', debug=True)
poolvecs = global_model.extract_global_descriptors_from_list(
net,
images=self._img_names_to_full_path([self.images[i] for i in
neg_images_idxs]),
image_size=self._imsize,
print_freq=self._print_freq)
global_features_utils.debug_and_log('>> Searching for hard negatives...',
debug=True)
# Compute dot product scores and ranks.
scores = tf.linalg.matmul(poolvecs, qvecs, transpose_a=True)
ranks = tf.argsort(scores, axis=0, direction='DESCENDING')
sum_ndist = 0.
n_ndist = 0.
# Selection of negative examples.
self._nidxs = []
for q, qidx in enumerate(self._qidxs):
# We do not use the query cluster, since those images are potentially
# positive.
qcluster = self._clusters[qidx]
clusters = [qcluster]
nidxs = []
rank = 0
while len(nidxs) < self._num_negatives:
if rank >= tf.shape(ranks)[0]:
raise ValueError("Unable to create epoch tuples. Number of required "
"negative images is larger than the number of "
"clusters in the dataset.")
potential = neg_images_idxs[ranks[rank, q]]
# Take at most one image from the same cluster.
if self._clusters[potential] not in clusters:
nidxs.append(potential)
clusters.append(self._clusters[potential])
dist = tf.norm(qvecs[:, q] - poolvecs[:, ranks[rank, q]],
axis=0).numpy()
sum_ndist += dist
n_ndist += 1
rank += 1
self._nidxs.append(nidxs)
global_features_utils.debug_and_log(
'>> Average negative l2-distance: {:.2f}'.format(
sum_ndist / n_ndist))
# Return average negative L2-distance.
return sum_ndist / n_ndist
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"Tests for the tuples dataset module."
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
import numpy as np
from PIL import Image
import tensorflow as tf
import pickle
from delf.python.datasets import tuples_dataset
from delf.python.training.model import global_model
FLAGS = flags.FLAGS
class TuplesDatasetTest(tf.test.TestCase):
"""Tests for tuples dataset module."""
def testCreateEpochTuples(self):
"""Tests epoch tuple creation."""
# Create a tuples dataset instance.
name = 'test_dataset'
num_queries = 1
pool_size = 5
num_negatives = 2
# Create a ground truth .pkl file.
gnd = {
'train': {'ids': [str(i) + '.png' for i in range(2 * num_queries + pool_size)],
'cluster': [0, 0, 1, 2, 3, 4, 5],
'qidxs': [0], 'pidxs': [1]}}
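# The query (index 0) and its positive (index 1) share cluster 0, while every
# pool image sits in its own cluster, so hard negatives can always be drawn
# from clusters different from the query's.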
gnd_name = name + '.pkl'
with tf.io.gfile.GFile(os.path.join(FLAGS.test_tmpdir, gnd_name),
'wb') as gnd_file:
pickle.dump(gnd, gnd_file)
# Create random images for the dataset.
for i in range(2 * num_queries + pool_size):
dummy_image = np.random.rand(1024, 750, 3) * 255
img_out = Image.fromarray(dummy_image.astype('uint8')).convert('RGB')
filename = os.path.join(FLAGS.test_tmpdir, '{}.png'.format(i))
img_out.save(filename)
dataset = tuples_dataset.TuplesDataset(
name=name,
data_root=FLAGS.test_tmpdir,
mode='train',
imsize=1024,
num_negatives=num_negatives,
num_queries=num_queries,
pool_size=pool_size
)
# Assert that initially no negative images are set.
self.assertIsNone(dataset._nidxs)
# Initialize a network for negative re-mining.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
avg_neg_distance = dataset.create_epoch_tuples(model)
# Check that an appropriate number of negative images has been chosen per
# query.
self.assertAllEqual(tf.shape(dataset._nidxs), [num_queries, num_negatives])
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Global model training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training script for Global Features model."""
import math
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from delf.python.datasets.sfm120k import dataset_download
from delf.python.datasets.sfm120k import sfm120k
from delf.python.training import global_features_utils
from delf.python.training import tensorboard_utils
from delf.python.training.global_features import train_utils
from delf.python.training.losses import ranking_losses
from delf.python.training.model import global_model
_LOSS_NAMES = ['contrastive', 'triplet']
_MODEL_NAMES = global_features_utils.get_standard_keras_models()
_OPTIMIZER_NAMES = ['sgd', 'adam']
_POOL_NAMES = ['mac', 'spoc', 'gem']
_PRECOMPUTE_WHITEN_NAMES = ['retrieval-SfM-30k', 'retrieval-SfM-120k']
_TEST_DATASET_NAMES = ['roxford5k', 'rparis6k']
_TRAINING_DATASET_NAMES = ['retrieval-SfM-120k']
_VALIDATION_TYPES = ['standard', 'eccv2020']
FLAGS = flags.FLAGS
flags.DEFINE_boolean('debug', False, 'Debug mode.')
# Export directory, training and val datasets, test datasets.
flags.DEFINE_string('data_root', "data",
'Absolute path to the folder containing training data.')
flags.DEFINE_string('directory', "data",
'Destination where trained network should be saved.')
flags.DEFINE_enum('training_dataset', 'retrieval-SfM-120k',
_TRAINING_DATASET_NAMES, 'Training dataset: ' +
' | '.join(_TRAINING_DATASET_NAMES) + '.')
flags.DEFINE_enum('validation_type', None, _VALIDATION_TYPES,
'Type of the evaluation to use. Either `None`, `standard` '
'or `eccv2020`.')
flags.DEFINE_list('test_datasets', 'roxford5k,rparis6k',
'Comma separated list of test datasets: ' +
' | '.join(_TEST_DATASET_NAMES) + '.')
flags.DEFINE_enum('precompute_whitening', None, _PRECOMPUTE_WHITEN_NAMES,
'Dataset used to learn whitening: ' +
' | '.join(_PRECOMPUTE_WHITEN_NAMES) + '.')
flags.DEFINE_integer('test_freq', 5,
'Run test evaluation every N epochs.')
flags.DEFINE_list('multiscale', [1.],
'Use multiscale vectors for testing, '
'examples: 1 | 1,1/2**(1/2),1/2 | 1,2**(1/2),1/2**(1/2). '
'Pass as a string of comma separated values.')
# Network architecture and initialization options.
flags.DEFINE_enum('arch', 'ResNet101', _MODEL_NAMES,
'Model architecture: ' + ' | '.join(_MODEL_NAMES) + '.')
flags.DEFINE_enum('pool', 'gem', _POOL_NAMES,
'Pooling options: ' + ' | '.join(_POOL_NAMES) + '.')
flags.DEFINE_bool('whitening', False,
'Whether to train model with learnable whitening ('
'linear layer) after the pooling.')
flags.DEFINE_bool('pretrained', True,
'Whether to initialize the model with weights pretrained '
'on ImageNet (default) instead of random weights.')
flags.DEFINE_enum('loss', 'contrastive', _LOSS_NAMES,
'Training loss options: ' + ' | '.join(_LOSS_NAMES) + '.')
flags.DEFINE_float('loss_margin', 0.7, 'Loss margin.')
# train/val options specific for image retrieval learning.
flags.DEFINE_integer('image_size', 1024,
'Maximum size of longer image side used for training.')
flags.DEFINE_integer('neg_num', 5, 'Number of negative images per train/val '
'tuple.')
flags.DEFINE_integer('query_size', 2000,
'Number of queries randomly drawn per one training epoch.')
flags.DEFINE_integer('pool_size', 20000,
'Size of the pool for hard negative mining.')
# Standard training/validation options.
flags.DEFINE_string('gpu_id', '0', 'GPU id used for training.')
flags.DEFINE_integer('epochs', 100, 'Number of total epochs to run.')
flags.DEFINE_integer('batch_size', 5,
'Number of (q,p,n1,...,nN) tuples in a mini-batch.')
flags.DEFINE_integer('update_every', 1,
'Update model weights every N batches; used to handle '
'relatively large batches, so the effective batch size '
'becomes update_every * batch_size.')
flags.DEFINE_enum('optimizer', 'adam', _OPTIMIZER_NAMES,
'Optimizer options: ' + ' | '.join(_OPTIMIZER_NAMES) + '.')
flags.DEFINE_float('lr', 1e-6, 'Initial learning rate.')
flags.DEFINE_float('momentum', 0.9, 'Momentum.')
flags.DEFINE_float('weight_decay', 1e-6, 'Weight decay.')
flags.DEFINE_bool('resume', False,
'Whether to start from the latest checkpoint in the logdir.')
flags.DEFINE_bool('launch_tensorboard', False, 'Whether to launch tensorboard.')
def main(argv):
if len(argv) > 1:
raise RuntimeError('Too many command-line arguments.')
# Manually check if there are unknown test datasets and if the dataset
# ground truth files are downloaded.
for dataset in FLAGS.test_datasets:
if dataset not in _TEST_DATASET_NAMES:
raise ValueError('Unsupported or unknown test dataset: {}.'.format(
dataset))
test_data_config = os.path.join(FLAGS.data_root,
'gnd_{}.pkl'.format(dataset))
if not tf.io.gfile.exists(test_data_config):
raise ValueError(
'{} ground truth file at {} not found. Please download it '
'according to '
'the DELG instructions.'.format(dataset, FLAGS.data_root))
# Check if train dataset is downloaded and download it if not found.
dataset_download.download_train(FLAGS.data_root)
# Creating model export directory if it does not exist.
model_directory = global_features_utils.create_model_directory(
FLAGS.training_dataset, FLAGS.arch, FLAGS.pool, FLAGS.whitening,
FLAGS.pretrained, FLAGS.loss, FLAGS.loss_margin, FLAGS.optimizer,
FLAGS.lr, FLAGS.weight_decay, FLAGS.neg_num, FLAGS.query_size,
FLAGS.pool_size, FLAGS.batch_size, FLAGS.update_every,
FLAGS.image_size, FLAGS.directory)
# Setting up logging directory, same as where the model is stored.
logging.get_absl_handler().use_absl_log_file('absl_logging', model_directory)
# Set cuda visible device.
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_id
global_features_utils.debug_and_log('>> Num GPUs Available: {}'.format(
len(tf.config.experimental.list_physical_devices('GPU'))),
FLAGS.debug)
# Set random seeds.
tf.random.set_seed(0)
np.random.seed(0)
# Initialize the model.
if FLAGS.pretrained:
global_features_utils.debug_and_log(
'>> Using pre-trained model \'{}\''.format(FLAGS.arch))
else:
global_features_utils.debug_and_log(
'>> Using model from scratch (random weights) \'{}\'.'.format(
FLAGS.arch))
model_params = {'architecture': FLAGS.arch, 'pooling': FLAGS.pool,
'whitening': FLAGS.whitening, 'pretrained': FLAGS.pretrained,
'data_root': FLAGS.data_root}
model = global_model.GlobalFeatureNet(**model_params)
# Freeze running mean and std in batch normalization layers.
# We train on one image at a time to reduce the memory requirements of
# the network; therefore, the computed statistics would not be per
# batch. Instead, we freeze them, setting the parameters of all batch
# norm layers in the network to non-trainable (i.e., using the original
# ImageNet statistics).
for layer in model.feature_extractor.layers:
if isinstance(layer, tf.keras.layers.BatchNormalization):
layer.trainable = False
global_features_utils.debug_and_log('>> Network initialized.')
global_features_utils.debug_and_log('>> Loss: {}.'.format(FLAGS.loss))
# Define the loss function.
if FLAGS.loss == 'contrastive':
criterion = ranking_losses.ContrastiveLoss(margin=FLAGS.loss_margin)
elif FLAGS.loss == 'triplet':
criterion = ranking_losses.TripletLoss(margin=FLAGS.loss_margin)
else:
raise ValueError('Loss {} not available.'.format(FLAGS.loss))
# Defining parameters for the training.
# When pre-computing whitening, we run an evaluation before the network
# training starts (at epoch 0); training itself always starts from epoch 1.
start_epoch = 1
exp_decay = math.exp(-0.01)
decay_steps = FLAGS.query_size / FLAGS.batch_size
# Define learning rate decay schedule.
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=FLAGS.lr,
decay_steps=decay_steps,
decay_rate=exp_decay)
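# With decay_steps = query_size / batch_size and decay_rate = exp(-0.01), the
# learning rate is multiplied by exp(-0.01) every query_size / batch_size
# optimizer steps, i.e. roughly once per epoch when update_every is 1.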
# Define the optimizer.
if FLAGS.optimizer == 'sgd':
opt = tfa.optimizers.extend_with_decoupled_weight_decay(
tf.keras.optimizers.SGD)
optimizer = opt(weight_decay=FLAGS.weight_decay,
learning_rate=lr_scheduler, momentum=FLAGS.momentum)
elif FLAGS.optimizer == 'adam':
opt = tfa.optimizers.extend_with_decoupled_weight_decay(
tf.keras.optimizers.Adam)
optimizer = opt(weight_decay=FLAGS.weight_decay, learning_rate=lr_scheduler)
else:
raise ValueError('Optimizer {} not available.'.format(FLAGS.optimizer))
# Initializing logging.
writer = tf.summary.create_file_writer(model_directory)
tf.summary.experimental.set_step(1)
# Setting up the checkpoint manager.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(
checkpoint,
model_directory,
max_to_keep=10,
keep_checkpoint_every_n_hours=3)
if FLAGS.resume:
# Restores the checkpoint, if existing.
global_features_utils.debug_and_log('>> Continuing from a checkpoint.')
checkpoint.restore(manager.latest_checkpoint)
# Launching tensorboard if required.
if FLAGS.launch_tensorboard:
tensorboard = tf.keras.callbacks.TensorBoard(model_directory)
tensorboard.set_model(model=model)
tensorboard_utils.launch_tensorboard(log_dir=model_directory)
# Log flags used.
global_features_utils.debug_and_log('>> Running training script with:')
global_features_utils.debug_and_log('>> logdir = {}'.format(model_directory))
if FLAGS.training_dataset.startswith('retrieval-SfM-120k'):
train_dataset = sfm120k.CreateDataset(
data_root=FLAGS.data_root,
mode='train',
imsize=FLAGS.image_size,
num_negatives=FLAGS.neg_num,
num_queries=FLAGS.query_size,
pool_size=FLAGS.pool_size
)
if FLAGS.validation_type is not None:
val_dataset = sfm120k.CreateDataset(
data_root=FLAGS.data_root,
mode='val',
imsize=FLAGS.image_size,
num_negatives=FLAGS.neg_num,
num_queries=float('Inf'),
pool_size=float('Inf'),
eccv2020=(FLAGS.validation_type == 'eccv2020')
)
train_dataset_output_types = [tf.float32 for i in range(2 + FLAGS.neg_num)]
train_dataset_output_types.append(tf.int32)
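# Each element yielded by the dataset generator is a (q, p, n1, ..., nN,
# target) tuple: 2 + neg_num float32 image tensors followed by one int32
# label tensor, matching the output types declared above.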
global_features_utils.debug_and_log(
'>> Training the {} network'.format(model_directory))
global_features_utils.debug_and_log('>> GPU ids: {}'.format(FLAGS.gpu_id))
with writer.as_default():
# Precompute whitening if needed.
if FLAGS.precompute_whitening is not None:
epoch = 0
train_utils.test_retrieval(
FLAGS.test_datasets, model, writer=writer,
epoch=epoch, model_directory=model_directory,
precompute_whitening=FLAGS.precompute_whitening,
data_root=FLAGS.data_root,
multiscale=FLAGS.multiscale)
for epoch in range(start_epoch, FLAGS.epochs + 1):
# Set manual seeds per epoch.
np.random.seed(epoch)
tf.random.set_seed(epoch)
# Find hard-negatives.
# Queries are drawn randomly every epoch and their positive examples stay
# fixed for the whole training process; hard-negatives depend on the
# current CNN parameters and are re-mined once per epoch.
avg_neg_distance = train_dataset.create_epoch_tuples(model)
def _train_gen():
return (inst for inst in train_dataset)
train_loader = tf.data.Dataset.from_generator(
_train_gen,
output_types=tuple(train_dataset_output_types))
loss = train_utils.train_val_one_epoch(
loader=iter(train_loader), model=model,
criterion=criterion, optimizer=optimizer, epoch=epoch,
batch_size=FLAGS.batch_size, query_size=FLAGS.query_size,
neg_num=FLAGS.neg_num, update_every=FLAGS.update_every,
debug=FLAGS.debug)
# Write a scalar summary.
tf.summary.scalar('train_epoch_loss', loss, step=epoch)
# Forces summary writer to send any buffered data to storage.
writer.flush()
# Evaluate on validation set.
if FLAGS.validation_type is not None and (epoch % FLAGS.test_freq == 0 or
epoch == 1):
avg_neg_distance = val_dataset.create_epoch_tuples(model)
def _val_gen():
return (inst for inst in val_dataset)
val_loader = tf.data.Dataset.from_generator(
_val_gen, output_types=tuple(train_dataset_output_types))
loss = train_utils.train_val_one_epoch(
loader=iter(val_loader), model=model,
criterion=criterion, optimizer=None,
epoch=epoch, train=False, batch_size=FLAGS.batch_size,
query_size=FLAGS.query_size, neg_num=FLAGS.neg_num,
update_every=FLAGS.update_every, debug=FLAGS.debug)
tf.summary.scalar('val_epoch_loss', loss, step=epoch)
writer.flush()
# Evaluate on test datasets every test_freq epochs.
if epoch == 1 or epoch % FLAGS.test_freq == 0:
train_utils.test_retrieval(
FLAGS.test_datasets, model, writer=writer, epoch=epoch,
model_directory=model_directory,
precompute_whitening=FLAGS.precompute_whitening,
data_root=FLAGS.data_root, multiscale=FLAGS.multiscale)
# Saving checkpoints and model weights.
try:
save_path = manager.save(checkpoint_number=epoch)
global_features_utils.debug_and_log(
'Saved ({}) at {}'.format(epoch, save_path))
filename = os.path.join(model_directory,
'checkpoint_epoch_{}.h5'.format(epoch))
model.save_weights(filename, save_format='h5')
global_features_utils.debug_and_log(
'Saved weights ({}) at {}'.format(epoch, filename))
except Exception as ex:
global_features_utils.debug_and_log(
'Could not save checkpoint: {}'.format(ex))
if __name__ == '__main__':
app.run(main)
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training utilities for Global Features model."""
import os
import pickle
import time
import numpy as np
import tensorflow as tf
from delf.python import whiten
from delf.python.datasets.revisited_op import dataset as test_dataset
from delf.python.datasets.sfm120k import sfm120k
from delf.python.training import global_features_utils
from delf.python.training.model import global_model
def _compute_loss_and_gradient(criterion, model, input, target, neg_num=5):
"""Records gradients and loss through the network.
Args:
criterion: Loss function.
model: Network for the gradient computation.
input: Tuple of query, positive and negative images.
target: List of indexes to specify queries (-1), positives(1), negatives(0).
neg_num: Integer, number of negatives per tuple.
Returns:
loss: Loss for the training step.
gradients: Computed gradients for the network trainable variables.
"""
# Record gradients and loss through the network.
with tf.GradientTape() as tape:
descriptors = tf.zeros(shape=(0, model.meta['outputdim']), dtype=tf.float32)
for img in input:
# Compute descriptor vector for each image.
o = model(tf.expand_dims(img, axis=0), training=True)
descriptors = tf.concat([descriptors, o], 0)
queries = descriptors[target == -1]
positives = descriptors[target == 1]
negatives = descriptors[target == 0]
negatives = tf.reshape(negatives, [tf.shape(queries)[0], neg_num,
model.meta['outputdim']])
# Loss calculation.
loss = criterion(queries, positives, negatives)
return loss, tape.gradient(loss, model.trainable_variables)
def train_val_one_epoch(
loader, model, criterion, optimizer, epoch, train=True, batch_size=5,
query_size=2000, neg_num=5, update_every=1, debug=False):
"""Executes either training or validation step based on `train` value.
Args:
loader: Training/validation iterable dataset.
model: Network to train/validate.
criterion: Loss function.
optimizer: Network optimizer.
epoch: Integer, epoch number.
train: Bool, specifies training or validation phase.
batch_size: Integer, number of (q,p,n1,...,nN) tuples in a mini-batch.
query_size: Integer, number of queries randomly drawn per one training
epoch.
neg_num: Integer, number of negatives per tuple.
update_every: Integer, update model weights every N batches; used to
handle relatively large batches, so the effective batch size becomes
update_every x batch_size.
debug: Bool, whether debug mode is used.
Returns:
average_epoch_loss: Average epoch loss.
"""
batch_time = global_features_utils.AverageMeter()
data_time = global_features_utils.AverageMeter()
losses = global_features_utils.AverageMeter()
# Retrieve all trainable variables we defined in the graph.
tvs = model.trainable_variables
accum_grads = [tf.zeros_like(tv.read_value()) for tv in tvs]
end = time.time()
batch_num = 0
print_frequency = 10
all_batch_num = query_size // batch_size
state = 'Train' if train else 'Val'
global_features_utils.debug_and_log('>> {} step:'.format(state))
# For every batch in the dataset; Stops when all batches in the dataset have
# been processed.
while True:
data_time.update(time.time() - end)
if train:
try:
# Train on one batch.
# Each image in the batch is loaded into memory consecutively.
for _ in range(batch_size):
# Because the images are not necessarily of the same size, we can't
# set the batch size with .batch().
batch = loader.get_next()
input_tuple = batch[0:-1]
target_tuple = batch[-1]
loss_value, grads = _compute_loss_and_gradient(
criterion, model, input_tuple, target_tuple, neg_num)
losses.update(loss_value)
# Accumulate gradients element-wise (a plain list `+=` would
# concatenate the lists instead of summing the tensors).
accum_grads = [
accum_grad + grad for accum_grad, grad in zip(accum_grads, grads)
]
# Perform weight update if required.
if (batch_num + 1) % update_every == 0 or (
batch_num + 1) == all_batch_num:
# Do one step for multiple batches. Accumulated gradients are
# used.
optimizer.apply_gradients(
zip(accum_grads, model.trainable_variables))
accum_grads = [tf.zeros_like(tv.read_value()) for tv in tvs]
# We break when we run out of range, i.e., we exhausted all dataset
# images.
except tf.errors.OutOfRangeError:
break
else:
# Validate one batch.
# We load full batch into memory.
input = []
target = []
try:
for _ in range(batch_size):
# Because the images are not necessarily of the same size, we can't
# set the batch size with .batch().
batch = loader.get_next()
input.append(batch[0:-1])
target.append(batch[-1])
# We break when we run out of range, i.e., we exhausted all dataset
# images.
except tf.errors.OutOfRangeError:
break
descriptors = tf.zeros(shape=(0, model.meta['outputdim']),
dtype=tf.float32)
for input_tuple in input:
for img in input_tuple:
# Compute the global descriptor vector.
model_out = model(tf.expand_dims(img, axis=0), training=False)
descriptors = tf.concat([descriptors, model_out], 0)
# No need to reduce memory consumption (no backward pass):
# compute the loss for the full batch.
# `target` is a list of per-tuple label tensors; stack it into a single
# tensor so it can be used as a boolean mask over the descriptors.
target = tf.concat(target, axis=0)
queries = descriptors[target == -1]
positives = descriptors[target == 1]
negatives = descriptors[target == 0]
negatives = tf.reshape(negatives, [tf.shape(queries)[0], neg_num,
model.meta['outputdim']])
loss = criterion(queries, positives, negatives)
# Record loss.
losses.update(loss / batch_size, batch_size)
# Measure elapsed time.
batch_time.update(time.time() - end)
end = time.time()
# Record immediate loss and elapsed time.
if debug and ((batch_num + 1) % print_frequency == 0 or
batch_num == 0 or (batch_num + 1) == all_batch_num):
global_features_utils.debug_and_log(
'>> {0}: [{1} epoch][{2}/{3} batch]\t'
' Time val: {batch_time.val:.3f} '
'(Batch Time avg: {batch_time.avg:.3f})\t'
' Data {data_time.val:.3f} (Time avg: {data_time.avg:.3f})\t'
' Immediate loss value: {loss.val:.4f} '
'(Loss avg: {loss.avg:.4f})'.format(
state, epoch, batch_num + 1, all_batch_num,
batch_time=batch_time,
data_time=data_time, loss=losses), debug=True, log=False)
batch_num += 1
return losses.avg
def test_retrieval(datasets, net, epoch, writer=None, model_directory=None,
precompute_whitening=None, data_root='data', multiscale=[1.],
test_image_size=1024):
"""Testing step.
Evaluates the network on the provided test datasets by computing single-scale
mAP for easy/medium/hard cases. If `writer` is specified, saves the mAP
values in a tensorboard supported format.
Args:
datasets: List of dataset names for model testing (from
`_TEST_DATASET_NAMES`).
net: Network to evaluate.
epoch: Integer, epoch number.
writer: Tensorboard writer.
model_directory: String, path to the model directory.
precompute_whitening: Dataset used to learn whitening. If no
precomputation required, then `None`. Only 'retrieval-SfM-30k' and
'retrieval-SfM-120k' datasets are supported for whitening pre-computation.
data_root: Absolute path to the data folder.
multiscale: List of scales for multiscale testing.
test_image_size: Integer, maximum size of the test images.
"""
global_features_utils.debug_and_log(">> Testing step:")
global_features_utils.debug_and_log(
'>> Evaluating network on test datasets...')
# Precompute whitening.
if precompute_whitening is not None:
# If whitening already precomputed, load it and skip the computations.
filename = os.path.join(
model_directory, 'learned_whitening_mP_{}_epoch.pkl'.format(epoch))
filename_layer = os.path.join(
model_directory,
'learned_whitening_layer_config_{}_epoch.pkl'.format(
epoch))
if tf.io.gfile.exists(filename):
global_features_utils.debug_and_log(
'>> {}: Whitening for this epoch is already precomputed. '
'Loading...'.format(precompute_whitening))
with tf.io.gfile.GFile(filename, 'rb') as learned_whitening_file:
learned_whitening = pickle.load(learned_whitening_file)
else:
start = time.time()
global_features_utils.debug_and_log(
'>> {}: Learning whitening...'.format(precompute_whitening))
# Loading db.
db_root = os.path.join(data_root, 'train', precompute_whitening)
ims_root = os.path.join(db_root, 'ims')
db_filename = os.path.join(db_root,
'{}-whiten.pkl'.format(precompute_whitening))
with tf.io.gfile.GFile(db_filename, 'rb') as f:
db = pickle.load(f)
images = [sfm120k.id2filename(db['cids'][i], ims_root) for i in
range(len(db['cids']))]
# Extract whitening vectors.
global_features_utils.debug_and_log(
'>> {}: Extracting...'.format(precompute_whitening))
wvecs = global_model.extract_global_descriptors_from_list(net, images,
test_image_size)
# Learning whitening.
global_features_utils.debug_and_log(
'>> {}: Learning...'.format(precompute_whitening))
wvecs = wvecs.numpy()
mean_vector, projection_matrix = whiten.whitenlearn(wvecs, db['qidxs'],
db['pidxs'])
learned_whitening = {'m': mean_vector, 'P': projection_matrix}
global_features_utils.debug_and_log(
'>> {}: Elapsed time: {}'.format(precompute_whitening,
global_features_utils.htime(
time.time() - start)))
# Save learned_whitening parameters for a later use.
with tf.io.gfile.GFile(filename, 'wb') as learned_whitening_file:
pickle.dump(learned_whitening, learned_whitening_file)
# Saving whitening as a layer.
bias = -np.dot(mean_vector.T, projection_matrix.T)
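# With kernel = P^T and bias = -m^T P^T, the Dense layer below maps a
# row-vector descriptor x to x P^T - m^T P^T = (x - m) P^T, i.e. it applies
# the learned centering and whitening projection in a single layer.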
whitening_layer = tf.keras.layers.Dense(
net.meta['outputdim'],
activation=None,
use_bias=True,
kernel_initializer=tf.keras.initializers.Constant(
projection_matrix.T),
bias_initializer=tf.keras.initializers.Constant(bias)
)
with tf.io.gfile.GFile(filename_layer, 'wb') as learned_whitening_file:
pickle.dump(whitening_layer.get_config(), learned_whitening_file)
else:
learned_whitening = None
# Evaluate on test datasets.
for dataset in datasets:
start = time.time()
# Prepare config structure for the test dataset.
cfg = test_dataset.CreateConfigForTestDataset(dataset,
os.path.join(data_root))
images = [cfg['im_fname'](cfg, i) for i in range(cfg['n'])]
qimages = [cfg['qim_fname'](cfg, i) for i in range(cfg['nq'])]
bounding_boxes = [tuple(cfg['gnd'][i]['bbx']) for i in range(cfg['nq'])]
# Extract database and query vectors.
global_features_utils.debug_and_log(
'>> {}: Extracting database images...'.format(dataset))
vecs = global_model.extract_global_descriptors_from_list(
net, images, test_image_size, scales=multiscale)
global_features_utils.debug_and_log(
'>> {}: Extracting query images...'.format(dataset))
qvecs = global_model.extract_global_descriptors_from_list(
net, qimages, test_image_size, bounding_boxes,
scales=multiscale)
global_features_utils.debug_and_log('>> {}: Evaluating...'.format(dataset))
# Convert the obtained descriptors to numpy.
vecs = vecs.numpy()
qvecs = qvecs.numpy()
# Search, rank and print test set metrics.
_calculate_metrics_and_export_to_tensorboard(vecs, qvecs, dataset, cfg,
writer, epoch, whiten=False)
if learned_whitening is not None:
# Whiten the vectors.
mean_vector = learned_whitening['m']
projection_matrix = learned_whitening['P']
vecs_lw = whiten.whitenapply(vecs, mean_vector, projection_matrix)
qvecs_lw = whiten.whitenapply(qvecs, mean_vector, projection_matrix)
# Search, rank, and print.
_calculate_metrics_and_export_to_tensorboard(
vecs_lw, qvecs_lw, dataset, cfg, writer, epoch, whiten=True)
global_features_utils.debug_and_log(
'>> {}: Elapsed time: {}'.format(
dataset, global_features_utils.htime(time.time() - start)))
def _calculate_metrics_and_export_to_tensorboard(vecs, qvecs, dataset, cfg,
writer, epoch, whiten=False):
"""
Calculates metrics and exports them to tensorboard.
Args:
vecs: Numpy array dataset global descriptors.
qvecs: Numpy array query global descriptors.
dataset: String, one of `_TEST_DATASET_NAMES`.
cfg: Dataset configuration.
writer: Tensorboard writer.
epoch: Integer, epoch number.
whiten: Boolean, whether the metrics are computed with whitening applied
as a post-processing step. Affects the name of the exported TensorBoard
metrics.
"""
# Search, rank and print test set metrics.
scores = np.dot(vecs.T, qvecs)
ranks = np.transpose(np.argsort(-scores, axis=0))
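# Descriptors are stored column-wise ([dim, n]) and L2-normalized, so
# vecs.T @ qvecs holds cosine similarities; `ranks` lists, per query, the
# database indices sorted by decreasing similarity.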
metrics = global_features_utils.compute_metrics_and_print(dataset, ranks,
cfg['gnd'])
# Save calculated metrics in a tensorboard format.
if writer:
if whiten:
metric_names = ['test_accuracy_whiten_{}_E'.format(dataset),
'test_accuracy_whiten_{}_M'.format(dataset),
'test_accuracy_whiten_{}_H'.format(dataset)]
else:
metric_names = ['test_accuracy_{}_E'.format(dataset),
'test_accuracy_{}_M'.format(dataset),
'test_accuracy_{}_H'.format(dataset)]
tf.summary.scalar(metric_names[0], metrics[0][0], step=epoch)
tf.summary.scalar(metric_names[1], metrics[1][0], step=epoch)
tf.summary.scalar(metric_names[2], metrics[2][0], step=epoch)
writer.flush()
return None
@@ -21,7 +21,7 @@ from absl import logging
import numpy as np
import tensorflow as tf
from delf.python.datasets.revisited_op import dataset
from delf.python.datasets.revisited_op import dataset as revisited_dataset
class AverageMeter():
@@ -40,7 +40,6 @@ class AverageMeter():
def update(self, val, n=1):
"""Updates values in the AverageMeter.
Args:
val: Float, loss value.
n: Integer, number of instances.
@@ -57,7 +56,6 @@ def compute_metrics_and_print(dataset_name,
desired_pr_ranks=None,
log=True):
"""Computes and logs ground-truth metrics for Revisited datasets.
Args:
dataset_name: String, name of the dataset.
sorted_index_ids: Integer NumPy array of shape [#queries, #index_images].
@@ -71,7 +69,6 @@ def compute_metrics_and_print(dataset_name,
precision@10/recall@10 are desired, this should be set to [1, 10]. The
largest item should be <= #sorted_index_ids. Default: [1, 5, 10].
log: Whether to log results using logging.info().
Returns:
mAP: (metricsE, metricsM, metricsH) Tuple of the metrics for different
levels of complexity. Each metrics is a list containing:
@@ -81,53 +78,53 @@ def compute_metrics_and_print(dataset_name,
(NumPy array of floats, with shape [#queries]), precisions (NumPy array of
floats, with shape [#queries, len(desired_pr_ranks)]), recalls (NumPy
array of floats, with shape [#queries, len(desired_pr_ranks)]).
Raises:
ValueError: If an unknown dataset name is provided as an argument.
"""
if dataset not in dataset.DATASET_NAMES:
if dataset_name not in revisited_dataset.DATASET_NAMES:
raise ValueError('Unknown dataset: {}!'.format(dataset_name))
if desired_pr_ranks is None:
desired_pr_ranks = [1, 5, 10]
(easy_ground_truth, medium_ground_truth,
hard_ground_truth) = dataset.ParseEasyMediumHardGroundTruth(ground_truth)
metrics_easy = dataset.ComputeMetrics(sorted_index_ids, easy_ground_truth,
desired_pr_ranks)
metrics_medium = dataset.ComputeMetrics(sorted_index_ids, medium_ground_truth,
desired_pr_ranks)
metrics_hard = dataset.ComputeMetrics(sorted_index_ids, hard_ground_truth,
desired_pr_ranks)
hard_ground_truth) = revisited_dataset.ParseEasyMediumHardGroundTruth(
ground_truth)
metrics_easy = revisited_dataset.ComputeMetrics(sorted_index_ids,
easy_ground_truth,
desired_pr_ranks)
metrics_medium = revisited_dataset.ComputeMetrics(sorted_index_ids,
medium_ground_truth,
desired_pr_ranks)
metrics_hard = revisited_dataset.ComputeMetrics(sorted_index_ids,
hard_ground_truth,
desired_pr_ranks)
debug_and_log(
'>> {}: mAP E: {}, M: {}, H: {}'.format(
dataset_name, np.around(metrics_easy[0] * 100, decimals=2),
np.around(metrics_medium[0] * 100, decimals=2),
np.around(metrics_hard[0] * 100, decimals=2)),
log=log)
'>> {}: mAP E: {}, M: {}, H: {}'.format(
dataset_name, np.around(metrics_easy[0] * 100, decimals=2),
np.around(metrics_medium[0] * 100, decimals=2),
np.around(metrics_hard[0] * 100, decimals=2)),
log=log)
debug_and_log(
'>> {}: mP@k{} E: {}, M: {}, H: {}'.format(
dataset_name, desired_pr_ranks,
np.around(metrics_easy[1] * 100, decimals=2),
np.around(metrics_medium[1] * 100, decimals=2),
np.around(metrics_hard[1] * 100, decimals=2)),
log=log)
'>> {}: mP@k{} E: {}, M: {}, H: {}'.format(
dataset_name, desired_pr_ranks,
np.around(metrics_easy[1] * 100, decimals=2),
np.around(metrics_medium[1] * 100, decimals=2),
np.around(metrics_hard[1] * 100, decimals=2)),
log=log)
return metrics_easy, metrics_medium, metrics_hard
def htime(time_difference):
"""Time formatting function.
Depending on the value of `time_difference` outputs time in an appropriate
time format.
Args:
time_difference: Float, time difference between the two events.
Returns:
time: String representing time in an appropriate time format.
"""
@@ -149,7 +146,6 @@ def htime(time_difference):
def debug_and_log(msg, debug=True, log=True, debug_on_the_same_line=False):
"""Outputs `msg` to both stdout (if in the debug mode) and the log file.
Args:
msg: String, message to be logged.
debug: Bool, if True, will print `msg` to stdout.
@@ -168,14 +164,13 @@ def debug_and_log(msg, debug=True, log=True, debug_on_the_same_line=False):
def get_standard_keras_models():
"""Gets the standard keras model names.
Returns:
model_names: List, names of the standard keras models.
"""
model_names = sorted(
name for name in tf.keras.applications.__dict__
if not name.startswith('__') and
callable(tf.keras.applications.__dict__[name]))
name for name in tf.keras.applications.__dict__
if not name.startswith('__') and
callable(tf.keras.applications.__dict__[name]))
return model_names
@@ -184,9 +179,7 @@ def create_model_directory(training_dataset, arch, pool, whitening, pretrained,
neg_num, query_size, pool_size, batch_size,
update_every, image_size, directory):
"""Based on the model parameters, creates the model directory.
If the model directory does not exist, the directory is created.
Args:
training_dataset: String, training dataset name.
arch: String, model architecture.
@@ -206,7 +199,6 @@ def create_model_directory(training_dataset, arch, pool, whitening, pretrained,
update_every: Integer, frequency of the model weights update.
image_size: Integer, maximum size of longer image side used for training.
directory: String, destination where trained network should be saved.
Returns:
folder: String, path to the model folder.
"""
@@ -223,7 +215,7 @@ def create_model_directory(training_dataset, arch, pool, whitening, pretrained,
folder = os.path.join(directory, folder)
debug_and_log(
'>> Creating directory if does not exist:\n>> \'{}\''.format(folder))
'>> Creating directory if does not exist:\n>> \'{}\''.format(folder))
if not os.path.exists(folder):
os.makedirs(folder)
return folder
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN Image Retrieval model implementation based on the following papers:
[1] Fine-tuning CNN Image Retrieval with No Human Annotation,
Radenović F., Tolias G., Chum O., TPAMI 2018 [arXiv]
https://arxiv.org/abs/1711.02512
[2] CNN Image Retrieval Learns from BoW: Unsupervised Fine-Tuning with Hard
Examples, Radenović F., Tolias G., Chum O., ECCV 2016 [arXiv]
https://arxiv.org/abs/1604.02426
"""
import os
import pickle
import tensorflow as tf
from delf.python.datasets import generic_dataset
from delf.python.normalization_layers import normalization
from delf.python.pooling_layers import pooling as pooling_layers
from delf.python.training import global_features_utils
# Pre-computed global whitening, for most commonly used architectures.
# Using pre-computed whitening improves the speed of the convergence and the
# performance.
_WHITENING_CONFIG = {
'ResNet50': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_ResNet50_gem_learned_whitening_config.pkl',
'ResNet101': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_ResNet101_gem_learned_whitening_config.pkl',
'ResNet152': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_ResNet152_gem_learned_whitening_config.pkl',
'VGG19': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_VGG19_gem_learned_whitening_config.pkl'
}
# Possible global pooling layers.
_POOLING = {
'mac': pooling_layers.MAC,
'spoc': pooling_layers.SPoC,
'gem': pooling_layers.GeM
}
# Output dimensionality for supported architectures.
_OUTPUT_DIM = {
'VGG16': 512,
'VGG19': 512,
'ResNet50': 2048,
'ResNet101': 2048,
'ResNet101V2': 2048,
'ResNet152': 2048,
'DenseNet121': 1024,
'DenseNet169': 1664,
'DenseNet201': 1920,
'EfficientNetB5': 2048,
'EfficientNetB7': 2560
}
class GlobalFeatureNet(tf.keras.Model):
"""Instantiates global model for image retrieval.
This class implements the [GlobalFeatureNet](
https://arxiv.org/abs/1711.02512) for image retrieval. The model uses a
user-defined model as a backbone.
"""
def __init__(self, architecture='ResNet101', pooling='gem',
whitening=False, pretrained=True, data_root=''):
"""GlobalFeatureNet network initialization.
Args:
architecture: Network backbone.
pooling: Pooling method used 'mac'/'spoc'/'gem'.
whitening: Bool, whether to use whitening.
pretrained: Bool, whether to initialize the network with the weights
pretrained on ImageNet.
data_root: String, path to the data folder where the precomputed
whitening is/will be saved in case `whitening` is True.
Raises:
ValueError: If `architecture` is not supported.
"""
if architecture not in _OUTPUT_DIM.keys():
raise ValueError("Architecture {} is not supported.".format(architecture))
super(GlobalFeatureNet, self).__init__()
# Get standard output dimensionality size.
dim = _OUTPUT_DIM[architecture]
if pretrained:
# Initialize with network pretrained on imagenet.
net_in = getattr(tf.keras.applications, architecture)(include_top=False,
weights="imagenet")
else:
# Initialize with random weights.
net_in = getattr(tf.keras.applications, architecture)(include_top=False,
weights=None)
# Initialize `feature_extractor`. Take only convolutions for
# `feature_extractor`, always end with ReLU to make last activations
# non-negative.
if architecture.lower().startswith('densenet'):
tmp_model = tf.keras.Sequential()
tmp_model.add(net_in)
net_in = tmp_model
net_in.add(tf.keras.layers.ReLU())
# Initialize pooling.
self.pool = _POOLING[pooling]()
# Initialize whitening.
if whitening:
if pretrained and architecture in _WHITENING_CONFIG:
# If precomputed whitening for the architecture exists,
# the fully-connected layer is going to be initialized according to
# the precomputed layer configuration.
global_features_utils.debug_and_log(
">> {}: for '{}' custom computed whitening '{}' is used."
.format(os.getcwd(), architecture,
os.path.basename(_WHITENING_CONFIG[architecture])))
# The layer configuration is downloaded to the `data_root` folder.
whiten_dir = os.path.join(data_root, architecture)
path = tf.keras.utils.get_file(fname=whiten_dir,
origin=_WHITENING_CONFIG[architecture])
# Whitening configuration is loaded.
with tf.io.gfile.GFile(path, 'rb') as learned_whitening_file:
whitening_config = pickle.load(learned_whitening_file)
# Whitening layer is initialized according to the configuration.
self.whiten = tf.keras.layers.Dense.from_config(whitening_config)
else:
# If no precomputed whitening exists for the chosen architecture, the
# fully-connected whitening layer is initialized with random weights.
self.whiten = tf.keras.layers.Dense(dim, activation=None, use_bias=True)
global_features_utils.debug_and_log(
">> There is either no whitening computed for the used network "
"architecture or `pretrained` is False; random weights are used.")
else:
self.whiten = None
# Create meta information to be stored in the network.
self.meta = {
'architecture': architecture,
'pooling': pooling,
'whitening': whitening,
'outputdim': dim
}
self.feature_extractor = net_in
self.normalize = normalization.L2Normalization()
def call(self, x, training=False):
"""Invokes the GlobalFeatureNet instance.
Args:
x: [B, H, W, C] Tensor with a batch of images.
training: Indicator of whether the forward pass is running in training
mode or not.
Returns:
out: [B, out_dim] Global descriptor.
"""
# Forward pass through the fully-convolutional backbone.
o = self.feature_extractor(x, training)
# Pooling.
o = self.pool(o)
# Normalization.
o = self.normalize(o)
# If whitening exists: the pooled global descriptor is whitened and
# re-normalized.
if self.whiten is not None:
o = self.whiten(o)
o = self.normalize(o)
return o
def meta_repr(self):
"""Provides high-level information about the network.
Returns:
meta: String with the information about the network (used
architecture, pooling type, whitening, outputdim).
"""
tmpstr = '(meta):\n'
tmpstr += '\tarchitecture: {}\n'.format(self.meta['architecture'])
tmpstr += '\tpooling: {}\n'.format(self.meta['pooling'])
tmpstr += '\twhitening: {}\n'.format(self.meta['whitening'])
tmpstr += '\toutputdim: {}\n'.format(self.meta['outputdim'])
return tmpstr
def extract_global_descriptors_from_list(net, images, image_size,
bounding_boxes=None, scales=[1.],
multi_scale_power=1., print_freq=10):
"""Extracting global descriptors from a list of images.
Args:
net: Model object, network for the forward pass.
images: Absolute image paths as strings.
image_size: Integer, defines the maximum size of longer image side.
bounding_boxes: List of (x1,y1,x2,y2) tuples to crop the query images.
scales: List of float scales.
multi_scale_power: Float, multi-scale normalization power parameter.
print_freq: Printing frequency for debugging.
Returns:
descriptors: Global descriptors for the input images.
"""
# Creating dataset loader.
data = generic_dataset.ImagesFromList(root='', image_paths=images,
imsize=image_size,
bounding_boxes=bounding_boxes)
def _data_gen():
return (inst for inst in data)
loader = tf.data.Dataset.from_generator(_data_gen, output_types=(tf.float32))
loader = loader.batch(1)
# Extracting vectors.
descriptors = tf.zeros((0, net.meta['outputdim']))
for i, input in enumerate(loader):
if len(scales) == 1 and scales[0] == 1:
descriptors = tf.concat([descriptors, net(input)], 0)
else:
descriptors = tf.concat(
[descriptors, extract_multi_scale_descriptor(
net, input, scales, multi_scale_power)], 0)
if (i + 1) % print_freq == 0 or (i + 1) == len(images):
global_features_utils.debug_and_log(
'\r>>>> {}/{} done...'.format((i + 1), len(images)),
debug_on_the_same_line=True)
global_features_utils.debug_and_log('', log=False)
descriptors = tf.transpose(descriptors, perm=[1, 0])
return descriptors
def extract_multi_scale_descriptor(net, input, scales, multi_scale_power):
"""Extracts the global descriptor multi scale.
Args:
net: Model object, network for the forward pass.
input: [B, H, W, C] input tensor in channel-last (BHWC) configuration.
scales: List of float scales.
multi_scale_power: Float, multi-scale normalization power parameter.
Returns:
descriptors: Multi-scale global descriptors for the input images.
"""
descriptors = tf.zeros(net.meta['outputdim'])
for s in scales:
if s == 1:
input_t = input
else:
output_shape = s * tf.shape(input)[1:3].numpy()
input_t = tf.image.resize(input, output_shape,
method='bilinear',
preserve_aspect_ratio=True)
descriptors += tf.pow(net(input_t), multi_scale_power)
descriptors /= len(scales)
descriptors = tf.pow(descriptors, 1. / multi_scale_power)
descriptors /= tf.norm(descriptors)
return descriptors
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the GlobalFeatureNet backbone."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
import numpy as np
from PIL import Image
import tensorflow as tf
from delf.python.training.model import global_model
FLAGS = flags.FLAGS
class GlobalFeatureNetTest(tf.test.TestCase):
"""Tests for the GlobalFeatureNet backbone."""
def testInitModel(self):
"""Testing GlobalFeatureNet initialization."""
# Testing GlobalFeatureNet initialization.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
expected_meta = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'outputdim': 2048}
self.assertEqual(expected_meta, model.meta)
def testExtractVectors(self):
"""Tests extraction of global descriptors from list."""
# Initializing network for testing.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
# Number of images to be created.
n = 2
image_paths = []
# Create `n` dummy images.
for i in range(n):
dummy_image = np.random.rand(1024, 750, 3) * 255
img_out = Image.fromarray(dummy_image.astype('uint8')).convert('RGB')
filename = os.path.join(FLAGS.test_tmpdir, 'test_image_{}.jpg'.format(i))
img_out.save(filename)
image_paths.append(filename)
descriptors = global_model.extract_global_descriptors_from_list(
model, image_paths, image_size=1024, bounding_boxes=None,
scales=[1., 3.], multi_scale_power=2, print_freq=1)
self.assertAllEqual([2048, 2], tf.shape(descriptors))
def testExtractMultiScale(self):
"""Tests multi-scale global descriptor extraction."""
# Initializing network for testing.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
input = tf.random.uniform([2, 1024, 750, 3], dtype=tf.float32, seed=0)
descriptors = global_model.extract_multi_scale_descriptor(
model, input, scales=[1., 3.], multi_scale_power=2)
self.assertAllEqual([2, 2048], tf.shape(descriptors))
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for tensorboard."""
from tensorboard import program
from delf.python.training import global_features_utils
def launch_tensorboard(log_dir):
"""Runs tensorboard with the given `log_dir`.
Args:
log_dir: String, directory to launch tensorboard in.
"""
tensorboard = program.TensorBoard()
tensorboard.configure(argv=[None, '--logdir', log_dir])
url = tensorboard.launch()
global_features_utils.debug_and_log("Launching Tensorboard: {}".format(url))
......@@ -75,7 +75,7 @@ documentation of the Object Detection API:
### DeepMAC architecture
We have released our new architecture, **DeepMAC**, desgined for partially
We have released our new architecture, **DeepMAC**, designed for partially
supervised instance segmentation. DeepMAC stands for Deep Mask-heads
Above CenterNet, and is based on our CenterNet implementation. In our
[paper](https://arxiv.org/abs/2104.00613) we show that DeepMAC achieves
......
......@@ -2003,8 +2003,20 @@ def _resize_masks(masks, height, width, method):
class CenterNetMaskTargetAssigner(object):
"""Wrapper to compute targets for segmentation masks."""
def __init__(self, stride):
def __init__(self, stride, boxes_scale=1.0):
"""Constructor.
Args:
stride: The stride of the network. Targets are assigned at the output
stride.
boxes_scale: Scale to apply to boxes before producing mask weights. This
is meant to ensure the full object region is properly weighted prior to
applying loss. A value of ~1.05 is typically applied when object regions
should be blacked out (perhaps because valid groundtruth masks are not
present).
"""
self._stride = stride
self._boxes_scale = boxes_scale
def assign_segmentation_targets(
self, gt_masks_list, gt_classes_list, gt_boxes_list=None,
......@@ -2072,7 +2084,7 @@ class CenterNetMaskTargetAssigner(object):
segmentation_weight_for_image = (
ta_utils.blackout_pixel_weights_by_box_regions(
output_height, output_width, boxes_absolute.get(), blackout,
weights=gt_mask_weights))
weights=gt_mask_weights, boxes_scale=self._boxes_scale))
segmentation_weights_list.append(segmentation_weight_for_image)
else:
segmentation_weights_list.append(tf.ones((output_height, output_width),
......
FROM tensorflow/tensorflow:latest-gpu
ARG DEBIAN_FRONTEND=noninteractive
# Install apt dependencies
RUN apt-get update && apt-get install -y \
git \
gpg-agent \
python3-cairocffi \
protobuf-compiler \
python3-pil \
python3-lxml \
python3-tk \
python3-opencv \
wget
# Install the Google Cloud SDK; this is mostly for using gsutil to export models.
RUN wget -nv \
https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz && \
mkdir /root/tools && \
tar xvzf google-cloud-sdk.tar.gz -C /root/tools && \
rm google-cloud-sdk.tar.gz && \
/root/tools/google-cloud-sdk/install.sh --usage-reporting=false \
--path-update=false --bash-completion=false \
--disable-installation-options && \
rm -rf /root/.config/* && \
ln -s /root/.config /config && \
rm -rf /root/tools/google-cloud-sdk/.install/.backup
# Path configuration
ENV PATH $PATH:/root/tools/google-cloud-sdk/bin
# Make sure gsutil will use the default service account
RUN echo '[GoogleCompute]\nservice_account = default' > /etc/boto.cfg
WORKDIR /home/tensorflow
## Copy this code (make sure you are under the ../models/research directory)
COPY . /home/tensorflow/models
# Compile protobuf configs
RUN (cd /home/tensorflow/models/ && protoc object_detection/protos/*.proto --python_out=.)
WORKDIR /home/tensorflow/models/
RUN cp object_detection/packages/tf2/setup.py ./
ENV PATH="/home/tensorflow/.local/bin:${PATH}"
RUN python -m pip install -U pip
RUN python -m pip install .
ENTRYPOINT ["python", "object_detection/model_main_tf2.py"]
......@@ -24,22 +24,23 @@ A skeleton configuration file is shown below:
```
model {
(... Add model config here...)
(... Add model config here...)
}
train_config : {
(... Add train_config here...)
(... Add train_config here...)
}
train_input_reader: {
(... Add train_input configuration here...)
(... Add train_input configuration here...)
}
eval_config: {
(... Add eval_config here...)
}
eval_input_reader: {
(... Add eval_input configuration here...)
(... Add eval_input configuration here...)
}
```
......@@ -58,6 +59,106 @@ configuration files can be pasted into the `model` field of the skeleton
configuration. Users should note that the `num_classes` field should be changed
to a value suited for the dataset the user is training on.
### Anchor box parameters
Many object detection models use an anchor generator as a region-sampling
strategy, which generates a large number of anchor boxes in a range of shapes
and sizes, in many locations of the image. The detection algorithm then
incrementally offsets the anchor box closest to the ground truth until it
(closely) matches. You can specify the variety and position of these anchor
boxes in the `anchor_generator` config.
Usually, the anchor configs provided with pre-trained checkpoints are
designed for large/versatile datasets (COCO, ImageNet), in which the goal is to
improve accuracy for a wide range of object sizes and positions. But in most
real-world applications, objects are confined to a limited number of sizes. So
adjusting the anchors to be specific to your dataset and environment
can both improve model accuracy and reduce training time.
The format for these anchor box parameters differs depending on your model
architecture. For details about all fields, see the [`anchor_generator`
definition](https://github.com/tensorflow/models/blob/master/research/object_detection/protos/anchor_generator.proto).
On this page, we'll focus on parameters
used in a traditional single shot detector (SSD) model and SSD models with a
feature pyramid network (FPN) head.
Regardless of the model architecture, you'll need to understand the following
anchor box concepts:
+ **Scale**: This defines the variety of anchor box sizes. Each box size is
defined as a proportion of the original image size (for SSD models) or as a
factor of the filter's stride length (for FPN). The number of different sizes
is defined using a range of "scales" (relative to image size) or "levels" (the
level on the feature pyramid). For example, to detect small objects with the
configurations below, the `min_scale` and `min_level` are set to a small
value, while `max_scale` and `max_level` specify the largest objects to
detect.
+   **Aspect ratio**: This is the width/height ratio for the anchor boxes. For
example, an `aspect_ratios` value of `1.0` creates a square, and `2.0` creates
a 1:2 rectangle (landscape orientation). You can define as many aspect ratios
as you want, and each one is repeated at all anchor box scales.
Beware that increasing the total number of anchor boxes will exponentially
increase computation costs, whereas generating fewer anchors that have a higher
chance of overlapping with the ground truth will both improve accuracy and
reduce computation costs.
**Single Shot Detector (SSD) full model:**
Setting `num_layers` to 6 means the model generates each box aspect at 6
different sizes. The exact sizes are not specified, but they're evenly spaced
between the `min_scale` and `max_scale` values, which specify that the smallest
box size is 20% of the input image size and the largest is 95% of that size.
```
model {
ssd {
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
}
}
}
}
```
For more details, see [`ssd_anchor_generator.proto`](https://github.com/tensorflow/models/blob/master/research/object_detection/protos/ssd_anchor_generator.proto).
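To make the "evenly spaced" sizes above concrete, here is a minimal Python
sketch (an illustration only, not the anchor generator's actual code) of how
six scales between `min_scale` and `max_scale` could be derived:

```python
# Illustrative only: linearly interpolate num_layers scales between min_scale
# and max_scale (the real generator may add extra rules, such as a reduced set
# of boxes on the first layer).
def ssd_layer_scales(min_scale=0.2, max_scale=0.95, num_layers=6):
  return [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
          for i in range(num_layers)]

print(ssd_layer_scales())  # [0.2, 0.35, 0.5, 0.65, 0.8, 0.95]
```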
**SSD with Feature Pyramid Network (FPN) head:**
When using an FPN head, you must specify the anchor box size relative to the
convolutional filter's stride length at a given pyramid level, using
`anchor_scale`. So in this example, the box size is 4.0 multiplied by the
layer's stride length. The number of sizes you get for each aspect simply
depends on how many levels there are between the `min_level` and `max_level`.
```
model {
ssd {
anchor_generator {
multiscale_anchor_generator {
anchor_scale: 4.0
min_level: 3
max_level: 7
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
}
}
}
}
```
For more details, see [`multiscale_anchor_generator.proto`](https://github.com/tensorflow/models/blob/master/research/object_detection/protos/multiscale_anchor_generator.proto).
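As an illustration of the sizes this produces (assuming the common convention
that the stride at pyramid level `l` is `2^l` pixels), the base anchor size at
each level is `anchor_scale` times that stride:

```python
# Hypothetical sketch: base anchor size per FPN level, assuming stride = 2**level.
anchor_scale = 4.0
for level in range(3, 8):  # min_level=3 .. max_level=7
  stride = 2 ** level
  print(level, anchor_scale * stride)  # level 3 -> 32.0, ..., level 7 -> 512.0
```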
## Defining Inputs
The TensorFlow Object Detection API accepts inputs in the TFRecord file format.
......@@ -66,20 +167,21 @@ Additionally, users should also specify a label map, which defines the mapping
between a class id and class name. The label map should be identical between
training and evaluation datasets.
An example input configuration looks as follows:
An example training input configuration looks as follows:
```
tf_record_input_reader {
input_path: "/usr/home/username/data/train.record"
train_input_reader: {
tf_record_input_reader {
input_path: "/usr/home/username/data/train.record-?????-of-00010"
}
label_map_path: "/usr/home/username/data/label_map.pbtxt"
}
label_map_path: "/usr/home/username/data/label_map.pbtxt"
```
Users should substitute the `input_path` and `label_map_path` arguments and
insert the input configuration into the `train_input_reader` and
`eval_input_reader` fields in the skeleton configuration. Note that the paths
can also point to Google Cloud Storage buckets (ie.
"gs://project_bucket/train.record") for use on Google Cloud.
The `eval_input_reader` follows the same format. Users should substitute the
`input_path` and `label_map_path` arguments. Note that the paths can also point
to Google Cloud Storage buckets (i.e. "gs://project_bucket/train.record") to
pull datasets hosted on Google Cloud.
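For reference, the file referenced by `label_map_path` is a text protobuf that
maps integer class ids (starting at 1) to class names. A minimal example, with
placeholder class names, looks as follows:

```
item {
  id: 1
  name: 'cat'
}
item {
  id: 2
  name: 'dog'
}
```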
## Configuring the Trainer
......@@ -92,36 +194,38 @@ The `train_config` defines parts of the training process:
A sample `train_config` is below:
```
batch_size: 1
optimizer {
momentum_optimizer: {
learning_rate: {
manual_step_learning_rate {
initial_learning_rate: 0.0002
schedule {
step: 0
learning_rate: .0002
}
schedule {
step: 900000
learning_rate: .00002
}
schedule {
step: 1200000
learning_rate: .000002
train_config: {
batch_size: 1
optimizer {
momentum_optimizer: {
learning_rate: {
manual_step_learning_rate {
initial_learning_rate: 0.0002
schedule {
step: 0
learning_rate: .0002
}
schedule {
step: 900000
learning_rate: .00002
}
schedule {
step: 1200000
learning_rate: .000002
}
}
}
momentum_optimizer_value: 0.9
}
momentum_optimizer_value: 0.9
use_moving_average: false
}
use_moving_average: false
}
fine_tune_checkpoint: "/usr/home/username/tmp/model.ckpt-#####"
from_detection_checkpoint: true
load_all_detection_checkpoint_vars: true
gradient_clipping_by_norm: 10.0
data_augmentation_options {
random_horizontal_flip {
fine_tune_checkpoint: "/usr/home/username/tmp/model.ckpt-#####"
from_detection_checkpoint: true
load_all_detection_checkpoint_vars: true
gradient_clipping_by_norm: 10.0
data_augmentation_options {
random_horizontal_flip {
}
}
}
```
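The `manual_step_learning_rate` schedule above keeps the learning rate
piecewise constant and drops it at each `schedule` boundary; a small,
illustrative Python sketch of the resulting rate per global step:

```python
# Illustrative only: piecewise-constant learning rate following the schedule above.
def learning_rate_at(step):
  schedule = [(0, 2e-4), (900000, 2e-5), (1200000, 2e-6)]
  lr = schedule[0][1]
  for boundary, value in schedule:
    if step >= boundary:
      lr = value
  return lr

print(learning_rate_at(100))      # 0.0002
print(learning_rate_at(1000000))  # 2e-05
```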
......
......@@ -187,21 +187,28 @@ evaluation jobs for a few iterations [locally on their own machines](#local).
### Training with multiple GPUs
A user can start a training job on Cloud AI Platform using the following
command:
A user can start a training job on Cloud AI Platform following the instructions
at https://cloud.google.com/ai-platform/training/docs/custom-containers-training.
```bash
git clone https://github.com/tensorflow/models.git
# From the tensorflow/models/research/ directory
cp object_detection/packages/tf2/setup.py .
cp object_detection/dockerfiles/tf2_ai_platform/Dockerfile .
docker build -t gcr.io/${DOCKER_IMAGE_URI} .
docker push gcr.io/${DOCKER_IMAGE_URI}
```
```bash
gcloud ai-platform jobs submit training object_detection_`date +%m_%d_%Y_%H_%M_%S` \
--runtime-version 2.1 \
--python-version 3.6 \
--job-dir=gs://${MODEL_DIR} \
--package-path ./object_detection \
--module-name object_detection.model_main_tf2 \
--region us-central1 \
--master-machine-type n1-highcpu-16 \
--master-accelerator count=8,type=nvidia-tesla-v100 \
--master-image-uri gcr.io/${DOCKER_IMAGE_URI} \
--scale-tier CUSTOM \
-- \
--model_dir=gs://${MODEL_DIR} \
--pipeline_config_path=gs://${PIPELINE_CONFIG_PATH}
......@@ -210,15 +217,16 @@ gcloud ai-platform jobs submit training object_detection_`date +%m_%d_%Y_%H_%M_%
Where `gs://${MODEL_DIR}` specifies the directory on Google Cloud Storage where
the training checkpoints and events will be written to and
`gs://${PIPELINE_CONFIG_PATH}` points to the pipeline configuration stored on
Google Cloud Storage.
Google Cloud Storage, and `gcr.io/${DOCKER_IMAGE_URI}` points to the docker
image stored in Google Container Registry.
Users can monitor the progress of their training job on the
[ML Engine Dashboard](https://console.cloud.google.com/ai-platform/jobs).
### Training with TPU
Launching a training job with a TPU compatible pipeline config requires using a
similar command:
Launching a training job with a TPU compatible pipeline config requires using
the following command:
```bash
# From the tensorflow/models/research/ directory
......@@ -246,16 +254,11 @@ Evaluation jobs run on a single machine. Run the following command to start the
evaluation job:
```bash
# From the tensorflow/models/research/ directory
cp object_detection/packages/tf2/setup.py .
gcloud ai-platform jobs submit training object_detection_eval_`date +%m_%d_%Y_%H_%M_%S` \
--runtime-version 2.1 \
--python-version 3.6 \
--job-dir=gs://${MODEL_DIR} \
--package-path ./object_detection \
--module-name object_detection.model_main_tf2 \
--region us-central1 \
--scale-tier BASIC_GPU \
--master-image-uri gcr.io/${DOCKER_IMAGE_URI} \
-- \
--model_dir=gs://${MODEL_DIR} \
--pipeline_config_path=gs://${PIPELINE_CONFIG_PATH} \
......@@ -264,8 +267,9 @@ gcloud ai-platform jobs submit training object_detection_eval_`date +%m_%d_%Y_%H
where `gs://${MODEL_DIR}` points to the directory on Google Cloud Storage where
training checkpoints are saved and `gs://{PIPELINE_CONFIG_PATH}` points to where
the model configuration file stored on Google Cloud Storage. Evaluation events
are written to `gs://${MODEL_DIR}/eval`
the model configuration file is stored on Google Cloud Storage, and
`gcr.io/${DOCKER_IMAGE_URI}` points to the docker image stored in Google
Container Registry. Evaluation events are written to `gs://${MODEL_DIR}/eval`.
Typically one starts an evaluation job concurrently with the training job. Note
that we do not support running evaluation on TPU.
......
......@@ -122,7 +122,8 @@ def _extract_predictions_and_losses(model,
[input_dict[fields.InputDataFields.groundtruth_boxes]],
[tf.one_hot(input_dict[fields.InputDataFields.groundtruth_classes]
- label_id_offset, depth=model.num_classes)],
groundtruth_masks_list, groundtruth_keypoints_list)
groundtruth_masks_list=groundtruth_masks_list,
groundtruth_keypoints_list=groundtruth_keypoints_list)
losses_dict.update(model.loss(prediction_dict, true_image_shapes))
result_dict = eval_util.result_dict_for_single_example(
......
......@@ -598,7 +598,7 @@ def prediction_tensors_to_single_instance_kpts(
keypoint type, as it's possible to filter some candidates due to the score
threshold.
"""
batch_size, height, width, num_keypoints = _get_shape(
batch_size, _, _, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 4)
# Get x, y and channel indices corresponding to the top indices in the
# keypoint heatmap predictions.
......@@ -612,24 +612,32 @@ def prediction_tensors_to_single_instance_kpts(
_multi_range(batch_size, value_repetitions=num_keypoints),
tf.reshape(y_indices, [-1]),
tf.reshape(x_indices, [-1]),
tf.reshape(channel_indices, [-1])
], axis=1)
# Reshape the offsets predictions to shape:
# [batch_size, height, width, num_keypoints, 2]
keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, -1])
# shape: [num_keypoints, 2]
# shape: [num_keypoints, num_keypoints * 2]
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
# shape: [num_keypoints, num_keypoints, 2].
selected_offsets_flat = tf.reshape(
selected_offsets_flat, [num_keypoints, num_keypoints, -1])
# shape: [num_keypoints].
channel_indices = tf.keras.backend.flatten(channel_indices)
# shape: [num_keypoints, 2].
retrieve_indices = tf.stack([channel_indices, channel_indices], axis=1)
# shape: [num_keypoints, 2]
selected_offsets = tf.gather_nd(selected_offsets_flat, retrieve_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets, axis=1)
keypoint_candidates = tf.stack([
tf.cast(y_indices, dtype=tf.float32) + tf.expand_dims(y_offsets, axis=0),
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2)
keypoint_candidates = tf.expand_dims(keypoint_candidates, axis=0)
# Append the channel indices back to retrieve the keypoint scores from the
# heatmap.
combined_indices = tf.concat(
[combined_indices, tf.expand_dims(channel_indices, axis=-1)], axis=1)
if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd(
keypoint_heatmap_predictions, combined_indices)
......@@ -1794,6 +1802,31 @@ def predicted_embeddings_at_object_centers(embedding_predictions,
return embeddings
def mask_from_true_image_shape(data_shape, true_image_shapes):
"""Get a binary mask based on the true_image_shape.
Args:
data_shape: a possibly static (4,) tensor for the shape of the feature
map.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
the form [height, width, channels] indicating the shapes of true
images in the resized images, as resized images can be padded with
zeros.
Returns:
    a [batch, data_height, data_width, 1] tensor that is 1.0 at pixels inside
    the true image height and width, and 0.0 in the padded regions.
"""
mask_h = tf.cast(
tf.range(data_shape[1]) < true_image_shapes[:, tf.newaxis, 0],
tf.float32)
mask_w = tf.cast(
tf.range(data_shape[2]) < true_image_shapes[:, tf.newaxis, 1],
tf.float32)
mask = tf.expand_dims(
mask_h[:, :, tf.newaxis] * mask_w[:, tf.newaxis, :], 3)
return mask
class ObjectDetectionParams(
collections.namedtuple('ObjectDetectionParams', [
'localization_loss', 'scale_loss_weight', 'offset_loss_weight',
......@@ -2422,6 +2455,24 @@ class CenterNetMetaArch(model.DetectionModel):
super(CenterNetMetaArch, self).__init__(num_classes)
def set_trainability_by_layer_traversal(self, trainable):
"""Sets trainability layer by layer.
    The commonly-seen `model.trainable = False` approach does not traverse the
    child layers. For example, if the parent is not trainable, we won't be able
    to set individual layers as trainable/non-trainable differently.
    Args:
      trainable: (bool) Trainability to set, layer by layer, on every layer of
        the model except the parent itself.
"""
for layer in self._flatten_layers(include_self=False):
layer.trainable = trainable
@property
def prediction_head_dict(self):
return self._prediction_head_dict
@property
def batched_prediction_tensor_names(self):
if not self._batched_prediction_tensor_names:
......@@ -2647,7 +2698,7 @@ class CenterNetMetaArch(model.DetectionModel):
per_keypoint_depth=kp_params.per_keypoint_depth))
if self._mask_params is not None:
target_assigners[SEGMENTATION_TASK] = (
cn_assigner.CenterNetMaskTargetAssigner(stride))
cn_assigner.CenterNetMaskTargetAssigner(stride, boxes_scale=1.05))
if self._densepose_params is not None:
dp_stride = 1 if self._densepose_params.upsample_to_input_res else stride
target_assigners[DENSEPOSE_TASK] = (
......@@ -3690,6 +3741,12 @@ class CenterNetMetaArch(model.DetectionModel):
max_detections, reid_embed_size] containing object embeddings.
"""
object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
# Mask object centers by true_image_shape. [batch, h, w, 1]
object_center_mask = mask_from_true_image_shape(
_get_shape(object_center_prob, 4), true_image_shapes)
object_center_prob *= object_center_mask
# Get x, y and channel indices corresponding to the top indices in the class
# center predictions.
detection_scores, y_indices, x_indices, channel_indices = (
......@@ -3751,7 +3808,7 @@ class CenterNetMetaArch(model.DetectionModel):
])
keypoints, keypoint_scores = self._postprocess_keypoints_multi_class(
prediction_dict, channel_indices, y_indices, x_indices,
None, num_detections)
boxes_strided, num_detections)
keypoints, keypoint_scores = (
convert_strided_predictions_to_normalized_keypoints(
keypoints, keypoint_scores, self._stride, true_image_shapes,
......
......@@ -2518,6 +2518,75 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllClose(detections['detection_keypoint_scores'][0, 0],
np.array([0.9, 0.9, 0.9, 0.1]))
def test_mask_object_center_in_postprocess_by_true_image_shape(self):
"""Test the postprocess function is masked by true_image_shape."""
model = build_center_net_meta_arch(num_classes=1)
max_detection = model._center_params.max_box_predictions
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 1), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32)
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(1)
class_probs[0] = _logit(0.75)
class_center[0, 16, 16] = class_probs
height_width[0, 16, 16] = [5, 10]
offset[0, 16, 16] = [.25, .5]
keypoint_regression[0, 16, 16] = [
-1., -1.,
-1., 1.,
1., -1.,
1., 1.]
keypoint_heatmaps[0, 14, 14, 0] = _logit(0.9)
keypoint_heatmaps[0, 14, 18, 1] = _logit(0.9)
keypoint_heatmaps[0, 18, 14, 2] = _logit(0.9)
keypoint_heatmaps[0, 18, 18, 3] = _logit(0.05) # Note the low score.
class_center = tf.constant(class_center)
height_width = tf.constant(height_width)
offset = tf.constant(offset)
keypoint_heatmaps = tf.constant(keypoint_heatmaps, dtype=tf.float32)
keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
print(class_center)
prediction_dict = {
cnma.OBJECT_CENTER: [class_center],
cnma.BOX_SCALE: [height_width],
cnma.BOX_OFFSET: [offset],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP):
[keypoint_heatmaps],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET):
[keypoint_offsets],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION):
[keypoint_regression],
}
def graph_fn():
detections = model.postprocess(prediction_dict,
tf.constant([[1, 1, 3]]))
return detections
detections = self.execute_cpu(graph_fn, [])
self.assertAllClose(detections['detection_boxes'][0, 0],
np.array([0, 0, 0, 0]))
# The class_center logits are initialized as 0's so it's filled with 0.5s.
# Despite that, we should only find one box.
self.assertAllClose(detections['detection_scores'][0],
[0.5, 0., 0., 0., 0.])
self.assertEqual(np.sum(detections['detection_classes']), 0)
self.assertEqual(detections['num_detections'], [1])
self.assertAllEqual([1, max_detection, num_keypoints, 2],
detections['detection_keypoints'].shape)
self.assertAllEqual([1, max_detection, num_keypoints],
detections['detection_keypoint_scores'].shape)
def test_get_instance_indices(self):
classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
num_detections = tf.constant([1, 3], dtype=tf.int32)
......
......@@ -26,6 +26,7 @@ from object_detection.utils import spatial_transform_ops
INSTANCE_EMBEDDING = 'INSTANCE_EMBEDDING'
PIXEL_EMBEDDING = 'PIXEL_EMBEDDING'
DEEP_MASK_ESTIMATION = 'deep_mask_estimation'
DEEP_MASK_BOX_CONSISTENCY = 'deep_mask_box_consistency'
LOSS_KEY_PREFIX = center_net_meta_arch.LOSS_KEY_PREFIX
......@@ -35,7 +36,7 @@ class DeepMACParams(
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size',
'max_roi_jitter_ratio', 'roi_jitter_mode'
'max_roi_jitter_ratio', 'roi_jitter_mode', 'box_consistency_loss_weight'
])):
"""Class holding the DeepMAC network configutration."""
......@@ -46,7 +47,7 @@ class DeepMACParams(
mask_num_subsamples, use_xy, network_type, use_instance_embedding,
num_init_channels, predict_full_resolution_masks,
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode):
roi_jitter_mode, box_consistency_loss_weight):
return super(DeepMACParams,
cls).__new__(cls, classification_loss, dim,
task_loss_weight, pixel_embedding_dim,
......@@ -55,7 +56,7 @@ class DeepMACParams(
use_instance_embedding, num_init_channels,
predict_full_resolution_masks,
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode)
roi_jitter_mode, box_consistency_loss_weight)
def subsample_instances(classes, weights, boxes, masks, num_subsamples):
......@@ -206,6 +207,61 @@ def filter_masked_classes(masked_class_ids, classes, weights, masks):
)
def crop_and_resize_feature_map(features, boxes, size):
"""Crop and resize regions from a single feature map given a set of boxes.
Args:
features: A [H, W, C] float tensor.
    boxes: A [N, 4] tensor of normalized boxes.
size: int, the size of the output features.
Returns:
per_box_features: A [N, size, size, C] tensor of cropped and resized
features.
"""
return spatial_transform_ops.matmul_crop_and_resize(
features[tf.newaxis], boxes[tf.newaxis], [size, size])[0]
def crop_and_resize_instance_masks(masks, boxes, mask_size):
"""Crop and resize each mask according to the given boxes.
Args:
masks: A [N, H, W] float tensor.
boxes: A [N, 4] float tensor of normalized boxes.
mask_size: int, the size of the output masks.
Returns:
masks: A [N, mask_size, mask_size] float tensor of cropped and resized
instance masks.
"""
cropped_masks = spatial_transform_ops.matmul_crop_and_resize(
masks[:, :, :, tf.newaxis], boxes[:, tf.newaxis, :],
[mask_size, mask_size])
cropped_masks = tf.squeeze(cropped_masks, axis=[1, 4])
return cropped_masks
def fill_boxes(boxes, height, width):
"""Fills the area included in the box."""
blist = box_list.BoxList(boxes)
blist = box_list_ops.to_absolute_coordinates(blist, height, width)
boxes = blist.get()
ymin, xmin, ymax, xmax = tf.unstack(
boxes[:, tf.newaxis, tf.newaxis, :], 4, axis=3)
ygrid, xgrid = tf.meshgrid(tf.range(height), tf.range(width), indexing='ij')
ygrid, xgrid = tf.cast(ygrid, tf.float32), tf.cast(xgrid, tf.float32)
ygrid, xgrid = ygrid[tf.newaxis, :, :], xgrid[tf.newaxis, :, :]
filled_boxes = tf.logical_and(
tf.logical_and(ygrid >= ymin, ygrid <= ymax),
tf.logical_and(xgrid >= xmin, xgrid <= xmax))
return tf.cast(filled_boxes, tf.float32)
class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks."""
......@@ -379,7 +435,8 @@ def deepmac_proto_to_params(deepmac_config):
deepmac_config.predict_full_resolution_masks,
postprocess_crop_size=deepmac_config.postprocess_crop_size,
max_roi_jitter_ratio=deepmac_config.max_roi_jitter_ratio,
roi_jitter_mode=jitter_mode
roi_jitter_mode=jitter_mode,
box_consistency_loss_weight=deepmac_config.box_consistency_loss_weight
)
......@@ -402,6 +459,13 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
"""Constructs the super class with object center & detection params only."""
self._deepmac_params = deepmac_params
if (self._deepmac_params.predict_full_resolution_masks and
self._deepmac_params.max_roi_jitter_ratio > 0.0):
raise ValueError('Jittering is not supported for full res masks.')
if self._deepmac_params.mask_num_subsamples > 0:
raise ValueError('Subsampling masks is currently not supported.')
super(DeepMACMetaArch, self).__init__(
is_training=is_training, add_summaries=add_summaries,
num_classes=num_classes, feature_extractor=feature_extractor,
......@@ -462,21 +526,34 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
pixel_embedding = pixel_embedding[tf.newaxis, :, :, :]
pixel_embeddings_processed = tf.tile(pixel_embedding,
[num_instances, 1, 1, 1])
image_shape = tf.shape(pixel_embeddings_processed)
image_height, image_width = image_shape[1], image_shape[2]
y_grid, x_grid = tf.meshgrid(tf.linspace(0.0, 1.0, image_height),
tf.linspace(0.0, 1.0, image_width),
indexing='ij')
blist = box_list.BoxList(boxes)
ycenter, xcenter, _, _ = blist.get_center_coordinates_and_sizes()
y_grid = y_grid[tf.newaxis, :, :]
x_grid = x_grid[tf.newaxis, :, :]
y_grid -= ycenter[:, tf.newaxis, tf.newaxis]
x_grid -= xcenter[:, tf.newaxis, tf.newaxis]
coords = tf.stack([y_grid, x_grid], axis=3)
else:
# TODO(vighneshb) Explore multilevel_roi_align and align_corners=False.
pixel_embeddings_cropped = spatial_transform_ops.matmul_crop_and_resize(
pixel_embedding[tf.newaxis], boxes[tf.newaxis],
[mask_size, mask_size])
pixel_embeddings_processed = pixel_embeddings_cropped[0]
mask_shape = tf.shape(pixel_embeddings_processed)
mask_height, mask_width = mask_shape[1], mask_shape[2]
y_grid, x_grid = tf.meshgrid(tf.linspace(-1.0, 1.0, mask_height),
tf.linspace(-1.0, 1.0, mask_width),
indexing='ij')
coords = tf.stack([y_grid, x_grid], axis=2)
coords = coords[tf.newaxis, :, :, :]
coords = tf.tile(coords, [num_instances, 1, 1, 1])
pixel_embeddings_processed = crop_and_resize_feature_map(
pixel_embedding, boxes, mask_size)
mask_shape = tf.shape(pixel_embeddings_processed)
mask_height, mask_width = mask_shape[1], mask_shape[2]
y_grid, x_grid = tf.meshgrid(tf.linspace(-1.0, 1.0, mask_height),
tf.linspace(-1.0, 1.0, mask_width),
indexing='ij')
coords = tf.stack([y_grid, x_grid], axis=2)
coords = coords[tf.newaxis, :, :, :]
coords = tf.tile(coords, [num_instances, 1, 1, 1])
if self._deepmac_params.use_xy:
return tf.concat([coords, pixel_embeddings_processed], axis=3)
......@@ -528,11 +605,9 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
if self._deepmac_params.predict_full_resolution_masks:
return masks
else:
cropped_masks = spatial_transform_ops.matmul_crop_and_resize(
masks[:, :, :, tf.newaxis], boxes[:, tf.newaxis, :],
[mask_size, mask_size])
cropped_masks = crop_and_resize_instance_masks(
masks, boxes, mask_size)
cropped_masks = tf.stop_gradient(cropped_masks)
cropped_masks = tf.squeeze(cropped_masks, axis=[1, 4])
# TODO(vighneshb) should we discretize masks?
return cropped_masks
......@@ -543,7 +618,64 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
return resize_instance_masks(logits, (height, width))
def _compute_per_instance_mask_loss(
def _compute_per_instance_mask_prediction_loss(
self, boxes, mask_logits, mask_gt):
num_instances = tf.shape(boxes)[0]
mask_logits = self._resize_logits_like_gt(mask_logits, mask_gt)
mask_logits = tf.reshape(mask_logits, [num_instances, -1, 1])
mask_gt = tf.reshape(mask_gt, [num_instances, -1, 1])
loss = self._deepmac_params.classification_loss(
prediction_tensor=mask_logits,
target_tensor=mask_gt,
weights=tf.ones_like(mask_logits))
# TODO(vighneshb) Make this configurable via config.
# Skip normalization for dice loss because the denominator term already
# does normalization.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
def _compute_per_instance_box_consistency_loss(
self, boxes_gt, boxes_for_crop, mask_logits):
height, width = tf.shape(mask_logits)[1], tf.shape(mask_logits)[2]
filled_boxes = fill_boxes(boxes_gt, height, width)[:, :, :, tf.newaxis]
mask_logits = mask_logits[:, :, :, tf.newaxis]
if self._deepmac_params.predict_full_resolution_masks:
gt_crop = filled_boxes[:, :, :, 0]
pred_crop = mask_logits[:, :, :, 0]
else:
gt_crop = crop_and_resize_instance_masks(
filled_boxes, boxes_for_crop, self._deepmac_params.mask_size)
pred_crop = crop_and_resize_instance_masks(
mask_logits, boxes_for_crop, self._deepmac_params.mask_size)
loss = 0.0
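    # Compare the max-projection of the predicted mask along each spatial axis
    # with the projection of the filled ground-truth box (box consistency).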
for axis in [1, 2]:
pred_max = tf.reduce_max(pred_crop, axis=axis)[:, :, tf.newaxis]
gt_max = tf.reduce_max(gt_crop, axis=axis)[:, :, tf.newaxis]
axis_loss = self._deepmac_params.classification_loss(
prediction_tensor=pred_max,
target_tensor=gt_max,
weights=tf.ones_like(pred_max))
loss += axis_loss
# Skip normalization for dice loss because the denominator term already
# does normalization.
# TODO(vighneshb) Make this configurable via config.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
def _compute_per_instance_deepmac_losses(
self, boxes, masks, instance_embedding, pixel_embedding):
"""Returns the mask loss per instance.
......@@ -558,40 +690,36 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
pixel_embedding_size] float tensor containing the per-pixel embeddings.
Returns:
mask_loss: A [num_instances] shaped float tensor containing the
mask_prediction_loss: A [num_instances] shaped float tensor containing the
mask loss for each instance.
"""
box_consistency_loss: A [num_instances] shaped float tensor containing
the box consistency loss for each instance.
num_instances = tf.shape(boxes)[0]
"""
if tf.keras.backend.learning_phase():
boxes = preprocessor.random_jitter_boxes(
boxes_for_crop = preprocessor.random_jitter_boxes(
boxes, self._deepmac_params.max_roi_jitter_ratio,
jitter_mode=self._deepmac_params.roi_jitter_mode)
else:
boxes_for_crop = boxes
mask_input = self._get_mask_head_input(
boxes, pixel_embedding)
boxes_for_crop, pixel_embedding)
instance_embeddings = self._get_instance_embeddings(
boxes, instance_embedding)
boxes_for_crop, instance_embedding)
mask_logits = self._mask_net(
instance_embeddings, mask_input,
training=tf.keras.backend.learning_phase())
mask_gt = self._get_groundtruth_mask_output(boxes, masks)
mask_logits = self._resize_logits_like_gt(mask_logits, mask_gt)
mask_gt = self._get_groundtruth_mask_output(boxes_for_crop, masks)
mask_logits = tf.reshape(mask_logits, [num_instances, -1, 1])
mask_gt = tf.reshape(mask_gt, [num_instances, -1, 1])
loss = self._deepmac_params.classification_loss(
prediction_tensor=mask_logits,
target_tensor=mask_gt,
weights=tf.ones_like(mask_logits))
mask_prediction_loss = self._compute_per_instance_mask_prediction_loss(
boxes_for_crop, mask_logits, mask_gt)
# TODO(vighneshb) Make this configurable via config.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
box_consistency_loss = self._compute_per_instance_box_consistency_loss(
boxes, boxes_for_crop, mask_logits)
return mask_prediction_loss, box_consistency_loss
def _compute_instance_masks_loss(self, prediction_dict):
"""Computes the mask loss.
......@@ -603,7 +731,7 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
[batch_size, height, width, embedding_size].
Returns:
loss: float, the mask loss as a scalar.
loss_dict: A dict mapping string (loss names) to scalar floats.
"""
gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
......@@ -613,7 +741,10 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
allowed_masked_classes_ids = (
self._deepmac_params.allowed_masked_classes_ids)
total_loss = 0.0
loss_dict = {
DEEP_MASK_ESTIMATION: 0.0,
DEEP_MASK_BOX_CONSISTENCY: 0.0
}
    # Iterate over multiple predictions by the backbone (for hourglass, length=2).
for instance_pred, pixel_pred in zip(
......@@ -625,24 +756,31 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
for i, (boxes, weights, classes, masks) in enumerate(
zip(gt_boxes_list, gt_weights_list, gt_classes_list, gt_masks_list)):
_, weights, masks = filter_masked_classes(allowed_masked_classes_ids,
classes, weights, masks)
num_subsample = self._deepmac_params.mask_num_subsamples
_, weights, boxes, masks = subsample_instances(
classes, weights, boxes, masks, num_subsample)
# TODO(vighneshb) Add sub-sampling back if required.
classes, valid_mask_weights, masks = filter_masked_classes(
allowed_masked_classes_ids, classes, weights, masks)
per_instance_loss = self._compute_per_instance_mask_loss(
boxes, masks, instance_pred[i], pixel_pred[i])
per_instance_loss *= weights
per_instance_mask_loss, per_instance_consistency_loss = (
self._compute_per_instance_deepmac_losses(
boxes, masks, instance_pred[i], pixel_pred[i]))
per_instance_mask_loss *= valid_mask_weights
per_instance_consistency_loss *= weights
num_instances = tf.maximum(tf.reduce_sum(weights), 1.0)
num_instances_allowed = tf.maximum(
tf.reduce_sum(valid_mask_weights), 1.0)
total_loss += tf.reduce_sum(per_instance_loss) / num_instances
loss_dict[DEEP_MASK_ESTIMATION] += (
tf.reduce_sum(per_instance_mask_loss) / num_instances_allowed)
loss_dict[DEEP_MASK_BOX_CONSISTENCY] += (
tf.reduce_sum(per_instance_consistency_loss) / num_instances)
batch_size = len(gt_boxes_list)
num_predictions = len(prediction_dict[INSTANCE_EMBEDDING])
return total_loss / float(batch_size * num_predictions)
return dict((key, loss / float(batch_size * num_predictions))
for key, loss in loss_dict.items())
def loss(self, prediction_dict, true_image_shapes, scope=None):
......@@ -650,13 +788,19 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
prediction_dict, true_image_shapes, scope)
if self._deepmac_params is not None:
mask_loss = self._compute_instance_masks_loss(
mask_loss_dict = self._compute_instance_masks_loss(
prediction_dict=prediction_dict)
key = LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION
losses_dict[key] = (
self._deepmac_params.task_loss_weight * mask_loss
losses_dict[LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION] = (
self._deepmac_params.task_loss_weight * mask_loss_dict[
DEEP_MASK_ESTIMATION]
)
if self._deepmac_params.box_consistency_loss_weight > 0.0:
losses_dict[LOSS_KEY_PREFIX + '/' + DEEP_MASK_BOX_CONSISTENCY] = (
self._deepmac_params.box_consistency_loss_weight * mask_loss_dict[
DEEP_MASK_BOX_CONSISTENCY]
)
return losses_dict
def postprocess(self, prediction_dict, true_image_shapes, **params):
......
......@@ -60,7 +60,8 @@ class MockMaskNet(tf.keras.layers.Layer):
return tf.zeros_like(pixel_embedding[:, :, :, 0]) + 0.9
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False):
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
mask_num_subsamples=-1):
"""Builds the DeepMAC meta architecture."""
feature_extractor = DummyFeatureExtractor(
......@@ -94,7 +95,7 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False):
pixel_embedding_dim=2,
allowed_masked_classes_ids=[],
mask_size=16,
mask_num_subsamples=-1,
mask_num_subsamples=mask_num_subsamples,
use_xy=True,
network_type='hourglass10',
use_instance_embedding=True,
......@@ -102,7 +103,8 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False):
predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128,
max_roi_jitter_ratio=0.0,
roi_jitter_mode='random'
roi_jitter_mode='random',
box_consistency_loss_weight=1.0,
)
object_detection_params = center_net_meta_arch.ObjectDetectionParams(
......@@ -140,6 +142,33 @@ class DeepMACUtilsTest(tf.test.TestCase):
self.assertAllClose(result[2], boxes)
self.assertAllClose(result[3], masks)
def test_fill_boxes(self):
boxes = tf.constant([[0., 0., 0.5, 0.5], [0.5, 0.5, 1.0, 1.0]])
filled_boxes = deepmac_meta_arch.fill_boxes(boxes, 32, 32)
expected = np.zeros((2, 32, 32))
expected[0, :17, :17] = 1.0
expected[1, 16:, 16:] = 1.0
self.assertAllClose(expected, filled_boxes.numpy(), rtol=1e-3)
def test_crop_and_resize_instance_masks(self):
boxes = tf.zeros((5, 4))
masks = tf.zeros((5, 128, 128))
output = deepmac_meta_arch.crop_and_resize_instance_masks(
masks, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32))
def test_crop_and_resize_feature_map(self):
boxes = tf.zeros((5, 4))
features = tf.zeros((128, 128, 7))
output = deepmac_meta_arch.crop_and_resize_feature_map(
features, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32, 7))
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase):
......@@ -199,7 +228,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
def test_get_mask_head_input_no_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
boxes = tf.constant([[0., 0., 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
boxes = tf.constant([[0., 0., 1.0, 1.0], [0.0, 0.0, 0.5, 1.0]],
dtype=tf.float32)
pixel_embedding_np = np.random.randn(32, 32, 4).astype(np.float32)
......@@ -208,12 +237,15 @@ class DeepMACMetaArchTest(tf.test.TestCase):
mask_inputs = model._get_mask_head_input(boxes, pixel_embedding)
self.assertEqual(mask_inputs.shape, (2, 32, 32, 6))
y_grid, x_grid = tf.meshgrid(np.linspace(-1.0, 1.0, 32),
np.linspace(-1.0, 1.0, 32), indexing='ij')
y_grid, x_grid = tf.meshgrid(np.linspace(.0, 1.0, 32),
np.linspace(.0, 1.0, 32), indexing='ij')
ys = [0.5, 0.25]
xs = [0.5, 0.5]
for i in range(2):
mask_input = mask_inputs[i]
self.assertAllClose(y_grid, mask_input[:, :, 0])
self.assertAllClose(x_grid, mask_input[:, :, 1])
self.assertAllClose(y_grid - ys[i], mask_input[:, :, 0])
self.assertAllClose(x_grid - xs[i], mask_input[:, :, 1])
pixel_embedding = mask_input[:, :, 2:]
self.assertAllClose(pixel_embedding_np, pixel_embedding)
......@@ -262,7 +294,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
masks[1, 16:, 16:] = 1.0
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
loss, _ = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertAllClose(
loss, np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
......@@ -275,7 +307,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
masks = np.ones((2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
loss, _ = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertAllClose(
loss, np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
......@@ -289,7 +321,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
masks = np.ones((2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
loss, _ = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
pred = tf.nn.sigmoid(0.9)
expected = (1.0 - ((2.0 * pred) / (1.0 + pred)))
......@@ -299,7 +331,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
boxes = tf.zeros([0, 4])
masks = tf.zeros([0, 128, 128])
loss = self.model._compute_per_instance_mask_loss(
loss, _ = self.model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertEqual(loss.shape, (0,))
......@@ -394,6 +426,59 @@ class DeepMACMetaArchTest(tf.test.TestCase):
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 8)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_box_consistency_loss(self):
boxes_gt = tf.constant([[0., 0., 0.49, 1.0]])
boxes_jittered = tf.constant([[0.0, 0.0, 1.0, 1.0]])
mask_prediction = np.zeros((1, 32, 32)).astype(np.float32)
mask_prediction[0, :24, :24] = 1.0
loss = self.model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 8 + [0.0] * 8),
logits=[1.0] * 12 + [0.0] * 4)
xloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 16),
logits=[1.0] * 12 + [0.0] * 4)
self.assertAllClose(loss, [tf.reduce_mean(yloss + xloss).numpy()])
def test_box_consistency_dice_loss(self):
model = build_meta_arch(use_dice_loss=True)
boxes_gt = tf.constant([[0., 0., 0.49, 1.0]])
boxes_jittered = tf.constant([[0.0, 0.0, 1.0, 1.0]])
almost_inf = 1e10
mask_prediction = np.full((1, 32, 32), -almost_inf, dtype=np.float32)
mask_prediction[0, :24, :24] = almost_inf
loss = model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = 1 - 6.0 / 7
xloss = 0.2
self.assertAllClose(loss, [yloss + xloss])
def test_box_consistency_dice_loss_full_res(self):
model = build_meta_arch(use_dice_loss=True,
predict_full_resolution_masks=True)
boxes_gt = tf.constant([[0., 0., 1.0, 1.0]])
boxes_jittered = None
almost_inf = 1e10
mask_prediction = np.full((1, 32, 32), -almost_inf, dtype=np.float32)
mask_prediction[0, :16, :32] = almost_inf
loss = model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
self.assertAlmostEqual(loss[0].numpy(), 1 / 3)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class FullyConnectedMaskHeadTest(tf.test.TestCase):
......