Unverified Commit 5ffcc5b6 authored by Anirudh Vegesana, committed by GitHub

Merge branch 'purdue-yolo' into detection_generator_pr

parents 0b81a843 76e0c014
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Sfm120k dataset module."""
import tensorflow as tf
from delf.python.datasets.sfm120k import sfm120k
class Sfm120kTest(tf.test.TestCase):
"""Tests for Sfm120k dataset module."""
def testId2Filename(self):
"""Tests conversion of image id to full path mapping."""
image_id = "29fdc243aeb939388cfdf2d081dc080e"
prefix = "train/retrieval-SfM-120k/ims/"
path = sfm120k.id2filename(image_id, prefix)
expected_path = "train/retrieval-SfM-120k/ims/0e/08/dc" \
"/29fdc243aeb939388cfdf2d081dc080e"
self.assertEqual(path, expected_path)
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tuple dataset module.
Based on the ECCV16 work of Radenovic et al.: CNN Image Retrieval Learns from BoW.
For more information refer to https://arxiv.org/abs/1604.02426.
"""
import os
import pickle
import numpy as np
import tensorflow as tf
from delf.python.datasets import utils as image_loading_utils
from delf.python.training import global_features_utils
from delf.python.training.model import global_model
class TuplesDataset():
"""Data loader that loads training and validation tuples.
After initialization, the function create_epoch_tuples() should be called to
create the dataset tuples. After that, the dataset can be iterated through
using the next() function.
Tuples are based on Radenovic et al. ECCV16 work: CNN image retrieval
learns from BoW. For more information refer to
https://arxiv.org/abs/1604.02426.
"""
def __init__(self, name, mode, data_root, imsize=None, num_negatives=5,
num_queries=2000, pool_size=20000,
loader=image_loading_utils.default_loader, ims_root=None):
"""TuplesDataset object initialization.
Args:
name: String, dataset name, e.g. 'retrieval-sfm-120k'.
mode: 'train' or 'val' for training and validation parts of dataset.
data_root: Path to the root directory of the dataset.
imsize: Integer, defines the maximum size of the longer image side.
num_negatives: Integer, number of negative images for a query image in a
training tuple.
num_queries: Integer, number of query images to be processed in one epoch.
pool_size: Integer, size of the negative image pool from which the
hard-negative images are re-mined.
loader: Callable, a function to load an image given its path.
ims_root: String, image root directory.
Raises:
ValueError: If mode is not either 'train' or 'val'.
"""
if mode not in ['train', 'val']:
raise ValueError(
"`mode` argument should be either 'train' or 'val', passed as a "
"String.")
# Loading db.
db_filename = os.path.join(data_root, '{}.pkl'.format(name))
with tf.io.gfile.GFile(db_filename, 'rb') as f:
db = pickle.load(f)[mode]
# Initializing tuples dataset.
self._ims_root = data_root if ims_root is None else ims_root
self._name = name
self._mode = mode
self._imsize = imsize
self._clusters = db['cluster']
self._query_pool = db['qidxs']
self._positive_pool = db['pidxs']
if not hasattr(self, 'images'):
self.images = db['ids']
# Size of training subset for an epoch.
self._num_negatives = num_negatives
self._num_queries = min(num_queries, len(self._query_pool))
self._pool_size = min(pool_size, len(self.images))
self._qidxs = None
self._pidxs = None
self._nidxs = None
self._loader = loader
self._print_freq = 10
# Indexer for the iterator.
self._n = 0
def __iter__(self):
"""Function for making TupleDataset an iterator.
Returns:
iter: The iterator object itself (TupleDataset).
"""
return self
def __next__(self):
"""Function for making TupleDataset an iterator.
Returns:
next: The next item in the sequence (next dataset image tuple).
"""
if self._n < len(self._qidxs):
result = self.__getitem__(self._n)
self._n += 1
return result
else:
raise StopIteration
def _img_names_to_full_path(self, image_list):
"""Converts list of image names to the list of full paths to the images.
Args:
image_list: Image names, either a list or a single image path.
Returns:
image_full_paths: List of full paths to the images.
"""
if not isinstance(image_list, list):
return os.path.join(self._ims_root, image_list)
return [os.path.join(self._ims_root, img_name) for img_name in image_list]
def __getitem__(self, index):
"""Called to load an image tuple at the given `index`.
Args:
index: Integer, index.
Returns:
output: Tuple [q,p,n1,...,nN, target], loaded 'train'/'val' tuple at
index of qidxs. `q` is the query image tensor, `p` is the
corresponding positive image tensor, `n1`,...,`nN` are the negatives
associated with the query. `target` is a tensor (with the shape [2+N])
of integer labels corresponding to the tuple list: query (-1),
positive (1), negative (0).
Raises:
ValueError: Raised if the query indexes list `qidxs` is empty.
"""
if self.__len__() == 0:
raise ValueError(
"List `qidxs` is empty. Run `dataset.create_epoch_tuples(net)` "
"method to create subset for `train`/`val`.")
output = []
# Query image.
output.append(self._loader(
self._img_names_to_full_path(self.images[self._qidxs[index]]),
self._imsize))
# Positive image.
output.append(self._loader(
self._img_names_to_full_path(self.images[self._pidxs[index]]),
self._imsize))
# Negative images.
for nidx in self._nidxs[index]:
output.append(self._loader(
self._img_names_to_full_path(self.images[nidx]),
self._imsize))
# Labels for the query (-1), positive (1), negative (0) images in the tuple.
target = tf.convert_to_tensor([-1, 1] + [0] * self._num_negatives)
output.append(target)
return tuple(output)
def __len__(self):
"""Called to implement the built-in function len().
Returns:
len: Integer, number of query images.
"""
if self._qidxs is None:
return 0
return len(self._qidxs)
def __repr__(self):
"""Metadata for the TupleDataset.
Returns:
meta: String, containing TupleDataset meta.
"""
fmt_str = self.__class__.__name__ + '\n'
fmt_str += '\tName and mode: {} {}\n'.format(self._name, self._mode)
fmt_str += '\tNumber of images: {}\n'.format(len(self.images))
fmt_str += '\tNumber of training tuples: {}\n'.format(len(self._query_pool))
fmt_str += '\tNumber of negatives per tuple: {}\n'.format(
self._num_negatives)
fmt_str += '\tNumber of tuples processed in an epoch: {}\n'.format(
self._num_queries)
fmt_str += '\tPool size for negative remining: {}\n'.format(self._pool_size)
return fmt_str
def create_epoch_tuples(self, net):
"""Creates epoch tuples with the hard-negative re-mining.
Negative examples are selected from clusters different from the cluster
of the query image, as the clusters are ideally non-overlapping. For
every query image we choose hard negatives, that is, non-matching images
with the most similar descriptors. Hard negatives depend on the current
CNN parameters. K-nearest neighbors from all non-matching images are
selected. Query images are selected randomly. Positive examples are
fixed for the related query image during the whole training process.
Args:
net: Model, network to be used for negative re-mining.
Raises:
ValueError: If the pool_size is smaller than the number of negative
images per tuple.
Returns:
avg_l2: Float, average negative L2-distance.
"""
self._n = 0
if self._pool_size < self._num_negatives:
raise ValueError("Unable to create epoch tuples. Negative pool_size "
"should be larger than the number of negative images "
"per tuple.")
global_features_utils.debug_and_log(
'>> Creating tuples for an epoch of {}-{}...'.format(self._name,
self._mode),
True)
global_features_utils.debug_and_log(">> Used network: ", True)
global_features_utils.debug_and_log(net.meta_repr(), True)
## Selecting queries.
# Draw `num_queries` random queries for the tuples.
idx_list = np.arange(len(self._query_pool))
np.random.shuffle(idx_list)
idxs2query_pool = idx_list[:self._num_queries]
self._qidxs = [self._query_pool[i] for i in idxs2query_pool]
## Selecting positive pairs.
# Positive examples are fixed for each query during the whole training
# process.
self._pidxs = [self._positive_pool[i] for i in idxs2query_pool]
## Selecting negative pairs.
# If `num_negatives` is 0, create dummy nidxs.
# Useful when only positives are used for training.
if self._num_negatives == 0:
self._nidxs = [[] for _ in range(len(self._qidxs))]
return 0
# Draw pool_size random images for the pool of negative images.
neg_idx_list = np.arange(len(self.images))
np.random.shuffle(neg_idx_list)
neg_images_idxs = neg_idx_list[:self._pool_size]
global_features_utils.debug_and_log(
'>> Extracting descriptors for query images...', debug=True)
img_list = self._img_names_to_full_path([self.images[i] for i in
self._qidxs])
qvecs = global_model.extract_global_descriptors_from_list(
net,
images=img_list,
image_size=self._imsize,
print_freq=self._print_freq)
global_features_utils.debug_and_log(
'>> Extracting descriptors for negative pool...', debug=True)
poolvecs = global_model.extract_global_descriptors_from_list(
net,
images=self._img_names_to_full_path([self.images[i] for i in
neg_images_idxs]),
image_size=self._imsize,
print_freq=self._print_freq)
global_features_utils.debug_and_log('>> Searching for hard negatives...',
debug=True)
# Compute dot product scores and ranks.
scores = tf.linalg.matmul(poolvecs, qvecs, transpose_a=True)
ranks = tf.argsort(scores, axis=0, direction='DESCENDING')
sum_ndist = 0.
n_ndist = 0.
# Selection of negative examples.
self._nidxs = []
for q, qidx in enumerate(self._qidxs):
# We do not use the query cluster, since those images are potentially
# positive.
qcluster = self._clusters[qidx]
clusters = [qcluster]
nidxs = []
rank = 0
while len(nidxs) < self._num_negatives:
if rank >= tf.shape(ranks)[0]:
raise ValueError("Unable to create epoch tuples. Number of required "
"negative images is larger than the number of "
"clusters in the dataset.")
potential = neg_images_idxs[ranks[rank, q]]
# Take at most one image from the same cluster.
if self._clusters[potential] not in clusters:
nidxs.append(potential)
clusters.append(self._clusters[potential])
dist = tf.norm(qvecs[:, q] - poolvecs[:, ranks[rank, q]],
axis=0).numpy()
sum_ndist += dist
n_ndist += 1
rank += 1
self._nidxs.append(nidxs)
global_features_utils.debug_and_log(
'>> Average negative l2-distance: {:.2f}'.format(
sum_ndist / n_ndist))
# Return average negative L2-distance.
return sum_ndist / n_ndist
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"Tests for the tuples dataset module."
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
import numpy as np
from PIL import Image
import tensorflow as tf
import pickle
from delf.python.datasets import tuples_dataset
from delf.python.training.model import global_model
FLAGS = flags.FLAGS
class TuplesDatasetTest(tf.test.TestCase):
"""Tests for tuples dataset module."""
def testCreateEpochTuples(self):
"""Tests epoch tuple creation."""
# Create a tuples dataset instance.
name = 'test_dataset'
num_queries = 1
pool_size = 5
num_negatives = 2
# Create a ground truth .pkl file.
gnd = {
'train': {'ids': [str(i) + '.png' for i in range(2 * num_queries + pool_size)],
'cluster': [0, 0, 1, 2, 3, 4, 5],
'qidxs': [0], 'pidxs': [1]}}
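# The query (index 0) and its positive (index 1) share cluster 0, while every
# pool image sits in its own cluster, so hard negatives can always be drawn
# from clusters different from the query's.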
gnd_name = name + '.pkl'
with tf.io.gfile.GFile(os.path.join(FLAGS.test_tmpdir, gnd_name),
'wb') as gnd_file:
pickle.dump(gnd, gnd_file)
# Create random images for the dataset.
for i in range(2 * num_queries + pool_size):
dummy_image = np.random.rand(1024, 750, 3) * 255
img_out = Image.fromarray(dummy_image.astype('uint8')).convert('RGB')
filename = os.path.join(FLAGS.test_tmpdir, '{}.png'.format(i))
img_out.save(filename)
dataset = tuples_dataset.TuplesDataset(
name=name,
data_root=FLAGS.test_tmpdir,
mode='train',
imsize=1024,
num_negatives=num_negatives,
num_queries=num_queries,
pool_size=pool_size
)
# Assert that initially no negative images are set.
self.assertIsNone(dataset._nidxs)
# Initialize a network for negative re-mining.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
avg_neg_distance = dataset.create_epoch_tuples(model)
# Check that an appropriate number of negative images has been chosen per
# query.
self.assertAllEqual(tf.shape(dataset._nidxs), [num_queries, num_negatives])
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Global model training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training script for Global Features model."""
import math
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from delf.python.datasets.sfm120k import dataset_download
from delf.python.datasets.sfm120k import sfm120k
from delf.python.training import global_features_utils
from delf.python.training import tensorboard_utils
from delf.python.training.global_features import train_utils
from delf.python.training.losses import ranking_losses
from delf.python.training.model import global_model
_LOSS_NAMES = ['contrastive', 'triplet']
_MODEL_NAMES = global_features_utils.get_standard_keras_models()
_OPTIMIZER_NAMES = ['sgd', 'adam']
_POOL_NAMES = ['mac', 'spoc', 'gem']
_PRECOMPUTE_WHITEN_NAMES = ['retrieval-SfM-30k', 'retrieval-SfM-120k']
_TEST_DATASET_NAMES = ['roxford5k', 'rparis6k']
_TRAINING_DATASET_NAMES = ['retrieval-SfM-120k']
_VALIDATION_TYPES = ['standard', 'eccv2020']
FLAGS = flags.FLAGS
flags.DEFINE_boolean('debug', False, 'Debug mode.')
# Export directory, training and val datasets, test datasets.
flags.DEFINE_string('data_root', "data",
'Absolute path to the folder containing training data.')
flags.DEFINE_string('directory', "data",
'Destination where trained network should be saved.')
flags.DEFINE_enum('training_dataset', 'retrieval-SfM-120k',
_TRAINING_DATASET_NAMES, 'Training dataset: ' +
' | '.join(_TRAINING_DATASET_NAMES) + '.')
flags.DEFINE_enum('validation_type', None, _VALIDATION_TYPES,
'Type of the evaluation to use. Either `None`, `standard` '
'or `eccv2020`.')
flags.DEFINE_list('test_datasets', 'roxford5k,rparis6k',
'Comma separated list of test datasets: ' +
' | '.join(_TEST_DATASET_NAMES) + '.')
flags.DEFINE_enum('precompute_whitening', None, _PRECOMPUTE_WHITEN_NAMES,
'Dataset used to learn whitening: ' +
' | '.join(_PRECOMPUTE_WHITEN_NAMES) + '.')
flags.DEFINE_integer('test_freq', 5,
'Run test evaluation every N epochs.')
flags.DEFINE_list('multiscale', [1.],
'Use multiscale vectors for testing, '
'examples: 1 | 1,1/2**(1/2),1/2 | 1,2**(1/2),1/2**(1/2). '
'Pass as a string of comma separated values.')
# Network architecture and initialization options.
flags.DEFINE_enum('arch', 'ResNet101', _MODEL_NAMES,
'Model architecture: ' + ' | '.join(_MODEL_NAMES) + '.')
flags.DEFINE_enum('pool', 'gem', _POOL_NAMES,
'Pooling options: ' + ' | '.join(_POOL_NAMES) + '.')
flags.DEFINE_bool('whitening', False,
'Whether to train model with learnable whitening ('
'linear layer) after the pooling.')
flags.DEFINE_bool('pretrained', True,
'Whether to initialize the model with weights pretrained '
'on ImageNet (default) instead of random weights.')
flags.DEFINE_enum('loss', 'contrastive', _LOSS_NAMES,
'Training loss options: ' + ' | '.join(_LOSS_NAMES) + '.')
flags.DEFINE_float('loss_margin', 0.7, 'Loss margin.')
# train/val options specific for image retrieval learning.
flags.DEFINE_integer('image_size', 1024,
'Maximum size of longer image side used for training.')
flags.DEFINE_integer('neg_num', 5, 'Number of negative images per train/val '
'tuple.')
flags.DEFINE_integer('query_size', 2000,
'Number of queries randomly drawn per one training epoch.')
flags.DEFINE_integer('pool_size', 20000,
'Size of the pool for hard negative mining.')
# Standard training/validation options.
flags.DEFINE_string('gpu_id', '0', 'GPU id used for training.')
flags.DEFINE_integer('epochs', 100, 'Number of total epochs to run.')
flags.DEFINE_integer('batch_size', 5,
'Number of (q,p,n1,...,nN) tuples in a mini-batch.')
flags.DEFINE_integer('update_every', 1,
'Update model weights every N batches; used to handle '
'relatively large batches, so the effective batch size '
'becomes update_every * batch_size.')
flags.DEFINE_enum('optimizer', 'adam', _OPTIMIZER_NAMES,
'Optimizer options: ' + ' | '.join(_OPTIMIZER_NAMES) + '.')
flags.DEFINE_float('lr', 1e-6, 'Initial learning rate.')
flags.DEFINE_float('momentum', 0.9, 'Momentum.')
flags.DEFINE_float('weight_decay', 1e-6, 'Weight decay.')
flags.DEFINE_bool('resume', False,
'Whether to start from the latest checkpoint in the logdir.')
flags.DEFINE_bool('launch_tensorboard', False, 'Whether to launch tensorboard.')
def main(argv):
if len(argv) > 1:
raise RuntimeError('Too many command-line arguments.')
# Manually check if there are unknown test datasets and if the dataset
# ground truth files are downloaded.
for dataset in FLAGS.test_datasets:
if dataset not in _TEST_DATASET_NAMES:
raise ValueError('Unsupported or unknown test dataset: {}.'.format(
dataset))
test_data_config = os.path.join(FLAGS.data_root,
'gnd_{}.pkl'.format(dataset))
if not tf.io.gfile.exists(test_data_config):
raise ValueError(
'{} ground truth file at {} not found. Please download it '
'according to '
'the DELG instructions.'.format(dataset, FLAGS.data_root))
# Check if train dataset is downloaded and download it if not found.
dataset_download.download_train(FLAGS.data_root)
# Creating model export directory if it does not exist.
model_directory = global_features_utils.create_model_directory(
FLAGS.training_dataset, FLAGS.arch, FLAGS.pool, FLAGS.whitening,
FLAGS.pretrained, FLAGS.loss, FLAGS.loss_margin, FLAGS.optimizer,
FLAGS.lr, FLAGS.weight_decay, FLAGS.neg_num, FLAGS.query_size,
FLAGS.pool_size, FLAGS.batch_size, FLAGS.update_every,
FLAGS.image_size, FLAGS.directory)
# Setting up logging directory, same as where the model is stored.
logging.get_absl_handler().use_absl_log_file('absl_logging', model_directory)
# Set cuda visible device.
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_id
global_features_utils.debug_and_log('>> Num GPUs Available: {}'.format(
len(tf.config.experimental.list_physical_devices('GPU'))),
FLAGS.debug)
# Set random seeds.
tf.random.set_seed(0)
np.random.seed(0)
# Initialize the model.
if FLAGS.pretrained:
global_features_utils.debug_and_log(
'>> Using pre-trained model \'{}\''.format(FLAGS.arch))
else:
global_features_utils.debug_and_log(
'>> Using model from scratch (random weights) \'{}\'.'.format(
FLAGS.arch))
model_params = {'architecture': FLAGS.arch, 'pooling': FLAGS.pool,
'whitening': FLAGS.whitening, 'pretrained': FLAGS.pretrained,
'data_root': FLAGS.data_root}
model = global_model.GlobalFeatureNet(**model_params)
# Freeze running mean and std in batch normalization layers.
# We train on one image at a time to reduce the memory requirements of
# the network; therefore, the computed statistics would not be per
# batch. Instead, we freeze them, setting the parameters of all batch
# norm layers in the network to non-trainable (i.e., using the original
# ImageNet statistics).
for layer in model.feature_extractor.layers:
if isinstance(layer, tf.keras.layers.BatchNormalization):
layer.trainable = False
global_features_utils.debug_and_log('>> Network initialized.')
global_features_utils.debug_and_log('>> Loss: {}.'.format(FLAGS.loss))
# Define the loss function.
if FLAGS.loss == 'contrastive':
criterion = ranking_losses.ContrastiveLoss(margin=FLAGS.loss_margin)
elif FLAGS.loss == 'triplet':
criterion = ranking_losses.TripletLoss(margin=FLAGS.loss_margin)
else:
raise ValueError('Loss {} not available.'.format(FLAGS.loss))
# Defining parameters for the training.
# When pre-computing whitening, we run an evaluation before the network
# training starts (at epoch 0); training itself always starts from epoch 1.
start_epoch = 1
exp_decay = math.exp(-0.01)
decay_steps = FLAGS.query_size / FLAGS.batch_size
# Define learning rate decay schedule.
lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=FLAGS.lr,
decay_steps=decay_steps,
decay_rate=exp_decay)
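# With decay_steps = query_size / batch_size and decay_rate = exp(-0.01), the
# learning rate is multiplied by exp(-0.01) every query_size / batch_size
# optimizer steps, i.e. roughly once per epoch when update_every is 1.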
# Define the optimizer.
if FLAGS.optimizer == 'sgd':
opt = tfa.optimizers.extend_with_decoupled_weight_decay(
tf.keras.optimizers.SGD)
optimizer = opt(weight_decay=FLAGS.weight_decay,
learning_rate=lr_scheduler, momentum=FLAGS.momentum)
elif FLAGS.optimizer == 'adam':
opt = tfa.optimizers.extend_with_decoupled_weight_decay(
tf.keras.optimizers.Adam)
optimizer = opt(weight_decay=FLAGS.weight_decay, learning_rate=lr_scheduler)
else:
raise ValueError('Optimizer {} not available.'.format(FLAGS.optimizer))
# Initializing logging.
writer = tf.summary.create_file_writer(model_directory)
tf.summary.experimental.set_step(1)
# Setting up the checkpoint manager.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(
checkpoint,
model_directory,
max_to_keep=10,
keep_checkpoint_every_n_hours=3)
if FLAGS.resume:
# Restores the checkpoint, if existing.
global_features_utils.debug_and_log('>> Continuing from a checkpoint.')
checkpoint.restore(manager.latest_checkpoint)
# Launching tensorboard if required.
if FLAGS.launch_tensorboard:
tensorboard = tf.keras.callbacks.TensorBoard(model_directory)
tensorboard.set_model(model=model)
tensorboard_utils.launch_tensorboard(log_dir=model_directory)
# Log flags used.
global_features_utils.debug_and_log('>> Running training script with:')
global_features_utils.debug_and_log('>> logdir = {}'.format(model_directory))
if FLAGS.training_dataset.startswith('retrieval-SfM-120k'):
train_dataset = sfm120k.CreateDataset(
data_root=FLAGS.data_root,
mode='train',
imsize=FLAGS.image_size,
num_negatives=FLAGS.neg_num,
num_queries=FLAGS.query_size,
pool_size=FLAGS.pool_size
)
if FLAGS.validation_type is not None:
val_dataset = sfm120k.CreateDataset(
data_root=FLAGS.data_root,
mode='val',
imsize=FLAGS.image_size,
num_negatives=FLAGS.neg_num,
num_queries=float('Inf'),
pool_size=float('Inf'),
eccv2020=(FLAGS.validation_type == 'eccv2020')
)
train_dataset_output_types = [tf.float32 for i in range(2 + FLAGS.neg_num)]
train_dataset_output_types.append(tf.int32)
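# Each element yielded by the dataset generator is a (q, p, n1, ..., nN,
# target) tuple: 2 + neg_num float32 image tensors followed by one int32
# label tensor, matching the output types declared above.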
global_features_utils.debug_and_log(
'>> Training the {} network'.format(model_directory))
global_features_utils.debug_and_log('>> GPU ids: {}'.format(FLAGS.gpu_id))
with writer.as_default():
# Precompute whitening if needed.
if FLAGS.precompute_whitening is not None:
epoch = 0
train_utils.test_retrieval(
FLAGS.test_datasets, model, writer=writer,
epoch=epoch, model_directory=model_directory,
precompute_whitening=FLAGS.precompute_whitening,
data_root=FLAGS.data_root,
multiscale=FLAGS.multiscale)
for epoch in range(start_epoch, FLAGS.epochs + 1):
# Set manual seeds per epoch.
np.random.seed(epoch)
tf.random.set_seed(epoch)
# Find hard-negatives.
# Queries are drawn randomly every epoch and their positive examples stay
# fixed for the whole training process; hard-negatives depend on the
# current CNN parameters and are re-mined once per epoch.
avg_neg_distance = train_dataset.create_epoch_tuples(model)
def _train_gen():
return (inst for inst in train_dataset)
train_loader = tf.data.Dataset.from_generator(
_train_gen,
output_types=tuple(train_dataset_output_types))
loss = train_utils.train_val_one_epoch(
loader=iter(train_loader), model=model,
criterion=criterion, optimizer=optimizer, epoch=epoch,
batch_size=FLAGS.batch_size, query_size=FLAGS.query_size,
neg_num=FLAGS.neg_num, update_every=FLAGS.update_every,
debug=FLAGS.debug)
# Write a scalar summary.
tf.summary.scalar('train_epoch_loss', loss, step=epoch)
# Forces summary writer to send any buffered data to storage.
writer.flush()
# Evaluate on validation set.
if FLAGS.validation_type is not None and (epoch % FLAGS.test_freq == 0 or
epoch == 1):
avg_neg_distance = val_dataset.create_epoch_tuples(model)
def _val_gen():
return (inst for inst in val_dataset)
val_loader = tf.data.Dataset.from_generator(
_val_gen, output_types=tuple(train_dataset_output_types))
loss = train_utils.train_val_one_epoch(
loader=iter(val_loader), model=model,
criterion=criterion, optimizer=None,
epoch=epoch, train=False, batch_size=FLAGS.batch_size,
query_size=FLAGS.query_size, neg_num=FLAGS.neg_num,
update_every=FLAGS.update_every, debug=FLAGS.debug)
tf.summary.scalar('val_epoch_loss', loss, step=epoch)
writer.flush()
# Evaluate on test datasets every test_freq epochs.
if epoch == 1 or epoch % FLAGS.test_freq == 0:
train_utils.test_retrieval(
FLAGS.test_datasets, model, writer=writer, epoch=epoch,
model_directory=model_directory,
precompute_whitening=FLAGS.precompute_whitening,
data_root=FLAGS.data_root, multiscale=FLAGS.multiscale)
# Saving checkpoints and model weights.
try:
save_path = manager.save(checkpoint_number=epoch)
global_features_utils.debug_and_log(
'Saved ({}) at {}'.format(epoch, save_path))
filename = os.path.join(model_directory,
'checkpoint_epoch_{}.h5'.format(epoch))
model.save_weights(filename, save_format='h5')
global_features_utils.debug_and_log(
'Saved weights ({}) at {}'.format(epoch, filename))
except Exception as ex:
global_features_utils.debug_and_log(
'Could not save checkpoint: {}'.format(ex))
if __name__ == '__main__':
app.run(main)
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training utilities for Global Features model."""
import os
import pickle
import time
import numpy as np
import tensorflow as tf
from delf.python import whiten
from delf.python.datasets.revisited_op import dataset as test_dataset
from delf.python.datasets.sfm120k import sfm120k
from delf.python.training import global_features_utils
from delf.python.training.model import global_model
def _compute_loss_and_gradient(criterion, model, input, target, neg_num=5):
"""Records gradients and loss through the network.
Args:
criterion: Loss function.
model: Network for the gradient computation.
input: Tuple of query, positive and negative images.
target: List of indexes to specify queries (-1), positives(1), negatives(0).
neg_num: Integer, number of negatives per tuple.
Returns:
loss: Loss for the training step.
gradients: Computed gradients for the network trainable variables.
"""
# Record gradients and loss through the network.
with tf.GradientTape() as tape:
descriptors = tf.zeros(shape=(0, model.meta['outputdim']), dtype=tf.float32)
for img in input:
# Compute descriptor vector for each image.
o = model(tf.expand_dims(img, axis=0), training=True)
descriptors = tf.concat([descriptors, o], 0)
queries = descriptors[target == -1]
positives = descriptors[target == 1]
negatives = descriptors[target == 0]
negatives = tf.reshape(negatives, [tf.shape(queries)[0], neg_num,
model.meta['outputdim']])
# Loss calculation.
loss = criterion(queries, positives, negatives)
return loss, tape.gradient(loss, model.trainable_variables)
def train_val_one_epoch(
loader, model, criterion, optimizer, epoch, train=True, batch_size=5,
query_size=2000, neg_num=5, update_every=1, debug=False):
"""Executes either training or validation step based on `train` value.
Args:
loader: Training/validation iterable dataset.
model: Network to train/validate.
criterion: Loss function.
optimizer: Network optimizer.
epoch: Integer, epoch number.
train: Bool, specifies training or validation phase.
batch_size: Integer, number of (q,p,n1,...,nN) tuples in a mini-batch.
query_size: Integer, number of queries randomly drawn per one training
epoch.
neg_num: Integer, number of negatives per tuple.
update_every: Integer, update model weights every N batches; used to
handle relatively large batches, so the effective batch size becomes
update_every x batch_size.
debug: Bool, whether debug mode is used.
Returns:
average_epoch_loss: Average epoch loss.
"""
batch_time = global_features_utils.AverageMeter()
data_time = global_features_utils.AverageMeter()
losses = global_features_utils.AverageMeter()
# Retrieve all trainable variables we defined in the graph.
tvs = model.trainable_variables
accum_grads = [tf.zeros_like(tv.read_value()) for tv in tvs]
end = time.time()
batch_num = 0
print_frequency = 10
all_batch_num = query_size // batch_size
state = 'Train' if train else 'Val'
global_features_utils.debug_and_log('>> {} step:'.format(state))
# For every batch in the dataset; Stops when all batches in the dataset have
# been processed.
while True:
data_time.update(time.time() - end)
if train:
try:
# Train on one batch.
# Each image in the batch is loaded into memory consecutively.
for _ in range(batch_size):
# Because the images are not necessarily of the same size, we can't
# set the batch size with .batch().
batch = loader.get_next()
input_tuple = batch[0:-1]
target_tuple = batch[-1]
loss_value, grads = _compute_loss_and_gradient(
criterion, model, input_tuple, target_tuple, neg_num)
losses.update(loss_value)
# Accumulate gradients element-wise (a plain list `+=` would
# concatenate the lists instead of summing the tensors).
accum_grads = [
accum_grad + grad for accum_grad, grad in zip(accum_grads, grads)
]
# Perform weight update if required.
if (batch_num + 1) % update_every == 0 or (
batch_num + 1) == all_batch_num:
# Do one step for multiple batches. Accumulated gradients are
# used.
optimizer.apply_gradients(
zip(accum_grads, model.trainable_variables))
accum_grads = [tf.zeros_like(tv.read_value()) for tv in tvs]
# We break when we run out of range, i.e., we exhausted all dataset
# images.
except tf.errors.OutOfRangeError:
break
else:
# Validate one batch.
# We load full batch into memory.
input = []
target = []
try:
for _ in range(batch_size):
# Because the images are not necessarily of the same size, we can't
# set the batch size with .batch().
batch = loader.get_next()
input.append(batch[0:-1])
target.append(batch[-1])
# We break when we run out of range, i.e., we exhausted all dataset
# images.
except tf.errors.OutOfRangeError:
break
descriptors = tf.zeros(shape=(0, model.meta['outputdim']),
dtype=tf.float32)
for input_tuple in input:
for img in input_tuple:
# Compute the global descriptor vector.
model_out = model(tf.expand_dims(img, axis=0), training=False)
descriptors = tf.concat([descriptors, model_out], 0)
# No need to reduce memory consumption (no backward pass):
# compute the loss for the full batch.
# `target` is a list of per-tuple label tensors; stack it into a single
# tensor so it can be used as a boolean mask over the descriptors.
target = tf.concat(target, axis=0)
queries = descriptors[target == -1]
positives = descriptors[target == 1]
negatives = descriptors[target == 0]
negatives = tf.reshape(negatives, [tf.shape(queries)[0], neg_num,
model.meta['outputdim']])
loss = criterion(queries, positives, negatives)
# Record loss.
losses.update(loss / batch_size, batch_size)
# Measure elapsed time.
batch_time.update(time.time() - end)
end = time.time()
# Record immediate loss and elapsed time.
if debug and ((batch_num + 1) % print_frequency == 0 or
batch_num == 0 or (batch_num + 1) == all_batch_num):
global_features_utils.debug_and_log(
'>> {0}: [{1} epoch][{2}/{3} batch]\t'
' Time val: {batch_time.val:.3f} '
'(Batch Time avg: {batch_time.avg:.3f})\t'
' Data {data_time.val:.3f} (Time avg: {data_time.avg:.3f})\t'
' Immediate loss value: {loss.val:.4f} '
'(Loss avg: {loss.avg:.4f})'.format(
state, epoch, batch_num + 1, all_batch_num,
batch_time=batch_time,
data_time=data_time, loss=losses), debug=True, log=False)
batch_num += 1
return losses.avg
def test_retrieval(datasets, net, epoch, writer=None, model_directory=None,
precompute_whitening=None, data_root='data', multiscale=[1.],
test_image_size=1024):
"""Testing step.
Evaluates the network on the provided test datasets by computing single-scale
mAP for easy/medium/hard cases. If `writer` is specified, saves the mAP
values in a tensorboard supported format.
Args:
datasets: List of dataset names for model testing (from
`_TEST_DATASET_NAMES`).
net: Network to evaluate.
epoch: Integer, epoch number.
writer: Tensorboard writer.
model_directory: String, path to the model directory.
precompute_whitening: Dataset used to learn whitening. If no
precomputation required, then `None`. Only 'retrieval-SfM-30k' and
'retrieval-SfM-120k' datasets are supported for whitening pre-computation.
data_root: Absolute path to the data folder.
multiscale: List of scales for multiscale testing.
test_image_size: Integer, maximum size of the test images.
"""
global_features_utils.debug_and_log(">> Testing step:")
global_features_utils.debug_and_log(
'>> Evaluating network on test datasets...')
# Precompute whitening.
if precompute_whitening is not None:
# If whitening already precomputed, load it and skip the computations.
filename = os.path.join(
model_directory, 'learned_whitening_mP_{}_epoch.pkl'.format(epoch))
filename_layer = os.path.join(
model_directory,
'learned_whitening_layer_config_{}_epoch.pkl'.format(
epoch))
if tf.io.gfile.exists(filename):
global_features_utils.debug_and_log(
'>> {}: Whitening for this epoch is already precomputed. '
'Loading...'.format(precompute_whitening))
with tf.io.gfile.GFile(filename, 'rb') as learned_whitening_file:
learned_whitening = pickle.load(learned_whitening_file)
else:
start = time.time()
global_features_utils.debug_and_log(
'>> {}: Learning whitening...'.format(precompute_whitening))
# Loading db.
db_root = os.path.join(data_root, 'train', precompute_whitening)
ims_root = os.path.join(db_root, 'ims')
db_filename = os.path.join(db_root,
'{}-whiten.pkl'.format(precompute_whitening))
with tf.io.gfile.GFile(db_filename, 'rb') as f:
db = pickle.load(f)
images = [sfm120k.id2filename(db['cids'][i], ims_root) for i in
range(len(db['cids']))]
# Extract whitening vectors.
global_features_utils.debug_and_log(
'>> {}: Extracting...'.format(precompute_whitening))
wvecs = global_model.extract_global_descriptors_from_list(net, images,
test_image_size)
# Learning whitening.
global_features_utils.debug_and_log(
'>> {}: Learning...'.format(precompute_whitening))
wvecs = wvecs.numpy()
mean_vector, projection_matrix = whiten.whitenlearn(wvecs, db['qidxs'],
db['pidxs'])
learned_whitening = {'m': mean_vector, 'P': projection_matrix}
global_features_utils.debug_and_log(
'>> {}: Elapsed time: {}'.format(precompute_whitening,
global_features_utils.htime(
time.time() - start)))
# Save learned_whitening parameters for a later use.
with tf.io.gfile.GFile(filename, 'wb') as learned_whitening_file:
pickle.dump(learned_whitening, learned_whitening_file)
# Saving whitening as a layer.
bias = -np.dot(mean_vector.T, projection_matrix.T)
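# With kernel = P^T and bias = -m^T P^T, the Dense layer below maps a
# row-vector descriptor x to x P^T - m^T P^T = (x - m) P^T, i.e. it applies
# the learned centering and whitening projection in a single layer.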
whitening_layer = tf.keras.layers.Dense(
net.meta['outputdim'],
activation=None,
use_bias=True,
kernel_initializer=tf.keras.initializers.Constant(
projection_matrix.T),
bias_initializer=tf.keras.initializers.Constant(bias)
)
with tf.io.gfile.GFile(filename_layer, 'wb') as learned_whitening_file:
pickle.dump(whitening_layer.get_config(), learned_whitening_file)
else:
learned_whitening = None
# Evaluate on test datasets.
for dataset in datasets:
start = time.time()
# Prepare config structure for the test dataset.
cfg = test_dataset.CreateConfigForTestDataset(dataset,
os.path.join(data_root))
images = [cfg['im_fname'](cfg, i) for i in range(cfg['n'])]
qimages = [cfg['qim_fname'](cfg, i) for i in range(cfg['nq'])]
bounding_boxes = [tuple(cfg['gnd'][i]['bbx']) for i in range(cfg['nq'])]
# Extract database and query vectors.
global_features_utils.debug_and_log(
'>> {}: Extracting database images...'.format(dataset))
vecs = global_model.extract_global_descriptors_from_list(
net, images, test_image_size, scales=multiscale)
global_features_utils.debug_and_log(
'>> {}: Extracting query images...'.format(dataset))
qvecs = global_model.extract_global_descriptors_from_list(
net, qimages, test_image_size, bounding_boxes,
scales=multiscale)
global_features_utils.debug_and_log('>> {}: Evaluating...'.format(dataset))
# Convert the obtained descriptors to numpy.
vecs = vecs.numpy()
qvecs = qvecs.numpy()
# Search, rank and print test set metrics.
_calculate_metrics_and_export_to_tensorboard(vecs, qvecs, dataset, cfg,
writer, epoch, whiten=False)
if learned_whitening is not None:
# Whiten the vectors.
mean_vector = learned_whitening['m']
projection_matrix = learned_whitening['P']
vecs_lw = whiten.whitenapply(vecs, mean_vector, projection_matrix)
qvecs_lw = whiten.whitenapply(qvecs, mean_vector, projection_matrix)
# Search, rank, and print.
_calculate_metrics_and_export_to_tensorboard(
vecs_lw, qvecs_lw, dataset, cfg, writer, epoch, whiten=True)
global_features_utils.debug_and_log(
'>> {}: Elapsed time: {}'.format(
dataset, global_features_utils.htime(time.time() - start)))
def _calculate_metrics_and_export_to_tensorboard(vecs, qvecs, dataset, cfg,
writer, epoch, whiten=False):
"""
Calculates metrics and exports them to tensorboard.
Args:
vecs: Numpy array dataset global descriptors.
qvecs: Numpy array query global descriptors.
dataset: String, one of `_TEST_DATASET_NAMES`.
cfg: Dataset configuration.
writer: Tensorboard writer.
epoch: Integer, epoch number.
whiten: Boolean, whether the metrics are computed with whitening applied
as a post-processing step. Affects the name of the exported TensorBoard
metrics.
"""
# Search, rank and print test set metrics.
scores = np.dot(vecs.T, qvecs)
ranks = np.transpose(np.argsort(-scores, axis=0))
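# Descriptors are stored column-wise ([dim, n]) and L2-normalized, so
# vecs.T @ qvecs holds cosine similarities; `ranks` lists, per query, the
# database indices sorted by decreasing similarity.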
metrics = global_features_utils.compute_metrics_and_print(dataset, ranks,
cfg['gnd'])
# Save calculated metrics in a tensorboard format.
if writer:
if whiten:
metric_names = ['test_accuracy_whiten_{}_E'.format(dataset),
'test_accuracy_whiten_{}_M'.format(dataset),
'test_accuracy_whiten_{}_H'.format(dataset)]
else:
metric_names = ['test_accuracy_{}_E'.format(dataset),
'test_accuracy_{}_M'.format(dataset),
'test_accuracy_{}_H'.format(dataset)]
tf.summary.scalar(metric_names[0], metrics[0][0], step=epoch)
tf.summary.scalar(metric_names[1], metrics[1][0], step=epoch)
tf.summary.scalar(metric_names[2], metrics[2][0], step=epoch)
writer.flush()
return None
@@ -21,7 +21,7 @@ from absl import logging
import numpy as np
import tensorflow as tf
from delf.python.datasets.revisited_op import dataset
from delf.python.datasets.revisited_op import dataset as revisited_dataset
class AverageMeter():
@@ -40,7 +40,6 @@ class AverageMeter():
def update(self, val, n=1):
"""Updates values in the AverageMeter.
Args:
val: Float, loss value.
n: Integer, number of instances.
@@ -57,7 +56,6 @@ def compute_metrics_and_print(dataset_name,
desired_pr_ranks=None,
log=True):
"""Computes and logs ground-truth metrics for Revisited datasets.
Args:
dataset_name: String, name of the dataset.
sorted_index_ids: Integer NumPy array of shape [#queries, #index_images].
@@ -71,7 +69,6 @@ def compute_metrics_and_print(dataset_name,
precision@10/recall@10 are desired, this should be set to [1, 10]. The
largest item should be <= #sorted_index_ids. Default: [1, 5, 10].
log: Whether to log results using logging.info().
Returns:
mAP: (metricsE, metricsM, metricsH) Tuple of the metrics for different
levels of complexity. Each metrics is a list containing:
@@ -81,53 +78,53 @@ def compute_metrics_and_print(dataset_name,
(NumPy array of floats, with shape [#queries]), precisions (NumPy array of
floats, with shape [#queries, len(desired_pr_ranks)]), recalls (NumPy
array of floats, with shape [#queries, len(desired_pr_ranks)]).
Raises:
ValueError: If an unknown dataset name is provided as an argument.
"""
if dataset not in dataset.DATASET_NAMES:
if dataset_name not in revisited_dataset.DATASET_NAMES:
raise ValueError('Unknown dataset: {}!'.format(dataset_name))
if desired_pr_ranks is None:
desired_pr_ranks = [1, 5, 10]
(easy_ground_truth, medium_ground_truth,
hard_ground_truth) = dataset.ParseEasyMediumHardGroundTruth(ground_truth)
metrics_easy = dataset.ComputeMetrics(sorted_index_ids, easy_ground_truth,
desired_pr_ranks)
metrics_medium = dataset.ComputeMetrics(sorted_index_ids, medium_ground_truth,
desired_pr_ranks)
metrics_hard = dataset.ComputeMetrics(sorted_index_ids, hard_ground_truth,
desired_pr_ranks)
hard_ground_truth) = revisited_dataset.ParseEasyMediumHardGroundTruth(
ground_truth)
metrics_easy = revisited_dataset.ComputeMetrics(sorted_index_ids,
easy_ground_truth,
desired_pr_ranks)
metrics_medium = revisited_dataset.ComputeMetrics(sorted_index_ids,
medium_ground_truth,
desired_pr_ranks)
metrics_hard = revisited_dataset.ComputeMetrics(sorted_index_ids,
hard_ground_truth,
desired_pr_ranks)
debug_and_log(
'>> {}: mAP E: {}, M: {}, H: {}'.format(
dataset_name, np.around(metrics_easy[0] * 100, decimals=2),
np.around(metrics_medium[0] * 100, decimals=2),
np.around(metrics_hard[0] * 100, decimals=2)),
log=log)
'>> {}: mAP E: {}, M: {}, H: {}'.format(
dataset_name, np.around(metrics_easy[0] * 100, decimals=2),
np.around(metrics_medium[0] * 100, decimals=2),
np.around(metrics_hard[0] * 100, decimals=2)),
log=log)
debug_and_log(
'>> {}: mP@k{} E: {}, M: {}, H: {}'.format(
dataset_name, desired_pr_ranks,
np.around(metrics_easy[1] * 100, decimals=2),
np.around(metrics_medium[1] * 100, decimals=2),
np.around(metrics_hard[1] * 100, decimals=2)),
log=log)
'>> {}: mP@k{} E: {}, M: {}, H: {}'.format(
dataset_name, desired_pr_ranks,
np.around(metrics_easy[1] * 100, decimals=2),
np.around(metrics_medium[1] * 100, decimals=2),
np.around(metrics_hard[1] * 100, decimals=2)),
log=log)
return metrics_easy, metrics_medium, metrics_hard
def htime(time_difference):
"""Time formatting function.
Depending on the value of `time_difference` outputs time in an appropriate
time format.
Args:
time_difference: Float, time difference between the two events.
Returns:
time: String representing time in an appropriate time format.
"""
@@ -149,7 +146,6 @@ def htime(time_difference):
def debug_and_log(msg, debug=True, log=True, debug_on_the_same_line=False):
"""Outputs `msg` to both stdout (if in the debug mode) and the log file.
Args:
msg: String, message to be logged.
debug: Bool, if True, will print `msg` to stdout.
@@ -168,14 +164,13 @@ def debug_and_log(msg, debug=True, log=True, debug_on_the_same_line=False):
def get_standard_keras_models():
"""Gets the standard keras model names.
Returns:
model_names: List, names of the standard keras models.
"""
model_names = sorted(
name for name in tf.keras.applications.__dict__
if not name.startswith('__') and
callable(tf.keras.applications.__dict__[name]))
name for name in tf.keras.applications.__dict__
if not name.startswith('__') and
callable(tf.keras.applications.__dict__[name]))
return model_names
@@ -184,9 +179,7 @@ def create_model_directory(training_dataset, arch, pool, whitening, pretrained,
neg_num, query_size, pool_size, batch_size,
update_every, image_size, directory):
"""Based on the model parameters, creates the model directory.
If the model directory does not exist, the directory is created.
Args:
training_dataset: String, training dataset name.
arch: String, model architecture.
@@ -206,7 +199,6 @@ def create_model_directory(training_dataset, arch, pool, whitening, pretrained,
update_every: Integer, frequency of the model weights update.
image_size: Integer, maximum size of longer image side used for training.
directory: String, destination where trained network should be saved.
Returns:
folder: String, path to the model folder.
"""
@@ -223,7 +215,7 @@ def create_model_directory(training_dataset, arch, pool, whitening, pretrained,
folder = os.path.join(directory, folder)
debug_and_log(
'>> Creating directory if does not exist:\n>> \'{}\''.format(folder))
'>> Creating directory if does not exist:\n>> \'{}\''.format(folder))
if not os.path.exists(folder):
os.makedirs(folder)
return folder
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN Image Retrieval model implementation based on the following papers:
[1] Fine-tuning CNN Image Retrieval with No Human Annotation,
Radenović F., Tolias G., Chum O., TPAMI 2018 [arXiv]
https://arxiv.org/abs/1711.02512
[2] CNN Image Retrieval Learns from BoW: Unsupervised Fine-Tuning with Hard
Examples, Radenović F., Tolias G., Chum O., ECCV 2016 [arXiv]
https://arxiv.org/abs/1604.02426
"""
import os
import pickle
import tensorflow as tf
from delf.python.datasets import generic_dataset
from delf.python.normalization_layers import normalization
from delf.python.pooling_layers import pooling as pooling_layers
from delf.python.training import global_features_utils
# Pre-computed global whitening, for most commonly used architectures.
# Using pre-computed whitening improves the speed of the convergence and the
# performance.
_WHITENING_CONFIG = {
'ResNet50': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_ResNet50_gem_learned_whitening_config.pkl',
'ResNet101': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_ResNet101_gem_learned_whitening_config.pkl',
'ResNet152': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_ResNet152_gem_learned_whitening_config.pkl',
'VGG19': 'http://cmp.felk.cvut.cz/cnnimageretrieval_tf'
'/SFM120k_VGG19_gem_learned_whitening_config.pkl'
}
# Possible global pooling layers.
_POOLING = {
'mac': pooling_layers.MAC,
'spoc': pooling_layers.SPoC,
'gem': pooling_layers.GeM
}
# Output dimensionality for supported architectures.
_OUTPUT_DIM = {
'VGG16': 512,
'VGG19': 512,
'ResNet50': 2048,
'ResNet101': 2048,
'ResNet101V2': 2048,
'ResNet152': 2048,
'DenseNet121': 1024,
'DenseNet169': 1664,
'DenseNet201': 1920,
'EfficientNetB5': 2048,
'EfficientNetB7': 2560
}
class GlobalFeatureNet(tf.keras.Model):
"""Instantiates global model for image retrieval.
This class implements the [GlobalFeatureNet](
https://arxiv.org/abs/1711.02512) for image retrieval. The model uses a
user-defined model as a backbone.
"""
def __init__(self, architecture='ResNet101', pooling='gem',
whitening=False, pretrained=True, data_root=''):
"""GlobalFeatureNet network initialization.
Args:
architecture: Network backbone.
pooling: Pooling method used 'mac'/'spoc'/'gem'.
whitening: Bool, whether to use whitening.
pretrained: Bool, whether to initialize the network with the weights
pretrained on ImageNet.
data_root: String, path to the data folder where the precomputed
whitening is/will be saved in case `whitening` is True.
Raises:
ValueError: If `architecture` is not supported.
"""
if architecture not in _OUTPUT_DIM.keys():
raise ValueError("Architecture {} is not supported.".format(architecture))
super(GlobalFeatureNet, self).__init__()
# Get standard output dimensionality size.
dim = _OUTPUT_DIM[architecture]
if pretrained:
# Initialize with network pretrained on imagenet.
net_in = getattr(tf.keras.applications, architecture)(include_top=False,
weights="imagenet")
else:
# Initialize with random weights.
net_in = getattr(tf.keras.applications, architecture)(include_top=False,
weights=None)
# Initialize `feature_extractor`. Take only convolutions for
# `feature_extractor`, always end with ReLU to make last activations
# non-negative.
if architecture.lower().startswith('densenet'):
tmp_model = tf.keras.Sequential()
tmp_model.add(net_in)
net_in = tmp_model
net_in.add(tf.keras.layers.ReLU())
# Initialize pooling.
self.pool = _POOLING[pooling]()
# Initialize whitening.
if whitening:
if pretrained and architecture in _WHITENING_CONFIG:
# If precomputed whitening for the architecture exists,
# the fully-connected layer is going to be initialized according to
# the precomputed layer configuration.
global_features_utils.debug_and_log(
">> {}: for '{}' custom computed whitening '{}' is used."
.format(os.getcwd(), architecture,
os.path.basename(_WHITENING_CONFIG[architecture])))
# The layer configuration is downloaded to the `data_root` folder.
whiten_dir = os.path.join(data_root, architecture)
path = tf.keras.utils.get_file(fname=whiten_dir,
origin=_WHITENING_CONFIG[architecture])
# Whitening configuration is loaded.
with tf.io.gfile.GFile(path, 'rb') as learned_whitening_file:
whitening_config = pickle.load(learned_whitening_file)
# Whitening layer is initialized according to the configuration.
self.whiten = tf.keras.layers.Dense.from_config(whitening_config)
else:
# If no precomputed whitening exists for the chosen architecture, the
# fully-connected whitening layer is initialized with random weights.
self.whiten = tf.keras.layers.Dense(dim, activation=None, use_bias=True)
global_features_utils.debug_and_log(
">> There is either no whitening computed for the used network "
"architecture or `pretrained` is False; random weights are used.")
else:
self.whiten = None
# Create meta information to be stored in the network.
self.meta = {
'architecture': architecture,
'pooling': pooling,
'whitening': whitening,
'outputdim': dim
}
self.feature_extractor = net_in
self.normalize = normalization.L2Normalization()
def call(self, x, training=False):
"""Invokes the GlobalFeatureNet instance.
Args:
x: [B, H, W, C] Tensor with a batch of images.
training: Indicator of whether the forward pass is running in training
mode or not.
Returns:
out: [B, out_dim] Global descriptor.
"""
# Forward pass through the fully-convolutional backbone.
o = self.feature_extractor(x, training)
# Pooling.
o = self.pool(o)
# Normalization.
o = self.normalize(o)
# If whitening exists: the pooled global descriptor is whitened and
# re-normalized.
if self.whiten is not None:
o = self.whiten(o)
o = self.normalize(o)
return o
def meta_repr(self):
"""Provides high-level information about the network.
Returns:
meta: String with the information about the network (used
architecture, pooling type, whitening, outputdim).
"""
tmpstr = '(meta):\n'
tmpstr += '\tarchitecture: {}\n'.format(self.meta['architecture'])
tmpstr += '\tpooling: {}\n'.format(self.meta['pooling'])
tmpstr += '\twhitening: {}\n'.format(self.meta['whitening'])
tmpstr += '\toutputdim: {}\n'.format(self.meta['outputdim'])
return tmpstr
def extract_global_descriptors_from_list(net, images, image_size,
bounding_boxes=None, scales=[1.],
multi_scale_power=1., print_freq=10):
"""Extracting global descriptors from a list of images.
Args:
net: Model object, network for the forward pass.
images: Absolute image paths as strings.
image_size: Integer, defines the maximum size of longer image side.
bounding_boxes: List of (x1,y1,x2,y2) tuples to crop the query images.
scales: List of float scales.
multi_scale_power: Float, multi-scale normalization power parameter.
print_freq: Printing frequency for debugging.
Returns:
descriptors: Global descriptors for the input images.
"""
# Creating dataset loader.
data = generic_dataset.ImagesFromList(root='', image_paths=images,
imsize=image_size,
bounding_boxes=bounding_boxes)
def _data_gen():
return (inst for inst in data)
loader = tf.data.Dataset.from_generator(_data_gen, output_types=(tf.float32))
loader = loader.batch(1)
# Extracting vectors.
descriptors = tf.zeros((0, net.meta['outputdim']))
for i, input in enumerate(loader):
if len(scales) == 1 and scales[0] == 1:
descriptors = tf.concat([descriptors, net(input)], 0)
else:
descriptors = tf.concat(
[descriptors, extract_multi_scale_descriptor(
net, input, scales, multi_scale_power)], 0)
if (i + 1) % print_freq == 0 or (i + 1) == len(images):
global_features_utils.debug_and_log(
'\r>>>> {}/{} done...'.format((i + 1), len(images)),
debug_on_the_same_line=True)
global_features_utils.debug_and_log('', log=False)
descriptors = tf.transpose(descriptors, perm=[1, 0])
return descriptors
def extract_multi_scale_descriptor(net, input, scales, multi_scale_power):
"""Extracts the global descriptor multi scale.
Args:
net: Model object, network for the forward pass.
input: [B, H, W, C] input tensor in channel-last (BHWC) configuration.
scales: List of float scales.
multi_scale_power: Float, multi-scale normalization power parameter.
Returns:
descriptors: Multi-scale global descriptors for the input images.
"""
descriptors = tf.zeros(net.meta['outputdim'])
for s in scales:
if s == 1:
input_t = input
else:
output_shape = s * tf.shape(input)[1:3].numpy()
input_t = tf.image.resize(input, output_shape,
method='bilinear',
preserve_aspect_ratio=True)
descriptors += tf.pow(net(input_t), multi_scale_power)
descriptors /= len(scales)
descriptors = tf.pow(descriptors, 1. / multi_scale_power)
descriptors /= tf.norm(descriptors)
return descriptors
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the GlobalFeatureNet backbone."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import flags
import numpy as np
from PIL import Image
import tensorflow as tf
from delf.python.training.model import global_model
FLAGS = flags.FLAGS
class GlobalFeatureNetTest(tf.test.TestCase):
"""Tests for the GlobalFeatureNet backbone."""
def testInitModel(self):
"""Testing GlobalFeatureNet initialization."""
# Testing GlobalFeatureNet initialization.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
expected_meta = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'outputdim': 2048}
self.assertEqual(expected_meta, model.meta)
def testExtractVectors(self):
"""Tests extraction of global descriptors from list."""
# Initializing network for testing.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
# Number of images to be created.
n = 2
image_paths = []
# Create `n` dummy images.
for i in range(n):
dummy_image = np.random.rand(1024, 750, 3) * 255
img_out = Image.fromarray(dummy_image.astype('uint8')).convert('RGB')
filename = os.path.join(FLAGS.test_tmpdir, 'test_image_{}.jpg'.format(i))
img_out.save(filename)
image_paths.append(filename)
descriptors = global_model.extract_global_descriptors_from_list(
model, image_paths, image_size=1024, bounding_boxes=None,
scales=[1., 3.], multi_scale_power=2, print_freq=1)
self.assertAllEqual([2048, 2], tf.shape(descriptors))
def testExtractMultiScale(self):
"""Tests multi-scale global descriptor extraction."""
# Initializing network for testing.
model_params = {'architecture': 'ResNet101', 'pooling': 'gem',
'whitening': False, 'pretrained': True}
model = global_model.GlobalFeatureNet(**model_params)
input = tf.random.uniform([2, 1024, 750, 3], dtype=tf.float32, seed=0)
descriptors = global_model.extract_multi_scale_descriptor(
model, input, scales=[1., 3.], multi_scale_power=2)
self.assertAllEqual([2, 2048], tf.shape(descriptors))
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for tensorboard."""
from tensorboard import program
from delf.python.training import global_features_utils
def launch_tensorboard(log_dir):
"""Runs tensorboard with the given `log_dir`.
Args:
log_dir: String, directory to launch tensorboard in.
"""
tensorboard = program.TensorBoard()
tensorboard.configure(argv=[None, '--logdir', log_dir])
url = tensorboard.launch()
global_features_utils.debug_and_log("Launching Tensorboard: {}".format(url))
......@@ -75,7 +75,7 @@ documentation of the Object Detection API:
### DeepMAC architecture
We have released our new architecture, **DeepMAC**, desgined for partially
We have released our new architecture, **DeepMAC**, designed for partially
supervised instance segmentation. DeepMAC stands for Deep Mask-heads
Above CenterNet, and is based on our CenterNet implementation. In our
[paper](https://arxiv.org/abs/2104.00613) we show that DeepMAC achieves
......
......@@ -2003,8 +2003,20 @@ def _resize_masks(masks, height, width, method):
class CenterNetMaskTargetAssigner(object):
"""Wrapper to compute targets for segmentation masks."""
def __init__(self, stride):
def __init__(self, stride, boxes_scale=1.0):
"""Constructor.
Args:
stride: The stride of the network. Targets are assigned at the output
stride.
boxes_scale: Scale to apply to boxes before producing mask weights. This
is meant to ensure the full object region is properly weighted prior to
applying loss. A value of ~1.05 is typically applied when object regions
should be blacked out (perhaps because valid groundtruth masks are not
present).
"""
self._stride = stride
self._boxes_scale = boxes_scale
def assign_segmentation_targets(
self, gt_masks_list, gt_classes_list, gt_boxes_list=None,
......@@ -2072,7 +2084,7 @@ class CenterNetMaskTargetAssigner(object):
segmentation_weight_for_image = (
ta_utils.blackout_pixel_weights_by_box_regions(
output_height, output_width, boxes_absolute.get(), blackout,
weights=gt_mask_weights))
weights=gt_mask_weights, boxes_scale=self._boxes_scale))
segmentation_weights_list.append(segmentation_weight_for_image)
else:
segmentation_weights_list.append(tf.ones((output_height, output_width),
......
FROM tensorflow/tensorflow:latest-gpu
ARG DEBIAN_FRONTEND=noninteractive
# Install apt dependencies
RUN apt-get update && apt-get install -y \
git \
gpg-agent \
python3-cairocffi \
protobuf-compiler \
python3-pil \
python3-lxml \
python3-tk \
python3-opencv \
wget
# Install the Google Cloud SDK; this is mostly for using gsutil to export models.
RUN wget -nv \
https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz && \
mkdir /root/tools && \
tar xvzf google-cloud-sdk.tar.gz -C /root/tools && \
rm google-cloud-sdk.tar.gz && \
/root/tools/google-cloud-sdk/install.sh --usage-reporting=false \
--path-update=false --bash-completion=false \
--disable-installation-options && \
rm -rf /root/.config/* && \
ln -s /root/.config /config && \
rm -rf /root/tools/google-cloud-sdk/.install/.backup
# Path configuration
ENV PATH $PATH:/root/tools/google-cloud-sdk/bin
# Make sure gsutil will use the default service account
RUN echo '[GoogleCompute]\nservice_account = default' > /etc/boto.cfg
WORKDIR /home/tensorflow
## Copy this code (make sure you are under the ../models/research directory)
COPY . /home/tensorflow/models
# Compile protobuf configs
RUN (cd /home/tensorflow/models/ && protoc object_detection/protos/*.proto --python_out=.)
WORKDIR /home/tensorflow/models/
RUN cp object_detection/packages/tf2/setup.py ./
ENV PATH="/home/tensorflow/.local/bin:${PATH}"
RUN python -m pip install -U pip
RUN python -m pip install .
ENTRYPOINT ["python", "object_detection/model_main_tf2.py"]
......@@ -24,22 +24,23 @@ A skeleton configuration file is shown below:
```
model {
(... Add model config here...)
(... Add model config here...)
}
train_config : {
(... Add train_config here...)
(... Add train_config here...)
}
train_input_reader: {
(... Add train_input configuration here...)
(... Add train_input configuration here...)
}
eval_config: {
(... Add eval_config here...)
}
eval_input_reader: {
(... Add eval_input configuration here...)
(... Add eval_input configuration here...)
}
```
......@@ -58,6 +59,106 @@ configuration files can be pasted into the `model` field of the skeleton
configuration. Users should note that the `num_classes` field should be changed
to a value suited for the dataset the user is training on.
### Anchor box parameters
Many object detection models use an anchor generator as a region-sampling
strategy, which generates a large number of anchor boxes in a range of shapes
and sizes, in many locations of the image. The detection algorithm then
incrementally offsets the anchor box closest to the ground truth until it
(closely) matches. You can specify the variety and position of these anchor
boxes in the `anchor_generator` config.
Usually, the anchor configs provided with pre-trained checkpoints are
designed for large/versatile datasets (COCO, ImageNet), in which the goal is to
improve accuracy for a wide range of object sizes and positions. But in most
real-world applications, objects are confined to a limited number of sizes. So
adjusting the anchors to be specific to your dataset and environment
can both improve model accuracy and reduce training time.
The format for these anchor box parameters differs depending on your model
architecture. For details about all fields, see the [`anchor_generator`
definition](https://github.com/tensorflow/models/blob/master/research/object_detection/protos/anchor_generator.proto).
On this page, we'll focus on parameters
used in a traditional single shot detector (SSD) model and SSD models with a
feature pyramid network (FPN) head.
Regardless of the model architecture, you'll need to understand the following
anchor box concepts:
+ **Scale**: This defines the variety of anchor box sizes. Each box size is
defined as a proportion of the original image size (for SSD models) or as a
factor of the filter's stride length (for FPN). The number of different sizes
is defined using a range of "scales" (relative to image size) or "levels" (the
level on the feature pyramid). For example, to detect small objects with the
configurations below, the `min_scale` and `min_level` are set to a small
value, while `max_scale` and `max_level` specify the largest objects to
detect.
+   **Aspect ratio**: This is the width/height ratio for the anchor boxes. For
example, an `aspect_ratios` value of `1.0` creates a square, and `2.0` creates
a 1:2 rectangle (landscape orientation). You can define as many aspect ratios
as you want, and each one is repeated at all anchor box scales.
Beware that increasing the total number of anchor boxes will exponentially
increase computation costs, whereas generating fewer anchors that have a higher
chance of overlapping with the ground truth will both improve accuracy and
reduce computation costs.
**Single Shot Detector (SSD) full model:**
Setting `num_layers` to 6 means the model generates each box aspect at 6
different sizes. The exact sizes are not specified, but they're evenly spaced
between the `min_scale` and `max_scale` values, which specify that the smallest
box size is 20% of the input image size and the largest is 95% of that size.
```
model {
ssd {
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
}
}
}
}
```
For more details, see [`ssd_anchor_generator.proto`](https://github.com/tensorflow/models/blob/master/research/object_detection/protos/ssd_anchor_generator.proto).
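To make the "evenly spaced" sizes above concrete, here is a minimal Python
sketch (an illustration only, not the anchor generator's actual code) of how
six scales between `min_scale` and `max_scale` could be derived:

```python
# Illustrative only: linearly interpolate num_layers scales between min_scale
# and max_scale (the real generator may add extra rules, such as a reduced set
# of boxes on the first layer).
def ssd_layer_scales(min_scale=0.2, max_scale=0.95, num_layers=6):
  return [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
          for i in range(num_layers)]

print(ssd_layer_scales())  # [0.2, 0.35, 0.5, 0.65, 0.8, 0.95]
```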
**SSD with Feature Pyramid Network (FPN) head:**
When using an FPN head, you must specify the anchor box size relative to the
convolutional filter's stride length at a given pyramid level, using
`anchor_scale`. So in this example, the box size is 4.0 multiplied by the
layer's stride length. The number of sizes you get for each aspect simply
depends on how many levels there are between the `min_level` and `max_level`.
```
model {
ssd {
anchor_generator {
multiscale_anchor_generator {
anchor_scale: 4.0
min_level: 3
max_level: 7
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
}
}
}
}
```
For more details, see [`multiscale_anchor_generator.proto`](https://github.com/tensorflow/models/blob/master/research/object_detection/protos/multiscale_anchor_generator.proto).
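As an illustration of the sizes this produces (assuming the common convention
that the stride at pyramid level `l` is `2^l` pixels), the base anchor size at
each level is `anchor_scale` times that stride:

```python
# Hypothetical sketch: base anchor size per FPN level, assuming stride = 2**level.
anchor_scale = 4.0
for level in range(3, 8):  # min_level=3 .. max_level=7
  stride = 2 ** level
  print(level, anchor_scale * stride)  # level 3 -> 32.0, ..., level 7 -> 512.0
```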
## Defining Inputs
The TensorFlow Object Detection API accepts inputs in the TFRecord file format.
......@@ -66,20 +167,21 @@ Additionally, users should also specify a label map, which defines the mapping
between a class id and class name. The label map should be identical between
training and evaluation datasets.
An example input configuration looks as follows:
An example training input configuration looks as follows:
```
tf_record_input_reader {
input_path: "/usr/home/username/data/train.record"
train_input_reader: {
tf_record_input_reader {
input_path: "/usr/home/username/data/train.record-?????-of-00010"
}
label_map_path: "/usr/home/username/data/label_map.pbtxt"
}
label_map_path: "/usr/home/username/data/label_map.pbtxt"
```
Users should substitute the `input_path` and `label_map_path` arguments and
insert the input configuration into the `train_input_reader` and
`eval_input_reader` fields in the skeleton configuration. Note that the paths
can also point to Google Cloud Storage buckets (ie.
"gs://project_bucket/train.record") for use on Google Cloud.
The `eval_input_reader` follows the same format. Users should substitute the
`input_path` and `label_map_path` arguments. Note that the paths can also point
to Google Cloud Storage buckets (i.e. "gs://project_bucket/train.record") to
pull datasets hosted on Google Cloud.
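For reference, the file referenced by `label_map_path` is a text protobuf that
maps integer class ids (starting at 1) to class names. A minimal example, with
placeholder class names, looks as follows:

```
item {
  id: 1
  name: 'cat'
}
item {
  id: 2
  name: 'dog'
}
```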
## Configuring the Trainer
......@@ -92,36 +194,38 @@ The `train_config` defines parts of the training process:
A sample `train_config` is below:
```
batch_size: 1
optimizer {
momentum_optimizer: {
learning_rate: {
manual_step_learning_rate {
initial_learning_rate: 0.0002
schedule {
step: 0
learning_rate: .0002
}
schedule {
step: 900000
learning_rate: .00002
}
schedule {
step: 1200000
learning_rate: .000002
train_config: {
batch_size: 1
optimizer {
momentum_optimizer: {
learning_rate: {
manual_step_learning_rate {
initial_learning_rate: 0.0002
schedule {
step: 0
learning_rate: .0002
}
schedule {
step: 900000
learning_rate: .00002
}
schedule {
step: 1200000
learning_rate: .000002
}
}
}
momentum_optimizer_value: 0.9
}
momentum_optimizer_value: 0.9
use_moving_average: false
}
use_moving_average: false
}
fine_tune_checkpoint: "/usr/home/username/tmp/model.ckpt-#####"
from_detection_checkpoint: true
load_all_detection_checkpoint_vars: true
gradient_clipping_by_norm: 10.0
data_augmentation_options {
random_horizontal_flip {
fine_tune_checkpoint: "/usr/home/username/tmp/model.ckpt-#####"
from_detection_checkpoint: true
load_all_detection_checkpoint_vars: true
gradient_clipping_by_norm: 10.0
data_augmentation_options {
random_horizontal_flip {
}
}
}
```
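The `manual_step_learning_rate` schedule above keeps the learning rate
piecewise constant and drops it at each `schedule` boundary; a small,
illustrative Python sketch of the resulting rate per global step:

```python
# Illustrative only: piecewise-constant learning rate following the schedule above.
def learning_rate_at(step):
  schedule = [(0, 2e-4), (900000, 2e-5), (1200000, 2e-6)]
  lr = schedule[0][1]
  for boundary, value in schedule:
    if step >= boundary:
      lr = value
  return lr

print(learning_rate_at(100))      # 0.0002
print(learning_rate_at(1000000))  # 2e-05
```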
......
......@@ -187,21 +187,28 @@ evaluation jobs for a few iterations [locally on their own machines](#local).
### Training with multiple GPUs
A user can start a training job on Cloud AI Platform using the following
command:
A user can start a training job on Cloud AI Platform following the instructions
at https://cloud.google.com/ai-platform/training/docs/custom-containers-training.
```bash
git clone https://github.com/tensorflow/models.git
# From the tensorflow/models/research/ directory
cp object_detection/packages/tf2/setup.py .
cp object_detection/dockerfiles/tf2_ai_platform/Dockerfile .
docker build -t gcr.io/${DOCKER_IMAGE_URI} .
docker push gcr.io/${DOCKER_IMAGE_URI}
```
```bash
gcloud ai-platform jobs submit training object_detection_`date +%m_%d_%Y_%H_%M_%S` \
--runtime-version 2.1 \
--python-version 3.6 \
--job-dir=gs://${MODEL_DIR} \
--package-path ./object_detection \
--module-name object_detection.model_main_tf2 \
--region us-central1 \
--master-machine-type n1-highcpu-16 \
--master-accelerator count=8,type=nvidia-tesla-v100 \
--master-image-uri gcr.io/${DOCKER_IMAGE_URI} \
--scale-tier CUSTOM \
-- \
--model_dir=gs://${MODEL_DIR} \
--pipeline_config_path=gs://${PIPELINE_CONFIG_PATH}
......@@ -210,15 +217,16 @@ gcloud ai-platform jobs submit training object_detection_`date +%m_%d_%Y_%H_%M_%
Where `gs://${MODEL_DIR}` specifies the directory on Google Cloud Storage where
the training checkpoints and events will be written to and
`gs://${PIPELINE_CONFIG_PATH}` points to the pipeline configuration stored on
Google Cloud Storage.
Google Cloud Storage, and `gcr.io/${DOCKER_IMAGE_URI}` points to the docker
image stored in Google Container Registry.
Users can monitor the progress of their training job on the
[ML Engine Dashboard](https://console.cloud.google.com/ai-platform/jobs).
### Training with TPU
Launching a training job with a TPU compatible pipeline config requires using a
similar command:
Launching a training job with a TPU compatible pipeline config requires using
the following command:
```bash
# From the tensorflow/models/research/ directory
......@@ -246,16 +254,11 @@ Evaluation jobs run on a single machine. Run the following command to start the
evaluation job:
```bash
# From the tensorflow/models/research/ directory
cp object_detection/packages/tf2/setup.py .
gcloud ai-platform jobs submit training object_detection_eval_`date +%m_%d_%Y_%H_%M_%S` \
--runtime-version 2.1 \
--python-version 3.6 \
--job-dir=gs://${MODEL_DIR} \
--package-path ./object_detection \
--module-name object_detection.model_main_tf2 \
--region us-central1 \
--scale-tier BASIC_GPU \
--master-image-uri gcr.io/${DOCKER_IMAGE_URI} \
-- \
--model_dir=gs://${MODEL_DIR} \
--pipeline_config_path=gs://${PIPELINE_CONFIG_PATH} \
......@@ -264,8 +267,9 @@ gcloud ai-platform jobs submit training object_detection_eval_`date +%m_%d_%Y_%H
where `gs://${MODEL_DIR}` points to the directory on Google Cloud Storage where
training checkpoints are saved and `gs://{PIPELINE_CONFIG_PATH}` points to where
the model configuration file stored on Google Cloud Storage. Evaluation events
are written to `gs://${MODEL_DIR}/eval`
the model configuration file is stored on Google Cloud Storage, and
`gcr.io/${DOCKER_IMAGE_URI}` points to the docker image stored in Google
Container Registry. Evaluation events are written to `gs://${MODEL_DIR}/eval`.
Typically one starts an evaluation job concurrently with the training job. Note
that we do not support running evaluation on TPU.
......
......@@ -122,7 +122,8 @@ def _extract_predictions_and_losses(model,
[input_dict[fields.InputDataFields.groundtruth_boxes]],
[tf.one_hot(input_dict[fields.InputDataFields.groundtruth_classes]
- label_id_offset, depth=model.num_classes)],
groundtruth_masks_list, groundtruth_keypoints_list)
groundtruth_masks_list=groundtruth_masks_list,
groundtruth_keypoints_list=groundtruth_keypoints_list)
losses_dict.update(model.loss(prediction_dict, true_image_shapes))
result_dict = eval_util.result_dict_for_single_example(
......
......@@ -598,7 +598,7 @@ def prediction_tensors_to_single_instance_kpts(
keypoint type, as it's possible to filter some candidates due to the score
threshold.
"""
batch_size, height, width, num_keypoints = _get_shape(
batch_size, _, _, num_keypoints = _get_shape(
keypoint_heatmap_predictions, 4)
# Get x, y and channel indices corresponding to the top indices in the
# keypoint heatmap predictions.
......@@ -612,24 +612,32 @@ def prediction_tensors_to_single_instance_kpts(
_multi_range(batch_size, value_repetitions=num_keypoints),
tf.reshape(y_indices, [-1]),
tf.reshape(x_indices, [-1]),
tf.reshape(channel_indices, [-1])
], axis=1)
# Reshape the offsets predictions to shape:
# [batch_size, height, width, num_keypoints, 2]
keypoint_heatmap_offsets = tf.reshape(
keypoint_heatmap_offsets, [batch_size, height, width, num_keypoints, -1])
# shape: [num_keypoints, 2]
# shape: [num_keypoints, num_keypoints * 2]
selected_offsets_flat = tf.gather_nd(keypoint_heatmap_offsets,
combined_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets_flat, axis=1)
# shape: [num_keypoints, num_keypoints, 2].
selected_offsets_flat = tf.reshape(
selected_offsets_flat, [num_keypoints, num_keypoints, -1])
# shape: [num_keypoints].
channel_indices = tf.keras.backend.flatten(channel_indices)
# shape: [num_keypoints, 2].
retrieve_indices = tf.stack([channel_indices, channel_indices], axis=1)
# shape: [num_keypoints, 2]
selected_offsets = tf.gather_nd(selected_offsets_flat, retrieve_indices)
y_offsets, x_offsets = tf.unstack(selected_offsets, axis=1)
keypoint_candidates = tf.stack([
tf.cast(y_indices, dtype=tf.float32) + tf.expand_dims(y_offsets, axis=0),
tf.cast(x_indices, dtype=tf.float32) + tf.expand_dims(x_offsets, axis=0)
], axis=2)
keypoint_candidates = tf.expand_dims(keypoint_candidates, axis=0)
# Append the channel indices back to retrieve the keypoint scores from the
# heatmap.
combined_indices = tf.concat(
[combined_indices, tf.expand_dims(channel_indices, axis=-1)], axis=1)
if keypoint_score_heatmap is None:
keypoint_scores = tf.gather_nd(
keypoint_heatmap_predictions, combined_indices)
......@@ -1794,6 +1802,31 @@ def predicted_embeddings_at_object_centers(embedding_predictions,
return embeddings
def mask_from_true_image_shape(data_shape, true_image_shapes):
"""Get a binary mask based on the true_image_shape.
Args:
data_shape: a possibly static (4,) tensor for the shape of the feature
map.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
the form [height, width, channels] indicating the shapes of true
images in the resized images, as resized images can be padded with
zeros.
Returns:
    a [batch, data_height, data_width, 1] tensor that is 1.0 at pixels inside
    the true image height and width, and 0.0 in the padded regions.
"""
mask_h = tf.cast(
tf.range(data_shape[1]) < true_image_shapes[:, tf.newaxis, 0],
tf.float32)
mask_w = tf.cast(
tf.range(data_shape[2]) < true_image_shapes[:, tf.newaxis, 1],
tf.float32)
mask = tf.expand_dims(
mask_h[:, :, tf.newaxis] * mask_w[:, tf.newaxis, :], 3)
return mask
class ObjectDetectionParams(
collections.namedtuple('ObjectDetectionParams', [
'localization_loss', 'scale_loss_weight', 'offset_loss_weight',
......@@ -2422,6 +2455,24 @@ class CenterNetMetaArch(model.DetectionModel):
super(CenterNetMetaArch, self).__init__(num_classes)
def set_trainability_by_layer_traversal(self, trainable):
"""Sets trainability layer by layer.
    The commonly-seen `model.trainable = False` approach does not traverse the
    child layers. For example, if the parent is not trainable, we won't be able
    to set individual layers as trainable/non-trainable differently.
    Args:
      trainable: (bool) Trainability to set, layer by layer, on every layer of
        the model except the parent itself.
"""
for layer in self._flatten_layers(include_self=False):
layer.trainable = trainable
@property
def prediction_head_dict(self):
return self._prediction_head_dict
@property
def batched_prediction_tensor_names(self):
if not self._batched_prediction_tensor_names:
......@@ -2647,7 +2698,7 @@ class CenterNetMetaArch(model.DetectionModel):
per_keypoint_depth=kp_params.per_keypoint_depth))
if self._mask_params is not None:
target_assigners[SEGMENTATION_TASK] = (
cn_assigner.CenterNetMaskTargetAssigner(stride))
cn_assigner.CenterNetMaskTargetAssigner(stride, boxes_scale=1.05))
if self._densepose_params is not None:
dp_stride = 1 if self._densepose_params.upsample_to_input_res else stride
target_assigners[DENSEPOSE_TASK] = (
......@@ -3690,6 +3741,12 @@ class CenterNetMetaArch(model.DetectionModel):
max_detections, reid_embed_size] containing object embeddings.
"""
object_center_prob = tf.nn.sigmoid(prediction_dict[OBJECT_CENTER][-1])
# Mask object centers by true_image_shape. [batch, h, w, 1]
object_center_mask = mask_from_true_image_shape(
_get_shape(object_center_prob, 4), true_image_shapes)
object_center_prob *= object_center_mask
# Get x, y and channel indices corresponding to the top indices in the class
# center predictions.
detection_scores, y_indices, x_indices, channel_indices = (
......@@ -3751,7 +3808,7 @@ class CenterNetMetaArch(model.DetectionModel):
])
keypoints, keypoint_scores = self._postprocess_keypoints_multi_class(
prediction_dict, channel_indices, y_indices, x_indices,
None, num_detections)
boxes_strided, num_detections)
keypoints, keypoint_scores = (
convert_strided_predictions_to_normalized_keypoints(
keypoints, keypoint_scores, self._stride, true_image_shapes,
......
......@@ -2518,6 +2518,75 @@ class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
self.assertAllClose(detections['detection_keypoint_scores'][0, 0],
np.array([0.9, 0.9, 0.9, 0.1]))
def test_mask_object_center_in_postprocess_by_true_image_shape(self):
"""Test the postprocess function is masked by true_image_shape."""
model = build_center_net_meta_arch(num_classes=1)
max_detection = model._center_params.max_box_predictions
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 1), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32)
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(1)
class_probs[0] = _logit(0.75)
class_center[0, 16, 16] = class_probs
height_width[0, 16, 16] = [5, 10]
offset[0, 16, 16] = [.25, .5]
keypoint_regression[0, 16, 16] = [
-1., -1.,
-1., 1.,
1., -1.,
1., 1.]
keypoint_heatmaps[0, 14, 14, 0] = _logit(0.9)
keypoint_heatmaps[0, 14, 18, 1] = _logit(0.9)
keypoint_heatmaps[0, 18, 14, 2] = _logit(0.9)
keypoint_heatmaps[0, 18, 18, 3] = _logit(0.05) # Note the low score.
class_center = tf.constant(class_center)
height_width = tf.constant(height_width)
offset = tf.constant(offset)
keypoint_heatmaps = tf.constant(keypoint_heatmaps, dtype=tf.float32)
keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
print(class_center)
prediction_dict = {
cnma.OBJECT_CENTER: [class_center],
cnma.BOX_SCALE: [height_width],
cnma.BOX_OFFSET: [offset],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP):
[keypoint_heatmaps],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET):
[keypoint_offsets],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION):
[keypoint_regression],
}
def graph_fn():
detections = model.postprocess(prediction_dict,
tf.constant([[1, 1, 3]]))
return detections
detections = self.execute_cpu(graph_fn, [])
self.assertAllClose(detections['detection_boxes'][0, 0],
np.array([0, 0, 0, 0]))
# The class_center logits are initialized as 0's so it's filled with 0.5s.
# Despite that, we should only find one box.
self.assertAllClose(detections['detection_scores'][0],
[0.5, 0., 0., 0., 0.])
self.assertEqual(np.sum(detections['detection_classes']), 0)
self.assertEqual(detections['num_detections'], [1])
self.assertAllEqual([1, max_detection, num_keypoints, 2],
detections['detection_keypoints'].shape)
self.assertAllEqual([1, max_detection, num_keypoints],
detections['detection_keypoint_scores'].shape)
def test_get_instance_indices(self):
classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
num_detections = tf.constant([1, 3], dtype=tf.int32)
......
......@@ -26,6 +26,7 @@ from object_detection.utils import spatial_transform_ops
INSTANCE_EMBEDDING = 'INSTANCE_EMBEDDING'
PIXEL_EMBEDDING = 'PIXEL_EMBEDDING'
DEEP_MASK_ESTIMATION = 'deep_mask_estimation'
DEEP_MASK_BOX_CONSISTENCY = 'deep_mask_box_consistency'
LOSS_KEY_PREFIX = center_net_meta_arch.LOSS_KEY_PREFIX
......@@ -35,7 +36,7 @@ class DeepMACParams(
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size',
'max_roi_jitter_ratio', 'roi_jitter_mode'
'max_roi_jitter_ratio', 'roi_jitter_mode', 'box_consistency_loss_weight'
])):
"""Class holding the DeepMAC network configutration."""
......@@ -46,7 +47,7 @@ class DeepMACParams(
mask_num_subsamples, use_xy, network_type, use_instance_embedding,
num_init_channels, predict_full_resolution_masks,
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode):
roi_jitter_mode, box_consistency_loss_weight):
return super(DeepMACParams,
cls).__new__(cls, classification_loss, dim,
task_loss_weight, pixel_embedding_dim,
......@@ -55,7 +56,7 @@ class DeepMACParams(
use_instance_embedding, num_init_channels,
predict_full_resolution_masks,
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode)
roi_jitter_mode, box_consistency_loss_weight)
def subsample_instances(classes, weights, boxes, masks, num_subsamples):
......@@ -206,6 +207,61 @@ def filter_masked_classes(masked_class_ids, classes, weights, masks):
)
def crop_and_resize_feature_map(features, boxes, size):
"""Crop and resize regions from a single feature map given a set of boxes.
Args:
features: A [H, W, C] float tensor.
    boxes: A [N, 4] tensor of normalized boxes.
size: int, the size of the output features.
Returns:
per_box_features: A [N, size, size, C] tensor of cropped and resized
features.
"""
return spatial_transform_ops.matmul_crop_and_resize(
features[tf.newaxis], boxes[tf.newaxis], [size, size])[0]
def crop_and_resize_instance_masks(masks, boxes, mask_size):
"""Crop and resize each mask according to the given boxes.
Args:
masks: A [N, H, W] float tensor.
boxes: A [N, 4] float tensor of normalized boxes.
mask_size: int, the size of the output masks.
Returns:
masks: A [N, mask_size, mask_size] float tensor of cropped and resized
instance masks.
"""
cropped_masks = spatial_transform_ops.matmul_crop_and_resize(
masks[:, :, :, tf.newaxis], boxes[:, tf.newaxis, :],
[mask_size, mask_size])
cropped_masks = tf.squeeze(cropped_masks, axis=[1, 4])
return cropped_masks
def fill_boxes(boxes, height, width):
"""Fills the area included in the box."""
blist = box_list.BoxList(boxes)
blist = box_list_ops.to_absolute_coordinates(blist, height, width)
boxes = blist.get()
ymin, xmin, ymax, xmax = tf.unstack(
boxes[:, tf.newaxis, tf.newaxis, :], 4, axis=3)
ygrid, xgrid = tf.meshgrid(tf.range(height), tf.range(width), indexing='ij')
ygrid, xgrid = tf.cast(ygrid, tf.float32), tf.cast(xgrid, tf.float32)
ygrid, xgrid = ygrid[tf.newaxis, :, :], xgrid[tf.newaxis, :, :]
filled_boxes = tf.logical_and(
tf.logical_and(ygrid >= ymin, ygrid <= ymax),
tf.logical_and(xgrid >= xmin, xgrid <= xmax))
return tf.cast(filled_boxes, tf.float32)
class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks."""
......@@ -379,7 +435,8 @@ def deepmac_proto_to_params(deepmac_config):
deepmac_config.predict_full_resolution_masks,
postprocess_crop_size=deepmac_config.postprocess_crop_size,
max_roi_jitter_ratio=deepmac_config.max_roi_jitter_ratio,
roi_jitter_mode=jitter_mode
roi_jitter_mode=jitter_mode,
box_consistency_loss_weight=deepmac_config.box_consistency_loss_weight
)
......@@ -402,6 +459,13 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
"""Constructs the super class with object center & detection params only."""
self._deepmac_params = deepmac_params
if (self._deepmac_params.predict_full_resolution_masks and
self._deepmac_params.max_roi_jitter_ratio > 0.0):
raise ValueError('Jittering is not supported for full res masks.')
if self._deepmac_params.mask_num_subsamples > 0:
raise ValueError('Subsampling masks is currently not supported.')
super(DeepMACMetaArch, self).__init__(
is_training=is_training, add_summaries=add_summaries,
num_classes=num_classes, feature_extractor=feature_extractor,
......@@ -462,21 +526,34 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
pixel_embedding = pixel_embedding[tf.newaxis, :, :, :]
pixel_embeddings_processed = tf.tile(pixel_embedding,
[num_instances, 1, 1, 1])
image_shape = tf.shape(pixel_embeddings_processed)
image_height, image_width = image_shape[1], image_shape[2]
y_grid, x_grid = tf.meshgrid(tf.linspace(0.0, 1.0, image_height),
tf.linspace(0.0, 1.0, image_width),
indexing='ij')
blist = box_list.BoxList(boxes)
ycenter, xcenter, _, _ = blist.get_center_coordinates_and_sizes()
y_grid = y_grid[tf.newaxis, :, :]
x_grid = x_grid[tf.newaxis, :, :]
y_grid -= ycenter[:, tf.newaxis, tf.newaxis]
x_grid -= xcenter[:, tf.newaxis, tf.newaxis]
coords = tf.stack([y_grid, x_grid], axis=3)
else:
# TODO(vighneshb) Explore multilevel_roi_align and align_corners=False.
pixel_embeddings_cropped = spatial_transform_ops.matmul_crop_and_resize(
pixel_embedding[tf.newaxis], boxes[tf.newaxis],
[mask_size, mask_size])
pixel_embeddings_processed = pixel_embeddings_cropped[0]
mask_shape = tf.shape(pixel_embeddings_processed)
mask_height, mask_width = mask_shape[1], mask_shape[2]
y_grid, x_grid = tf.meshgrid(tf.linspace(-1.0, 1.0, mask_height),
tf.linspace(-1.0, 1.0, mask_width),
indexing='ij')
coords = tf.stack([y_grid, x_grid], axis=2)
coords = coords[tf.newaxis, :, :, :]
coords = tf.tile(coords, [num_instances, 1, 1, 1])
pixel_embeddings_processed = crop_and_resize_feature_map(
pixel_embedding, boxes, mask_size)
mask_shape = tf.shape(pixel_embeddings_processed)
mask_height, mask_width = mask_shape[1], mask_shape[2]
y_grid, x_grid = tf.meshgrid(tf.linspace(-1.0, 1.0, mask_height),
tf.linspace(-1.0, 1.0, mask_width),
indexing='ij')
coords = tf.stack([y_grid, x_grid], axis=2)
coords = coords[tf.newaxis, :, :, :]
coords = tf.tile(coords, [num_instances, 1, 1, 1])
if self._deepmac_params.use_xy:
return tf.concat([coords, pixel_embeddings_processed], axis=3)
......@@ -528,11 +605,9 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
if self._deepmac_params.predict_full_resolution_masks:
return masks
else:
cropped_masks = spatial_transform_ops.matmul_crop_and_resize(
masks[:, :, :, tf.newaxis], boxes[:, tf.newaxis, :],
[mask_size, mask_size])
cropped_masks = crop_and_resize_instance_masks(
masks, boxes, mask_size)
cropped_masks = tf.stop_gradient(cropped_masks)
cropped_masks = tf.squeeze(cropped_masks, axis=[1, 4])
# TODO(vighneshb) should we discretize masks?
return cropped_masks
......@@ -543,7 +618,64 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
return resize_instance_masks(logits, (height, width))
def _compute_per_instance_mask_loss(
def _compute_per_instance_mask_prediction_loss(
self, boxes, mask_logits, mask_gt):
num_instances = tf.shape(boxes)[0]
mask_logits = self._resize_logits_like_gt(mask_logits, mask_gt)
mask_logits = tf.reshape(mask_logits, [num_instances, -1, 1])
mask_gt = tf.reshape(mask_gt, [num_instances, -1, 1])
loss = self._deepmac_params.classification_loss(
prediction_tensor=mask_logits,
target_tensor=mask_gt,
weights=tf.ones_like(mask_logits))
# TODO(vighneshb) Make this configurable via config.
# Skip normalization for dice loss because the denominator term already
# does normalization.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
def _compute_per_instance_box_consistency_loss(
self, boxes_gt, boxes_for_crop, mask_logits):
height, width = tf.shape(mask_logits)[1], tf.shape(mask_logits)[2]
filled_boxes = fill_boxes(boxes_gt, height, width)[:, :, :, tf.newaxis]
mask_logits = mask_logits[:, :, :, tf.newaxis]
if self._deepmac_params.predict_full_resolution_masks:
gt_crop = filled_boxes[:, :, :, 0]
pred_crop = mask_logits[:, :, :, 0]
else:
gt_crop = crop_and_resize_instance_masks(
filled_boxes, boxes_for_crop, self._deepmac_params.mask_size)
pred_crop = crop_and_resize_instance_masks(
mask_logits, boxes_for_crop, self._deepmac_params.mask_size)
loss = 0.0
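    # Compare the max-projection of the predicted mask along each spatial axis
    # with the projection of the filled ground-truth box (box consistency).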
for axis in [1, 2]:
pred_max = tf.reduce_max(pred_crop, axis=axis)[:, :, tf.newaxis]
gt_max = tf.reduce_max(gt_crop, axis=axis)[:, :, tf.newaxis]
axis_loss = self._deepmac_params.classification_loss(
prediction_tensor=pred_max,
target_tensor=gt_max,
weights=tf.ones_like(pred_max))
loss += axis_loss
# Skip normalization for dice loss because the denominator term already
# does normalization.
# TODO(vighneshb) Make this configurable via config.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
def _compute_per_instance_deepmac_losses(
self, boxes, masks, instance_embedding, pixel_embedding):
"""Returns the mask loss per instance.
......@@ -558,40 +690,36 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
pixel_embedding_size] float tensor containing the per-pixel embeddings.
Returns:
mask_loss: A [num_instances] shaped float tensor containing the
mask_prediction_loss: A [num_instances] shaped float tensor containing the
mask loss for each instance.
"""
box_consistency_loss: A [num_instances] shaped float tensor containing
the box consistency loss for each instance.
num_instances = tf.shape(boxes)[0]
"""
if tf.keras.backend.learning_phase():
boxes = preprocessor.random_jitter_boxes(
boxes_for_crop = preprocessor.random_jitter_boxes(
boxes, self._deepmac_params.max_roi_jitter_ratio,
jitter_mode=self._deepmac_params.roi_jitter_mode)
else:
boxes_for_crop = boxes
mask_input = self._get_mask_head_input(
boxes, pixel_embedding)
boxes_for_crop, pixel_embedding)
instance_embeddings = self._get_instance_embeddings(
boxes, instance_embedding)
boxes_for_crop, instance_embedding)
mask_logits = self._mask_net(
instance_embeddings, mask_input,
training=tf.keras.backend.learning_phase())
mask_gt = self._get_groundtruth_mask_output(boxes, masks)
mask_logits = self._resize_logits_like_gt(mask_logits, mask_gt)
mask_gt = self._get_groundtruth_mask_output(boxes_for_crop, masks)
mask_logits = tf.reshape(mask_logits, [num_instances, -1, 1])
mask_gt = tf.reshape(mask_gt, [num_instances, -1, 1])
loss = self._deepmac_params.classification_loss(
prediction_tensor=mask_logits,
target_tensor=mask_gt,
weights=tf.ones_like(mask_logits))
mask_prediction_loss = self._compute_per_instance_mask_prediction_loss(
boxes_for_crop, mask_logits, mask_gt)
# TODO(vighneshb) Make this configurable via config.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
box_consistency_loss = self._compute_per_instance_box_consistency_loss(
boxes, boxes_for_crop, mask_logits)
return mask_prediction_loss, box_consistency_loss
def _compute_instance_masks_loss(self, prediction_dict):
"""Computes the mask loss.
......@@ -603,7 +731,7 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
[batch_size, height, width, embedding_size].
Returns:
loss: float, the mask loss as a scalar.
loss_dict: A dict mapping string (loss names) to scalar floats.
"""
gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
......@@ -613,7 +741,10 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
allowed_masked_classes_ids = (
self._deepmac_params.allowed_masked_classes_ids)
total_loss = 0.0
loss_dict = {
DEEP_MASK_ESTIMATION: 0.0,
DEEP_MASK_BOX_CONSISTENCY: 0.0
}
    # Iterate over multiple predictions by the backbone (for hourglass, length=2).
for instance_pred, pixel_pred in zip(
......@@ -625,24 +756,31 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
for i, (boxes, weights, classes, masks) in enumerate(
zip(gt_boxes_list, gt_weights_list, gt_classes_list, gt_masks_list)):
_, weights, masks = filter_masked_classes(allowed_masked_classes_ids,
classes, weights, masks)
num_subsample = self._deepmac_params.mask_num_subsamples
_, weights, boxes, masks = subsample_instances(
classes, weights, boxes, masks, num_subsample)
# TODO(vighneshb) Add sub-sampling back if required.
classes, valid_mask_weights, masks = filter_masked_classes(
allowed_masked_classes_ids, classes, weights, masks)
per_instance_loss = self._compute_per_instance_mask_loss(
boxes, masks, instance_pred[i], pixel_pred[i])
per_instance_loss *= weights
per_instance_mask_loss, per_instance_consistency_loss = (
self._compute_per_instance_deepmac_losses(
boxes, masks, instance_pred[i], pixel_pred[i]))
per_instance_mask_loss *= valid_mask_weights
per_instance_consistency_loss *= weights
num_instances = tf.maximum(tf.reduce_sum(weights), 1.0)
num_instances_allowed = tf.maximum(
tf.reduce_sum(valid_mask_weights), 1.0)
total_loss += tf.reduce_sum(per_instance_loss) / num_instances
loss_dict[DEEP_MASK_ESTIMATION] += (
tf.reduce_sum(per_instance_mask_loss) / num_instances_allowed)
loss_dict[DEEP_MASK_BOX_CONSISTENCY] += (
tf.reduce_sum(per_instance_consistency_loss) / num_instances)
batch_size = len(gt_boxes_list)
num_predictions = len(prediction_dict[INSTANCE_EMBEDDING])
return total_loss / float(batch_size * num_predictions)
return dict((key, loss / float(batch_size * num_predictions))
for key, loss in loss_dict.items())
def loss(self, prediction_dict, true_image_shapes, scope=None):
......@@ -650,13 +788,19 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
prediction_dict, true_image_shapes, scope)
if self._deepmac_params is not None:
mask_loss = self._compute_instance_masks_loss(
mask_loss_dict = self._compute_instance_masks_loss(
prediction_dict=prediction_dict)
key = LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION
losses_dict[key] = (
self._deepmac_params.task_loss_weight * mask_loss
losses_dict[LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION] = (
self._deepmac_params.task_loss_weight * mask_loss_dict[
DEEP_MASK_ESTIMATION]
)
if self._deepmac_params.box_consistency_loss_weight > 0.0:
losses_dict[LOSS_KEY_PREFIX + '/' + DEEP_MASK_BOX_CONSISTENCY] = (
self._deepmac_params.box_consistency_loss_weight * mask_loss_dict[
DEEP_MASK_BOX_CONSISTENCY]
)
return losses_dict
def postprocess(self, prediction_dict, true_image_shapes, **params):
......
......@@ -60,7 +60,8 @@ class MockMaskNet(tf.keras.layers.Layer):
return tf.zeros_like(pixel_embedding[:, :, :, 0]) + 0.9
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False):
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
mask_num_subsamples=-1):
"""Builds the DeepMAC meta architecture."""
feature_extractor = DummyFeatureExtractor(
......@@ -94,7 +95,7 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False):
pixel_embedding_dim=2,
allowed_masked_classes_ids=[],
mask_size=16,
mask_num_subsamples=-1,
mask_num_subsamples=mask_num_subsamples,
use_xy=True,
network_type='hourglass10',
use_instance_embedding=True,
......@@ -102,7 +103,8 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False):
predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128,
max_roi_jitter_ratio=0.0,
roi_jitter_mode='random'
roi_jitter_mode='random',
box_consistency_loss_weight=1.0,
)
object_detection_params = center_net_meta_arch.ObjectDetectionParams(
......@@ -140,6 +142,33 @@ class DeepMACUtilsTest(tf.test.TestCase):
self.assertAllClose(result[2], boxes)
self.assertAllClose(result[3], masks)
def test_fill_boxes(self):
boxes = tf.constant([[0., 0., 0.5, 0.5], [0.5, 0.5, 1.0, 1.0]])
filled_boxes = deepmac_meta_arch.fill_boxes(boxes, 32, 32)
expected = np.zeros((2, 32, 32))
expected[0, :17, :17] = 1.0
expected[1, 16:, 16:] = 1.0
self.assertAllClose(expected, filled_boxes.numpy(), rtol=1e-3)
def test_crop_and_resize_instance_masks(self):
boxes = tf.zeros((5, 4))
masks = tf.zeros((5, 128, 128))
output = deepmac_meta_arch.crop_and_resize_instance_masks(
masks, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32))
def test_crop_and_resize_feature_map(self):
boxes = tf.zeros((5, 4))
features = tf.zeros((128, 128, 7))
output = deepmac_meta_arch.crop_and_resize_feature_map(
features, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32, 7))
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase):
......@@ -199,7 +228,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
def test_get_mask_head_input_no_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
boxes = tf.constant([[0., 0., 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
boxes = tf.constant([[0., 0., 1.0, 1.0], [0.0, 0.0, 0.5, 1.0]],
dtype=tf.float32)
pixel_embedding_np = np.random.randn(32, 32, 4).astype(np.float32)
......@@ -208,12 +237,15 @@ class DeepMACMetaArchTest(tf.test.TestCase):
mask_inputs = model._get_mask_head_input(boxes, pixel_embedding)
self.assertEqual(mask_inputs.shape, (2, 32, 32, 6))
y_grid, x_grid = tf.meshgrid(np.linspace(-1.0, 1.0, 32),
np.linspace(-1.0, 1.0, 32), indexing='ij')
y_grid, x_grid = tf.meshgrid(np.linspace(.0, 1.0, 32),
np.linspace(.0, 1.0, 32), indexing='ij')
ys = [0.5, 0.25]
xs = [0.5, 0.5]
for i in range(2):
mask_input = mask_inputs[i]
self.assertAllClose(y_grid, mask_input[:, :, 0])
self.assertAllClose(x_grid, mask_input[:, :, 1])
self.assertAllClose(y_grid - ys[i], mask_input[:, :, 0])
self.assertAllClose(x_grid - xs[i], mask_input[:, :, 1])
pixel_embedding = mask_input[:, :, 2:]
self.assertAllClose(pixel_embedding_np, pixel_embedding)
......@@ -262,7 +294,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
masks[1, 16:, 16:] = 1.0
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
loss, _ = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertAllClose(
loss, np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
......@@ -275,7 +307,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
masks = np.ones((2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
loss, _ = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertAllClose(
loss, np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
......@@ -289,7 +321,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
masks = np.ones((2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
loss, _ = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
pred = tf.nn.sigmoid(0.9)
expected = (1.0 - ((2.0 * pred) / (1.0 + pred)))
......@@ -299,7 +331,7 @@ class DeepMACMetaArchTest(tf.test.TestCase):
boxes = tf.zeros([0, 4])
masks = tf.zeros([0, 128, 128])
loss = self.model._compute_per_instance_mask_loss(
loss, _ = self.model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertEqual(loss.shape, (0,))
......@@ -394,6 +426,59 @@ class DeepMACMetaArchTest(tf.test.TestCase):
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 8)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_box_consistency_loss(self):
boxes_gt = tf.constant([[0., 0., 0.49, 1.0]])
boxes_jittered = tf.constant([[0.0, 0.0, 1.0, 1.0]])
mask_prediction = np.zeros((1, 32, 32)).astype(np.float32)
mask_prediction[0, :24, :24] = 1.0
loss = self.model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 8 + [0.0] * 8),
logits=[1.0] * 12 + [0.0] * 4)
xloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 16),
logits=[1.0] * 12 + [0.0] * 4)
self.assertAllClose(loss, [tf.reduce_mean(yloss + xloss).numpy()])
def test_box_consistency_dice_loss(self):
model = build_meta_arch(use_dice_loss=True)
boxes_gt = tf.constant([[0., 0., 0.49, 1.0]])
boxes_jittered = tf.constant([[0.0, 0.0, 1.0, 1.0]])
almost_inf = 1e10
mask_prediction = np.full((1, 32, 32), -almost_inf, dtype=np.float32)
mask_prediction[0, :24, :24] = almost_inf
loss = model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = 1 - 6.0 / 7
xloss = 0.2
self.assertAllClose(loss, [yloss + xloss])
def test_box_consistency_dice_loss_full_res(self):
model = build_meta_arch(use_dice_loss=True,
predict_full_resolution_masks=True)
boxes_gt = tf.constant([[0., 0., 1.0, 1.0]])
boxes_jittered = None
almost_inf = 1e10
mask_prediction = np.full((1, 32, 32), -almost_inf, dtype=np.float32)
mask_prediction[0, :16, :32] = almost_inf
loss = model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
self.assertAlmostEqual(loss[0].numpy(), 1 / 3)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class FullyConnectedMaskHeadTest(tf.test.TestCase):
......