Unverified commit 48b6d1d1, authored by Dan Anghel, committed by GitHub

Push to GitHub of TF2 changes to DELF package (#8678)



* First version of working script to download the GLDv2 dataset

* First version of the DELF package installation script

* First working version of the DELF package installation script

* Fixed feedback from PR review

* Push to GitHub of changes to the TFRecord data generation script for DELF.

* Merged commit includes the following changes:
315363544  by Andre Araujo:

    Added the generation of TRAIN and VALIDATE splits from the train dataset.

--
314676530  by Andre Araujo:

    Updated script to download GLDv2 images for DELF training.

--
314101235  by Andre Araujo:

    Added newly created module 'utils' to the copybara script.

--
313677085  by Andre Araujo:

    Code migration from TF1 to TF2 for:
    - logging (replaced usage of tf.compat.v1.logging.info)
    - testing directories (replaced usage of tf.compat.v1.test.get_temp_dir())
    - feature/object extraction scripts (replaced usage of tf.compat.v1.train.string_input_producer and tf.compat.v1.train.start_queue_runners with PIL)
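
    As a rough illustration of the last point, the TF1 queue-based input
    pipeline can be replaced by eager, PIL-based image loading along these
    lines (a minimal sketch; the function name and `path` argument are
    illustrative, not taken from this PR):

    # Sketch only: eager image loading with PIL instead of
    # tf.compat.v1.train.string_input_producer + start_queue_runners.
    import numpy as np
    from PIL import Image

    def _LoadImage(path):
      # Open and decode the file eagerly; no queue runners or Session needed.
      return np.array(Image.open(path))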

--
312770828  by Andre Araujo:

    Internal change.

--

PiperOrigin-RevId: 315363544

* First version of the updated README of the DELF training instructions

* Added to the README the section describing the generation of the training data

* Added warning about the TFRecord generation time

* Updated the launch of the training

* Minor README update

* Integrated review feedback

* Merged commit includes the following changes:
315971979  by Andre Araujo:

    Performance optimization in generating the TRAIN and VALIDATION splits per label.

--
315578370  by Andre Araujo:

    Tiny fix to char limit in extractor.py.

--
315546242  by Andre Araujo:

    Script to measure DELG latency.

--
315545801  by Andre Araujo:

    Pre-load PCA parameters, if using them when extracting DELF/G features.

--
315450392  by Andre Araujo:

    Code migration from TF1 to TF2 for:
    - loading the models in extractor.py and detector.py using tf.saved_model.load
    - removed tf.compat.v1.Session for the extractor and detector model usage
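
    A minimal sketch of this Session-free loading pattern (the `model_dir`
    argument and the 'serving_default' signature name are assumptions for
    illustration, not taken from this commit):

    import tensorflow as tf

    def _LoadAndRun(model_dir, images):
      # TF2 style: load the SavedModel and call it eagerly, with no
      # tf.compat.v1.Session or explicit graph management.
      model = tf.saved_model.load(model_dir)
      infer = model.signatures['serving_default']
      return infer(tf.convert_to_tensor(images))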

--
315406342  by Andre Araujo:

    Internal change.

--

PiperOrigin-RevId: 315971979

* Merged commit includes the following changes:
316538447  by Andre Araujo:

    Read the number of classes from the GLDv2 dataset metadata.

--
316416973  by Andre Araujo:

    Migration of DELF code to TF2:
    - replaced tf.compat.v1.test.get_temp_dir() with FLAGS.test_tmpdir
    - removed delf_v1.py and its dependencies
    - removed tf.compat.v1, Session, Graph dependencies from feature_extractor.py, feature_aggregation_extractor.py and aggregation_extraction.py
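
    For reference, the test_tmpdir substitution mentioned above follows this
    pattern (a sketch; the test class and file name are illustrative):

    import os
    from absl import flags
    import tensorflow as tf

    FLAGS = flags.FLAGS

    class ExampleTest(tf.test.TestCase):

      def testWritesToTmpdir(self):
        # FLAGS.test_tmpdir replaces tf.compat.v1.test.get_temp_dir().
        filename = os.path.join(FLAGS.test_tmpdir, 'example.bin')
        tf.io.write_file(filename, b'payload')
        self.assertTrue(tf.io.gfile.exists(filename))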

--

PiperOrigin-RevId: 316538447
Co-authored-by: Andre Araujo <andrearaujo@google.com>
parent 57c08e2f
@@ -20,11 +20,14 @@ from __future__ import print_function
 import os
+from absl import flags
 import numpy as np
 import tensorflow as tf
 from delf import box_io
+FLAGS = flags.FLAGS
 class BoxesIoTest(tf.test.TestCase):
@@ -57,8 +60,7 @@ class BoxesIoTest(tf.test.TestCase):
   def testWriteAndReadToFile(self):
     boxes, scores, class_indices = self._create_data()
-    tmpdir = tf.compat.v1.test.get_temp_dir()
-    filename = os.path.join(tmpdir, 'test.boxes')
+    filename = os.path.join(FLAGS.test_tmpdir, 'test.boxes')
     box_io.WriteToFile(filename, boxes, scores, class_indices)
     data_read = box_io.ReadFromFile(filename)
@@ -67,8 +69,7 @@ class BoxesIoTest(tf.test.TestCase):
     self.assertAllEqual(class_indices, data_read[2])
   def testWriteAndReadToFileEmptyFile(self):
-    tmpdir = tf.compat.v1.test.get_temp_dir()
-    filename = os.path.join(tmpdir, 'test.box')
+    filename = os.path.join(FLAGS.test_tmpdir, 'test.box')
     box_io.WriteToFile(filename, np.array([]), np.array([]), np.array([]))
     data_read = box_io.ReadFromFile(filename)
......
@@ -20,11 +20,14 @@ from __future__ import print_function
 import os
+from absl import flags
 import numpy as np
 import tensorflow as tf
 from delf import datum_io
+FLAGS = flags.FLAGS
 class DatumIoTest(tf.test.TestCase):
@@ -69,8 +72,7 @@ class DatumIoTest(tf.test.TestCase):
   def testWriteAndReadToFile(self):
     data = np.array([[[-1.0, 125.0, -2.5], [14.5, 3.5, 0.0]],
                      [[20.0, 0.0, 30.0], [25.5, 36.0, 42.0]]])
-    tmpdir = tf.compat.v1.test.get_temp_dir()
-    filename = os.path.join(tmpdir, 'test.datum')
+    filename = os.path.join(FLAGS.test_tmpdir, 'test.datum')
     datum_io.WriteToFile(data, filename)
     data_read = datum_io.ReadFromFile(filename)
     self.assertAllEqual(data_read, data)
@@ -84,8 +86,7 @@ class DatumIoTest(tf.test.TestCase):
     data_2 = np.array(
         [[[255, 0, 5], [10, 300, 0]], [[20, 1, 100], [255, 360, 420]]],
         dtype='uint32')
-    tmpdir = tf.compat.v1.test.get_temp_dir()
-    filename = os.path.join(tmpdir, 'test.datum_pair')
+    filename = os.path.join(FLAGS.test_tmpdir, 'test.datum_pair')
     datum_io.WritePairToFile(data_1, data_2, filename)
     data_read_1, data_read_2 = datum_io.ReadPairFromFile(filename)
     self.assertAllEqual(data_read_1, data_1)
......
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""DELF model implementation based on the following paper.
Large-Scale Image Retrieval with Attentive Deep Local Features
https://arxiv.org/abs/1612.06321
Please refer to the README.md file for detailed explanations on using the DELF
model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tf_slim import layers
from tf_slim.nets import resnet_v1
from tf_slim.ops.arg_scope import arg_scope
_SUPPORTED_TARGET_LAYER = ['resnet_v1_50/block3', 'resnet_v1_50/block4']
# The variable scope for the attention portion of the model.
_ATTENTION_VARIABLE_SCOPE = 'attention_block'
# The attention_type determines whether the attention based feature aggregation
# is performed on the L2-normalized feature map or on the default feature map
# where L2-normalization is not applied. Note that in both cases, attention
# functions are built on the un-normalized feature map. This is only relevant
# for the training stage.
# Currently supported options are as follows:
# * use_l2_normalized_feature:
# The option use_l2_normalized_feature first applies L2-normalization on the
# feature map and then applies attention based feature aggregation. This
# option is used for the DELF+FT+Att model in the paper.
# * use_default_input_feature:
# The option use_default_input_feature aggregates unnormalized feature map
# directly.
_SUPPORTED_ATTENTION_TYPES = [
'use_l2_normalized_feature', 'use_default_input_feature'
]
# Supported types of non-linearity for the attention score function.
_SUPPORTED_ATTENTION_NONLINEARITY = ['softplus']
class DelfV1(object):
"""Creates a DELF model.
Args:
target_layer_type: The name of target CNN architecture and its layer.
Raises:
ValueError: If an unknown target_layer_type is provided.
"""
def __init__(self, target_layer_type=_SUPPORTED_TARGET_LAYER[0]):
print('Creating model %s ' % target_layer_type)
self._target_layer_type = target_layer_type
if self._target_layer_type not in _SUPPORTED_TARGET_LAYER:
raise ValueError('Unknown model type.')
@property
def target_layer_type(self):
return self._target_layer_type
def _PerformAttention(self,
attention_feature_map,
feature_map,
attention_nonlinear,
kernel=1):
"""Helper function to construct the attention part of the model.
Computes attention score map and aggregates the input feature map based on
the attention score map.
Args:
attention_feature_map: Potentially normalized feature map that will be
aggregated with attention score map.
feature_map: Unnormalized feature map that will be used to compute
attention score map.
attention_nonlinear: Type of non-linearity that will be applied to
attention value.
kernel: Convolutional kernel to use in attention layers (eg: 1, [3, 3]).
Returns:
attention_feat: Aggregated feature vector.
attention_prob: Attention score map after the non-linearity.
attention_score: Attention score map before the non-linearity.
Raises:
ValueError: If unknown attention non-linearity type is provided.
"""
with tf.compat.v1.variable_scope(
'attention', values=[attention_feature_map, feature_map]):
with tf.compat.v1.variable_scope('compute', values=[feature_map]):
activation_fn_conv1 = tf.nn.relu
feature_map_conv1 = layers.conv2d(
feature_map,
512,
kernel,
rate=1,
activation_fn=activation_fn_conv1,
scope='conv1')
attention_score = layers.conv2d(
feature_map_conv1,
1,
kernel,
rate=1,
activation_fn=None,
normalizer_fn=None,
scope='conv2')
# Set activation of conv2 layer of attention model.
with tf.compat.v1.variable_scope(
'merge', values=[attention_feature_map, attention_score]):
if attention_nonlinear not in _SUPPORTED_ATTENTION_NONLINEARITY:
raise ValueError('Unknown attention non-linearity.')
if attention_nonlinear == 'softplus':
with tf.compat.v1.variable_scope(
'softplus_attention',
values=[attention_feature_map, attention_score]):
attention_prob = tf.nn.softplus(attention_score)
attention_feat = tf.reduce_mean(
tf.multiply(attention_feature_map, attention_prob), [1, 2])
attention_feat = tf.expand_dims(tf.expand_dims(attention_feat, 1), 2)
return attention_feat, attention_prob, attention_score
def _GetAttentionSubnetwork(
self,
feature_map,
end_points,
attention_nonlinear=_SUPPORTED_ATTENTION_NONLINEARITY[0],
attention_type=_SUPPORTED_ATTENTION_TYPES[0],
kernel=1,
reuse=False):
"""Constructs the part of the model performing attention.
Args:
feature_map: A tensor of size [batch, height, width, channels]. Usually it
corresponds to the output feature map of a fully-convolutional network.
end_points: Set of activations of the network constructed so far.
attention_nonlinear: Type of non-linearity on top of the attention
function.
attention_type: Type of the attention structure.
kernel: Convolutional kernel to use in attention layers (eg, [3, 3]).
reuse: Whether or not the layer and its variables should be reused.
Returns:
prelogits: A tensor of size [batch, 1, 1, channels].
attention_prob: Attention score after the non-linearity.
attention_score: Attention score before the non-linearity.
end_points: Updated set of activations, for external use.
Raises:
ValueError: If unknown attention_type is provided.
"""
with tf.compat.v1.variable_scope(
_ATTENTION_VARIABLE_SCOPE,
values=[feature_map, end_points],
reuse=reuse):
if attention_type not in _SUPPORTED_ATTENTION_TYPES:
raise ValueError('Unknown attention_type.')
if attention_type == 'use_l2_normalized_feature':
attention_feature_map = tf.nn.l2_normalize(
feature_map, 3, name='l2_normalize')
elif attention_type == 'use_default_input_feature':
attention_feature_map = feature_map
end_points['attention_feature_map'] = attention_feature_map
attention_outputs = self._PerformAttention(attention_feature_map,
feature_map,
attention_nonlinear, kernel)
prelogits, attention_prob, attention_score = attention_outputs
end_points['prelogits'] = prelogits
end_points['attention_prob'] = attention_prob
end_points['attention_score'] = attention_score
return prelogits, attention_prob, attention_score, end_points
def GetResnet50Subnetwork(self,
images,
is_training=False,
global_pool=False,
reuse=None):
"""Constructs resnet_v1_50 part of the DELF model.
Args:
images: A tensor of size [batch, height, width, channels].
is_training: Whether or not the model is in training mode.
global_pool: If True, perform global average pooling after feature
extraction. This may be useful for DELF's descriptor fine-tuning stage.
reuse: Whether or not the layer and its variables should be reused.
Returns:
net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
If global_pool is True, height_out = width_out = 1.
end_points: A set of activations for external use.
"""
block = resnet_v1.resnet_v1_block
blocks = [
block('block1', base_depth=64, num_units=3, stride=2),
block('block2', base_depth=128, num_units=4, stride=2),
block('block3', base_depth=256, num_units=6, stride=2),
]
if self._target_layer_type == 'resnet_v1_50/block4':
blocks.append(block('block4', base_depth=512, num_units=3, stride=1))
net, end_points = resnet_v1.resnet_v1(
images,
blocks,
is_training=is_training,
global_pool=global_pool,
reuse=reuse,
scope='resnet_v1_50')
return net, end_points
def GetAttentionPrelogit(
self,
images,
weight_decay=0.0001,
attention_nonlinear=_SUPPORTED_ATTENTION_NONLINEARITY[0],
attention_type=_SUPPORTED_ATTENTION_TYPES[0],
kernel=1,
training_resnet=False,
training_attention=False,
reuse=False,
use_batch_norm=True):
"""Constructs attention model on resnet_v1_50.
Args:
images: A tensor of size [batch, height, width, channels].
weight_decay: The parameters for weight_decay regularizer.
attention_nonlinear: Type of non-linearity on top of the attention
function.
attention_type: Type of the attention structure.
kernel: Convolutional kernel to use in attention layers (eg, [3, 3]).
training_resnet: Whether or not the Resnet blocks from the model are in
training mode.
training_attention: Whether or not the attention part of the model is in
training mode.
reuse: Whether or not the layer and its variables should be reused.
use_batch_norm: Whether or not to use batch normalization.
Returns:
prelogits: A tensor of size [batch, 1, 1, channels].
attention_prob: Attention score after the non-linearity.
attention_score: Attention score before the non-linearity.
feature_map: Features extracted from the model, which are not
l2-normalized.
end_points: Set of activations for external use.
"""
# Construct Resnet50 features.
with arg_scope(resnet_v1.resnet_arg_scope(use_batch_norm=use_batch_norm)):
_, end_points = self.GetResnet50Subnetwork(
images, is_training=training_resnet, reuse=reuse)
feature_map = end_points[self._target_layer_type]
# Construct attention subnetwork on top of features.
with arg_scope(
resnet_v1.resnet_arg_scope(
weight_decay=weight_decay, use_batch_norm=use_batch_norm)):
with arg_scope([layers.batch_norm], is_training=training_attention):
(prelogits, attention_prob, attention_score,
end_points) = self._GetAttentionSubnetwork(
feature_map,
end_points,
attention_nonlinear=attention_nonlinear,
attention_type=attention_type,
kernel=kernel,
reuse=reuse)
return prelogits, attention_prob, attention_score, feature_map, end_points
def _GetAttentionModel(
self,
images,
num_classes,
weight_decay=0.0001,
attention_nonlinear=_SUPPORTED_ATTENTION_NONLINEARITY[0],
attention_type=_SUPPORTED_ATTENTION_TYPES[0],
kernel=1,
training_resnet=False,
training_attention=False,
reuse=False):
"""Constructs attention model on resnet_v1_50.
Args:
images: A tensor of size [batch, height, width, channels]
num_classes: The number of output classes.
weight_decay: The parameters for weight_decay regularizer.
attention_nonlinear: Type of non-linearity on top of the attention
function.
attention_type: Type of the attention structure.
kernel: Convolutional kernel to use in attention layers (eg, [3, 3]).
training_resnet: Whether or not the Resnet blocks from the model are in
training mode.
training_attention: Whether or not the attention part of the model is in
training mode.
reuse: Whether or not the layer and its variables should be reused.
Returns:
logits: A tensor of size [batch, num_classes].
attention_prob: Attention score after the non-linearity.
attention_score: Attention score before the non-linearity.
feature_map: Features extracted from the model, which are not
l2-normalized.
"""
attention_feat, attention_prob, attention_score, feature_map, _ = (
self.GetAttentionPrelogit(
images,
weight_decay,
attention_nonlinear=attention_nonlinear,
attention_type=attention_type,
kernel=kernel,
training_resnet=training_resnet,
training_attention=training_attention,
reuse=reuse))
with arg_scope(
resnet_v1.resnet_arg_scope(
weight_decay=weight_decay, batch_norm_scale=True)):
with arg_scope([layers.batch_norm], is_training=training_attention):
with tf.compat.v1.variable_scope(
_ATTENTION_VARIABLE_SCOPE, values=[attention_feat], reuse=reuse):
logits = layers.conv2d(
attention_feat,
num_classes, [1, 1],
activation_fn=None,
normalizer_fn=None,
scope='logits')
logits = tf.squeeze(logits, [1, 2], name='spatial_squeeze')
return logits, attention_prob, attention_score, feature_map
def AttentionModel(self,
images,
num_classes,
weight_decay=0.0001,
attention_nonlinear=_SUPPORTED_ATTENTION_NONLINEARITY[0],
attention_type=_SUPPORTED_ATTENTION_TYPES[0],
kernel=1,
training_resnet=False,
training_attention=False,
reuse=False):
"""Constructs attention based classification model for training.
Args:
images: A tensor of size [batch, height, width, channels]
num_classes: The number of output classes.
weight_decay: The parameters for weight_decay regularizer.
attention_nonlinear: Type of non-linearity on top of the attention
function.
attention_type: Type of the attention structure.
kernel: Convolutional kernel to use in attention layers (eg, [3, 3]).
training_resnet: Whether or not the Resnet blocks from the model are in
training mode.
training_attention: Whether or not the model is in training mode. Note
that this function only supports training the attention part of the
model, ie, the feature extraction layers are not trained.
reuse: Whether or not the layer and its variables should be reused.
Returns:
logit: A tensor of size [batch, num_classes]
attention: Attention score after the non-linearity.
feature_map: Features extracted from the model, which are not
l2-normalized.
Raises:
ValueError: If unknown target_layer_type is provided.
"""
if 'resnet_v1_50' in self._target_layer_type:
net_outputs = self._GetAttentionModel(
images,
num_classes,
weight_decay,
attention_nonlinear=attention_nonlinear,
attention_type=attention_type,
kernel=kernel,
training_resnet=training_resnet,
training_attention=training_attention,
reuse=reuse)
logits, attention, _, feature_map = net_outputs
else:
raise ValueError('Unknown target_layer_type.')
return logits, attention, feature_map
@@ -124,71 +124,70 @@ def ExtractAggregatedRepresentationsToFiles(image_names, features_dir,
   if not tf.io.gfile.exists(output_aggregation_dir):
     tf.io.gfile.makedirs(output_aggregation_dir)
-  with tf.compat.v1.Session() as sess:
-    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-        sess, config)
-    start = time.clock()
+  extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+      config)
+  start = time.time()
   for i in range(num_images):
     if i == 0:
       print('Starting to extract aggregation from images...')
     elif i % _STATUS_CHECK_ITERATIONS == 0:
-      elapsed = (time.clock() - start)
+      elapsed = (time.time() - start)
       print('Processing image %d out of %d, last %d '
             'images took %f seconds' %
            (i, num_images, _STATUS_CHECK_ITERATIONS, elapsed))
-      start = time.clock()
+      start = time.time()
     image_name = image_names[i]
     # Compose output file name, skip extraction for this image if it already
     # exists.
     output_aggregation_filename = os.path.join(output_aggregation_dir,
                                                image_name + output_extension)
     if tf.io.gfile.exists(output_aggregation_filename):
       print('Skipping %s' % image_name)
       continue
     # Load DELF features.
     if config.use_regional_aggregation:
       if not mapping_path:
         raise ValueError(
             'Requested regional aggregation, but mapping_path was not '
             'provided')
       descriptors_list = []
       num_features_per_box = []
       for box_feature_file in images_to_box_feature_files[image_name]:
         delf_filename = os.path.join(features_dir,
                                      box_feature_file + _DELF_EXTENSION)
         _, _, box_descriptors, _, _ = feature_io.ReadFromFile(delf_filename)
         # If `box_descriptors` is empty, reshape it such that it can be
         # concatenated with other descriptors.
         if not box_descriptors.shape[0]:
           box_descriptors = np.reshape(box_descriptors,
                                        [0, config.feature_dimensionality])
         descriptors_list.append(box_descriptors)
         num_features_per_box.append(box_descriptors.shape[0])
       descriptors = np.concatenate(descriptors_list)
     else:
       input_delf_filename = os.path.join(features_dir,
                                          image_name + _DELF_EXTENSION)
       _, _, descriptors, _, _ = feature_io.ReadFromFile(input_delf_filename)
       # If `descriptors` is empty, reshape it to avoid extraction failure.
       if not descriptors.shape[0]:
         descriptors = np.reshape(descriptors,
                                  [0, config.feature_dimensionality])
       num_features_per_box = None
     # Extract and save aggregation. If using VLAD, only
     # `aggregated_descriptors` needs to be saved.
     (aggregated_descriptors,
      feature_visual_words) = extractor.Extract(descriptors,
                                                num_features_per_box)
     if config.aggregation_type == _VLAD:
       datum_io.WriteToFile(aggregated_descriptors,
                            output_aggregation_filename)
     else:
       datum_io.WritePairToFile(aggregated_descriptors,
                                feature_visual_words.astype('uint32'),
                                output_aggregation_filename)
@@ -40,7 +40,6 @@ class ExtractAggregatedRepresentation(object):
   """Class for extraction of aggregated local feature representation.
   Args:
-    sess: TensorFlow session to use.
     aggregation_config: AggregationConfig object defining type of aggregation to
       use.
@@ -48,65 +47,28 @@ class ExtractAggregatedRepresentation(object):
     ValueError: If aggregation type is invalid.
   """
-  def __init__(self, sess, aggregation_config):
-    self._sess = sess
+  def __init__(self, aggregation_config):
     self._codebook_size = aggregation_config.codebook_size
     self._feature_dimensionality = aggregation_config.feature_dimensionality
     self._aggregation_type = aggregation_config.aggregation_type
     self._feature_batch_size = aggregation_config.feature_batch_size
+    self._codebook_path = aggregation_config.codebook_path
+    self._use_regional_aggregation = aggregation_config.use_regional_aggregation
+    self._use_l2_normalization = aggregation_config.use_l2_normalization
+    self._num_assignments = aggregation_config.num_assignments
-    # Inputs to extraction function.
-    self._features = tf.compat.v1.placeholder(tf.float32, [None, None])
-    self._num_features_per_region = tf.compat.v1.placeholder(tf.int32, [None])
-    # Load codebook into graph.
-    codebook = tf.compat.v1.get_variable(
-        "codebook",
-        shape=[
-            aggregation_config.codebook_size,
-            aggregation_config.feature_dimensionality
-        ])
-    tf.compat.v1.train.init_from_checkpoint(
-        aggregation_config.codebook_path, {_CLUSTER_CENTERS_VAR_NAME: codebook})
-    # Construct extraction graph based on desired options.
-    if self._aggregation_type == _VLAD:
-      # Feature visual words are unused in the case of VLAD, so just return
-      # dummy constant.
-      self._feature_visual_words = tf.constant(-1, dtype=tf.int32)
-      if aggregation_config.use_regional_aggregation:
-        self._aggregated_descriptors = self._ComputeRvlad(
-            self._features,
-            self._num_features_per_region,
-            codebook,
-            use_l2_normalization=aggregation_config.use_l2_normalization,
-            num_assignments=aggregation_config.num_assignments)
-      else:
-        self._aggregated_descriptors = self._ComputeVlad(
-            self._features,
-            codebook,
-            use_l2_normalization=aggregation_config.use_l2_normalization,
-            num_assignments=aggregation_config.num_assignments)
-    elif (self._aggregation_type == _ASMK or
-          self._aggregation_type == _ASMK_STAR):
-      if aggregation_config.use_regional_aggregation:
-        (self._aggregated_descriptors,
-         self._feature_visual_words) = self._ComputeRasmk(
-             self._features,
-             self._num_features_per_region,
-             codebook,
-             num_assignments=aggregation_config.num_assignments)
-      else:
-        (self._aggregated_descriptors,
-         self._feature_visual_words) = self._ComputeAsmk(
-             self._features,
-             codebook,
-             num_assignments=aggregation_config.num_assignments)
-    else:
+    if self._aggregation_type not in [_VLAD, _ASMK, _ASMK_STAR]:
       raise ValueError("Invalid aggregation type: %d" % self._aggregation_type)
-    # Initialize variables in the TF graph.
-    sess.run(tf.compat.v1.global_variables_initializer())
+    # Load codebook
+    codebook = tf.Variable(
+        tf.zeros([self._codebook_size, self._feature_dimensionality],
+                 dtype=tf.float32),
+        name=_CLUSTER_CENTERS_VAR_NAME)
+    ckpt = tf.train.Checkpoint(codebook=codebook)
+    ckpt.restore(self._codebook_path)
+    self._codebook = codebook
   def Extract(self, features, num_features_per_region=None):
     """Extracts aggregated representation.
@@ -127,10 +89,13 @@ class ExtractAggregatedRepresentation(object):
     Raises:
       ValueError: If inputs are misconfigured.
     """
+    features = tf.cast(features, dtype=tf.float32)
     if num_features_per_region is None:
       # Use dummy value since it is unused.
       num_features_per_region = []
+    else:
+      num_features_per_region = tf.cast(num_features_per_region, dtype=tf.int32)
     if len(num_features_per_region
           ) and sum(num_features_per_region) != features.shape[0]:
       raise ValueError(
@@ -138,12 +103,41 @@ class ExtractAggregatedRepresentation(object):
           "features.shape[0] are different: %d vs %d" %
           (sum(num_features_per_region), features.shape[0]))
-    aggregated_descriptors, feature_visual_words = self._sess.run(
-        [self._aggregated_descriptors, self._feature_visual_words],
-        feed_dict={
-            self._features: features,
-            self._num_features_per_region: num_features_per_region
-        })
+    # Extract features based on desired options.
+    if self._aggregation_type == _VLAD:
+      # Feature visual words are unused in the case of VLAD, so just return
+      # dummy constant.
+      feature_visual_words = tf.constant(-1, dtype=tf.int32)
+      if self._use_regional_aggregation:
+        aggregated_descriptors = self._ComputeRvlad(
+            features,
+            num_features_per_region,
+            self._codebook,
+            use_l2_normalization=self._use_l2_normalization,
+            num_assignments=self._num_assignments)
+      else:
+        aggregated_descriptors = self._ComputeVlad(
+            features,
+            self._codebook,
+            use_l2_normalization=self._use_l2_normalization,
+            num_assignments=self._num_assignments)
+    elif (self._aggregation_type == _ASMK or
+          self._aggregation_type == _ASMK_STAR):
+      if self._use_regional_aggregation:
+        (aggregated_descriptors,
+         feature_visual_words) = self._ComputeRasmk(
+             features,
+             num_features_per_region,
+             self._codebook,
+             num_assignments=self._num_assignments)
+      else:
+        (aggregated_descriptors,
+         feature_visual_words) = self._ComputeAsmk(
+             features,
+             self._codebook,
+             num_assignments=self._num_assignments)
+    feature_visual_words_output = feature_visual_words.numpy()
     # If using ASMK*/RASMK*, binarize the aggregated descriptors.
     if self._aggregation_type == _ASMK_STAR:
@@ -151,9 +145,11 @@ class ExtractAggregatedRepresentation(object):
           aggregated_descriptors, [-1, self._feature_dimensionality])
       packed_descriptors = np.packbits(
           reshaped_aggregated_descriptors > 0, axis=1)
-      aggregated_descriptors = np.reshape(packed_descriptors, [-1])
+      aggregated_descriptors_output = np.reshape(packed_descriptors, [-1])
+    else:
+      aggregated_descriptors_output = aggregated_descriptors.numpy()
-    return aggregated_descriptors, feature_visual_words
+    return aggregated_descriptors_output, feature_visual_words_output
   def _ComputeVlad(self,
                    features,
@@ -268,11 +264,13 @@ class ExtractAggregatedRepresentation(object):
         output_vlad: VLAD descriptor updated to take into account contribution
           from ind-th feature.
       """
+      diff = tf.tile(
+          tf.expand_dims(features[ind],
+                         axis=0), [num_assignments, 1]) - tf.gather(
+                             codebook, selected_visual_words[ind])
       return ind + 1, tf.tensor_scatter_nd_add(
           vlad, tf.expand_dims(selected_visual_words[ind], axis=1),
-          tf.tile(
-              tf.expand_dims(features[ind], axis=0), [num_assignments, 1]) -
-          tf.gather(codebook, selected_visual_words[ind]))
+          tf.cast(diff, dtype=tf.float32))
     ind_vlad = tf.constant(0, dtype=tf.int32)
     keep_going = lambda j, vlad: tf.less(j, num_features)
@@ -398,7 +396,9 @@ class ExtractAggregatedRepresentation(object):
     visual_words = tf.reshape(
         tf.where(
-            tf.greater(per_centroid_norms, tf.sqrt(_NORM_SQUARED_TOLERANCE))),
+            tf.greater(
+                per_centroid_norms,
+                tf.cast(tf.sqrt(_NORM_SQUARED_TOLERANCE), dtype=tf.float32))),
         [-1])
     per_centroid_normalized_vector = tf.math.l2_normalize(
......
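
To make the scatter-add in the VLAD loop above concrete, here is a toy,
hedged illustration (shapes and values invented for this example): each
feature's residuals against its assigned centroids are accumulated into the
running VLAD tensor.

import tensorflow as tf

vlad = tf.zeros([5, 2])                  # [codebook_size, dim]
visual_words = tf.constant([[2], [4]])   # two assignments for one feature
residuals = tf.constant([[0.5, -0.5],    # feature - centroid, per assignment
                         [0.1, 0.2]])
vlad = tf.tensor_scatter_nd_add(vlad, visual_words, residuals)
# Rows 2 and 4 of `vlad` now hold the accumulated residuals.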
@@ -20,12 +20,15 @@ from __future__ import print_function
 import os
+from absl import flags
 import numpy as np
 import tensorflow as tf
 from delf import aggregation_config_pb2
 from delf import feature_aggregation_extractor
+FLAGS = flags.FLAGS
 class FeatureAggregationTest(tf.test.TestCase):
@@ -35,17 +38,15 @@ class FeatureAggregationTest(tf.test.TestCase):
     Args:
       checkpoint_path: Directory where codebook is saved to.
     """
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      codebook = tf.Variable(
-          [[0.5, 0.5], [0.0, 0.0], [1.0, 0.0], [-0.5, -0.5], [0.0, 1.0]],
-          name='clusters')
-      saver = tf.compat.v1.train.Saver([codebook])
-      sess.run(tf.compat.v1.global_variables_initializer())
-      saver.save(sess, checkpoint_path)
+    codebook = tf.Variable(
+        [[0.5, 0.5], [0.0, 0.0], [1.0, 0.0], [-0.5, -0.5], [0.0, 1.0]],
+        name='clusters',
+        dtype=tf.float32)
+    ckpt = tf.train.Checkpoint(codebook=codebook)
+    ckpt.write(checkpoint_path)
   def setUp(self):
-    self._codebook_path = os.path.join(tf.compat.v1.test.get_temp_dir(),
-                                       'test_codebook')
+    self._codebook_path = os.path.join(FLAGS.test_tmpdir, 'test_codebook')
     self._CreateCodebook(self._codebook_path)
   def testComputeNormalizedVladWorks(self):
@@ -61,10 +62,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.num_assignments = 1
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      vlad, extra_output = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    vlad, extra_output = extractor.Extract(features)
     # Define expected results.
     exp_vlad = [
@@ -90,10 +90,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.feature_batch_size = 2
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      vlad, extra_output = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    vlad, extra_output = extractor.Extract(features)
     # Define expected results.
     exp_vlad = [
@@ -118,10 +117,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.num_assignments = 1
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      vlad, extra_output = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    vlad, extra_output = extractor.Extract(features)
     # Define expected results.
     exp_vlad = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5, 0.5, 1.0, 1.0]
@@ -144,10 +142,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.num_assignments = 3
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      vlad, extra_output = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    vlad, extra_output = extractor.Extract(features)
     # Define expected results.
     exp_vlad = [1.0, 1.0, 0.0, 0.0, 0.0, 2.0, -0.5, 0.5, 0.0, 0.0]
@@ -168,10 +165,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.codebook_path = self._codebook_path
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      vlad, extra_output = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    vlad, extra_output = extractor.Extract(features)
     # Define expected results.
     exp_vlad = np.zeros([10], dtype=float)
@@ -197,10 +193,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      rvlad, extra_output = extractor.Extract(features, num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    rvlad, extra_output = extractor.Extract(features, num_features_per_region)
     # Define expected results.
     exp_rvlad = [
@@ -228,10 +223,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      rvlad, extra_output = extractor.Extract(features, num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    rvlad, extra_output = extractor.Extract(features, num_features_per_region)
     # Define expected results.
     exp_rvlad = [
@@ -256,10 +250,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      rvlad, extra_output = extractor.Extract(features, num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    rvlad, extra_output = extractor.Extract(features, num_features_per_region)
     # Define expected results.
     exp_rvlad = np.zeros([10], dtype=float)
@@ -286,10 +279,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      rvlad, extra_output = extractor.Extract(features, num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    rvlad, extra_output = extractor.Extract(features, num_features_per_region)
     # Define expected results.
     exp_rvlad = [
@@ -318,10 +310,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      rvlad, extra_output = extractor.Extract(features, num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    rvlad, extra_output = extractor.Extract(features, num_features_per_region)
     # Define expected results.
     exp_rvlad = [
@@ -349,14 +340,13 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      with self.assertRaisesRegex(
-          ValueError,
-          r'Incorrect arguments: sum\(num_features_per_region\) and '
-          r'features.shape\[0\] are different'):
-        extractor.Extract(features, num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    with self.assertRaisesRegex(
+        ValueError,
+        r'Incorrect arguments: sum\(num_features_per_region\) and '
+        r'features.shape\[0\] are different'):
+      extractor.Extract(features, num_features_per_region)
   def testComputeAsmkWorks(self):
     # Construct inputs.
@@ -370,10 +360,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.num_assignments = 1
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      asmk, visual_words = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    asmk, visual_words = extractor.Extract(features)
     # Define expected results.
     exp_asmk = [-0.707107, 0.707107, 0.707107, 0.707107]
@@ -395,10 +384,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.num_assignments = 1
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      asmk_star, visual_words = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    asmk_star, visual_words = extractor.Extract(features)
     # Define expected results.
     exp_asmk_star = [64, 192]
@@ -420,10 +408,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.num_assignments = 3
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      asmk, visual_words = extractor.Extract(features)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    asmk, visual_words = extractor.Extract(features)
     # Define expected results.
     exp_asmk = [0.707107, 0.707107, 0.0, 1.0, -0.707107, 0.707107]
@@ -448,10 +435,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      rasmk, visual_words = extractor.Extract(features, num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    rasmk, visual_words = extractor.Extract(features, num_features_per_region)
     # Define expected results.
     exp_rasmk = [-0.707107, 0.707107, 0.361261, 0.932465]
@@ -476,11 +462,10 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
-          sess, config)
-      rasmk_star, visual_words = extractor.Extract(features,
-                                                   num_features_per_region)
+    extractor = feature_aggregation_extractor.ExtractAggregatedRepresentation(
+        config)
+    rasmk_star, visual_words = extractor.Extract(features,
+                                                 num_features_per_region)
     # Define expected results.
     exp_rasmk_star = [64, 192]
@@ -500,10 +485,9 @@ class FeatureAggregationTest(tf.test.TestCase):
     config.use_regional_aggregation = True
     # Run tested function.
-    with tf.Graph().as_default() as g, self.session(graph=g) as sess:
-      with self.assertRaisesRegex(ValueError, 'Invalid aggregation type'):
-        feature_aggregation_extractor.ExtractAggregatedRepresentation(
-            sess, config)
+    with self.assertRaisesRegex(ValueError, 'Invalid aggregation type'):
+      feature_aggregation_extractor.ExtractAggregatedRepresentation(
+          config)
 if __name__ == '__main__':
......
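
The codebook round trip used in these tests and in the extractor pairs
`Checkpoint.write` with `Checkpoint.restore`; a minimal sketch of that
pattern (the path and shapes are illustrative, not from this PR):

import tensorflow as tf

# Writer side (as in the test's _CreateCodebook).
codebook = tf.Variable([[0.5, 0.5], [0.0, 0.0]], dtype=tf.float32)
tf.train.Checkpoint(codebook=codebook).write('/tmp/example_codebook')

# Reader side (as in ExtractAggregatedRepresentation.__init__): restore
# into a freshly created variable of the same shape.
restored = tf.Variable(tf.zeros([2, 2], dtype=tf.float32))
tf.train.Checkpoint(codebook=restored).restore('/tmp/example_codebook')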
@@ -19,10 +19,6 @@ from __future__ import print_function
 import tensorflow as tf
-from delf import delf_v1
-from object_detection.core import box_list
-from object_detection.core import box_list_ops
 def NormalizePixelValues(image,
                          pixel_value_offset=128.0,
@@ -81,219 +77,6 @@ def CalculateKeypointCenters(boxes):
       2.0)
def ExtractKeypointDescriptor(image, layer_name, image_scales, iou,
max_feature_num, abs_thres, model_fn):
"""Extract keypoint descriptor for input image.
Args:
image: A image tensor with shape [h, w, channels].
layer_name: The endpoint of feature extraction layer.
image_scales: A 1D float tensor which contains the scales.
iou: A float scalar denoting the IOU threshold for NMS.
max_feature_num: An int tensor denoting the maximum selected feature points.
abs_thres: A float tensor denoting the score threshold for feature
selection.
model_fn: Model function. Follows the signature:
* Args:
* `images`: Image tensor which is re-scaled.
* `normalized_image`: Whether or not the images are normalized.
* `reuse`: Whether or not the layer and its variables should be reused.
* Returns:
* `attention`: Attention score after the non-linearity.
* `feature_map`: Feature map obtained from the ResNet model.
Returns:
boxes: [N, 4] float tensor which denotes the selected receptive box. N is
the number of final feature points which pass through keypoint selection
and NMS steps.
feature_scales: [N] float tensor. It is the inverse of the input image
scales such that larger image scales correspond to larger image regions,
which is compatible with scale-space keypoint detection convention.
features: [N, depth] float tensor with feature descriptors.
scores: [N, 1] float tensor denoting the attention score.
Raises:
ValueError: If the layer_name is unsupported.
"""
original_image_shape_float = tf.gather(
tf.cast(tf.shape(image), dtype=tf.float32), [0, 1])
image_tensor = NormalizePixelValues(image)
image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')
# Feature depth and receptive field parameters for each network version.
if layer_name == 'resnet_v1_50/block3':
feature_depth = 1024
rf, stride, padding = [291.0, 32.0, 145.0]
elif layer_name == 'resnet_v1_50/block4':
feature_depth = 2048
rf, stride, padding = [483.0, 32.0, 241.0]
else:
raise ValueError('Unsupported layer_name.')
def _ProcessSingleScale(scale_index,
boxes,
features,
scales,
scores,
reuse=True):
"""Resize the image and run feature extraction and keypoint selection.
This function will be passed into tf.while_loop() and be called
repeatedly. The input boxes are collected from the previous iteration
[0: scale_index -1]. We get the current scale by
image_scales[scale_index], and run image resizing, feature extraction and
keypoint selection. Then we will get a new set of selected_boxes for
current scale. In the end, we concat the previous boxes with current
selected_boxes as the output.
Args:
scale_index: A valid index in the image_scales.
boxes: Box tensor with the shape of [N, 4].
features: Feature tensor with the shape of [N, depth].
scales: Scale tensor with the shape of [N].
scores: Attention score tensor with the shape of [N].
reuse: Whether or not the layer and its variables should be reused.
Returns:
scale_index: The next scale index for processing.
boxes: Concatenated box tensor with the shape of [K, 4]. K >= N.
features: Concatenated feature tensor with the shape of [K, depth].
scales: Concatenated scale tensor with the shape of [K].
scores: Concatenated attention score tensor with the shape of [K].
"""
scale = tf.gather(image_scales, scale_index)
new_image_size = tf.cast(
tf.round(original_image_shape_float * scale), dtype=tf.int32)
resized_image = tf.compat.v1.image.resize_bilinear(image_tensor,
new_image_size)
attention, feature_map = model_fn(
resized_image, normalized_image=True, reuse=reuse)
rf_boxes = CalculateReceptiveBoxes(
tf.shape(feature_map)[1],
tf.shape(feature_map)[2], rf, stride, padding)
# Re-project back to the original image space.
rf_boxes = tf.divide(rf_boxes, scale)
attention = tf.reshape(attention, [-1])
feature_map = tf.reshape(feature_map, [-1, feature_depth])
# Use attention score to select feature vectors.
indices = tf.reshape(tf.where(attention >= abs_thres), [-1])
selected_boxes = tf.gather(rf_boxes, indices)
selected_features = tf.gather(feature_map, indices)
selected_scores = tf.gather(attention, indices)
selected_scales = tf.ones_like(selected_scores, tf.float32) / scale
# Concat with the previous result from different scales.
boxes = tf.concat([boxes, selected_boxes], 0)
features = tf.concat([features, selected_features], 0)
scales = tf.concat([scales, selected_scales], 0)
scores = tf.concat([scores, selected_scores], 0)
return scale_index + 1, boxes, features, scales, scores
output_boxes = tf.zeros([0, 4], dtype=tf.float32)
output_features = tf.zeros([0, feature_depth], dtype=tf.float32)
output_scales = tf.zeros([0], dtype=tf.float32)
output_scores = tf.zeros([0], dtype=tf.float32)
# Process the first scale separately, the following scales will reuse the
# graph variables.
(_, output_boxes, output_features, output_scales,
output_scores) = _ProcessSingleScale(
0,
output_boxes,
output_features,
output_scales,
output_scores,
reuse=False)
i = tf.constant(1, dtype=tf.int32)
num_scales = tf.shape(image_scales)[0]
keep_going = lambda j, boxes, features, scales, scores: tf.less(j, num_scales)
(_, output_boxes, output_features, output_scales,
output_scores) = tf.while_loop(
cond=keep_going,
body=_ProcessSingleScale,
loop_vars=[
i, output_boxes, output_features, output_scales, output_scores
],
shape_invariants=[
i.get_shape(),
tf.TensorShape([None, 4]),
tf.TensorShape([None, feature_depth]),
tf.TensorShape([None]),
tf.TensorShape([None])
],
back_prop=False)
feature_boxes = box_list.BoxList(output_boxes)
feature_boxes.add_field('features', output_features)
feature_boxes.add_field('scales', output_scales)
feature_boxes.add_field('scores', output_scores)
nms_max_boxes = tf.minimum(max_feature_num, feature_boxes.num_boxes())
final_boxes = box_list_ops.non_max_suppression(feature_boxes, iou,
nms_max_boxes)
return (final_boxes.get(), final_boxes.get_field('scales'),
final_boxes.get_field('features'),
tf.expand_dims(final_boxes.get_field('scores'), 1))
def BuildModel(layer_name, attention_nonlinear, attention_type,
attention_kernel_size):
"""Build the DELF model.
This function is helpful for constructing the model function which will be fed
to ExtractKeypointDescriptor().
Args:
layer_name: the endpoint of feature extraction layer.
attention_nonlinear: Type of the non-linearity for the attention function.
Currently, only 'softplus' is supported.
attention_type: Type of the attention used. Options are:
'use_l2_normalized_feature' and 'use_default_input_feature'. Note that
this is irrelevant during inference time.
attention_kernel_size: Size of attention kernel (kernel is square).
Returns:
Attention model function.
"""
def _ModelFn(images, normalized_image, reuse):
"""Attention model to get feature map and attention score map.
Args:
images: Image tensor.
normalized_image: Whether or not the images are normalized.
reuse: Whether or not the layer and its variables should be reused.
Returns:
attention: Attention score after the non-linearity.
feature_map: Feature map after ResNet convolution.
"""
if normalized_image:
image_tensor = images
else:
image_tensor = NormalizePixelValues(images)
# Extract features and attention scores.
model = delf_v1.DelfV1(layer_name)
_, attention, _, feature_map, _ = model.GetAttentionPrelogit(
image_tensor,
attention_nonlinear=attention_nonlinear,
attention_type=attention_type,
kernel=[attention_kernel_size, attention_kernel_size],
training_resnet=False,
training_attention=False,
reuse=reuse)
return attention, feature_map
return _ModelFn
 def ApplyPcaAndWhitening(data,
                          pca_matrix,
                          pca_mean,
@@ -345,22 +128,21 @@ def PostProcessDescriptors(descriptors, use_pca, pca_parameters=None):
     normalization and (possibly) PCA/whitening.
   """
   # L2-normalize, and if desired apply PCA (followed by L2-normalization).
-  with tf.compat.v1.variable_scope('postprocess'):
-    final_descriptors = tf.nn.l2_normalize(
-        descriptors, axis=1, name='l2_normalization')
-    if use_pca:
-      # Apply PCA, and whitening if desired.
-      final_descriptors = ApplyPcaAndWhitening(final_descriptors,
-                                               pca_parameters['matrix'],
-                                               pca_parameters['mean'],
-                                               pca_parameters['dim'],
-                                               pca_parameters['use_whitening'],
-                                               pca_parameters['variances'])
-      # Re-normalize.
-      final_descriptors = tf.nn.l2_normalize(
-          final_descriptors, axis=1, name='pca_l2_normalization')
+  final_descriptors = tf.nn.l2_normalize(
+      descriptors, axis=1, name='l2_normalization')
+  if use_pca:
+    # Apply PCA, and whitening if desired.
+    final_descriptors = ApplyPcaAndWhitening(final_descriptors,
+                                             pca_parameters['matrix'],
+                                             pca_parameters['mean'],
+                                             pca_parameters['dim'],
+                                             pca_parameters['use_whitening'],
+                                             pca_parameters['variances'])
+    # Re-normalize.
+    final_descriptors = tf.nn.l2_normalize(
+        final_descriptors, axis=1, name='pca_l2_normalization')
   return final_descriptors
...
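A minimal usage sketch of the TF2 version of PostProcessDescriptors, with placeholder shapes and PCA parameters; the dictionary keys are exactly the ones the function reads above, while the identity matrix, zero mean, and unit variances are illustrative stand-ins only:

import tensorflow as tf

from delf import feature_extractor

descriptors = tf.random.normal([100, 40])
pca_parameters = {
    'matrix': tf.eye(40),        # placeholder projection matrix
    'mean': tf.zeros([1, 40]),   # placeholder PCA mean
    'dim': 40,
    'use_whitening': False,
    'variances': tf.ones([1, 40]),
}
final = feature_extractor.PostProcessDescriptors(
    descriptors, use_pca=True, pca_parameters=pca_parameters)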
...
@@ -18,7 +18,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

-import numpy as np
import tensorflow as tf

from delf import feature_extractor
...
@@ -34,78 +33,24 @@ class FeatureExtractorTest(tf.test.TestCase):
        image, pixel_value_offset=5.0, pixel_value_scale=2.0)
    exp_normalized_image = [[[-1.0, 125.0, -2.5], [14.5, 3.5, 0.0]],
                            [[20.0, 0.0, 30.0], [25.5, 36.0, 42.0]]]
-    with self.session() as sess:
-      normalized_image_out = sess.run(normalized_image)
-      self.assertAllEqual(normalized_image_out, exp_normalized_image)
+    self.assertAllEqual(normalized_image, exp_normalized_image)

  def testCalculateReceptiveBoxes(self):
    boxes = feature_extractor.CalculateReceptiveBoxes(
        height=1, width=2, rf=291, stride=32, padding=145)
    exp_boxes = [[-145., -145., 145., 145.], [-145., -113., 145., 177.]]
-    with self.session() as sess:
-      boxes_out = sess.run(boxes)
-      self.assertAllEqual(exp_boxes, boxes_out)
+    self.assertAllEqual(exp_boxes, boxes)

  def testCalculateKeypointCenters(self):
    boxes = [[-10.0, 0.0, 11.0, 21.0], [-2.5, 5.0, 18.5, 26.0],
             [45.0, -2.5, 66.0, 18.5]]
    centers = feature_extractor.CalculateKeypointCenters(boxes)
-    with self.session() as sess:
-      centers_out = sess.run(centers)
    exp_centers = [[0.5, 10.5], [8.0, 15.5], [55.5, 8.0]]
-    self.assertAllEqual(exp_centers, centers_out)
+    self.assertAllEqual(exp_centers, centers)

-  def testExtractKeypointDescriptor(self):
-    image = tf.constant(
-        [[[0, 255, 255], [128, 64, 196]], [[0, 0, 32], [32, 128, 16]]],
-        dtype=tf.uint8)
-
-    # Arbitrary model function used to test ExtractKeypointDescriptor. The
-    # generated feature_map is a replicated version of the image, concatenated
-    # with zeros to achieve the required dimensionality. The attention is simply
-    # the norm of the input image pixels.
-    def _test_model_fn(image, normalized_image, reuse):
-      del normalized_image, reuse  # Unused variables in the test.
-      image_shape = tf.shape(image)
-      attention = tf.squeeze(tf.norm(image, axis=3))
-      feature_map = tf.concat([
-          tf.tile(image, [1, 1, 1, 341]),
-          tf.zeros([1, image_shape[1], image_shape[2], 1])
-      ],
-                              axis=3)
-      return attention, feature_map
-
-    boxes, feature_scales, features, scores = (
-        feature_extractor.ExtractKeypointDescriptor(
-            image,
-            layer_name='resnet_v1_50/block3',
-            image_scales=tf.constant([1.0]),
-            iou=1.0,
-            max_feature_num=10,
-            abs_thres=1.5,
-            model_fn=_test_model_fn))
-
-    exp_boxes = [[-145.0, -145.0, 145.0, 145.0], [-113.0, -145.0, 177.0, 145.0]]
-    exp_feature_scales = [1.0, 1.0]
-    exp_features = np.array(
-        np.concatenate(
-            (np.tile([[-1.0, 127.0 / 128.0, 127.0 / 128.0], [-1.0, -1.0, -0.75]
-                     ], [1, 341]), np.zeros([2, 1])),
-            axis=1))
-    exp_scores = [[1.723042], [1.600781]]
-
-    with self.session() as sess:
-      boxes_out, feature_scales_out, features_out, scores_out = sess.run(
-          [boxes, feature_scales, features, scores])
-      self.assertAllEqual(exp_boxes, boxes_out)
-      self.assertAllEqual(exp_feature_scales, feature_scales_out)
-      self.assertAllClose(exp_features, features_out)
-      self.assertAllClose(exp_scores, scores_out)

  def testPcaWhitening(self):
    data = tf.constant([[1.0, 2.0, -2.0], [-5.0, 0.0, 3.0], [-1.0, 2.0, 0.0],
...
@@ -123,12 +68,8 @@ class FeatureExtractorTest(tf.test.TestCase):
    exp_output = [[2.5, -5.0], [-6.0, -2.0], [-0.5, -3.0], [1.0, -2.0]]
-    with self.session() as sess:
-      output_out = sess.run(output)
-      self.assertAllEqual(exp_output, output_out)
+    self.assertAllEqual(exp_output, output)


if __name__ == '__main__':
-  tf.compat.v1.disable_eager_execution()
  tf.test.main()
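The hunks above are the essence of the TF1-to-TF2 test migration: with eager execution on by default, `tf.test.TestCase` assertions take tensors directly, so the session plumbing disappears, as does `tf.compat.v1.disable_eager_execution()`. A self-contained illustration of the new style (not part of the commit):

import tensorflow as tf


class EagerStyleTest(tf.test.TestCase):

  def testDirectTensorComparison(self):
    centers = tf.constant([[0.5, 10.5], [8.0, 15.5]])
    # Under eager execution, assertAllEqual converts the tensor to a numpy
    # array itself; no session or sess.run() call is required.
    self.assertAllEqual([[0.5, 10.5], [8.0, 15.5]], centers)


if __name__ == '__main__':
  tf.test.main()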
...
@@ -20,11 +20,14 @@ from __future__ import print_function
import os

+from absl import flags
import numpy as np
import tensorflow as tf

from delf import feature_io

+FLAGS = flags.FLAGS
def create_data():
  """Creates data to be used in tests.
...
@@ -81,8 +84,7 @@ class DelfFeaturesIoTest(tf.test.TestCase):
  def testWriteAndReadToFile(self):
    locations, scales, descriptors, attention, orientations = create_data()
-    tmpdir = tf.compat.v1.test.get_temp_dir()
-    filename = os.path.join(tmpdir, 'test.delf')
+    filename = os.path.join(FLAGS.test_tmpdir, 'test.delf')
    feature_io.WriteToFile(filename, locations, scales, descriptors, attention,
                           orientations)
    data_read = feature_io.ReadFromFile(filename)
...
@@ -94,8 +96,7 @@ class DelfFeaturesIoTest(tf.test.TestCase):
    self.assertAllEqual(orientations, data_read[4])

  def testWriteAndReadToFileEmptyFile(self):
-    tmpdir = tf.compat.v1.test.get_temp_dir()
-    filename = os.path.join(tmpdir, 'test.delf')
+    filename = os.path.join(FLAGS.test_tmpdir, 'test.delf')
    feature_io.WriteToFile(filename, np.array([]), np.array([]), np.array([]),
                           np.array([]), np.array([]))
    data_read = feature_io.ReadFromFile(filename)
...
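The same migration applies to temporary directories: `FLAGS.test_tmpdir` is registered by the absl-based runner behind `tf.test.main()`, so it replaces `tf.compat.v1.test.get_temp_dir()` once flags have been parsed. A minimal sketch of the pattern, assuming the test is launched through `tf.test.main()` like those above:

import os

from absl import flags
import tensorflow as tf

FLAGS = flags.FLAGS


class TmpdirTest(tf.test.TestCase):

  def testTmpdirIsWritable(self):
    # FLAGS.test_tmpdir plays the role that tf.compat.v1.test.get_temp_dir()
    # played in the TF1 versions of these tests.
    filename = os.path.join(FLAGS.test_tmpdir, 'scratch.txt')
    with tf.io.gfile.GFile(filename, 'w') as f:
      f.write('ok')
    self.assertTrue(tf.io.gfile.exists(filename))


if __name__ == '__main__':
  tf.test.main()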
...
@@ -27,6 +27,15 @@ import functools
import tensorflow as tf

+class _GoogleLandmarksInfo(object):
+  """Metadata about the Google Landmarks dataset."""
+  num_classes = {
+      'gld_v1': 14951,
+      'gld_v2': 203094,
+      'gld_v2_clean': 81313
+  }

class _DataAugmentationParams(object):
  """Default parameters for augmentation."""
  # The following are used for training.
...
@@ -167,3 +176,12 @@ def CreateDataset(file_pattern,
  dataset = dataset.batch(batch_size)

  return dataset

+def GoogleLandmarksInfo():
+  """Returns metadata information on the Google Landmarks dataset.
+
+  Returns:
+    object _GoogleLandmarksInfo containing metadata about the GLD dataset.
+  """
+  return _GoogleLandmarksInfo()
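With this in place, callers can look the class count up from the dataset metadata instead of hard-coding it; for example (assuming the module is imported as `gld`, as the training script below does):

num_classes = gld.GoogleLandmarksInfo().num_classes['gld_v2_clean']
# num_classes == 81313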
...
@@ -43,6 +43,10 @@ flags.DEFINE_string('train_file_pattern', '/tmp/data/train*',
                    'File pattern of training dataset files.')
flags.DEFINE_string('validation_file_pattern', '/tmp/data/validation*',
                    'File pattern of validation dataset files.')
+flags.DEFINE_enum('dataset_version', 'gld_v1',
+                  ['gld_v1', 'gld_v2', 'gld_v2_clean'],
+                  'Google Landmarks dataset version, used to determine the '
+                  'number of classes.')
flags.DEFINE_integer('seed', 0, 'Seed to training dataset.')
flags.DEFINE_float('initial_lr', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('batch_size', 32, 'Global batch size.')
...
@@ -136,9 +140,9 @@ def main(argv):
  save_interval = 1
  report_interval = 1

-  # TODO(andrearaujo): Using placeholder, replace with actual value using
-  # GoogleLandmarksInfo() from datasets/googlelandmarks.py.
-  num_classes = 14951
+  # Determine the number of classes based on the version of the dataset.
+  gld_info = gld.GoogleLandmarksInfo()
+  num_classes = gld_info.num_classes[FLAGS.dataset_version]

  # ------------------------------------------------------------
  # Create the distributed train/validation sets.
...
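Putting the new flag together with the existing ones, a training launch might look like the following; the script name is illustrative, while the file patterns are simply the flag defaults above:

python3 train.py \
  --train_file_pattern=/tmp/data/train* \
  --validation_file_pattern=/tmp/data/validation* \
  --dataset_version=gld_v2_clean \
  --batch_size=32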