Commit 5a2cf36f authored by Kaushik Shivakumar

Merge remote-tracking branch 'upstream/master' into newavarecords

parents 258ddfc3 a829e648
@@ -34,20 +34,21 @@ def char_accuracy(predictions, targets, rej_char, streaming=False):
an update op for execution and a value tensor whose value on evaluation
returns the total character accuracy.
"""
with tf.variable_scope('CharAccuracy'):
with tf.compat.v1.variable_scope('CharAccuracy'):
predictions.get_shape().assert_is_compatible_with(targets.get_shape())
targets = tf.to_int32(targets)
targets = tf.cast(targets, dtype=tf.int32)
const_rej_char = tf.constant(rej_char, shape=targets.get_shape())
weights = tf.to_float(tf.not_equal(targets, const_rej_char))
correct_chars = tf.to_float(tf.equal(predictions, targets))
accuracy_per_example = tf.div(
tf.reduce_sum(tf.multiply(correct_chars, weights), 1),
tf.reduce_sum(weights, 1))
weights = tf.cast(tf.not_equal(targets, const_rej_char), dtype=tf.float32)
correct_chars = tf.cast(tf.equal(predictions, targets), dtype=tf.float32)
accuracy_per_example = tf.compat.v1.div(
tf.reduce_sum(input_tensor=tf.multiply(
correct_chars, weights), axis=1),
tf.reduce_sum(input_tensor=weights, axis=1))
if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example)
else:
return tf.reduce_mean(accuracy_per_example)
return tf.reduce_mean(input_tensor=accuracy_per_example)
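# Illustration (a minimal numpy sketch, not part of the commit) of the masked
# average char_accuracy computes: positions equal to rej_char are excluded
# from the per-example mean via the weights mask.
import numpy as np
predictions = np.array([[1, 2, 3], [4, 5, 6]])
targets = np.array([[1, 2, 0], [4, 0, 0]])  # here 0 plays the role of rej_char
weights = (targets != 0).astype(np.float32)
correct = (predictions == targets).astype(np.float32)
per_example = (correct * weights).sum(axis=1) / weights.sum(axis=1)
print(per_example.mean())  # 1.0: every non-rejected character matches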
def sequence_accuracy(predictions, targets, rej_char, streaming=False):
@@ -66,25 +67,26 @@ def sequence_accuracy(predictions, targets, rej_char, streaming=False):
returns the total sequence accuracy.
"""
with tf.variable_scope('SequenceAccuracy'):
with tf.compat.v1.variable_scope('SequenceAccuracy'):
predictions.get_shape().assert_is_compatible_with(targets.get_shape())
targets = tf.to_int32(targets)
targets = tf.cast(targets, dtype=tf.int32)
const_rej_char = tf.constant(
rej_char, shape=targets.get_shape(), dtype=tf.int32)
include_mask = tf.not_equal(targets, const_rej_char)
include_predictions = tf.to_int32(
tf.where(include_mask, predictions,
tf.zeros_like(predictions) + rej_char))
correct_chars = tf.to_float(tf.equal(include_predictions, targets))
include_predictions = tf.cast(
tf.compat.v1.where(include_mask, predictions,
tf.zeros_like(predictions) + rej_char), dtype=tf.int32)
correct_chars = tf.cast(
tf.equal(include_predictions, targets), dtype=tf.float32)
correct_chars_counts = tf.cast(
tf.reduce_sum(correct_chars, reduction_indices=[1]), dtype=tf.int32)
tf.reduce_sum(input_tensor=correct_chars, axis=[1]), dtype=tf.int32)
target_length = targets.get_shape().dims[1].value
target_chars_counts = tf.constant(
target_length, shape=correct_chars_counts.get_shape())
accuracy_per_example = tf.to_float(
tf.equal(correct_chars_counts, target_chars_counts))
accuracy_per_example = tf.cast(
tf.equal(correct_chars_counts, target_chars_counts), dtype=tf.float32)
if streaming:
return tf.contrib.metrics.streaming_mean(accuracy_per_example)
else:
return tf.reduce_mean(accuracy_per_example)
return tf.reduce_mean(input_tensor=accuracy_per_example)
@@ -38,8 +38,8 @@ class AccuracyTest(tf.test.TestCase):
A session object that should be used as a context manager.
"""
with self.cached_session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.compat.v1.local_variables_initializer())
yield sess
def _fake_labels(self):
@@ -55,7 +55,7 @@ class AccuracyTest(tf.test.TestCase):
return incorrect
def test_sequence_accuracy_identical_samples(self):
labels_tf = tf.convert_to_tensor(self._fake_labels())
labels_tf = tf.convert_to_tensor(value=self._fake_labels())
accuracy_tf = metrics.sequence_accuracy(labels_tf, labels_tf,
self.rej_char)
@@ -66,9 +66,9 @@ class AccuracyTest(tf.test.TestCase):
def test_sequence_accuracy_one_char_difference(self):
ground_truth_np = self._fake_labels()
ground_truth_tf = tf.convert_to_tensor(ground_truth_np)
ground_truth_tf = tf.convert_to_tensor(value=ground_truth_np)
prediction_tf = tf.convert_to_tensor(
self._incorrect_copy(ground_truth_np, bad_indexes=((0, 0))))
value=self._incorrect_copy(ground_truth_np, bad_indexes=((0, 0))))
accuracy_tf = metrics.sequence_accuracy(prediction_tf, ground_truth_tf,
self.rej_char)
@@ -80,9 +80,9 @@ class AccuracyTest(tf.test.TestCase):
def test_char_accuracy_one_char_difference_with_padding(self):
ground_truth_np = self._fake_labels()
ground_truth_tf = tf.convert_to_tensor(ground_truth_np)
ground_truth_tf = tf.convert_to_tensor(value=ground_truth_np)
prediction_tf = tf.convert_to_tensor(
self._incorrect_copy(ground_truth_np, bad_indexes=((0, 0))))
value=self._incorrect_copy(ground_truth_np, bad_indexes=((0, 0))))
accuracy_tf = metrics.char_accuracy(prediction_tf, ground_truth_tf,
self.rej_char)
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to build the Attention OCR model.
Usage example:
@@ -26,6 +25,7 @@ Usage example:
import sys
import collections
import logging
import numpy as np
import tensorflow as tf
from tensorflow.contrib import slim
from tensorflow.contrib.slim.nets import inception
@@ -35,29 +35,28 @@ import sequence_layers
import utils
OutputEndpoints = collections.namedtuple('OutputEndpoints', [
'chars_logit', 'chars_log_prob', 'predicted_chars', 'predicted_scores',
'predicted_text'
'chars_logit', 'chars_log_prob', 'predicted_chars', 'predicted_scores',
'predicted_text', 'predicted_length', 'predicted_conf',
'normalized_seq_conf'
])
# TODO(gorban): replace with tf.HParams when it is released.
ModelParams = collections.namedtuple('ModelParams', [
'num_char_classes', 'seq_length', 'num_views', 'null_code'
])
ModelParams = collections.namedtuple(
'ModelParams', ['num_char_classes', 'seq_length', 'num_views', 'null_code'])
ConvTowerParams = collections.namedtuple('ConvTowerParams', ['final_endpoint'])
SequenceLogitsParams = collections.namedtuple('SequenceLogitsParams', [
'use_attention', 'use_autoregression', 'num_lstm_units', 'weight_decay',
'lstm_state_clip_value'
'use_attention', 'use_autoregression', 'num_lstm_units', 'weight_decay',
'lstm_state_clip_value'
])
SequenceLossParams = collections.namedtuple('SequenceLossParams', [
'label_smoothing', 'ignore_nulls', 'average_across_timesteps'
])
SequenceLossParams = collections.namedtuple(
'SequenceLossParams',
['label_smoothing', 'ignore_nulls', 'average_across_timesteps'])
EncodeCoordinatesParams = collections.namedtuple('EncodeCoordinatesParams', [
'enabled'
])
EncodeCoordinatesParams = collections.namedtuple('EncodeCoordinatesParams',
['enabled'])
def _dict_to_array(id_to_char, default_character):
@@ -85,16 +84,16 @@ class CharsetMapper(object):
"""
mapping_strings = tf.constant(_dict_to_array(charset, default_character))
self.table = tf.contrib.lookup.index_to_string_table_from_tensor(
mapping=mapping_strings, default_value=default_character)
mapping=mapping_strings, default_value=default_character)
def get_text(self, ids):
"""Returns a string corresponding to a sequence of character ids.
Args:
ids: a tensor with shape [batch_size, max_sequence_length]
"""
return tf.reduce_join(
self.table.lookup(tf.to_int64(ids)), reduction_indices=1)
"""
return tf.strings.reduce_join(
inputs=self.table.lookup(tf.cast(ids, dtype=tf.int64)), axis=1)
def get_softmax_loss_fn(label_smoothing):
@@ -111,16 +110,153 @@ def get_softmax_loss_fn(label_smoothing):
def loss_fn(labels, logits):
return (tf.nn.softmax_cross_entropy_with_logits(
logits=logits, labels=labels))
logits=logits, labels=tf.stop_gradient(labels)))
else:
def loss_fn(labels, logits):
return tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=logits, labels=labels)
logits=logits, labels=labels)
return loss_fn
def get_tensor_dimensions(tensor):
"""Returns the shape components of a 4D tensor with variable batch size.
Args:
tensor : A 4D tensor, whose last 3 dimensions are known at graph
construction time.
Returns:
batch_size : The first dimension as a tensor object.
height : The second dimension as a scalar value.
width : The third dimension as a scalar value.
num_features : The fourth dimension as a scalar value.
Raises:
ValueError: if input tensor does not have 4 dimensions.
"""
if len(tensor.get_shape().dims) != 4:
raise ValueError(
'Incompatible shape: len(tensor.get_shape().dims) != 4 (%d != 4)' %
len(tensor.get_shape().dims))
batch_size = tf.shape(input=tensor)[0]
height = tensor.get_shape().dims[1].value
width = tensor.get_shape().dims[2].value
num_features = tensor.get_shape().dims[3].value
return batch_size, height, width, num_features
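# Illustration (a sketch, assuming a TF1-style graph as this codebase uses):
# with an unknown batch dimension, only the first component comes back as a
# tensor; the remaining three are plain Python ints.
import tensorflow as tf
x = tf.compat.v1.placeholder(tf.float32, shape=(None, 4, 8, 16))
b, h, w, c = get_tensor_dimensions(x)  # b is a Tensor; h, w, c are 4, 8, 16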
def lookup_indexed_value(indices, row_vecs):
"""Lookup values in each row of 'row_vecs' indexed by 'indices'.
For each sample in the batch, look up the element for the corresponding
index.
Args:
indices : A tensor of shape (batch, )
row_vecs : A tensor of shape [batch, depth]
Returns:
A tensor of shape (batch, ) formed by row_vecs[i, indices[i]].
"""
gather_indices = tf.stack((tf.range(
tf.shape(input=row_vecs)[0], dtype=tf.int32), tf.cast(indices, tf.int32)),
axis=1)
return tf.gather_nd(row_vecs, gather_indices)
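# Illustration (a minimal sketch, assuming eager execution) of the row-wise
# lookup above: element i of the result is row_vecs[i, indices[i]].
import tensorflow as tf
row_vecs = tf.constant([[0.1, 0.9], [0.7, 0.3]])
indices = tf.constant([1, 0])
rows = tf.range(tf.shape(row_vecs)[0], dtype=tf.int32)
print(tf.gather_nd(row_vecs, tf.stack([rows, indices], axis=1)).numpy())
# [0.9 0.7]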
@utils.ConvertAllInputsToTensors
def max_char_logprob_cumsum(char_log_prob):
"""Computes the cumulative sum of character logprob for all sequence lengths.
Args:
char_log_prob: A tensor of shape [batch x seq_length x num_char_classes]
with log probabilities of a character.
Returns:
A tensor of shape [batch x (seq_length+1)] where each element x[_, j] is
the sum of the max char logprob for all positions up to j.
Note this duplicates the final column and produces (seq_length+1) columns
so the same function can be used regardless of whether
use_length_predictions is true or false.
"""
max_char_log_prob = tf.reduce_max(input_tensor=char_log_prob, axis=2)
# For an input array [a, b, c]) tf.cumsum returns [a, a + b, a + b + c] if
# exclusive set to False (default).
return tf.cumsum(max_char_log_prob, axis=1, exclusive=False)
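# Illustration (a toy [1 x 2 x 2] input, assuming eager execution): the max
# logprob per position is log 0.9 and log 0.8, so the cumulative sums are
# log 0.9 and log 0.9 + log 0.8.
import tensorflow as tf
char_log_prob = tf.math.log(tf.constant([[[0.9, 0.1], [0.2, 0.8]]]))
max_lp = tf.reduce_max(char_log_prob, axis=2)
print(tf.cumsum(max_lp, axis=1).numpy())  # approx. [[-0.105 -0.329]]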
def find_length_by_null(predicted_chars, null_code):
"""Determine sequence length by finding null_code among predicted char IDs.
Given the char class ID for each position, compute the sequence length.
Note that this function computes the length based on the number of
null_code occurrences, not the position of the first null_code.
Args:
predicted_chars: A tensor of [batch x seq_length] where each element stores
the char class ID with max probability;
null_code: an int32, character id for the NULL.
Returns:
A [batch, ] tensor which stores the sequence length for each sample.
"""
return tf.reduce_sum(
input_tensor=tf.cast(tf.not_equal(null_code, predicted_chars), tf.int32), axis=1)
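# Illustration (a sketch with null_code=0, assuming eager execution): the
# length is the count of non-null predictions, so [1, 0, 2, 2] yields 3 even
# though the first null appears at position 1.
import tensorflow as tf
predicted = tf.constant([[3, 5, 0, 0], [1, 0, 2, 2]])
print(find_length_by_null(predicted, null_code=0).numpy())  # [2 3]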
def axis_pad(tensor, axis, before=0, after=0, constant_values=0.0):
"""Pad a tensor with the specified values along a single axis.
Args:
tensor: a Tensor;
axis: the dimension to pad along;
before: number of values to add before the contents of tensor in the
selected dimension;
after: number of values to add after the contents of tensor in the selected
dimension;
constant_values: the scalar pad value to use. Must be same type as tensor.
Returns:
A Tensor. Has the same type as the input tensor, but with a changed shape
along the specified dimension.
"""
if before == 0 and after == 0:
return tensor
ndims = tensor.shape.ndims
padding_size = np.zeros((ndims, 2), dtype='int32')
padding_size[axis] = before, after
return tf.pad(
tensor=tensor,
paddings=tf.constant(padding_size),
constant_values=constant_values)
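# Illustration (a minimal sketch, assuming eager execution): pad one column
# of -1s after a 2x2 tensor along axis=1.
import tensorflow as tf
x = tf.ones((2, 2))
print(axis_pad(x, axis=1, after=1, constant_values=-1.0).numpy())
# [[ 1.  1. -1.]
#  [ 1.  1. -1.]]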
def null_based_length_prediction(chars_log_prob, null_code):
"""Computes length and confidence of prediction based on positions of NULLs.
Args:
chars_log_prob: A tensor of shape [batch x seq_length x num_char_classes]
with log probabilities of a character;
null_code: an int32, character id for the NULL.
Returns:
A tuple (text_log_prob, predicted_length), where
text_log_prob is a tensor of shape [batch x (seq_length+1)].
Element #0 of the output corresponds to the probability of the empty
string; element #seq_length is the probability of length=seq_length.
predicted_length is a tensor with shape [batch].
"""
predicted_chars = tf.cast(
tf.argmax(input=chars_log_prob, axis=2), dtype=tf.int32)
# We do right pad to support sequences with seq_length elements.
text_log_prob = max_char_logprob_cumsum(
axis_pad(chars_log_prob, axis=1, after=1))
predicted_length = find_length_by_null(predicted_chars, null_code)
return text_log_prob, predicted_length
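# Illustration (hypothetical values, assuming eager execution) of how these
# outputs combine in create_base below: indexing the cumulative logprob at
# the predicted length scores the decoded string. With probabilities
# [[0.1, 0.9], [0.8, 0.2]] and null_code=0, the argmax string is one char
# long and its confidence is log 0.9 + log 0.8 = log 0.72.
import tensorflow as tf
chars_log_prob = tf.math.log(tf.constant([[[0.1, 0.9], [0.8, 0.2]]]))
text_log_prob, length = null_based_length_prediction(chars_log_prob, 0)
conf = lookup_indexed_value(length, text_log_prob)  # approx. -0.329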
class Model(object):
"""Class to create the Attention OCR Model."""
@@ -137,24 +273,24 @@ class Model(object):
num_char_classes: size of character set.
seq_length: number of characters in a sequence.
num_views: Number of views (conv towers) to use.
null_code: A character code corresponding to a character which
indicates end of a sequence.
mparams: a dictionary with hyper parameters for methods, keys -
function names, values - corresponding namedtuples.
null_code: A character code corresponding to a character which indicates
end of a sequence.
mparams: a dictionary with hyper parameters for methods, keys - function
names, values - corresponding namedtuples.
charset: an optional dictionary with a mapping between character ids and
utf8 strings. If specified the OutputEndpoints.predicted_text will
utf8 encoded strings corresponding to the character ids returned by
utf8 strings. If specified, the OutputEndpoints.predicted_text will contain
utf8 encoded strings corresponding to the character ids returned by
OutputEndpoints.predicted_chars (by default the predicted_text contains
an empty vector).
an empty vector).
NOTE: Make sure you call tf.tables_initializer().run() if the charset is
specified.
specified.
"""
super(Model, self).__init__()
self._params = ModelParams(
num_char_classes=num_char_classes,
seq_length=seq_length,
num_views=num_views,
null_code=null_code)
num_char_classes=num_char_classes,
seq_length=seq_length,
num_views=num_views,
null_code=null_code)
self._mparams = self.default_mparams()
if mparams:
self._mparams.update(mparams)
@@ -162,21 +298,22 @@ class Model(object):
def default_mparams(self):
return {
'conv_tower_fn':
ConvTowerParams(final_endpoint='Mixed_5d'),
'sequence_logit_fn':
SequenceLogitsParams(
use_attention=True,
use_autoregression=True,
num_lstm_units=256,
weight_decay=0.00004,
lstm_state_clip_value=10.0),
'sequence_loss_fn':
SequenceLossParams(
label_smoothing=0.1,
ignore_nulls=True,
average_across_timesteps=False),
'encode_coordinates_fn': EncodeCoordinatesParams(enabled=False)
'conv_tower_fn':
ConvTowerParams(final_endpoint='Mixed_5d'),
'sequence_logit_fn':
SequenceLogitsParams(
use_attention=True,
use_autoregression=True,
num_lstm_units=256,
weight_decay=0.00004,
lstm_state_clip_value=10.0),
'sequence_loss_fn':
SequenceLossParams(
label_smoothing=0.1,
ignore_nulls=True,
average_across_timesteps=False),
'encode_coordinates_fn':
EncodeCoordinatesParams(enabled=False)
}
def set_mparam(self, function, **kwargs):
@@ -198,14 +335,14 @@ class Model(object):
"""
mparams = self._mparams['conv_tower_fn']
logging.debug('Using final_endpoint=%s', mparams.final_endpoint)
with tf.variable_scope('conv_tower_fn/INCE'):
with tf.compat.v1.variable_scope('conv_tower_fn/INCE'):
if reuse:
tf.get_variable_scope().reuse_variables()
tf.compat.v1.get_variable_scope().reuse_variables()
with slim.arg_scope(inception.inception_v3_arg_scope()):
with slim.arg_scope([slim.batch_norm, slim.dropout],
is_training=is_training):
net, _ = inception.inception_v3_base(
images, final_endpoint=mparams.final_endpoint)
images, final_endpoint=mparams.final_endpoint)
return net
def _create_lstm_inputs(self, net):
@@ -222,10 +359,10 @@ class Model(object):
"""
num_features = net.get_shape().dims[1].value
if num_features < self._params.seq_length:
raise AssertionError('Incorrect dimension #1 of input tensor'
' %d should be bigger than %d (shape=%s)' %
(num_features, self._params.seq_length,
net.get_shape()))
raise AssertionError(
'Incorrect dimension #1 of input tensor'
' %d should be bigger than %d (shape=%s)' %
(num_features, self._params.seq_length, net.get_shape()))
elif num_features > self._params.seq_length:
logging.warning('Ignoring some features: use %d of %d (shape=%s)',
self._params.seq_length, num_features, net.get_shape())
@@ -236,7 +373,7 @@ class Model(object):
def sequence_logit_fn(self, net, labels_one_hot):
mparams = self._mparams['sequence_logit_fn']
# TODO(gorban): remove /alias suffixes from the scopes.
with tf.variable_scope('sequence_logit_fn/SQLR'):
with tf.compat.v1.variable_scope('sequence_logit_fn/SQLR'):
layer_class = sequence_layers.get_layer_class(mparams.use_attention,
mparams.use_autoregression)
layer = layer_class(net, labels_one_hot, self._params, mparams)
@@ -252,16 +389,16 @@ class Model(object):
A tensor with the same size as any input tensors.
"""
batch_size, height, width, num_features = [
d.value for d in nets_list[0].get_shape().dims
d.value for d in nets_list[0].get_shape().dims
]
xy_flat_shape = (batch_size, 1, height * width, num_features)
nets_for_merge = []
with tf.variable_scope('max_pool_views', values=nets_list):
with tf.compat.v1.variable_scope('max_pool_views', values=nets_list):
for net in nets_list:
nets_for_merge.append(tf.reshape(net, xy_flat_shape))
merged_net = tf.concat(nets_for_merge, 1)
net = slim.max_pool2d(
merged_net, kernel_size=[len(nets_list), 1], stride=1)
merged_net, kernel_size=[len(nets_list), 1], stride=1)
net = tf.reshape(net, (batch_size, height, width, num_features))
return net
@@ -277,18 +414,20 @@ class Model(object):
Returns:
A tensor of shape [batch_size, seq_length, features_size].
"""
with tf.variable_scope('pool_views_fn/STCK'):
with tf.compat.v1.variable_scope('pool_views_fn/STCK'):
net = tf.concat(nets, 1)
batch_size = net.get_shape().dims[0].value
batch_size = tf.shape(input=net)[0]
image_size = net.get_shape().dims[1].value * \
net.get_shape().dims[2].value
feature_size = net.get_shape().dims[3].value
return tf.reshape(net, [batch_size, -1, feature_size])
return tf.reshape(net, tf.stack([batch_size, image_size, feature_size]))
def char_predictions(self, chars_logit):
"""Returns confidence scores (softmax values) for predicted characters.
Args:
chars_logit: chars logits, a tensor with shape
[batch_size x seq_length x num_char_classes]
chars_logit: chars logits, a tensor with shape [batch_size x seq_length x
num_char_classes]
Returns:
A tuple (ids, log_prob, scores), where:
@@ -301,12 +440,17 @@ class Model(object):
with shape [batch_size x seq_length].
"""
log_prob = utils.logits_to_log_prob(chars_logit)
ids = tf.to_int32(tf.argmax(log_prob, axis=2), name='predicted_chars')
ids = tf.cast(tf.argmax(input=log_prob, axis=2),
name='predicted_chars', dtype=tf.int32)
mask = tf.cast(
slim.one_hot_encoding(ids, self._params.num_char_classes), tf.bool)
slim.one_hot_encoding(ids, self._params.num_char_classes), tf.bool)
all_scores = tf.nn.softmax(chars_logit)
selected_scores = tf.boolean_mask(all_scores, mask, name='char_scores')
scores = tf.reshape(selected_scores, shape=(-1, self._params.seq_length))
selected_scores = tf.boolean_mask(
tensor=all_scores, mask=mask, name='char_scores')
scores = tf.reshape(
selected_scores,
shape=(-1, self._params.seq_length),
name='predicted_scores')
return ids, log_prob, scores
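# Illustration (a minimal sketch, assuming eager execution) of the one-hot
# mask trick above, which picks the softmax score of the argmax class at
# every position.
import tensorflow as tf
logits = tf.constant([[[2.0, 0.0], [0.0, 1.0]]])  # [1 x seq 2 x classes 2]
ids = tf.argmax(logits, axis=2)
mask = tf.cast(tf.one_hot(ids, depth=2), tf.bool)
scores = tf.boolean_mask(tf.nn.softmax(logits), mask)
print(tf.reshape(scores, (-1, 2)).numpy())  # max softmax per position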
def encode_coordinates_fn(self, net):
@@ -323,12 +467,12 @@ class Model(object):
"""
mparams = self._mparams['encode_coordinates_fn']
if mparams.enabled:
batch_size, h, w, _ = net.shape.as_list()
batch_size, h, w, _ = get_tensor_dimensions(net)
x, y = tf.meshgrid(tf.range(w), tf.range(h))
w_loc = slim.one_hot_encoding(x, num_classes=w)
h_loc = slim.one_hot_encoding(y, num_classes=h)
loc = tf.concat([h_loc, w_loc], 2)
loc = tf.tile(tf.expand_dims(loc, 0), [batch_size, 1, 1, 1])
loc = tf.tile(tf.expand_dims(loc, 0), tf.stack([batch_size, 1, 1, 1]))
return tf.concat([net, loc], 3)
else:
return net
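# Illustration (a sketch for a tiny 2x3 feature map, assuming eager
# execution): each cell gets a one-hot row index (size h) and a one-hot
# column index (size w) concatenated along the channel axis.
import tensorflow as tf
h, w = 2, 3
x, y = tf.meshgrid(tf.range(w), tf.range(h))
loc = tf.concat([tf.one_hot(y, h), tf.one_hot(x, w)], axis=2)
print(loc.shape)  # (2, 3, 5)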
@@ -341,7 +485,8 @@ class Model(object):
"""Creates a base part of the Model (no gradients, losses or summaries).
Args:
images: A tensor of shape [batch_size, height, width, channels].
images: A tensor of shape [batch_size, height, width, channels] with pixel
values in the range [0.0, 1.0].
labels_one_hot: Optional (can be None) one-hot encoding for ground truth
labels. If provided the function will create a model for training.
scope: Optional variable_scope.
@@ -353,14 +498,19 @@ class Model(object):
"""
logging.debug('images: %s', images)
is_training = labels_one_hot is not None
with tf.variable_scope(scope, reuse=reuse):
# Normalize image pixel values to have a symmetrical range around zero.
images = tf.subtract(images, 0.5)
images = tf.multiply(images, 2.5)
with tf.compat.v1.variable_scope(scope, reuse=reuse):
views = tf.split(
value=images, num_or_size_splits=self._params.num_views, axis=2)
value=images, num_or_size_splits=self._params.num_views, axis=2)
logging.debug('Views=%d single view: %s', len(views), views[0])
nets = [
self.conv_tower_fn(v, is_training, reuse=(i != 0))
for i, v in enumerate(views)
self.conv_tower_fn(v, is_training, reuse=(i != 0))
for i, v in enumerate(views)
]
logging.debug('Conv tower: %s', nets[0])
@@ -374,18 +524,34 @@ class Model(object):
logging.debug('chars_logit: %s', chars_logit)
predicted_chars, chars_log_prob, predicted_scores = (
self.char_predictions(chars_logit))
self.char_predictions(chars_logit))
if self._charset:
character_mapper = CharsetMapper(self._charset)
predicted_text = character_mapper.get_text(predicted_chars)
else:
predicted_text = tf.constant([])
text_log_prob, predicted_length = null_based_length_prediction(
chars_log_prob, self._params.null_code)
predicted_conf = lookup_indexed_value(predicted_length, text_log_prob)
# Convert predicted confidence from sum of logs to geometric mean
normalized_seq_conf = tf.exp(
tf.divide(predicted_conf,
tf.cast(predicted_length + 1, predicted_conf.dtype)),
name='normalized_seq_conf')
predicted_conf = tf.identity(predicted_conf, name='predicted_conf')
predicted_text = tf.identity(predicted_text, name='predicted_text')
predicted_length = tf.identity(predicted_length, name='predicted_length')
return OutputEndpoints(
chars_logit=chars_logit,
chars_log_prob=chars_log_prob,
predicted_chars=predicted_chars,
predicted_scores=predicted_scores,
predicted_text=predicted_text)
chars_logit=chars_logit,
chars_log_prob=chars_log_prob,
predicted_chars=predicted_chars,
predicted_scores=predicted_scores,
predicted_length=predicted_length,
predicted_text=predicted_text,
predicted_conf=predicted_conf,
normalized_seq_conf=normalized_seq_conf)
def create_loss(self, data, endpoints):
"""Creates all losses required to train the model.
@@ -404,7 +570,7 @@ class Model(object):
# multiple losses including regularization losses.
self.sequence_loss_fn(endpoints.chars_logit, data.labels)
total_loss = slim.losses.get_total_loss()
tf.summary.scalar('TotalLoss', total_loss)
tf.compat.v1.summary.scalar('TotalLoss', total_loss)
return total_loss
def label_smoothing_regularization(self, chars_labels, weight=0.1):
@@ -413,15 +579,15 @@ class Model(object):
Uses the same method as in https://arxiv.org/abs/1512.00567.
Args:
chars_labels: ground truth ids of characters,
shape=[batch_size, seq_length];
chars_labels: ground truth ids of characters, shape=[batch_size,
seq_length];
weight: label-smoothing regularization weight.
Returns:
A tensor with the same shape as the input.
"""
one_hot_labels = tf.one_hot(
chars_labels, depth=self._params.num_char_classes, axis=-1)
chars_labels, depth=self._params.num_char_classes, axis=-1)
pos_weight = 1.0 - weight
neg_weight = weight / self._params.num_char_classes
return one_hot_labels * pos_weight + neg_weight
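# Illustration (worked arithmetic, assuming weight=0.1 and 10 classes): the
# true class gets 0.9 + 0.1/10 = 0.91, every other class gets 0.01, and each
# row still sums to 0.91 + 9 * 0.01 = 1.0.
import tensorflow as tf
smoothed = tf.one_hot([2], depth=10) * 0.9 + 0.1 / 10
print(float(tf.reduce_sum(smoothed)))  # 1.0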
@@ -433,20 +599,20 @@ class Model(object):
also ignore all null chars after the first one.
Args:
chars_logits: logits for predicted characters,
shape=[batch_size, seq_length, num_char_classes];
chars_labels: ground truth ids of characters,
shape=[batch_size, seq_length];
chars_logits: logits for predicted characters, shape=[batch_size,
seq_length, num_char_classes];
chars_labels: ground truth ids of characters, shape=[batch_size,
seq_length];
mparams: method hyper parameters.
Returns:
A Tensor with shape [batch_size] - the log-perplexity for each sequence.
"""
mparams = self._mparams['sequence_loss_fn']
with tf.variable_scope('sequence_loss_fn/SLF'):
with tf.compat.v1.variable_scope('sequence_loss_fn/SLF'):
if mparams.label_smoothing > 0:
smoothed_one_hot_labels = self.label_smoothing_regularization(
chars_labels, mparams.label_smoothing)
chars_labels, mparams.label_smoothing)
labels_list = tf.unstack(smoothed_one_hot_labels, axis=1)
else:
# NOTE: in case of sparse softmax we are not using one-hot
@@ -459,21 +625,21 @@ class Model(object):
else:
# Suppose that the reject character is the last in the charset.
reject_char = tf.constant(
self._params.num_char_classes - 1,
shape=(batch_size, seq_length),
dtype=tf.int64)
self._params.num_char_classes - 1,
shape=(batch_size, seq_length),
dtype=tf.int64)
known_char = tf.not_equal(chars_labels, reject_char)
weights = tf.to_float(known_char)
weights = tf.cast(known_char, dtype=tf.float32)
logits_list = tf.unstack(chars_logits, axis=1)
weights_list = tf.unstack(weights, axis=1)
loss = tf.contrib.legacy_seq2seq.sequence_loss(
logits_list,
labels_list,
weights_list,
softmax_loss_function=get_softmax_loss_fn(mparams.label_smoothing),
average_across_timesteps=mparams.average_across_timesteps)
tf.losses.add_loss(loss)
logits_list,
labels_list,
weights_list,
softmax_loss_function=get_softmax_loss_fn(mparams.label_smoothing),
average_across_timesteps=mparams.average_across_timesteps)
tf.compat.v1.losses.add_loss(loss)
return loss
def create_summaries(self, data, endpoints, charset, is_training):
@@ -482,8 +648,8 @@ class Model(object):
Args:
data: InputEndpoints namedtuple.
endpoints: OutputEndpoints namedtuple.
charset: A dictionary with mapping between character codes and
unicode characters. Use the one provided by a dataset.charset.
charset: A dictionary with mapping between character codes and unicode
characters. Use the one provided by a dataset.charset.
is_training: If True will create summary prefixes for training job,
otherwise - for evaluation.
@@ -503,13 +669,14 @@ class Model(object):
# tf.summary.text(sname('text/pr'), pr_text)
# gt_text = charset_mapper.get_text(data.labels[:max_outputs,:])
# tf.summary.text(sname('text/gt'), gt_text)
tf.summary.image(sname('image'), data.images, max_outputs=max_outputs)
tf.compat.v1.summary.image(
sname('image'), data.images, max_outputs=max_outputs)
if is_training:
tf.summary.image(
sname('image/orig'), data.images_orig, max_outputs=max_outputs)
for var in tf.trainable_variables():
tf.summary.histogram(var.op.name, var)
tf.compat.v1.summary.image(
sname('image/orig'), data.images_orig, max_outputs=max_outputs)
for var in tf.compat.v1.trainable_variables():
tf.compat.v1.summary.histogram(var.op.name, var)
return None
else:
@@ -520,32 +687,36 @@ class Model(object):
names_to_values[name] = value_update_tuple[0]
names_to_updates[name] = value_update_tuple[1]
use_metric('CharacterAccuracy',
metrics.char_accuracy(
endpoints.predicted_chars,
data.labels,
streaming=True,
rej_char=self._params.null_code))
use_metric(
'CharacterAccuracy',
metrics.char_accuracy(
endpoints.predicted_chars,
data.labels,
streaming=True,
rej_char=self._params.null_code))
# Sequence accuracy computed by cutting sequence at the first null char
use_metric('SequenceAccuracy',
metrics.sequence_accuracy(
endpoints.predicted_chars,
data.labels,
streaming=True,
rej_char=self._params.null_code))
use_metric(
'SequenceAccuracy',
metrics.sequence_accuracy(
endpoints.predicted_chars,
data.labels,
streaming=True,
rej_char=self._params.null_code))
for name, value in names_to_values.items():
summary_name = 'eval/' + name
tf.summary.scalar(summary_name, tf.Print(value, [value], summary_name))
tf.compat.v1.summary.scalar(
summary_name, tf.compat.v1.Print(value, [value], summary_name))
return list(names_to_updates.values())
def create_init_fn_to_restore(self, master_checkpoint,
def create_init_fn_to_restore(self,
master_checkpoint,
inception_checkpoint=None):
"""Creates an init operations to restore weights from various checkpoints.
Args:
master_checkpoint: path to a checkpoint which contains all weights for
the whole model.
master_checkpoint: path to a checkpoint which contains all weights for the
whole model.
inception_checkpoint: path to a checkpoint which contains weights for the
inception part only.
@@ -556,8 +727,8 @@ class Model(object):
all_feed_dict = {}
def assign_from_checkpoint(variables, checkpoint):
logging.info('Request to restore %d weights from %s',
len(variables), checkpoint)
logging.info('Request to restore %d weights from %s', len(variables),
checkpoint)
if not variables:
logging.error('Can\'t find any variables to restore.')
sys.exit(1)
@@ -565,15 +736,18 @@ class Model(object):
all_assign_ops.append(assign_op)
all_feed_dict.update(feed_dict)
logging.info('variables_to_restore:\n%s' % utils.variables_to_restore().keys())
logging.info('moving_average_variables:\n%s' % [v.op.name for v in tf.moving_average_variables()])
logging.info('trainable_variables:\n%s' % [v.op.name for v in tf.trainable_variables()])
logging.info('variables_to_restore:\n%s',
utils.variables_to_restore().keys())
logging.info('moving_average_variables:\n%s',
[v.op.name for v in tf.compat.v1.moving_average_variables()])
logging.info('trainable_variables:\n%s',
[v.op.name for v in tf.compat.v1.trainable_variables()])
if master_checkpoint:
assign_from_checkpoint(utils.variables_to_restore(), master_checkpoint)
if inception_checkpoint:
variables = utils.variables_to_restore(
'AttentionOcr_v1/conv_tower_fn/INCE', strip_scope=True)
'AttentionOcr_v1/conv_tower_fn/INCE', strip_scope=True)
assign_from_checkpoint(variables, inception_checkpoint)
def init_assign_fn(sess):
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Converts existing checkpoint into a SavedModel.
Usage example:
python model_export.py \
--logtostderr --checkpoint=model.ckpt-399731 \
--export_dir=/tmp/attention_ocr_export
"""
import os
import tensorflow as tf
from tensorflow import app
from tensorflow.contrib import slim
from tensorflow.python.platform import flags
import common_flags
import model_export_lib
FLAGS = flags.FLAGS
common_flags.define()
flags.DEFINE_string('export_dir', None, 'Directory to export model files to.')
flags.DEFINE_integer(
'image_width', None,
'Image width used during training (or crop width if used).'
' If not set, the dataset default is used instead.')
flags.DEFINE_integer(
'image_height', None,
'Image height used during training (or crop height if used).'
' If not set, the dataset default is used instead.')
flags.DEFINE_string('work_dir', '/tmp',
'A directory to store temporary files.')
flags.DEFINE_integer('version_number', 1, 'Version number of the model')
flags.DEFINE_bool(
'export_for_serving', True,
'Whether the exported model accepts serialized tf.Example '
'protos as input')
def get_checkpoint_path():
"""Returns a path to a checkpoint based on specified commandline flags.
In order to specify a full path to a checkpoint use the --checkpoint flag.
Alternatively, if --train_log_dir was specified it will return a path to the
most recent checkpoint.
Raises:
ValueError: in case it can't find a checkpoint.
Returns:
A string.
"""
if FLAGS.checkpoint:
return FLAGS.checkpoint
else:
model_save_path = tf.train.latest_checkpoint(FLAGS.train_log_dir)
if not model_save_path:
raise ValueError('Can\'t find a checkpoint in: %s' % FLAGS.train_log_dir)
return model_save_path
def export_model(export_dir,
export_for_serving,
batch_size=None,
crop_image_width=None,
crop_image_height=None):
"""Exports a model to the named directory.
Note that --dataset_name and --checkpoint are required and parsed by the
underlying module common_flags.
Args:
export_dir: The output dir where model is exported to.
export_for_serving: If True, expects a serialized image as input and attaches
image normalization as part of the exported graph.
batch_size: For non-serving export, the input batch_size needs to be
specified.
crop_image_width: Width of the input image. Uses the dataset default if
None.
crop_image_height: Height of the input image. Uses the dataset default if
None.
Returns:
Returns the model signature_def.
"""
# Dataset object used only to get all parameters for the model.
dataset = common_flags.create_dataset(split_name='test')
model = common_flags.create_model(
dataset.num_char_classes,
dataset.max_sequence_length,
dataset.num_of_views,
dataset.null_code,
charset=dataset.charset)
dataset_image_height, dataset_image_width, image_depth = dataset.image_shape
# Add check for charmap file
if not os.path.exists(dataset.charset_file):
raise ValueError('No charset defined at {}: export will fail'.format(
dataset.charset_file))
# Default to dataset dimensions, otherwise use provided dimensions.
image_width = crop_image_width or dataset_image_width
image_height = crop_image_height or dataset_image_height
if export_for_serving:
images_orig = tf.compat.v1.placeholder(
tf.string, shape=[batch_size], name='tf_example')
images_orig_float = model_export_lib.generate_tfexample_image(
images_orig,
image_height,
image_width,
image_depth,
name='float_images')
else:
images_shape = (batch_size, image_height, image_width, image_depth)
images_orig = tf.compat.v1.placeholder(
tf.uint8, shape=images_shape, name='original_image')
images_orig_float = tf.image.convert_image_dtype(
images_orig, dtype=tf.float32, name='float_images')
endpoints = model.create_base(images_orig_float, labels_one_hot=None)
sess = tf.compat.v1.Session()
saver = tf.compat.v1.train.Saver(
slim.get_variables_to_restore(), sharded=True)
saver.restore(sess, get_checkpoint_path())
tf.compat.v1.logging.info('Model restored successfully.')
# Create model signature.
if export_for_serving:
input_tensors = {
tf.saved_model.CLASSIFY_INPUTS: images_orig
}
else:
input_tensors = {'images': images_orig}
signature_inputs = model_export_lib.build_tensor_info(input_tensors)
# NOTE: Tensors 'images_float' and 'chars_logit' are used by inference
# or to compute saliency maps.
output_tensors = {
'images_float': images_orig_float,
'predictions': endpoints.predicted_chars,
'scores': endpoints.predicted_scores,
'chars_logit': endpoints.chars_logit,
'predicted_length': endpoints.predicted_length,
'predicted_text': endpoints.predicted_text,
'predicted_conf': endpoints.predicted_conf,
'normalized_seq_conf': endpoints.normalized_seq_conf
}
for i, t in enumerate(
model_export_lib.attention_ocr_attention_masks(
dataset.max_sequence_length)):
output_tensors['attention_mask_%d' % i] = t
signature_outputs = model_export_lib.build_tensor_info(output_tensors)
signature_def = tf.compat.v1.saved_model.signature_def_utils.build_signature_def(
signature_inputs, signature_outputs,
tf.saved_model.CLASSIFY_METHOD_NAME)
# Save model.
builder = tf.compat.v1.saved_model.builder.SavedModelBuilder(export_dir)
builder.add_meta_graph_and_variables(
sess, [tf.saved_model.SERVING],
signature_def_map={
tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
signature_def
},
main_op=tf.compat.v1.tables_initializer(),
strip_default_attrs=True)
builder.save()
tf.compat.v1.logging.info('Model has been exported to %s' % export_dir)
return signature_def
def main(unused_argv):
if os.path.exists(FLAGS.export_dir):
raise ValueError('export_dir already exists: exporting will fail')
export_model(FLAGS.export_dir, FLAGS.export_for_serving, FLAGS.batch_size,
FLAGS.image_width, FLAGS.image_height)
if __name__ == '__main__':
flags.mark_flag_as_required('dataset_name')
flags.mark_flag_as_required('export_dir')
app.run(main)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for exporting Attention OCR model."""
import tensorflow as tf
# Function borrowed from research/object_detection/core/preprocessor.py
def normalize_image(image, original_minval, original_maxval, target_minval,
target_maxval):
"""Normalizes pixel values in the image.
Moves the pixel values from the current [original_minval, original_maxval]
range to the [target_minval, target_maxval] range.
Args:
image: rank 3 float32 tensor containing 1 image -> [height, width,
channels].
original_minval: current image minimum value.
original_maxval: current image maximum value.
target_minval: target image minimum value.
target_maxval: target image maximum value.
Returns:
image: image which is the same shape as input image.
"""
with tf.compat.v1.name_scope('NormalizeImage', values=[image]):
original_minval = float(original_minval)
original_maxval = float(original_maxval)
target_minval = float(target_minval)
target_maxval = float(target_maxval)
image = tf.cast(image, dtype=tf.float32)
image = tf.subtract(image, original_minval)
image = tf.multiply(image, (target_maxval - target_minval) /
(original_maxval - original_minval))
image = tf.add(image, target_minval)
return image
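# Illustration (a minimal sketch, assuming eager execution): mapping
# [0, 255] to [0, 1] halves 127.5 to 0.5.
import tensorflow as tf
img = tf.constant([[[0.0], [127.5], [255.0]]])
print(normalize_image(img, 0.0, 255.0, 0.0, 1.0).numpy().ravel())
# [0.  0.5 1. ]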
def generate_tfexample_image(input_example_strings,
image_height,
image_width,
image_channels,
name=None):
"""Parses a 1D tensor of serialized tf.Example protos and returns image batch.
Args:
input_example_strings: A 1-Dimensional tensor of size [batch_size] and type
tf.string containing a serialized Example proto per image.
image_height: First image dimension.
image_width: Second image dimension.
image_channels: Third image dimension.
name: optional tensor name.
Returns:
A tensor with shape [batch_size, height, width, channels] of type float32
with values in the range [0..1]
"""
batch_size = tf.shape(input=input_example_strings)[0]
images_shape = tf.stack(
[batch_size, image_height, image_width, image_channels])
tf_example_image_key = 'image/encoded'
feature_configs = {
tf_example_image_key:
tf.io.FixedLenFeature(
image_height * image_width * image_channels, dtype=tf.float32)
}
feature_tensors = tf.io.parse_example(
serialized=input_example_strings, features=feature_configs)
float_images = tf.reshape(
normalize_image(
feature_tensors[tf_example_image_key],
original_minval=0.0,
original_maxval=255.0,
target_minval=0.0,
target_maxval=1.0),
images_shape,
name=name)
return float_images
def attention_ocr_attention_masks(num_characters):
# TODO(gorban): use tensors directly after replacing LSTM unroll methods.
prefix = ('AttentionOcr_v1/'
'sequence_logit_fn/SQLR/LSTM/attention_decoder/Attention_0')
names = ['%s/Softmax:0' % (prefix)]
for i in range(1, num_characters):
names += ['%s_%d/Softmax:0' % (prefix, i)]
return [tf.compat.v1.get_default_graph().get_tensor_by_name(n) for n in names]
def build_tensor_info(tensor_dict):
return {
k: tf.compat.v1.saved_model.utils.build_tensor_info(t)
for k, t in tensor_dict.items()
}
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for model_export."""
import os
import numpy as np
from absl.testing import flagsaver
import tensorflow as tf
import common_flags
import model_export
_CHECKPOINT = 'model.ckpt-399731'
_CHECKPOINT_URL = (
'http://download.tensorflow.org/models/attention_ocr_2017_08_09.tar.gz')
def _clean_up():
tf.io.gfile.rmtree(tf.compat.v1.test.get_temp_dir())
def _create_tf_example_string(image):
"""Create a serialized tf.Example proto for feeding the model."""
example = tf.train.Example()
example.features.feature['image/encoded'].float_list.value.extend(
list(np.reshape(image, (-1))))
return example.SerializeToString()
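# Illustration (hypothetical input; the 150x600x3 shape assumes the FSNS
# dataset default) of building a serving request string.
import numpy as np
image = np.random.randint(0, 255, size=(150, 600, 3), dtype=np.uint8)
serialized = _create_tf_example_string(image)  # feeds the 'tf_example' input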
class AttentionOcrExportTest(tf.test.TestCase):
"""Tests for model_export.export_model."""
def setUp(self):
for suffix in ['.meta', '.index', '.data-00000-of-00001']:
filename = _CHECKPOINT + suffix
self.assertTrue(
tf.io.gfile.exists(filename),
msg='Missing checkpoint file %s. '
'Please download and extract it from %s' %
(filename, _CHECKPOINT_URL))
tf.flags.FLAGS.dataset_name = 'fsns'
tf.flags.FLAGS.checkpoint = _CHECKPOINT
tf.flags.FLAGS.dataset_dir = os.path.join(
os.path.dirname(__file__), 'datasets/testdata/fsns')
tf.test.TestCase.setUp(self)
_clean_up()
self.export_dir = os.path.join(
tf.compat.v1.test.get_temp_dir(), 'exported_model')
self.minimal_output_signature = {
'predictions': 'AttentionOcr_v1/predicted_chars:0',
'scores': 'AttentionOcr_v1/predicted_scores:0',
'predicted_length': 'AttentionOcr_v1/predicted_length:0',
'predicted_text': 'AttentionOcr_v1/predicted_text:0',
'predicted_conf': 'AttentionOcr_v1/predicted_conf:0',
'normalized_seq_conf': 'AttentionOcr_v1/normalized_seq_conf:0'
}
def create_input_feed(self, graph_def, serving):
"""Returns the input feed for the model.
Creates random images according to the size specified by dataset_name,
formats them in the correct way depending on whether the model was exported
for serving, and returns the correctly keyed feed_dict for inference.
Args:
graph_def: Graph definition of the loaded model.
serving: Whether the model was exported for Serving.
Returns:
The feed_dict suitable for model inference.
"""
# Creates a dataset based on FLAGS.dataset_name.
self.dataset = common_flags.create_dataset('test')
# Create some random images to test inference for any dataset.
self.images = {
'img1':
np.random.uniform(low=64, high=192,
size=self.dataset.image_shape).astype('uint8'),
'img2':
np.random.uniform(low=32, high=224,
size=self.dataset.image_shape).astype('uint8'),
}
signature_def = graph_def.signature_def[
tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
if serving:
input_name = signature_def.inputs[
tf.saved_model.CLASSIFY_INPUTS].name
# Model for serving takes input: inputs['inputs'] = 'tf_example:0'
feed_dict = {
input_name: [
_create_tf_example_string(self.images['img1']),
_create_tf_example_string(self.images['img2'])
]
}
else:
input_name = signature_def.inputs['images'].name
# Model for direct use takes input: inputs['images'] = 'original_image:0'
feed_dict = {
input_name: np.stack([self.images['img1'], self.images['img2']])
}
return feed_dict
def verify_export_load_and_inference(self, export_for_serving=False):
"""Verify exported model can be loaded and inference can run successfully.
This function loads the exported model in self.export_dir, then creates
some fake images according to the specification of FLAGS.dataset_name.
It then feeds the input through the model and verifies that the minimal
set of output signatures is present.
Note: Model and dataset creation in the underlying library depends on the
following commandline flags:
FLAGS.dataset_name
Args:
export_for_serving: True if the model was exported for Serving. This
affects how input is fed into the model.
"""
tf.compat.v1.reset_default_graph()
sess = tf.compat.v1.Session()
graph_def = tf.compat.v1.saved_model.loader.load(
sess=sess,
tags=[tf.saved_model.SERVING],
export_dir=self.export_dir)
feed_dict = self.create_input_feed(graph_def, export_for_serving)
results = sess.run(self.minimal_output_signature, feed_dict=feed_dict)
out_shape = (2,)
self.assertEqual(np.shape(results['predicted_conf']), out_shape)
self.assertEqual(np.shape(results['predicted_text']), out_shape)
self.assertEqual(np.shape(results['predicted_length']), out_shape)
self.assertEqual(np.shape(results['normalized_seq_conf']), out_shape)
out_shape = (2, self.dataset.max_sequence_length)
self.assertEqual(np.shape(results['scores']), out_shape)
self.assertEqual(np.shape(results['predictions']), out_shape)
@flagsaver.flagsaver
def test_fsns_export_for_serving_and_load_inference(self):
model_export.export_model(self.export_dir, True)
self.verify_export_load_and_inference(True)
@flagsaver.flagsaver
def test_fsns_export_and_load_inference(self):
model_export.export_model(self.export_dir, False, batch_size=2)
self.verify_export_load_and_inference(False)
if __name__ == '__main__':
tf.test.main()
@@ -12,11 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the model."""
import string
import numpy as np
import string
import tensorflow as tf
from tensorflow.contrib import slim
@@ -32,6 +31,7 @@ def create_fake_charset(num_char_classes):
class ModelTest(tf.test.TestCase):
def setUp(self):
tf.test.TestCase.setUp(self)
@@ -51,18 +51,21 @@ class ModelTest(tf.test.TestCase):
self.chars_logit_shape = (self.batch_size, self.seq_length,
self.num_char_classes)
self.length_logit_shape = (self.batch_size, self.seq_length + 1)
# Placeholder knows image dimensions, but not batch size.
self.input_images = tf.compat.v1.placeholder(
tf.float32,
shape=(None, self.image_height, self.image_width, 3),
name='input_node')
self.initialize_fakes()
def initialize_fakes(self):
self.images_shape = (self.batch_size, self.image_height, self.image_width,
3)
self.fake_images = tf.constant(
self.rng.randint(low=0, high=255,
size=self.images_shape).astype('float32'),
name='input_node')
self.fake_conv_tower_np = self.rng.randn(
*self.conv_tower_shape).astype('float32')
self.fake_images = self.rng.randint(
low=0, high=255, size=self.images_shape).astype('float32')
self.fake_conv_tower_np = self.rng.randn(*self.conv_tower_shape).astype(
'float32')
self.fake_conv_tower = tf.constant(self.fake_conv_tower_np)
self.fake_logits = tf.constant(
self.rng.randn(*self.chars_logit_shape).astype('float32'))
@@ -74,33 +77,44 @@ class ModelTest(tf.test.TestCase):
def create_model(self, charset=None):
return model.Model(
self.num_char_classes, self.seq_length, num_views=4, null_code=62,
self.num_char_classes,
self.seq_length,
num_views=4,
null_code=62,
charset=charset)
def test_char_related_shapes(self):
ocr_model = self.create_model()
charset = create_fake_charset(self.num_char_classes)
ocr_model = self.create_model(charset=charset)
with self.test_session() as sess:
endpoints_tf = ocr_model.create_base(
images=self.fake_images, labels_one_hot=None)
sess.run(tf.global_variables_initializer())
endpoints = sess.run(endpoints_tf)
self.assertEqual((self.batch_size, self.seq_length,
self.num_char_classes), endpoints.chars_logit.shape)
self.assertEqual((self.batch_size, self.seq_length,
self.num_char_classes), endpoints.chars_log_prob.shape)
images=self.input_images, labels_one_hot=None)
sess.run(tf.compat.v1.global_variables_initializer())
tf.compat.v1.tables_initializer().run()
endpoints = sess.run(
endpoints_tf, feed_dict={self.input_images: self.fake_images})
self.assertEqual(
(self.batch_size, self.seq_length, self.num_char_classes),
endpoints.chars_logit.shape)
self.assertEqual(
(self.batch_size, self.seq_length, self.num_char_classes),
endpoints.chars_log_prob.shape)
self.assertEqual((self.batch_size, self.seq_length),
endpoints.predicted_chars.shape)
self.assertEqual((self.batch_size, self.seq_length),
endpoints.predicted_scores.shape)
self.assertEqual((self.batch_size,), endpoints.predicted_text.shape)
self.assertEqual((self.batch_size,), endpoints.predicted_conf.shape)
self.assertEqual((self.batch_size,), endpoints.normalized_seq_conf.shape)
def test_predicted_scores_are_within_range(self):
ocr_model = self.create_model()
_, _, scores = ocr_model.char_predictions(self.fake_logits)
with self.test_session() as sess:
scores_np = sess.run(scores)
scores_np = sess.run(
scores, feed_dict={self.input_images: self.fake_images})
values_in_range = (scores_np >= 0.0) & (scores_np <= 1.0)
self.assertTrue(
@@ -111,10 +125,11 @@ class ModelTest(tf.test.TestCase):
def test_conv_tower_shape(self):
with self.test_session() as sess:
ocr_model = self.create_model()
conv_tower = ocr_model.conv_tower_fn(self.fake_images)
conv_tower = ocr_model.conv_tower_fn(self.input_images)
sess.run(tf.global_variables_initializer())
conv_tower_np = sess.run(conv_tower)
sess.run(tf.compat.v1.global_variables_initializer())
conv_tower_np = sess.run(
conv_tower, feed_dict={self.input_images: self.fake_images})
self.assertEqual(self.conv_tower_shape, conv_tower_np.shape)
@@ -124,11 +139,12 @@ class ModelTest(tf.test.TestCase):
# updates, gradients and variances. It also depends on the type of used
# optimizer.
ocr_model = self.create_model()
ocr_model.create_base(images=self.fake_images, labels_one_hot=None)
ocr_model.create_base(images=self.input_images, labels_one_hot=None)
with self.test_session() as sess:
tfprof_root = tf.profiler.profile(
tfprof_root = tf.compat.v1.profiler.profile(
sess.graph,
options=tf.profiler.ProfileOptionBuilder.trainable_variables_parameter())
options=tf.compat.v1.profiler.ProfileOptionBuilder
.trainable_variables_parameter())
model_size_bytes = 4 * tfprof_root.total_parameters
self.assertLess(model_size_bytes, 1 * 2**30)
@@ -147,9 +163,9 @@ class ModelTest(tf.test.TestCase):
summaries = ocr_model.create_summaries(
data, endpoints, charset, is_training=False)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
tf.tables_initializer().run()
sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.compat.v1.local_variables_initializer())
tf.compat.v1.tables_initializer().run()
sess.run(summaries) # just check it is runnable
def test_sequence_loss_function_without_label_smoothing(self):
@@ -158,7 +174,7 @@ class ModelTest(tf.test.TestCase):
loss = model.sequence_loss_fn(self.fake_logits, self.fake_labels)
with self.test_session() as sess:
loss_np = sess.run(loss)
loss_np = sess.run(loss, feed_dict={self.input_images: self.fake_images})
# This test checks that the loss function is 'runnable'.
self.assertEqual(loss_np.shape, tuple())
@@ -172,19 +188,21 @@ class ModelTest(tf.test.TestCase):
Returns:
a list of tensors with encoded image coordinates in them.
"""
batch_size, h, w, _ = net.shape.as_list()
batch_size = tf.shape(input=net)[0]
_, h, w, _ = net.shape.as_list()
h_loc = [
tf.tile(
tf.reshape(
tf.contrib.layers.one_hot_encoding(
tf.constant([i]), num_classes=h), [h, 1]), [1, w])
for i in range(h)
tf.tile(
tf.reshape(
tf.contrib.layers.one_hot_encoding(
tf.constant([i]), num_classes=h), [h, 1]), [1, w])
for i in range(h)
]
h_loc = tf.concat([tf.expand_dims(t, 2) for t in h_loc], 2)
w_loc = [
tf.tile(
tf.contrib.layers.one_hot_encoding(tf.constant([i]), num_classes=w),
[h, 1]) for i in range(w)
tf.tile(
tf.contrib.layers.one_hot_encoding(
tf.constant([i]), num_classes=w),
[h, 1]) for i in range(w)
]
w_loc = tf.concat([tf.expand_dims(t, 2) for t in w_loc], 2)
loc = tf.concat([h_loc, w_loc], 2)
@@ -197,11 +215,12 @@ class ModelTest(tf.test.TestCase):
conv_w_coords_tf = model.encode_coordinates_fn(self.fake_conv_tower)
with self.test_session() as sess:
conv_w_coords = sess.run(conv_w_coords_tf)
conv_w_coords = sess.run(
conv_w_coords_tf, feed_dict={self.input_images: self.fake_images})
batch_size, height, width, feature_size = self.conv_tower_shape
self.assertEqual(conv_w_coords.shape, (batch_size, height, width,
feature_size + height + width))
self.assertEqual(conv_w_coords.shape,
(batch_size, height, width, feature_size + height + width))
def test_disabled_coordinate_encoding_returns_features_unchanged(self):
model = self.create_model()
@@ -209,7 +228,8 @@ class ModelTest(tf.test.TestCase):
conv_w_coords_tf = model.encode_coordinates_fn(self.fake_conv_tower)
with self.test_session() as sess:
conv_w_coords = sess.run(conv_w_coords_tf)
conv_w_coords = sess.run(
conv_w_coords_tf, feed_dict={self.input_images: self.fake_images})
self.assertAllEqual(conv_w_coords, self.fake_conv_tower_np)
@@ -221,7 +241,8 @@ class ModelTest(tf.test.TestCase):
conv_w_coords_tf = model.encode_coordinates_fn(fake_conv_tower)
with self.test_session() as sess:
conv_w_coords = sess.run(conv_w_coords_tf)
conv_w_coords = sess.run(
conv_w_coords_tf, feed_dict={self.input_images: self.fake_images})
# Original features
self.assertAllEqual(conv_w_coords[0, :, :, :4],
@@ -252,8 +273,8 @@ class ModelTest(tf.test.TestCase):
endpoints_tf = ocr_model.create_base(
images=self.fake_images, labels_one_hot=None)
sess.run(tf.global_variables_initializer())
tf.tables_initializer().run()
sess.run(tf.compat.v1.global_variables_initializer())
tf.compat.v1.tables_initializer().run()
endpoints = sess.run(endpoints_tf)
self.assertEqual(endpoints.predicted_text.shape, (self.batch_size,))
@@ -261,14 +282,15 @@ class ModelTest(tf.test.TestCase):
class CharsetMapperTest(tf.test.TestCase):
def test_text_corresponds_to_ids(self):
charset = create_fake_charset(36)
ids = tf.constant(
[[17, 14, 21, 21, 24], [32, 24, 27, 21, 13]], dtype=tf.int64)
ids = tf.constant([[17, 14, 21, 21, 24], [32, 24, 27, 21, 13]],
dtype=tf.int64)
charset_mapper = model.CharsetMapper(charset)
with self.test_session() as sess:
tf.tables_initializer().run()
tf.compat.v1.tables_initializer().run()
text = sess.run(charset_mapper.get_text(ids))
self.assertAllEqual(text, [b'hello', b'world'])
@@ -111,12 +111,12 @@ class SequenceLayerBase(object):
self._mparams = method_params
self._net = net
self._labels_one_hot = labels_one_hot
self._batch_size = net.get_shape().dims[0].value
self._batch_size = tf.shape(input=net)[0]
# Initialize parameters for char logits which will be computed on the fly
# inside an LSTM decoder.
self._char_logits = {}
regularizer = slim.l2_regularizer(self._mparams.weight_decay)
regularizer = tf.keras.regularizers.l2(0.5 * (self._mparams.weight_decay))
self._softmax_w = slim.model_variable(
'softmax_w',
[self._mparams.num_lstm_units, self._params.num_char_classes],
@@ -124,7 +124,7 @@ class SequenceLayerBase(object):
regularizer=regularizer)
self._softmax_b = slim.model_variable(
'softmax_b', [self._params.num_char_classes],
initializer=tf.zeros_initializer(),
initializer=tf.compat.v1.zeros_initializer(),
regularizer=regularizer)
@abc.abstractmethod
@@ -203,8 +203,8 @@ class SequenceLayerBase(object):
A tensor with shape [batch_size, num_char_classes]
"""
if char_index not in self._char_logits:
self._char_logits[char_index] = tf.nn.xw_plus_b(inputs, self._softmax_w,
self._softmax_b)
self._char_logits[char_index] = tf.compat.v1.nn.xw_plus_b(inputs, self._softmax_w,
self._softmax_b)
return self._char_logits[char_index]
def char_one_hot(self, logit):
@@ -216,7 +216,7 @@ class SequenceLayerBase(object):
Returns:
A tensor with shape [batch_size, num_char_classes]
"""
prediction = tf.argmax(logit, axis=1)
prediction = tf.argmax(input=logit, axis=1)
return slim.one_hot_encoding(prediction, self._params.num_char_classes)
def get_input(self, prev, i):
@@ -244,10 +244,10 @@ class SequenceLayerBase(object):
Returns:
A tensor with shape [batch_size, seq_length, num_char_classes].
"""
with tf.variable_scope('LSTM'):
with tf.compat.v1.variable_scope('LSTM'):
first_label = self.get_input(prev=None, i=0)
decoder_inputs = [first_label] + [None] * (self._params.seq_length - 1)
lstm_cell = tf.contrib.rnn.LSTMCell(
lstm_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(
self._mparams.num_lstm_units,
use_peepholes=False,
cell_clip=self._mparams.lstm_state_clip_value,
......@@ -259,9 +259,9 @@ class SequenceLayerBase(object):
loop_function=self.get_input,
cell=lstm_cell)
with tf.variable_scope('logits'):
with tf.compat.v1.variable_scope('logits'):
logits_list = [
tf.expand_dims(self.char_logit(logit, i), dim=1)
tf.expand_dims(self.char_logit(logit, i), axis=1)
for i, logit in enumerate(lstm_outputs)
]
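For orientation: each decoder step emits logits of shape `[batch_size, num_char_classes]`, and the `tf.expand_dims(..., axis=1)` above prepares them to be joined along a new sequence axis (the concatenation itself lies outside this hunk). A minimal sketch of the pattern, with illustrative sizes:

```python
import tensorflow as tf

batch_size, num_classes, seq_length = 2, 5, 3
lstm_outputs = [tf.random.normal([batch_size, num_classes])
                for _ in range(seq_length)]

# Expand each [batch, num_classes] step to [batch, 1, num_classes], then
# concatenate along axis 1 to obtain [batch, seq_length, num_classes].
logits_list = [tf.expand_dims(logit, axis=1) for logit in lstm_outputs]
chars_logit = tf.concat(logits_list, axis=1)
print(chars_logit.shape)  # (2, 3, 5)
```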
......@@ -275,7 +275,7 @@ class NetSlice(SequenceLayerBase):
def __init__(self, *args, **kwargs):
super(NetSlice, self).__init__(*args, **kwargs)
self._zero_label = tf.zeros(
[self._batch_size, self._params.num_char_classes])
tf.stack([self._batch_size, self._params.num_char_classes]))
def get_image_feature(self, char_index):
"""Returns a subset of image features for a character.
......@@ -352,7 +352,7 @@ class Attention(SequenceLayerBase):
def __init__(self, *args, **kwargs):
super(Attention, self).__init__(*args, **kwargs)
self._zero_label = tf.zeros(
[self._batch_size, self._params.num_char_classes])
tf.stack([self._batch_size, self._params.num_char_classes]))
def get_eval_input(self, prev, i):
"""See SequenceLayerBase.get_eval_input for details."""
......
......@@ -29,13 +29,13 @@ import sequence_layers
def fake_net(batch_size, num_features, feature_size):
return tf.convert_to_tensor(
np.random.uniform(size=(batch_size, num_features, feature_size)),
value=np.random.uniform(size=(batch_size, num_features, feature_size)),
dtype=tf.float32)
def fake_labels(batch_size, seq_length, num_char_classes):
labels_np = tf.convert_to_tensor(
np.random.randint(
value=np.random.randint(
low=0, high=num_char_classes, size=(batch_size, seq_length)))
return slim.one_hot_encoding(labels_np, num_classes=num_char_classes)
......
......@@ -96,16 +96,16 @@ def get_training_hparams():
def create_optimizer(hparams):
"""Creates optimized based on the specified flags."""
if hparams.optimizer == 'momentum':
optimizer = tf.train.MomentumOptimizer(
optimizer = tf.compat.v1.train.MomentumOptimizer(
hparams.learning_rate, momentum=hparams.momentum)
elif hparams.optimizer == 'adam':
optimizer = tf.train.AdamOptimizer(hparams.learning_rate)
optimizer = tf.compat.v1.train.AdamOptimizer(hparams.learning_rate)
elif hparams.optimizer == 'adadelta':
optimizer = tf.train.AdadeltaOptimizer(hparams.learning_rate)
optimizer = tf.compat.v1.train.AdadeltaOptimizer(hparams.learning_rate)
elif hparams.optimizer == 'adagrad':
optimizer = tf.train.AdagradOptimizer(hparams.learning_rate)
optimizer = tf.compat.v1.train.AdagradOptimizer(hparams.learning_rate)
elif hparams.optimizer == 'rmsprop':
optimizer = tf.train.RMSPropOptimizer(
optimizer = tf.compat.v1.train.RMSPropOptimizer(
hparams.learning_rate, momentum=hparams.momentum)
return optimizer
......@@ -154,14 +154,14 @@ def train(loss, init_fn, hparams):
def prepare_training_dir():
if not tf.gfile.Exists(FLAGS.train_log_dir):
if not tf.io.gfile.exists(FLAGS.train_log_dir):
logging.info('Create a new training directory %s', FLAGS.train_log_dir)
tf.gfile.MakeDirs(FLAGS.train_log_dir)
tf.io.gfile.makedirs(FLAGS.train_log_dir)
else:
if FLAGS.reset_train_dir:
logging.info('Reset the training directory %s', FLAGS.train_log_dir)
tf.gfile.DeleteRecursively(FLAGS.train_log_dir)
tf.gfile.MakeDirs(FLAGS.train_log_dir)
tf.io.gfile.rmtree(FLAGS.train_log_dir)
tf.io.gfile.makedirs(FLAGS.train_log_dir)
else:
logging.info('Use already existing training directory %s',
FLAGS.train_log_dir)
......@@ -169,7 +169,7 @@ def prepare_training_dir():
def calculate_graph_metrics():
param_stats = model_analyzer.print_model_analysis(
tf.get_default_graph(),
tf.compat.v1.get_default_graph(),
tfprof_options=model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
return param_stats.total_parameters
......@@ -186,7 +186,7 @@ def main(_):
# If ps_tasks is zero, the local device is used. When using multiple
# (non-local) replicas, the ReplicaDeviceSetter distributes the variables
# across the different devices.
device_setter = tf.train.replica_device_setter(
device_setter = tf.compat.v1.train.replica_device_setter(
FLAGS.ps_tasks, merge_devices=True)
with tf.device(device_setter):
data = data_provider.get_data(
......
......@@ -37,16 +37,16 @@ def logits_to_log_prob(logits):
probabilities.
"""
with tf.variable_scope('log_probabilities'):
with tf.compat.v1.variable_scope('log_probabilities'):
reduction_indices = len(logits.shape.as_list()) - 1
max_logits = tf.reduce_max(
logits, reduction_indices=reduction_indices, keep_dims=True)
input_tensor=logits, axis=reduction_indices, keepdims=True)
safe_logits = tf.subtract(logits, max_logits)
sum_exp = tf.reduce_sum(
tf.exp(safe_logits),
reduction_indices=reduction_indices,
keep_dims=True)
log_probs = tf.subtract(safe_logits, tf.log(sum_exp))
input_tensor=tf.exp(safe_logits),
axis=reduction_indices,
keepdims=True)
log_probs = tf.subtract(safe_logits, tf.math.log(sum_exp))
return log_probs
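The function above is the standard numerically stable log-softmax: subtracting the row-wise maximum before exponentiating prevents `tf.exp` from overflowing, and the shift cancels out in the final log-probabilities. A NumPy sketch of the same computation, for illustration only:

```python
import numpy as np

def logits_to_log_prob_np(logits):
  # Shift by the per-row max so exp() never sees large positive values.
  max_logits = logits.max(axis=-1, keepdims=True)
  safe_logits = logits - max_logits
  # log softmax: safe_logits - log(sum(exp(safe_logits))).
  sum_exp = np.exp(safe_logits).sum(axis=-1, keepdims=True)
  return safe_logits - np.log(sum_exp)

logits = np.array([[1000.0, 1001.0, 1002.0]])  # naive exp() would overflow
print(logits_to_log_prob_np(logits))  # ~[[-2.408, -1.408, -0.408]]
```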
......@@ -78,3 +78,20 @@ def variables_to_restore(scope=None, strip_scope=False):
return variable_map
else:
return {v.op.name: v for v in slim.get_variables_to_restore()}
def ConvertAllInputsToTensors(func):
"""A decorator to convert all function's inputs into tensors.
Args:
func: a function to decorate.
Returns:
A decorated function.
"""
def FuncWrapper(*args):
tensors = [tf.convert_to_tensor(value=a) for a in args]
return func(*tensors)
return FuncWrapper
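A brief usage sketch of the decorator defined above; `row_sums` is a hypothetical function, and note that the wrapper converts positional arguments only, since it forwards `*args` alone:

```python
import tensorflow as tf

@ConvertAllInputsToTensors
def row_sums(matrix):
  return tf.reduce_sum(input_tensor=matrix, axis=1)

# The plain Python list is converted to a float32 tensor before the call.
print(row_sums([[1.0, 2.0], [3.0, 4.0]]))  # tf.Tensor([3. 7.], ...)
```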
......@@ -113,17 +113,23 @@ def _get_files(data, dataset_split):
Args:
data: String, desired data ('image' or 'label').
dataset_split: String, dataset split ('train', 'val', 'test')
dataset_split: String, dataset split ('train_fine', 'val_fine', 'test_fine')
Returns:
A list of sorted file names or None when getting label for
test set.
"""
if data == 'label' and dataset_split == 'test':
return None
if dataset_split == 'train_fine':
split_dir = 'train'
elif dataset_split == 'val_fine':
split_dir = 'val'
elif dataset_split == 'test_fine':
split_dir = 'test'
else:
raise RuntimeError("Split {} is not supported".format(dataset_split))
pattern = '*%s.%s' % (_POSTFIX_MAP[data], _DATA_FORMAT_MAP[data])
search_files = os.path.join(
FLAGS.cityscapes_root, _FOLDERS_MAP[data], dataset_split, '*', pattern)
FLAGS.cityscapes_root, _FOLDERS_MAP[data], split_dir, '*', pattern)
filenames = glob.glob(search_files)
return sorted(filenames)
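The `if`/`elif` chain above maps the new `*_fine` split names to the on-disk Cityscapes directory names. An equivalent table-driven sketch, assuming the same three splits (names here mirror the code above):

```python
_SPLIT_DIR_MAP = {
    'train_fine': 'train',
    'val_fine': 'val',
    'test_fine': 'test',
}

def _split_dir(dataset_split):
  # Unknown splits raise, matching the if/elif version above.
  try:
    return _SPLIT_DIR_MAP[dataset_split]
  except KeyError:
    raise RuntimeError('Split {} is not supported'.format(dataset_split))
```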
......@@ -132,7 +138,7 @@ def _convert_dataset(dataset_split):
"""Converts the specified dataset split to TFRecord format.
Args:
dataset_split: The dataset split (e.g., train, val).
dataset_split: The dataset split (e.g., train_fine, val_fine).
Raises:
RuntimeError: If loaded image and label have different shape, or if the
......@@ -142,8 +148,12 @@ def _convert_dataset(dataset_split):
label_files = _get_files('label', dataset_split)
num_images = len(image_files)
num_labels = len(label_files)
num_per_shard = int(math.ceil(num_images / _NUM_SHARDS))
if num_images != num_labels:
raise RuntimeError("The number of images and labels doesn't match: {} {}".format(num_images, num_labels))
image_reader = build_data.ImageReader('png', channels=3)
label_reader = build_data.ImageReader('png', channels=1)
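`num_per_shard` drives the usual TFRecord sharding loop later in this script (not shown in this hunk); a sketch of how per-shard index ranges are typically derived, with illustrative counts:

```python
import math

num_images = 2975   # e.g., the Cityscapes train_fine split
_NUM_SHARDS = 10    # illustrative shard count
num_per_shard = int(math.ceil(num_images / _NUM_SHARDS))

for shard_id in range(_NUM_SHARDS):
  start_idx = shard_id * num_per_shard
  end_idx = min((shard_id + 1) * num_per_shard, num_images)
  # images[start_idx:end_idx] would go into the shard_id-th TFRecord file.
  print(shard_id, start_idx, end_idx)
```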
......@@ -179,8 +189,8 @@ def _convert_dataset(dataset_split):
def main(unused_argv):
# Only support converting 'train' and 'val' sets for now.
for dataset_split in ['train', 'val']:
# Only support converting 'train_fine', 'val_fine' and 'test_fine' sets for now.
for dataset_split in ['train_fine', 'val_fine', 'test_fine']:
_convert_dataset(dataset_split)
......
......@@ -42,6 +42,8 @@ WORK_DIR="."
# Root path for Cityscapes dataset.
CITYSCAPES_ROOT="${WORK_DIR}/cityscapes"
export PYTHONPATH="${CITYSCAPES_ROOT}:${PYTHONPATH}"
# Create training labels.
python "${CITYSCAPES_ROOT}/cityscapesscripts/preparation/createTrainIdLabelImgs.py"
......
......@@ -81,8 +81,8 @@ DatasetDescriptor = collections.namedtuple(
_CITYSCAPES_INFORMATION = DatasetDescriptor(
splits_to_sizes={
'train': 2975,
'val': 500,
'train_fine': 2975,
'val_fine': 500,
},
num_classes=19,
ignore_label=255,
......
......@@ -43,7 +43,7 @@ A local training job using `xception_65` can be run with the following command:
python deeplab/train.py \
--logtostderr \
--training_number_of_steps=90000 \
--train_split="train" \
--train_split="train_fine" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
......@@ -95,7 +95,7 @@ command:
# From tensorflow/models/research/
python deeplab/eval.py \
--logtostderr \
--eval_split="val" \
--eval_split="val_fine" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
......@@ -121,7 +121,7 @@ command:
# From tensorflow/models/research/
python deeplab/vis.py \
--logtostderr \
--vis_split="val" \
--vis_split="val_fine" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
......
......@@ -68,6 +68,6 @@ Quick running the whole code on the PASCAL VOC 2012 dataset:
```bash
# From tensorflow/models/research/deeplab
sh local_test.sh
bash local_test.sh
```
......@@ -19,7 +19,7 @@
#
# Usage:
# # From the tensorflow/models/research/deeplab directory.
# sh ./local_test.sh
# bash ./local_test.sh
#
#
......@@ -42,7 +42,7 @@ python "${WORK_DIR}"/model_test.py
# Go to datasets folder and download PASCAL VOC 2012 segmentation dataset.
DATASET_DIR="datasets"
cd "${WORK_DIR}/${DATASET_DIR}"
sh download_and_convert_voc2012.sh
bash download_and_convert_voc2012.sh
# Go back to original directory.
cd "${CURRENT_DIR}"
......
## DELF installation
### Installation script
We now have a script to do the entire installation in one shot. Navigate to the
directory `models/research/delf/delf/python/training`, then run:
```bash
# From models/research/delf/delf/python/training
bash install_delf.sh
```
If this works, you are done! If not, see below for detailed instructions for
installing this codebase and its dependencies.
*Please note that this installation script only works on 64-bit Linux
architectures due to the `protoc` binary that is automatically downloaded. If
you wish to install the DELF library on other architectures please update the
[`install_delf.sh`](delf/python/training/install_delf.sh) script by referencing
the desired `protoc`
[binary release](https://github.com/protocolbuffers/protobuf/releases).*
In more detail: the `install_delf.sh` script installs both the DELF library and
its dependencies in the following sequence:
* Install TensorFlow 2.2 and the TensorFlow 2.2 GPU package.
* Install the [TF-Slim](https://github.com/google-research/tf-slim) library
from source.
* Download [protoc](https://github.com/protocolbuffers/protobuf) and compile
the DELF Protocol Buffers.
* Install the matplotlib, numpy, scikit-image, scipy and python3-tk Python
libraries.
* Install the
[TensorFlow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection)
from the cloned TensorFlow Model Garden repository.
* Install the DELF package.
### Tensorflow
[![TensorFlow 2.1](https://img.shields.io/badge/tensorflow-2.1-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.1.0)
[![TensorFlow 2.2](https://img.shields.io/badge/tensorflow-2.2-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
For detailed steps to install Tensorflow, follow the
......@@ -11,9 +46,9 @@ typical user can install Tensorflow using one of the following commands:
```bash
# For CPU:
pip3 install 'tensorflow'
pip3 install 'tensorflow>=2.2.0'
# For GPU:
pip3 install 'tensorflow-gpu'
pip3 install 'tensorflow-gpu>=2.2.0'
```
### TF-Slim
......
# Deep Local and Global Image Features
[![TensorFlow 2.1](https://img.shields.io/badge/tensorflow-2.1-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.1.0)
[![TensorFlow 2.2](https://img.shields.io/badge/tensorflow-2.2-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
This project presents code for deep local and global image feature methods,
......@@ -41,7 +41,7 @@ DELG:
```
"Unifying Deep Local and Global Features for Image Search",
B. Cao*, A. Araujo* and J. Sim,
arxiv:2001.05027
Proc. ECCV'20
```
GLDv2:
......@@ -55,11 +55,11 @@ Proc. CVPR'20
## News
- [Jul'20] Check out our ECCV'20 paper:
["Unifying Deep Local and Global Features for Image Search"](https://arxiv.org/abs/2001.05027)
- [Apr'20] Check out our CVPR'20 paper: ["Google Landmarks Dataset v2 - A
Large-Scale Benchmark for Instance-Level Recognition and
Retrieval"](https://arxiv.org/abs/2004.01804)
- [Jan'20] Check out our new paper:
["Unifying Deep Local and Global Features for Image Search"](https://arxiv.org/abs/2001.05027)
- [Jun'19] DELF achieved 2nd place in
[CVPR Visual Localization challenge (Local Features track)](https://sites.google.com/corp/view/ltvl2019).
See our slides
......@@ -182,104 +182,55 @@ directories therein, `protos` and `python`.
### `delf/protos`
This directory contains protobufs:
- `aggregation_config.proto`: protobuf for configuring local feature
aggregation.
- `box.proto`: protobuf for serializing detected boxes.
- `datum.proto`: general-purpose protobuf for serializing float tensors.
- `delf_config.proto`: protobuf for configuring DELF/DELG extraction.
- `feature.proto`: protobuf for serializing DELF features.
This directory contains protobufs for local feature aggregation
(`aggregation_config.proto`), serializing detected boxes (`box.proto`),
serializing float tensors (`datum.proto`), configuring DELF/DELG extraction
(`delf_config.proto`), and serializing local features (`feature.proto`).
### `delf/python`
This directory contains files for several different purposes:
- `box_io.py`, `datum_io.py`, `feature_io.py` are helper files for reading and
writing tensors and features.
- `delf_v1.py` contains code to create DELF models.
- `feature_aggregation_extractor.py` contains a module to perform local
feature aggregation.
- `feature_aggregation_similarity.py` contains a module to perform similarity
computation for aggregated local features.
- `feature_extractor.py` contains the code to extract features using DELF.
This is particularly useful for extracting features over multiple scales,
with keypoint selection based on attention scores, and PCA/whitening
post-processing.
The subdirectory `delf/python/examples` contains sample scripts to run DELF
feature extraction/matching, and object detection:
- `delf_config_example.pbtxt` shows an example instantiation of the DelfConfig
proto, used for DELF feature extraction.
- `detector.py` is a module to construct an object detector function.
- `extract_boxes.py` enables object detection from a list of images.
- `extract_features.py` enables DELF extraction from a list of images.
- `extractor.py` is a module to construct a DELF/DELG local feature extraction
function.
- `match_images.py` supports image matching using DELF features extracted
using `extract_features.py`.
This directory contains files for several purposes:
reading/writing tensors/features (`box_io.py`, `datum_io.py`, `feature_io.py`),
local feature aggregation extraction and similarity computation
(`feature_aggregation_extractor.py`, `feature_aggregation_similarity.py`), and
helper functions for image/feature loading/processing (`utils.py`,
`feature_extractor.py`).
The subdirectory `delf/python/delg` contains sample scripts/configs related to
the DELG paper:
The subdirectory `delf/python/examples` contains sample scripts to run DELF/DELG
feature extraction/matching (`extractor.py`, `extract_features.py`,
`match_images.py`) and object detection (`detector.py`, `extract_boxes.py`).
`delf_config_example.pbtxt` shows an example instantiation of the DelfConfig
proto, used for DELF feature extraction.
- `delg_gld_config.pbtxt` gives the DelfConfig used in DELG paper.
- `extract_features.py` for local+global feature extraction on Revisited
datasets.
- `perform_retrieval.py` for performing retrieval/evaluating methods on
Revisited datasets.
The subdirectory `delf/python/delg` contains sample scripts/configs related to
the DELG paper: `extract_features.py` for local+global feature extraction (with
an example `delg_gld_config.pbtxt`) and `perform_retrieval.py` for performing
retrieval/scoring.
The subdirectory `delf/python/detect_to_retrieve` contains sample
scripts/configs related to the Detect-to-Retrieve paper:
- `aggregation_extraction.py` is a library to extract/save feature
aggregation.
- `boxes_and_features_extraction.py` is a library to extract/save boxes and
DELF features.
- `cluster_delf_features.py` for local feature clustering.
- `dataset.py` for parsing/evaluating results on Revisited Oxford/Paris
datasets.
- `delf_gld_config.pbtxt` gives the DelfConfig used in Detect-to-Retrieve
paper.
- `extract_aggregation.py` for aggregated local feature extraction.
- `extract_index_boxes_and_features.py` for index image local feature
extraction / bounding box detection on Revisited datasets.
- `extract_query_features.py` for query image local feature extraction on
Revisited datasets.
- `image_reranking.py` is a module to re-rank images with geometric
verification.
- `perform_retrieval.py` for performing retrieval/evaluating methods using
aggregated local features on Revisited datasets.
- `index_aggregation_config.pbtxt`, `query_aggregation_config.pbtxt` give
AggregationConfig's for Detect-to-Retrieve experiments.
scripts/configs related to the Detect-to-Retrieve paper, for feature/box
extraction/aggregation/clustering (`aggregation_extraction.py`,
`boxes_and_features_extraction.py`, `cluster_delf_features.py`,
`extract_aggregation.py`, `extract_index_boxes_and_features.py`,
`extract_query_features.py`), image retrieval/reranking (`perform_retrieval.py`,
`image_reranking.py`), along with configs used for feature
extraction/aggregation (`delf_gld_config.pbtxt`,
`index_aggregation_config.pbtxt`, `query_aggregation_config.pbtxt`) and
Revisited Oxford/Paris dataset parsing/evaluation (`dataset.py`).
The subdirectory `delf/python/google_landmarks_dataset` contains sample
scripts/modules for computing GLD metrics / reproducing results from the GLDv2
paper:
- `compute_recognition_metrics.py` performs recognition metric computation
given input predictions and solution files.
- `compute_retrieval_metrics.py` performs retrieval metric computation given
input predictions and solution files.
- `dataset_file_io.py` is a module for dataset-related file IO.
- `metrics.py` is a module for GLD metric computation.
- `rn101_af_gldv2clean_config.pbtxt` gives the DelfConfig used in the
ResNet101-ArcFace (trained on GLDv2-train-clean) baseline used in the GLDv2
paper.
scripts/modules for computing GLD metrics (`metrics.py`,
`compute_recognition_metrics.py`, `compute_retrieval_metrics.py`), GLD file IO
(`dataset_file_io.py`), and reproducing results from the GLDv2 paper
(`rn101_af_gldv2clean_config.pbtxt` and the instructions therein).
The subdirectory `delf/python/training` contains sample scripts/modules for
performing DELF training:
- `datasets/googlelandmarks.py` is the dataset module used for training.
- `model/delf_model.py` is the model module used for training.
- `model/export_model.py` is a script for exporting trained models in the
format used by the inference code.
- `model/export_model_utils.py` is a module with utilities for model
exporting.
- `model/resnet50.py` is a module with a backbone RN50 implementation.
- `build_image_dataset.py` converts downloaded dataset into TFRecords format
for training.
- `train.py` is the main training script.
performing model training (`train.py`) based on a ResNet50 DELF model
(`model/resnet50.py`, `model/delf_model.py`), along with the relevant model
exporting scripts and associated utils (`model/export_model.py`,
`model/export_global_model.py`, `model/export_model_utils.py`) and dataset
downloading/preprocessing (`download_dataset.sh`, `build_image_dataset.py`,
`datasets/googlelandmarks.py`).
Besides these, other files in the different subdirectories contain tests for the
various modules.
......@@ -290,6 +241,16 @@ Andr&eacute; Araujo (@andrefaraujo)
## Release history
### Jul, 2020
- Full TF2 support. Only one minor `compat.v1` usage left. Updated
instructions to require TF2.2
- Refactored / much improved training code, with very detailed, step-by-step
instructions
**Thanks to contributors**: Dan Anghel, Barbara Fusinska and Andr&eacute;
Araujo.
### May, 2020
- Codebase is now Python3-first
......
......@@ -24,34 +24,9 @@ cd models/research/delf/delf/python/training
## Install the DELF Library
The DELF Python library can be installed by running the
[`install_delf.sh`](./install_delf.sh) script using the command:
```
bash install_delf.sh
```
The script installs both the DELF library and its dependencies in the following
sequence:
* Install TensorFlow 2.2 and TensorFlow 2.2 for GPU.
* Install the [TF-Slim](https://github.com/google-research/tf-slim) library
from source.
* Download [protoc](https://github.com/protocolbuffers/protobuf) and compile
the DELF Protocol Buffers.
* Install the matplotlib, numpy, scikit-image, scipy and python3-tk Python
libraries.
* Install the
[TensorFlow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection)
from the cloned TensorFlow Model Garden repository.
* Install the DELF package.
*Please note that the current installation only works on 64 bits Linux
architectures due to the `protoc` binary downloaded by the installation script.
If you wish to install the DELF library on other architectures please update the
[`install_delf.sh`](./install_delf.sh) script by referencing the desired
`protoc`
[binary release](https://github.com/protocolbuffers/protobuf/releases).*
To be able to use this code, please follow
[these instructions](../../../INSTALL_INSTRUCTIONS.md) to properly install the
DELF library.
## Download the GLDv2 Training Data
......