"src/targets/vscode:/vscode.git/clone" did not exist on "16bb5c563932e31fda57489e16031c28a70114db"
Commit 3ce2f61b authored by Kaushik Shivakumar

Merge branch 'master' of https://github.com/tensorflow/models into context_tf2

parents bb16d5ca 8e9296ff
......@@ -166,6 +166,14 @@ implement one in Python or C++.
The recommended way is to use the [Serving infrastructure][serving].
To export to SavedModel format:
```
python model_export.py \
--checkpoint=model.ckpt-399731 \
--export_dir=/tmp/attention_ocr_export
```
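As a quick sanity check, the exported model can be loaded back with the TF1 `tf.saved_model.loader` API. The sketch below is not part of the export script; it assumes a non-serving export (raw `uint8` image input) and an FSNS-like image shape, so adjust the shape to your dataset.
```
# Minimal sketch (TensorFlow 1.x): load the exported SavedModel and run
# inference on a batch of uint8 images. The image shape is an assumption;
# use your dataset's image_shape instead.
import numpy as np
import tensorflow as tf

export_dir = '/tmp/attention_ocr_export'
sess = tf.Session()
meta_graph = tf.saved_model.loader.load(
    sess, [tf.saved_model.tag_constants.SERVING], export_dir)
signature = meta_graph.signature_def[
    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
images = np.zeros((1, 150, 600, 3), dtype=np.uint8)  # assumed FSNS-like shape
predicted_text = sess.run(
    signature.outputs['predicted_text'].name,
    feed_dict={signature.inputs['images'].name: images})
print(predicted_text)
```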
Alternatively you can:
1. define a placeholder for images (or directly use a numpy array)
2. [create a graph](https://github.com/tensorflow/models/blob/master/research/attention_ocr/python/eval.py#L60)
......@@ -188,7 +196,7 @@ other than a one time experiment please use the [TensorFlow Serving][serving].
[1]: https://github.com/tensorflow/tensorflow/blob/aaf7adc/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
[2]: https://www.tensorflow.org/api_docs/python/tf/contrib/framework/assign_from_checkpoint_fn
[serving]: https://tensorflow.github.io/serving/serving_basic
[serving]: https://www.tensorflow.org/tfx/serving/serving_basic
## Disclaimer
......
......@@ -14,10 +14,10 @@
# ==============================================================================
"""Define flags are common for both train.py and eval.py scripts."""
import logging
import sys
from tensorflow.python.platform import flags
import logging
import datasets
import model
......@@ -35,9 +35,17 @@ logging.basicConfig(
datefmt='%Y-%m-%d %H:%M:%S')
_common_flags_defined = False
def define():
"""Define common flags."""
# yapf: disable
# common_flags.define() may be called multiple times in unit tests.
global _common_flags_defined
if _common_flags_defined:
return
_common_flags_defined = True
flags.DEFINE_integer('batch_size', 32,
'Batch size.')
......
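A hedged note on the guard added above: `flags.DEFINE_*` raises an error if the same flag is defined twice, so repeated calls to `define()` (for example, from several unit-test modules) must become no-ops. A minimal sketch, assuming the attention_ocr `python/` directory is importable:
```python
# Sketch: the module-level _common_flags_defined guard makes define()
# idempotent, so a second call returns early instead of redefining flags.
import common_flags

common_flags.define()
common_flags.define()  # no-op: the guard short-circuits the second call
```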
......@@ -144,9 +144,6 @@ def preprocess_image(image, augment=False, central_crop_size=None,
images = [augment_image(img) for img in images]
image = tf.concat(images, 1)
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.5)
return image
......
......@@ -177,6 +177,8 @@ def get_split(split_name, dataset_dir=None, config=None):
items_to_descriptions=config['items_to_descriptions'],
# additional parameters for convenience.
charset=charset,
charset_file=charset_file,
image_shape=config['image_shape'],
num_char_classes=len(charset),
num_of_views=config['num_of_views'],
max_sequence_length=config['max_sequence_length'],
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to build the Attention OCR model.
Usage example:
......@@ -26,6 +25,7 @@ Usage example:
import sys
import collections
import logging
import numpy as np
import tensorflow as tf
from tensorflow.contrib import slim
from tensorflow.contrib.slim.nets import inception
......@@ -36,13 +36,13 @@ import utils
OutputEndpoints = collections.namedtuple('OutputEndpoints', [
'chars_logit', 'chars_log_prob', 'predicted_chars', 'predicted_scores',
'predicted_text'
'predicted_text', 'predicted_length', 'predicted_conf',
'normalized_seq_conf'
])
# TODO(gorban): replace with tf.HParams when it is released.
ModelParams = collections.namedtuple('ModelParams', [
'num_char_classes', 'seq_length', 'num_views', 'null_code'
])
ModelParams = collections.namedtuple(
'ModelParams', ['num_char_classes', 'seq_length', 'num_views', 'null_code'])
ConvTowerParams = collections.namedtuple('ConvTowerParams', ['final_endpoint'])
......@@ -51,13 +51,12 @@ SequenceLogitsParams = collections.namedtuple('SequenceLogitsParams', [
'lstm_state_clip_value'
])
SequenceLossParams = collections.namedtuple('SequenceLossParams', [
'label_smoothing', 'ignore_nulls', 'average_across_timesteps'
])
SequenceLossParams = collections.namedtuple(
'SequenceLossParams',
['label_smoothing', 'ignore_nulls', 'average_across_timesteps'])
EncodeCoordinatesParams = collections.namedtuple('EncodeCoordinatesParams', [
'enabled'
])
EncodeCoordinatesParams = collections.namedtuple('EncodeCoordinatesParams',
['enabled'])
def _dict_to_array(id_to_char, default_character):
......@@ -121,6 +120,142 @@ def get_softmax_loss_fn(label_smoothing):
return loss_fn
def get_tensor_dimensions(tensor):
"""Returns the shape components of a 4D tensor with variable batch size.
Args:
tensor : A 4D tensor, whose last 3 dimensions are known at graph
construction time.
Returns:
batch_size : The first dimension as a tensor object.
height : The second dimension as a scalar value.
width : The third dimension as a scalar value.
num_features : The fourth dimension as a scalar value.
Raises:
ValueError: if input tensor does not have 4 dimensions.
"""
if len(tensor.get_shape().dims) != 4:
raise ValueError(
'Incompatible shape: len(tensor.get_shape().dims) != 4 (%d != 4)' %
len(tensor.get_shape().dims))
batch_size = tf.shape(tensor)[0]
height = tensor.get_shape().dims[1].value
width = tensor.get_shape().dims[2].value
num_features = tensor.get_shape().dims[3].value
return batch_size, height, width, num_features
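A brief usage sketch of `get_tensor_dimensions` (the placeholder shape is an assumption for illustration): the spatial and depth dimensions come back as Python ints, while the batch dimension is a runtime tensor.
```python
# Hedged usage sketch (TF1): static H/W/C dimensions, dynamic batch dimension.
import tensorflow as tf

net = tf.placeholder(tf.float32, shape=(None, 16, 75, 288))  # assumed shape
batch_size, height, width, num_features = get_tensor_dimensions(net)
# height == 16, width == 75, num_features == 288 are Python ints;
# batch_size is a scalar int32 Tensor evaluated at run time.
```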
def lookup_indexed_value(indices, row_vecs):
"""Lookup values in each row of 'row_vecs' indexed by 'indices'.
For each sample in the batch, look up the element for the corresponding
index.
Args:
indices : A tensor of shape (batch, )
row_vecs : A tensor of shape [batch, depth]
Returns:
A tensor of shape (batch, ) formed by row_vecs[i, indices[i]].
"""
gather_indices = tf.stack((tf.range(
tf.shape(row_vecs)[0], dtype=tf.int32), tf.cast(indices, tf.int32)),
axis=1)
return tf.gather_nd(row_vecs, gather_indices)
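A hedged NumPy sketch of the same lookup semantics, with made-up values: element `i` of the result is `row_vecs[i, indices[i]]`.
```python
# NumPy equivalent of lookup_indexed_value for a tiny example batch.
import numpy as np

row_vecs = np.array([[0.1, 0.2, 0.3],
                     [0.4, 0.5, 0.6]])
indices = np.array([2, 0])
result = row_vecs[np.arange(len(indices)), indices]  # -> [0.3, 0.4]
```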
@utils.ConvertAllInputsToTensors
def max_char_logprob_cumsum(char_log_prob):
"""Computes the cumulative sum of character logprob for all sequence lengths.
Args:
char_log_prob: A tensor of shape [batch x seq_length x num_char_classes]
with log probabilities of a character.
Returns:
A tensor of shape [batch x (seq_length+1)] where each element x[_, j] is
the sum of the max char logprob for all positions up to j.
Note that this duplicates the final column and produces (seq_length+1) columns
so the same function can be used regardless of whether use_length_predictions
is true or false.
"""
max_char_log_prob = tf.reduce_max(char_log_prob, reduction_indices=2)
# For an input array [a, b, c], tf.cumsum returns [a, a + b, a + b + c] if
# exclusive is set to False (default).
return tf.cumsum(max_char_log_prob, axis=1, exclusive=False)
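A hedged NumPy sketch of the shape note in the docstring: after zero-padding one extra column (log probability 0.0), the inclusive cumulative sum simply repeats its last value, which is where the (seq_length + 1) columns come from.
```python
# Toy example: the final padded column duplicates the previous cumsum value.
import numpy as np

max_char_log_prob = np.array([[-0.1, -0.2, -0.3, 0.0]])  # last column is the pad
np.cumsum(max_char_log_prob, axis=1)  # -> [[-0.1, -0.3, -0.6, -0.6]]
```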
def find_length_by_null(predicted_chars, null_code):
"""Determine sequence length by finding null_code among predicted char IDs.
Given the char class ID for each position, compute the sequence length.
Note that this function computes this based on the number of null_code,
instead of the position of the first null_code.
Args:
predicted_chars: A tensor of [batch x seq_length] where each element stores
the char class ID with max probability;
null_code: an int32, character id for the NULL.
Returns:
A [batch, ] tensor which stores the sequence length for each sample.
"""
return tf.reduce_sum(
tf.cast(tf.not_equal(null_code, predicted_chars), tf.int32), axis=1)
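A hedged NumPy sketch of the caveat in the docstring: the length is the count of non-NULL predictions, not the position of the first NULL, so a NULL in the middle of a sequence still reduces the reported length.
```python
# Toy example with an assumed NULL id of 62 (as used elsewhere in this model).
import numpy as np

null_code = 62
predicted_chars = np.array([[5, 9, 62, 62],
                            [1, 62, 3, 62]])
(predicted_chars != null_code).sum(axis=1)  # -> [2, 2]
```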
def axis_pad(tensor, axis, before=0, after=0, constant_values=0.0):
"""Pad a tensor with the specified values along a single axis.
Args:
tensor: a Tensor;
axis: the dimension to pad along;
before: number of values to add before the contents of tensor in the
selected dimension;
after: number of values to add after the contents of tensor in the selected
dimension;
constant_values: the scalar pad value to use. Must be same type as tensor.
Returns:
A Tensor. Has the same type as the input tensor, but with a changed shape
along the specified dimension.
"""
if before == 0 and after == 0:
return tensor
ndims = tensor.shape.ndims
padding_size = np.zeros((ndims, 2), dtype='int32')
padding_size[axis] = before, after
return tf.pad(
tensor=tensor,
paddings=tf.constant(padding_size),
constant_values=constant_values)
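A brief usage sketch of `axis_pad` (the values are assumptions for illustration): append one slice of a chosen constant along a single axis.
```python
# Hedged usage sketch (TF1): append one slice of -1.0 along axis 1.
import tensorflow as tf

t = tf.ones([2, 3, 4])
padded = axis_pad(t, axis=1, after=1, constant_values=-1.0)
# padded has shape (2, 4, 4); the appended slice is filled with -1.0.
```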
def null_based_length_prediction(chars_log_prob, null_code):
"""Computes length and confidence of prediction based on positions of NULLs.
Args:
chars_log_prob: A tensor of shape [batch x seq_length x num_char_classes]
with log probabilities of a character;
null_code: an int32, character id for the NULL.
Returns:
A tuple (text_log_prob, predicted_length), where
text_log_prob is a tensor of shape [batch x (seq_length + 1)].
Element #0 of the output corresponds to probability of the empty string,
element #seq_length - is the probability of length=seq_length.
predicted_length is a tensor with shape [batch].
"""
predicted_chars = tf.to_int32(tf.argmax(chars_log_prob, axis=2))
# We do right pad to support sequences with seq_length elements.
text_log_prob = max_char_logprob_cumsum(
axis_pad(chars_log_prob, axis=1, after=1))
predicted_length = find_length_by_null(predicted_chars, null_code)
return text_log_prob, predicted_length
class Model(object):
"""Class to create the Attention OCR Model."""
......@@ -137,13 +272,13 @@ class Model(object):
num_char_classes: size of character set.
seq_length: number of characters in a sequence.
num_views: Number of views (conv towers) to use.
null_code: A character code corresponding to a character which
indicates end of a sequence.
mparams: a dictionary with hyper parameters for methods, keys -
function names, values - corresponding namedtuples.
null_code: A character code corresponding to a character which indicates
end of a sequence.
mparams: a dictionary with hyper parameters for methods, keys - function
names, values - corresponding namedtuples.
charset: an optional dictionary with a mapping between character ids and
utf8 strings. If specified the OutputEndpoints.predicted_text will
utf8 encoded strings corresponding to the character ids returned by
utf8 strings. If specified, the OutputEndpoints.predicted_text will contain
utf8-encoded strings corresponding to the character ids returned by
OutputEndpoints.predicted_chars (by default the predicted_text contains
an empty vector).
NOTE: Make sure you call tf.tables_initializer().run() if the charset
......@@ -176,7 +311,8 @@ class Model(object):
label_smoothing=0.1,
ignore_nulls=True,
average_across_timesteps=False),
'encode_coordinates_fn': EncodeCoordinatesParams(enabled=False)
'encode_coordinates_fn':
EncodeCoordinatesParams(enabled=False)
}
def set_mparam(self, function, **kwargs):
......@@ -222,10 +358,10 @@ class Model(object):
"""
num_features = net.get_shape().dims[1].value
if num_features < self._params.seq_length:
raise AssertionError('Incorrect dimension #1 of input tensor'
raise AssertionError(
'Incorrect dimension #1 of input tensor'
' %d should be bigger than %d (shape=%s)' %
(num_features, self._params.seq_length,
net.get_shape()))
(num_features, self._params.seq_length, net.get_shape()))
elif num_features > self._params.seq_length:
logging.warning('Ignoring some features: use %d of %d (shape=%s)',
self._params.seq_length, num_features, net.get_shape())
......@@ -279,16 +415,17 @@ class Model(object):
"""
with tf.variable_scope('pool_views_fn/STCK'):
net = tf.concat(nets, 1)
batch_size = net.get_shape().dims[0].value
batch_size = tf.shape(net)[0]
image_size = net.get_shape().dims[1].value * net.get_shape().dims[2].value
feature_size = net.get_shape().dims[3].value
return tf.reshape(net, [batch_size, -1, feature_size])
return tf.reshape(net, tf.stack([batch_size, image_size, feature_size]))
def char_predictions(self, chars_logit):
"""Returns confidence scores (softmax values) for predicted characters.
Args:
chars_logit: chars logits, a tensor with shape
[batch_size x seq_length x num_char_classes]
chars_logit: chars logits, a tensor with shape [batch_size x seq_length x
num_char_classes]
Returns:
A tuple (ids, log_prob, scores), where:
......@@ -306,7 +443,10 @@ class Model(object):
slim.one_hot_encoding(ids, self._params.num_char_classes), tf.bool)
all_scores = tf.nn.softmax(chars_logit)
selected_scores = tf.boolean_mask(all_scores, mask, name='char_scores')
scores = tf.reshape(selected_scores, shape=(-1, self._params.seq_length))
scores = tf.reshape(
selected_scores,
shape=(-1, self._params.seq_length),
name='predicted_scores')
return ids, log_prob, scores
def encode_coordinates_fn(self, net):
......@@ -323,12 +463,12 @@ class Model(object):
"""
mparams = self._mparams['encode_coordinates_fn']
if mparams.enabled:
batch_size, h, w, _ = net.shape.as_list()
batch_size, h, w, _ = get_tensor_dimensions(net)
x, y = tf.meshgrid(tf.range(w), tf.range(h))
w_loc = slim.one_hot_encoding(x, num_classes=w)
h_loc = slim.one_hot_encoding(y, num_classes=h)
loc = tf.concat([h_loc, w_loc], 2)
loc = tf.tile(tf.expand_dims(loc, 0), [batch_size, 1, 1, 1])
loc = tf.tile(tf.expand_dims(loc, 0), tf.stack([batch_size, 1, 1, 1]))
return tf.concat([net, loc], 3)
else:
return net
......@@ -341,7 +481,8 @@ class Model(object):
"""Creates a base part of the Model (no gradients, losses or summaries).
Args:
images: A tensor of shape [batch_size, height, width, channels].
images: A tensor of shape [batch_size, height, width, channels] with pixel
values in the range [0.0, 1.0].
labels_one_hot: Optional (can be None) one-hot encoding for ground truth
labels. If provided the function will create a model for training.
scope: Optional variable_scope.
......@@ -353,6 +494,11 @@ class Model(object):
"""
logging.debug('images: %s', images)
is_training = labels_one_hot is not None
# Normalize image pixel values to have a symmetrical range around zero.
images = tf.subtract(images, 0.5)
images = tf.multiply(images, 2.5)
with tf.variable_scope(scope, reuse=reuse):
views = tf.split(
value=images, num_or_size_splits=self._params.num_views, axis=2)
......@@ -380,12 +526,28 @@ class Model(object):
predicted_text = character_mapper.get_text(predicted_chars)
else:
predicted_text = tf.constant([])
text_log_prob, predicted_length = null_based_length_prediction(
chars_log_prob, self._params.null_code)
predicted_conf = lookup_indexed_value(predicted_length, text_log_prob)
# Convert predicted confidence from sum of logs to geometric mean
normalized_seq_conf = tf.exp(
tf.divide(predicted_conf,
tf.cast(predicted_length + 1, predicted_conf.dtype)),
name='normalized_seq_conf')
predicted_conf = tf.identity(predicted_conf, name='predicted_conf')
predicted_text = tf.identity(predicted_text, name='predicted_text')
predicted_length = tf.identity(predicted_length, name='predicted_length')
return OutputEndpoints(
chars_logit=chars_logit,
chars_log_prob=chars_log_prob,
predicted_chars=predicted_chars,
predicted_scores=predicted_scores,
predicted_text=predicted_text)
predicted_length=predicted_length,
predicted_text=predicted_text,
predicted_conf=predicted_conf,
normalized_seq_conf=normalized_seq_conf)
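A hedged numeric sketch of the confidence normalization above: `predicted_conf` accumulates the max log-probability over the predicted characters plus the following position, and dividing by `predicted_length + 1` before exponentiating turns that sum into a per-position geometric mean. The probabilities below are made up.
```python
# Toy example: two predicted characters (0.9, 0.8) and a following position (0.7).
import numpy as np

per_position_prob = np.array([0.9, 0.8, 0.7])
predicted_length = 2
predicted_conf = np.log(per_position_prob).sum()
normalized_seq_conf = np.exp(predicted_conf / (predicted_length + 1))
# normalized_seq_conf ~= 0.796, the geometric mean of [0.9, 0.8, 0.7].
```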
def create_loss(self, data, endpoints):
"""Creates all losses required to train the model.
......@@ -413,8 +575,8 @@ class Model(object):
Uses the same method as in https://arxiv.org/abs/1512.00567.
Args:
chars_labels: ground truth ids of characters,
shape=[batch_size, seq_length];
chars_labels: ground truth ids of characters, shape=[batch_size,
seq_length];
weight: label-smoothing regularization weight.
Returns:
......@@ -433,10 +595,10 @@ class Model(object):
also ignore all null chars after the first one.
Args:
chars_logits: logits for predicted characters,
shape=[batch_size, seq_length, num_char_classes];
chars_labels: ground truth ids of characters,
shape=[batch_size, seq_length];
chars_logits: logits for predicted characters, shape=[batch_size,
seq_length, num_char_classes];
chars_labels: ground truth ids of characters, shape=[batch_size,
seq_length];
mparams: method hyper parameters.
Returns:
......@@ -482,8 +644,8 @@ class Model(object):
Args:
data: InputEndpoints namedtuple.
endpoints: OutputEndpoints namedtuple.
charset: A dictionary with mapping between character codes and
unicode characters. Use the one provided by a dataset.charset.
charset: A dictionary with mapping between character codes and unicode
characters. Use the one provided by a dataset.charset.
is_training: If True will create summary prefixes for training job,
otherwise - for evaluation.
......@@ -520,14 +682,16 @@ class Model(object):
names_to_values[name] = value_update_tuple[0]
names_to_updates[name] = value_update_tuple[1]
use_metric('CharacterAccuracy',
use_metric(
'CharacterAccuracy',
metrics.char_accuracy(
endpoints.predicted_chars,
data.labels,
streaming=True,
rej_char=self._params.null_code))
# Sequence accuracy computed by cutting sequence at the first null char
use_metric('SequenceAccuracy',
use_metric(
'SequenceAccuracy',
metrics.sequence_accuracy(
endpoints.predicted_chars,
data.labels,
......@@ -539,13 +703,14 @@ class Model(object):
tf.summary.scalar(summary_name, tf.Print(value, [value], summary_name))
return list(names_to_updates.values())
def create_init_fn_to_restore(self, master_checkpoint,
def create_init_fn_to_restore(self,
master_checkpoint,
inception_checkpoint=None):
"""Creates an init operations to restore weights from various checkpoints.
Args:
master_checkpoint: path to a checkpoint which contains all weights for
the whole model.
master_checkpoint: path to a checkpoint which contains all weights for the
whole model.
inception_checkpoint: path to a checkpoint which contains weights for the
inception part only.
......@@ -556,8 +721,8 @@ class Model(object):
all_feed_dict = {}
def assign_from_checkpoint(variables, checkpoint):
logging.info('Request to restore %d weights from %s',
len(variables), checkpoint)
logging.info('Request to restore %d weights from %s', len(variables),
checkpoint)
if not variables:
logging.error('Can\'t find any variables to restore.')
sys.exit(1)
......@@ -565,9 +730,12 @@ class Model(object):
all_assign_ops.append(assign_op)
all_feed_dict.update(feed_dict)
logging.info('variables_to_restore:\n%s' % utils.variables_to_restore().keys())
logging.info('moving_average_variables:\n%s' % [v.op.name for v in tf.moving_average_variables()])
logging.info('trainable_variables:\n%s' % [v.op.name for v in tf.trainable_variables()])
logging.info('variables_to_restore:\n%s',
utils.variables_to_restore().keys())
logging.info('moving_average_variables:\n%s',
[v.op.name for v in tf.moving_average_variables()])
logging.info('trainable_variables:\n%s',
[v.op.name for v in tf.trainable_variables()])
if master_checkpoint:
assign_from_checkpoint(utils.variables_to_restore(), master_checkpoint)
......
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Converts existing checkpoint into a SavedModel.
Usage example:
python model_export.py \
--logtostderr --checkpoint=model.ckpt-399731 \
--export_dir=/tmp/attention_ocr_export
"""
import os
import tensorflow as tf
from tensorflow import app
from tensorflow.contrib import slim
from tensorflow.python.platform import flags
import common_flags
import model_export_lib
FLAGS = flags.FLAGS
common_flags.define()
flags.DEFINE_string('export_dir', None, 'Directory to export model files to.')
flags.DEFINE_integer(
'image_width', None,
'Image width used during training (or crop width if used)'
' If not set, the dataset default is used instead.')
flags.DEFINE_integer(
'image_height', None,
'Image height used during training (or crop height if used)'
' If not set, the dataset default is used instead.')
flags.DEFINE_string('work_dir', '/tmp', 'A directory to store temporary files.')
flags.DEFINE_integer('version_number', 1, 'Version number of the model')
flags.DEFINE_bool(
'export_for_serving', True,
'Whether the exported model accepts serialized tf.Example '
'protos as input')
def get_checkpoint_path():
"""Returns a path to a checkpoint based on specified commandline flags.
In order to specify a full path to a checkpoint, use the --checkpoint flag.
Alternatively, if --train_log_dir was specified it will return a path to the
most recent checkpoint.
Raises:
ValueError: in case it can't find a checkpoint.
Returns:
A string.
"""
if FLAGS.checkpoint:
return FLAGS.checkpoint
else:
model_save_path = tf.train.latest_checkpoint(FLAGS.train_log_dir)
if not model_save_path:
raise ValueError('Can\'t find a checkpoint in: %s' % FLAGS.train_log_dir)
return model_save_path
def export_model(export_dir,
export_for_serving,
batch_size=None,
crop_image_width=None,
crop_image_height=None):
"""Exports a model to the named directory.
Note that --dataset_name and --checkpoint are required and parsed by the
underlying module common_flags.
Args:
export_dir: The output dir where model is exported to.
export_for_serving: If True, expects a serialized image as input and attaches
image normalization as part of the exported graph.
batch_size: For non-serving export, the input batch_size needs to be
specified.
crop_image_width: Width of the input image. Uses the dataset default if
None.
crop_image_height: Height of the input image. Uses the dataset default if
None.
Returns:
The model signature_def.
"""
# Dataset object used only to get all parameters for the model.
dataset = common_flags.create_dataset(split_name='test')
model = common_flags.create_model(
dataset.num_char_classes,
dataset.max_sequence_length,
dataset.num_of_views,
dataset.null_code,
charset=dataset.charset)
dataset_image_height, dataset_image_width, image_depth = dataset.image_shape
# Check that the charset file exists; otherwise the export would fail.
if not os.path.exists(dataset.charset_file):
raise ValueError('No charset defined at {}: export will fail'.format(
dataset.charset_file))
# Default to dataset dimensions, otherwise use provided dimensions.
image_width = crop_image_width or dataset_image_width
image_height = crop_image_height or dataset_image_height
if export_for_serving:
images_orig = tf.placeholder(
tf.string, shape=[batch_size], name='tf_example')
images_orig_float = model_export_lib.generate_tfexample_image(
images_orig,
image_height,
image_width,
image_depth,
name='float_images')
else:
images_shape = (batch_size, image_height, image_width, image_depth)
images_orig = tf.placeholder(
tf.uint8, shape=images_shape, name='original_image')
images_orig_float = tf.image.convert_image_dtype(
images_orig, dtype=tf.float32, name='float_images')
endpoints = model.create_base(images_orig_float, labels_one_hot=None)
sess = tf.Session()
saver = tf.train.Saver(slim.get_variables_to_restore(), sharded=True)
saver.restore(sess, get_checkpoint_path())
tf.logging.info('Model restored successfully.')
# Create model signature.
if export_for_serving:
input_tensors = {
tf.saved_model.signature_constants.CLASSIFY_INPUTS: images_orig
}
else:
input_tensors = {'images': images_orig}
signature_inputs = model_export_lib.build_tensor_info(input_tensors)
# NOTE: Tensors 'images_float' and 'chars_logit' are used by the inference
# code or to compute saliency maps.
output_tensors = {
'images_float': images_orig_float,
'predictions': endpoints.predicted_chars,
'scores': endpoints.predicted_scores,
'chars_logit': endpoints.chars_logit,
'predicted_length': endpoints.predicted_length,
'predicted_text': endpoints.predicted_text,
'predicted_conf': endpoints.predicted_conf,
'normalized_seq_conf': endpoints.normalized_seq_conf
}
for i, t in enumerate(
model_export_lib.attention_ocr_attention_masks(
dataset.max_sequence_length)):
output_tensors['attention_mask_%d' % i] = t
signature_outputs = model_export_lib.build_tensor_info(output_tensors)
signature_def = tf.saved_model.signature_def_utils.build_signature_def(
signature_inputs, signature_outputs,
tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME)
# Save model.
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
builder.add_meta_graph_and_variables(
sess, [tf.saved_model.tag_constants.SERVING],
signature_def_map={
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
signature_def
},
main_op=tf.tables_initializer(),
strip_default_attrs=True)
builder.save()
tf.logging.info('Model has been exported to %s' % export_dir)
return signature_def
def main(unused_argv):
if os.path.exists(FLAGS.export_dir):
raise ValueError('export_dir already exists: exporting will fail')
export_model(FLAGS.export_dir, FLAGS.export_for_serving, FLAGS.batch_size,
FLAGS.image_width, FLAGS.image_height)
if __name__ == '__main__':
flags.mark_flag_as_required('dataset_name')
flags.mark_flag_as_required('export_dir')
app.run(main)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions for exporting Attention OCR model."""
import tensorflow as tf
# Function borrowed from research/object_detection/core/preprocessor.py
def normalize_image(image, original_minval, original_maxval, target_minval,
target_maxval):
"""Normalizes pixel values in the image.
Moves the pixel values from the current [original_minval, original_maxval]
range to the [target_minval, target_maxval] range.
Args:
image: rank 3 float32 tensor containing 1 image -> [height, width,
channels].
original_minval: current image minimum value.
original_maxval: current image maximum value.
target_minval: target image minimum value.
target_maxval: target image maximum value.
Returns:
image: image which is the same shape as input image.
"""
with tf.name_scope('NormalizeImage', values=[image]):
original_minval = float(original_minval)
original_maxval = float(original_maxval)
target_minval = float(target_minval)
target_maxval = float(target_maxval)
image = tf.cast(image, dtype=tf.float32)
image = tf.subtract(image, original_minval)
image = tf.multiply(image, (target_maxval - target_minval) /
(original_maxval - original_minval))
image = tf.add(image, target_minval)
return image
def generate_tfexample_image(input_example_strings,
image_height,
image_width,
image_channels,
name=None):
"""Parses a 1D tensor of serialized tf.Example protos and returns image batch.
Args:
input_example_strings: A 1-Dimensional tensor of size [batch_size] and type
tf.string containing a serialized Example proto per image.
image_height: First image dimension.
image_width: Second image dimension.
image_channels: Third image dimension.
name: optional tensor name.
Returns:
A tensor with shape [batch_size, height, width, channels] of type float32
with values in the range [0..1]
"""
batch_size = tf.shape(input_example_strings)[0]
images_shape = tf.stack(
[batch_size, image_height, image_width, image_channels])
tf_example_image_key = 'image/encoded'
feature_configs = {
tf_example_image_key:
tf.FixedLenFeature(
image_height * image_width * image_channels, dtype=tf.float32)
}
feature_tensors = tf.parse_example(input_example_strings, feature_configs)
float_images = tf.reshape(
normalize_image(
feature_tensors[tf_example_image_key],
original_minval=0.0,
original_maxval=255.0,
target_minval=0.0,
target_maxval=1.0),
images_shape,
name=name)
return float_images
def attention_ocr_attention_masks(num_characters):
# TODO(gorban): use tensors directly after replacing LSTM unroll methods.
prefix = ('AttentionOcr_v1/'
'sequence_logit_fn/SQLR/LSTM/attention_decoder/Attention_0')
names = ['%s/Softmax:0' % (prefix)]
for i in range(1, num_characters):
names += ['%s_%d/Softmax:0' % (prefix, i)]
return [tf.get_default_graph().get_tensor_by_name(n) for n in names]
def build_tensor_info(tensor_dict):
return {
k: tf.saved_model.utils.build_tensor_info(t)
for k, t in tensor_dict.items()
}
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for model_export."""
import os
import numpy as np
from absl.testing import flagsaver
import tensorflow as tf
import common_flags
import model_export
_CHECKPOINT = 'model.ckpt-399731'
_CHECKPOINT_URL = (
'http://download.tensorflow.org/models/attention_ocr_2017_08_09.tar.gz')
def _clean_up():
tf.gfile.DeleteRecursively(tf.test.get_temp_dir())
def _create_tf_example_string(image):
"""Create a serialized tf.Example proto for feeding the model."""
example = tf.train.Example()
example.features.feature['image/encoded'].float_list.value.extend(
list(np.reshape(image, (-1))))
return example.SerializeToString()
class AttentionOcrExportTest(tf.test.TestCase):
"""Tests for model_export.export_model."""
def setUp(self):
for suffix in ['.meta', '.index', '.data-00000-of-00001']:
filename = _CHECKPOINT + suffix
self.assertTrue(
tf.gfile.Exists(filename),
msg='Missing checkpoint file %s. '
'Please download and extract it from %s' %
(filename, _CHECKPOINT_URL))
tf.flags.FLAGS.dataset_name = 'fsns'
tf.flags.FLAGS.checkpoint = _CHECKPOINT
tf.flags.FLAGS.dataset_dir = os.path.join(
os.path.dirname(__file__), 'datasets/testdata/fsns')
tf.test.TestCase.setUp(self)
_clean_up()
self.export_dir = os.path.join(tf.test.get_temp_dir(), 'exported_model')
self.minimal_output_signature = {
'predictions': 'AttentionOcr_v1/predicted_chars:0',
'scores': 'AttentionOcr_v1/predicted_scores:0',
'predicted_length': 'AttentionOcr_v1/predicted_length:0',
'predicted_text': 'AttentionOcr_v1/predicted_text:0',
'predicted_conf': 'AttentionOcr_v1/predicted_conf:0',
'normalized_seq_conf': 'AttentionOcr_v1/normalized_seq_conf:0'
}
def create_input_feed(self, graph_def, serving):
"""Returns the input feed for the model.
Creates random images according to the size specified by dataset_name,
formats them correctly depending on whether the model was exported
for serving, and returns the correctly keyed feed_dict for inference.
Args:
graph_def: Graph definition of the loaded model.
serving: Whether the model was exported for Serving.
Returns:
The feed_dict suitable for model inference.
"""
# Creates a dataset based on FLAGS.dataset_name.
self.dataset = common_flags.create_dataset('test')
# Create some random images to test inference for any dataset.
self.images = {
'img1':
np.random.uniform(low=64, high=192,
size=self.dataset.image_shape).astype('uint8'),
'img2':
np.random.uniform(low=32, high=224,
size=self.dataset.image_shape).astype('uint8'),
}
signature_def = graph_def.signature_def[
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
if serving:
input_name = signature_def.inputs[
tf.saved_model.signature_constants.CLASSIFY_INPUTS].name
# Model for serving takes input: inputs['inputs'] = 'tf_example:0'
feed_dict = {
input_name: [
_create_tf_example_string(self.images['img1']),
_create_tf_example_string(self.images['img2'])
]
}
else:
input_name = signature_def.inputs['images'].name
# Model for direct use takes input: inputs['images'] = 'original_image:0'
feed_dict = {
input_name: np.stack([self.images['img1'], self.images['img2']])
}
return feed_dict
def verify_export_load_and_inference(self, export_for_serving=False):
"""Verify exported model can be loaded and inference can run successfully.
This function will load the exported model in self.export_dir, then create
some fake images according to the specification of FLAGS.dataset_name.
It then feeds the input through the model and verifies that the minimal set
of output signatures is present.
Note: Model and dataset creation in the underlying library depends on the
following commandline flags:
FLAGS.dataset_name
Args:
export_for_serving: True if the model was exported for Serving. This
affects how input is fed into the model.
"""
tf.reset_default_graph()
sess = tf.Session()
graph_def = tf.saved_model.loader.load(
sess=sess,
tags=[tf.saved_model.tag_constants.SERVING],
export_dir=self.export_dir)
feed_dict = self.create_input_feed(graph_def, export_for_serving)
results = sess.run(self.minimal_output_signature, feed_dict=feed_dict)
out_shape = (2,)
self.assertEqual(np.shape(results['predicted_conf']), out_shape)
self.assertEqual(np.shape(results['predicted_text']), out_shape)
self.assertEqual(np.shape(results['predicted_length']), out_shape)
self.assertEqual(np.shape(results['normalized_seq_conf']), out_shape)
out_shape = (2, self.dataset.max_sequence_length)
self.assertEqual(np.shape(results['scores']), out_shape)
self.assertEqual(np.shape(results['predictions']), out_shape)
@flagsaver.flagsaver
def test_fsns_export_for_serving_and_load_inference(self):
model_export.export_model(self.export_dir, True)
self.verify_export_load_and_inference(True)
@flagsaver.flagsaver
def test_fsns_export_and_load_inference(self):
model_export.export_model(self.export_dir, False, batch_size=2)
self.verify_export_load_and_inference(False)
if __name__ == '__main__':
tf.test.main()
......@@ -12,11 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the model."""
import string
import numpy as np
import string
import tensorflow as tf
from tensorflow.contrib import slim
......@@ -32,6 +31,7 @@ def create_fake_charset(num_char_classes):
class ModelTest(tf.test.TestCase):
def setUp(self):
tf.test.TestCase.setUp(self)
......@@ -51,18 +51,21 @@ class ModelTest(tf.test.TestCase):
self.chars_logit_shape = (self.batch_size, self.seq_length,
self.num_char_classes)
self.length_logit_shape = (self.batch_size, self.seq_length + 1)
# Placeholder knows image dimensions, but not batch size.
self.input_images = tf.placeholder(
tf.float32,
shape=(None, self.image_height, self.image_width, 3),
name='input_node')
self.initialize_fakes()
def initialize_fakes(self):
self.images_shape = (self.batch_size, self.image_height, self.image_width,
3)
self.fake_images = tf.constant(
self.rng.randint(low=0, high=255,
size=self.images_shape).astype('float32'),
name='input_node')
self.fake_conv_tower_np = self.rng.randn(
*self.conv_tower_shape).astype('float32')
self.fake_images = self.rng.randint(
low=0, high=255, size=self.images_shape).astype('float32')
self.fake_conv_tower_np = self.rng.randn(*self.conv_tower_shape).astype(
'float32')
self.fake_conv_tower = tf.constant(self.fake_conv_tower_np)
self.fake_logits = tf.constant(
self.rng.randn(*self.chars_logit_shape).astype('float32'))
......@@ -74,33 +77,44 @@ class ModelTest(tf.test.TestCase):
def create_model(self, charset=None):
return model.Model(
self.num_char_classes, self.seq_length, num_views=4, null_code=62,
self.num_char_classes,
self.seq_length,
num_views=4,
null_code=62,
charset=charset)
def test_char_related_shapes(self):
ocr_model = self.create_model()
charset = create_fake_charset(self.num_char_classes)
ocr_model = self.create_model(charset=charset)
with self.test_session() as sess:
endpoints_tf = ocr_model.create_base(
images=self.fake_images, labels_one_hot=None)
images=self.input_images, labels_one_hot=None)
sess.run(tf.global_variables_initializer())
endpoints = sess.run(endpoints_tf)
self.assertEqual((self.batch_size, self.seq_length,
self.num_char_classes), endpoints.chars_logit.shape)
self.assertEqual((self.batch_size, self.seq_length,
self.num_char_classes), endpoints.chars_log_prob.shape)
tf.tables_initializer().run()
endpoints = sess.run(
endpoints_tf, feed_dict={self.input_images: self.fake_images})
self.assertEqual(
(self.batch_size, self.seq_length, self.num_char_classes),
endpoints.chars_logit.shape)
self.assertEqual(
(self.batch_size, self.seq_length, self.num_char_classes),
endpoints.chars_log_prob.shape)
self.assertEqual((self.batch_size, self.seq_length),
endpoints.predicted_chars.shape)
self.assertEqual((self.batch_size, self.seq_length),
endpoints.predicted_scores.shape)
self.assertEqual((self.batch_size,), endpoints.predicted_text.shape)
self.assertEqual((self.batch_size,), endpoints.predicted_conf.shape)
self.assertEqual((self.batch_size,), endpoints.normalized_seq_conf.shape)
def test_predicted_scores_are_within_range(self):
ocr_model = self.create_model()
_, _, scores = ocr_model.char_predictions(self.fake_logits)
with self.test_session() as sess:
scores_np = sess.run(scores)
scores_np = sess.run(
scores, feed_dict={self.input_images: self.fake_images})
values_in_range = (scores_np >= 0.0) & (scores_np <= 1.0)
self.assertTrue(
......@@ -111,10 +125,11 @@ class ModelTest(tf.test.TestCase):
def test_conv_tower_shape(self):
with self.test_session() as sess:
ocr_model = self.create_model()
conv_tower = ocr_model.conv_tower_fn(self.fake_images)
conv_tower = ocr_model.conv_tower_fn(self.input_images)
sess.run(tf.global_variables_initializer())
conv_tower_np = sess.run(conv_tower)
conv_tower_np = sess.run(
conv_tower, feed_dict={self.input_images: self.fake_images})
self.assertEqual(self.conv_tower_shape, conv_tower_np.shape)
......@@ -124,11 +139,12 @@ class ModelTest(tf.test.TestCase):
# updates, gradients and variances. It also depends on the type of
# optimizer used.
ocr_model = self.create_model()
ocr_model.create_base(images=self.fake_images, labels_one_hot=None)
ocr_model.create_base(images=self.input_images, labels_one_hot=None)
with self.test_session() as sess:
tfprof_root = tf.profiler.profile(
sess.graph,
options=tf.profiler.ProfileOptionBuilder.trainable_variables_parameter())
options=tf.profiler.ProfileOptionBuilder
.trainable_variables_parameter())
model_size_bytes = 4 * tfprof_root.total_parameters
self.assertLess(model_size_bytes, 1 * 2**30)
......@@ -158,7 +174,7 @@ class ModelTest(tf.test.TestCase):
loss = model.sequence_loss_fn(self.fake_logits, self.fake_labels)
with self.test_session() as sess:
loss_np = sess.run(loss)
loss_np = sess.run(loss, feed_dict={self.input_images: self.fake_images})
# This test checks that the loss function is 'runnable'.
self.assertEqual(loss_np.shape, tuple())
......@@ -172,7 +188,8 @@ class ModelTest(tf.test.TestCase):
Returns:
a list of tensors with encoded image coordinates in them.
"""
batch_size, h, w, _ = net.shape.as_list()
batch_size = tf.shape(net)[0]
_, h, w, _ = net.shape.as_list()
h_loc = [
tf.tile(
tf.reshape(
......@@ -197,11 +214,12 @@ class ModelTest(tf.test.TestCase):
conv_w_coords_tf = model.encode_coordinates_fn(self.fake_conv_tower)
with self.test_session() as sess:
conv_w_coords = sess.run(conv_w_coords_tf)
conv_w_coords = sess.run(
conv_w_coords_tf, feed_dict={self.input_images: self.fake_images})
batch_size, height, width, feature_size = self.conv_tower_shape
self.assertEqual(conv_w_coords.shape, (batch_size, height, width,
feature_size + height + width))
self.assertEqual(conv_w_coords.shape,
(batch_size, height, width, feature_size + height + width))
def test_disabled_coordinate_encoding_returns_features_unchanged(self):
model = self.create_model()
......@@ -209,7 +227,8 @@ class ModelTest(tf.test.TestCase):
conv_w_coords_tf = model.encode_coordinates_fn(self.fake_conv_tower)
with self.test_session() as sess:
conv_w_coords = sess.run(conv_w_coords_tf)
conv_w_coords = sess.run(
conv_w_coords_tf, feed_dict={self.input_images: self.fake_images})
self.assertAllEqual(conv_w_coords, self.fake_conv_tower_np)
......@@ -221,7 +240,8 @@ class ModelTest(tf.test.TestCase):
conv_w_coords_tf = model.encode_coordinates_fn(fake_conv_tower)
with self.test_session() as sess:
conv_w_coords = sess.run(conv_w_coords_tf)
conv_w_coords = sess.run(
conv_w_coords_tf, feed_dict={self.input_images: self.fake_images})
# Original features
self.assertAllEqual(conv_w_coords[0, :, :, :4],
......@@ -261,10 +281,11 @@ class ModelTest(tf.test.TestCase):
class CharsetMapperTest(tf.test.TestCase):
def test_text_corresponds_to_ids(self):
charset = create_fake_charset(36)
ids = tf.constant(
[[17, 14, 21, 21, 24], [32, 24, 27, 21, 13]], dtype=tf.int64)
ids = tf.constant([[17, 14, 21, 21, 24], [32, 24, 27, 21, 13]],
dtype=tf.int64)
charset_mapper = model.CharsetMapper(charset)
with self.test_session() as sess:
......
......@@ -111,7 +111,7 @@ class SequenceLayerBase(object):
self._mparams = method_params
self._net = net
self._labels_one_hot = labels_one_hot
self._batch_size = net.get_shape().dims[0].value
self._batch_size = tf.shape(net)[0]
# Initialize parameters for char logits which will be computed on the fly
# inside an LSTM decoder.
......@@ -275,7 +275,7 @@ class NetSlice(SequenceLayerBase):
def __init__(self, *args, **kwargs):
super(NetSlice, self).__init__(*args, **kwargs)
self._zero_label = tf.zeros(
[self._batch_size, self._params.num_char_classes])
tf.stack([self._batch_size, self._params.num_char_classes]))
def get_image_feature(self, char_index):
"""Returns a subset of image features for a character.
......@@ -352,7 +352,7 @@ class Attention(SequenceLayerBase):
def __init__(self, *args, **kwargs):
super(Attention, self).__init__(*args, **kwargs)
self._zero_label = tf.zeros(
[self._batch_size, self._params.num_char_classes])
tf.stack([self._batch_size, self._params.num_char_classes]))
def get_eval_input(self, prev, i):
"""See SequenceLayerBase.get_eval_input for details."""
......
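The `tf.zeros(tf.stack([...]))` changes above follow the same pattern as the `self._batch_size = tf.shape(net)[0]` change: when the batch dimension is only known at run time, the shape must be built as a tensor rather than a Python list. A minimal sketch, with an assumed number of character classes:
```python
# Hedged sketch (TF1): build a zeros tensor whose batch dimension is dynamic.
import tensorflow as tf

num_char_classes = 134  # assumed value for illustration
net = tf.placeholder(tf.float32, shape=(None, 16, 288))
batch_size = tf.shape(net)[0]
zero_label = tf.zeros(tf.stack([batch_size, num_char_classes]))
```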
......@@ -78,3 +78,20 @@ def variables_to_restore(scope=None, strip_scope=False):
return variable_map
else:
return {v.op.name: v for v in slim.get_variables_to_restore()}
def ConvertAllInputsToTensors(func):
"""A decorator to convert all function's inputs into tensors.
Args:
func: a function to decorate.
Returns:
A decorated function.
"""
def FuncWrapper(*args):
tensors = [tf.convert_to_tensor(a) for a in args]
return func(*tensors)
return FuncWrapper
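A hedged usage sketch of the decorator: callers may pass NumPy arrays or plain Python lists, and the wrapped function receives Tensors (this is what lets `max_char_logprob_cumsum` above accept either).
```python
# Toy example: both arguments are converted via tf.convert_to_tensor.
import numpy as np
import tensorflow as tf

@ConvertAllInputsToTensors
def total(a, b):
  return tf.reduce_sum(a) + tf.reduce_sum(b)

result = total(np.ones((2, 2), dtype=np.float32), [1.0, 2.0])  # a Tensor
```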
## DELF installation
### Installation script
We now have a script to do the entire installation in one shot. Navigate to the
directory `models/research/delf/delf/python/training`, then run:
```bash
# From models/research/delf/delf/python/training
bash install_delf.sh
```
If this works, you are done! If not, see below for detailed instructions for
installing this codebase and its dependencies.
*Please note that this installation script only works on 64-bit Linux
architectures due to the `protoc` binary that is automatically downloaded. If
you wish to install the DELF library on other architectures please update the
[`install_delf.sh`](delf/python/training/install_delf.sh) script by referencing
the desired `protoc`
[binary release](https://github.com/protocolbuffers/protobuf/releases).*
In more detail: the `install_delf.sh` script installs both the DELF library and
its dependencies in the following sequence:
* Install TensorFlow 2.2 and TensorFlow 2.2 for GPU.
* Install the [TF-Slim](https://github.com/google-research/tf-slim) library
from source.
* Download [protoc](https://github.com/protocolbuffers/protobuf) and compile
the DELF Protocol Buffers.
* Install the matplotlib, numpy, scikit-image, scipy and python3-tk Python
libraries.
* Install the
[TensorFlow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection)
from the cloned TensorFlow Model Garden repository.
* Install the DELF package.
### Tensorflow
[![TensorFlow 2.1](https://img.shields.io/badge/tensorflow-2.1-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.1.0)
[![TensorFlow 2.2](https://img.shields.io/badge/tensorflow-2.2-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
For detailed steps to install Tensorflow, follow the
......@@ -11,9 +46,9 @@ typical user can install Tensorflow using one of the following commands:
```bash
# For CPU:
pip3 install 'tensorflow'
pip3 install 'tensorflow>=2.2.0'
# For GPU:
pip3 install 'tensorflow-gpu'
pip3 install 'tensorflow-gpu>=2.2.0'
```
### TF-Slim
......
# Deep Local and Global Image Features
[![TensorFlow 2.1](https://img.shields.io/badge/tensorflow-2.1-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.1.0)
[![TensorFlow 2.2](https://img.shields.io/badge/tensorflow-2.2-brightgreen)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
This project presents code for deep local and global image feature methods,
......@@ -41,7 +41,7 @@ DELG:
```
"Unifying Deep Local and Global Features for Image Search",
B. Cao*, A. Araujo* and J. Sim,
arxiv:2001.05027
Proc. ECCV'20
```
GLDv2:
......@@ -55,11 +55,11 @@ Proc. CVPR'20
## News
- [Jul'20] Check out our ECCV'20 paper:
["Unifying Deep Local and Global Features for Image Search"](https://arxiv.org/abs/2001.05027)
- [Apr'20] Check out our CVPR'20 paper: ["Google Landmarks Dataset v2 - A
Large-Scale Benchmark for Instance-Level Recognition and
Retrieval"](https://arxiv.org/abs/2004.01804)
- [Jan'20] Check out our new paper:
["Unifying Deep Local and Global Features for Image Search"](https://arxiv.org/abs/2001.05027)
- [Jun'19] DELF achieved 2nd place in
[CVPR Visual Localization challenge (Local Features track)](https://sites.google.com/corp/view/ltvl2019).
See our slides
......@@ -182,104 +182,55 @@ directories therein, `protos` and `python`.
### `delf/protos`
This directory contains protobufs:
- `aggregation_config.proto`: protobuf for configuring local feature
aggregation.
- `box.proto`: protobuf for serializing detected boxes.
- `datum.proto`: general-purpose protobuf for serializing float tensors.
- `delf_config.proto`: protobuf for configuring DELF/DELG extraction.
- `feature.proto`: protobuf for serializing DELF features.
This directory contains protobufs for local feature aggregation
(`aggregation_config.proto`), serializing detected boxes (`box.proto`),
serializing float tensors (`datum.proto`), configuring DELF/DELG extraction
(`delf_config.proto`), and serializing local features (`feature.proto`).
### `delf/python`
This directory contains files for several different purposes:
- `box_io.py`, `datum_io.py`, `feature_io.py` are helper files for reading and
writing tensors and features.
- `delf_v1.py` contains code to create DELF models.
- `feature_aggregation_extractor.py` contains a module to perform local
feature aggregation.
- `feature_aggregation_similarity.py` contains a module to perform similarity
computation for aggregated local features.
- `feature_extractor.py` contains the code to extract features using DELF.
This is particularly useful for extracting features over multiple scales,
with keypoint selection based on attention scores, and PCA/whitening
post-processing.
The subdirectory `delf/python/examples` contains sample scripts to run DELF
feature extraction/matching, and object detection:
- `delf_config_example.pbtxt` shows an example instantiation of the DelfConfig
proto, used for DELF feature extraction.
- `detector.py` is a module to construct an object detector function.
- `extract_boxes.py` enables object detection from a list of images.
- `extract_features.py` enables DELF extraction from a list of images.
- `extractor.py` is a module to construct a DELF/DELG local feature extraction
function.
- `match_images.py` supports image matching using DELF features extracted
using `extract_features.py`.
This directory contains files for several different purposes, such as:
reading/writing tensors/features (`box_io.py`, `datum_io.py`, `feature_io.py`),
local feature aggregation extraction and similarity computation
(`feature_aggregation_extractor.py`, `feature_aggregation_similarity.py`) and
helper functions for image/feature loading/processing (`utils.py`,
`feature_extractor.py`).
The subdirectory `delf/python/delg` contains sample scripts/configs related to
the DELG paper:
The subdirectory `delf/python/examples` contains sample scripts to run DELF/DELG
feature extraction/matching (`extractor.py`, `extract_features.py`,
`match_images.py`) and object detection (`detector.py`, `extract_boxes.py`).
`delf_config_example.pbtxt` shows an example instantiation of the DelfConfig
proto, used for DELF feature extraction.
- `delg_gld_config.pbtxt` gives the DelfConfig used in DELG paper.
- `extract_features.py` for local+global feature extraction on Revisited
datasets.
- `perform_retrieval.py` for performing retrieval/evaluating methods on
Revisited datasets.
The subdirectory `delf/python/delg` contains sample scripts/configs related to
the DELG paper: `extract_features.py` for local+global feature extraction (with
an example `delg_gld_config.pbtxt`) and `perform_retrieval.py` for performing
retrieval/scoring.
The subdirectory `delf/python/detect_to_retrieve` contains sample
scripts/configs related to the Detect-to-Retrieve paper:
- `aggregation_extraction.py` is a library to extract/save feature
aggregation.
- `boxes_and_features_extraction.py` is a library to extract/save boxes and
DELF features.
- `cluster_delf_features.py` for local feature clustering.
- `dataset.py` for parsing/evaluating results on Revisited Oxford/Paris
datasets.
- `delf_gld_config.pbtxt` gives the DelfConfig used in Detect-to-Retrieve
paper.
- `extract_aggregation.py` for aggregated local feature extraction.
- `extract_index_boxes_and_features.py` for index image local feature
extraction / bounding box detection on Revisited datasets.
- `extract_query_features.py` for query image local feature extraction on
Revisited datasets.
- `image_reranking.py` is a module to re-rank images with geometric
verification.
- `perform_retrieval.py` for performing retrieval/evaluating methods using
aggregated local features on Revisited datasets.
- `index_aggregation_config.pbtxt`, `query_aggregation_config.pbtxt` give
AggregationConfig's for Detect-to-Retrieve experiments.
scripts/configs related to the Detect-to-Retrieve paper, for feature/box
extraction/aggregation/clustering (`aggregation_extraction.py`,
`boxes_and_features_extraction.py`, `cluster_delf_features.py`,
`extract_aggregation.py`, `extract_index_boxes_and_features.py`,
`extract_query_features.py`), image retrieval/reranking (`perform_retrieval.py`,
`image_reranking.py`), along with configs used for feature
extraction/aggregation (`delf_gld_config.pbtxt`,
`index_aggregation_config.pbtxt`, `query_aggregation_config.pbtxt`) and
Revisited Oxford/Paris dataset parsing/evaluation (`dataset.py`).
The subdirectory `delf/python/google_landmarks_dataset` contains sample
scripts/modules for computing GLD metrics / reproducing results from the GLDv2
paper:
- `compute_recognition_metrics.py` performs recognition metric computation
given input predictions and solution files.
- `compute_retrieval_metrics.py` performs retrieval metric computation given
input predictions and solution files.
- `dataset_file_io.py` is a module for dataset-related file IO.
- `metrics.py` is a module for GLD metric computation.
- `rn101_af_gldv2clean_config.pbtxt` gives the DelfConfig used in the
ResNet101-ArcFace (trained on GLDv2-train-clean) baseline used in the GLDv2
paper.
scripts/modules for computing GLD metrics (`metrics.py`,
`compute_recognition_metrics.py`, `compute_retrieval_metrics.py`), GLD file IO
(`dataset_file_io.py`) / reproducing results from the GLDv2 paper
(`rn101_af_gldv2clean_config.pbtxt` and the instructions therein).
The subdirectory `delf/python/training` contains sample scripts/modules for
performing DELF training:
- `datasets/googlelandmarks.py` is the dataset module used for training.
- `model/delf_model.py` is the model module used for training.
- `model/export_model.py` is a script for exporting trained models in the
format used by the inference code.
- `model/export_model_utils.py` is a module with utilities for model
exporting.
- `model/resnet50.py` is a module with a backbone RN50 implementation.
- `build_image_dataset.py` converts downloaded dataset into TFRecords format
for training.
- `train.py` is the main training script.
performing model training (`train.py`) based on a ResNet50 DELF model
(`model/resnet50.py`, `model/delf_model.py`), also presenting relevant model
exporting scripts and associated utils (`model/export_model.py`,
`model/export_global_model.py`, `model/export_model_utils.py`) and dataset
downloading/preprocessing (`download_dataset.sh`, `build_image_dataset.py`,
`datasets/googlelandmarks.py`).
Besides these, other files in the different subdirectories contain tests for the
various modules.
......@@ -290,6 +241,16 @@ Andr&eacute; Araujo (@andrefaraujo)
## Release history
### Jul, 2020
- Full TF2 support. Only one minor `compat.v1` usage left. Updated
instructions to require TF2.2
- Refactored / much improved training code, with very detailed, step-by-step
instructions
**Thanks to contributors**: Dan Anghel, Barbara Fusinska and Andr&eacute;
Araujo.
### May, 2020
- Codebase is now Python3-first
......
......@@ -24,34 +24,9 @@ cd models/research/delf/delf/python/training
## Install the DELF Library
The DELF Python library can be installed by running the
[`install_delf.sh`](./install_delf.sh) script using the command:
```
bash install_delf.sh
```
The script installs both the DELF library and its dependencies in the following
sequence:
* Install TensorFlow 2.2 and TensorFlow 2.2 for GPU.
* Install the [TF-Slim](https://github.com/google-research/tf-slim) library
from source.
* Download [protoc](https://github.com/protocolbuffers/protobuf) and compile
the DELF Protocol Buffers.
* Install the matplotlib, numpy, scikit-image, scipy and python3-tk Python
libraries.
* Install the
[TensorFlow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection)
from the cloned TensorFlow Model Garden repository.
* Install the DELF package.
*Please note that the current installation only works on 64 bits Linux
architectures due to the `protoc` binary downloaded by the installation script.
If you wish to install the DELF library on other architectures please update the
[`install_delf.sh`](./install_delf.sh) script by referencing the desired
`protoc`
[binary release](https://github.com/protocolbuffers/protobuf/releases).*
To be able to use this code, please follow
[these instructions](../../../INSTALL_INSTRUCTIONS.md) to properly install the
DELF library.
## Download the GLDv2 Training Data
......
......@@ -22,7 +22,7 @@ install_requires = [
'pandas >= 0.24.2',
'numpy >= 1.16.1',
'scipy >= 1.2.2',
'tensorflow >= 2.0.0b1',
'tensorflow >= 2.2.0',
'tf_slim >= 1.1',
'tensorflow_probability >= 0.9.0',
]
......
# Contributing to the Tensorflow Object Detection API
# Contributing to the TensorFlow Object Detection API
Patches to Tensorflow Object Detection API are welcome!
Patches to TensorFlow Object Detection API are welcome!
We require contributors to fill out either the individual or corporate
Contributor License Agreement (CLA).
......@@ -9,5 +9,5 @@ Contributor License Agreement (CLA).
* If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA](http://code.google.com/legal/corporate-cla-v1.0.html).
Please follow the
[Tensorflow contributing guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md)
[TensorFlow contributing guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md)
when submitting pull requests.
![TensorFlow Requirement: 1.15](https://img.shields.io/badge/TensorFlow%20Requirement-1.15-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Tensorflow Object Detection API
# TensorFlow Object Detection API
[![TensorFlow 2.2](https://img.shields.io/badge/TensorFlow-2.2-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[![TensorFlow 1.15](https://img.shields.io/badge/TensorFlow-1.15-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v1.15.0)
[![Python 3.6](https://img.shields.io/badge/Python-3.6-3776AB)](https://www.python.org/downloads/release/python-360/)
Creating accurate machine learning models capable of localizing and identifying
multiple objects in a single image remains a core challenge in computer vision.
......@@ -11,7 +11,7 @@ models. At Google we’ve certainly found this codebase to be useful for our
computer vision needs, and we hope that you will as well. <p align="center">
<img src="g3doc/img/kites_detections_output.jpg" width=676 height=450> </p>
Contributions to the codebase are welcome and we would love to hear back from
you if you find this API useful. Finally if you use the Tensorflow Object
you if you find this API useful. Finally if you use the TensorFlow Object
Detection API for a research publication, please consider citing:
```
......@@ -26,91 +26,93 @@ Song Y, Guadarrama S, Murphy K, CVPR 2017
<img src="g3doc/img/tf-od-api-logo.png" width=140 height=195>
</p>
## Maintainers
## Support for TensorFlow 2 and 1
The TensorFlow Object Detection API supports both TensorFlow 2 (TF2) and
TensorFlow 1 (TF1). A majority of the modules in the library are both TF1 and
TF2 compatible. In cases where they are not, we provide two versions.
Name | GitHub
-------------- | ---------------------------------------------
Jonathan Huang | [jch1](https://github.com/jch1)
Vivek Rathod | [tombstone](https://github.com/tombstone)
Ronny Votel | [ronnyvotel](https://github.com/ronnyvotel)
Derek Chow | [derekjchow](https://github.com/derekjchow)
Chen Sun | [jesu9](https://github.com/jesu9)
Menglong Zhu | [dreamdragon](https://github.com/dreamdragon)
Alireza Fathi | [afathi3](https://github.com/afathi3)
Zhichao Lu | [pkulzc](https://github.com/pkulzc)
## Table of contents
Setup:
* <a href='g3doc/installation.md'>Installation</a><br>
Quick Start:
* <a href='object_detection_tutorial.ipynb'>
Quick Start: Jupyter notebook for off-the-shelf inference</a><br>
* <a href="g3doc/running_pets.md">Quick Start: Training a pet detector</a><br>
Customizing a Pipeline:
* <a href='g3doc/configuring_jobs.md'>
Configuring an object detection pipeline</a><br>
* <a href='g3doc/preparing_inputs.md'>Preparing inputs</a><br>
Running:
* <a href='g3doc/running_locally.md'>Running locally</a><br>
* <a href='g3doc/running_on_cloud.md'>Running on the cloud</a><br>
Extras:
* <a href='g3doc/detection_model_zoo.md'>Tensorflow detection model zoo</a><br>
* <a href='g3doc/exporting_models.md'>
Exporting a trained model for inference</a><br>
* <a href='g3doc/tpu_exporters.md'>
Exporting a trained model for TPU inference</a><br>
* <a href='g3doc/defining_your_own_model.md'>
Defining your own model architecture</a><br>
* <a href='g3doc/using_your_own_dataset.md'>
Bringing in your own dataset</a><br>
* <a href='g3doc/evaluation_protocols.md'>
Supported object detection evaluation protocols</a><br>
* <a href='g3doc/oid_inference_and_evaluation.md'>
Inference and evaluation on the Open Images dataset</a><br>
* <a href='g3doc/instance_segmentation.md'>
Run an instance segmentation model</a><br>
* <a href='g3doc/challenge_evaluation.md'>
Run the evaluation for the Open Images Challenge 2018/2019</a><br>
* <a href='g3doc/tpu_compatibility.md'>
TPU compatible detection pipelines</a><br>
* <a href='g3doc/running_on_mobile_tensorflowlite.md'>
Running object detection on mobile devices with TensorFlow Lite</a><br>
* <a href='g3doc/context_rcnn.md'>
Context R-CNN documentation for data preparation, training, and export</a><br>
Although we will continue to maintain the TF1 models and provide support, we
encourage users to try the Object Detection API with TF2 for the following
reasons:
## Getting Help
* We provide new architectures supported in TF2 only and we will continue to
develop in TF2 going forward.
To get help with issues you may encounter using the Tensorflow Object Detection
API, create a new question on [StackOverflow](https://stackoverflow.com/) with
the tags "tensorflow" and "object-detection".
* The popular models we ported from TF1 to TF2 achieve the same performance.
Please report bugs (actually broken code, not usage questions) to the
tensorflow/models GitHub
[issue tracker](https://github.com/tensorflow/models/issues), prefixing the
issue name with "object_detection".
* A single training and evaluation binary now supports both GPU and TPU
  distribution strategies, making it possible to train models with synchronous
  SGD by default.
* Eager execution with new binaries makes debugging easy!
Finally, if you are an existing user of the Object Detection API, we have
retained the same config language you are familiar with and ensured that the
TF2 training/eval binary takes the same arguments as our TF1 binaries.
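For illustration, a TF2 training run is typically launched with a single
command along these lines (the script name and flags here are assumptions based
on the binaries shipped with the API; see the version-specific documentation
linked below for authoritative usage):

```
# Hypothetical invocation; check the TF2 docs for the exact script and flags.
python object_detection/model_main_tf2.py \
  --pipeline_config_path=path/to/pipeline.config \
  --model_dir=path/to/model_dir \
  --alsologtostderr
```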
Note: The models we provide in [TF2 Zoo](g3doc/tf2_detection_zoo.md) and
[TF1 Zoo](g3doc/tf1_detection_zoo.md) are specific to the TensorFlow major
version and are not interoperable.
Please check [FAQ](g3doc/faq.md) for frequently asked questions before reporting
an issue.
Please select one of the two links below for TensorFlow version specific
documentation of the Object Detection API:
## Release information
### June 17th, 2020
<!-- mdlint off(WHITESPACE_LINE_LENGTH) -->
| [![Object Detection API TensorFlow 2](https://img.shields.io/badge/Object%20Detection%20API-TensorFlow%202-orange)](g3doc/tf2.md) | [![TensorFlow 2 Model Zoo](https://img.shields.io/badge/Model%20Zoo-TensorFlow%202-Orange)](g3doc/tf2_detection_zoo.md) |
|---|---|
| [![Object Detection API TensorFlow 1](https://img.shields.io/badge/Object%20Detection%20API-TensorFlow%201-orange)](g3doc/tf1.md) | [![TensorFlow 1 Model Zoo](https://img.shields.io/badge/Model%20Zoo-TensorFlow%201-Orange)](g3doc/tf1_detection_zoo.md) |
<!-- mdlint on -->
## What's New
### TensorFlow 2 Support
We are happy to announce that the TF OD API officially supports TF2! Our release
includes:
* New binaries for train/eval/export that are designed to run in eager mode.
* A suite of TF2 compatible (Keras-based) models; this includes migrations of
our most popular TF1.x models (e.g., SSD with MobileNet, RetinaNet,
Faster R-CNN, Mask R-CNN), as well as a few new architectures for which we
will only maintain TF2 implementations:
1. CenterNet - a simple and effective anchor-free architecture based on
the recent [Objects as Points](https://arxiv.org/abs/1904.07850) paper by
Zhou et al.
2. [EfficientDet](https://arxiv.org/abs/1911.09070) - a recent family of
SOTA models discovered with the help of Neural Architecture Search.
* COCO pre-trained weights for all of the models provided as TF2 style
object-based checkpoints.
* Access to [Distribution Strategies](https://www.tensorflow.org/guide/distributed_training)
  for distributed training --- our models are designed to be trainable using sync
  multi-GPU and TPU platforms (a minimal illustration follows this list).
* Colabs demo’ing eager mode training and inference.
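As a minimal illustration of the synchronous training that Distribution
Strategies enable (this is plain TF2 with a placeholder toy model and random
data, not the Object Detection API's internal training loop):

```python
import tensorflow as tf

# Minimal sketch of synchronous multi-replica training with a TF2 distribution
# strategy. The toy model and random data below are placeholders.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
  # Variables created inside the scope are mirrored across all replicas.
  model = tf.keras.Sequential([
      tf.keras.layers.Dense(64, activation='relu', input_shape=(32,)),
      tf.keras.layers.Dense(10),
  ])
  model.compile(
      optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

# Each global batch is split evenly across the replicas (synchronous SGD).
features = tf.random.uniform([256, 32])
labels = tf.random.uniform([256], maxval=10, dtype=tf.int32)
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(64)

model.fit(dataset, epochs=1)
```

As described above, the TF2 train/eval binaries configure the appropriate
strategy (multi-GPU or TPU) themselves, so this wiring is normally not
something users write by hand.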
See our release blogpost [here](https://blog.tensorflow.org/2020/07/tensorflow-2-meets-object-detection-api.html).
If you are an existing user of the TF OD API using TF 1.x, don’t worry, we’ve
got you covered.
**Thanks to contributors**: Akhil Chinnakotla, Allen Lavoie, Anirudh Vegesana,
Anjali Sridhar, Austin Myers, Dan Kondratyuk, David Ross, Derek Chow, Jaeyoun
Kim, Jing Li, Jonathan Huang, Jordi Pont-Tuset, Karmel Allison, Kathy Ruan,
Kaushik Shivakumar, Lu He, Mingxing Tan, Pengchong Jin, Ronny Votel, Sara Beery,
Sergi Caelles Prat, Shan Yang, Sudheendra Vijayanarasimhan, Tina Tian, Tomer
Kaftan, Vighnesh Birodkar, Vishnu Banna, Vivek Rathod, Yanhui Liang, Yiming Shi,
Yixin Shi, Yu-hui Chen, Zhichao Lu.
### Context R-CNN
We have released [Context R-CNN](https://arxiv.org/abs/1912.03538), a model that
uses attention to incorporate contextual information from images (e.g. from
temporally nearby frames taken by a static camera) in order to improve accuracy.
Importantly, these contextual images need not be labeled.
* When applied to a challenging wildlife detection dataset
([Snapshot Serengeti](http://lila.science/datasets/snapshot-serengeti)),
Context R-CNN with context from up to a month of images outperforms a
single-frame baseline by 17.9% mAP, and outperforms S3D (a 3d convolution
based baseline) by 11.2% mAP.
......@@ -118,282 +120,48 @@ Importantly, these contextual images need not be labeled.
novel camera deployment to improve performance at that camera, boosting
model generalizability.
Read about Context R-CNN on the Google AI blog
[here](https://ai.googleblog.com/2020/06/leveraging-temporal-context-for-object.html).
We have provided code for generating data with associated context
[here](g3doc/context_rcnn.md), and a sample config for a Context R-CNN model
[here](samples/configs/context_rcnn_resnet101_snapshot_serengeti_sync.config).
Snapshot Serengeti-trained Faster R-CNN and Context R-CNN models can be found in
the [model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md#snapshot-serengeti-camera-trap-trained-models).
the
[model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md#snapshot-serengeti-camera-trap-trained-models).
A colab demonstrating Context R-CNN is provided
[here](colab_tutorials/context_rcnn_tutorial.ipynb).
<b>Thanks to contributors</b>: Sara Beery, Jonathan Huang, Guanhang Wu, Vivek
Rathod, Ronny Votel, Zhichao Lu, David Ross, Pietro Perona, Tanya Birch, and
the Wildlife Insights AI Team.
### May 19th, 2020
We have released [MobileDets](https://arxiv.org/abs/2004.14525), a set of
high-performance models for mobile CPUs, DSPs and EdgeTPUs.
* MobileDets outperform MobileNetV3+SSDLite by 1.7 mAP at comparable mobile
CPU inference latencies. MobileDets also outperform MobileNetV2+SSDLite by
1.9 mAP on mobile CPUs, 3.7 mAP on EdgeTPUs and 3.4 mAP on DSPs while
running equally fast. MobileDets also offer up to 2x speedup over MnasFPN on
EdgeTPUs and DSPs.
For each of the three hardware platforms we have released model definition,
model checkpoints trained on the COCO14 dataset and converted TFLite models in
fp32 and/or uint8.
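For reference, a converted TFLite model can be exercised with the standard
TFLite Python interpreter roughly as follows (the model path is a placeholder;
input and output tensor layouts depend on the specific model, so inspect the
returned details rather than assuming them):

```python
import numpy as np
import tensorflow as tf

# Minimal sketch of running a converted TFLite model; the path and shapes are
# placeholders, not a specific released checkpoint.
interpreter = tf.lite.Interpreter(model_path='/path/to/model.tflite')
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Fabricate an input matching the model's expected shape and dtype.
input_shape = input_details[0]['shape']
dummy_input = np.zeros(input_shape, dtype=input_details[0]['dtype'])

interpreter.set_tensor(input_details[0]['index'], dummy_input)
interpreter.invoke()

# Detection outputs (e.g. boxes, classes, scores, count) come back as separate
# tensors; consult output_details for their order in a given model.
for detail in output_details:
  print(detail['name'], interpreter.get_tensor(detail['index']).shape)
```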
<b>Thanks to contributors</b>: Yunyang Xiong, Hanxiao Liu, Suyog Gupta, Berkin
Akin, Gabriel Bender, Pieter-Jan Kindermans, Mingxing Tan, Vikas Singh, Bo Chen,
Quoc Le, Zhichao Lu.
### May 7th, 2020
We have released a mobile model with the
[MnasFPN head](https://arxiv.org/abs/1912.01106).
* MnasFPN with MobileNet-V2 backbone is the most accurate (26.6 mAP at 183ms
on Pixel 1) mobile detection model we have released to date. With
depth-multiplier, MnasFPN with MobileNet-V2 backbone is 1.8 mAP higher than
MobileNet-V3-Large with SSDLite (23.8 mAP vs 22.0 mAP) at similar latency
(120ms) on Pixel 1.
We have released model definition, model checkpoints trained on the COCO14
dataset and a converted TFLite model.
<b>Thanks to contributors</b>: Bo Chen, Golnaz Ghiasi, Hanxiao Liu, Tsung-Yi
Lin, Dmitry Kalenichenko, Hartwig Adam, Quoc Le, Zhichao Lu, Jonathan Huang, Hao
Xu.
### Nov 13th, 2019
We have released MobileNetEdgeTPU SSDLite model.
* SSDLite with MobileNetEdgeTPU backbone, which achieves 10% mAP higher than
MobileNetV2 SSDLite (24.3 mAP vs 22 mAP) on a Google Pixel4 at comparable
latency (6.6ms vs 6.8ms).
Along with the model definition, we are also releasing model checkpoints trained
on the COCO dataset.
<b>Thanks to contributors</b>: Yunyang Xiong, Bo Chen, Suyog Gupta, Hanxiao Liu,
Gabriel Bender, Mingxing Tan, Berkin Akin, Zhichao Lu, Quoc Le
### Oct 15th, 2019
We have released two MobileNet V3 SSDLite models (presented in
[Searching for MobileNetV3](https://arxiv.org/abs/1905.02244)).
* SSDLite with MobileNet-V3-Large backbone, which is 27% faster than Mobilenet
V2 SSDLite (119ms vs 162ms) on a Google Pixel phone CPU at the same mAP.
* SSDLite with MobileNet-V3-Small backbone, which is 37% faster than MnasNet
SSDLite reduced with depth-multiplier (43ms vs 68ms) at the same mAP.
Along with the model definition, we are also releasing model checkpoints trained
on the COCO dataset.
<b>Thanks to contributors</b>: Bo Chen, Zhichao Lu, Vivek Rathod, Jonathan Huang
### July 1st, 2019
We have released an updated set of utils and an updated
[tutorial](g3doc/challenge_evaluation.md) for all three tracks of the
[Open Images Challenge 2019](https://storage.googleapis.com/openimages/web/challenge2019.html)!
The Instance Segmentation metric for
[Open Images V5](https://storage.googleapis.com/openimages/web/index.html) and
[Challenge 2019](https://storage.googleapis.com/openimages/web/challenge2019.html)
is part of this release. Check out
[the metric description](https://storage.googleapis.com/openimages/web/evaluation.html#instance_segmentation_eval)
on the Open Images website.
<b>Thanks to contributors</b>: Alina Kuznetsova, Rodrigo Benenson
### Feb 11, 2019
Rathod, Ronny Votel, Zhichao Lu, David Ross, Pietro Perona, Tanya Birch, and the
Wildlife Insights AI Team.
We have released detection models trained on the Open Images Dataset V4 in our
detection model zoo, including
## Release Notes
See [notes](g3doc/release_notes.md) for all past releases.
* Faster R-CNN detector with Inception Resnet V2 feature extractor
* SSD detector with MobileNet V2 feature extractor
* SSD detector with ResNet 101 FPN feature extractor (aka RetinaNet-101)
<b>Thanks to contributors</b>: Alina Kuznetsova, Yinxiao Li
### Sep 17, 2018
We have released Faster R-CNN detectors with ResNet-50 / ResNet-101 feature
extractors trained on the
[iNaturalist Species Detection Dataset](https://github.com/visipedia/inat_comp/blob/master/2017/README.md#bounding-boxes).
The models are trained on the training split of the iNaturalist data for 4M
iterations and achieve 55% and 58% mean AP@.5 over 2854 classes, respectively.
For more details please refer to this [paper](https://arxiv.org/abs/1707.06642).
<b>Thanks to contributors</b>: Chen Sun
### July 13, 2018
There are many new updates in this release, extending the functionality and
capability of the API:
* Moving from slim-based training to
[Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)-based
training.
* Support for [RetinaNet](https://arxiv.org/abs/1708.02002), and a
[MobileNet](https://ai.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
adaptation of RetinaNet.
* A novel SSD-based architecture called the
[Pooling Pyramid Network](https://arxiv.org/abs/1807.03284) (PPN).
* Releasing several [TPU](https://cloud.google.com/tpu/)-compatible models.
These can be found in the `samples/configs/` directory with a comment in the
pipeline configuration files indicating TPU compatibility.
* Support for quantized training.
* Updated documentation for new binaries, Cloud training, and
[Tensorflow Lite](https://www.tensorflow.org/mobile/tflite/).
See also our
[expanded announcement blogpost](https://ai.googleblog.com/2018/07/accelerated-training-and-inference-with.html)
and accompanying tutorial at the
[TensorFlow blog](https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193).
<b>Thanks to contributors</b>: Sara Robinson, Aakanksha Chowdhery, Derek Chow,
Pengchong Jin, Jonathan Huang, Vivek Rathod, Zhichao Lu, Ronny Votel
### June 25, 2018
Additional evaluation tools for the
[Open Images Challenge 2018](https://storage.googleapis.com/openimages/web/challenge.html)
are out. Check out our short tutorial on data preparation and running evaluation
[here](g3doc/challenge_evaluation.md)!
<b>Thanks to contributors</b>: Alina Kuznetsova
### June 5, 2018
We have released the implementation of evaluation metrics for both tracks of the
[Open Images Challenge 2018](https://storage.googleapis.com/openimages/web/challenge.html)
as a part of the Object Detection API - see the
[evaluation protocols](g3doc/evaluation_protocols.md) for more details.
Additionally, we have released a tool for hierarchical labels expansion for the
Open Images Challenge: check out
[oid_hierarchical_labels_expansion.py](dataset_tools/oid_hierarchical_labels_expansion.py).
<b>Thanks to contributors</b>: Alina Kuznetsova, Vittorio Ferrari, Jasper
Uijlings
### April 30, 2018
We have released a Faster R-CNN detector with ResNet-101 feature extractor
trained on [AVA](https://research.google.com/ava/) v2.1. Compared with other
commonly used object detectors, it changes the action classification loss
function to per-class Sigmoid loss to handle boxes with multiple labels. The
model is trained on the training split of AVA v2.1 for 1.5M iterations and
achieves a mean AP of 11.25% over 60 classes on the validation split of AVA v2.1.
For more details please refer to this [paper](https://arxiv.org/abs/1705.08421).
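To make the loss change concrete, here is a minimal, self-contained sketch (not
the API's internal implementation) contrasting a per-class sigmoid loss, which
lets one box carry several action labels, with a standard softmax loss, which
assumes exactly one label per box:

```python
import tensorflow as tf

# Toy example: 2 boxes, 3 action classes. With a sigmoid loss each class is an
# independent binary decision, so a box may carry multiple action labels.
logits = tf.constant([[2.0, -1.0, 0.5],
                      [0.1, 3.0, -2.0]])
multi_hot_labels = tf.constant([[1.0, 0.0, 1.0],   # box 0: two actions at once
                                [0.0, 1.0, 0.0]])  # box 1: a single action

# Per-class sigmoid cross-entropy, as in the loss change described above.
sigmoid_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(
        labels=multi_hot_labels, logits=logits))

# For contrast: softmax cross-entropy assumes exactly one class per box, so it
# cannot represent the "two actions at once" case above.
single_labels = tf.constant([0, 1])
softmax_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=single_labels, logits=logits))

print(float(sigmoid_loss), float(softmax_loss))
```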
<b>Thanks to contributors</b>: Chen Sun, David Ross
### April 2, 2018
Supercharge your mobile phones with the next generation mobile object detector!
We are adding support for MobileNet V2 with SSDLite presented in
[MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381).
This model is 35% faster than Mobilenet V1 SSD on a Google Pixel phone CPU
(200ms vs. 270ms) at the same accuracy. Along with the model definition, we are
also releasing a model checkpoint trained on the COCO dataset.
<b>Thanks to contributors</b>: Menglong Zhu, Mark Sandler, Zhichao Lu, Vivek
Rathod, Jonathan Huang
### February 9, 2018
We now support instance segmentation!! In this API update we support a number of
instance segmentation models similar to those discussed in the
[Mask R-CNN paper](https://arxiv.org/abs/1703.06870). For further details refer
to [our slides](http://presentations.cocodataset.org/Places17-GMRI.pdf) from the
2017 Coco + Places Workshop. Refer to the section on
[Running an Instance Segmentation Model](g3doc/instance_segmentation.md) for
instructions on how to configure a model that predicts masks in addition to
object bounding boxes.
<b>Thanks to contributors</b>: Alireza Fathi, Zhichao Lu, Vivek Rathod, Ronny
Votel, Jonathan Huang
### November 17, 2017
As a part of the Open Images V3 release we have released:
* An implementation of the Open Images evaluation metric and the
[protocol](g3doc/evaluation_protocols.md#open-images).
* Additional tools to separate inference of detection and evaluation (see
[this tutorial](g3doc/oid_inference_and_evaluation.md)).
* A new detection model trained on the Open Images V2 data release (see
[Open Images model](g3doc/detection_model_zoo.md#open-images-models)).
See more information on the
[Open Images website](https://github.com/openimages/dataset)!
<b>Thanks to contributors</b>: Stefan Popov, Alina Kuznetsova
### November 6, 2017
We have re-released faster versions of our (pre-trained) models in the
<a href='g3doc/detection_model_zoo.md'>model zoo</a>. In addition to what was
available before, we are also adding Faster R-CNN models trained on COCO with
Inception V2 and Resnet-50 feature extractors, as well as a Faster R-CNN with
Resnet-101 model trained on the KITTI dataset.
<b>Thanks to contributors</b>: Jonathan Huang, Vivek Rathod, Derek Chow, Tal
Remez, Chen Sun.
### October 31, 2017
We have released a new state-of-the-art model for object detection using the
Faster-RCNN with the
[NASNet-A image featurization](https://arxiv.org/abs/1707.07012). This model
achieves mAP of 43.1% on the test-dev validation dataset for COCO, improving on
the best available model in the zoo by 6% in terms of absolute mAP.
<b>Thanks to contributors</b>: Barret Zoph, Vijay Vasudevan, Jonathon Shlens,
Quoc Le
### August 11, 2017
## Getting Help
We have released an update to the
[Android Detect demo](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android)
which will now run models trained using the Tensorflow Object Detection API on
an Android device. By default, it currently runs a frozen SSD w/Mobilenet
detector trained on COCO, but we encourage you to try out other detection
models!
To get help with issues you may encounter using the TensorFlow Object Detection
API, create a new question on [StackOverflow](https://stackoverflow.com/) with
the tags "tensorflow" and "object-detection".
<b>Thanks to contributors</b>: Jonathan Huang, Andrew Harp
Please report bugs (actually broken code, not usage questions) to the
tensorflow/models GitHub
[issue tracker](https://github.com/tensorflow/models/issues), prefixing the
issue name with "object_detection".
### June 15, 2017
Please check the [FAQ](g3doc/faq.md) for frequently asked questions before
reporting an issue.
In addition to our base Tensorflow detection model definitions, this release
includes:
## Maintainers
* A selection of trainable detection models, including:
* Single Shot Multibox Detector (SSD) with MobileNet,
* SSD with Inception V2,
* Region-Based Fully Convolutional Networks (R-FCN) with Resnet 101,
* Faster RCNN with Resnet 101,
* Faster RCNN with Inception Resnet v2
* Frozen weights (trained on the COCO dataset) for each of the above models to
be used for out-of-the-box inference purposes.
* A [Jupyter notebook](colab_tutorials/object_detection_tutorial.ipynb) for
performing out-of-the-box inference with one of our released models
* Convenient [local training](g3doc/running_locally.md) scripts as well as
distributed training and evaluation pipelines via
[Google Cloud](g3doc/running_on_cloud.md).
<b>Thanks to contributors</b>: Jonathan Huang, Vivek Rathod, Derek Chow, Chen
Sun, Menglong Zhu, Matthew Tang, Anoop Korattikara, Alireza Fathi, Ian Fischer,
Zbigniew Wojna, Yang Song, Sergio Guadarrama, Jasper Uijlings, Viacheslav
Kovalevskyi, Kevin Murphy
* Jonathan Huang ([@GitHub jch1](https://github.com/jch1))
* Vivek Rathod ([@GitHub tombstone](https://github.com/tombstone))
* Vighnesh Birodkar ([@GitHub vighneshbirodkar](https://github.com/vighneshbirodkar))
* Austin Myers ([@GitHub austin-myers](https://github.com/austin-myers))
* Zhichao Lu ([@GitHub pkulzc](https://github.com/pkulzc))
* Ronny Votel ([@GitHub ronnyvotel](https://github.com/ronnyvotel))
* Yu-hui Chen ([@GitHub yuhuichen1015](https://github.com/yuhuichen1015))
* Derek Chow ([@GitHub derekjchow](https://github.com/derekjchow))
......@@ -17,9 +17,8 @@
"""Tests for box_predictor_builder."""
import unittest
import mock
from unittest import mock # pylint: disable=g-importing-member
import tensorflow.compat.v1 as tf
from google.protobuf import text_format
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
......
......@@ -14,7 +14,7 @@
# ==============================================================================
"""Tests for graph_rewriter_builder."""
import unittest
import mock
from unittest import mock # pylint: disable=g-importing-member
import tensorflow.compat.v1 as tf
import tf_slim as slim
......
......@@ -16,6 +16,7 @@
"""A function to build a DetectionModel from configuration."""
import functools
import sys
from object_detection.builders import anchor_generator_builder
from object_detection.builders import box_coder_builder
from object_detection.builders import box_predictor_builder
......@@ -58,6 +59,8 @@ if tf_version.is_tf2():
from object_detection.models.ssd_mobilenet_v2_fpn_keras_feature_extractor import SSDMobileNetV2FpnKerasFeatureExtractor
from object_detection.models.ssd_mobilenet_v2_keras_feature_extractor import SSDMobileNetV2KerasFeatureExtractor
from object_detection.predictors import rfcn_keras_box_predictor
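# The EfficientNet-BiFPN (EfficientDet) feature extractors are only imported
# when running under Python 3.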
if sys.version_info[0] >= 3:
from object_detection.models import ssd_efficientnet_bifpn_feature_extractor as ssd_efficientnet_bifpn
if tf_version.is_tf1():
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res
......@@ -99,6 +102,22 @@ if tf_version.is_tf2():
ssd_resnet_v1_fpn_keras.SSDResNet101V1FpnKerasFeatureExtractor,
'ssd_resnet152_v1_fpn_keras':
ssd_resnet_v1_fpn_keras.SSDResNet152V1FpnKerasFeatureExtractor,
'ssd_efficientnet-b0_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB0BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b1_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB1BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b2_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB2BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b3_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB3BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b4_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB4BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b5_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB5BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b6_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB6BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b7_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB7BiFPNKerasFeatureExtractor,
}
FASTER_RCNN_KERAS_FEATURE_EXTRACTOR_CLASS_MAP = {
......@@ -110,11 +129,11 @@ if tf_version.is_tf2():
frcnn_resnet_keras.FasterRCNNResnet152KerasFeatureExtractor,
'faster_rcnn_inception_resnet_v2_keras':
frcnn_inc_res_keras.FasterRCNNInceptionResnetV2KerasFeatureExtractor,
'fasret_rcnn_resnet50_fpn_keras':
'faster_rcnn_resnet50_fpn_keras':
frcnn_resnet_fpn_keras.FasterRCNNResnet50FpnKerasFeatureExtractor,
'fasret_rcnn_resnet101_fpn_keras':
'faster_rcnn_resnet101_fpn_keras':
frcnn_resnet_fpn_keras.FasterRCNNResnet101FpnKerasFeatureExtractor,
'fasret_rcnn_resnet152_fpn_keras':
'faster_rcnn_resnet152_fpn_keras':
frcnn_resnet_fpn_keras.FasterRCNNResnet152FpnKerasFeatureExtractor,
}
......@@ -310,6 +329,14 @@ def _build_ssd_feature_extractor(feature_extractor_config,
feature_extractor_config.fpn.additional_layer_depth,
})
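# BiFPN-specific hyperparameters, forwarded to the Keras feature extractor
# when the feature extractor config carries a `bifpn` block.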
if feature_extractor_config.HasField('bifpn'):
kwargs.update({
'bifpn_min_level': feature_extractor_config.bifpn.min_level,
'bifpn_max_level': feature_extractor_config.bifpn.max_level,
'bifpn_num_iterations': feature_extractor_config.bifpn.num_iterations,
'bifpn_num_filters': feature_extractor_config.bifpn.num_filters,
'bifpn_combine_method': feature_extractor_config.bifpn.combine_method,
})
return feature_extractor_class(**kwargs)
......@@ -843,6 +870,22 @@ def mask_proto_to_params(mask_config):
heatmap_bias_init=mask_config.heatmap_bias_init)
def densepose_proto_to_params(densepose_config):
"""Converts CenterNet.DensePoseEstimation proto to parameter namedtuple."""
classification_loss, localization_loss, _, _, _, _, _ = (
losses_builder.build(densepose_config.loss))
return center_net_meta_arch.DensePoseParams(
class_id=densepose_config.class_id,
classification_loss=classification_loss,
localization_loss=localization_loss,
part_loss_weight=densepose_config.part_loss_weight,
coordinate_loss_weight=densepose_config.coordinate_loss_weight,
num_parts=densepose_config.num_parts,
task_loss_weight=densepose_config.task_loss_weight,
upsample_to_input_res=densepose_config.upsample_to_input_res,
heatmap_bias_init=densepose_config.heatmap_bias_init)
def _build_center_net_model(center_net_config, is_training, add_summaries):
"""Build a CenterNet detection model.
......@@ -895,6 +938,11 @@ def _build_center_net_model(center_net_config, is_training, add_summaries):
if center_net_config.HasField('mask_estimation_task'):
mask_params = mask_proto_to_params(center_net_config.mask_estimation_task)
densepose_params = None
if center_net_config.HasField('densepose_estimation_task'):
densepose_params = densepose_proto_to_params(
center_net_config.densepose_estimation_task)
return center_net_meta_arch.CenterNetMetaArch(
is_training=is_training,
add_summaries=add_summaries,
......@@ -904,7 +952,8 @@ def _build_center_net_model(center_net_config, is_training, add_summaries):
object_center_params=object_center_params,
object_detection_params=object_detection_params,
keypoint_params_dict=keypoint_params_dict,
mask_params=mask_params)
mask_params=mask_params,
densepose_params=densepose_params)
def _build_center_net_feature_extractor(
......