Unverified Commit 78ddf6eb authored by cclauss's avatar cclauss Committed by GitHub

Merge branch 'master' into patch-6

parents 50cb0365 1f34fcaf
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Downloads pretrained InceptionV3 and ResnetV2-50 checkpoints."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tarfile
try:
  from urllib.request import urlretrieve  # Python 3.
except ImportError:
  from urllib import urlretrieve  # Python 2.
INCEPTION_URL = 'http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz'
RESNET_URL = 'http://download.tensorflow.org/models/resnet_v2_50_2017_04_14.tar.gz'
def DownloadWeights(model_dir, url):
os.makedirs(model_dir)
tar_path = os.path.join(model_dir, 'ckpt.tar.gz')
urlretrieve(url, tar_path)
tar = tarfile.open(tar_path)
tar.extractall(model_dir)
tar.close()
if __name__ == '__main__':
# Create a directory for all pretrained checkpoints.
ckpt_dir = 'pretrained_checkpoints'
if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir)
# Download inception.
print('Downloading inception pretrained weights...')
inception_dir = os.path.join(ckpt_dir, 'inception')
DownloadWeights(inception_dir, INCEPTION_URL)
print('Done downloading inception pretrained weights.')
print('Downloading resnet pretrained weights...')
resnet_dir = os.path.join(ckpt_dir, 'resnet')
DownloadWeights(resnet_dir, RESNET_URL)
print('Done downloading resnet pretrained weights.')
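# After this script finishes, the pretrained weights live under
# pretrained_checkpoints/inception and pretrained_checkpoints/resnet. The
# exact checkpoint file names inside those directories (e.g.
# inception_v3.ckpt) come from the TF-Slim tarballs, not from this script.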
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base estimator defining TCN training, test, and inference."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from abc import ABCMeta
from abc import abstractmethod
import os
import numpy as np
import data_providers
import preprocessing
from utils import util
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.tpu.python.tpu import tpu_config
from tensorflow.contrib.tpu.python.tpu import tpu_estimator
from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
from tensorflow.python.training import session_run_hook
tf.app.flags.DEFINE_integer(
'tf_random_seed', 0, 'Random seed.')
FLAGS = tf.app.flags.FLAGS
class InitFromPretrainedCheckpointHook(session_run_hook.SessionRunHook):
"""Hook that can init graph from a pretrained checkpoint."""
def __init__(self, pretrained_checkpoint_dir):
"""Initializes a `InitFromPretrainedCheckpointHook`.
Args:
pretrained_checkpoint_dir: The dir of pretrained checkpoint.
Raises:
ValueError: If pretrained_checkpoint_dir is invalid.
"""
if pretrained_checkpoint_dir is None:
raise ValueError('pretrained_checkpoint_dir must be specified.')
self._pretrained_checkpoint_dir = pretrained_checkpoint_dir
def begin(self):
checkpoint_reader = tf.contrib.framework.load_checkpoint(
self._pretrained_checkpoint_dir)
variable_shape_map = checkpoint_reader.get_variable_to_shape_map()
exclude_scopes = 'logits/,final_layer/,aux_'
# Skip restoring global_step so that fine-tuning starts from step=0.
exclusions = ['global_step']
if exclude_scopes:
exclusions.extend([scope.strip() for scope in exclude_scopes.split(',')])
variable_to_restore = tf.contrib.framework.get_model_variables()
# Variable filtering by given exclude_scopes.
filtered_variables_to_restore = {}
for v in variable_to_restore:
excluded = False
for exclusion in exclusions:
if v.name.startswith(exclusion):
excluded = True
break
if not excluded:
var_name = v.name.split(':')[0]
filtered_variables_to_restore[var_name] = v
# Final filter by checking shape matching and skipping variables that
# are not in the checkpoint.
final_variables_to_restore = {}
for var_name, var_tensor in filtered_variables_to_restore.items():
if var_name not in variable_shape_map:
# Try moving average version of variable.
var_name = os.path.join(var_name, 'ExponentialMovingAverage')
if var_name not in variable_shape_map:
tf.logging.info(
'Skip init [%s] because it is not in ckpt.', var_name)
# Skip variables not in the checkpoint.
continue
if not var_tensor.get_shape().is_compatible_with(
variable_shape_map[var_name]):
# Skip initializing a variable from the ckpt if the shapes mismatch.
tf.logging.info(
'Skip init [%s] from [%s] in ckpt because of shape mismatch: %s vs %s',
var_tensor.name, var_name,
var_tensor.get_shape(), variable_shape_map[var_name])
continue
tf.logging.info('Init %s from %s in ckpt' % (var_tensor, var_name))
final_variables_to_restore[var_name] = var_tensor
self._init_fn = tf.contrib.framework.assign_from_checkpoint_fn(
self._pretrained_checkpoint_dir,
final_variables_to_restore)
def after_create_session(self, session, coord):
tf.logging.info('Restoring InceptionV3 weights.')
self._init_fn(session)
tf.logging.info('Done restoring InceptionV3 weights.')
class BaseEstimator(object):
"""Abstract TCN base estimator class."""
__metaclass__ = ABCMeta
def __init__(self, config, logdir):
"""Constructor.
Args:
config: A Luatable-like T object holding training config.
logdir: String, a directory where checkpoints and summaries are written.
"""
self._config = config
self._logdir = logdir
@abstractmethod
def construct_input_fn(self, records, is_training):
"""Builds an estimator input_fn.
The input_fn is used to pass feature and target data to the train,
evaluate, and predict methods of the Estimator.
Method to be overridden by implementations.
Args:
records: A list of Strings, paths to TFRecords with image data.
is_training: Boolean, whether or not we're training.
Returns:
Function, that has signature of ()->(dict of features, target).
features is a dict mapping feature names to `Tensors`
containing the corresponding feature data (typically, just a single
key/value pair 'raw_data' -> image `Tensor`) for TCN.
labels is a 1-D int32 `Tensor` holding labels.
"""
pass
def preprocess_data(self, images, is_training):
"""Preprocesses raw images for either training or inference.
Args:
images: A 4-D float32 `Tensor` holding images to preprocess.
is_training: Boolean, whether or not we're in training.
Returns:
data_preprocessed: data after the preprocessor.
"""
config = self._config
height = config.data.height
width = config.data.width
min_scale = config.data.augmentation.minscale
max_scale = config.data.augmentation.maxscale
p_scale_up = config.data.augmentation.proportion_scaled_up
aug_color = config.data.augmentation.color
fast_mode = config.data.augmentation.fast_mode
crop_strategy = config.data.preprocessing.eval_cropping
preprocessed_images = preprocessing.preprocess_images(
images, is_training, height, width,
min_scale, max_scale, p_scale_up,
aug_color=aug_color, fast_mode=fast_mode,
crop_strategy=crop_strategy)
return preprocessed_images
@abstractmethod
def forward(self, images, is_training, reuse=False):
"""Defines the forward pass that converts batch images to embeddings.
Method to be overridden by implementations.
Args:
images: A 4-D float32 `Tensor` holding images to be embedded.
is_training: Boolean, whether or not we're in training mode.
reuse: Boolean, whether or not to reuse embedder.
Returns:
embeddings: A 2-D float32 `Tensor` holding embedded images.
"""
pass
@abstractmethod
def define_loss(self, embeddings, labels, is_training):
"""Defines the loss function on the embedding vectors.
Method to be overridden by implementations.
Args:
embeddings: A 2-D float32 `Tensor` holding embedded images.
labels: A 1-D int32 `Tensor` holding problem labels.
is_training: Boolean, whether or not we're in training mode.
Returns:
loss: tf.float32 scalar.
"""
pass
@abstractmethod
def define_eval_metric_ops(self):
"""Defines the dictionary of eval metric tensors.
Method to be overridden by implementations.
Returns:
eval_metric_ops: A dict of name/value pairs specifying the
metrics that will be calculated when the model runs in EVAL mode.
"""
pass
def get_train_op(self, loss):
"""Creates a training op.
Args:
loss: A float32 `Tensor` representing the total training loss.
Returns:
train_op: A slim.learning.create_train_op train_op.
Raises:
ValueError: If specified optimizer isn't supported.
"""
# Get variables to train (defined in subclass).
assert self.variables_to_train
# Read learning rate schedule parameters from the config.
decay_steps = self._config.learning.decay_steps
decay_factor = self._config.learning.decay_factor
learning_rate = float(self._config.learning.learning_rate)
# Define a learning rate schedule.
global_step = slim.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(
learning_rate,
global_step,
decay_steps,
decay_factor,
staircase=True)
# Create an optimizer.
opt_type = self._config.learning.optimizer
if opt_type == 'adam':
opt = tf.train.AdamOptimizer(learning_rate)
elif opt_type == 'momentum':
opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
elif opt_type == 'rmsprop':
opt = tf.train.RMSPropOptimizer(learning_rate, momentum=0.9,
epsilon=1.0, decay=0.9)
else:
raise ValueError('Unsupported optimizer %s' % opt_type)
if self._config.use_tpu:
opt = tpu_optimizer.CrossShardOptimizer(opt)
# Create a training op.
train_op = slim.learning.create_train_op(
loss,
optimizer=opt,
variables_to_train=self.variables_to_train,
update_ops=tf.get_collection(tf.GraphKeys.UPDATE_OPS))
return train_op
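# Worked example of the exponential decay schedule above (illustrative
# values, not from any shipped config): with learning_rate=0.001,
# decay_steps=10000 and decay_factor=0.94, staircase=True yields
#   lr(step) = 0.001 * 0.94 ** floor(step / 10000)
# i.e. the rate drops in discrete jumps every decay_steps training steps.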
def _get_model_fn(self):
"""Defines behavior for training, evaluation, and inference (prediction).
Returns:
`model_fn` for `Estimator`.
"""
# pylint: disable=unused-argument
def model_fn(features, labels, mode, params):
"""Build the model based on features, labels, and mode.
Args:
features: Dict, strings to `Tensor` input data, returned by the
input_fn.
labels: The labels Tensor returned by the input_fn.
mode: A string indicating the mode. This will be either
tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.PREDICT,
or tf.estimator.ModeKeys.EVAL.
params: A dict holding training parameters, passed in during TPU
training.
Returns:
A tf.estimator.EstimatorSpec specifying train/test/inference behavior.
"""
is_training = mode == tf.estimator.ModeKeys.TRAIN
# Get preprocessed images from the features dict.
batch_preprocessed = features['batch_preprocessed']
# Do a forward pass to embed data.
batch_encoded = self.forward(batch_preprocessed, is_training)
# Optionally set the pretrained initialization function.
initializer_fn = None
if mode == tf.estimator.ModeKeys.TRAIN:
initializer_fn = self.pretrained_init_fn
# If we're training or evaluating, define total loss.
total_loss = None
if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
loss = self.define_loss(batch_encoded, labels, is_training)
tf.losses.add_loss(loss)
total_loss = tf.losses.get_total_loss()
# If we're training, define a train op.
train_op = None
if mode == tf.estimator.ModeKeys.TRAIN:
train_op = self.get_train_op(total_loss)
# If we're doing inference, set the output to be the embedded images.
predictions_dict = None
if mode == tf.estimator.ModeKeys.PREDICT:
predictions_dict = {'embeddings': batch_encoded}
# Pass through additional metadata stored in features.
for k, v in features.items():
predictions_dict[k] = v
# If we're evaluating, define some eval metrics.
eval_metric_ops = None
if mode == tf.estimator.ModeKeys.EVAL:
eval_metric_ops = self.define_eval_metric_ops()
# Define training scaffold to load pretrained weights.
num_checkpoint_to_keep = self._config.logging.checkpoint.num_to_keep
saver = tf.train.Saver(
max_to_keep=num_checkpoint_to_keep)
if is_training and self._config.use_tpu:
# TPU doesn't have a scaffold option at the moment, so initialize
# pretrained weights using a custom train_hook instead.
return tpu_estimator.TPUEstimatorSpec(
mode,
loss=total_loss,
eval_metrics=None,
train_op=train_op,
predictions=predictions_dict)
else:
# Build a scaffold to initialize pretrained weights.
scaffold = tf.train.Scaffold(
init_fn=initializer_fn,
saver=saver,
summary_op=None)
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions_dict,
loss=total_loss,
train_op=train_op,
eval_metric_ops=eval_metric_ops,
scaffold=scaffold)
return model_fn
def train(self):
"""Runs training."""
# Get a list of training tfrecords.
config = self._config
training_dir = config.data.training
training_records = util.GetFilesRecursively(training_dir)
# Define batch size.
self._batch_size = config.data.batch_size
# Create a subclass-defined training input function.
train_input_fn = self.construct_input_fn(
training_records, is_training=True)
# Create the estimator.
estimator = self._build_estimator(is_training=True)
train_hooks = None
if config.use_tpu:
# TPU training initializes pretrained weights using a custom train hook.
train_hooks = []
if tf.train.latest_checkpoint(self._logdir) is None:
train_hooks.append(
InitFromPretrainedCheckpointHook(
config[config.embedder_strategy].pretrained_checkpoint))
# Run training.
estimator.train(input_fn=train_input_fn, hooks=train_hooks,
steps=config.learning.max_step)
def _build_estimator(self, is_training):
"""Returns an Estimator object.
Args:
is_training: Boolean, whether or not we're in training mode.
Returns:
A tf.estimator.Estimator.
"""
config = self._config
save_checkpoints_steps = config.logging.checkpoint.save_checkpoints_steps
keep_checkpoint_max = self._config.logging.checkpoint.num_to_keep
if is_training and config.use_tpu:
iterations = config.tpu.iterations
num_shards = config.tpu.num_shards
run_config = tpu_config.RunConfig(
save_checkpoints_secs=None,
save_checkpoints_steps=save_checkpoints_steps,
keep_checkpoint_max=keep_checkpoint_max,
master=FLAGS.master,
evaluation_master=FLAGS.master,
model_dir=self._logdir,
tpu_config=tpu_config.TPUConfig(
iterations_per_loop=iterations,
num_shards=num_shards,
per_host_input_for_training=num_shards <= 8),
tf_random_seed=FLAGS.tf_random_seed)
batch_size = config.data.batch_size
return tpu_estimator.TPUEstimator(
model_fn=self._get_model_fn(),
config=run_config,
use_tpu=True,
train_batch_size=batch_size,
eval_batch_size=batch_size)
else:
run_config = tf.estimator.RunConfig().replace(
model_dir=self._logdir,
save_checkpoints_steps=save_checkpoints_steps,
keep_checkpoint_max=keep_checkpoint_max,
tf_random_seed=FLAGS.tf_random_seed)
return tf.estimator.Estimator(
model_fn=self._get_model_fn(),
config=run_config)
def evaluate(self):
"""Runs `Estimator` validation.
"""
config = self._config
# Get a list of validation tfrecords.
validation_dir = config.data.validation
validation_records = util.GetFilesRecursively(validation_dir)
# Define batch size.
self._batch_size = config.data.batch_size
# Create a subclass-defined validation input function.
validation_input_fn = self.construct_input_fn(
validation_records, False)
# Create the estimator.
estimator = self._build_estimator(is_training=False)
# Run validation.
eval_batch_size = config.data.batch_size
num_eval_samples = config.val.num_eval_samples
num_eval_batches = int(num_eval_samples / eval_batch_size)
estimator.evaluate(input_fn=validation_input_fn, steps=num_eval_batches)
def inference(
self, inference_input, checkpoint_path, batch_size=None, **kwargs):
"""Defines 3 of modes of inference.
Inputs:
* Mode 1: Input is an input_fn.
* Mode 2: Input is a TFRecord (or list of TFRecords).
* Mode 3: Input is a numpy array holding an image (or array of images).
Outputs:
* Mode 1: this returns an iterator over embeddings and additional
metadata. See
https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator#predict
for details.
* Mode 2: Returns an iterator over tuples of
(embeddings, raw_image_strings, sequence_name), where embeddings is a
2-D float32 numpy array holding [sequence_size, embedding_size] image
embeddings, raw_image_strings is a 1-D string numpy array holding
[sequence_size] jpeg-encoded image strings, and sequence_name is a
string holding the name of the embedded sequence.
* Mode 3: Returns a tuple of (embeddings, raw_image_strings), where
embeddings is a 2-D float32 numpy array holding
[batch_size, embedding_size] image embeddings, raw_image_strings is a
1-D string numpy array holding [batch_size] jpeg-encoded image strings.
Args:
inference_input: This can be a tf.Estimator input_fn, a TFRecord path,
a list of TFRecord paths, a numpy image, or an array of numpy images.
checkpoint_path: String, path to the checkpoint to restore for inference.
batch_size: Int, the size of the batch to use for inference.
**kwargs: Additional keyword arguments, depending on the mode.
See _input_fn_inference, _tfrecord_inference, and _np_inference.
Returns:
inference_output: Inference output depending on mode, see above for
details.
Raises:
ValueError: If inference_input isn't a tf.Estimator input_fn,
a TFRecord path, a list of TFRecord paths, or a numpy array.
"""
# Mode 1: input is a callable tf.Estimator input_fn.
if callable(inference_input):
return self._input_fn_inference(
input_fn=inference_input, checkpoint_path=checkpoint_path, **kwargs)
# Mode 2: Input is a TFRecord path (or list of TFRecord paths).
elif util.is_tfrecord_input(inference_input):
return self._tfrecord_inference(
records=inference_input, checkpoint_path=checkpoint_path,
batch_size=batch_size, **kwargs)
# Mode 3: Input is a numpy array of raw images.
elif util.is_np_array(inference_input):
return self._np_inference(
np_images=inference_input, checkpoint_path=checkpoint_path, **kwargs)
else:
raise ValueError(
'inference input must be a tf.Estimator input_fn, a TFRecord path, '
'a list of TFRecord paths, or a numpy array. Got: %s' % str(type(
inference_input)))
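# Hedged usage sketch for Mode 3 above (the estimator instance, the image
# variable and the checkpoint path are placeholders, not part of this module):
#   embeddings, raw_jpegs = estimator.inference(
#       frame,  # float32 numpy image in [0, 1], shape [height, width, 3].
#       checkpoint_path='/tmp/tcn/model.ckpt-100000')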
def _input_fn_inference(self, input_fn, checkpoint_path, predict_keys=None):
"""Mode 1: tf.Estimator inference.
Args:
input_fn: Function, that has signature of ()->(dict of features, None).
This is a function called by the estimator to get input tensors (stored
in the features dict) to do inference over.
checkpoint_path: String, path to a specific checkpoint to restore.
predict_keys: List of strings, the keys of the `Tensors` in the features
dict (returned by the input_fn) to evaluate during inference.
Returns:
predictions: An Iterator, yielding evaluated values of `Tensors`
specified in `predict_keys`.
"""
# Create the estimator.
estimator = self._build_estimator(is_training=False)
# Create an iterator of predicted embeddings.
predictions = estimator.predict(input_fn=input_fn,
checkpoint_path=checkpoint_path,
predict_keys=predict_keys)
return predictions
def _tfrecord_inference(self, records, checkpoint_path, batch_size,
num_sequences=-1, reuse=False):
"""Mode 2: TFRecord inference.
Args:
records: List of strings, paths to TFRecords.
checkpoint_path: String, path to a specific checkpoint to restore.
batch_size: Int, size of inference batch.
num_sequences: Int, number of sequences to embed. If -1,
embed everything.
reuse: Boolean, whether or not to reuse embedder weights.
Yields:
(embeddings, raw_image_strings, sequence_name):
embeddings is a 2-D float32 numpy array holding
[sequence_size, embedding_size] image embeddings.
raw_image_strings is a 1-D string numpy array holding
[sequence_size] jpeg-encoded image strings.
sequence_name is a string holding the name of the embedded sequence.
"""
tf.reset_default_graph()
if not isinstance(records, list):
records = [records]
# Map the list of tfrecords to a dataset of preprocessed images.
num_views = self._config.data.num_views
(views, task, seq_len) = data_providers.full_sequence_provider(
records, num_views)
tensor_dict = {
'raw_image_strings': views,
'task': task,
'seq_len': seq_len
}
# Create a preprocess function over raw image string placeholders.
image_str_placeholder = tf.placeholder(tf.string, shape=[None])
decoded = preprocessing.decode_images(image_str_placeholder)
decoded.set_shape([batch_size, None, None, 3])
preprocessed = self.preprocess_data(decoded, is_training=False)
# Create an inference graph over preprocessed images.
embeddings = self.forward(preprocessed, is_training=False, reuse=reuse)
# Create a saver to restore model variables.
tf.train.get_or_create_global_step()
saver = tf.train.Saver(tf.all_variables())
# Create a session and restore model variables.
with tf.train.MonitoredSession() as sess:
saver.restore(sess, checkpoint_path)
cnt = 0
# If num_sequences is specified, embed that many sequences, else embed
# everything.
try:
while cnt < num_sequences if num_sequences != -1 else True:
# Get a preprocessed image sequence.
np_data = sess.run(tensor_dict)
np_raw_images = np_data['raw_image_strings']
np_seq_len = np_data['seq_len']
np_task = np_data['task']
# Embed each view.
embedding_size = self._config.embedding_size
view_embeddings = [
np.zeros((0, embedding_size)) for _ in range(num_views)]
for view_index in range(num_views):
view_raw = np_raw_images[view_index]
# Embed the full sequence.
t = 0
while t < np_seq_len:
# Decode and preprocess the batch of image strings.
embeddings_np = sess.run(
embeddings, feed_dict={
image_str_placeholder: view_raw[t:t+batch_size]})
view_embeddings[view_index] = np.append(
view_embeddings[view_index], embeddings_np, axis=0)
tf.logging.info('Embedded %d images for task %s' % (t, np_task))
t += batch_size
# Done embedding for all views.
view_raw_images = np_data['raw_image_strings']
yield (view_embeddings, view_raw_images, np_task)
cnt += 1
except tf.errors.OutOfRangeError:
tf.logging.info('Done embedding entire dataset.')
def _np_inference(self, np_images, checkpoint_path):
"""Mode 3: Call this repeatedly to do inference over numpy images.
This mode is for when we want to do real-time inference over
some stream of images (represented as numpy arrays).
Args:
np_images: A float32 numpy array holding images to embed.
checkpoint_path: String, path to a specific checkpoint to restore.
Returns:
(embeddings, raw_image_strings):
embeddings is a 2-D float32 numpy array holding
[inferred batch_size, embedding_size] image embeddings.
raw_image_strings is a 1-D string numpy array holding
[inferred batch_size] jpeg-encoded image strings.
"""
if isinstance(np_images, list):
np_images = np.asarray(np_images)
# Add a batch dimension if only 3-dimensional.
if len(np_images.shape) == 3:
np_images = np.expand_dims(np_images, axis=0)
# If np_images are in the range [0, 255], rescale to [0, 1].
assert np.min(np_images) >= 0.
if np.max(np_images) > 1.0:
np_images = np_images.astype(np.float32) / 255.
assert np.max(np_images) <= 1.0
# If this is the first pass, set up inference graph.
if not hasattr(self, '_np_inf_tensor_dict'):
self._setup_np_inference(np_images, checkpoint_path)
# Convert np_images to embeddings.
np_tensor_dict = self._sess.run(self._np_inf_tensor_dict, feed_dict={
self._image_placeholder: np_images
})
return np_tensor_dict['embeddings'], np_tensor_dict['raw_image_strings']
def _setup_np_inference(self, np_images, checkpoint_path):
"""Sets up and restores inference graph, creates and caches a Session."""
tf.logging.info('Restoring model weights.')
# Define inference over an image placeholder.
_, height, width, _ = np.shape(np_images)
image_placeholder = tf.placeholder(
tf.float32, shape=(None, height, width, 3))
# Preprocess batch.
preprocessed = self.preprocess_data(image_placeholder, is_training=False)
# Unscale and jpeg encode preprocessed images for display purposes.
im_strings = preprocessing.unscale_jpeg_encode(preprocessed)
# Do forward pass to get embeddings.
embeddings = self.forward(preprocessed, is_training=False)
# Create a saver to restore model variables.
tf.train.get_or_create_global_step()
saver = tf.train.Saver(tf.all_variables())
self._image_placeholder = image_placeholder
self._batch_encoded = embeddings
self._np_inf_tensor_dict = {
'embeddings': embeddings,
'raw_image_strings': im_strings,
}
# Create a session and restore model variables.
self._sess = tf.Session()
saver.restore(self._sess, checkpoint_path)
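# Note: _setup_np_inference caches the session, placeholder and output
# tensors on the estimator, so repeated _np_inference calls reuse one graph.
# The placeholder height/width are fixed by the first batch of images passed
# in, so later calls must use images of the same spatial size.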
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Get a configured estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from estimators import mvtcn_estimator as mvtcn_estimators
from estimators import svtcn_estimator
def get_mvtcn_estimator(loss_strategy, config, logdir):
"""Returns a configured MVTCN estimator."""
loss_to_trainer = {
'triplet_semihard': mvtcn_estimators.MVTCNTripletEstimator,
'npairs': mvtcn_estimators.MVTCNNpairsEstimator,
}
if loss_strategy not in loss_to_trainer:
raise ValueError('Unknown loss for MVTCN: %s' % loss_strategy)
estimator = loss_to_trainer[loss_strategy](config, logdir)
return estimator
def get_estimator(config, logdir):
"""Returns an unsupervised model trainer based on config.
Args:
config: A T object holding training configs.
logdir: String, path to directory where model checkpoints and summaries
are saved.
Returns:
estimator: A configured `TCNEstimator` object.
Raises:
ValueError: If unknown training strategy is specified.
"""
# Get the training strategy.
training_strategy = config.training_strategy
if training_strategy == 'mvtcn':
loss_strategy = config.loss_strategy
estimator = get_mvtcn_estimator(
loss_strategy, config, logdir)
elif training_strategy == 'svtcn':
estimator = svtcn_estimator.SVTCNTripletEstimator(config, logdir)
else:
raise ValueError('Unknown training strategy: %s' % training_strategy)
return estimator
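# Hedged usage sketch (mirrors how the eval binary below calls this module;
# the logdir value is a placeholder):
#   config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
#   estimator = get_estimator(config, logdir='/tmp/tcn')
#   estimator.train()  # or estimator.evaluate()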
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""MVTCN trainer implementations with various metric learning losses."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import data_providers
import model as model_module
from estimators import base_estimator
import tensorflow as tf
class MVTCNEstimator(base_estimator.BaseEstimator):
"""Multi-view TCN base class."""
def __init__(self, config, logdir):
super(MVTCNEstimator, self).__init__(config, logdir)
def _pairs_provider(self, records, is_training):
config = self._config
num_views = config.data.num_views
window = config.mvtcn.window
num_parallel_calls = config.data.num_parallel_calls
sequence_prefetch_size = config.data.sequence_prefetch_size
batch_prefetch_size = config.data.batch_prefetch_size
examples_per_seq = config.data.examples_per_sequence
return functools.partial(
data_providers.multiview_pairs_provider,
file_list=records,
preprocess_fn=self.preprocess_data,
num_views=num_views,
window=window,
is_training=is_training,
examples_per_seq=examples_per_seq,
num_parallel_calls=num_parallel_calls,
sequence_prefetch_size=sequence_prefetch_size,
batch_prefetch_size=batch_prefetch_size)
def forward(self, images_concat, is_training, reuse=False):
"""See base class."""
embedder_strategy = self._config.embedder_strategy
loss_strategy = self._config.loss_strategy
l2_normalize_embedding = self._config[loss_strategy].embedding_l2
embedder = model_module.get_embedder(
embedder_strategy,
self._config,
images_concat,
is_training=is_training,
l2_normalize_embedding=l2_normalize_embedding, reuse=reuse)
embeddings_concat = embedder.construct_embedding()
variables_to_train = embedder.get_trainable_variables()
self.variables_to_train = variables_to_train
self.pretrained_init_fn = embedder.init_fn
return embeddings_concat
def _collect_image_summaries(self, anchor_images, positive_images,
images_concat):
image_summaries = self._config.logging.summary.image_summaries
if image_summaries and not self._config.use_tpu:
batch_pairs_summary = tf.concat(
[anchor_images, positive_images], axis=2)
tf.summary.image('training/mvtcn_pairs', batch_pairs_summary)
tf.summary.image('training/images_preprocessed_concat', images_concat)
class MVTCNTripletEstimator(MVTCNEstimator):
"""Multi-View TCN with semihard triplet loss."""
def __init__(self, config, logdir):
super(MVTCNTripletEstimator, self).__init__(config, logdir)
def construct_input_fn(self, records, is_training):
"""See base class."""
def input_fn(params):
"""Provides input to MVTCN models."""
if is_training and self._config.use_tpu:
batch_size = params['batch_size']
else:
batch_size = self._batch_size
(images_concat,
anchor_labels,
positive_labels,
anchor_images,
positive_images) = self._pairs_provider(
records, is_training)(batch_size=batch_size)
if is_training:
self._collect_image_summaries(anchor_images, positive_images,
images_concat)
labels = tf.concat([anchor_labels, positive_labels], axis=0)
features = {'batch_preprocessed': images_concat}
return (features, labels)
return input_fn
def define_loss(self, embeddings, labels, is_training):
"""See base class."""
margin = self._config.triplet_semihard.margin
loss = tf.contrib.losses.metric_learning.triplet_semihard_loss(
labels=labels, embeddings=embeddings, margin=margin)
self._loss = loss
if is_training and not self._config.use_tpu:
tf.summary.scalar('training/triplet_semihard', loss)
return loss
def define_eval_metric_ops(self):
"""See base class."""
return {'validation/triplet_semihard': tf.metrics.mean(self._loss)}
class MVTCNNpairsEstimator(MVTCNEstimator):
"""Multi-View TCN with npairs loss."""
def __init__(self, config, logdir):
super(MVTCNNpairsEstimator, self).__init__(config, logdir)
def construct_input_fn(self, records, is_training):
"""See base class."""
def input_fn(params):
"""Provides input to MVTCN models."""
if is_training and self._config.use_tpu:
batch_size = params['batch_size']
else:
batch_size = self._batch_size
(images_concat,
npairs_labels,
_,
anchor_images,
positive_images) = self._pairs_provider(
records, is_training)(batch_size=batch_size)
if is_training:
self._collect_image_summaries(anchor_images, positive_images,
images_concat)
features = {'batch_preprocessed': images_concat}
return (features, npairs_labels)
return input_fn
def define_loss(self, embeddings, labels, is_training):
"""See base class."""
embeddings_anchor, embeddings_positive = tf.split(embeddings, 2, axis=0)
loss = tf.contrib.losses.metric_learning.npairs_loss(
labels=labels, embeddings_anchor=embeddings_anchor,
embeddings_positive=embeddings_positive)
self._loss = loss
if is_training and not self._config.use_tpu:
tf.summary.scalar('training/npairs', loss)
return loss
def define_eval_metric_ops(self):
"""See base class."""
return {'validation/npairs': tf.metrics.mean(self._loss)}
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SVTCN estimator implementation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import data_providers
import model as model_module
from estimators import base_estimator
from estimators import svtcn_loss
import tensorflow as tf
class SVTCNEstimator(base_estimator.BaseEstimator):
"""Single-view TCN Estimator base class."""
def __init__(self, config, logdir):
super(SVTCNEstimator, self).__init__(config, logdir)
def construct_input_fn(self, records, is_training):
"""See base class."""
config = self._config
num_views = config.data.num_views
num_parallel_calls = config.data.num_parallel_calls
sequence_prefetch_size = config.data.sequence_prefetch_size
batch_prefetch_size = config.data.batch_prefetch_size
def input_fn():
"""Provides input to SVTCN models."""
(images_preprocessed,
images_raw,
timesteps) = data_providers.singleview_tcn_provider(
file_list=records,
preprocess_fn=self.preprocess_data,
num_views=num_views,
is_training=is_training,
batch_size=self._batch_size,
num_parallel_calls=num_parallel_calls,
sequence_prefetch_size=sequence_prefetch_size,
batch_prefetch_size=batch_prefetch_size)
if config.logging.summary.image_summaries and is_training:
tf.summary.image('training/svtcn_images', images_raw)
features = {'batch_preprocessed': images_preprocessed}
return (features, timesteps)
return input_fn
def forward(self, images, is_training, reuse=False):
"""See base class."""
embedder_strategy = self._config.embedder_strategy
embedder = model_module.get_embedder(
embedder_strategy,
self._config,
images,
is_training=is_training, reuse=reuse)
embeddings = embedder.construct_embedding()
if is_training:
self.variables_to_train = embedder.get_trainable_variables()
self.pretrained_init_fn = embedder.init_fn
return embeddings
class SVTCNTripletEstimator(SVTCNEstimator):
"""Single-View TCN with semihard triplet loss."""
def __init__(self, config, logdir):
super(SVTCNTripletEstimator, self).__init__(config, logdir)
def define_loss(self, embeddings, timesteps, is_training):
"""See base class."""
pos_radius = self._config.svtcn.pos_radius
neg_radius = self._config.svtcn.neg_radius
margin = self._config.triplet_semihard.margin
loss = svtcn_loss.singleview_tcn_loss(
embeddings, timesteps, pos_radius, neg_radius, margin=margin)
self._loss = loss
if is_training:
tf.summary.scalar('training/svtcn_loss', loss)
return loss
def define_eval_metric_ops(self):
"""See base class."""
return {'validation/svtcn_loss': tf.metrics.mean(self._loss)}
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This implements single view TCN triplet loss."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def pairwise_squared_distance(feature):
"""Computes the squared pairwise distance matrix.
output[i, j] = || feature[i, :] - feature[j, :] ||_2^2
Args:
feature: 2-D Tensor of size [number of data, feature dimension]
Returns:
pairwise_squared_distances: 2-D Tensor of size
[number of data, number of data]
"""
pairwise_squared_distances = tf.add(
tf.reduce_sum(
tf.square(feature), axis=1, keep_dims=True),
tf.reduce_sum(
tf.square(tf.transpose(feature)), axis=0,
keep_dims=True)) - 2.0 * tf.matmul(feature, tf.transpose(feature))
# Deal with numerical inaccuracies. Set small negatives to zero.
pairwise_squared_distances = tf.maximum(pairwise_squared_distances, 0.0)
return pairwise_squared_distances
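# A minimal NumPy cross-check of the identity used above,
#   ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 * x_i . x_j.
# This helper is purely illustrative and is not called anywhere in the
# library.
def _np_pairwise_squared_distance_example(feature):
  """Computes the same matrix as pairwise_squared_distance, with NumPy."""
  import numpy as np
  sq_norms = np.sum(np.square(feature), axis=1)
  pdist = sq_norms[:, None] + sq_norms[None, :] - 2.0 * np.dot(
      feature, feature.T)
  # Clamp small negatives caused by floating point error, as above.
  return np.maximum(pdist, 0.0)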
def masked_maximum(data, mask, dim=1):
"""Computes the axis wise maximum over chosen elements.
Args:
data: N-D Tensor.
mask: N-D Tensor of zeros or ones.
dim: The dimension over which to compute the maximum.
Returns:
masked_maximums: N-D Tensor.
The maximized dimension is of size 1 after the operation.
"""
axis_minimums = tf.reduce_min(data, dim, keep_dims=True)
masked_maximums = tf.reduce_max(
tf.multiply(
data - axis_minimums, mask), dim, keep_dims=True) + axis_minimums
return masked_maximums
def masked_minimum(data, mask, dim=1):
"""Computes the axis wise minimum over chosen elements.
Args:
data: 2-D Tensor of size [n, m].
mask: 2-D Boolean Tensor of size [n, m].
dim: The dimension over which to compute the minimum.
Returns:
masked_minimums: N-D Tensor.
The minimized dimension is of size 1 after the operation.
"""
axis_maximums = tf.reduce_max(data, dim, keep_dims=True)
masked_minimums = tf.reduce_min(
tf.multiply(
data - axis_maximums, mask), dim, keep_dims=True) + axis_maximums
return masked_minimums
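# Note on the trick used in masked_maximum/masked_minimum above: shifting the
# data by the per-row minimum (resp. maximum) before multiplying by the mask
# makes every unmasked entry non-negative (resp. non-positive), so the
# masked-out positions, which become exactly 0, can never win the reduction;
# adding the shift back afterwards restores the original scale.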
def singleview_tcn_loss(
embeddings, timesteps, pos_radius, neg_radius, margin=1.0,
sequence_ids=None, multiseq=False):
"""Computes the single view triplet loss with semi-hard negative mining.
The loss encourages the positive distances (between a pair of embeddings with
the same labels) to be smaller than the minimum negative distance that is
greater than the positive distance plus the margin (the "semi-hard" negative)
within the mini-batch. If no such negative exists, the largest negative
distance is used instead.
Anchor, positive, negative selection is as follows:
Anchors: We consider every embedding timestep as an anchor.
Positives: pos_radius defines a radius (in timesteps) around each anchor from
which positives can be drawn. E.g. An anchor with t=10 and a pos_radius of
2 produces a set of 4 (anchor,pos) pairs [(a=10, p=8), ... (a=10, p=12)].
Negatives: neg_radius defines a boundary (in timesteps) around each anchor,
outside of which negatives can be drawn. E.g. An anchor with t=10 and a
neg_radius of 4 means negatives can be any t_neg where t_neg < 6 or
t_neg > 14.
Args:
embeddings: 2-D Tensor of embedding vectors.
timesteps: Tensor with shape [batch_size] or [batch_size, 1] of sequence timesteps.
pos_radius: int32; the size of the window (in timesteps) around each anchor
timestep that a positive can be drawn from.
neg_radius: int32; the size of the window (in timesteps) around each anchor
timestep that defines a negative boundary. Negatives can only be chosen
where negative timestep t is < negative boundary min or > negative
boundary max.
margin: Float; the triplet loss margin hyperparameter.
sequence_ids: (Optional) Tensor with shape [batch_size] or [batch_size, 1] of sequence
ids. Together (sequence_id, sequence_timestep) give us a unique index for
each image if we have multiple sequences in a batch.
multiseq: Boolean, whether or not the batch is composed of multiple
sequences (with possibly colliding timesteps).
Returns:
triplet_loss: tf.float32 scalar.
"""
assert neg_radius > pos_radius
# If timesteps shape isn't [batchsize, 1], reshape to [batch_size, 1].
tshape = tf.shape(timesteps)
assert tshape.shape == 2 or tshape.shape == 1
if tshape.shape == 1:
timesteps = tf.reshape(timesteps, [tshape[0], 1])
# Build pairwise squared distance matrix.
pdist_matrix = pairwise_squared_distance(embeddings)
# Build pairwise binary adjacency matrix, where adjacency[i,j] is True
# if timestep j is inside the positive range for timestep i and both
# timesteps come from the same sequence.
pos_radius = tf.cast(pos_radius, tf.int32)
if multiseq:
# If sequence_ids shape isn't [batchsize, 1], reshape to [batch_size, 1].
tshape = tf.shape(sequence_ids)
assert tshape.shape == 2 or tshape.shape == 1
if tshape.shape == 1:
sequence_ids = tf.reshape(sequence_ids, [tshape[0], 1])
# Build pairwise binary adjacency matrix based on sequence_ids
sequence_adjacency = tf.equal(sequence_ids, tf.transpose(sequence_ids))
# Invert so we can select negatives only.
sequence_adjacency_not = tf.logical_not(sequence_adjacency)
in_pos_range = tf.logical_and(
tf.less_equal(
tf.abs(timesteps - tf.transpose(timesteps)), pos_radius),
sequence_adjacency)
# Build pairwise binary discordance matrix, where discordance[i,j] is True
# if timestep j is inside the negative range for timestep i or if the
# timesteps come from different sequences.
in_neg_range = tf.logical_or(
tf.greater(tf.abs(timesteps - tf.transpose(timesteps)), neg_radius),
sequence_adjacency_not
)
else:
in_pos_range = tf.less_equal(
tf.abs(timesteps - tf.transpose(timesteps)), pos_radius)
in_neg_range = tf.greater(tf.abs(timesteps - tf.transpose(timesteps)),
neg_radius)
batch_size = tf.size(timesteps)
# compute the mask
pdist_matrix_tile = tf.tile(pdist_matrix, [batch_size, 1])
mask = tf.logical_and(
tf.tile(in_neg_range, [batch_size, 1]),
tf.greater(pdist_matrix_tile,
tf.reshape(tf.transpose(pdist_matrix), [-1, 1])))
mask_final = tf.reshape(
tf.greater(
tf.reduce_sum(
tf.cast(
mask, dtype=tf.float32), 1, keep_dims=True),
0.0), [batch_size, batch_size])
mask_final = tf.transpose(mask_final)
in_neg_range = tf.cast(in_neg_range, dtype=tf.float32)
mask = tf.cast(mask, dtype=tf.float32)
# negatives_outside: smallest D_an where D_an > D_ap
negatives_outside = tf.reshape(
masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
negatives_outside = tf.transpose(negatives_outside)
# negatives_inside: largest D_an
negatives_inside = tf.tile(
masked_maximum(pdist_matrix, in_neg_range), [1, batch_size])
semi_hard_negatives = tf.where(
mask_final, negatives_outside, negatives_inside)
loss_mat = tf.add(margin, pdist_matrix - semi_hard_negatives)
mask_positives = tf.cast(
in_pos_range, dtype=tf.float32) - tf.diag(tf.ones([batch_size]))
# In lifted-struct, the authors multiply the loss by 0.5 for the upper
# triangular part; in semihard, all positive pairs except the diagonal are
# used.
num_positives = tf.reduce_sum(mask_positives)
triplet_loss = tf.truediv(
tf.reduce_sum(tf.maximum(tf.multiply(loss_mat, mask_positives), 0.0)),
num_positives,
name='triplet_svtcn_loss')
return triplet_loss
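# Illustrative helper (not used by the library): builds the positive/negative
# range masks described in the singleview_tcn_loss docstring with NumPy.
# For timesteps [0..9] with pos_radius=2 and neg_radius=4, row i of
# in_pos_range marks timesteps within 2 steps of i, and row i of in_neg_range
# marks timesteps more than 4 steps away from i.
def _np_svtcn_range_masks_example(timesteps, pos_radius, neg_radius):
  import numpy as np
  t = np.asarray(timesteps).reshape([-1, 1])
  abs_dt = np.abs(t - t.T)
  in_pos_range = abs_dt <= pos_radius  # Candidate positives per anchor row.
  in_neg_range = abs_dt > neg_radius  # Candidate negatives per anchor row.
  return in_pos_range, in_neg_range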
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for svtcn_loss.py."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from estimators import svtcn_loss
import tensorflow as tf
class SVTCNLoss(tf.test.TestCase):
def testSVTCNLoss(self):
with self.test_session():
num_data = 64
num_sequences = 2
num_data_per_seq = num_data // num_sequences
feat_dim = 6
margin = 1.0
times = np.tile(np.arange(num_data_per_seq, dtype=np.int32),
num_sequences)
times = np.reshape(times, [times.shape[0], 1])
sequence_ids = np.concatenate(
[np.ones(num_data_per_seq)*i for i in range(num_sequences)])
sequence_ids = np.reshape(sequence_ids, [sequence_ids.shape[0], 1])
pos_radius = 6
neg_radius = 12
embedding = np.random.rand(num_data, feat_dim).astype(np.float32)
# Compute the loss in NP
# Get a positive mask, i.e. indices for each time index
# that are inside the positive range.
in_pos_range = np.less_equal(
np.abs(times - times.transpose()), pos_radius)
# Get a negative mask, i.e. indices for each time index
# that are inside the negative range (i.e. > t + neg_radius or
# < t - neg_radius).
in_neg_range = np.greater(np.abs(times - times.transpose()), neg_radius)
sequence_adjacency = sequence_ids == sequence_ids.T
sequence_adjacency_not = np.logical_not(sequence_adjacency)
pdist_matrix = euclidean_distances(embedding, squared=True)
loss_np = 0.0
num_positives = 0.0
for i in range(num_data):
for j in range(num_data):
if in_pos_range[i, j] and i != j and sequence_adjacency[i, j]:
num_positives += 1.0
pos_distance = pdist_matrix[i][j]
neg_distances = []
for k in range(num_data):
if in_neg_range[i, k] or sequence_adjacency_not[i, k]:
neg_distances.append(pdist_matrix[i][k])
neg_distances.sort() # sort by distance
chosen_neg_distance = neg_distances[0]
for l in range(len(neg_distances)):
chosen_neg_distance = neg_distances[l]
if chosen_neg_distance > pos_distance:
break
loss_np += np.maximum(
0.0, margin - chosen_neg_distance + pos_distance)
loss_np /= num_positives
# Compute the loss in TF
loss_tf = svtcn_loss.singleview_tcn_loss(
embeddings=tf.convert_to_tensor(embedding),
timesteps=tf.convert_to_tensor(times),
pos_radius=pos_radius,
neg_radius=neg_radius,
margin=margin,
sequence_ids=tf.convert_to_tensor(sequence_ids),
multiseq=True
)
loss_tf = loss_tf.eval()
self.assertAllClose(loss_np, loss_tf)
if __name__ == '__main__':
tf.test.main()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Calculates running validation of TCN models (and baseline comparisons)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from estimators.get_estimator import get_estimator
from utils import util
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
Path to YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master to use')
tf.app.flags.DEFINE_string(
'logdir', '/tmp/tcn', 'Directory where to write event logs.')
FLAGS = tf.app.flags.FLAGS
def main(_):
"""Runs main eval loop."""
# Parse config dict from yaml config files / command line flags.
logdir = FLAGS.logdir
config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
# Choose an estimator based on training strategy.
estimator = get_estimator(config, logdir)
# Wait for the first checkpoint file to be written.
while not tf.train.latest_checkpoint(logdir):
tf.logging.info('Waiting for a checkpoint file...')
time.sleep(10)
# Run validation.
while True:
estimator.evaluate()
if __name__ == '__main__':
tf.app.run()
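# Note: the evaluation loop in main() re-runs estimator.evaluate() on the
# latest checkpoint indefinitely; it is meant to run alongside training and
# has to be stopped manually.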
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Generates imitation videos.
Generate single pairwise imitation videos:
blaze build -c opt --config=cuda --copt=-mavx \
learning/brain/research/tcn/generate_videos && \
blaze-bin/learning/brain/research/tcn/generate_videos \
--logtostderr \
--config_paths $config_paths \
--checkpointdir $checkpointdir \
--checkpoint_iter $checkpoint_iter \
--query_records_dir $query_records_dir \
--target_records_dir $target_records_dir \
--outdir $outdir \
--mode single \
--num_query_sequences 1 \
--num_target_sequences -1
Generate imitation videos with multiple sequences in the target set:
blaze build -c opt --config=cuda --copt=-mavx \
learning/brain/research/tcn/generate_videos && \
blaze-bin/learning/brain/research/tcn/generate_videos \
--logtostderr \
--config_paths $config_paths \
--checkpointdir $checkpointdir \
--checkpoint_iter $checkpoint_iter \
--query_records_dir $query_records_dir \
--target_records_dir $target_records_dir \
--outdir $outdir \
--mode multi \
--num_multi_targets 1
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import tensorflow as tf
import os
import matplotlib
matplotlib.use("pdf")
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
from estimators.get_estimator import get_estimator
from utils import util
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
Path to YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string(
'checkpointdir', '/tmp/tcn', 'Path to model checkpoints.')
tf.app.flags.DEFINE_string(
'checkpoint_iter', '', 'Checkpoint iter to use.')
tf.app.flags.DEFINE_integer(
'num_multi_targets', -1,
'Number of imitation vids in the target set per imitation video.')
tf.app.flags.DEFINE_string(
'outdir', '/tmp/tcn', 'Path to write embeddings to.')
tf.app.flags.DEFINE_string(
'mode', 'single', 'single | multi. Single means generate imitation vids '
'where the query is being imitated by a single sequence. Multi '
'means generate imitation vids where the query is being '
'imitated by multiple sequences.')
tf.app.flags.DEFINE_string('query_records_dir', '',
'Directory of image tfrecords.')
tf.app.flags.DEFINE_string('target_records_dir', '',
'Directory of image tfrecords.')
tf.app.flags.DEFINE_integer('query_view', 1,
'Viewpoint of the query video.')
tf.app.flags.DEFINE_integer('target_view', 0,
'Viewpoint of the imitation video.')
tf.app.flags.DEFINE_integer('smoothing_window', 5,
'Number of frames to smooth over.')
tf.app.flags.DEFINE_integer('num_query_sequences', -1,
'Number of query sequences to embed.')
tf.app.flags.DEFINE_integer('num_target_sequences', -1,
'Number of target sequences to embed.')
FLAGS = tf.app.flags.FLAGS
def SmoothEmbeddings(embs):
"""Temporally smoothes a sequence of embeddings."""
new_embs = []
window = int(FLAGS.smoothing_window)
for i in range(len(embs)):
min_i = max(i-window, 0)
max_i = min(i+window, len(embs))
new_embs.append(np.mean(embs[min_i:max_i, :], axis=0))
return np.array(new_embs)
def MakeImitationVideo(
outdir, vidname, query_im_strs, knn_im_strs, height=640, width=360):
"""Creates a KNN imitation video.
For each frame in vid0, pair with the frame at index in knn_indices in
vids1. Write video to disk.
Args:
outdir: String, directory to write videos.
vidname: String, name of video.
query_im_strs: Numpy array holding query image strings.
knn_im_strs: Numpy array holding knn image strings.
height: Int, height of raw images.
width: Int, width of raw images.
"""
if not tf.gfile.Exists(outdir):
tf.gfile.MakeDirs(outdir)
vid_path = os.path.join(outdir, vidname)
combined = zip(query_im_strs, knn_im_strs)
# Create and write the video.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_aspect('equal')
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
im = ax.imshow(
np.zeros((height, width*2, 3)), cmap='gray', interpolation='nearest')
im.set_clim([0, 1])
plt.tight_layout(pad=0, w_pad=0, h_pad=0)
# pylint: disable=invalid-name
def update_img(pair):
"""Decode pairs of image strings, update a video."""
im_i, im_j = pair
nparr_i = np.fromstring(str(im_i), np.uint8)
img_np_i = cv2.imdecode(nparr_i, 1)
img_np_i = img_np_i[..., [2, 1, 0]]
nparr_j = np.fromstring(str(im_j), np.uint8)
img_np_j = cv2.imdecode(nparr_j, 1)
img_np_j = img_np_j[..., [2, 1, 0]]
# Optionally reshape the images to be same size.
frame = np.concatenate([img_np_i, img_np_j], axis=1)
im.set_data(frame)
return im
ani = animation.FuncAnimation(fig, update_img, combined, interval=15)
writer = animation.writers['ffmpeg'](fps=15)
dpi = 100
tf.logging.info('Writing video to:\n %s \n' % vid_path)
ani.save('%s.mp4' % vid_path, writer=writer, dpi=dpi)
def GenerateImitationVideo(
vid_name, query_ims, query_embs, target_ims, target_embs, height, width):
"""Generates a single cross-sequence imitation video.
For each frame in some query sequence, find the nearest neighbor from
some target sequence in embedding space.
Args:
vid_name: String, the name of the video.
query_ims: Numpy array of shape [query sequence length, height, width, 3].
query_embs: Numpy array of shape [query sequence length, embedding size].
target_ims: Numpy array of shape [target sequence length, height, width,
3].
target_embs: Numpy array of shape [target sequence length, embedding
size].
height: Int, height of the raw image.
width: Int, width of the raw image.
"""
# For each query frame, find the index of the nearest neighbor in the
# target video.
knn_indices = [util.KNNIds(q, target_embs, k=1)[0] for q in query_embs]
# Create and write out the video.
assert knn_indices
knn_ims = np.array([target_ims[k] for k in knn_indices])
MakeImitationVideo(FLAGS.outdir, vid_name, query_ims, knn_ims, height, width)
def SingleImitationVideos(
query_records, target_records, config, height, width):
"""Generates pairwise imitation videos.
This creates all pairs of target imitating query videos, where each frame
on the left is matched to a nearest neighbor coming from a single
embedded target video.
Args:
query_records: List of Strings, paths to tfrecord datasets to use as
queries.
target_records: List of Strings, paths to tfrecord datasets to use as
targets.
config: A T object describing training config.
height: Int, height of the raw image.
width: Int, width of the raw image.
"""
# Embed query and target data.
(query_sequences_to_data,
target_sequences_to_data) = EmbedQueryTargetData(
query_records, target_records, config)
qview = FLAGS.query_view
tview = FLAGS.target_view
# Loop over query videos.
for task_i, data_i in query_sequences_to_data.items():
for task_j, data_j in target_sequences_to_data.items():
i_ims = data_i['images']
i_embs = data_i['embeddings']
query_embs = SmoothEmbeddings(i_embs[qview])
query_ims = i_ims[qview]
j_ims = data_j['images']
j_embs = data_j['embeddings']
target_embs = SmoothEmbeddings(j_embs[tview])
target_ims = j_ims[tview]
tf.logging.info('Generating %s imitating %s video.' % (task_j, task_i))
vid_name = 'q%sv%s_im%sv%s' % (task_i, qview, task_j, tview)
vid_name = vid_name.replace('/', '_')
GenerateImitationVideo(vid_name, query_ims, query_embs,
target_ims, target_embs, height, width)
def MultiImitationVideos(
query_records, target_records, config, height, width):
"""Creates multi-imitation videos.
This creates videos where every frame on the left is matched to a nearest
neighbor coming from a set of multiple embedded target videos.
Args:
query_records: List of Strings, paths to tfrecord datasets to use as
queries.
target_records: List of Strings, paths to tfrecord datasets to use as
targets.
config: A T object describing training config.
height: Int, height of the raw image.
width: Int, width of the raw image.
"""
# Embed query and target data.
(query_sequences_to_data,
target_sequences_to_data) = EmbedQueryTargetData(
query_records, target_records, config)
qview = FLAGS.query_view
tview = FLAGS.target_view
# Loop over query videos.
for task_i, data_i in query_sequences_to_data.iteritems():
i_ims = data_i['images']
i_embs = data_i['embeddings']
query_embs = SmoothEmbeddings(i_embs[qview])
query_ims = i_ims[qview]
all_target_embs = []
all_target_ims = []
    # If num_multi_targets is -1, add all seq embeddings to the target set.
if FLAGS.num_multi_targets == -1:
num_multi_targets = len(target_sequences_to_data)
else:
# Else, add some specified number of seq embeddings to the target set.
num_multi_targets = FLAGS.num_multi_targets
for j in range(num_multi_targets):
task_j = target_sequences_to_data.keys()[j]
data_j = target_sequences_to_data[task_j]
print('Adding %s to target set' % task_j)
j_ims = data_j['images']
j_embs = data_j['embeddings']
target_embs = SmoothEmbeddings(j_embs[tview])
target_ims = j_ims[tview]
all_target_embs.extend(target_embs)
all_target_ims.extend(target_ims)
# Generate a "j imitating i" video.
tf.logging.info('Generating all imitating %s video.' % task_i)
vid_name = 'q%sv%s_multiv%s' % (task_i, qview, tview)
vid_name = vid_name.replace('/', '_')
GenerateImitationVideo(vid_name, query_ims, query_embs,
all_target_ims, all_target_embs, height, width)
def SameSequenceVideos(query_records, config, height, width):
"""Generate same sequence, cross-view imitation videos."""
batch_size = config.data.embed_batch_size
# Choose an estimator based on training strategy.
estimator = get_estimator(config, FLAGS.checkpointdir)
# Choose a checkpoint path to restore.
checkpointdir = FLAGS.checkpointdir
checkpoint_path = os.path.join(checkpointdir,
'model.ckpt-%s' % FLAGS.checkpoint_iter)
# Embed num_sequences query sequences, store embeddings and image strings in
# query_sequences_to_data.
sequences_to_data = {}
for (view_embeddings, view_raw_image_strings, seqname) in estimator.inference(
query_records, checkpoint_path, batch_size,
num_sequences=FLAGS.num_query_sequences):
sequences_to_data[seqname] = {
'embeddings': view_embeddings,
'images': view_raw_image_strings,
}
# Loop over query videos.
qview = FLAGS.query_view
tview = FLAGS.target_view
for task_i, data_i in sequences_to_data.iteritems():
ims = data_i['images']
embs = data_i['embeddings']
query_embs = SmoothEmbeddings(embs[qview])
query_ims = ims[qview]
target_embs = SmoothEmbeddings(embs[tview])
target_ims = ims[tview]
tf.logging.info('Generating %s imitating %s video.' % (task_i, task_i))
vid_name = 'q%sv%s_im%sv%s' % (task_i, qview, task_i, tview)
vid_name = vid_name.replace('/', '_')
GenerateImitationVideo(vid_name, query_ims, query_embs,
target_ims, target_embs, height, width)
def EmbedQueryTargetData(query_records, target_records, config):
"""Embeds the full set of query_records and target_records.
Args:
query_records: List of Strings, paths to tfrecord datasets to use as
queries.
target_records: List of Strings, paths to tfrecord datasets to use as
targets.
config: A T object describing training config.
Returns:
    query_sequences_to_data: A dict mapping sequence name to a dict holding
      per-view 'embeddings' and per-view 'images' (raw image strings).
    target_sequences_to_data: A dict with the same structure, built from the
      target records.
"""
batch_size = config.data.embed_batch_size
# Choose an estimator based on training strategy.
estimator = get_estimator(config, FLAGS.checkpointdir)
# Choose a checkpoint path to restore.
checkpointdir = FLAGS.checkpointdir
checkpoint_path = os.path.join(checkpointdir,
'model.ckpt-%s' % FLAGS.checkpoint_iter)
# Embed num_sequences query sequences, store embeddings and image strings in
# query_sequences_to_data.
num_query_sequences = FLAGS.num_query_sequences
num_target_sequences = FLAGS.num_target_sequences
query_sequences_to_data = {}
for (view_embeddings, view_raw_image_strings, seqname) in estimator.inference(
query_records, checkpoint_path, batch_size,
num_sequences=num_query_sequences):
query_sequences_to_data[seqname] = {
'embeddings': view_embeddings,
'images': view_raw_image_strings,
}
if (query_records == target_records) and (
num_query_sequences == num_target_sequences):
target_sequences_to_data = query_sequences_to_data
else:
# Embed num_sequences target sequences, store embeddings and image strings
# in sequences_to_data.
target_sequences_to_data = {}
for (view_embeddings, view_raw_image_strings,
seqname) in estimator.inference(
target_records, checkpoint_path, batch_size,
num_sequences=num_target_sequences):
target_sequences_to_data[seqname] = {
'embeddings': view_embeddings,
'images': view_raw_image_strings,
}
return query_sequences_to_data, target_sequences_to_data
def main(_):
# Parse config dict from yaml config files / command line flags.
config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
# Get tables to embed.
query_records_dir = FLAGS.query_records_dir
query_records = util.GetFilesRecursively(query_records_dir)
target_records_dir = FLAGS.target_records_dir
target_records = util.GetFilesRecursively(target_records_dir)
height = config.data.raw_height
width = config.data.raw_width
mode = FLAGS.mode
if mode == 'multi':
# Generate videos where target set is composed of multiple videos.
MultiImitationVideos(query_records, target_records, config,
height, width)
elif mode == 'single':
# Generate videos where target set is a single video.
SingleImitationVideos(query_records, target_records, config,
height, width)
elif mode == 'same':
# Generate videos where target set is the same as query, but diff view.
SameSequenceVideos(query_records, config, height, width)
else:
raise ValueError('Unknown mode %s' % mode)
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generates test Recall@K statistics on labeled classification problems."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import os
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from six.moves import xrange
import data_providers
from estimators.get_estimator import get_estimator
from utils import util
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
    Path to YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string(
'mode', 'validation',
'Which dataset to evaluate: `validation` | `test`.')
tf.app.flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master to use')
tf.app.flags.DEFINE_string(
'checkpoint_iter', '', 'Evaluate this specific checkpoint.')
tf.app.flags.DEFINE_string(
'checkpointdir', '/tmp/tcn', 'Path to model checkpoints.')
tf.app.flags.DEFINE_string('outdir', '/tmp/tcn', 'Path to write summaries to.')
FLAGS = tf.app.flags.FLAGS
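# Example invocation (config path and iteration number are illustrative):
#   python labeled_eval.py \
#     --config_paths=configs/tcn_default.yml \
#     --checkpointdir=/tmp/tcn \
#     --checkpoint_iter=100000 \
#     --mode=validation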
def nearest_cross_sequence_neighbors(data, tasks, n_neighbors=1):
"""Computes the n_neighbors nearest neighbors for every row in data.
Args:
data: A np.float32 array of shape [num_data, embedding size] holding
an embedded validation / test dataset.
tasks: A list of strings of size [num_data] holding the task or sequence
name that each row belongs to.
n_neighbors: The number of knn indices to return for each row.
Returns:
indices: an np.int32 array of size [num_data, n_neighbors] holding the
n_neighbors nearest indices for every row in data. These are
restricted to be from different named sequences (as defined in `tasks`).
"""
# Compute the pairwise sequence adjacency matrix from `tasks`.
num_data = data.shape[0]
tasks = np.array(tasks)
tasks = np.reshape(tasks, (num_data, 1))
assert len(tasks.shape) == 2
not_adjacent = (tasks != tasks.T)
# Compute the symmetric pairwise distance matrix.
pdist = pairwise_distances(data, metric='sqeuclidean')
# For every row in the pairwise distance matrix, only consider
# cross-sequence columns.
indices = np.zeros((num_data, n_neighbors), dtype=np.int32)
for idx in range(num_data):
# Restrict to cross_sequence neighbors.
distances = [(
pdist[idx][i], i) for i in xrange(num_data) if not_adjacent[idx][i]]
_, nearest_indices = zip(*sorted(
distances, key=lambda x: x[0])[:n_neighbors])
indices[idx] = nearest_indices
return indices
def compute_cross_sequence_recall_at_k(retrieved_labels, labels, k_list):
"""Compute recall@k for a given list of k values.
  Recall is one if an example of the same class is retrieved among the
  top k nearest neighbors of a query example, and zero otherwise.
  Averaging these counts per class, then across classes, gives the
  recall@k score.
Args:
retrieved_labels: 2-D Numpy array of KNN labels for every embedding.
labels: 1-D Numpy array of shape [number of data].
k_list: List of k values to evaluate recall@k.
Returns:
recall_list: List of recall@k values.
"""
kvalue_to_recall = dict(zip(k_list, np.zeros(len(k_list))))
# For each value of K.
for k in k_list:
matches = defaultdict(float)
counts = defaultdict(float)
# For each (row index, label value) in the query labels.
for i, label_value in enumerate(labels):
# Loop over the K nearest retrieved labels.
if label_value in retrieved_labels[i][:k]:
matches[label_value] += 1.
# Increment the denominator.
counts[label_value] += 1.
kvalue_to_recall[k] = np.mean(
[matches[l]/counts[l] for l in matches])
return [kvalue_to_recall[i] for i in k_list]
def compute_cross_sequence_recalls_at_k(
embeddings, labels, label_attr_keys, tasks, k_list, summary_writer,
training_step):
"""Computes and reports the recall@k for each classification problem.
This takes an embedding matrix and an array of multiclass labels
with size [num_data, number of classification problems], then
computes the average recall@k for each classification problem
as well as the average across problems.
Args:
embeddings: A np.float32 array of size [num_data, embedding_size]
representing the embedded validation or test dataset.
labels: A np.int32 array of size [num_data, num_classification_problems]
holding multiclass labels for each embedding for each problem.
label_attr_keys: List of strings, holds the names of the classification
problems.
tasks: A list of strings describing the video sequence each row
belongs to. This is used to restrict the recall@k computation
to cross-sequence examples.
k_list: A list of ints, the k values to evaluate recall@k.
summary_writer: A tf.summary.FileWriter.
training_step: Int, the current training step we're evaluating.
"""
num_data = float(embeddings.shape[0])
assert labels.shape[0] == num_data
# Compute knn indices.
indices = nearest_cross_sequence_neighbors(
embeddings, tasks, n_neighbors=max(k_list))
retrieved_labels = labels[indices]
# Compute the recall@k for each classification problem.
recall_lists = []
for idx, label_attr in enumerate(label_attr_keys):
problem_labels = labels[:, idx]
# Take all indices, all k labels for the problem indexed by idx.
problem_retrieved = retrieved_labels[:, :, idx]
recall_list = compute_cross_sequence_recall_at_k(
retrieved_labels=problem_retrieved,
labels=problem_labels,
k_list=k_list)
recall_lists.append(recall_list)
for (k, recall) in zip(k_list, recall_list):
recall_error = 1-recall
summ = tf.Summary(value=[tf.Summary.Value(
tag='validation/classification/%s error@top%d' % (
label_attr, k),
simple_value=recall_error)])
      print('%s error@k=%d' % (label_attr, k), recall_error)
summary_writer.add_summary(summ, int(training_step))
# Report an average recall@k across problems.
recall_lists = np.array(recall_lists)
for i in range(recall_lists.shape[1]):
average_recall = np.mean(recall_lists[:, i])
recall_error = 1 - average_recall
summ = tf.Summary(value=[tf.Summary.Value(
tag='validation/classification/average error@top%d' % k_list[i],
simple_value=recall_error)])
    print('Average error@k=%d' % k_list[i], recall_error)
summary_writer.add_summary(summ, int(training_step))
def evaluate_once(
estimator, input_fn_by_view, batch_size, checkpoint_path,
label_attr_keys, embedding_size, num_views, k_list):
"""Compute the recall@k for a given checkpoint path.
Args:
estimator: an `Estimator` object to evaluate.
input_fn_by_view: An input_fn to an `Estimator's` predict method. Takes
a view index and returns a dict holding ops for getting raw images for
the view.
batch_size: Int, size of the labeled eval batch.
checkpoint_path: String, path to the specific checkpoint being evaluated.
label_attr_keys: A list of Strings, holding each attribute name.
embedding_size: Int, the size of the embedding.
num_views: Int, number of views in the dataset.
k_list: List of ints, list of K values to compute recall at K for.
"""
feat_matrix = np.zeros((0, embedding_size))
label_vect = np.zeros((0, len(label_attr_keys)))
tasks = []
eval_tensor_keys = ['embeddings', 'tasks', 'classification_labels']
# Iterate all views in the dataset.
for view_index in range(num_views):
# Set up a graph for embedding entire dataset.
predictions = estimator.inference(
input_fn_by_view(view_index), checkpoint_path,
batch_size, predict_keys=eval_tensor_keys)
# Enumerate predictions.
for i, p in enumerate(predictions):
if i % 100 == 0:
tf.logging.info('Embedded %d images for view %d' % (i, view_index))
label = p['classification_labels']
task = p['tasks']
embedding = p['embeddings']
# Collect (embedding, label, task) data.
feat_matrix = np.append(feat_matrix, [embedding], axis=0)
label_vect = np.append(label_vect, [label], axis=0)
tasks.append(task)
# Compute recall statistics.
ckpt_step = int(checkpoint_path.split('-')[-1])
summary_dir = os.path.join(FLAGS.outdir, 'labeled_eval_summaries')
summary_writer = tf.summary.FileWriter(summary_dir)
compute_cross_sequence_recalls_at_k(
feat_matrix, label_vect, label_attr_keys, tasks, k_list,
summary_writer, ckpt_step)
def get_labeled_tables(config):
"""Gets either labeled test or validation tables, based on flags."""
# Get a list of filenames corresponding to labeled data.
mode = FLAGS.mode
if mode == 'validation':
labeled_tables = util.GetFilesRecursively(config.data.labeled.validation)
elif mode == 'test':
labeled_tables = util.GetFilesRecursively(config.data.labeled.test)
else:
raise ValueError('Unknown dataset: %s' % mode)
return labeled_tables
def main(_):
"""Runs main labeled eval loop."""
# Parse config dict from yaml config files / command line flags.
config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
# Choose an estimator based on training strategy.
checkpointdir = FLAGS.checkpointdir
estimator = get_estimator(config, checkpointdir)
# Get data configs.
image_attr_keys = config.data.labeled.image_attr_keys
label_attr_keys = config.data.labeled.label_attr_keys
embedding_size = config.embedding_size
num_views = config.data.num_views
k_list = config.val.recall_at_k_list
batch_size = config.data.batch_size
# Get either labeled validation or test tables.
labeled_tables = get_labeled_tables(config)
def input_fn_by_view(view_index):
"""Returns an input_fn for use with a tf.Estimator by view."""
def input_fn():
# Get raw labeled images.
(preprocessed_images, labels,
tasks) = data_providers.labeled_data_provider(
labeled_tables,
estimator.preprocess_data, view_index, image_attr_keys,
label_attr_keys, batch_size=batch_size)
return {
'batch_preprocessed': preprocessed_images,
'tasks': tasks,
'classification_labels': labels,
}, None
return input_fn
# If evaluating a specific checkpoint, do that.
if FLAGS.checkpoint_iter:
    checkpoint_path = os.path.join(
        checkpointdir, 'model.ckpt-%s' % FLAGS.checkpoint_iter)
evaluate_once(
estimator, input_fn_by_view, batch_size, checkpoint_path,
label_attr_keys, embedding_size, num_views, k_list)
else:
for checkpoint_path in tf.contrib.training.checkpoints_iterator(
checkpointdir):
evaluate_once(
estimator, input_fn_by_view, batch_size, checkpoint_path,
label_attr_keys, embedding_size, num_views, k_list)
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tcn.labeled_eval."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import labeled_eval
import tensorflow as tf
class LabeledEvalTest(tf.test.TestCase):
def testNearestCrossSequenceNeighbors(self):
# Generate embeddings.
num_data = 64
embedding_size = 4
num_tasks = 8
n_neighbors = 2
data = np.random.randn(num_data, embedding_size)
tasks = np.repeat(range(num_tasks), num_data // num_tasks)
# Get nearest cross-sequence indices.
indices = labeled_eval.nearest_cross_sequence_neighbors(
data, tasks, n_neighbors=n_neighbors)
# Assert that no nearest neighbor indices come from the same task.
repeated_tasks = np.tile(np.reshape(tasks, (num_data, 1)), n_neighbors)
self.assertTrue(np.all(np.not_equal(repeated_tasks, tasks[indices])))
def testPerfectCrossSequenceRecall(self):
# Make sure cross-sequence recall@k returns 1.0 for near-duplicate features.
embeddings = np.random.randn(10, 2)
embeddings[5:, :] = 0.00001 + embeddings[:5, :]
tasks = np.repeat([0, 1], 5)
labels = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4])
# find k=1, k=2 nearest neighbors.
k_list = [1, 2]
# Compute knn indices.
indices = labeled_eval.nearest_cross_sequence_neighbors(
embeddings, tasks, n_neighbors=max(k_list))
retrieved_labels = labels[indices]
recall_list = labeled_eval.compute_cross_sequence_recall_at_k(
retrieved_labels=retrieved_labels,
labels=labels,
k_list=k_list)
self.assertTrue(np.allclose(
np.array(recall_list), np.array([1.0, 1.0])))
def testRelativeRecall(self):
    # Make sure cross-sequence recall@k is non-decreasing in k.
num_data = 100
num_tasks = 10
embeddings = np.random.randn(100, 5)
tasks = np.repeat(range(num_tasks), num_data // num_tasks)
labels = np.random.randint(0, 5, 100)
k_list = [1, 2, 4, 8, 16, 32, 64]
indices = labeled_eval.nearest_cross_sequence_neighbors(
embeddings, tasks, n_neighbors=max(k_list))
retrieved_labels = labels[indices]
recall_list = labeled_eval.compute_cross_sequence_recall_at_k(
retrieved_labels=retrieved_labels,
labels=labels,
k_list=k_list)
recall_list_sorted = sorted(recall_list)
self.assertTrue(np.allclose(
np.array(recall_list), np.array(recall_list_sorted)))
if __name__ == "__main__":
tf.test.main()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model implementations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from abc import ABCMeta
from abc import abstractmethod
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.python.slim.nets import inception
from tensorflow.contrib.slim.python.slim.nets import resnet_v2 as resnet_v2
from tensorflow.contrib.slim.python.slim.nets import resnet_utils as resnet_utils
def get_embedder(
embedder_strategy, config, images, is_training, reuse=False,
l2_normalize_embedding=True):
"""Returns an embedder based on config.
Args:
embedder_strategy: String, name of embedder version to return.
config: LuaTable object, training config.
images: 4-D float `Tensor` containing batch images.
is_training: Boolean or placeholder for boolean,
indicator for whether or not we're training.
reuse: Boolean: Reuse embedder variable scope.
l2_normalize_embedding: Boolean, whether or not to l2 normalize the
embedding.
Returns:
embedder: An `Embedder` object.
Raises:
ValueError: if unknown embedder_strategy specified.
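  Example (illustrative; `config` is a parsed training config):
    embedder = get_embedder('resnet', config, images, is_training=True)
    embeddings = embedder.construct_embedding()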
"""
if embedder_strategy == 'inception_baseline':
pretrained_ckpt = config.inception_conv_ss_fc.pretrained_checkpoint
    # Pass projection options by keyword so they are not bound to `reuse`.
    return InceptionBaselineEmbedder(
        images,
        pretrained_ckpt,
        random_projection=config.random_projection,
        random_projection_dim=config.random_projection_dim)
strategy_to_embedder = {
'inception_conv_ss_fc': InceptionConvSSFCEmbedder,
'resnet': ResnetEmbedder,
}
if embedder_strategy not in strategy_to_embedder:
raise ValueError('unknown embedder_strategy', embedder_strategy)
embedding_size = config.embedding_size
l2_reg_weight = config.learning.l2_reg_weight
embedder = strategy_to_embedder[embedder_strategy](
config[embedder_strategy], images, embedding_size,
is_training, embedding_l2=l2_normalize_embedding,
l2_reg_weight=l2_reg_weight, reuse=reuse)
return embedder
def build_inceptionv3_graph(images, endpoint, is_training, checkpoint,
reuse=False):
"""Builds an InceptionV3 model graph.
Args:
images: A 4-D float32 `Tensor` of batch images.
endpoint: String, name of the InceptionV3 endpoint.
is_training: Boolean, whether or not to build a training or inference graph.
checkpoint: String, path to the pretrained model checkpoint.
reuse: Boolean, whether or not we are reusing the embedder.
Returns:
inception_output: `Tensor` holding the InceptionV3 output.
inception_variables: List of inception variables.
init_fn: Function to initialize the weights (if not reusing, then None).
"""
with slim.arg_scope(inception.inception_v3_arg_scope()):
_, endpoints = inception.inception_v3(
images, num_classes=1001, is_training=is_training)
inception_output = endpoints[endpoint]
inception_variables = slim.get_variables_to_restore()
inception_variables = [
i for i in inception_variables if 'global_step' not in i.name]
if is_training and not reuse:
init_saver = tf.train.Saver(inception_variables)
def init_fn(scaffold, sess):
del scaffold
init_saver.restore(sess, checkpoint)
else:
init_fn = None
return inception_output, inception_variables, init_fn
class InceptionBaselineEmbedder(object):
"""Produces pre-trained InceptionV3 embeddings."""
def __init__(self, images, pretrained_ckpt, reuse=False,
random_projection=False, random_projection_dim=32):
# Build InceptionV3 graph.
(inception_output,
self.inception_variables,
self.init_fn) = build_inceptionv3_graph(
images, 'Mixed_7c', False, pretrained_ckpt, reuse)
# Pool 8x8x2048 -> 1x1x2048.
embedding = slim.avg_pool2d(inception_output, [8, 8], stride=1)
embedding = tf.squeeze(embedding, [1, 2])
if random_projection:
embedding = tf.matmul(
embedding, tf.random_normal(
shape=[2048, random_projection_dim], seed=123))
self.embedding = embedding
class PretrainedEmbedder(object):
"""Base class for embedders that take pre-trained networks as input."""
__metaclass__ = ABCMeta
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
"""Constructor.
Args:
config: A T object holding training config.
images: A 4-D float32 `Tensor` holding images to embed.
embedding_size: Int, the size of the embedding.
is_training: Boolean, whether or not this is a training or inference-time
graph.
embedding_l2: Boolean, whether or not to l2 normalize the embedding.
l2_reg_weight: Float, weight applied to l2 weight regularization.
reuse: Boolean, whether or not we're reusing this graph.
"""
# Pull out all the embedder hyperparameters.
self._config = config
self._embedding_size = embedding_size
self._l2_reg_weight = l2_reg_weight
self._embedding_l2 = embedding_l2
self._is_training = is_training
self._reuse = reuse
# Pull out pretrained hparams.
pretrained_checkpoint = config.pretrained_checkpoint
pretrained_layer = config.pretrained_layer
pretrained_keep_prob = config.dropout.keep_pretrained
# Build pretrained graph.
(pretrained_output,
self._pretrained_variables,
self.init_fn) = self.build_pretrained_graph(
images, pretrained_layer, pretrained_checkpoint, is_training, reuse)
# Optionally drop out the activations.
pretrained_output = slim.dropout(
pretrained_output, keep_prob=pretrained_keep_prob,
is_training=is_training)
self._pretrained_output = pretrained_output
@abstractmethod
def build_pretrained_graph(self, images, layer, pretrained_checkpoint,
is_training, reuse):
"""Builds the graph for the pre-trained network.
Method to be overridden by implementations.
Args:
images: A 4-D tf.float32 `Tensor` holding images to embed.
layer: String, defining which pretrained layer to take as input
to adaptation layers.
pretrained_checkpoint: String, path to a checkpoint used to load
pretrained weights.
is_training: Boolean, whether or not we're in training mode.
reuse: Boolean, whether or not to reuse embedder weights.
Returns:
pretrained_output: A 2 or 3-d tf.float32 `Tensor` holding pretrained
activations.
"""
pass
@abstractmethod
def construct_embedding(self):
"""Builds an embedding function on top of images.
Method to be overridden by implementations.
Returns:
embeddings: A 2-d float32 `Tensor` of shape [batch_size, embedding_size]
holding the embedded images.
"""
pass
def get_trainable_variables(self):
"""Gets a list of variables to optimize."""
if self._config.finetune:
return tf.trainable_variables()
else:
adaptation_only_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope=self._adaptation_scope)
return adaptation_only_vars
class ResnetEmbedder(PretrainedEmbedder):
"""Resnet TCN.
ResnetV2 -> resnet adaptation layers -> optional l2 normalize -> embedding.
"""
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
super(ResnetEmbedder, self).__init__(
config, images, embedding_size, is_training, embedding_l2,
l2_reg_weight, reuse)
def build_pretrained_graph(
self, images, resnet_layer, checkpoint, is_training, reuse=False):
"""See baseclass."""
with slim.arg_scope(resnet_v2.resnet_arg_scope()):
_, endpoints = resnet_v2.resnet_v2_50(
images, is_training=is_training, reuse=reuse)
resnet_layer = 'resnet_v2_50/block%d' % resnet_layer
resnet_output = endpoints[resnet_layer]
resnet_variables = slim.get_variables_to_restore()
resnet_variables = [
i for i in resnet_variables if 'global_step' not in i.name]
if is_training and not reuse:
init_saver = tf.train.Saver(resnet_variables)
def init_fn(scaffold, sess):
del scaffold
init_saver.restore(sess, checkpoint)
else:
init_fn = None
return resnet_output, resnet_variables, init_fn
def construct_embedding(self):
"""Builds an embedding function on top of images.
Method to be overridden by implementations.
Returns:
embeddings: A 2-d float32 `Tensor` of shape [batch_size, embedding_size]
holding the embedded images.
"""
with tf.variable_scope('tcn_net', reuse=self._reuse) as vs:
self._adaptation_scope = vs.name
net = self._pretrained_output
# Define some adaptation blocks on top of the pre-trained resnet output.
adaptation_blocks = []
adaptation_block_params = [map(
int, i.split('_')) for i in self._config.adaptation_blocks.split('-')]
for i, (depth, num_units) in enumerate(adaptation_block_params):
block = resnet_v2.resnet_v2_block(
'adaptation_block_%d' % i, base_depth=depth, num_units=num_units,
stride=1)
adaptation_blocks.append(block)
      # Stack them on top of the resnet output.
net = resnet_utils.stack_blocks_dense(
net, adaptation_blocks, output_stride=None)
# Average pool the output.
net = tf.reduce_mean(net, [1, 2], name='adaptation_pool', keep_dims=True)
if self._config.emb_connection == 'fc':
# Use fully connected layer to project to embedding layer.
fc_hidden_sizes = self._config.fc_hidden_sizes
if fc_hidden_sizes == 'None':
fc_hidden_sizes = []
else:
fc_hidden_sizes = map(int, fc_hidden_sizes.split('_'))
fc_hidden_keep_prob = self._config.dropout.keep_fc
net = tf.squeeze(net)
for fc_hidden_size in fc_hidden_sizes:
net = slim.layers.fully_connected(net, fc_hidden_size)
if fc_hidden_keep_prob < 1.0:
net = slim.dropout(net, keep_prob=fc_hidden_keep_prob,
is_training=self._is_training)
# Connect last FC layer to embedding.
embedding = slim.layers.fully_connected(net, self._embedding_size,
activation_fn=None)
else:
# Use 1x1 conv layer to project to embedding layer.
embedding = slim.conv2d(
net, self._embedding_size, [1, 1], activation_fn=None,
normalizer_fn=None, scope='embedding')
embedding = tf.squeeze(embedding)
# Optionally L2 normalize the embedding.
if self._embedding_l2:
embedding = tf.nn.l2_normalize(embedding, dim=1)
return embedding
def get_trainable_variables(self):
"""Gets a list of variables to optimize."""
if self._config.finetune:
return tf.trainable_variables()
else:
adaptation_only_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope=self._adaptation_scope)
return adaptation_only_vars
class InceptionEmbedderBase(PretrainedEmbedder):
"""Base class for embedders that take pre-trained InceptionV3 activations."""
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
super(InceptionEmbedderBase, self).__init__(
config, images, embedding_size, is_training, embedding_l2,
l2_reg_weight, reuse)
def build_pretrained_graph(
self, images, inception_layer, checkpoint, is_training, reuse=False):
"""See baseclass."""
# Build InceptionV3 graph.
inception_output, inception_variables, init_fn = build_inceptionv3_graph(
images, inception_layer, is_training, checkpoint, reuse)
return inception_output, inception_variables, init_fn
class InceptionConvSSFCEmbedder(InceptionEmbedderBase):
"""TCN Embedder V1.
InceptionV3 (mixed_5d) -> conv layers -> spatial softmax ->
fully connected -> optional l2 normalize -> embedding.
"""
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
super(InceptionConvSSFCEmbedder, self).__init__(
config, images, embedding_size, is_training, embedding_l2,
l2_reg_weight, reuse)
# Pull out all the hyperparameters specific to this embedder.
self._additional_conv_sizes = config.additional_conv_sizes
self._conv_hidden_keep_prob = config.dropout.keep_conv
self._fc_hidden_sizes = config.fc_hidden_sizes
self._fc_hidden_keep_prob = config.dropout.keep_fc
def construct_embedding(self):
"""Builds a conv -> spatial softmax -> FC adaptation network."""
is_training = self._is_training
normalizer_params = {'is_training': is_training}
with tf.variable_scope('tcn_net', reuse=self._reuse) as vs:
self._adaptation_scope = vs.name
with slim.arg_scope(
[slim.layers.conv2d],
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight),
biases_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight)):
with slim.arg_scope(
[slim.layers.fully_connected],
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight),
biases_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight)):
# Input to embedder is pre-trained inception output.
net = self._pretrained_output
# Optionally add more conv layers.
for num_filters in self._additional_conv_sizes:
net = slim.layers.conv2d(
net, num_filters, kernel_size=[3, 3], stride=[1, 1])
net = slim.dropout(net, keep_prob=self._conv_hidden_keep_prob,
is_training=is_training)
# Take the spatial soft arg-max of the last convolutional layer.
# This is a form of spatial attention over the activations.
# See more here: http://arxiv.org/abs/1509.06113.
net = tf.contrib.layers.spatial_softmax(net)
self.spatial_features = net
# Add fully connected layers.
net = slim.layers.flatten(net)
for fc_hidden_size in self._fc_hidden_sizes:
net = slim.layers.fully_connected(net, fc_hidden_size)
if self._fc_hidden_keep_prob < 1.0:
net = slim.dropout(net, keep_prob=self._fc_hidden_keep_prob,
is_training=is_training)
# Connect last FC layer to embedding.
net = slim.layers.fully_connected(net, self._embedding_size,
activation_fn=None)
# Optionally L2 normalize the embedding.
if self._embedding_l2:
net = tf.nn.l2_normalize(net, dim=1)
return net
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Image preprocessing helpers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
from scipy import ndimage
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
def apply_with_random_selector(x, func, num_cases):
"""Computes func(x, sel), with sel sampled from [0...num_cases-1].
TODO(coreylynch): add as a dependency, when slim or tensorflow/models are
pipfied.
Source:
https://raw.githubusercontent.com/tensorflow/models/a9d0e6e8923a4/slim/preprocessing/inception_preprocessing.py
Args:
x: input Tensor.
func: Python function to apply.
num_cases: Python int32, number of cases to sample sel from.
Returns:
The result of func(x, sel), where func receives the value of the
selector as a python integer, but sel is sampled dynamically.
"""
sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
# Pass the real x only to one of the func calls.
return control_flow_ops.merge([
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
for case in range(num_cases)])[0]
def distorted_bounding_box_crop(image,
bbox,
min_object_covered=0.1,
aspect_ratio_range=(0.75, 1.33),
area_range=(0.05, 1.0),
max_attempts=100,
scope=None):
"""Generates cropped_image using a one of the bboxes randomly distorted.
TODO(coreylynch): add as a dependency, when slim or tensorflow/models are
pipfied.
Source:
https://raw.githubusercontent.com/tensorflow/models/a9d0e6e8923a4/slim/preprocessing/inception_preprocessing.py
See `tf.image.sample_distorted_bounding_box` for more documentation.
Args:
image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole
image.
min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
area of the image must contain at least this fraction of any bounding box
supplied.
aspect_ratio_range: An optional list of `floats`. The cropped area of the
image must have an aspect ratio = width / height within this range.
area_range: An optional list of `floats`. The cropped area of the image
      must contain a fraction of the supplied image within this range.
max_attempts: An optional `int`. Number of attempts at generating a cropped
region of the image of the specified constraints. After `max_attempts`
failures, return the entire image.
scope: Optional scope for name_scope.
Returns:
A tuple, a 3-D Tensor cropped_image and the distorted bbox
"""
with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bbox]):
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
# A large fraction of image datasets contain a human-annotated bounding
# box delineating the region of the image containing the object of interest.
# We choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.shape(image),
bounding_boxes=bbox,
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
# Crop the image to the specified bounding box.
cropped_image = tf.slice(image, bbox_begin, bbox_size)
return cropped_image, distort_bbox
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
"""Distort the color of a Tensor image.
TODO(coreylynch): add as a dependency, when slim or tensorflow/models are
pipfied.
Source:
https://raw.githubusercontent.com/tensorflow/models/a9d0e6e8923a4/slim/preprocessing/inception_preprocessing.py
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
  Rather than adding that level of complication, we select a distinct ordering
of color ops for each preprocessing thread.
Args:
image: 3-D Tensor containing single image in [0, 1].
color_ordering: Python int, a type of distortion (valid values: 0-3).
fast_mode: Avoids slower ops (random_hue and random_contrast)
scope: Optional scope for name_scope.
Returns:
3-D Tensor color-distorted image on range [0, 1]
Raises:
ValueError: if color_ordering not in [0, 3]
"""
with tf.name_scope(scope, 'distort_color', [image]):
if fast_mode:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
elif color_ordering == 1:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
elif color_ordering == 2:
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
elif color_ordering == 3:
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
raise ValueError('color_ordering must be in [0, 3]')
# The random_* ops do not necessarily clamp.
return tf.clip_by_value(image, 0.0, 1.0)
def crop_center(image):
"""Returns a cropped square image."""
shape = tf.shape(image)
new_shape = tf.minimum(shape[0], shape[1])
offset_y = tf.maximum(shape[0] - shape[1], 0) // 2
offset_x = tf.maximum(shape[1] - shape[0], 0) // 2
image = tf.image.crop_to_bounding_box(
image, offset_y, offset_x, new_shape, new_shape)
return image
def pad(image):
"""Returns an image padded to be square."""
shape = tf.shape(image)
new_shape = tf.maximum(shape[0], shape[1])
height = shape[0]
width = shape[1]
offset_x = tf.maximum((height-width), 0) // 2
offset_y = tf.maximum((width-height), 0) // 2
image = tf.image.pad_to_bounding_box(
image, offset_y, offset_x, new_shape, new_shape)
return image
def pad_200(image):
"""Returns an image padded width-padded with 200 pixels."""
shape = tf.shape(image)
image = tf.image.pad_to_bounding_box(
image, 0, 200, shape[0], shape[1]+400)
shape = tf.shape(image)
new_shape = tf.minimum(shape[0], shape[1])
offset_y = tf.maximum(shape[0] - shape[1], 0) // 2
offset_x = tf.maximum(shape[1] - shape[0], 0) // 2
image = tf.image.crop_to_bounding_box(
image, offset_y, offset_x, new_shape, new_shape)
return image
def pad_crop_central(image, central_fraction=0.875):
"""Pads the image to the maximum length, crops the central fraction."""
# Pad the image to be square.
image = pad(image)
# Crop the central region of the image with an area containing 87.5% of
# the original image.
image = tf.image.central_crop(image, central_fraction=central_fraction)
return image
def crop_image_by_strategy(image, cropping):
"""Crops an image according to a strategy defined in config.
Args:
image: 3-d image tensor.
cropping: str, name of cropping strategy.
Returns:
image: cropped image.
Raises:
ValueError: When unknown cropping strategy is specified.
"""
strategy_to_method = {
'crop_center': crop_center,
'pad': pad,
'pad200': pad_200,
'pad_crop_central': pad_crop_central
}
tf.logging.info('Cropping strategy: %s.' % cropping)
if cropping not in strategy_to_method:
raise ValueError('Unknown cropping strategy: %s' % cropping)
return strategy_to_method[cropping](image)
def scale_augment_crop(image, central_bbox, area_range, min_object_covered):
"""Training time scale augmentation.
Args:
image: 3-d float tensor.
central_bbox: Bounding box defining the central region of interest.
area_range: Range of allowed areas for the augmented bounding box.
min_object_covered: Constraint for the fraction of original image in
augmented bounding box.
Returns:
distort_image: The scaled, cropped image.
"""
(distorted_image, _) = distorted_bounding_box_crop(
image, central_bbox, area_range=area_range,
aspect_ratio_range=(1.0, 1.0),
min_object_covered=min_object_covered)
# Restore the shape since the dynamic slice based upon the bbox_size loses
# the third dimension.
distorted_image.set_shape([None, None, 3])
return distorted_image
def scale_to_inception_range(image):
"""Scales an image in the range [0,1] to [-1,1] as expected by inception."""
# Assert that incoming images have been properly scaled to [0,1].
with tf.control_dependencies(
[tf.assert_less_equal(tf.reduce_max(image), 1.),
tf.assert_greater_equal(tf.reduce_min(image), 0.)]):
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.0)
return image
def resize_image(image, height, width):
"""Resizes an image to a target height and width."""
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image, [height, width], align_corners=False)
image = tf.squeeze(image, [0])
return image
def crop_or_pad(image, curr_height, curr_width, new, height=True, crop=True):
"""Crops or pads an image.
Args:
image: 3-D float32 `Tensor` image.
curr_height: Int, current height.
curr_width: Int, current width.
new: Int, new width or height.
height: Boolean, cropping or padding for height.
crop: Boolean, True if we're cropping, False if we're padding.
Returns:
image: 3-D float32 `Tensor` image.
"""
# Crop the image to fit the new shape.
abs_diff = tf.abs(new-curr_height)//2 if height else tf.abs(new-curr_width)//2
offset_x = 0 if height else abs_diff
offset_y = abs_diff if height else 0
# We process height first, so always pad/crop to new height.
target_height = new
# We process height first, so pad/crop to new width only if not doing height.
target_width = curr_width if height else new
if crop:
image = tf.image.crop_to_bounding_box(
image, offset_y, offset_x, target_height, target_width)
else:
image = tf.image.pad_to_bounding_box(
image, offset_y, offset_x, target_height, target_width)
return image
def get_central_bbox(min_side, new_size):
"""Gets the central bounding box for an image.
If image is square, returns bounding box [0,0,1,1].
Otherwise, returns the bounding box containing the central
smallest side x smallest side square.
Args:
min_side: Int, size of smallest side in pixels.
new_size: Int, resize image to a square of new_size x new_size pixels.
Returns:
bbox: A 4-D Int `Tensor`, holding the coordinates of the central bounding
box.
"""
max_shape = tf.cast(new_size, tf.float32)
min_shape = tf.cast(min_side, tf.float32)
top_xy = ((max_shape-min_shape)/2)/max_shape
bottom_xy = (min_shape+(max_shape-min_shape)/2)/max_shape
# Create a bbox for the center region of interest.
bbox = tf.stack([[[top_xy, top_xy, bottom_xy, bottom_xy]]])
bbox.set_shape([1, 1, 4])
return bbox
def pad_to_max(image, max_scale):
"""Pads an image to max_scale times the current center crop size.
E.g.: For an image with dimensions 1920x1080 and a max_scale of 1.5,
returns an image that is 1.5 * (1080x1080).
Args:
image: 3-D float32 `Tensor` image.
max_scale: Float, maximum scale of the image, as a multiplier on the
central bounding box.
Returns:
image: 3-D float32 `Tensor` image.
"""
orig_shape = tf.shape(image)
orig_height = orig_shape[0]
orig_width = orig_shape[1]
# Find the smallest side and corresponding new size.
min_side = tf.cast(tf.minimum(orig_height, orig_width), tf.float32)
new_shape = tf.cast(tf.sqrt(max_scale*min_side*min_side), tf.int32)
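  # E.g. a 1920x1080 input with max_scale=1.5 gives min_side=1080 and
  # new_shape = int(1080 * sqrt(1.5)) = 1322, a square with roughly 1.5x the
  # area of the central 1080x1080 crop.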
# Crop or pad height.
# pylint: disable=g-long-lambda
image = tf.cond(
orig_height >= new_shape,
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=True, crop=True),
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=True, crop=False))
# Crop or pad width.
image = tf.cond(
orig_width >= new_shape,
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=False, crop=True),
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=False, crop=False))
# Get the bounding box of the original centered box in the new resized image.
original_bounding_box = get_central_bbox(min_side, new_shape)
return image, original_bounding_box
def scale_up_augmentation(image, max_scale):
"""Scales an image randomly >100% up to some max scale."""
# Pad to max size.
image, original_central_bbox = pad_to_max(image, max_scale)
# Determine area range of the augmented crop, as a percentage of the
# new max area.
# aug_max == 100% of new max area.
aug_max = 1.0
# aug_min == original_area/new_area == original_area/(max_scale*original_area)
# == 1/max_scale.
aug_min = 1.0/max_scale
area_range = (aug_min, aug_max)
# Since we're doing >100% scale, always have the full original crop in frame.
min_object_covered = 1.0
# Get a random scaled, cropped image.
image = scale_augment_crop(image, original_central_bbox, area_range,
min_object_covered)
return image
def scale_down_augmentation(image, min_scale):
"""Scales an image randomly <100% down to some min scale."""
# Crop the center, and consider the whole image the bounding box ROI.
image = crop_center(image)
bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
# Determine area range of the augmented crop, as a percentage of the
# original crop center area.
# aug_max == 100% of original area.
area_range = (min_scale, 1.0)
# Get a random scaled, cropped image.
image = scale_augment_crop(image, bbox, area_range, min_scale)
return image
def augment_image_scale(image, min_scale, max_scale, p_scale_up):
"""Training time scale augmentation.
Args:
image: 3-d float tensor representing image.
min_scale: minimum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
max_scale: maximum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
p_scale_up: Fraction of images scaled up.
Returns:
image: The scale-augmented image.
"""
assert max_scale >= 1.0
assert min_scale <= 1.0
if min_scale == max_scale == 1.0:
    tf.logging.info('Min and max scale are 1.0, skipping scale augmentation.')
# Do no augmentation, just crop the center.
return crop_center(image)
elif (max_scale == 1.0) and (min_scale < 1.0):
tf.logging.info('Max scale is 1.0, only scale down augment.')
# Always do <100% augmentation.
return scale_down_augmentation(image, min_scale)
elif (min_scale == 1.0) and (max_scale > 1.0):
tf.logging.info('Min scale is 1.0, only scale up augment.')
# Always do >100% augmentation.
return scale_up_augmentation(image, max_scale)
else:
tf.logging.info('Sample both augmentations.')
# Choose to scale image up or down.
rn = tf.random_uniform([], minval=0., maxval=1., dtype=tf.float32)
image = tf.cond(rn >= p_scale_up,
lambda: scale_up_augmentation(image, max_scale),
lambda: scale_down_augmentation(image, min_scale))
return image
def decode_image(image_str):
"""Decodes a jpeg-encoded image string into a image in range [0,1]."""
# Decode jpeg string into np.uint8 tensor.
image = tf.image.decode_jpeg(image_str, channels=3)
# Convert the image to range [0,1].
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
return image
def decode_images(image_strs):
"""Decodes a tensor of image strings."""
return tf.map_fn(decode_image, image_strs, dtype=tf.float32)
def preprocess_training_images(images, height, width, min_scale, max_scale,
p_scale_up, aug_color=True, fast_mode=True):
"""Preprocesses a batch of images for training.
This applies training-time scale and color augmentation, crops/resizes,
and scales images to the [-1,1] range expected by pre-trained Inception nets.
Args:
images: A 4-D float32 `Tensor` holding raw images to be preprocessed.
height: Int, height in pixels to resize image to.
width: Int, width in pixels to resize image to.
min_scale: Float, minimum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
max_scale: Float, maximum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
p_scale_up: Float, fraction of images scaled up.
aug_color: Whether or not to do color augmentation.
fast_mode: Boolean, avoids slower ops (random_hue and random_contrast).
Returns:
preprocessed_images: A 4-D float32 `Tensor` holding preprocessed images.
"""
def _prepro_train(im):
"""Map this preprocessing function over each image in the batch."""
return preprocess_training_image(
im, height, width, min_scale, max_scale, p_scale_up,
aug_color=aug_color, fast_mode=fast_mode)
return tf.map_fn(_prepro_train, images)
def preprocess_training_image(
image, height, width, min_scale, max_scale, p_scale_up,
aug_color=True, fast_mode=True):
"""Preprocesses an image for training.
Args:
image: A 3-d float tensor representing the image.
height: Target image height.
width: Target image width.
min_scale: Minimum scale of bounding box (as a percentage of full
bounding box) used to crop image during scale augmentation.
    max_scale: Maximum scale of bounding box (as a percentage of full
bounding box) used to crop image during scale augmentation.
p_scale_up: Fraction of images to scale >100%.
aug_color: Whether or not to do color augmentation.
fast_mode: Avoids slower ops (random_hue and random_contrast).
Returns:
    scaled_image: A scaled image tensor in the range [-1,1].
"""
# Get a random scaled, cropped image.
image = augment_image_scale(image, min_scale, max_scale, p_scale_up)
# Resize image to desired height, width.
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image, [height, width], align_corners=False)
image = tf.squeeze(image, [0])
# Optionally augment the color.
# pylint: disable=g-long-lambda
if aug_color:
image = apply_with_random_selector(
image,
lambda x, ordering: distort_color(
x, ordering, fast_mode=fast_mode), num_cases=4)
# Scale to [-1,1] range as expected by inception.
scaled_image = scale_to_inception_range(image)
return scaled_image
def preprocess_test_image(image, height, width, crop_strategy):
"""Preprocesses an image for test/inference.
Args:
image: A 3-d float tensor representing the image.
height: Target image height.
width: Target image width.
crop_strategy: String, name of the strategy used to crop test-time images.
Can be: 'crop_center', 'pad', 'pad_200', 'pad_crop_central'.
Returns:
    scaled_image: A scaled image tensor in the range [-1,1].
"""
image = crop_image_by_strategy(image, crop_strategy)
# Resize.
image = resize_image(image, height, width)
# Scale the input range to [-1,1] as expected by inception.
image = scale_to_inception_range(image)
return image
def preprocess_test_images(images, height, width, crop_strategy):
"""Apply test-time preprocessing to a batch of images.
This crops images (given a named strategy for doing so), resizes them,
and scales them to the [-1,1] range expected by pre-trained Inception nets.
Args:
images: A 4-D float32 `Tensor` holding raw images to be preprocessed.
height: Int, height in pixels to resize image to.
width: Int, width in pixels to resize image to.
crop_strategy: String, name of the strategy used to crop test-time images.
Can be: 'crop_center', 'pad', 'pad_200', 'pad_crop_central'.
Returns:
preprocessed_images: A 4-D float32 `Tensor` holding preprocessed images.
"""
def _prepro_test(im):
"""Map this preprocessing function over each image in the batch."""
return preprocess_test_image(im, height, width, crop_strategy)
if len(images.shape) == 3:
return _prepro_test(images)
else:
return tf.map_fn(_prepro_test, images)
def preprocess_images(
images, is_training, height, width,
min_scale=1.0, max_scale=1.0, p_scale_up=0.0,
aug_color=True, fast_mode=True,
crop_strategy='pad_crop_central'):
"""Preprocess a batch of images.
Args:
images: A 4-D float32 `Tensor` holding raw images to be preprocessed.
is_training: Boolean, whether to preprocess them for training or test.
height: Int, height in pixels to resize image to.
width: Int, width in pixels to resize image to.
min_scale: Float, minimum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
max_scale: Float, maximum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
p_scale_up: Float, fraction of images scaled up.
aug_color: Whether or not to do color augmentation.
fast_mode: Boolean, avoids slower ops (random_hue and random_contrast).
crop_strategy: String, name of the strategy used to crop test-time images.
Can be: 'crop_center', 'pad', 'pad_200', 'pad_crop_central'.
Returns:
preprocessed_images: A 4-D float32 `Tensor` holding preprocessed images.
"""
if is_training:
return preprocess_training_images(
images, height, width, min_scale, max_scale,
p_scale_up, aug_color, fast_mode)
else:
return preprocess_test_images(
images, height, width, crop_strategy)
def cv2rotateimage(image, angle):
"""Efficient rotation if 90 degrees rotations, slow otherwise.
Not a tensorflow function, using cv2 and scipy on numpy arrays.
Args:
image: a numpy array with shape [height, width, channels].
angle: the rotation angle in degrees in the range [-180, 180].
Returns:
The rotated image.
"""
# Limit angle to [-180, 180] degrees.
assert angle <= 180 and angle >= -180
if angle == 0:
return image
# Efficient rotations.
if angle == -90:
image = cv2.transpose(image)
image = cv2.flip(image, 0)
elif angle == 90:
image = cv2.transpose(image)
image = cv2.flip(image, 1)
elif angle == 180 or angle == -180:
image = cv2.flip(image, 0)
image = cv2.flip(image, 1)
else: # Slow rotation.
    image = ndimage.interpolation.rotate(image, angle)
return image
def cv2resizeminedge(image, min_edge_size):
"""Resize smallest edge of image to min_edge_size."""
assert min_edge_size >= 0
height, width = (image.shape[0], image.shape[1])
new_height, new_width = (0, 0)
if height > width:
new_width = min_edge_size
new_height = int(height * new_width / float(width))
else:
new_height = min_edge_size
new_width = int(width * new_height / float(height))
return cv2.resize(image, (new_width, new_height),
interpolation=cv2.INTER_AREA)
def shapestring(array):
"""Returns a compact string describing shape of an array."""
shape = array.shape
s = str(shape[0])
for i in range(1, len(shape)):
s += 'x' + str(shape[i])
return s
def unscale_jpeg_encode(ims):
"""Unscales pixel values and jpeg encodes preprocessed image.
Args:
ims: A 4-D float32 `Tensor` holding preprocessed images.
Returns:
im_strings: A 1-D string `Tensor` holding images that have been unscaled
(reversing the inception [-1,1] scaling), and jpeg encoded.
"""
ims /= 2.0
ims += 0.5
ims *= 255.0
ims = tf.clip_by_value(ims, 0, 255)
ims = tf.cast(ims, tf.uint8)
im_strings = tf.map_fn(
lambda x: tf.image.encode_jpeg(x, format='rgb', quality=100),
ims, dtype=tf.string)
return im_strings