Commit cbd571f2 authored by Corey Lynch

Adding TCN.

parent 69cf6fca
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SVTCN estimator implementation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import data_providers
import model as model_module
from estimators import base_estimator
from estimators import svtcn_loss
import tensorflow as tf
class SVTCNEstimator(base_estimator.BaseEstimator):
"""Single-view TCN Estimator base class."""
def __init__(self, config, logdir):
super(SVTCNEstimator, self).__init__(config, logdir)
def construct_input_fn(self, records, is_training):
"""See base class."""
config = self._config
num_views = config.data.num_views
num_parallel_calls = config.data.num_parallel_calls
sequence_prefetch_size = config.data.sequence_prefetch_size
batch_prefetch_size = config.data.batch_prefetch_size
def input_fn():
"""Provides input to SVTCN models."""
(images_preprocessed,
images_raw,
timesteps) = data_providers.singleview_tcn_provider(
file_list=records,
preprocess_fn=self.preprocess_data,
num_views=num_views,
is_training=is_training,
batch_size=self._batch_size,
num_parallel_calls=num_parallel_calls,
sequence_prefetch_size=sequence_prefetch_size,
batch_prefetch_size=batch_prefetch_size)
if config.logging.summary.image_summaries and is_training:
tf.summary.image('training/svtcn_images', images_raw)
features = {'batch_preprocessed': images_preprocessed}
return (features, timesteps)
return input_fn
def forward(self, images, is_training, reuse=False):
"""See base class."""
embedder_strategy = self._config.embedder_strategy
embedder = model_module.get_embedder(
embedder_strategy,
self._config,
images,
is_training=is_training, reuse=reuse)
embeddings = embedder.construct_embedding()
if is_training:
self.variables_to_train = embedder.get_trainable_variables()
self.pretrained_init_fn = embedder.init_fn
return embeddings
class SVTCNTripletEstimator(SVTCNEstimator):
"""Single-View TCN with semihard triplet loss."""
def __init__(self, config, logdir):
super(SVTCNTripletEstimator, self).__init__(config, logdir)
def define_loss(self, embeddings, timesteps, is_training):
"""See base class."""
pos_radius = self._config.svtcn.pos_radius
neg_radius = self._config.svtcn.neg_radius
margin = self._config.triplet_semihard.margin
loss = svtcn_loss.singleview_tcn_loss(
embeddings, timesteps, pos_radius, neg_radius, margin=margin)
self._loss = loss
if is_training:
tf.summary.scalar('training/svtcn_loss', loss)
return loss
def define_eval_metric_ops(self):
"""See base class."""
return {'validation/svtcn_loss': tf.metrics.mean(self._loss)}
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This implements single view TCN triplet loss."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def pairwise_squared_distance(feature):
"""Computes the squared pairwise distance matrix.
output[i, j] = || feature[i, :] - feature[j, :] ||_2^2
Args:
feature: 2-D Tensor of size [number of data, feature dimension]
Returns:
pairwise_squared_distances: 2-D Tensor of size
[number of data, number of data]
"""
pairwise_squared_distances = tf.add(
tf.reduce_sum(
tf.square(feature), axis=1, keep_dims=True),
tf.reduce_sum(
tf.square(tf.transpose(feature)), axis=0,
keep_dims=True)) - 2.0 * tf.matmul(feature, tf.transpose(feature))
# Deal with numerical inaccuracies. Set small negatives to zero.
pairwise_squared_distances = tf.maximum(pairwise_squared_distances, 0.0)
return pairwise_squared_distances
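# Illustrative sketch, not part of the library: a NumPy check of the
# ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b expansion that
# pairwise_squared_distance relies on.
import numpy as np
feature = np.random.rand(4, 3).astype(np.float32)
expanded = (np.sum(feature ** 2, axis=1, keepdims=True)
            + np.sum(feature ** 2, axis=1)
            - 2.0 * feature.dot(feature.T))
brute_force = np.array(
    [[np.sum((a - b) ** 2) for b in feature] for a in feature])
assert np.allclose(expanded, brute_force, atol=1e-5)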
def masked_maximum(data, mask, dim=1):
"""Computes the axis wise maximum over chosen elements.
Args:
data: N-D Tensor.
mask: N-D Tensor of zeros or ones.
dim: The dimension over which to compute the maximum.
Returns:
masked_maximums: N-D Tensor.
The maximized dimension is of size 1 after the operation.
"""
axis_minimums = tf.reduce_min(data, dim, keep_dims=True)
masked_maximums = tf.reduce_max(
tf.multiply(
data - axis_minimums, mask), dim, keep_dims=True) + axis_minimums
return masked_maximums
def masked_minimum(data, mask, dim=1):
"""Computes the axis wise minimum over chosen elements.
Args:
data: 2-D Tensor of size [n, m].
mask: 2-D Boolean Tensor of size [n, m].
dim: The dimension over which to compute the minimum.
Returns:
masked_minimums: N-D Tensor.
The minimized dimension is of size 1 after the operation.
"""
axis_maximums = tf.reduce_max(data, dim, keep_dims=True)
masked_minimums = tf.reduce_min(
tf.multiply(
data - axis_maximums, mask), dim, keep_dims=True) + axis_maximums
return masked_minimums
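# Illustrative sketch, not part of the library: the masked_minimum trick in
# NumPy. Subtracting the row maximum makes every shifted value non-positive,
# so masked-out entries (multiplied to zero) can never win the min; adding the
# maximum back restores the original scale.
import numpy as np
data = np.array([[1.0, 5.0, 3.0]])
mask = np.array([[0.0, 1.0, 1.0]])  # only columns 1 and 2 are eligible
axis_max = data.max(axis=1, keepdims=True)  # 5.0
masked_min = ((data - axis_max) * mask).min(axis=1, keepdims=True) + axis_max
assert masked_min[0, 0] == 3.0  # the smallest value among unmasked entries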
def singleview_tcn_loss(
embeddings, timesteps, pos_radius, neg_radius, margin=1.0,
sequence_ids=None, multiseq=False):
"""Computes the single view triplet loss with semi-hard negative mining.
The loss encourages each positive distance (between a pair of embeddings
drawn from nearby timesteps in the same sequence) to be smaller, by at least
the margin, than the smallest negative distance that still exceeds it (the
semi-hard negative) in the mini-batch. If no such negative exists, the
largest negative distance is used instead.
Anchor, positive, negative selection is as follows:
Anchors: We consider every embedding timestep as an anchor.
Positives: pos_radius defines a radius (in timesteps) around each anchor from
which positives can be drawn. E.g. An anchor with t=10 and a pos_radius of
2 produces a set of 4 (anchor,pos) pairs [(a=10, p=8), ... (a=10, p=12)].
Negatives: neg_radius defines a boundary (in timesteps) around each anchor,
outside of which negatives can be drawn. E.g. An anchor with t=10 and a
neg_radius of 4 means negatives can be any t_neg where t_neg < 6 or
t_neg > 14.
Args:
embeddings: 2-D Tensor of embedding vectors.
timesteps: 1-D Tensor with shape [batch_size, 1] of sequence timesteps.
pos_radius: int32; the size of the window (in timesteps) around each anchor
timestep that a positive can be drawn from.
neg_radius: int32; the size of the window (in timesteps) around each anchor
timestep that defines a negative boundary. Negatives can only be chosen
where negative timestep t is < negative boundary min or > negative
boundary max.
margin: Float; the triplet loss margin hyperparameter.
sequence_ids: (Optional) 1-D Tensor with shape [batch_size, 1] of sequence
ids. Together (sequence_id, sequence_timestep) give us a unique index for
each image if we have multiple sequences in a batch.
multiseq: Boolean, whether or not the batch is composed of multiple
sequences (with possibly colliding timesteps).
Returns:
triplet_loss: tf.float32 scalar.
"""
assert neg_radius > pos_radius
# If timesteps shape isn't [batch_size, 1], reshape to [batch_size, 1].
tshape = tf.shape(timesteps)
assert tshape.shape == 2 or tshape.shape == 1
if tshape.shape == 1:
timesteps = tf.reshape(timesteps, [tshape[0], 1])
# Build pairwise squared distance matrix.
pdist_matrix = pairwise_squared_distance(embeddings)
# Build pairwise binary adjacency matrix, where adjacency[i,j] is True
# if timestep j is inside the positive range for timestep i and both
# timesteps come from the same sequence.
pos_radius = tf.cast(pos_radius, tf.int32)
if multiseq:
# If sequence_ids shape isn't [batch_size, 1], reshape to [batch_size, 1].
tshape = tf.shape(sequence_ids)
assert tshape.shape == 2 or tshape.shape == 1
if tshape.shape == 1:
sequence_ids = tf.reshape(sequence_ids, [tshape[0], 1])
# Build pairwise binary adjacency matrix based on sequence_ids
sequence_adjacency = tf.equal(sequence_ids, tf.transpose(sequence_ids))
# Invert so we can select negatives only.
sequence_adjacency_not = tf.logical_not(sequence_adjacency)
in_pos_range = tf.logical_and(
tf.less_equal(
tf.abs(timesteps - tf.transpose(timesteps)), pos_radius),
sequence_adjacency)
# Build pairwise binary discordance matrix, where discordance[i,j] is True
# if timestep j is inside the negative range for timestep i or if the
# timesteps come from different sequences.
in_neg_range = tf.logical_or(
tf.greater(tf.abs(timesteps - tf.transpose(timesteps)), neg_radius),
sequence_adjacency_not
)
else:
in_pos_range = tf.less_equal(
tf.abs(timesteps - tf.transpose(timesteps)), pos_radius)
in_neg_range = tf.greater(tf.abs(timesteps - tf.transpose(timesteps)),
neg_radius)
batch_size = tf.size(timesteps)
# compute the mask
pdist_matrix_tile = tf.tile(pdist_matrix, [batch_size, 1])
mask = tf.logical_and(
tf.tile(in_neg_range, [batch_size, 1]),
tf.greater(pdist_matrix_tile,
tf.reshape(tf.transpose(pdist_matrix), [-1, 1])))
mask_final = tf.reshape(
tf.greater(
tf.reduce_sum(
tf.cast(
mask, dtype=tf.float32), 1, keep_dims=True),
0.0), [batch_size, batch_size])
mask_final = tf.transpose(mask_final)
in_neg_range = tf.cast(in_neg_range, dtype=tf.float32)
mask = tf.cast(mask, dtype=tf.float32)
# negatives_outside: smallest D_an where D_an > D_ap
negatives_outside = tf.reshape(
masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
negatives_outside = tf.transpose(negatives_outside)
# negatives_inside: largest D_an
negatives_inside = tf.tile(
masked_maximum(pdist_matrix, in_neg_range), [1, batch_size])
semi_hard_negatives = tf.where(
mask_final, negatives_outside, negatives_inside)
loss_mat = tf.add(margin, pdist_matrix - semi_hard_negatives)
mask_positives = tf.cast(
in_pos_range, dtype=tf.float32) - tf.diag(tf.ones([batch_size]))
# In lifted-struct, the authors multiply 0.5 for upper triangular
# in semihard, they take all positive pairs except the diagonal.
num_positives = tf.reduce_sum(mask_positives)
triplet_loss = tf.truediv(
tf.reduce_sum(tf.maximum(tf.multiply(loss_mat, mask_positives), 0.0)),
num_positives,
name='triplet_svtcn_loss')
return triplet_loss
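# Illustrative sketch, not part of the library: how the positive / negative
# range masks used above look for a toy single-sequence batch. With
# pos_radius=1 and neg_radius=2, the anchor at t=2 treats timesteps {1, 2, 3}
# as in positive range (the anchor itself is removed later via the diagonal)
# and only t=5 (|dt| > 2) as a candidate negative.
import numpy as np
timesteps = np.arange(6, dtype=np.int32).reshape(6, 1)
dt = np.abs(timesteps - timesteps.T)
in_pos_range = dt <= 1  # pos_radius
in_neg_range = dt > 2   # neg_radius
print(in_pos_range[2])  # [False  True  True  True False False]
print(in_neg_range[2])  # [False False False False False  True]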
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for svtcn_loss.py."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from estimators import svtcn_loss
import tensorflow as tf
class SVTCNLoss(tf.test.TestCase):
def testSVTCNLoss(self):
with self.test_session():
num_data = 64
num_sequences = 2
num_data_per_seq = num_data // num_sequences
feat_dim = 6
margin = 1.0
times = np.tile(np.arange(num_data_per_seq, dtype=np.int32),
num_sequences)
times = np.reshape(times, [times.shape[0], 1])
sequence_ids = np.concatenate(
[np.ones(num_data_per_seq)*i for i in range(num_sequences)])
sequence_ids = np.reshape(sequence_ids, [sequence_ids.shape[0], 1])
pos_radius = 6
neg_radius = 12
embedding = np.random.rand(num_data, feat_dim).astype(np.float32)
# Compute the loss in NP
# Get a positive mask, i.e. indices for each time index
# that are inside the positive range.
in_pos_range = np.less_equal(
np.abs(times - times.transpose()), pos_radius)
# Get a negative mask, i.e. indices for each time index
# that are inside the negative range (> t + neg_radius or
# < t - neg_radius).
in_neg_range = np.greater(np.abs(times - times.transpose()), neg_radius)
sequence_adjacency = sequence_ids == sequence_ids.T
sequence_adjacency_not = np.logical_not(sequence_adjacency)
pdist_matrix = euclidean_distances(embedding, squared=True)
loss_np = 0.0
num_positives = 0.0
for i in range(num_data):
for j in range(num_data):
if in_pos_range[i, j] and i != j and sequence_adjacency[i, j]:
num_positives += 1.0
pos_distance = pdist_matrix[i][j]
neg_distances = []
for k in range(num_data):
if in_neg_range[i, k] or sequence_adjacency_not[i, k]:
neg_distances.append(pdist_matrix[i][k])
neg_distances.sort() # sort by distance
chosen_neg_distance = neg_distances[0]
for l in range(len(neg_distances)):
chosen_neg_distance = neg_distances[l]
if chosen_neg_distance > pos_distance:
break
loss_np += np.maximum(
0.0, margin - chosen_neg_distance + pos_distance)
loss_np /= num_positives
# Compute the loss in TF
loss_tf = svtcn_loss.singleview_tcn_loss(
embeddings=tf.convert_to_tensor(embedding),
timesteps=tf.convert_to_tensor(times),
pos_radius=pos_radius,
neg_radius=neg_radius,
margin=margin,
sequence_ids=tf.convert_to_tensor(sequence_ids),
multiseq=True
)
loss_tf = loss_tf.eval()
self.assertAllClose(loss_np, loss_tf)
if __name__ == '__main__':
tf.test.main()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Calculates running validation of TCN models (and baseline comparisons)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from estimators.get_estimator import get_estimator
from utils import util
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
Path to YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master to use')
tf.app.flags.DEFINE_string(
'logdir', '/tmp/tcn', 'Directory where to write event logs.')
FLAGS = tf.app.flags.FLAGS
def main(_):
"""Runs main eval loop."""
# Parse config dict from yaml config files / command line flags.
logdir = FLAGS.logdir
config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
# Choose an estimator based on training strategy.
estimator = get_estimator(config, logdir)
# Wait for the first checkpoint file to be written.
while not tf.train.latest_checkpoint(logdir):
tf.logging.info('Waiting for a checkpoint file...')
time.sleep(10)
# Run validation.
while True:
estimator.evaluate()
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Generates imitation videos.
Generate single pairwise imitation videos:
blaze build -c opt --config=cuda --copt=-mavx \
learning/brain/research/tcn/generate_videos && \
blaze-bin/learning/brain/research/tcn/generate_videos \
--logtostderr \
--config_paths $config_paths \
--checkpointdir $checkpointdir \
--checkpoint_iter $checkpoint_iter \
--query_records_dir $query_records_dir \
--target_records_dir $target_records_dir \
--outdir $outdir \
--mode single \
--num_query_sequences 1 \
--num_target_sequences -1
# Generate imitation videos with multiple sequences in the target set:
blaze build -c opt --config=cuda --copt=-mavx \
learning/brain/research/tcn/generate_videos && \
blaze-bin/learning/brain/research/tcn/generate_videos \
--logtostderr \
--config_paths $config_paths \
--checkpointdir $checkpointdir \
--checkpoint_iter $checkpoint_iter \
--query_records_dir $query_records_dir \
--target_records_dir $target_records_dir \
--outdir $outdir \
--num_multi_targets 1 \
--mode multi
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import tensorflow as tf
import os
import matplotlib
matplotlib.use("pdf")
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
from estimators.get_estimator import get_estimator
from utils import util
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
Path to YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string(
'checkpointdir', '/tmp/tcn', 'Path to model checkpoints.')
tf.app.flags.DEFINE_string(
'checkpoint_iter', '', 'Checkpoint iter to use.')
tf.app.flags.DEFINE_integer(
'num_multi_targets', -1,
'Number of imitation vids in the target set per imitation video.')
tf.app.flags.DEFINE_string(
'outdir', '/tmp/tcn', 'Path to write embeddings to.')
tf.app.flags.DEFINE_string(
'mode', 'single', 'single | multi. Single means generate imitation vids '
'where query is being imitated by a single sequence. Multi '
'means generate imitation vids where query is being '
'imitated by multiple.')
tf.app.flags.DEFINE_string('query_records_dir', '',
'Directory of image tfrecords.')
tf.app.flags.DEFINE_string('target_records_dir', '',
'Directory of image tfrecords.')
tf.app.flags.DEFINE_integer('query_view', 1,
'Viewpoint of the query video.')
tf.app.flags.DEFINE_integer('target_view', 0,
'Viewpoint of the imitation video.')
tf.app.flags.DEFINE_integer('smoothing_window', 5,
'Number of frames to smooth over.')
tf.app.flags.DEFINE_integer('num_query_sequences', -1,
'Number of query sequences to embed.')
tf.app.flags.DEFINE_integer('num_target_sequences', -1,
'Number of target sequences to embed.')
FLAGS = tf.app.flags.FLAGS
def SmoothEmbeddings(embs):
"""Temporally smoothes a sequence of embeddings."""
new_embs = []
window = int(FLAGS.smoothing_window)
for i in range(len(embs)):
min_i = max(i-window, 0)
max_i = min(i+window, len(embs))
new_embs.append(np.mean(embs[min_i:max_i, :], axis=0))
return np.array(new_embs)
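# Illustrative sketch, not part of the library: SmoothEmbeddings applied to a
# toy 1-D "embedding" ramp with a smoothing window of 2. Each output is the
# mean of embs[max(i - 2, 0):min(i + 2, len(embs))], i.e. a sliding window
# that is exclusive on the right.
import numpy as np
embs = np.arange(6, dtype=np.float32).reshape(6, 1)
window = 2
smoothed = np.array(
    [np.mean(embs[max(i - window, 0):min(i + window, len(embs)), :], axis=0)
     for i in range(len(embs))])
print(smoothed.ravel())  # [0.5 1.  1.5 2.5 3.5 4. ]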
def MakeImitationVideo(
outdir, vidname, query_im_strs, knn_im_strs, height=640, width=360):
"""Creates a KNN imitation video.
For each frame in vid0, pair with the frame at index in knn_indices in
vids1. Write video to disk.
Args:
outdir: String, directory to write videos.
vidname: String, name of video.
query_im_strs: Numpy array holding query image strings.
knn_im_strs: Numpy array holding knn image strings.
height: Int, height of raw images.
width: Int, width of raw images.
"""
if not tf.gfile.Exists(outdir):
tf.gfile.MakeDirs(outdir)
vid_path = os.path.join(outdir, vidname)
combined = zip(query_im_strs, knn_im_strs)
# Create and write the video.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_aspect('equal')
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
im = ax.imshow(
np.zeros((height, width*2, 3)), cmap='gray', interpolation='nearest')
im.set_clim([0, 1])
plt.tight_layout(pad=0, w_pad=0, h_pad=0)
# pylint: disable=invalid-name
def update_img(pair):
"""Decode pairs of image strings, update a video."""
im_i, im_j = pair
nparr_i = np.fromstring(str(im_i), np.uint8)
img_np_i = cv2.imdecode(nparr_i, 1)
img_np_i = img_np_i[..., [2, 1, 0]]
nparr_j = np.fromstring(str(im_j), np.uint8)
img_np_j = cv2.imdecode(nparr_j, 1)
img_np_j = img_np_j[..., [2, 1, 0]]
# Optionally reshape the images to be same size.
frame = np.concatenate([img_np_i, img_np_j], axis=1)
im.set_data(frame)
return im
ani = animation.FuncAnimation(fig, update_img, combined, interval=15)
writer = animation.writers['ffmpeg'](fps=15)
dpi = 100
tf.logging.info('Writing video to:\n %s \n' % vid_path)
ani.save('%s.mp4' % vid_path, writer=writer, dpi=dpi)
def GenerateImitationVideo(
vid_name, query_ims, query_embs, target_ims, target_embs, height, width):
"""Generates a single cross-sequence imitation video.
For each frame in some query sequence, find the nearest neighbor from
some target sequence in embedding space.
Args:
vid_name: String, the name of the video.
query_ims: Numpy array of shape [query sequence length, height, width, 3].
query_embs: Numpy array of shape [query sequence length, embedding size].
target_ims: Numpy array of shape [target sequence length, height, width,
3].
target_embs: Numpy array of shape [target sequence length, embedding
size].
height: Int, height of the raw image.
width: Int, width of the raw image.
"""
# For each query frame, find the index of the nearest neighbor in the
# target video.
knn_indices = [util.KNNIds(q, target_embs, k=1)[0] for q in query_embs]
# Create and write out the video.
assert knn_indices
knn_ims = np.array([target_ims[k] for k in knn_indices])
MakeImitationVideo(FLAGS.outdir, vid_name, query_ims, knn_ims, height, width)
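# Illustrative sketch, not part of the library: a plain NumPy stand-in for the
# nearest-neighbor lookup that util.KNNIds presumably performs (assumed here
# to return the indices of the k closest target embeddings by Euclidean
# distance), shown only to clarify how each query frame is paired with a
# target frame above.
import numpy as np
def knn_ids_sketch(query_emb, target_embs, k=1):
  # Squared Euclidean distance from one query embedding to every target.
  dists = np.sum((target_embs - query_emb) ** 2, axis=1)
  return list(np.argsort(dists)[:k])
query_embs = np.random.rand(4, 8)
target_embs = np.random.rand(10, 8)
knn_indices = [knn_ids_sketch(q, target_embs, k=1)[0] for q in query_embs]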
def SingleImitationVideos(
query_records, target_records, config, height, width):
"""Generates pairwise imitation videos.
This creates all pairs of target imitating query videos, where each frame
on the left is matched to a nearest neighbor coming from a single
embedded target video.
Args:
query_records: List of Strings, paths to tfrecord datasets to use as
queries.
target_records: List of Strings, paths to tfrecord datasets to use as
targets.
config: A T object describing training config.
height: Int, height of the raw image.
width: Int, width of the raw image.
"""
# Embed query and target data.
(query_sequences_to_data,
target_sequences_to_data) = EmbedQueryTargetData(
query_records, target_records, config)
qview = FLAGS.query_view
tview = FLAGS.target_view
# Loop over query videos.
for task_i, data_i in query_sequences_to_data.iteritems():
for task_j, data_j in target_sequences_to_data.iteritems():
i_ims = data_i['images']
i_embs = data_i['embeddings']
query_embs = SmoothEmbeddings(i_embs[qview])
query_ims = i_ims[qview]
j_ims = data_j['images']
j_embs = data_j['embeddings']
target_embs = SmoothEmbeddings(j_embs[tview])
target_ims = j_ims[tview]
tf.logging.info('Generating %s imitating %s video.' % (task_j, task_i))
vid_name = 'q%sv%s_im%sv%s' % (task_i, qview, task_j, tview)
vid_name = vid_name.replace('/', '_')
GenerateImitationVideo(vid_name, query_ims, query_embs,
target_ims, target_embs, height, width)
def MultiImitationVideos(
query_records, target_records, config, height, width):
"""Creates multi-imitation videos.
This creates videos where every frame on the left is matched to a nearest
neighbor coming from a set of multiple embedded target videos.
Args:
query_records: List of Strings, paths to tfrecord datasets to use as
queries.
target_records: List of Strings, paths to tfrecord datasets to use as
targets.
config: A T object describing training config.
height: Int, height of the raw image.
width: Int, width of the raw image.
"""
# Embed query and target data.
(query_sequences_to_data,
target_sequences_to_data) = EmbedQueryTargetData(
query_records, target_records, config)
qview = FLAGS.query_view
tview = FLAGS.target_view
# Loop over query videos.
for task_i, data_i in query_sequences_to_data.iteritems():
i_ims = data_i['images']
i_embs = data_i['embeddings']
query_embs = SmoothEmbeddings(i_embs[qview])
query_ims = i_ims[qview]
all_target_embs = []
all_target_ims = []
# If num_multi_targets is -1, add all seq embeddings to the target set.
if FLAGS.num_multi_targets == -1:
num_multi_targets = len(target_sequences_to_data)
else:
# Else, add some specified number of seq embeddings to the target set.
num_multi_targets = FLAGS.num_multi_targets
for j in range(num_multi_targets):
task_j = target_sequences_to_data.keys()[j]
data_j = target_sequences_to_data[task_j]
print('Adding %s to target set' % task_j)
j_ims = data_j['images']
j_embs = data_j['embeddings']
target_embs = SmoothEmbeddings(j_embs[tview])
target_ims = j_ims[tview]
all_target_embs.extend(target_embs)
all_target_ims.extend(target_ims)
# Generate a "j imitating i" video.
tf.logging.info('Generating all imitating %s video.' % task_i)
vid_name = 'q%sv%s_multiv%s' % (task_i, qview, tview)
vid_name = vid_name.replace('/', '_')
GenerateImitationVideo(vid_name, query_ims, query_embs,
all_target_ims, all_target_embs, height, width)
def SameSequenceVideos(query_records, config, height, width):
"""Generate same sequence, cross-view imitation videos."""
batch_size = config.data.embed_batch_size
# Choose an estimator based on training strategy.
estimator = get_estimator(config, FLAGS.checkpointdir)
# Choose a checkpoint path to restore.
checkpointdir = FLAGS.checkpointdir
checkpoint_path = os.path.join(checkpointdir,
'model.ckpt-%s' % FLAGS.checkpoint_iter)
# Embed num_sequences query sequences, store embeddings and image strings in
# query_sequences_to_data.
sequences_to_data = {}
for (view_embeddings, view_raw_image_strings, seqname) in estimator.inference(
query_records, checkpoint_path, batch_size,
num_sequences=FLAGS.num_query_sequences):
sequences_to_data[seqname] = {
'embeddings': view_embeddings,
'images': view_raw_image_strings,
}
# Loop over query videos.
qview = FLAGS.query_view
tview = FLAGS.target_view
for task_i, data_i in sequences_to_data.iteritems():
ims = data_i['images']
embs = data_i['embeddings']
query_embs = SmoothEmbeddings(embs[qview])
query_ims = ims[qview]
target_embs = SmoothEmbeddings(embs[tview])
target_ims = ims[tview]
tf.logging.info('Generating %s imitating %s video.' % (task_i, task_i))
vid_name = 'q%sv%s_im%sv%s' % (task_i, qview, task_i, tview)
vid_name = vid_name.replace('/', '_')
GenerateImitationVideo(vid_name, query_ims, query_embs,
target_ims, target_embs, height, width)
def EmbedQueryTargetData(query_records, target_records, config):
"""Embeds the full set of query_records and target_records.
Args:
query_records: List of Strings, paths to tfrecord datasets to use as
queries.
target_records: List of Strings, paths to tfrecord datasets to use as
targets.
config: A T object describing training config.
Returns:
query_sequences_to_data: A dict holding 'embeddings' and 'images'
target_sequences_to_data: A dict holding 'embeddings' and 'images'
"""
batch_size = config.data.embed_batch_size
# Choose an estimator based on training strategy.
estimator = get_estimator(config, FLAGS.checkpointdir)
# Choose a checkpoint path to restore.
checkpointdir = FLAGS.checkpointdir
checkpoint_path = os.path.join(checkpointdir,
'model.ckpt-%s' % FLAGS.checkpoint_iter)
# Embed num_sequences query sequences, store embeddings and image strings in
# query_sequences_to_data.
num_query_sequences = FLAGS.num_query_sequences
num_target_sequences = FLAGS.num_target_sequences
query_sequences_to_data = {}
for (view_embeddings, view_raw_image_strings, seqname) in estimator.inference(
query_records, checkpoint_path, batch_size,
num_sequences=num_query_sequences):
query_sequences_to_data[seqname] = {
'embeddings': view_embeddings,
'images': view_raw_image_strings,
}
if (query_records == target_records) and (
num_query_sequences == num_target_sequences):
target_sequences_to_data = query_sequences_to_data
else:
# Embed num_sequences target sequences, store embeddings and image strings
# in sequences_to_data.
target_sequences_to_data = {}
for (view_embeddings, view_raw_image_strings,
seqname) in estimator.inference(
target_records, checkpoint_path, batch_size,
num_sequences=num_target_sequences):
target_sequences_to_data[seqname] = {
'embeddings': view_embeddings,
'images': view_raw_image_strings,
}
return query_sequences_to_data, target_sequences_to_data
def main(_):
# Parse config dict from yaml config files / command line flags.
config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
# Get tables to embed.
query_records_dir = FLAGS.query_records_dir
query_records = util.GetFilesRecursively(query_records_dir)
target_records_dir = FLAGS.target_records_dir
target_records = util.GetFilesRecursively(target_records_dir)
height = config.data.raw_height
width = config.data.raw_width
mode = FLAGS.mode
if mode == 'multi':
# Generate videos where target set is composed of multiple videos.
MultiImitationVideos(query_records, target_records, config,
height, width)
elif mode == 'single':
# Generate videos where target set is a single video.
SingleImitationVideos(query_records, target_records, config,
height, width)
elif mode == 'same':
# Generate videos where target set is the same as query, but diff view.
SameSequenceVideos(query_records, config, height, width)
else:
raise ValueError('Unknown mode %s' % mode)
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generates test Recall@K statistics on labeled classification problems."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import os
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import data_providers
from estimators.get_estimator import get_estimator
from utils import util
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
Path to YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string(
'mode', 'validation',
'Which dataset to evaluate: `validation` | `test`.')
tf.app.flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master to use')
tf.app.flags.DEFINE_string(
'checkpoint_iter', '', 'Evaluate this specific checkpoint.')
tf.app.flags.DEFINE_string(
'checkpointdir', '/tmp/tcn', 'Path to model checkpoints.')
tf.app.flags.DEFINE_string('outdir', '/tmp/tcn', 'Path to write summaries to.')
FLAGS = tf.app.flags.FLAGS
def nearest_cross_sequence_neighbors(data, tasks, n_neighbors=1):
"""Computes the n_neighbors nearest neighbors for every row in data.
Args:
data: A np.float32 array of shape [num_data, embedding size] holding
an embedded validation / test dataset.
tasks: A list of strings of size [num_data] holding the task or sequence
name that each row belongs to.
n_neighbors: The number of knn indices to return for each row.
Returns:
indices: an np.int32 array of size [num_data, n_neighbors] holding the
n_neighbors nearest indices for every row in data. These are
restricted to be from different named sequences (as defined in `tasks`).
"""
# Compute the pairwise sequence adjacency matrix from `tasks`.
num_data = data.shape[0]
tasks = np.array(tasks)
tasks = np.reshape(tasks, (num_data, 1))
assert len(tasks.shape) == 2
not_adjacent = (tasks != tasks.T)
# Compute the symmetric pairwise distance matrix.
pdist = pairwise_distances(data, metric='sqeuclidean')
# For every row in the pairwise distance matrix, only consider
# cross-sequence columns.
indices = np.zeros((num_data, n_neighbors), dtype=np.int32)
for idx in range(num_data):
# Restrict to cross_sequence neighbors.
distances = [(
pdist[idx][i], i) for i in xrange(num_data) if not_adjacent[idx][i]]
_, nearest_indices = zip(*sorted(
distances, key=lambda x: x[0])[:n_neighbors])
indices[idx] = nearest_indices
return indices
def compute_cross_sequence_recall_at_k(retrieved_labels, labels, k_list):
"""Compute recall@k for a given list of k values.
Recall is one if an example of the same class is retrieved among the
top k nearest neighbors of a query example, and zero otherwise.
Averaging this over all examples (per class, then across classes) gives
the recall@k score.
Args:
retrieved_labels: 2-D Numpy array of KNN labels for every embedding.
labels: 1-D Numpy array of shape [number of data].
k_list: List of k values to evaluate recall@k.
Returns:
recall_list: List of recall@k values.
"""
kvalue_to_recall = dict(zip(k_list, np.zeros(len(k_list))))
# For each value of K.
for k in k_list:
matches = defaultdict(float)
counts = defaultdict(float)
# For each (row index, label value) in the query labels.
for i, label_value in enumerate(labels):
# Loop over the K nearest retrieved labels.
if label_value in retrieved_labels[i][:k]:
matches[label_value] += 1.
# Increment the denominator.
counts[label_value] += 1.
kvalue_to_recall[k] = np.mean(
[matches[l]/counts[l] for l in matches])
return [kvalue_to_recall[i] for i in k_list]
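# Illustrative sketch, not part of the library: a tiny worked example of the
# recall@k computation above. Two queries of class 0 and one of class 1; class
# 0 is only retrieved at rank 2 for its second query, so the class-averaged
# recall rises from 0.75 at k=1 to 1.0 at k=2.
import numpy as np
toy_labels = np.array([0, 0, 1])
toy_retrieved = np.array([[0, 1],   # query 0: correct label at rank 1
                          [1, 0],   # query 1: correct label only at rank 2
                          [1, 0]])  # query 2: correct label at rank 1
print(compute_cross_sequence_recall_at_k(toy_retrieved, toy_labels, [1, 2]))
# [0.75, 1.0]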
def compute_cross_sequence_recalls_at_k(
embeddings, labels, label_attr_keys, tasks, k_list, summary_writer,
training_step):
"""Computes and reports the recall@k for each classification problem.
This takes an embedding matrix and an array of multiclass labels
with size [num_data, number of classification problems], then
computes the average recall@k for each classification problem
as well as the average across problems.
Args:
embeddings: A np.float32 array of size [num_data, embedding_size]
representing the embedded validation or test dataset.
labels: A np.int32 array of size [num_data, num_classification_problems]
holding multiclass labels for each embedding for each problem.
label_attr_keys: List of strings, holds the names of the classification
problems.
tasks: A list of strings describing the video sequence each row
belongs to. This is used to restrict the recall@k computation
to cross-sequence examples.
k_list: A list of ints, the k values to evaluate recall@k.
summary_writer: A tf.summary.FileWriter.
training_step: Int, the current training step we're evaluating.
"""
num_data = float(embeddings.shape[0])
assert labels.shape[0] == num_data
# Compute knn indices.
indices = nearest_cross_sequence_neighbors(
embeddings, tasks, n_neighbors=max(k_list))
retrieved_labels = labels[indices]
# Compute the recall@k for each classification problem.
recall_lists = []
for idx, label_attr in enumerate(label_attr_keys):
problem_labels = labels[:, idx]
# Take all indices, all k labels for the problem indexed by idx.
problem_retrieved = retrieved_labels[:, :, idx]
recall_list = compute_cross_sequence_recall_at_k(
retrieved_labels=problem_retrieved,
labels=problem_labels,
k_list=k_list)
recall_lists.append(recall_list)
for (k, recall) in zip(k_list, recall_list):
recall_error = 1-recall
summ = tf.Summary(value=[tf.Summary.Value(
tag='validation/classification/%s error@top%d' % (
label_attr, k),
simple_value=recall_error)])
print('%s error@top%d' % (label_attr, k), recall_error)
summary_writer.add_summary(summ, int(training_step))
# Report an average recall@k across problems.
recall_lists = np.array(recall_lists)
for i in range(recall_lists.shape[1]):
average_recall = np.mean(recall_lists[:, i])
recall_error = 1 - average_recall
summ = tf.Summary(value=[tf.Summary.Value(
tag='validation/classification/average error@top%d' % k_list[i],
simple_value=recall_error)])
print('Average error@top%d' % k_list[i], recall_error)
summary_writer.add_summary(summ, int(training_step))
def evaluate_once(
estimator, input_fn_by_view, batch_size, checkpoint_path,
label_attr_keys, embedding_size, num_views, k_list):
"""Compute the recall@k for a given checkpoint path.
Args:
estimator: an `Estimator` object to evaluate.
input_fn_by_view: An input_fn to an `Estimator's` predict method. Takes
a view index and returns a dict holding ops for getting raw images for
the view.
batch_size: Int, size of the labeled eval batch.
checkpoint_path: String, path to the specific checkpoint being evaluated.
label_attr_keys: A list of Strings, holding each attribute name.
embedding_size: Int, the size of the embedding.
num_views: Int, number of views in the dataset.
k_list: List of ints, list of K values to compute recall at K for.
"""
feat_matrix = np.zeros((0, embedding_size))
label_vect = np.zeros((0, len(label_attr_keys)))
tasks = []
eval_tensor_keys = ['embeddings', 'tasks', 'classification_labels']
# Iterate all views in the dataset.
for view_index in range(num_views):
# Set up a graph for embedding entire dataset.
predictions = estimator.inference(
input_fn_by_view(view_index), checkpoint_path,
batch_size, predict_keys=eval_tensor_keys)
# Enumerate predictions.
for i, p in enumerate(predictions):
if i % 100 == 0:
tf.logging.info('Embedded %d images for view %d' % (i, view_index))
label = p['classification_labels']
task = p['tasks']
embedding = p['embeddings']
# Collect (embedding, label, task) data.
feat_matrix = np.append(feat_matrix, [embedding], axis=0)
label_vect = np.append(label_vect, [label], axis=0)
tasks.append(task)
# Compute recall statistics.
ckpt_step = int(checkpoint_path.split('-')[-1])
summary_dir = os.path.join(FLAGS.outdir, 'labeled_eval_summaries')
summary_writer = tf.summary.FileWriter(summary_dir)
compute_cross_sequence_recalls_at_k(
feat_matrix, label_vect, label_attr_keys, tasks, k_list,
summary_writer, ckpt_step)
def get_labeled_tables(config):
"""Gets either labeled test or validation tables, based on flags."""
# Get a list of filenames corresponding to labeled data.
mode = FLAGS.mode
if mode == 'validation':
labeled_tables = util.GetFilesRecursively(config.data.labeled.validation)
elif mode == 'test':
labeled_tables = util.GetFilesRecursively(config.data.labeled.test)
else:
raise ValueError('Unknown dataset: %s' % mode)
return labeled_tables
def main(_):
"""Runs main labeled eval loop."""
# Parse config dict from yaml config files / command line flags.
config = util.ParseConfigsToLuaTable(FLAGS.config_paths, FLAGS.model_params)
# Choose an estimator based on training strategy.
checkpointdir = FLAGS.checkpointdir
estimator = get_estimator(config, checkpointdir)
# Get data configs.
image_attr_keys = config.data.labeled.image_attr_keys
label_attr_keys = config.data.labeled.label_attr_keys
embedding_size = config.embedding_size
num_views = config.data.num_views
k_list = config.val.recall_at_k_list
batch_size = config.data.batch_size
# Get either labeled validation or test tables.
labeled_tables = get_labeled_tables(config)
def input_fn_by_view(view_index):
"""Returns an input_fn for use with a tf.Estimator by view."""
def input_fn():
# Get raw labeled images.
(preprocessed_images, labels,
tasks) = data_providers.labeled_data_provider(
labeled_tables,
estimator.preprocess_data, view_index, image_attr_keys,
label_attr_keys, batch_size=batch_size)
return {
'batch_preprocessed': preprocessed_images,
'tasks': tasks,
'classification_labels': labels,
}, None
return input_fn
# If evaluating a specific checkpoint, do that.
if FLAGS.checkpoint_iter:
checkpoint_path = os.path.join(
'%s/model.ckpt-%s' % (checkpointdir, FLAGS.checkpoint_iter))
evaluate_once(
estimator, input_fn_by_view, batch_size, checkpoint_path,
label_attr_keys, embedding_size, num_views, k_list)
else:
for checkpoint_path in tf.contrib.training.checkpoints_iterator(
checkpointdir):
evaluate_once(
estimator, input_fn_by_view, batch_size, checkpoint_path,
label_attr_keys, embedding_size, num_views, k_list)
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tcn.labeled_eval."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import labeled_eval
import tensorflow as tf
class LabeledEvalTest(tf.test.TestCase):
def testNearestCrossSequenceNeighbors(self):
# Generate embeddings.
num_data = 64
embedding_size = 4
num_tasks = 8
n_neighbors = 2
data = np.random.randn(num_data, embedding_size)
tasks = np.repeat(range(num_tasks), num_data // num_tasks)
# Get nearest cross-sequence indices.
indices = labeled_eval.nearest_cross_sequence_neighbors(
data, tasks, n_neighbors=n_neighbors)
# Assert that no nearest neighbor indices come from the same task.
repeated_tasks = np.tile(np.reshape(tasks, (num_data, 1)), n_neighbors)
self.assertTrue(np.all(np.not_equal(repeated_tasks, tasks[indices])))
def testPerfectCrossSequenceRecall(self):
# Make sure cross-sequence recall@k returns 1.0 for near-duplicate features.
embeddings = np.random.randn(10, 2)
embeddings[5:, :] = 0.00001 + embeddings[:5, :]
tasks = np.repeat([0, 1], 5)
labels = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4])
# find k=1, k=2 nearest neighbors.
k_list = [1, 2]
# Compute knn indices.
indices = labeled_eval.nearest_cross_sequence_neighbors(
embeddings, tasks, n_neighbors=max(k_list))
retrieved_labels = labels[indices]
recall_list = labeled_eval.compute_cross_sequence_recall_at_k(
retrieved_labels=retrieved_labels,
labels=labels,
k_list=k_list)
self.assertTrue(np.allclose(
np.array(recall_list), np.array([1.0, 1.0])))
def testRelativeRecall(self):
# Make sure cross-sequence recall@k is non-decreasing over k.
num_data = 100
num_tasks = 10
embeddings = np.random.randn(100, 5)
tasks = np.repeat(range(num_tasks), num_data // num_tasks)
labels = np.random.randint(0, 5, 100)
k_list = [1, 2, 4, 8, 16, 32, 64]
indices = labeled_eval.nearest_cross_sequence_neighbors(
embeddings, tasks, n_neighbors=max(k_list))
retrieved_labels = labels[indices]
recall_list = labeled_eval.compute_cross_sequence_recall_at_k(
retrieved_labels=retrieved_labels,
labels=labels,
k_list=k_list)
recall_list_sorted = sorted(recall_list)
self.assertTrue(np.allclose(
np.array(recall_list), np.array(recall_list_sorted)))
if __name__ == "__main__":
tf.test.main()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model implementations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from abc import ABCMeta
from abc import abstractmethod
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.python.slim.nets import inception
from tensorflow.contrib.slim.python.slim.nets import resnet_v2 as resnet_v2
from tensorflow.contrib.slim.python.slim.nets import resnet_utils as resnet_utils
def get_embedder(
embedder_strategy, config, images, is_training, reuse=False,
l2_normalize_embedding=True):
"""Returns an embedder based on config.
Args:
embedder_strategy: String, name of embedder version to return.
config: LuaTable object, training config.
images: 4-D float `Tensor` containing batch images.
is_training: Boolean or placeholder for boolean,
indicator for whether or not we're training.
reuse: Boolean: Reuse embedder variable scope.
l2_normalize_embedding: Boolean, whether or not to l2 normalize the
embedding.
Returns:
embedder: An `Embedder` object.
Raises:
ValueError: if unknown embedder_strategy specified.
"""
if embedder_strategy == 'inception_baseline':
pretrained_ckpt = config.inception_conv_ss_fc.pretrained_checkpoint
return InceptionBaselineEmbedder(
images,
pretrained_ckpt,
# Pass these as keyword args so they don't bind to the `reuse` parameter.
random_projection=config.random_projection,
random_projection_dim=config.random_projection_dim)
strategy_to_embedder = {
'inception_conv_ss_fc': InceptionConvSSFCEmbedder,
'resnet': ResnetEmbedder,
}
if embedder_strategy not in strategy_to_embedder:
raise ValueError('unknown embedder_strategy', embedder_strategy)
embedding_size = config.embedding_size
l2_reg_weight = config.learning.l2_reg_weight
embedder = strategy_to_embedder[embedder_strategy](
config[embedder_strategy], images, embedding_size,
is_training, embedding_l2=l2_normalize_embedding,
l2_reg_weight=l2_reg_weight, reuse=reuse)
return embedder
def build_inceptionv3_graph(images, endpoint, is_training, checkpoint,
reuse=False):
"""Builds an InceptionV3 model graph.
Args:
images: A 4-D float32 `Tensor` of batch images.
endpoint: String, name of the InceptionV3 endpoint.
is_training: Boolean, whether or not to build a training or inference graph.
checkpoint: String, path to the pretrained model checkpoint.
reuse: Boolean, whether or not we are reusing the embedder.
Returns:
inception_output: `Tensor` holding the InceptionV3 output.
inception_variables: List of inception variables.
init_fn: Function to initialize the weights from the checkpoint (None when
reusing or not training).
"""
with slim.arg_scope(inception.inception_v3_arg_scope()):
_, endpoints = inception.inception_v3(
images, num_classes=1001, is_training=is_training)
inception_output = endpoints[endpoint]
inception_variables = slim.get_variables_to_restore()
inception_variables = [
i for i in inception_variables if 'global_step' not in i.name]
if is_training and not reuse:
init_saver = tf.train.Saver(inception_variables)
def init_fn(scaffold, sess):
del scaffold
init_saver.restore(sess, checkpoint)
else:
init_fn = None
return inception_output, inception_variables, init_fn
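# Illustrative wiring sketch, not part of this file: the returned init_fn has
# the (scaffold, session) signature expected by tf.train.Scaffold, so a
# training loop could restore the pretrained InceptionV3 weights roughly like
# this (the checkpoint path and train_op are hypothetical):
#
#   output, variables, init_fn = build_inceptionv3_graph(
#       images, 'Mixed_5d', is_training=True,
#       checkpoint='/path/to/inception_v3.ckpt')
#   scaffold = tf.train.Scaffold(init_fn=init_fn)
#   with tf.train.MonitoredTrainingSession(scaffold=scaffold) as sess:
#     sess.run(train_op)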
class InceptionBaselineEmbedder(object):
"""Produces pre-trained InceptionV3 embeddings."""
def __init__(self, images, pretrained_ckpt, reuse=False,
random_projection=False, random_projection_dim=32):
# Build InceptionV3 graph.
(inception_output,
self.inception_variables,
self.init_fn) = build_inceptionv3_graph(
images, 'Mixed_7c', False, pretrained_ckpt, reuse)
# Pool 8x8x2048 -> 1x1x2048.
embedding = slim.avg_pool2d(inception_output, [8, 8], stride=1)
embedding = tf.squeeze(embedding, [1, 2])
if random_projection:
embedding = tf.matmul(
embedding, tf.random_normal(
shape=[2048, random_projection_dim], seed=123))
self.embedding = embedding
class PretrainedEmbedder(object):
"""Base class for embedders that take pre-trained networks as input."""
__metaclass__ = ABCMeta
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
"""Constructor.
Args:
config: A T object holding training config.
images: A 4-D float32 `Tensor` holding images to embed.
embedding_size: Int, the size of the embedding.
is_training: Boolean, whether or not this is a training or inference-time
graph.
embedding_l2: Boolean, whether or not to l2 normalize the embedding.
l2_reg_weight: Float, weight applied to l2 weight regularization.
reuse: Boolean, whether or not we're reusing this graph.
"""
# Pull out all the embedder hyperparameters.
self._config = config
self._embedding_size = embedding_size
self._l2_reg_weight = l2_reg_weight
self._embedding_l2 = embedding_l2
self._is_training = is_training
self._reuse = reuse
# Pull out pretrained hparams.
pretrained_checkpoint = config.pretrained_checkpoint
pretrained_layer = config.pretrained_layer
pretrained_keep_prob = config.dropout.keep_pretrained
# Build pretrained graph.
(pretrained_output,
self._pretrained_variables,
self.init_fn) = self.build_pretrained_graph(
images, pretrained_layer, pretrained_checkpoint, is_training, reuse)
# Optionally drop out the activations.
pretrained_output = slim.dropout(
pretrained_output, keep_prob=pretrained_keep_prob,
is_training=is_training)
self._pretrained_output = pretrained_output
@abstractmethod
def build_pretrained_graph(self, images, layer, pretrained_checkpoint,
is_training, reuse):
"""Builds the graph for the pre-trained network.
Method to be overridden by implementations.
Args:
images: A 4-D tf.float32 `Tensor` holding images to embed.
layer: String, defining which pretrained layer to take as input
to adaptation layers.
pretrained_checkpoint: String, path to a checkpoint used to load
pretrained weights.
is_training: Boolean, whether or not we're in training mode.
reuse: Boolean, whether or not to reuse embedder weights.
Returns:
pretrained_output: A 2 or 3-d tf.float32 `Tensor` holding pretrained
activations.
"""
pass
@abstractmethod
def construct_embedding(self):
"""Builds an embedding function on top of images.
Method to be overridden by implementations.
Returns:
embeddings: A 2-d float32 `Tensor` of shape [batch_size, embedding_size]
holding the embedded images.
"""
pass
def get_trainable_variables(self):
"""Gets a list of variables to optimize."""
if self._config.finetune:
return tf.trainable_variables()
else:
adaptation_only_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope=self._adaptation_scope)
return adaptation_only_vars
class ResnetEmbedder(PretrainedEmbedder):
"""Resnet TCN.
ResnetV2 -> resnet adaptation layers -> optional l2 normalize -> embedding.
"""
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
super(ResnetEmbedder, self).__init__(
config, images, embedding_size, is_training, embedding_l2,
l2_reg_weight, reuse)
def build_pretrained_graph(
self, images, resnet_layer, checkpoint, is_training, reuse=False):
"""See baseclass."""
with slim.arg_scope(resnet_v2.resnet_arg_scope()):
_, endpoints = resnet_v2.resnet_v2_50(
images, is_training=is_training, reuse=reuse)
resnet_layer = 'resnet_v2_50/block%d' % resnet_layer
resnet_output = endpoints[resnet_layer]
resnet_variables = slim.get_variables_to_restore()
resnet_variables = [
i for i in resnet_variables if 'global_step' not in i.name]
if is_training and not reuse:
init_saver = tf.train.Saver(resnet_variables)
def init_fn(scaffold, sess):
del scaffold
init_saver.restore(sess, checkpoint)
else:
init_fn = None
return resnet_output, resnet_variables, init_fn
def construct_embedding(self):
"""Builds an embedding function on top of images.
Method to be overridden by implementations.
Returns:
embeddings: A 2-d float32 `Tensor` of shape [batch_size, embedding_size]
holding the embedded images.
"""
with tf.variable_scope('tcn_net', reuse=self._reuse) as vs:
self._adaptation_scope = vs.name
net = self._pretrained_output
# Define some adaptation blocks on top of the pre-trained resnet output.
adaptation_blocks = []
adaptation_block_params = [map(
int, i.split('_')) for i in self._config.adaptation_blocks.split('-')]
for i, (depth, num_units) in enumerate(adaptation_block_params):
block = resnet_v2.resnet_v2_block(
'adaptation_block_%d' % i, base_depth=depth, num_units=num_units,
stride=1)
adaptation_blocks.append(block)
# Stack them on top of the resnet output.
net = resnet_utils.stack_blocks_dense(
net, adaptation_blocks, output_stride=None)
# Average pool the output.
net = tf.reduce_mean(net, [1, 2], name='adaptation_pool', keep_dims=True)
if self._config.emb_connection == 'fc':
# Use fully connected layer to project to embedding layer.
fc_hidden_sizes = self._config.fc_hidden_sizes
if fc_hidden_sizes == 'None':
fc_hidden_sizes = []
else:
fc_hidden_sizes = map(int, fc_hidden_sizes.split('_'))
fc_hidden_keep_prob = self._config.dropout.keep_fc
net = tf.squeeze(net)
for fc_hidden_size in fc_hidden_sizes:
net = slim.layers.fully_connected(net, fc_hidden_size)
if fc_hidden_keep_prob < 1.0:
net = slim.dropout(net, keep_prob=fc_hidden_keep_prob,
is_training=self._is_training)
# Connect last FC layer to embedding.
embedding = slim.layers.fully_connected(net, self._embedding_size,
activation_fn=None)
else:
# Use 1x1 conv layer to project to embedding layer.
embedding = slim.conv2d(
net, self._embedding_size, [1, 1], activation_fn=None,
normalizer_fn=None, scope='embedding')
embedding = tf.squeeze(embedding)
# Optionally L2 normalize the embedding.
if self._embedding_l2:
embedding = tf.nn.l2_normalize(embedding, dim=1)
return embedding
def get_trainable_variables(self):
"""Gets a list of variables to optimize."""
if self._config.finetune:
return tf.trainable_variables()
else:
adaptation_only_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope=self._adaptation_scope)
return adaptation_only_vars
class InceptionEmbedderBase(PretrainedEmbedder):
"""Base class for embedders that take pre-trained InceptionV3 activations."""
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
super(InceptionEmbedderBase, self).__init__(
config, images, embedding_size, is_training, embedding_l2,
l2_reg_weight, reuse)
def build_pretrained_graph(
self, images, inception_layer, checkpoint, is_training, reuse=False):
"""See baseclass."""
# Build InceptionV3 graph.
inception_output, inception_variables, init_fn = build_inceptionv3_graph(
images, inception_layer, is_training, checkpoint, reuse)
return inception_output, inception_variables, init_fn
class InceptionConvSSFCEmbedder(InceptionEmbedderBase):
"""TCN Embedder V1.
InceptionV3 (mixed_5d) -> conv layers -> spatial softmax ->
fully connected -> optional l2 normalize -> embedding.
"""
def __init__(self, config, images, embedding_size, is_training,
embedding_l2=True, l2_reg_weight=1e-6, reuse=False):
super(InceptionConvSSFCEmbedder, self).__init__(
config, images, embedding_size, is_training, embedding_l2,
l2_reg_weight, reuse)
# Pull out all the hyperparameters specific to this embedder.
self._additional_conv_sizes = config.additional_conv_sizes
self._conv_hidden_keep_prob = config.dropout.keep_conv
self._fc_hidden_sizes = config.fc_hidden_sizes
self._fc_hidden_keep_prob = config.dropout.keep_fc
def construct_embedding(self):
"""Builds a conv -> spatial softmax -> FC adaptation network."""
is_training = self._is_training
normalizer_params = {'is_training': is_training}
with tf.variable_scope('tcn_net', reuse=self._reuse) as vs:
self._adaptation_scope = vs.name
with slim.arg_scope(
[slim.layers.conv2d],
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight),
biases_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight)):
with slim.arg_scope(
[slim.layers.fully_connected],
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm, normalizer_params=normalizer_params,
weights_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight),
biases_regularizer=slim.regularizers.l2_regularizer(
self._l2_reg_weight)):
# Input to embedder is pre-trained inception output.
net = self._pretrained_output
# Optionally add more conv layers.
for num_filters in self._additional_conv_sizes:
net = slim.layers.conv2d(
net, num_filters, kernel_size=[3, 3], stride=[1, 1])
net = slim.dropout(net, keep_prob=self._conv_hidden_keep_prob,
is_training=is_training)
# Take the spatial soft arg-max of the last convolutional layer.
# This is a form of spatial attention over the activations.
# See more here: http://arxiv.org/abs/1509.06113.
net = tf.contrib.layers.spatial_softmax(net)
self.spatial_features = net
# Add fully connected layers.
net = slim.layers.flatten(net)
for fc_hidden_size in self._fc_hidden_sizes:
net = slim.layers.fully_connected(net, fc_hidden_size)
if self._fc_hidden_keep_prob < 1.0:
net = slim.dropout(net, keep_prob=self._fc_hidden_keep_prob,
is_training=is_training)
# Connect last FC layer to embedding.
net = slim.layers.fully_connected(net, self._embedding_size,
activation_fn=None)
# Optionally L2 normalize the embedding.
if self._embedding_l2:
net = tf.nn.l2_normalize(net, dim=1)
return net
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Image preprocessing helpers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
from scipy import ndimage
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
def apply_with_random_selector(x, func, num_cases):
"""Computes func(x, sel), with sel sampled from [0...num_cases-1].
TODO(coreylynch): add as a dependency, when slim or tensorflow/models are
pipfied.
Source:
https://raw.githubusercontent.com/tensorflow/models/a9d0e6e8923a4/slim/preprocessing/inception_preprocessing.py
Args:
x: input Tensor.
func: Python function to apply.
num_cases: Python int32, number of cases to sample sel from.
Returns:
The result of func(x, sel), where func receives the value of the
selector as a python integer, but sel is sampled dynamically.
"""
sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
# Pass the real x only to one of the func calls.
return control_flow_ops.merge([
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
for case in range(num_cases)])[0]
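# Illustrative usage sketch, not part of the original file: select one of two
# distortions at graph-run time. The helper below is hypothetical and assumes
# TF 1.x graph mode, like the rest of this module.
def _example_apply_with_random_selector(image):
  """Randomly brightens or saturates a [0, 1] float image tensor."""
  return apply_with_random_selector(
      image,
      lambda x, case: (tf.image.random_brightness(x, max_delta=0.1)
                       if case == 0 else
                       tf.image.random_saturation(x, lower=0.5, upper=1.5)),
      num_cases=2)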
def distorted_bounding_box_crop(image,
bbox,
min_object_covered=0.1,
aspect_ratio_range=(0.75, 1.33),
area_range=(0.05, 1.0),
max_attempts=100,
scope=None):
"""Generates cropped_image using a one of the bboxes randomly distorted.
TODO(coreylynch): add as a dependency, when slim or tensorflow/models are
pipfied.
Source:
https://raw.githubusercontent.com/tensorflow/models/a9d0e6e8923a4/slim/preprocessing/inception_preprocessing.py
See `tf.image.sample_distorted_bounding_box` for more documentation.
Args:
image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole
image.
min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
area of the image must contain at least this fraction of any bounding box
supplied.
aspect_ratio_range: An optional list of `floats`. The cropped area of the
image must have an aspect ratio = width / height within this range.
area_range: An optional list of `floats`. The cropped area of the image
must contain a fraction of the supplied image within this range.
max_attempts: An optional `int`. Number of attempts at generating a cropped
region of the image of the specified constraints. After `max_attempts`
failures, return the entire image.
scope: Optional scope for name_scope.
Returns:
A tuple, a 3-D Tensor cropped_image and the distorted bbox
"""
with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bbox]):
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
# A large fraction of image datasets contain a human-annotated bounding
# box delineating the region of the image containing the object of interest.
# We choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.shape(image),
bounding_boxes=bbox,
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
# Crop the image to the specified bounding box.
cropped_image = tf.slice(image, bbox_begin, bbox_size)
return cropped_image, distort_bbox
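# Illustrative usage sketch, not part of the original file: randomly crop a
# near-square region covering at least half of the whole image. The helper
# below is hypothetical.
def _example_distorted_crop(image):
  """Returns a randomly cropped version of a [0, 1] float image tensor."""
  whole_image_bbox = tf.constant(
      [0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
  cropped, _ = distorted_bounding_box_crop(
      image, whole_image_bbox,
      min_object_covered=0.5,
      aspect_ratio_range=(0.9, 1.1),
      area_range=(0.5, 1.0))
  # The dynamic slice loses static shape information; restore the channels.
  cropped.set_shape([None, None, 3])
  return cropped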
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
"""Distort the color of a Tensor image.
TODO(coreylynch): add as a dependency, when slim or tensorflow/models are
pipfied.
Source:
https://raw.githubusercontent.com/tensorflow/models/a9d0e6e8923a4/slim/preprocessing/inception_preprocessing.py
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
Rather than adding that level of complication, we select a distinct ordering
of color ops for each preprocessing thread.
Args:
image: 3-D Tensor containing single image in [0, 1].
color_ordering: Python int, a type of distortion (valid values: 0-3).
fast_mode: Avoids slower ops (random_hue and random_contrast)
scope: Optional scope for name_scope.
Returns:
3-D Tensor color-distorted image on range [0, 1]
Raises:
ValueError: if color_ordering not in [0, 3]
"""
with tf.name_scope(scope, 'distort_color', [image]):
if fast_mode:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
elif color_ordering == 1:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
elif color_ordering == 2:
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
elif color_ordering == 3:
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
raise ValueError('color_ordering must be in [0, 3]')
# The random_* ops do not necessarily clamp.
return tf.clip_by_value(image, 0.0, 1.0)
def crop_center(image):
"""Returns a cropped square image."""
shape = tf.shape(image)
new_shape = tf.minimum(shape[0], shape[1])
offset_y = tf.maximum(shape[0] - shape[1], 0) // 2
offset_x = tf.maximum(shape[1] - shape[0], 0) // 2
image = tf.image.crop_to_bounding_box(
image, offset_y, offset_x, new_shape, new_shape)
return image
def pad(image):
"""Returns an image padded to be square."""
shape = tf.shape(image)
new_shape = tf.maximum(shape[0], shape[1])
height = shape[0]
width = shape[1]
offset_x = tf.maximum((height-width), 0) // 2
offset_y = tf.maximum((width-height), 0) // 2
image = tf.image.pad_to_bounding_box(
image, offset_y, offset_x, new_shape, new_shape)
return image
def pad_200(image):
"""Returns an image padded width-padded with 200 pixels."""
shape = tf.shape(image)
image = tf.image.pad_to_bounding_box(
image, 0, 200, shape[0], shape[1]+400)
shape = tf.shape(image)
new_shape = tf.minimum(shape[0], shape[1])
offset_y = tf.maximum(shape[0] - shape[1], 0) // 2
offset_x = tf.maximum(shape[1] - shape[0], 0) // 2
image = tf.image.crop_to_bounding_box(
image, offset_y, offset_x, new_shape, new_shape)
return image
def pad_crop_central(image, central_fraction=0.875):
"""Pads the image to the maximum length, crops the central fraction."""
# Pad the image to be square.
image = pad(image)
# Crop the central region of the image with an area containing 87.5% of
# the original image.
image = tf.image.central_crop(image, central_fraction=central_fraction)
return image
def crop_image_by_strategy(image, cropping):
"""Crops an image according to a strategy defined in config.
Args:
image: 3-d image tensor.
cropping: str, name of cropping strategy.
Returns:
image: cropped image.
Raises:
ValueError: When unknown cropping strategy is specified.
"""
strategy_to_method = {
'crop_center': crop_center,
'pad': pad,
'pad200': pad_200,
'pad_crop_central': pad_crop_central
}
tf.logging.info('Cropping strategy: %s.' % cropping)
if cropping not in strategy_to_method:
raise ValueError('Unknown cropping strategy: %s' % cropping)
return strategy_to_method[cropping](image)
def scale_augment_crop(image, central_bbox, area_range, min_object_covered):
"""Training time scale augmentation.
Args:
image: 3-d float tensor.
central_bbox: Bounding box defining the central region of interest.
area_range: Range of allowed areas for the augmented bounding box.
min_object_covered: Constraint for the fraction of original image in
augmented bounding box.
Returns:
distort_image: The scaled, cropped image.
"""
(distorted_image, _) = distorted_bounding_box_crop(
image, central_bbox, area_range=area_range,
aspect_ratio_range=(1.0, 1.0),
min_object_covered=min_object_covered)
# Restore the shape since the dynamic slice based upon the bbox_size loses
# the third dimension.
distorted_image.set_shape([None, None, 3])
return distorted_image
def scale_to_inception_range(image):
"""Scales an image in the range [0,1] to [-1,1] as expected by inception."""
# Assert that incoming images have been properly scaled to [0,1].
with tf.control_dependencies(
[tf.assert_less_equal(tf.reduce_max(image), 1.),
tf.assert_greater_equal(tf.reduce_min(image), 0.)]):
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.0)
return image
def resize_image(image, height, width):
"""Resizes an image to a target height and width."""
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image, [height, width], align_corners=False)
image = tf.squeeze(image, [0])
return image
def crop_or_pad(image, curr_height, curr_width, new, height=True, crop=True):
"""Crops or pads an image.
Args:
image: 3-D float32 `Tensor` image.
curr_height: Int, current height.
curr_width: Int, current width.
new: Int, new width or height.
height: Boolean, cropping or padding for height.
crop: Boolean, True if we're cropping, False if we're padding.
Returns:
image: 3-D float32 `Tensor` image.
"""
# Crop the image to fit the new shape.
abs_diff = tf.abs(new-curr_height)//2 if height else tf.abs(new-curr_width)//2
offset_x = 0 if height else abs_diff
offset_y = abs_diff if height else 0
# We process height first, so always pad/crop to new height.
target_height = new
# We process height first, so pad/crop to new width only if not doing height.
target_width = curr_width if height else new
if crop:
image = tf.image.crop_to_bounding_box(
image, offset_y, offset_x, target_height, target_width)
else:
image = tf.image.pad_to_bounding_box(
image, offset_y, offset_x, target_height, target_width)
return image
def get_central_bbox(min_side, new_size):
"""Gets the central bounding box for an image.
If image is square, returns bounding box [0,0,1,1].
Otherwise, returns the bounding box containing the central
smallest side x smallest side square.
Args:
min_side: Int, size of smallest side in pixels.
new_size: Int, resize image to a square of new_size x new_size pixels.
Returns:
bbox: A 3-D float `Tensor` of shape [1, 1, 4], holding the normalized
coordinates of the central bounding box.
"""
max_shape = tf.cast(new_size, tf.float32)
min_shape = tf.cast(min_side, tf.float32)
top_xy = ((max_shape-min_shape)/2)/max_shape
bottom_xy = (min_shape+(max_shape-min_shape)/2)/max_shape
# Create a bbox for the center region of interest.
bbox = tf.stack([[[top_xy, top_xy, bottom_xy, bottom_xy]]])
bbox.set_shape([1, 1, 4])
return bbox
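# Worked example as a sketch, not part of the original file, using
# hypothetical dimensions: a 1920x1080 frame whose smallest side is 1080,
# embedded in a 1320x1320 square. The central 1080x1080 box in normalized
# [ymin, xmin, ymax, xmax] coordinates is approximately
# [0.0909, 0.0909, 0.9091, 0.9091]:
#   top_xy    = (1320 - 1080) / 2 / 1320            ~= 0.0909
#   bottom_xy = (1080 + (1320 - 1080) / 2) / 1320   ~= 0.9091
# e.g. central_bbox = get_central_bbox(min_side=1080, new_size=1320)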
def pad_to_max(image, max_scale):
"""Pads an image to max_scale times the current center crop size.
E.g.: For an image with dimensions 1920x1080 and a max_scale of 1.5,
returns a square image whose area is 1.5 * (1080x1080).
Args:
image: 3-D float32 `Tensor` image.
max_scale: Float, maximum scale of the image, as a multiplier on the
central bounding box.
Returns:
image: 3-D float32 `Tensor` image.
"""
orig_shape = tf.shape(image)
orig_height = orig_shape[0]
orig_width = orig_shape[1]
# Find the smallest side and corresponding new size.
min_side = tf.cast(tf.minimum(orig_height, orig_width), tf.float32)
new_shape = tf.cast(tf.sqrt(max_scale*min_side*min_side), tf.int32)
# Crop or pad height.
# pylint: disable=g-long-lambda
image = tf.cond(
orig_height >= new_shape,
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=True, crop=True),
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=True, crop=False))
# Crop or pad width.
image = tf.cond(
orig_width >= new_shape,
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=False, crop=True),
lambda: crop_or_pad(
image, orig_height, orig_width, new_shape, height=False, crop=False))
# Get the bounding box of the original centered box in the new resized image.
original_bounding_box = get_central_bbox(min_side, new_shape)
return image, original_bounding_box
def scale_up_augmentation(image, max_scale):
"""Scales an image randomly >100% up to some max scale."""
# Pad to max size.
image, original_central_bbox = pad_to_max(image, max_scale)
# Determine area range of the augmented crop, as a percentage of the
# new max area.
# aug_max == 100% of new max area.
aug_max = 1.0
# aug_min == original_area/new_area == original_area/(max_scale*original_area)
# == 1/max_scale.
aug_min = 1.0/max_scale
area_range = (aug_min, aug_max)
# Since we're doing >100% scale, always have the full original crop in frame.
min_object_covered = 1.0
# Get a random scaled, cropped image.
image = scale_augment_crop(image, original_central_bbox, area_range,
min_object_covered)
return image
def scale_down_augmentation(image, min_scale):
"""Scales an image randomly <100% down to some min scale."""
# Crop the center, and consider the whole image the bounding box ROI.
image = crop_center(image)
bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
# Determine area range of the augmented crop, as a percentage of the
# original crop center area.
# aug_max == 100% of original area.
area_range = (min_scale, 1.0)
# Get a random scaled, cropped image.
image = scale_augment_crop(image, bbox, area_range, min_scale)
return image
def augment_image_scale(image, min_scale, max_scale, p_scale_up):
"""Training time scale augmentation.
Args:
image: 3-d float tensor representing image.
min_scale: minimum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
max_scale: maximum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
p_scale_up: Fraction of images scaled up.
Returns:
image: The scale-augmented image.
"""
assert max_scale >= 1.0
assert min_scale <= 1.0
if min_scale == max_scale == 1.0:
tf.logging.info('Min and max scale are 1.0, not augmenting.')
# Do no augmentation, just crop the center.
return crop_center(image)
elif (max_scale == 1.0) and (min_scale < 1.0):
tf.logging.info('Max scale is 1.0, only scale down augment.')
# Always do <100% augmentation.
return scale_down_augmentation(image, min_scale)
elif (min_scale == 1.0) and (max_scale > 1.0):
tf.logging.info('Min scale is 1.0, only scale up augment.')
# Always do >100% augmentation.
return scale_up_augmentation(image, max_scale)
else:
tf.logging.info('Sample both augmentations.')
# Choose to scale image up or down.
rn = tf.random_uniform([], minval=0., maxval=1., dtype=tf.float32)
image = tf.cond(rn >= p_scale_up,
lambda: scale_up_augmentation(image, max_scale),
lambda: scale_down_augmentation(image, min_scale))
return image
def decode_image(image_str):
"""Decodes a jpeg-encoded image string into a image in range [0,1]."""
# Decode jpeg string into np.uint8 tensor.
image = tf.image.decode_jpeg(image_str, channels=3)
# Convert the image to range [0,1].
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
return image
def decode_images(image_strs):
"""Decodes a tensor of image strings."""
return tf.map_fn(decode_image, image_strs, dtype=tf.float32)
def preprocess_training_images(images, height, width, min_scale, max_scale,
p_scale_up, aug_color=True, fast_mode=True):
"""Preprocesses a batch of images for training.
This applies training-time scale and color augmentation, crops/resizes,
and scales images to the [-1,1] range expected by pre-trained Inception nets.
Args:
images: A 4-D float32 `Tensor` holding raw images to be preprocessed.
height: Int, height in pixels to resize image to.
width: Int, width in pixels to resize image to.
min_scale: Float, minimum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
max_scale: Float, maximum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
p_scale_up: Float, fraction of images scaled up.
aug_color: Whether or not to do color augmentation.
fast_mode: Boolean, avoids slower ops (random_hue and random_contrast).
Returns:
preprocessed_images: A 4-D float32 `Tensor` holding preprocessed images.
"""
def _prepro_train(im):
"""Map this preprocessing function over each image in the batch."""
return preprocess_training_image(
im, height, width, min_scale, max_scale, p_scale_up,
aug_color=aug_color, fast_mode=fast_mode)
return tf.map_fn(_prepro_train, images)
def preprocess_training_image(
image, height, width, min_scale, max_scale, p_scale_up,
aug_color=True, fast_mode=True):
"""Preprocesses an image for training.
Args:
image: A 3-d float tensor representing the image.
height: Target image height.
width: Target image width.
min_scale: Minimum scale of bounding box (as a percentage of full
bounding box) used to crop image during scale augmentation.
max_scale: Maximum scale of bounding box (as a percentage of full
bounding box) used to crop image during scale augmentation.
p_scale_up: Fraction of images to scale >100%.
aug_color: Whether or not to do color augmentation.
fast_mode: Avoids slower ops (random_hue and random_contrast).
Returns:
scaled_image: A scaled image tensor in the range [-1,1].
"""
# Get a random scaled, cropped image.
image = augment_image_scale(image, min_scale, max_scale, p_scale_up)
# Resize image to desired height, width.
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image, [height, width], align_corners=False)
image = tf.squeeze(image, [0])
# Optionally augment the color.
# pylint: disable=g-long-lambda
if aug_color:
image = apply_with_random_selector(
image,
lambda x, ordering: distort_color(
x, ordering, fast_mode=fast_mode), num_cases=4)
# Scale to [-1,1] range as expected by inception.
scaled_image = scale_to_inception_range(image)
return scaled_image
def preprocess_test_image(image, height, width, crop_strategy):
"""Preprocesses an image for test/inference.
Args:
image: A 3-d float tensor representing the image.
height: Target image height.
width: Target image width.
crop_strategy: String, name of the strategy used to crop test-time images.
Can be: 'crop_center', 'pad', 'pad200', 'pad_crop_central'.
Returns:
scaled_image: A scaled image tensor in the range [-1,1].
"""
image = crop_image_by_strategy(image, crop_strategy)
# Resize.
image = resize_image(image, height, width)
# Scale the input range to [-1,1] as expected by inception.
image = scale_to_inception_range(image)
return image
def preprocess_test_images(images, height, width, crop_strategy):
"""Apply test-time preprocessing to a batch of images.
This crops images (given a named strategy for doing so), resizes them,
and scales them to the [-1,1] range expected by pre-trained Inception nets.
Args:
images: A 4-D float32 `Tensor` holding raw images to be preprocessed.
height: Int, height in pixels to resize image to.
width: Int, width in pixels to resize image to.
crop_strategy: String, name of the strategy used to crop test-time images.
Can be: 'crop_center', 'pad', 'pad200', 'pad_crop_central'.
Returns:
preprocessed_images: A 4-D float32 `Tensor` holding preprocessed images.
"""
def _prepro_test(im):
"""Map this preprocessing function over each image in the batch."""
return preprocess_test_image(im, height, width, crop_strategy)
if len(images.shape) == 3:
return _prepro_test(images)
else:
return tf.map_fn(_prepro_test, images)
def preprocess_images(
images, is_training, height, width,
min_scale=1.0, max_scale=1.0, p_scale_up=0.0,
aug_color=True, fast_mode=True,
crop_strategy='pad_crop_central'):
"""Preprocess a batch of images.
Args:
images: A 4-D float32 `Tensor` holding raw images to be preprocessed.
is_training: Boolean, whether to preprocess them for training or test.
height: Int, height in pixels to resize image to.
width: Int, width in pixels to resize image to.
min_scale: Float, minimum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
max_scale: Float, maximum scale augmentation allowed, as a fraction of the
central min_side * min_side area of the original image.
p_scale_up: Float, fraction of images scaled up.
aug_color: Whether or not to do color augmentation.
fast_mode: Boolean, avoids slower ops (random_hue and random_contrast).
crop_strategy: String, name of the strategy used to crop test-time images.
Can be: 'crop_center', 'pad', 'pad200', 'pad_crop_central'.
Returns:
preprocessed_images: A 4-D float32 `Tensor` holding preprocessed images.
"""
if is_training:
return preprocess_training_images(
images, height, width, min_scale, max_scale,
p_scale_up, aug_color, fast_mode)
else:
return preprocess_test_images(
images, height, width, crop_strategy)
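# Illustrative usage sketch, not part of the original file: preprocess a batch
# of raw [0, 1] images for training with mild scale and color augmentation.
# The tensor and parameter values below are hypothetical.
def _example_preprocess_training_batch(raw_images):
  """Returns a [batch, 299, 299, 3] float32 tensor scaled to [-1, 1]."""
  return preprocess_images(
      raw_images, is_training=True, height=299, width=299,
      min_scale=0.8, max_scale=1.2, p_scale_up=0.5,
      aug_color=True, fast_mode=True)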
def cv2rotateimage(image, angle):
"""Efficient rotation if 90 degrees rotations, slow otherwise.
Not a tensorflow function, using cv2 and scipy on numpy arrays.
Args:
image: a numpy array with shape [height, width, channels].
angle: the rotation angle in degrees in the range [-180, 180].
Returns:
The rotated image.
"""
# Limit angle to [-180, 180] degrees.
assert angle <= 180 and angle >= -180
if angle == 0:
return image
# Efficient rotations.
if angle == -90:
image = cv2.transpose(image)
image = cv2.flip(image, 0)
elif angle == 90:
image = cv2.transpose(image)
image = cv2.flip(image, 1)
elif angle == 180 or angle == -180:
image = cv2.flip(image, 0)
image = cv2.flip(image, 1)
else:  # Slow rotation by an arbitrary angle.
image = ndimage.interpolation.rotate(image, angle)
return image
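# Illustrative usage sketch, not part of the original file: rotate a dummy
# numpy image by 90 degrees via the fast transpose/flip path. The array below
# is hypothetical.
def _example_rotate():
  import numpy as np
  dummy = np.zeros((480, 640, 3), dtype=np.uint8)
  rotated = cv2rotateimage(dummy, 90)
  return rotated.shape  # (640, 480, 3)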
def cv2resizeminedge(image, min_edge_size):
"""Resize smallest edge of image to min_edge_size."""
assert min_edge_size >= 0
height, width = (image.shape[0], image.shape[1])
new_height, new_width = (0, 0)
if height > width:
new_width = min_edge_size
new_height = int(height * new_width / float(width))
else:
new_height = min_edge_size
new_width = int(width * new_height / float(height))
return cv2.resize(image, (new_width, new_height),
interpolation=cv2.INTER_AREA)
def shapestring(array):
"""Returns a compact string describing shape of an array."""
shape = array.shape
s = str(shape[0])
for i in range(1, len(shape)):
s += 'x' + str(shape[i])
return s
def unscale_jpeg_encode(ims):
"""Unscales pixel values and jpeg encodes preprocessed image.
Args:
ims: A 4-D float32 `Tensor` holding preprocessed images.
Returns:
im_strings: A 1-D string `Tensor` holding images that have been unscaled
(reversing the inception [-1,1] scaling), and jpeg encoded.
"""
ims /= 2.0
ims += 0.5
ims *= 255.0
ims = tf.clip_by_value(ims, 0, 255)
ims = tf.cast(ims, tf.uint8)
im_strings = tf.map_fn(
lambda x: tf.image.encode_jpeg(x, format='rgb', quality=100),
ims, dtype=tf.string)
return im_strings
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Trains TCN models (and baseline comparisons)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from estimators.get_estimator import get_estimator
from utils import util
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_string(
'config_paths', '',
"""
Path to YAML configuration files defining FLAG values. Multiple files
can be separated by the `#` symbol. Files are merged recursively. Setting
a key in these files is equivalent to setting the FLAG value with
the same name.
""")
tf.flags.DEFINE_string(
'model_params', '{}', 'YAML configuration string for the model parameters.')
tf.app.flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master to use')
tf.app.flags.DEFINE_string(
'logdir', '/tmp/tcn', 'Directory where to write event logs.')
tf.app.flags.DEFINE_integer(
'task', 0, 'Task id of the replica running the training.')
tf.app.flags.DEFINE_integer(
'ps_tasks', 0, 'Number of tasks in the ps job. If 0 no ps job is used.')
FLAGS = tf.app.flags.FLAGS
def main(_):
"""Runs main training loop."""
# Parse config dict from yaml config files / command line flags.
config = util.ParseConfigsToLuaTable(
FLAGS.config_paths, FLAGS.model_params, save=True, logdir=FLAGS.logdir)
# Choose an estimator based on training strategy.
estimator = get_estimator(config, FLAGS.logdir)
# Run training
estimator.train()
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=line-too-long,g-explicit-length-test
"""A convenience class replicating some lua table syntax with a python dict.
In general, should behave like a dictionary except that we can use dot notation
to access keys. Users should be careful to only provide keys suitable for
instance variable names.
Nota bene: do not use the key "keys" since it will collide with the method keys.
Usage example:
>>> t = T(a=5,b='kaw', c=T(v=[],x=33))
>>> t.a
5
>>> t.z = None
>>> print t
T(a=5, z=None, c=T(x=33, v=[]), b='kaw')
>>> t2 = T({'h':'f','x':4})
>>> t2
T(h='f', x=4)
>>> t2['x']
4
"""
class T(object):
"""Class for emulating lua tables."""
def __init__(self, *args, **kwargs):
if len(args) > 1 or (len(args) == 1 and len(kwargs) > 0):
errmsg = '''constructor only allows a single dict as a positional
argument or keyword arguments'''
raise ValueError(errmsg)
if len(args) == 1 and isinstance(args[0], dict):
self.__dict__.update(args[0])
else:
self.__dict__.update(kwargs)
def __repr__(self):
fmt = ', '.join('%s=%s' for i in range(len(self.__dict__)))
kwargstr = fmt % tuple(
x for tup in self.__dict__.items() for x in [str(tup[0]), repr(tup[1])])
return 'T(' + kwargstr + ')'
def __getitem__(self, key):
return self.__dict__[key]
def __setitem__(self, key, val):
self.__dict__[key] = val
def __delitem__(self, key):
del self.__dict__[key]
def __iter__(self):
return iter(self.__dict__)
def __len__(self):
return len(self.__dict__)
def keys(self): # Needed for dict(T( ... )) to work.
return self.__dict__.keys()
def iteritems(self):
return self.__dict__.iteritems()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A utility class for reporting processing progress."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
class Progress(object):
"""A utility class for reporting processing progress."""
def __init__(self, target_size):
self.target_size = target_size
self.current_size = 0
self.start_time = datetime.datetime.now()
def Update(self, current_size):
"""Replaces internal current_size with current_size."""
self.current_size = current_size
def Add(self, size):
"""Increments internal current_size by size."""
self.current_size += size
def __str__(self):
processed = 1e-5 + self.current_size / float(self.target_size)
current_time = datetime.datetime.now()
elapsed = current_time - self.start_time
eta = datetime.timedelta(
seconds=elapsed.total_seconds() / processed - elapsed.total_seconds())
return "%d / %d (elapsed %s eta %s)" % (
self.current_size, self.target_size,
str(elapsed).split(".")[0],
str(eta).split(".")[0])
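# Illustrative usage sketch, not part of the original file: report progress
# while looping over a hypothetical list of records.
def _example_progress(records):
  progress = Progress(len(records))
  for record in records:
    _ = record  # ... process the record here ...
    progress.Add(1)
    print(progress)  # e.g. "37 / 100 (elapsed 0:00:12 eta 0:00:20)"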
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""General utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import six
from utils.luatables import T
import tensorflow as tf
import yaml
from yaml.constructor import ConstructorError
# pylint: disable=invalid-name
def GetFilesRecursively(topdir):
"""Gets all records recursively for some topdir.
Args:
topdir: String, path to top directory.
Returns:
allpaths: List of Strings, full paths to all leaf records.
Raises:
ValueError: If there are no files found for this directory.
"""
assert topdir
topdir = os.path.expanduser(topdir)
allpaths = []
for path, _, leaffiles in tf.gfile.Walk(topdir):
if leaffiles:
allpaths.extend([os.path.join(path, i) for i in leaffiles])
if not allpaths:
raise ValueError('No files found for top directory %s' % topdir)
return allpaths
def NoDuplicatesConstructor(loader, node, deep=False):
"""Check for duplicate keys."""
mapping = {}
for key_node, value_node in node.value:
key = loader.construct_object(key_node, deep=deep)
value = loader.construct_object(value_node, deep=deep)
if key in mapping:
raise ConstructorError('while constructing a mapping', node.start_mark,
'found duplicate key (%s)' % key,
key_node.start_mark)
mapping[key] = value
return loader.construct_mapping(node, deep)
def WriteConfigAsYaml(config, logdir, filename):
"""Writes a config dict as yaml to logdir/experiment.yml."""
if not tf.gfile.Exists(logdir):
tf.gfile.MakeDirs(logdir)
config_filename = os.path.join(logdir, filename)
with tf.gfile.GFile(config_filename, 'w') as f:
f.write(yaml.dump(config))
tf.logging.info('wrote config to %s', config_filename)
def LoadConfigDict(config_paths, model_params):
"""Loads config dictionary from specified yaml files or command line yaml."""
# Ensure that no duplicate keys can be loaded (causing pain).
yaml.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
NoDuplicatesConstructor)
# Handle either ',' or '#' separated config lists, since borg will only
# accept '#'.
sep = ',' if ',' in config_paths else '#'
# Load flags from config file.
final_config = {}
if config_paths:
for config_path in config_paths.split(sep):
config_path = config_path.strip()
if not config_path:
continue
config_path = os.path.abspath(config_path)
tf.logging.info('Loading config from %s', config_path)
with tf.gfile.GFile(config_path.strip()) as config_file:
config_flags = yaml.load(config_file)
final_config = DeepMergeDict(final_config, config_flags)
if model_params:
model_params = MaybeLoadYaml(model_params)
final_config = DeepMergeDict(final_config, model_params)
tf.logging.info('Final Config:\n%s', yaml.dump(final_config))
return final_config
def MaybeLoadYaml(item):
"""Parses item if it's a string. If it's a dictionary it's returned as-is."""
if isinstance(item, six.string_types):
return yaml.load(item)
elif isinstance(item, dict):
return item
else:
raise ValueError('Got {}, expected YAML string or dict'.format(type(item)))
def DeepMergeDict(dict_x, dict_y, path=None):
"""Recursively merges dict_y into dict_x."""
if path is None: path = []
for key in dict_y:
if key in dict_x:
if isinstance(dict_x[key], dict) and isinstance(dict_y[key], dict):
DeepMergeDict(dict_x[key], dict_y[key], path + [str(key)])
elif dict_x[key] == dict_y[key]:
pass # same leaf value
else:
dict_x[key] = dict_y[key]
else:
dict_x[key] = dict_y[key]
return dict_x
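# Illustrative sketch, not part of the original file: later configs override
# earlier ones key by key, recursing into nested dicts. The dicts below are
# hypothetical.
def _example_deep_merge():
  base = {'data': {'height': 299, 'width': 299}, 'learning_rate': 0.001}
  override = {'data': {'height': 480}, 'logdir': '/tmp/tcn'}
  merged = DeepMergeDict(base, override)
  # merged == {'data': {'height': 480, 'width': 299},
  #            'learning_rate': 0.001, 'logdir': '/tmp/tcn'}
  return merged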
def ParseConfigsToLuaTable(config_paths, extra_model_params=None,
save=False, save_name='final_training_config.yml',
logdir=None):
"""Maps config_paths and extra_model_params to a Luatable-like object."""
# Parse config dict from yaml config files / command line flags.
config = LoadConfigDict(config_paths, extra_model_params)
if save:
WriteConfigAsYaml(config, logdir, save_name)
# Convert config dictionary to T object with dot notation.
config = RecursivelyConvertToLuatable(config)
return config
def SetNestedValue(d, keys, value):
"""Sets a value in a nested dictionary.
Example:
d = {}, keys = ['data','augmentation','minscale'], value = 1.0.
returns {'data': {'augmentation' : {'minscale': 1.0 }}}
Args:
d: A dictionary to set a nested value in.
keys: list of dict keys nesting left to right.
value: the nested value to set.
Returns:
None
"""
for key in keys[:-1]:
d = d.setdefault(key, {})
d[keys[-1]] = value
def RecursivelyConvertToLuatable(yaml_dict):
"""Converts a dictionary to a LuaTable-like T object."""
if isinstance(yaml_dict, dict):
yaml_dict = T(yaml_dict)
for key, item in yaml_dict.iteritems():
if isinstance(item, dict):
yaml_dict[key] = RecursivelyConvertToLuatable(item)
return yaml_dict
def KNNIds(query_vec, target_seq, k=1):
"""Gets the knn ids to the query vec from the target sequence."""
sorted_distances = KNNIdsWithDistances(query_vec, target_seq, k)
return [i[0] for i in sorted_distances]
def KNNIdsWithDistances(query_vec, target_seq, k=1):
"""Gets the knn ids to the query vec from the target sequence."""
if not isinstance(target_seq, np.ndarray):
target_seq = np.array(target_seq)
assert np.shape(query_vec) == np.shape(target_seq[0])
distances = [(i, np.linalg.norm(query_vec-target_vec)) for (
i, target_vec) in enumerate(target_seq)]
sorted_distances = sorted(distances, key=lambda x: x[1])
return sorted_distances[:k]
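# Illustrative sketch, not part of the original file: nearest neighbors of a
# 2-d query vector in a tiny hypothetical target sequence.
def _example_knn():
  query = np.array([0.0, 0.0])
  targets = [np.array([3.0, 4.0]), np.array([1.0, 0.0]), np.array([0.0, 2.0])]
  # Index 1 is closest (distance 1.0), then index 2 (distance 2.0).
  return KNNIdsWithDistances(query, targets, k=2)  # [(1, 1.0), (2, 2.0)]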
def CopyLocalConfigsToCNS(outdir, configs, gfs_user):
"""Copies experiment yaml config files to the job_logdir on /cns."""
assert configs
assert outdir
conf_files = configs.split(',')
for conf_file in conf_files:
copy_command = 'fileutil --gfs_user %s cp -f %s %s' % (
gfs_user, conf_file, outdir)
tf.logging.info(copy_command)
os.system(copy_command)
def pairwise_distances(feature, squared=True):
"""Computes the pairwise distance matrix in numpy.
Args:
feature: 2-D numpy array of size [number of data, feature dimension]
squared: Boolean. If true, output is the pairwise squared euclidean
distance matrix; else, output is the pairwise euclidean distance matrix.
Returns:
pdists: 2-D numpy array of size
[number of data, number of data].
"""
triu = np.triu_indices(feature.shape[0], 1)
upper_tri_pdists = np.linalg.norm(feature[triu[1]] - feature[triu[0]], axis=1)
if squared:
upper_tri_pdists **= 2.
num_data = feature.shape[0]
pdists = np.zeros((num_data, num_data))
pdists[np.triu_indices(num_data, 1)] = upper_tri_pdists
# Make symmetrical.
pdists = pdists + pdists.T - np.diag(
pdists.diagonal())
return pdists
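# Illustrative sketch, not part of the original file: squared pairwise
# distances for three hypothetical 2-d feature vectors.
def _example_pairwise_distances():
  feature = np.array([[0.0, 0.0],
                      [3.0, 4.0],
                      [0.0, 1.0]])
  # Returns a symmetric 3x3 matrix with zeros on the diagonal; e.g. the
  # squared distance between rows 0 and 1 is 25.0.
  return pairwise_distances(feature, squared=True)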
def is_tfrecord_input(inp):
"""Checks if input is a TFRecord or list of TFRecords."""
def _is_tfrecord(inp):
if not isinstance(inp, str):
return False
_, extension = os.path.splitext(inp)
return extension == '.tfrecord'
if isinstance(inp, str):
return _is_tfrecord(inp)
if isinstance(inp, list):
return all(map(_is_tfrecord, inp))
return False
def is_np_array(inp):
"""Checks if input is a numpy array or a list of numpy arrays."""
if isinstance(inp, np.ndarray):
return True
if isinstance(inp, list):
return all([isinstance(i, np.ndarray) for i in inp])
return False