Commit 68a18b70 authored by Toby Boyd's avatar Toby Boyd Committed by GitHub
Browse files

Merge pull request #1 from tensorflow/master

update to tensorflow/model master
parents bc70271a 2c4fea8d
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains different architectures for the different DSN parts.
We define here the modules that can be used in the different parts of the DSN
model.
- shared encoder (dsn_cropped_linemod, dann_xxxx)
- private encoder (default_encoder)
- decoder (large_decoder, gtsrb_decoder, small_decoder)
"""
import tensorflow as tf
#from models.domain_adaptation.domain_separation
import utils
# Short alias for TF-Slim; its layers and arg_scope are used by the model
# builders in this module.
slim = tf.contrib.slim
def default_batch_norm_params(is_training=False):
  """Builds the batch normalization settings shared by the DSN modules.

  Args:
    is_training: whether or not the model is being built for training.

  Returns:
    a dictionary mapping batch norm parameter names (strings) to values.
  """
  params = dict(
      decay=0.5,  # Decay of the moving averages.
      epsilon=0.001,  # Keeps the variance away from zero.
  )
  params['is_training'] = is_training
  return params
################################################################################
# PRIVATE ENCODERS
################################################################################
def default_encoder(images, code_size, batch_norm_params=None,
                    weight_decay=0.0):
  """Builds the private encoder that maps images to codes.

  The network is two 5x5 convolution + 2x2 max-pool stages followed by a fully
  connected code layer.

  Args:
    images: a tensor of size [batch_size, height, width, channels].
    code_size: the number of hidden units in the code layer.
    batch_norm_params: a dictionary that maps batch norm parameter names to
      values; it is handed to slim.batch_norm as the normalizer parameters.
    weight_decay: the value for the weight decay coefficient.

  Returns:
    end_points: a dictionary with the flattened convolutional features under
      'flatten' and the code under 'fc3'.
  """
  layer_defaults = dict(
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params)
  conv_defaults = dict(kernel_size=[5, 5], padding='SAME')

  with slim.arg_scope([slim.conv2d, slim.fully_connected], **layer_defaults):
    with slim.arg_scope([slim.conv2d], **conv_defaults):
      conv1 = slim.conv2d(images, 32, scope='conv1')
      pool1 = slim.max_pool2d(conv1, [2, 2], 2, scope='pool1')
      conv2 = slim.conv2d(pool1, 64, scope='conv2')
      pool2 = slim.max_pool2d(conv2, [2, 2], 2, scope='pool2')
      flat = slim.flatten(pool2)
      # The variable scope is 'fc1' but the end point key is 'fc3'; callers
      # look the code up under 'fc3'.
      code = slim.fully_connected(flat, code_size, scope='fc1')

  end_points = {}
  end_points['flatten'] = flat
  end_points['fc3'] = code
  return end_points
################################################################################
# DECODERS
################################################################################
def large_decoder(codes,
                  height,
                  width,
                  channels,
                  batch_norm_params=None,
                  weight_decay=0.0):
  """Decodes the codes to a fixed output size.

  The codes are projected to a 10x10x6 feature map, which is refined by 5x5
  convolutions interleaved with nearest-neighbor upsampling to 16x16, 32x32
  and finally [height, width].

  Args:
    codes: a tensor of size [batch_size, code_size].
    height: the height of the output images.
    width: the width of the output images.
    channels: the number of the output channels.
    batch_norm_params: a dictionary that maps batch norm parameter names to
      values.
    weight_decay: the value for the weight decay coefficient.

  Returns:
    recons: the reconstruction tensor of shape
      [batch_size, height, width, channels].
  """
  with slim.arg_scope(
      [slim.conv2d, slim.fully_connected],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    net = slim.fully_connected(codes, 600, scope='fc1')
    # Let reshape infer the batch dimension: reading it from the static shape
    # yields None when the batch size is only known at run time, which
    # tf.reshape rejects.
    net = tf.reshape(net, [-1, 10, 10, 6])

    net = slim.conv2d(net, 32, [5, 5], scope='conv1_1')
    net = tf.image.resize_nearest_neighbor(net, (16, 16))

    net = slim.conv2d(net, 32, [5, 5], scope='conv2_1')
    net = tf.image.resize_nearest_neighbor(net, (32, 32))

    net = slim.conv2d(net, 32, [5, 5], scope='conv3_2')
    output_size = [height, width]
    net = tf.image.resize_nearest_neighbor(net, output_size)

    # The output layer has no activation function.
    with slim.arg_scope([slim.conv2d], kernel_size=[3, 3]):
      net = slim.conv2d(net, channels, activation_fn=None, scope='conv4_1')

  return net
def gtsrb_decoder(codes,
                  height,
                  width,
                  channels,
                  batch_norm_params=None,
                  weight_decay=0.0):
  """Decodes the codes to a fixed output size. This decoder is specific to GTSRB.

  The 100-dimensional code is viewed directly as a 10x10x1 feature map (there
  is no fully connected layer), refined by 3x3 convolutions and upsampled with
  nearest-neighbor resizing to 20x20 and then to [height, width].

  Args:
    codes: a tensor of size [batch_size, 100].
    height: the height of the output images.
    width: the width of the output images.
    channels: the number of the output channels.
    batch_norm_params: a dictionary that maps batch norm parameter names to
      values.
    weight_decay: the value for the weight decay coefficient.

  Returns:
    recons: the reconstruction tensor of shape
      [batch_size, height, width, channels].

  Raises:
    ValueError: When the input code size is not 100.
  """
  # Unpacking keeps the requirement that `codes` is a rank-2 tensor; only the
  # code size has to be known statically.
  _, code_size = codes.get_shape().as_list()
  if code_size != 100:
    raise ValueError('The code size used as an input to the GTSRB decoder is '
                     'expected to be 100.')

  with slim.arg_scope(
      [slim.conv2d, slim.fully_connected],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    # Let reshape infer the batch dimension so that a batch size that is only
    # known at run time (static value None) does not break graph construction.
    net = tf.reshape(codes, [-1, 10, 10, 1])
    net = slim.conv2d(net, 32, [3, 3], scope='conv1_1')

    # First upsampling 20x20
    net = tf.image.resize_nearest_neighbor(net, [20, 20])
    net = slim.conv2d(net, 32, [3, 3], scope='conv2_1')

    # Final upsampling to the requested output size.
    output_size = [height, width]
    net = tf.image.resize_nearest_neighbor(net, output_size)

    with slim.arg_scope([slim.conv2d], kernel_size=[3, 3]):
      net = slim.conv2d(net, 16, scope='conv3_1')
      # The output layer has no activation function.
      net = slim.conv2d(net, channels, activation_fn=None, scope='conv3_2')

  return net
def small_decoder(codes,
                  height,
                  width,
                  channels,
                  batch_norm_params=None,
                  weight_decay=0.0):
  """Decodes the codes to a fixed output size.

  The codes are projected to a 10x10x3 feature map, passed through two 3x3
  convolutions, upsampled with nearest-neighbor resizing to [height, width]
  and passed through two more 3x3 convolutions.

  Args:
    codes: a tensor of size [batch_size, code_size].
    height: the height of the output images.
    width: the width of the output images.
    channels: the number of the output channels.
    batch_norm_params: a dictionary that maps batch norm parameter names to
      values.
    weight_decay: the value for the weight decay coefficient.

  Returns:
    recons: the reconstruction tensor of shape
      [batch_size, height, width, channels].
  """
  with slim.arg_scope(
      [slim.conv2d, slim.fully_connected],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    net = slim.fully_connected(codes, 300, scope='fc1')
    # Let reshape infer the batch dimension: reading it from the static shape
    # yields None when the batch size is only known at run time, which
    # tf.reshape rejects.
    net = tf.reshape(net, [-1, 10, 10, 3])

    net = slim.conv2d(net, 16, [3, 3], scope='conv1_1')
    net = slim.conv2d(net, 16, [3, 3], scope='conv1_2')

    output_size = [height, width]
    net = tf.image.resize_nearest_neighbor(net, output_size)

    with slim.arg_scope([slim.conv2d], kernel_size=[3, 3]):
      net = slim.conv2d(net, 16, scope='conv2_1')
      # The output layer has no activation function.
      net = slim.conv2d(net, channels, activation_fn=None, scope='conv2_2')

  return net
################################################################################
# SHARED ENCODERS
################################################################################
def dann_mnist(images,
               weight_decay=0.0,
               prefix='model',
               num_classes=10,
               **kwargs):
  """Builds the convolutional MNIST classifier.

  This is the MNIST architecture proposed in:
  Y. Ganin et al., Domain-Adversarial Training of Neural Networks (DANN),
  JMLR 2015

  Args:
    images: the MNIST digits, a tensor of size [batch_size, 28, 28, 1].
    weight_decay: the value for the weight decay coefficient.
    prefix: name of the model to use when prefixing tags (not used by this
      builder).
    num_classes: the number of output classes to use.
    **kwargs: Placeholder for keyword arguments used by other shared encoders.

  Returns:
    the output logits, a tensor of size [batch_size, num_classes].
    a dictionary with key/values the layer names and tensors.
  """
  layer_defaults = dict(
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu)

  with slim.arg_scope([slim.conv2d, slim.fully_connected], **layer_defaults):
    with slim.arg_scope([slim.conv2d], padding='SAME'):
      conv1 = slim.conv2d(images, 32, [5, 5], scope='conv1')
      pool1 = slim.max_pool2d(conv1, [2, 2], 2, scope='pool1')
      conv2 = slim.conv2d(pool1, 48, [5, 5], scope='conv2')
      pool2 = slim.max_pool2d(conv2, [2, 2], 2, scope='pool2')
      fc3 = slim.fully_connected(slim.flatten(pool2), 100, scope='fc3')
      fc4 = slim.fully_connected(slim.flatten(fc3), 100, scope='fc4')

  # NOTE(review): the nesting was reconstructed from a flattened listing;
  # confirm that the logits layer is meant to sit outside the arg_scope.
  logits = slim.fully_connected(
      fc4, num_classes, activation_fn=None, scope='fc5')

  end_points = {}
  end_points['conv1'] = conv1
  end_points['pool1'] = pool1
  end_points['conv2'] = conv2
  end_points['pool2'] = pool2
  end_points['fc3'] = fc3
  end_points['fc4'] = fc4
  return logits, end_points
def dann_svhn(images,
              weight_decay=0.0,
              prefix='model',
              num_classes=10,
              **kwargs):
  """Builds the convolutional SVHN classifier.

  The architecture follows:
  Y. Ganin et al., Domain-Adversarial Training of Neural Networks (DANN),
  JMLR 2015

  Args:
    images: the SVHN digits, a tensor of size [batch_size, 32, 32, 3].
    weight_decay: the value for the weight decay coefficient.
    prefix: name of the model to use when prefixing tags (not used by this
      builder).
    num_classes: the number of output classes to use.
    **kwargs: Placeholder for keyword arguments used by other shared encoders.

  Returns:
    the output logits, a tensor of size [batch_size, num_classes].
    a dictionary with key/values the layer names and tensors.
  """
  layer_defaults = dict(
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu)

  with slim.arg_scope([slim.conv2d, slim.fully_connected], **layer_defaults):
    with slim.arg_scope([slim.conv2d], padding='SAME'):
      conv1 = slim.conv2d(images, 64, [5, 5], scope='conv1')
      pool1 = slim.max_pool2d(conv1, [3, 3], 2, scope='pool1')
      conv2 = slim.conv2d(pool1, 64, [5, 5], scope='conv2')
      pool2 = slim.max_pool2d(conv2, [3, 3], 2, scope='pool2')
      conv3 = slim.conv2d(pool2, 128, [5, 5], scope='conv3')
      fc3 = slim.fully_connected(slim.flatten(conv3), 3072, scope='fc3')
      fc4 = slim.fully_connected(slim.flatten(fc3), 2048, scope='fc4')

  # NOTE(review): the nesting was reconstructed from a flattened listing;
  # confirm that the logits layer is meant to sit outside the arg_scope.
  logits = slim.fully_connected(
      fc4, num_classes, activation_fn=None, scope='fc5')

  end_points = {}
  end_points['conv1'] = conv1
  end_points['pool1'] = pool1
  end_points['conv2'] = conv2
  end_points['pool2'] = pool2
  end_points['conv3'] = conv3
  end_points['fc3'] = fc3
  end_points['fc4'] = fc4
  return logits, end_points
def dann_gtsrb(images,
               weight_decay=0.0,
               prefix='model',
               num_classes=43,
               **kwargs):
  """Builds the convolutional GTSRB classifier.

  The architecture follows:
  Y. Ganin et al., Domain-Adversarial Training of Neural Networks (DANN),
  JMLR 2015

  Args:
    images: the GTSRB images, a tensor of size [batch_size, 40, 40, 3].
    weight_decay: the value for the weight decay coefficient.
    prefix: name of the model to use when prefixing tags (not used by this
      builder).
    num_classes: the number of output classes to use.
    **kwargs: Placeholder for keyword arguments used by other shared encoders.

  Returns:
    the output logits, a tensor of size [batch_size, num_classes].
    a dictionary with key/values the layer names and tensors.
  """
  layer_defaults = dict(
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu)

  with slim.arg_scope([slim.conv2d, slim.fully_connected], **layer_defaults):
    with slim.arg_scope([slim.conv2d], padding='SAME'):
      conv1 = slim.conv2d(images, 96, [5, 5], scope='conv1')
      pool1 = slim.max_pool2d(conv1, [2, 2], 2, scope='pool1')
      conv2 = slim.conv2d(pool1, 144, [3, 3], scope='conv2')
      pool2 = slim.max_pool2d(conv2, [2, 2], 2, scope='pool2')
      conv3 = slim.conv2d(pool2, 256, [5, 5], scope='conv3')
      pool3 = slim.max_pool2d(conv3, [2, 2], 2, scope='pool3')
      fc3 = slim.fully_connected(slim.flatten(pool3), 512, scope='fc3')

  # NOTE(review): the nesting was reconstructed from a flattened listing;
  # confirm that the logits layer is meant to sit outside the arg_scope.
  logits = slim.fully_connected(
      fc3, num_classes, activation_fn=None, scope='fc4')

  end_points = {}
  end_points['conv1'] = conv1
  end_points['pool1'] = pool1
  end_points['conv2'] = conv2
  end_points['pool2'] = pool2
  end_points['conv3'] = conv3
  end_points['pool3'] = pool3
  end_points['fc3'] = fc3
  return logits, end_points
def dsn_cropped_linemod(images,
                        weight_decay=0.0,
                        prefix='model',
                        num_classes=11,
                        batch_norm_params=None,
                        is_training=False):
  """Builds the convolutional pose estimation model for Cropped Linemod.

  Besides the class logits, the network predicts a quaternion from the same
  features; the quaternion is l2-normalized along dimension 1.

  Args:
    images: the Cropped Linemod samples, a tensor of size
      [batch_size, 64, 64, 4].
    weight_decay: the value for the weight decay coefficient.
    prefix: name of the model; used to prefix the input image summary tag.
    num_classes: the number of output classes to use.
    batch_norm_params: a dictionary that maps batch norm parameter names to
      values. Batch normalization is only applied when this is non-empty.
    is_training: specifies whether or not we're currently training the model.
      This variable will determine the behaviour of the dropout layer.

  Returns:
    the output logits, a tensor of size [batch_size, num_classes].
    a dictionary with key/values the layer names and tensors; the predicted
      quaternion is stored under 'quaternion_pred'.
  """
  tf.summary.image('{}/input_images'.format(prefix), images)

  # Only normalize when the caller supplied batch norm parameters.
  normalizer = slim.batch_norm if batch_norm_params else None
  layer_defaults = dict(
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=tf.nn.relu,
      normalizer_fn=normalizer,
      normalizer_params=batch_norm_params)

  # NOTE(review): the nesting below was reconstructed from a flattened
  # listing; confirm which layers are meant to sit inside the arg_scopes.
  with slim.arg_scope([slim.conv2d, slim.fully_connected], **layer_defaults):
    with slim.arg_scope([slim.conv2d], padding='SAME'):
      conv1 = slim.conv2d(images, 32, [5, 5], scope='conv1')
      pool1 = slim.max_pool2d(conv1, [2, 2], 2, scope='pool1')
      conv2 = slim.conv2d(pool1, 64, [5, 5], scope='conv2')
      pool2 = slim.max_pool2d(conv2, [2, 2], 2, scope='pool2')
      flat = slim.flatten(pool2)
      fc3 = slim.fully_connected(flat, 128, scope='fc3')
      dropped = slim.dropout(fc3, 0.5, is_training=is_training,
                             scope='dropout')

      with tf.variable_scope('quaternion_prediction'):
        raw_quaternion = slim.fully_connected(
            dropped, 4, activation_fn=tf.nn.tanh)
        predicted_quaternion = tf.nn.l2_normalize(raw_quaternion, 1)

      logits = slim.fully_connected(
          dropped, num_classes, activation_fn=None, scope='fc4')

  end_points = {}
  end_points['conv1'] = conv1
  end_points['pool1'] = pool1
  end_points['conv2'] = conv2
  end_points['pool2'] = pool2
  end_points['fc3'] = fc3
  end_points['quaternion_pred'] = predicted_quaternion
  return logits, end_points
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for DSN components."""
import numpy as np
import tensorflow as tf
#from models.domain_adaptation.domain_separation
import models
class SharedEncodersTest(tf.test.TestCase):
  """Checks that every shared encoder builds and yields non-zero logits."""

  def _testSharedEncoder(self,
                         input_shape=(5, 28, 28, 1),
                         model=models.dann_mnist,
                         is_training=True):
    """Builds `model` on random images and returns the evaluated logits.

    Args:
      input_shape: the [batch, height, width, channels] shape of the random
        input. The default is an immutable tuple so that it cannot be shared
        and modified across calls.
      model: the shared encoder builder to exercise.
      is_training: not used by this helper; kept for existing callers.

    Returns:
      the logits as a numpy array.
    """
    images = tf.to_float(np.random.rand(*input_shape))

    with self.test_session() as sess:
      logits, _ = model(images)
      sess.run(tf.global_variables_initializer())
      logits_np = sess.run(logits)
    return logits_np

  def testBuildGRLMnistModel(self):
    logits = self._testSharedEncoder(model=getattr(models, 'dann_mnist'))
    self.assertEqual(logits.shape, (5, 10))
    self.assertTrue(np.any(logits))

  def testBuildGRLSvhnModel(self):
    logits = self._testSharedEncoder(model=getattr(models, 'dann_svhn'))
    self.assertEqual(logits.shape, (5, 10))
    self.assertTrue(np.any(logits))

  def testBuildGRLGtsrbModel(self):
    logits = self._testSharedEncoder([5, 40, 40, 3],
                                     getattr(models, 'dann_gtsrb'))
    self.assertEqual(logits.shape, (5, 43))
    self.assertTrue(np.any(logits))

  def testBuildPoseModel(self):
    logits = self._testSharedEncoder([5, 64, 64, 4],
                                     getattr(models, 'dsn_cropped_linemod'))
    self.assertEqual(logits.shape, (5, 11))
    self.assertTrue(np.any(logits))

  def testBuildPoseModelWithBatchNorm(self):
    images = tf.to_float(np.random.rand(10, 64, 64, 4))

    with self.test_session() as sess:
      logits, _ = getattr(models, 'dsn_cropped_linemod')(
          images, batch_norm_params=models.default_batch_norm_params(True))
      sess.run(tf.global_variables_initializer())
      logits_np = sess.run(logits)
    self.assertEqual(logits_np.shape, (10, 11))
    self.assertTrue(np.any(logits_np))
class EncoderTest(tf.test.TestCase):
  """Checks the private encoder with and without batch normalization."""

  def _testEncoder(self, batch_norm_params=None, channels=1):
    """Encodes random 28x28 images and validates the resulting code."""
    random_images = np.random.rand(10, 28, 28, channels)
    images = tf.to_float(random_images)

    with self.test_session() as sess:
      end_points = models.default_encoder(
          images, 128, batch_norm_params=batch_norm_params)
      sess.run(tf.global_variables_initializer())
      private_code = sess.run(end_points['fc3'])

    # One 128-dimensional, finite and not all-zero code per input image.
    self.assertEqual(private_code.shape, (10, 128))
    self.assertTrue(np.any(private_code))
    self.assertTrue(np.all(np.isfinite(private_code)))

  def testEncoder(self):
    self._testEncoder()

  def testEncoderMultiChannel(self):
    self._testEncoder(batch_norm_params=None, channels=4)

  def testEncoderIsTrainingBatchNorm(self):
    training_params = models.default_batch_norm_params(True)
    self._testEncoder(training_params)

  def testEncoderBatchNorm(self):
    inference_params = models.default_batch_norm_params(False)
    self._testEncoder(inference_params)
class DecoderTest(tf.test.TestCase):
  """Checks the output shape and values of the small, large and GTSRB decoders."""

  def _testDecoder(self,
                   height=64,
                   width=64,
                   channels=4,
                   batch_norm_params=None,
                   decoder=models.small_decoder):
    """Decodes random 100-dimensional codes and validates the reconstruction.

    Args:
      height: the requested output height.
      width: the requested output width.
      channels: the requested number of output channels.
      batch_norm_params: batch norm parameters forwarded to the decoder.
      decoder: the decoder builder to exercise.
    """
    codes = tf.to_float(np.random.rand(32, 100))

    with self.test_session() as sess:
      output = decoder(
          codes,
          height=height,
          width=width,
          channels=channels,
          batch_norm_params=batch_norm_params)
      sess.run(tf.global_variables_initializer())
      output_np = sess.run(output)
    self.assertEqual(output_np.shape, (32, height, width, channels))
    self.assertTrue(np.any(output_np))
    self.assertTrue(np.all(np.isfinite(output_np)))

  def testSmallDecoder(self):
    self._testDecoder(28, 28, 4, None, getattr(models, 'small_decoder'))

  def testSmallDecoderThreeChannels(self):
    self._testDecoder(28, 28, 3)

  def testSmallDecoderBatchNorm(self):
    self._testDecoder(28, 28, 4, models.default_batch_norm_params(False))

  def testSmallDecoderIsTrainingBatchNorm(self):
    self._testDecoder(28, 28, 4, models.default_batch_norm_params(True))

  def testLargeDecoder(self):
    self._testDecoder(32, 32, 4, None, getattr(models, 'large_decoder'))

  def testLargeDecoderThreeChannels(self):
    self._testDecoder(32, 32, 3, None, getattr(models, 'large_decoder'))

  def testLargeDecoderBatchNorm(self):
    self._testDecoder(32, 32, 4,
                      models.default_batch_norm_params(False),
                      getattr(models, 'large_decoder'))

  def testLargeDecoderIsTrainingBatchNorm(self):
    self._testDecoder(32, 32, 4,
                      models.default_batch_norm_params(True),
                      getattr(models, 'large_decoder'))

  def testGtsrbDecoder(self):
    # This test must exercise the GTSRB decoder; it previously built the
    # large decoder, leaving the GTSRB decoder without batch norm parameters
    # untested.
    self._testDecoder(40, 40, 3, None, getattr(models, 'gtsrb_decoder'))

  def testGtsrbDecoderBatchNorm(self):
    self._testDecoder(40, 40, 4,
                      models.default_batch_norm_params(False),
                      getattr(models, 'gtsrb_decoder'))

  def testGtsrbDecoderIsTrainingBatchNorm(self):
    self._testDecoder(40, 40, 4,
                      models.default_batch_norm_params(True),
                      getattr(models, 'gtsrb_decoder'))
# Run every test case in this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Auxiliary functions for domain adaptation related losses.
"""
import math
import tensorflow as tf
def create_summaries(end_points, prefix='', max_images=3, use_op_name=False):
  """Adds one tf summary for every endpoint.

  Rank-4 endpoints become image summaries: tensors whose last dimension is 1
  or 3 are shown as they are, any other channel count is first tiled into a
  single-channel grid by reshape_feature_maps. Rank-3 endpoints get a trailing
  channel dimension and are shown as images too. Rank-2 endpoints are
  summarized as histograms. Endpoints of any other rank are skipped.

  Args:
    end_points: a dictionary of name, tf tensor pairs.
    prefix: an optional string to prefix the summary with.
    max_images: the maximum number of images to display per summary.
    use_op_name: Use the op name as opposed to the shorter end_points key.
  """
  for key, tensor in end_points.items():
    tag = '{}/{}'.format(prefix, tensor.op.name if use_op_name else key)
    dims = tensor.get_shape().as_list()
    rank = len(dims)

    if rank == 2:
      tf.summary.histogram(tag, tensor)
      continue

    if rank == 4:
      # An actual image (grayscale or RGB) is displayed without reshaping.
      if dims[-1] in (1, 3):
        picture = tensor
      else:
        picture = reshape_feature_maps(tensor)
    elif rank == 3:
      picture = tf.expand_dims(tensor, 3)
    else:
      continue

    tf.summary.image(tag, picture, max_outputs=max_images)
def reshape_feature_maps(features_tensor):
  """Reshape activations for tf.summary.image visualization.

  The feature maps are tiled into a square grid: maps are placed side by side
  within a row and the rows are stacked on top of each other.

  Arguments:
    features_tensor: a rank-4 tensor of activations with a square number of
      feature maps in its last dimension, eg 4, 9, 16, etc.

  Returns:
    A rank-4, single-channel composite image with all the feature maps that
    can be passed as an argument to tf.summary.image.
  """
  assert len(features_tensor.get_shape().as_list()) == 4
  num_filters = features_tensor.get_shape().as_list()[-1]
  assert num_filters > 0
  num_filters_sqrt = math.sqrt(num_filters)
  assert num_filters_sqrt.is_integer(
  ), 'Number of filters should be a square number but got {}'.format(
      num_filters)
  num_filters_sqrt = int(num_filters_sqrt)

  # One rank-3 tensor per feature map.
  feature_maps = tf.unstack(features_tensor, axis=3)

  # Join the maps of each grid row along axis 2.
  grid_rows = [
      tf.concat(
          axis=2,
          values=feature_maps[row * num_filters_sqrt:
                              (row + 1) * num_filters_sqrt])
      for row in range(num_filters_sqrt)
  ]

  # Stack the rows along axis 1. No tf.squeeze is applied here: squeezing
  # would also drop a batch (or spatial) dimension of size 1, after which the
  # rows would be joined along the wrong axis and the result would no longer
  # be a rank-4 image batch.
  grid = tf.concat(axis=1, values=grid_rows)
  return tf.expand_dims(grid, -1)
def accuracy(predictions, labels):
  """Calculates the classification accuracy.

  Args:
    predictions: the predicted values, a tensor whose size matches 'labels'.
    labels: the ground truth values, a tensor of any size.

  Returns:
    a tensor that evaluates to the fraction of predictions equal to their
    label.
  """
  is_correct = tf.equal(predictions, labels)
  return tf.reduce_mean(tf.cast(is_correct, tf.float32))
def compute_upsample_values(input_tensor, upsample_height, upsample_width):
  """Compute values for an upsampling op (ops.BatchCropAndResize).

  Args:
    input_tensor: image tensor with shape [batch, height, width, in_channels].
      The batch, height and width must be statically known.
    upsample_height: integer
    upsample_width: integer

  Returns:
    grid_centers: float tensor with shape [batch, 2]; every row holds
      [height / 2, width / 2].
    crop_sizes: float tensor with shape [batch, 2]; every row holds
      [height, width].
    output_height: integer, height * upsample_height.
    output_width: integer, width * upsample_width.

  Raises:
    ValueError: if the batch size, height or width is not statically known.
  """
  dims = input_tensor.shape
  # A TensorShape unpacks into Dimension objects, for which true division and
  # list repetition are not reliably defined; work with plain ints instead.
  # Shapes that are already plain sequences of ints are used as they are.
  if hasattr(dims, 'as_list'):
    dims = dims.as_list()
  batch, input_height, input_width, _ = dims
  if batch is None or input_height is None or input_width is None:
    raise ValueError(
        'The batch size, height and width of input_tensor must be statically '
        'known.')

  height_half = input_height / 2.
  width_half = input_width / 2.
  grid_centers = tf.constant(batch * [[height_half, width_half]])
  crop_sizes = tf.constant(batch * [[input_height, input_width]])
  output_height = input_height * upsample_height
  output_width = input_width * upsample_width
  return grid_centers, tf.to_float(crop_sizes), output_height, output_width
def compute_pairwise_distances(x, y):
  """Computes the squared pairwise Euclidean distances between x and y.

  Args:
    x: a tensor of shape [num_x_samples, num_features]
    y: a tensor of shape [num_y_samples, num_features]

  Returns:
    a distance matrix of dimensions [num_y_samples, num_x_samples]; entry
    [j, i] is the squared distance between x[i] and y[j].

  Raises:
    ValueError: if the inputs do not match the specified dimensions.
  """
  x_shape = x.get_shape()
  y_shape = y.get_shape()
  x_rank = len(x_shape)
  y_rank = len(y_shape)
  if x_rank != 2 or y_rank != 2:
    raise ValueError('Both inputs should be matrices.')

  if x_shape.as_list()[1] != y_shape.as_list()[1]:
    raise ValueError('The number of features should be the same.')

  # x becomes [num_x_samples, num_features, 1] and the transposed y is
  # [num_features, num_y_samples]; broadcasting subtracts every row of y from
  # every row of x and gives [num_x_samples, num_features, num_y_samples].
  differences = tf.expand_dims(x, 2) - tf.transpose(y)

  # Summing the squares over the feature axis leaves
  # [num_x_samples, num_y_samples].
  squared_distances = tf.reduce_sum(tf.square(differences), 1)

  # The final transpose returns the matrix as [num_y_samples, num_x_samples].
  return tf.transpose(squared_distances)
def gaussian_kernel_matrix(x, y, sigmas):
  r"""Computes a Gaussian Radial Basis Kernel between the samples of x and y.

  The kernel is a sum of gaussian kernels, one per entry of `sigmas`: each
  pairwise squared distance d contributes sum_i exp(-d / (2 * sigmas[i])).

  Args:
    x: a tensor of shape [num_samples, num_features]
    y: a tensor of shape [num_samples, num_features]
    sigmas: a tensor of floats which denote the widths of each of the
      gaussians in the kernel.

  Returns:
    A tensor with the RBF kernel values, shaped like the matrix returned by
    compute_pairwise_distances(x, y).
  """
  # One scaling factor per sigma, as a column of shape [num_sigmas, 1].
  beta = 1. / (2. * (tf.expand_dims(sigmas, 1)))

  dist = compute_pairwise_distances(x, y)
  flat_dist = tf.reshape(dist, (1, -1))

  # Row i holds every pairwise distance scaled by beta[i].
  scaled = tf.matmul(beta, flat_dist)

  # Sum the individual kernels over the sigmas and restore the matrix layout.
  kernel_sum = tf.reduce_sum(tf.exp(-scaled), 0)
  return tf.reshape(kernel_sum, tf.shape(dist))
......@@ -12,9 +12,9 @@ Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan.
Full text available at: http://arxiv.org/abs/1609.06647
## Contact
***Author:*** Chris Shallue (shallue@google.com).
***Author:*** Chris Shallue
***Pull requests and issues:*** @cshallue.
***Pull requests and issues:*** @cshallue
## Contents
* [Model Overview](#model-overview)
......@@ -37,9 +37,7 @@ Full text available at: http://arxiv.org/abs/1609.06647
The *Show and Tell* model is a deep neural network that learns how to describe
the content of images. For example:
<center>
![Example captions](g3doc/example_captions.jpg)
</center>
### Architecture
......@@ -66,9 +64,7 @@ learned during training.
The following diagram illustrates the model architecture.
<center>
![Show and Tell Architecture](g3doc/show_and_tell_architecture.png)
</center>
In this diagram, \{*s*<sub>0</sub>, *s*<sub>1</sub>, ..., *s*<sub>*N*-1</sub>\}
are the words of the caption and \{*w*<sub>*e*</sub>*s*<sub>0</sub>,
......@@ -118,12 +114,12 @@ approximately 10 times slower.
### Install Required Packages
First ensure that you have installed the following required packages:
* **Bazel** ([instructions](http://bazel.io/docs/install.html)).
* **TensorFlow** r0.12 or greater ([instructions](https://www.tensorflow.org/versions/master/get_started/os_setup.html)).
* **NumPy** ([instructions](http://www.scipy.org/install.html)).
* **Bazel** ([instructions](http://bazel.io/docs/install.html))
* **TensorFlow** 1.0 or greater ([instructions](https://www.tensorflow.org/install/))
* **NumPy** ([instructions](http://www.scipy.org/install.html))
* **Natural Language Toolkit (NLTK)**:
* First install NLTK ([instructions](http://www.nltk.org/install.html)).
* Then install the NLTK data ([instructions](http://www.nltk.org/data.html)).
* First install NLTK ([instructions](http://www.nltk.org/install.html))
* Then install the NLTK data ([instructions](http://www.nltk.org/data.html))
### Prepare the Training Data
......@@ -137,8 +133,7 @@ Each caption is a list of words. During preprocessing, a dictionary is created
that assigns each word in the vocabulary to an integer-valued id. Each caption
is encoded as a list of integer word ids in the `tf.SequenceExample` protos.
We have provided a script to download and preprocess the [MSCOCO]
(http://mscoco.org/) image captioning data set into this format. Downloading
We have provided a script to download and preprocess the [MSCOCO](http://mscoco.org/) image captioning data set into this format. Downloading
and preprocessing the data may take several hours depending on your network and
computer speed. Please be patient.
......@@ -150,7 +145,8 @@ available space for storing the downloaded and processed data.
MSCOCO_DIR="${HOME}/im2txt/data/mscoco"
# Build the preprocessing script.
bazel build im2txt/download_and_preprocess_mscoco
cd tensorflow-models/im2txt
bazel build //im2txt:download_and_preprocess_mscoco
# Run the preprocessing script.
bazel-bin/im2txt/download_and_preprocess_mscoco "${MSCOCO_DIR}"
......@@ -216,7 +212,8 @@ INCEPTION_CHECKPOINT="${HOME}/im2txt/data/inception_v3.ckpt"
MODEL_DIR="${HOME}/im2txt/model"
# Build the model.
bazel build -c opt im2txt/...
cd tensorflow-models/im2txt
bazel build -c opt //im2txt/...
# Run the training script.
bazel-bin/im2txt/train \
......@@ -266,8 +263,7 @@ tensorboard --logdir="${MODEL_DIR}"
### Fine Tune the Inception v3 Model
Your model will already be able to generate reasonable captions after the first
phase of training. Try it out! (See [Generating Captions]
(#generating-captions)).
phase of training. Try it out! (See [Generating Captions](#generating-captions)).
You can further improve the performance of the model by running a
second training phase to jointly fine-tune the parameters of the *Inception v3*
......@@ -296,8 +292,12 @@ Your trained *Show and Tell* model can generate captions for any JPEG image! The
following command line will generate captions for an image from the test set.
```shell
# Directory containing model checkpoints.
CHECKPOINT_DIR="${HOME}/im2txt/model/train"
# Path to checkpoint file or a directory containing checkpoint files. Passing
# a directory will only work if there is also a file named 'checkpoint' which
# lists the available checkpoints in the directory. It will not work if you
# point to a directory with just a copy of a model checkpoint: in that case,
# you will need to pass the checkpoint path explicitly.
CHECKPOINT_PATH="${HOME}/im2txt/model/train"
# Vocabulary file generated by the preprocessing script.
VOCAB_FILE="${HOME}/im2txt/data/mscoco/word_counts.txt"
......@@ -306,7 +306,8 @@ VOCAB_FILE="${HOME}/im2txt/data/mscoco/word_counts.txt"
IMAGE_FILE="${HOME}/im2txt/data/mscoco/raw-data/val2014/COCO_val2014_000000224477.jpg"
# Build the inference binary.
bazel build -c opt im2txt/run_inference
cd tensorflow-models/im2txt
bazel build -c opt //im2txt:run_inference
# Ignore GPU devices (only necessary if your GPU is currently memory
# constrained, for example, by running the training script).
......@@ -314,7 +315,7 @@ export CUDA_VISIBLE_DEVICES=""
# Run inference to generate captions.
bazel-bin/im2txt/run_inference \
--checkpoint_path=${CHECKPOINT_DIR} \
--checkpoint_path=${CHECKPOINT_PATH} \
--vocab_file=${VOCAB_FILE} \
--input_files=${IMAGE_FILE}
```
......@@ -333,6 +334,4 @@ expected.
Here is the image:
<center>
![Surfer](g3doc/COCO_val2014_000000224477.jpg)
</center>
......@@ -424,7 +424,7 @@ def _load_and_process_metadata(captions_file, image_dir):
(len(id_to_filename), captions_file))
# Process the captions and combine the data into a list of ImageMetadata.
print("Proccessing captions.")
print("Processing captions.")
image_metadata = []
num_captions = 0
for image_id, base_filename in id_to_filename:
......
......@@ -62,7 +62,7 @@ def evaluate_model(sess, model, global_step, summary_writer, summary_op):
sess: Session object.
model: Instance of ShowAndTellModel; the model to evaluate.
global_step: Integer; global step of the model checkpoint.
summary_writer: Instance of SummaryWriter.
summary_writer: Instance of FileWriter.
summary_op: Op for generating model summaries.
"""
# Log model summaries on a single batch.
......@@ -91,7 +91,7 @@ def evaluate_model(sess, model, global_step, summary_writer, summary_op):
perplexity = math.exp(sum_losses / sum_weights)
tf.logging.info("Perplexity = %f (%.2g sec)", perplexity, eval_time)
# Log perplexity to the SummaryWriter.
# Log perplexity to the FileWriter.
summary = tf.Summary()
value = summary.value.add()
value.simple_value = perplexity
......@@ -110,7 +110,7 @@ def run_once(model, saver, summary_writer, summary_op):
Args:
model: Instance of ShowAndTellModel; the model to evaluate.
saver: Instance of tf.train.Saver for restoring model Variables.
summary_writer: Instance of SummaryWriter.
summary_writer: Instance of FileWriter.
summary_op: Op for generating model summaries.
"""
model_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
......@@ -171,8 +171,8 @@ def run():
saver = tf.train.Saver()
# Create the summary operation and the summary writer.
summary_op = tf.merge_all_summaries()
summary_writer = tf.train.SummaryWriter(eval_dir)
summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(eval_dir)
g.finalize()
......
......@@ -39,6 +39,8 @@ tf.flags.DEFINE_string("input_files", "",
"File pattern or comma-separated list of file patterns "
"of image files.")
tf.logging.set_verbosity(tf.logging.INFO)
def main(_):
# Build the inference graph.
......
......@@ -264,7 +264,7 @@ class ShowAndTellModel(object):
if self.mode == "inference":
# In inference mode, use concatenated states for convenient feeding and
# fetching.
tf.concat(initial_state, 1, name="initial_state")
tf.concat(axis=1, values=initial_state, name="initial_state")
# Placeholder for feeding a batch of concatenated states.
state_feed = tf.placeholder(dtype=tf.float32,
......@@ -274,11 +274,11 @@ class ShowAndTellModel(object):
# Run a single LSTM step.
lstm_outputs, state_tuple = lstm_cell(
inputs=tf.squeeze(self.seq_embeddings, squeeze_dims=[1]),
inputs=tf.squeeze(self.seq_embeddings, axis=[1]),
state=state_tuple)
# Concatenate the resulting state.
tf.concat(state_tuple, 1, name="state")
tf.concat(axis=1, values=state_tuple, name="state")
else:
# Run the batch of sequence embeddings through the LSTM.
sequence_length = tf.reduce_sum(self.input_mask, 1)
......
......@@ -18,12 +18,15 @@ evaluation with a computational cost of 5 billion multiply-adds per inference
and with using less than 25 million parameters. Below is a visualization of the
model architecture.
<center>
![Inception-v3 Architecture](g3doc/inception_v3_architecture.png)
</center>
## Description of Code
**NOTE**: For the most part, you will find a newer version of this code at [models/slim](https://github.com/tensorflow/models/tree/master/slim). In particular:
* `inception_train.py` and `imagenet_train.py` should no longer be used. The slim editions for running on multiple GPUs are the current best examples.
* `inception_distributed_train.py` and `imagenet_distributed_train.py` are still valid examples of distributed training.
The code base provides three core binaries for:
* Training an Inception v3 network from scratch across multiple GPUs and/or
......@@ -34,13 +37,12 @@ The code base provides three core binaries for:
errors to fine tune the network weights.
The training procedure employs synchronous stochastic gradient descent across
multiple GPUs. The user may specify the number of GPUs they wish harness. The
multiple GPUs. The user may specify the number of GPUs they wish to harness. The
synchronous training performs *batch-splitting* by dividing a given batch across
multiple GPUs.
The training set up is nearly identical to the section [Training a Model Using
Multiple GPU Cards]
(https://www.tensorflow.org/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards)
Multiple GPU Cards](https://www.tensorflow.org/tutorials/deep_cnn/index.html#launching_and_training_the_model_on_multiple_gpu_cards)
where we have substituted the CIFAR-10 model architecture with Inception v3. The
primary differences with that setup are:
......@@ -49,18 +51,12 @@ primary differences with that setup are:
* Specify the model architecture using a (still experimental) higher level
language called TensorFlow-Slim.
For more details about TensorFlow-Slim, please see the [Slim README]
(inception/slim/README.md). Please note that this higher-level language is still
For more details about TensorFlow-Slim, please see the [Slim README](inception/slim/README.md). Please note that this higher-level language is still
*experimental* and the API may change over time depending on usage and
subsequent research.
## Getting Started
**NOTE** Before doing anything, we first need to build TensorFlow from source,
and installed as a PIP package. Please follow the instructions at [Installing
From Source]
(https://www.tensorflow.org/get_started/os_setup.html#create-the-pip-package-and-install).
Before you run the training script for the first time, you will need to download
and convert the ImageNet data to native TFRecord format. The TFRecord format
consists of a set of sharded files where each entry is a serialized `tf.Example`
......@@ -73,8 +69,7 @@ downloading and converting ImageNet data to TFRecord format. Downloading and
preprocessing the data may take several hours (up to half a day) depending on
your network and computer speed. Please be patient.
To begin, you will need to sign up for an account with [ImageNet]
(http://image-net.org) to gain access to the data. Look for the sign up page,
To begin, you will need to sign up for an account with [ImageNet](http://image-net.org) to gain access to the data. Look for the sign up page,
create an account and request an access key to download the data.
After you have `USERNAME` and `PASSWORD`, you are ready to run our script. Make
......@@ -91,7 +86,8 @@ you will not need to interact with the script again.
DATA_DIR=$HOME/imagenet-data
# build the preprocessing script.
bazel build inception/download_and_preprocess_imagenet
cd tensorflow-models/inception
bazel build //inception:download_and_preprocess_imagenet
# run it
bazel-bin/inception/download_and_preprocess_imagenet "${DATA_DIR}"
......@@ -103,9 +99,9 @@ The final line of the output script should read:
2016-02-17 14:30:17.287989: Finished writing all 1281167 images in data set.
```
When the script finishes you will find 1024 and 128 training and validation
files in the `DATA_DIR`. The files will match the patterns `train-????-of-1024`
and `validation-?????-of-00128`, respectively.
When the script finishes, you will find 1024 training files and 128 validation
files in the `DATA_DIR`. The files will match the patterns
`train-?????-of-01024` and `validation-?????-of-00128`, respectively.
[Congratulations!](https://www.youtube.com/watch?v=9bZkp7q19f0) You are now
ready to train or evaluate with the ImageNet data set.
......@@ -116,15 +112,12 @@ ready to train or evaluate with the ImageNet data set.
intensive task and depending on your compute setup may take several days or even
weeks.
*Before proceeding* please read the [Convolutional Neural Networks]
(https://www.tensorflow.org/tutorials/deep_cnn/index.html) tutorial in
particular focus on [Training a Model Using Multiple GPU Cards]
(https://www.tensorflow.org/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards)
. The model training method is nearly identical to that described in the
*Before proceeding* please read the [Convolutional Neural Networks](https://www.tensorflow.org/tutorials/deep_cnn/index.html) tutorial; in
particular, focus on [Training a Model Using Multiple GPU Cards](https://www.tensorflow.org/tutorials/deep_cnn/index.html#launching_and_training_the_model_on_multiple_gpu_cards). The model training method is nearly identical to that described in the
CIFAR-10 multi-GPU model training. Briefly, the model training
* Places an individual model replica on each GPU. Split the batch across the
GPUs.
* Places an individual model replica on each GPU.
* Splits the batch across the GPUs.
* Updates model parameters synchronously by waiting for all GPUs to finish
processing a batch of data.
......@@ -161,7 +154,8 @@ To train this model, you simply need to specify the following:
```shell
# Build the model. Note that we need to make sure the TensorFlow is ready to
# use before this as this command will not build TensorFlow.
bazel build inception/imagenet_train
cd tensorflow-models/inception
bazel build //inception:imagenet_train
# run it
bazel-bin/inception/imagenet_train --num_gpus=1 --batch_size=32 --train_dir=/tmp/imagenet_train --data_dir=/tmp/imagenet_data
......@@ -197,7 +191,8 @@ GPU cards.
```shell
# Build the model. Note that we need to make sure the TensorFlow is ready to
# use before this as this command will not build TensorFlow.
bazel build inception/imagenet_train
cd tensorflow-models/inception
bazel build //inception:imagenet_train
# run it
bazel-bin/inception/imagenet_train --num_gpus=2 --batch_size=64 --train_dir=/tmp/imagenet_train
......@@ -250,11 +245,9 @@ We term each machine that maintains model parameters a `ps`, short for
`ps` as the model parameters may be sharded across multiple machines.
Variables may be updated with synchronous or asynchronous gradient updates. One
may construct a an [`Optimizer`]
(https://www.tensorflow.org/api_docs/python/train.html#optimizers) in TensorFlow
that constructs the necessary graph for either case diagrammed below from
TensorFlow [Whitepaper]
(http://download.tensorflow.org/paper/whitepaper2015.pdf):
may construct an [`Optimizer`](https://www.tensorflow.org/api_docs/python/train.html#optimizers) in TensorFlow
that constructs the necessary graph for either case diagrammed below from the
TensorFlow [Whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf):
<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%"
......@@ -270,7 +263,7 @@ Note that in this example each replica has a single tower that uses one GPU.
The command-line flags `worker_hosts` and `ps_hosts` specify available servers.
The same binary will be used for both the `worker` jobs and the `ps` jobs.
Command line flag `job_name` will be used to specify what role a task will be
playing and `task_id` will be used to idenify which one of the jobs it is
playing and `task_id` will be used to identify which one of the jobs it is
running. Several things to note here:
* The numbers of `ps` and `worker` tasks are inferred from the lists of hosts
......@@ -298,7 +291,8 @@ running. Several things to note here:
```shell
# Build the model. Note that we need to make sure the TensorFlow is ready to
# use before this as this command will not build TensorFlow.
bazel build inception/imagenet_distributed_train
cd tensorflow-models/inception
bazel build //inception:imagenet_distributed_train
# To start worker 0, go to the worker0 host and run the following (Note that
# task_id should be in the range [0, num_worker_tasks):
......@@ -377,6 +371,13 @@ I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:206] Initialize HostPo
I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:202] Started server with target: grpc://localhost:2222
```
If you compiled TensorFlow (from v1.1-rc3) with VERBS support and you have the
required device and IB verbs SW stack, you can specify `--protocol='grpc+verbs'`
in order to use Verbs RDMA for Tensor passing between workers and ps.
You need to add the `--protocol` flag in all tasks (ps and workers).
The default protocol is the TensorFlow default protocol of grpc.
[Congratulations!](https://www.youtube.com/watch?v=9bZkp7q19f0) You are now
training Inception in a distributed manner.
......@@ -385,10 +386,8 @@ training Inception in a distributed manner.
Evaluating an Inception v3 model on the ImageNet 2012 validation data set
requires running a separate binary.
The evaluation procedure is nearly identical to [Evaluating a Model]
(https://www.tensorflow.org/tutorials/deep_cnn/index.html#evaluating-a-model)
described in the [Convolutional Neural Network]
(https://www.tensorflow.org/tutorials/deep_cnn/index.html) tutorial.
The evaluation procedure is nearly identical to [Evaluating a Model](https://www.tensorflow.org/tutorials/deep_cnn/index.html#evaluating_a_model)
described in the [Convolutional Neural Network](https://www.tensorflow.org/tutorials/deep_cnn/index.html) tutorial.
**WARNING** Be careful not to run the evaluation and training binary on the same
GPU or else you might run out of memory. Consider running the evaluation on a
......@@ -400,7 +399,8 @@ Briefly, one can evaluate the model by running:
```shell
# Build the model. Note that we need to make sure the TensorFlow is ready to
# use before this as this command will not build TensorFlow.
bazel build inception/imagenet_eval
cd tensorflow-models/inception
bazel build //inception:imagenet_eval
# run it
bazel-bin/inception/imagenet_eval --checkpoint_dir=/tmp/imagenet_train --eval_dir=/tmp/imagenet_eval
......@@ -443,20 +443,20 @@ daisy, dandelion, roses, sunflowers, tulips
There is a single automated script that downloads the data set and converts it
to the TFRecord format. Much like the ImageNet data set, each record in the
TFRecord format is a serialized `tf.Example` proto whose entries include a
JPEG-encoded string and an integer label. Please see [`parse_example_proto`]
(inception/image_processing.py) for details.
JPEG-encoded string and an integer label. Please see [`parse_example_proto`](inception/image_processing.py) for details.
The script just takes a few minutes to run depending on your network connection
speed for downloading and processing the images. Your hard disk requires 200MB
of free storage. Here we select `DATA_DIR=$HOME/flowers-data` as such a location
of free storage. Here we select `DATA_DIR=/tmp/flowers-data/` as such a location
but feel free to edit accordingly.
```shell
# location of where to place the flowers data
FLOWERS_DATA_DIR=$HOME/flowers-data
FLOWERS_DATA_DIR=/tmp/flowers-data/
# build the preprocessing script.
bazel build inception/download_and_preprocess_flowers
cd tensorflow-models/inception
bazel build //inception:download_and_preprocess_flowers
# run it
bazel-bin/inception/download_and_preprocess_flowers "${FLOWERS_DATA_DIR}"
......@@ -470,26 +470,25 @@ look like:
```
When the script finishes you will find 2 shards for the training and validation
files in the `DATA_DIR`. The files will match the patterns `train-????-of-00001`
and `validation-?????-of-00001`, respectively.
files in the `DATA_DIR`. The files will match the patterns `train-?????-of-00002`
and `validation-?????-of-00002`, respectively.
**NOTE** If you wish to prepare a custom image data set for transfer learning,
you will need to invoke [`build_image_data.py`](inception/data/build_image_data.py) on
your custom data set. Please see the associated options and assumptions behind
this script by reading the comments section of [`build_image_data.py`]
(inception/data/build_image_data.py). Also, if your custom data has a different
this script by reading the comments section of [`build_image_data.py`](inception/data/build_image_data.py). Also, if your custom data has a different
number of examples or classes, you need to change the appropriate values in
[`imagenet_data.py`](inception/imagenet_data.py).
The second piece you will need is a trained Inception v3 image model. You have
the option of either training one yourself (See [How to Train from Scratch]
(#how-to-train-from-scratch) for details) or you can download a pre-trained
the option of either training one yourself (See [How to Train from Scratch](#how-to-train-from-scratch) for details) or you can download a pre-trained
model like so:
```shell
# location of where to place the Inception v3 model
DATA_DIR=$HOME/inception-v3-model
cd ${DATA_DIR}
INCEPTION_MODEL_DIR=$HOME/inception-v3-model
mkdir -p ${INCEPTION_MODEL_DIR}
cd ${INCEPTION_MODEL_DIR}
# download the Inception v3 model
curl -O http://download.tensorflow.org/models/image/imagenet/inception-v3-2016-03-01.tar.gz
......@@ -537,10 +536,11 @@ the flowers data set with the following command.
```shell
# Build the model. Note that we need to make sure the TensorFlow is ready to
# use before this as this command will not build TensorFlow.
bazel build inception/flowers_train
cd tensorflow-models/inception
bazel build //inception:flowers_train
# Path to the downloaded Inception-v3 model.
MODEL_PATH="${INCEPTION_MODEL_DIR}/model.ckpt-157585"
MODEL_PATH="${INCEPTION_MODEL_DIR}/inception-v3/model.ckpt-157585"
# Directory where the flowers data resides.
FLOWERS_DATA_DIR=/tmp/flowers-data/
......@@ -573,7 +573,8 @@ fine-tuned model, you will need to run `flowers_eval`:
```shell
# Build the model. Note that we need to make sure the TensorFlow is ready to
# use before this as this command will not build TensorFlow.
bazel build inception/flowers_eval
cd tensorflow-models/inception
bazel build //inception:flowers_eval
# Directory where we saved the fine-tuned checkpoint and events files.
TRAIN_DIR=/tmp/flowers_train/
......@@ -599,7 +600,7 @@ We find that the evaluation arrives at roughly 93.4% precision@1 after the model
has been running for 2000 steps.
```shell
Succesfully loaded model from /tmp/flowers/model.ckpt-1999 at step=1999.
Successfully loaded model from /tmp/flowers/model.ckpt-1999 at step=1999.
2016-03-01 16:52:51.761219: starting evaluation on (validation).
2016-03-01 16:53:05.450419: [20 batches out of 20] (36.5 examples/sec; 0.684sec/batch)
2016-03-01 16:53:05.450471: precision @ 1 = 0.9340 recall @ 5 = 0.9960 [500 examples]
......@@ -634,6 +635,15 @@ reside within `$TRAIN_DIR` and `$VALIDATION_DIR` arranged as such:
$VALIDATION_DIR/cat/cat.JPG
...
```
**NOTE**: This script will append an extra background class indexed at 0, so
your class labels will range from 0 to num_labels. Using the example above, the
corresponding class labels generated from `build_image_data.py` will be as
follows:
```shell
0
1 dog
2 cat
```
Each sub-directory in `$TRAIN_DIR` and `$VALIDATION_DIR` corresponds to a unique
label for the images that reside within that sub-directory. The images may be
......@@ -652,7 +662,8 @@ To run `build_image_data.py`, you can run the following command line:
OUTPUT_DIRECTORY=$HOME/my-custom-data/
# build the preprocessing script.
bazel build inception/build_image_data
cd tensorflow-models/inception
bazel build //inception:build_image_data
# convert the data.
bazel-bin/inception/build_image_data \
......@@ -686,26 +697,26 @@ class.
Running this script produces files that look like the following:
```shell
$TRAIN_DIR/train-00000-of-00024
$TRAIN_DIR/train-00001-of-00024
$TRAIN_DIR/train-00000-of-00128
$TRAIN_DIR/train-00001-of-00128
...
$TRAIN_DIR/train-00023-of-00024
$TRAIN_DIR/train-00127-of-00128
and
$VALIDATION_DIR/validation-00000-of-00008
$VALIDATION_DIR/validation-00001-of-00008
$VALIDATION_DIR/validation-00000-of-00024
$VALIDATION_DIR/validation-00001-of-00024
...
$VALIDATION_DIR/validation-00007-of-00008
$VALIDATION_DIR/validation-00023-of-00024
```
where 24 and 8 are the number of shards specified for each dataset,
where 128 and 24 are the number of shards specified for each dataset,
respectively. Generally speaking, we aim for selecting the number of shards such
that roughly 1024 images reside in each shard. Once this data set is built, you
are ready to train or fine-tune an Inception model on this data set.
Note, if you are piggy backing on the flowers retraining scripts, be sure to
update `num_classes()` and `num_examples_per_epoch()` in `flowers_data.py`
Note, if you are piggybacking on the flowers retraining scripts, be sure to
update `num_classes()` and `num_examples_per_epoch()` in `flowers_data.py`
to correspond with your data.
## Practical Considerations for Training a Model
......@@ -754,7 +765,7 @@ batch-splitting the model across multiple GPUs.
permit training the model with higher learning rates.
* Often the GPU memory is a bottleneck that prevents employing larger batch
sizes. Employing more GPUs allows one to user larger batch sizes because
sizes. Employing more GPUs allows one to use larger batch sizes because
this model splits the batch across the GPUs.
**NOTE** If one wishes to train this model with *asynchronous* gradient updates,
......@@ -801,8 +812,7 @@ comments in [`image_processing.py`](inception/image_processing.py) for more deta
#### The model runs out of CPU memory.
In lieu of buying more CPU memory, an easy fix is to decrease
`--input_queue_memory_factor`. See [Adjusting Memory Demands]
(#adjusting-memory-demands).
`--input_queue_memory_factor`. See [Adjusting Memory Demands](#adjusting-memory-demands).
#### The model runs out of GPU memory.
......
......@@ -32,7 +32,7 @@ a sharded data set consisting of TFRecord files
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-00127-of-01024
train_directory/train-01023-of-01024
and
......@@ -50,7 +50,7 @@ contains the following fields:
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always'JPEG'
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
......@@ -60,7 +60,7 @@ contains the following fields:
image/class/text: string specifying the human-readable version of the label
e.g. 'dog'
If you data set involves bounding boxes, please look at build_imagenet_data.py.
If your data set involves bounding boxes, please look at build_imagenet_data.py.
"""
from __future__ import absolute_import
from __future__ import division
......@@ -72,7 +72,6 @@ import random
import sys
import threading
import numpy as np
import tensorflow as tf
......@@ -199,7 +198,7 @@ def _process_image(filename, coder):
width: integer, image width in pixels.
"""
# Read the image file.
with tf.gfile.FastGFile(filename, 'r') as f:
with tf.gfile.FastGFile(filename, 'rb') as f:
image_data = f.read()
# Convert any PNGs to JPEGs for consistency.
......@@ -261,7 +260,12 @@ def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
label = labels[i]
text = texts[i]
image_buffer, height, width = _process_image(filename, coder)
try:
image_buffer, height, width = _process_image(filename, coder)
except Exception as e:
print(e)
print('SKIPPED: Unexpected eror while decoding %s.' % filename)
continue
example = _convert_to_example(filename, image_buffer, label,
text, height, width)
......@@ -301,7 +305,7 @@ def _process_image_files(name, filenames, texts, labels, num_shards):
spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int)
ranges = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i+1]])
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
......
......@@ -36,7 +36,7 @@ a sharded data set consisting of 1024 and 128 TFRecord files, respectively.
train_directory/train-00000-of-01024
train_directory/train-00001-of-01024
...
train_directory/train-00127-of-01024
train_directory/train-01023-of-01024
and
......@@ -54,7 +54,7 @@ serialized Example proto. The Example proto contains the following fields:
image/width: integer, image width in pixels
image/colorspace: string, specifying the colorspace, always 'RGB'
image/channels: integer, specifying the number of channels, always 3
image/format: string, specifying the format, always'JPEG'
image/format: string, specifying the format, always 'JPEG'
image/filename: string containing the basename of the image file
e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
......@@ -80,7 +80,7 @@ serialized Example proto. The Example proto contains the following fields:
Note that the length of xmin is identical to the length of xmax, ymin and ymax
for each example.
Running this script using 16 threads may take around ~2.5 hours on a HP Z420.
Running this script using 16 threads may take around ~2.5 hours on an HP Z420.
"""
from __future__ import absolute_import
from __future__ import division
......@@ -92,7 +92,6 @@ import random
import sys
import threading
import numpy as np
import tensorflow as tf
......@@ -435,7 +434,7 @@ def _process_image_files(name, filenames, synsets, labels, humans,
ranges = []
threads = []
for i in range(len(spacing) - 1):
ranges.append([spacing[i], spacing[i+1]])
ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
......
......@@ -35,13 +35,13 @@
set -e
if [ -z "$1" ]; then
echo "usage download_and_preprocess_flowers.sh [data dir]"
echo "Usage: download_and_preprocess_flowers.sh [data dir]"
exit
fi
# Create the output and temporary directories.
DATA_DIR="${1%/}"
SCRATCH_DIR="${DATA_DIR}/raw-data/"
SCRATCH_DIR="${DATA_DIR}/raw-data"
mkdir -p "${DATA_DIR}"
mkdir -p "${SCRATCH_DIR}"
WORK_DIR="$0.runfiles/inception/inception"
......@@ -53,14 +53,14 @@ cd "${DATA_DIR}"
TARBALL="flower_photos.tgz"
if [ ! -f ${TARBALL} ]; then
echo "Downloading flower data set."
wget -O ${TARBALL} "${DATA_URL}"
curl -o ${TARBALL} "${DATA_URL}"
else
echo "Skipping download of flower data."
fi
# Note the locations of the train and validation data.
TRAIN_DIRECTORY="${SCRATCH_DIR}train/"
VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/"
TRAIN_DIRECTORY="${SCRATCH_DIR}/train"
VALIDATION_DIRECTORY="${SCRATCH_DIR}/validation"
# Expand the data into the flower_photos/ directory and rename it as the
# train directory.
......@@ -74,14 +74,14 @@ ls -1 "${TRAIN_DIRECTORY}" | grep -v 'LICENSE' | sed 's/\///' | sort > "${LABELS
# Generate the validation data set.
while read LABEL; do
VALIDATION_DIR_FOR_LABEL="${VALIDATION_DIRECTORY}${LABEL}"
TRAIN_DIR_FOR_LABEL="${TRAIN_DIRECTORY}${LABEL}"
VALIDATION_DIR_FOR_LABEL="${VALIDATION_DIRECTORY}/${LABEL}"
TRAIN_DIR_FOR_LABEL="${TRAIN_DIRECTORY}/${LABEL}"
# Move the first randomly selected 100 images to the validation set.
mkdir -p "${VALIDATION_DIR_FOR_LABEL}"
VALIDATION_IMAGES=$(ls -1 "${TRAIN_DIR_FOR_LABEL}" | shuf | head -100)
for IMAGE in ${VALIDATION_IMAGES}; do
mv -f "${TRAIN_DIRECTORY}${LABEL}/${IMAGE}" "${VALIDATION_DIR_FOR_LABEL}"
mv -f "${TRAIN_DIRECTORY}/${LABEL}/${IMAGE}" "${VALIDATION_DIR_FOR_LABEL}"
done
done < "${LABELS_FILE}"
......
......@@ -35,7 +35,7 @@
set -e
if [ -z "$1" ]; then
echo "usage download_and_preprocess_flowers.sh [data dir]"
echo "Usage: download_and_preprocess_flowers.sh [data dir]"
exit
fi
......@@ -53,7 +53,7 @@ cd "${DATA_DIR}"
TARBALL="flower_photos.tgz"
if [ ! -f ${TARBALL} ]; then
echo "Downloading flower data set."
wget -O ${TARBALL} "${DATA_URL}"
curl -o ${TARBALL} "${DATA_URL}"
else
echo "Skipping download of flower data."
fi
......
......@@ -26,7 +26,7 @@
# data_dir/train-00000-of-01024
# data_dir/train-00001-of-01024
# ...
# data_dir/train-00127-of-01024
# data_dir/train-01023-of-01024
#
# and
#
......@@ -49,7 +49,7 @@
set -e
if [ -z "$1" ]; then
echo "usage download_and_preprocess_imagenet.sh [data dir]"
echo "Usage: download_and_preprocess_imagenet.sh [data dir]"
exit
fi
......@@ -84,7 +84,7 @@ BOUNDING_BOX_FILE="${SCRATCH_DIR}/imagenet_2012_bounding_boxes.csv"
BOUNDING_BOX_DIR="${SCRATCH_DIR}bounding_boxes/"
"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \
| sort >"${BOUNDING_BOX_FILE}"
| sort > "${BOUNDING_BOX_FILE}"
echo "Finished downloading and preprocessing the ImageNet data."
# Build the TFRecords version of the ImageNet data.
......
......@@ -24,7 +24,7 @@
# downloading the raw images.
#
# usage:
# ./download_imagenet.sh [dirname]
# ./download_imagenet.sh [dir name] [synsets file]
set -e
if [ "x$IMAGENET_ACCESS_KEY" == x -o "x$IMAGENET_USERNAME" == x ]; then
......@@ -40,7 +40,6 @@ fi
OUTDIR="${1:-./imagenet-data}"
SYNSETS_FILE="${2:-./synsets.txt}"
SYNSETS_FILE="${PWD}/${SYNSETS_FILE}"
echo "Saving downloaded files to $OUTDIR"
mkdir -p "${OUTDIR}"
......
......@@ -102,7 +102,9 @@ def GetItem(name, root, index=0):
def GetInt(name, root, index=0):
  """Returns the item looked up by GetItem(name, root, index) as an int.

  Args:
    name: passed through unchanged to GetItem.
    root: passed through unchanged to GetItem.
    index: passed through unchanged to GetItem; defaults to 0.

  Returns:
    The looked-up value converted to an int. A float-formatted value such as
    '12.7' is truncated toward zero (12) rather than rejected.

  Raises:
    ValueError: if the looked-up value cannot be parsed as a number.
  """
  # In some XML annotation files, the point values are not integers, but floats.
  # Parsing through float() first avoids the ValueError that int('12.7') raises.
  return int(float(GetItem(name, root, index)))
def FindNumberBoundingBoxes(root):
......
......@@ -142,11 +142,12 @@ def decode_jpeg(image_buffer, scope=None):
Args:
image_buffer: scalar string Tensor.
scope: Optional scope for op_scope.
scope: Optional scope for name_scope.
Returns:
3-D float Tensor with values ranging from [0, 1).
"""
with tf.op_scope([image_buffer], scope, 'decode_jpeg'):
with tf.name_scope(values=[image_buffer], name=scope,
default_name='decode_jpeg'):
# Decode the string as an RGB JPEG.
# Note that the resulting image contains an unknown height and width
# that is set dynamically by decode_jpeg. In other words, the height
......@@ -171,11 +172,11 @@ def distort_color(image, thread_id=0, scope=None):
Args:
image: Tensor containing single image.
thread_id: preprocessing thread ID.
scope: Optional scope for op_scope.
scope: Optional scope for name_scope.
Returns:
color-distorted image
"""
with tf.op_scope([image], scope, 'distort_color'):
with tf.name_scope(values=[image], name=scope, default_name='distort_color'):
color_ordering = thread_id % 2
if color_ordering == 0:
......@@ -209,11 +210,12 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None):
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax].
thread_id: integer indicating the preprocessing thread.
scope: Optional scope for op_scope.
scope: Optional scope for name_scope.
Returns:
3-D float Tensor of distorted image used for training.
"""
with tf.op_scope([image, height, width, bbox], scope, 'distort_image'):
with tf.name_scope(values=[image, height, width, bbox], name=scope,
default_name='distort_image'):
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
......@@ -221,7 +223,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None):
if not thread_id:
image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
bbox)
tf.image_summary('image_with_bounding_boxes', image_with_box)
tf.summary.image('image_with_bounding_boxes', image_with_box)
# A large fraction of image datasets contain a human-annotated bounding
# box delineating the region of the image containing the object of interest.
......@@ -242,7 +244,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None):
if not thread_id:
image_with_distorted_box = tf.image.draw_bounding_boxes(
tf.expand_dims(image, 0), distort_bbox)
tf.image_summary('images_with_distorted_bounding_box',
tf.summary.image('images_with_distorted_bounding_box',
image_with_distorted_box)
# Crop the image to the specified bounding box.
......@@ -259,7 +261,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None):
# the third dimension.
distorted_image.set_shape([height, width, 3])
if not thread_id:
tf.image_summary('cropped_resized_image',
tf.summary.image('cropped_resized_image',
tf.expand_dims(distorted_image, 0))
# Randomly flip the image horizontally.
......@@ -269,7 +271,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None):
distorted_image = distort_color(distorted_image, thread_id)
if not thread_id:
tf.image_summary('final_distorted_image',
tf.summary.image('final_distorted_image',
tf.expand_dims(distorted_image, 0))
return distorted_image
......@@ -281,11 +283,12 @@ def eval_image(image, height, width, scope=None):
image: 3-D float Tensor
height: integer
width: integer
scope: Optional scope for op_scope.
scope: Optional scope for name_scope.
Returns:
3-D float Tensor of prepared image.
"""
with tf.op_scope([image, height, width], scope, 'eval_image'):
with tf.name_scope(values=[image, height, width], name=scope,
default_name='eval_image'):
# Crop the central region of the image with an area containing 87.5% of
# the original image.
image = tf.image.central_crop(image, central_fraction=0.875)
......@@ -328,8 +331,8 @@ def image_preprocessing(image_buffer, bbox, train, thread_id=0):
image = eval_image(image, height, width)
# Finally, rescale to [-1,1] instead of [0, 1)
image = tf.sub(image, 0.5)
image = tf.mul(image, 2.0)
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.0)
return image
......@@ -394,7 +397,7 @@ def parse_example_proto(example_serialized):
ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
# Note that we impose an ordering of (y, x) just to make life difficult.
bbox = tf.concat(0, [ymin, xmin, ymax, xmax])
bbox = tf.concat(axis=0, values=[ymin, xmin, ymax, xmax])
# Force the variable number of bounding boxes into the shape
# [1, num_boxes, coords].
......@@ -505,6 +508,6 @@ def batch_inputs(dataset, batch_size, train, num_preprocess_threads=None,
images = tf.reshape(images, shape=[batch_size, height, width, depth])
# Display the training images in the visualizer.
tf.image_summary('images', images)
tf.summary.image('images', images)
return images, tf.reshape(label_index_batch, [batch_size])
......@@ -45,7 +45,8 @@ def main(unused_args):
{'ps': ps_hosts,
'worker': worker_hosts},
job_name=FLAGS.job_name,
task_index=FLAGS.task_id)
task_index=FLAGS.task_id,
protocol=FLAGS.protocol)
if FLAGS.job_name == 'ps':
# `ps` jobs wait for incoming connections from the workers.
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A binary to evaluate Inception on the flowers data set.
"""A binary to evaluate Inception on the ImageNet data set.
Note that using the supplied pre-trained inception checkpoint, the eval should
achieve:
......
......@@ -42,6 +42,9 @@ tf.app.flags.DEFINE_string('worker_hosts', '',
"""Comma-separated list of hostname:port for the """
"""worker jobs. e.g. """
"""'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_string('protocol', 'grpc',
"""Communication protocol to use in distributed """
"""execution (default grpc) """)
tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train',
"""Directory where to write event logs """
......@@ -52,11 +55,11 @@ tf.app.flags.DEFINE_boolean('log_device_placement', False,
'Whether to log device placement.')
# Task ID is used to select the chief and also to access the local_step for
# each replica to check staleness of the gradients in sync_replicas_optimizer.
# each replica to check staleness of the gradients in SyncReplicasOptimizer.
tf.app.flags.DEFINE_integer(
'task_id', 0, 'Task ID of the worker/replica running the training.')
# More details can be found in the sync_replicas_optimizer class:
# More details can be found in the SyncReplicasOptimizer class:
# tensorflow/python/training/sync_replicas_optimizer.py
tf.app.flags.DEFINE_integer('num_replicas_to_aggregate', -1,
"""Number of gradients to collect before """
......@@ -89,7 +92,7 @@ RMSPROP_EPSILON = 1.0 # Epsilon term for RMSProp.
def train(target, dataset, cluster_spec):
"""Train Inception on a dataset for a number of steps."""
# Number of workers and parameter servers are infered from the workers and ps
# Number of workers and parameter servers are inferred from the workers and ps
# hosts string.
num_workers = len(cluster_spec.as_dict()['worker'])
num_parameter_servers = len(cluster_spec.as_dict()['ps'])
......@@ -133,7 +136,7 @@ def train(target, dataset, cluster_spec):
FLAGS.learning_rate_decay_factor,
staircase=True)
# Add a summary to track the learning rate.
tf.scalar_summary('learning_rate', lr)
tf.summary.scalar('learning_rate', lr)
# Create an optimizer that performs gradient descent.
opt = tf.train.RMSPropOptimizer(lr,
......@@ -171,8 +174,8 @@ def train(target, dataset, cluster_spec):
loss_name = l.op.name
# Name each loss as '(raw)' and name the moving average version of the
# loss as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
tf.summary.scalar(loss_name + ' (raw)', l)
tf.summary.scalar(loss_name, loss_averages.average(l))
# Add dependency to compute loss_averages.
with tf.control_dependencies([loss_averages_op]):
......@@ -191,13 +194,12 @@ def train(target, dataset, cluster_spec):
# Add histograms for model variables.
for var in variables_to_average:
tf.histogram_summary(var.op.name, var)
tf.summary.histogram(var.op.name, var)
# Create synchronous replica optimizer.
opt = tf.train.SyncReplicasOptimizer(
opt,
replicas_to_aggregate=num_replicas_to_aggregate,
replica_id=FLAGS.task_id,
total_num_replicas=num_workers,
variable_averages=exp_moving_averager,
variables_to_average=variables_to_average)
......@@ -215,25 +217,23 @@ def train(target, dataset, cluster_spec):
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
tf.histogram_summary(var.op.name + '/gradients', grad)
tf.summary.histogram(var.op.name + '/gradients', grad)
apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
with tf.control_dependencies([apply_gradients_op]):
train_op = tf.identity(total_loss, name='train_op')
# Get chief queue_runners, init_tokens and clean_up_op, which is used to
# synchronize replicas.
# More details can be found in sync_replicas_optimizer.
# Get chief queue_runners and init_tokens, which are used to synchronize
# replicas. More details can be found in SyncReplicasOptimizer.
chief_queue_runners = [opt.get_chief_queue_runner()]
init_tokens_op = opt.get_init_tokens_op()
clean_up_op = opt.get_clean_up_op()
# Create a saver.
saver = tf.train.Saver()
# Build the summary operation based on the TF collection of Summaries.
summary_op = tf.merge_all_summaries()
summary_op = tf.summary.merge_all()
# Build an initialization operation to run below.
init_op = tf.global_variables_initializer()
......@@ -301,8 +301,7 @@ def train(target, dataset, cluster_spec):
next_summary_time += FLAGS.save_summaries_secs
except:
if is_chief:
tf.logging.info('About to execute sync_clean_up_op!')
sess.run(clean_up_op)
tf.logging.info('Chief got exception while running!')
raise
# Stop the supervisor. This also waits for service threads to finish.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment