Commit 3e93722a authored by Neal Wu, committed by GitHub

Merge branch 'master' into master

parents 2335c9fc 4de34a4c
@@ -9,15 +9,15 @@ To propose a model for inclusion please submit a pull request.
 ## Models
 - [autoencoder](autoencoder) -- various autoencoders
+- [differential_privacy](differential_privacy) -- privacy-preserving student models from multiple teachers
+- [im2txt](im2txt) -- image-to-text neural network for image captioning.
 - [inception](inception) -- deep convolutional networks for computer vision
 - [namignizer](namignizer) -- recognize and generate names
 - [neural_gpu](neural_gpu) -- highly parallel neural computer
-- [privacy](privacy) -- privacy-preserving student models from multiple teachers
+- [neural_programmer](neural_programmer) -- neural network augmented with logic and mathematical operations.
 - [resnet](resnet) -- deep and wide residual networks
 - [slim](slim) -- image classification models in TF-Slim
 - [swivel](swivel) -- the Swivel algorithm for generating word embeddings
 - [syntaxnet](syntaxnet) -- neural models of natural language syntax
 - [textsum](textsum) -- sequence-to-sequence with attention model for text summarization.
 - [transformer](transformer) -- spatial transformer network, which allows the spatial manipulation of data within the network
-- [im2txt](im2txt) -- image-to-text neural network for image captioning.
-- [neural_programmer](neural programmer) -- neural network augmented with logic and mathematic operations.
@@ -341,7 +341,7 @@ def loss_fun(logits, labels):
   # Calculate the cross entropy between labels and predictions
   labels = tf.cast(labels, tf.int64)
   cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-      logits, labels, name='cross_entropy_per_example')
+      logits=logits, labels=labels, name='cross_entropy_per_example')
   # Calculate the average cross entropy loss across the batch.
   cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
...
@@ -119,7 +119,7 @@ approximately 10 times slower.
 First ensure that you have installed the following required packages:
 * **Bazel** ([instructions](http://bazel.io/docs/install.html)).
-* **TensorFlow** ([instructions](https://www.tensorflow.org/versions/master/get_started/os_setup.html)).
+* **TensorFlow** r0.12 or greater ([instructions](https://www.tensorflow.org/versions/master/get_started/os_setup.html)).
 * **NumPy** ([instructions](http://www.scipy.org/install.html)).
 * **Natural Language Toolkit (NLTK)**:
     * First install NLTK ([instructions](http://www.nltk.org/install.html)).
...
@@ -54,6 +54,16 @@ class Caption(object):
       return -1
     else:
       return 1

+  # For Python 3 compatibility (__cmp__ is deprecated).
+  def __lt__(self, other):
+    assert isinstance(other, Caption)
+    return self.score < other.score
+
+  # Also for Python 3 compatibility.
+  def __eq__(self, other):
+    assert isinstance(other, Caption)
+    return self.score == other.score
+

 class TopN(object):
...
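For context on the `__lt__`/`__eq__` methods added above: Python 3 no longer calls `__cmp__`, so objects that get pushed onto a heap (as captions are during beam search) must be orderable via `__lt__` themselves. A minimal, standalone illustration of the requirement, not code from this repository:

```python
import heapq

class Scored(object):
  """Toy stand-in for Caption: heapq in Python 3 orders items via __lt__."""

  def __init__(self, score):
    self.score = score

  def __lt__(self, other):
    return self.score < other.score

  def __eq__(self, other):
    return self.score == other.score

heap = []
for s in [0.3, 0.9, 0.1]:
  heapq.heappush(heap, Scored(s))
print(heapq.heappop(heap).score)  # 0.1 -- the smallest score comes out first.
```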
@@ -41,7 +41,7 @@ class InceptionV3Test(tf.test.TestCase):
   def _countInceptionParameters(self):
     """Counts the number of parameters in the inception model at top scope."""
     counter = {}
-    for v in tf.all_variables():
+    for v in tf.global_variables():
       name_tokens = v.op.name.split("/")
       if name_tokens[0] == "InceptionV3":
         name = "InceptionV3/" + name_tokens[1]
@@ -85,7 +85,7 @@ class InceptionV3Test(tf.test.TestCase):
     self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
     self._verifyParameterCounts()
-    self._assertCollectionSize(376, tf.GraphKeys.VARIABLES)
+    self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
     self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES)
     self._assertCollectionSize(188, tf.GraphKeys.UPDATE_OPS)
     self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES)
@@ -98,7 +98,7 @@ class InceptionV3Test(tf.test.TestCase):
     self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
     self._verifyParameterCounts()
-    self._assertCollectionSize(376, tf.GraphKeys.VARIABLES)
+    self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
     self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES)
     self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS)
     self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES)
@@ -111,7 +111,7 @@ class InceptionV3Test(tf.test.TestCase):
     self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
     self._verifyParameterCounts()
-    self._assertCollectionSize(376, tf.GraphKeys.VARIABLES)
+    self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
     self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES)
     self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS)
     self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES)
@@ -124,7 +124,7 @@ class InceptionV3Test(tf.test.TestCase):
     self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
     self._verifyParameterCounts()
-    self._assertCollectionSize(376, tf.GraphKeys.VARIABLES)
+    self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
     self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES)
     self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS)
     self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES)
...
@@ -92,7 +92,7 @@ def process_image(encoded_image,
   # only logged in thread 0.
   def image_summary(name, image):
     if not thread_id:
-      tf.image_summary(name, tf.expand_dims(image, 0))
+      tf.summary.image(name, tf.expand_dims(image, 0))
   # Decode image into a float32 Tensor of shape [?, ?, 3] with values in [0, 1).
   with tf.name_scope("decode", values=[encoded_image]):
@@ -128,6 +128,6 @@ def process_image(encoded_image,
   image_summary("final_image", image)
   # Rescale to [-1,1] instead of [0, 1]
-  image = tf.sub(image, 0.5)
-  image = tf.mul(image, 2.0)
+  image = tf.subtract(image, 0.5)
+  image = tf.multiply(image, 2.0)
   return image
@@ -116,7 +116,7 @@ def prefetch_input_data(reader,
       enqueue_ops.append(values_queue.enqueue([value]))
   tf.train.queue_runner.add_queue_runner(tf.train.queue_runner.QueueRunner(
       values_queue, enqueue_ops))
-  tf.scalar_summary(
+  tf.summary.scalar(
       "queue/%s/fraction_of_%d_full" % (values_queue.name, capacity),
       tf.cast(values_queue.size(), tf.float32) * (1. / capacity))
@@ -181,7 +181,7 @@ def batch_with_dynamic_pad(images_and_captions,
   enqueue_list = []
   for image, caption in images_and_captions:
     caption_length = tf.shape(caption)[0]
-    input_length = tf.expand_dims(tf.sub(caption_length, 1), 0)
+    input_length = tf.expand_dims(tf.subtract(caption_length, 1), 0)
     input_seq = tf.slice(caption, [0], input_length)
     target_seq = tf.slice(caption, [1], input_length)
@@ -197,8 +197,8 @@ def batch_with_dynamic_pad(images_and_captions,
   if add_summaries:
     lengths = tf.add(tf.reduce_sum(mask, 1), 1)
-    tf.scalar_summary("caption_length/batch_min", tf.reduce_min(lengths))
-    tf.scalar_summary("caption_length/batch_max", tf.reduce_max(lengths))
-    tf.scalar_summary("caption_length/batch_mean", tf.reduce_mean(lengths))
+    tf.summary.scalar("caption_length/batch_min", tf.reduce_min(lengths))
+    tf.summary.scalar("caption_length/batch_max", tf.reduce_max(lengths))
+    tf.summary.scalar("caption_length/batch_mean", tf.reduce_mean(lengths))
   return images, input_seqs, target_seqs, mask
@@ -244,10 +244,10 @@ class ShowAndTellModel(object):
     # This LSTM cell has biases and outputs tanh(new_c) * sigmoid(o), but the
     # modified LSTM in the "Show and Tell" paper has no biases and outputs
     # new_c * sigmoid(o).
-    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(
+    lstm_cell = tf.contrib.rnn.BasicLSTMCell(
         num_units=self.config.num_lstm_units, state_is_tuple=True)
     if self.mode == "train":
-      lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
+      lstm_cell = tf.contrib.rnn.DropoutWrapper(
           lstm_cell,
           input_keep_prob=self.config.lstm_dropout_keep_prob,
           output_keep_prob=self.config.lstm_dropout_keep_prob)
@@ -264,13 +264,13 @@ class ShowAndTellModel(object):
       if self.mode == "inference":
         # In inference mode, use concatenated states for convenient feeding and
         # fetching.
-        tf.concat(1, initial_state, name="initial_state")
+        tf.concat_v2(initial_state, 1, name="initial_state")
         # Placeholder for feeding a batch of concatenated states.
         state_feed = tf.placeholder(dtype=tf.float32,
                                     shape=[None, sum(lstm_cell.state_size)],
                                     name="state_feed")
-        state_tuple = tf.split(1, 2, state_feed)
+        state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1)
         # Run a single LSTM step.
         lstm_outputs, state_tuple = lstm_cell(
@@ -278,7 +278,7 @@ class ShowAndTellModel(object):
             state=state_tuple)
         # Concatentate the resulting state.
-        tf.concat(1, state_tuple, name="state")
+        tf.concat_v2(state_tuple, 1, name="state")
       else:
         # Run the batch of sequence embeddings through the LSTM.
         sequence_length = tf.reduce_sum(self.input_mask, 1)
@@ -307,18 +307,19 @@ class ShowAndTellModel(object):
       weights = tf.to_float(tf.reshape(self.input_mask, [-1]))
       # Compute losses.
-      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, targets)
-      batch_loss = tf.div(tf.reduce_sum(tf.mul(losses, weights)),
+      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
+                                                              logits=logits)
+      batch_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
                           tf.reduce_sum(weights),
                           name="batch_loss")
-      tf.contrib.losses.add_loss(batch_loss)
-      total_loss = tf.contrib.losses.get_total_loss()
+      tf.losses.add_loss(batch_loss)
+      total_loss = tf.losses.get_total_loss()
       # Add summaries.
-      tf.scalar_summary("batch_loss", batch_loss)
-      tf.scalar_summary("total_loss", total_loss)
+      tf.summary.scalar("losses/batch_loss", batch_loss)
+      tf.summary.scalar("losses/total_loss", total_loss)
       for var in tf.trainable_variables():
-        tf.histogram_summary(var.op.name, var)
+        tf.summary.histogram("parameters/" + var.op.name, var)
       self.total_loss = total_loss
       self.target_cross_entropy_losses = losses  # Used in evaluation.
...
@@ -63,7 +63,7 @@ class ShowAndTellModelTest(tf.test.TestCase):
   def _countModelParameters(self):
     """Counts the number of parameters in the model at top level scope."""
     counter = {}
-    for v in tf.all_variables():
+    for v in tf.global_variables():
       name = v.op.name.split("/")[0]
       num_params = v.get_shape().num_elements()
       assert num_params
@@ -98,7 +98,7 @@ class ShowAndTellModelTest(tf.test.TestCase):
     fetches = expected_shapes.keys()
     with self.test_session() as sess:
-      sess.run(tf.initialize_all_variables())
+      sess.run(tf.global_variables_initializer())
       outputs = sess.run(fetches, feed_dict)
       for index, output in enumerate(outputs):
...
@@ -137,13 +137,13 @@ def _convert_to_example(filename, image_buffer, label, text, height, width):
   example = tf.train.Example(features=tf.train.Features(feature={
       'image/height': _int64_feature(height),
       'image/width': _int64_feature(width),
-      'image/colorspace': _bytes_feature(colorspace),
+      'image/colorspace': _bytes_feature(tf.compat.as_bytes(colorspace)),
       'image/channels': _int64_feature(channels),
       'image/class/label': _int64_feature(label),
-      'image/class/text': _bytes_feature(text),
-      'image/format': _bytes_feature(image_format),
-      'image/filename': _bytes_feature(os.path.basename(filename)),
-      'image/encoded': _bytes_feature(image_buffer)}))
+      'image/class/text': _bytes_feature(tf.compat.as_bytes(text)),
+      'image/format': _bytes_feature(tf.compat.as_bytes(image_format)),
+      'image/filename': _bytes_feature(tf.compat.as_bytes(os.path.basename(filename))),
+      'image/encoded': _bytes_feature(tf.compat.as_bytes(image_buffer))}))
   return example
...
@@ -79,7 +79,7 @@ RMSPROP_MOMENTUM = 0.9  # Momentum in RMSProp.
 RMSPROP_EPSILON = 1.0  # Epsilon term for RMSProp.
-def _tower_loss(images, labels, num_classes, scope):
+def _tower_loss(images, labels, num_classes, scope, reuse_variables=None):
   """Calculate the total loss on a single tower running the ImageNet model.
   We perform 'batch splitting'. This means that we cut up a batch across
@@ -103,9 +103,10 @@ def _tower_loss(images, labels, num_classes, scope):
   restore_logits = not FLAGS.fine_tune
   # Build inference Graph.
-  logits = inception.inference(images, num_classes, for_training=True,
-                               restore_logits=restore_logits,
-                               scope=scope)
+  with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
+    logits = inception.inference(images, num_classes, for_training=True,
+                                 restore_logits=restore_logits,
+                                 scope=scope)
   # Build the portion of the Graph calculating the losses. Note that we will
   # assemble the total_loss using a custom function below.
@@ -220,13 +221,14 @@ def train(dataset):
     # Number of classes in the Dataset label set plus 1.
     # Label 0 is reserved for an (unused) background class.
     num_classes = dataset.num_classes() + 1
     # Split the batch of images and labels for towers.
     images_splits = tf.split(0, FLAGS.num_gpus, images)
     labels_splits = tf.split(0, FLAGS.num_gpus, labels)
     # Calculate the gradients for each model tower.
     tower_grads = []
+    reuse_variables = None
     for i in range(FLAGS.num_gpus):
       with tf.device('/gpu:%d' % i):
         with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
@@ -236,10 +238,10 @@ def train(dataset):
           # function constructs the entire ImageNet model but shares the
           # variables across all towers.
           loss = _tower_loss(images_splits[i], labels_splits[i], num_classes,
-                             scope)
+                             scope, reuse_variables)
           # Reuse variables for the next tower.
-          tf.get_variable_scope().reuse_variables()
+          reuse_variables = True
           # Retain the summaries from the final tower.
           summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
...
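For context on the `reuse_variables` change above: every GPU tower now builds the inference graph inside the same variable scope, with only the first tower allowed to create variables and all later towers reusing them. A minimal, standalone sketch of that pattern (illustrative only, not the repository's code):

```python
import tensorflow as tf

def tower(x, reuse):
  # The first call (reuse=None) creates "shared/w"; later calls reuse it.
  with tf.variable_scope("shared", reuse=reuse):
    w = tf.get_variable("w", shape=[3, 2], initializer=tf.ones_initializer())
  return tf.matmul(x, w)

x = tf.placeholder(tf.float32, [None, 3])
outputs = []
reuse = None
for i in range(2):
  with tf.name_scope("tower_%d" % i):
    outputs.append(tower(x, reuse))
  reuse = True  # Every tower after the first shares the same weights.
```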
@@ -163,8 +163,8 @@ def cross_entropy_loss(logits, one_hot_labels, label_smoothing=0,
     smooth_positives = 1.0 - label_smoothing
     smooth_negatives = label_smoothing / num_classes
     one_hot_labels = one_hot_labels * smooth_positives + smooth_negatives
-  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits,
-                                                          one_hot_labels,
+  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
+                                                          labels=one_hot_labels,
                                                           name='xentropy')
   weight = tf.convert_to_tensor(weight,
                                 dtype=logits.dtype.base_dtype,
...
@@ -91,7 +91,7 @@ def batch_norm(inputs,
   if scale:
     gamma = variables.variable('gamma',
                                params_shape,
-                               initializer=tf.ones_initializer,
+                               initializer=tf.ones_initializer(),
                                trainable=trainable,
                                restore=restore)
   # Create moving_mean and moving_variance add them to
@@ -105,7 +105,7 @@ def batch_norm(inputs,
                                        collections=moving_collections)
   moving_variance = variables.variable('moving_variance',
                                        params_shape,
-                                       initializer=tf.ones_initializer,
+                                       initializer=tf.ones_initializer(),
                                        trainable=False,
                                        restore=restore,
                                        collections=moving_collections)
...
@@ -72,7 +72,7 @@ class Parameters:
         self.RandomUniformInit([1, embedding_dims]))
     params["break_conditional"] = tf.Variable(
         self.RandomUniformInit([2 * embedding_dims, embedding_dims]))
-    init = tf.initialize_all_variables()
+    init = tf.global_variables_initializer()
     return params, global_step, init
   def RandomUniformInit(self, shape):
...
<font size=4><b>Visual Dynamics: Probabilistic Future Frame Synthesis via Cross Convolutional Networks.</b></font>
<b>Introduction</b>
https://arxiv.org/pdf/1607.02586v1.pdf
This is an implementation based on my understanding, with small
variations. It doesn't necessarily represent the paper published
by the original authors.
Authors: Xin Pan (Github: panyx0718), Anelia Angelova
<b>Results:</b>
<left>
![Sample1](g3doc/cross_conv.png)
</left>
<left>
![Sample2](g3doc/cross_conv2.png)
</left>
<left>
![Loss](g3doc/cross_conv3.png)
</left>
<b>Prerequisites:</b>
1. Install TensorFlow (r0.12) and Bazel.
2. Download the Sprites dataset or generate the moving-object dataset.
Sprites data is located here:
http://www.scottreed.info/files/nips2015-analogy-data.tar.gz
Convert the .mat files into images and use sprites_gen.py to convert them
to tf.SequenceExample records (see the sketch below).
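For reference, the record format the training code expects mirrors the example_gen.py and reader.py code included later in this commit: each record is a tf.SequenceExample whose `moving_objs` feature list holds consecutive flattened frames. A rough sketch of packing one frame pair (the .mat parsing itself is dataset-specific and not shown; the path and array values below are placeholders):

```python
import numpy as np
import tensorflow as tf

def frames_to_sequence_example(frame1, frame2):
  """Pack two consecutive [H, W, 3] float frames into a tf.SequenceExample."""
  example = tf.train.SequenceExample()
  feature_list = example.feature_lists.feature_list['moving_objs']
  for frame in (frame1, frame2):
    feature = feature_list.feature.add()
    feature.float_list.value.extend(np.reshape(frame, [-1]).tolist())
  return example

# Placeholder usage: write a single record to the training file.
writer = tf.python_io.TFRecordWriter('data/tfrecords')
writer.write(frames_to_sequence_example(
    np.zeros([60, 60, 3]), np.ones([60, 60, 3])).SerializeToString())
writer.close()
```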
<b>How to run:</b>
```shell
ls -R
.:
data next_frame_prediction WORKSPACE
./data:
tfrecords tfrecords_test
./next_frame_prediction:
cross_conv g3doc README.md
./next_frame_prediction/cross_conv:
BUILD eval.py objects_gen.py model.py reader.py sprites_gen.py train.py
./next_frame_prediction/g3doc:
cross_conv2.png cross_conv3.png cross_conv.png
# Build everything.
bazel build -c opt next_frame_prediction/...
# The following example runs the generated 2d objects.
# For Sprites dataset, image_size should be 60, norm_scale should be 255.0.
# Batch size is normally 16~64, depending on your memory size.
#
# Run training.
bazel-bin/next_frame_prediction/cross_conv/train \
--batch_size=1 \
--data_filepattern=data/tfrecords \
--image_size=64 \
--log_root=/tmp/predict
step: 1, loss: 24.428671
step: 2, loss: 19.211605
step: 3, loss: 5.543143
step: 4, loss: 3.035339
step: 5, loss: 1.771392
step: 6, loss: 2.099824
step: 7, loss: 1.747665
step: 8, loss: 1.572436
step: 9, loss: 1.586816
step: 10, loss: 1.434191
#
# Run eval.
bazel-bin/next_frame_prediction/cross_conv/eval \
--batch_size=1 \
--data_filepattern=data/tfrecords_test \
--image_size=64 \
--log_root=/tmp/predict
```
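Before training, a quick way to sanity-check that records were actually written (an illustrative snippet, assuming the `data/tfrecords` path used above):

```python
import tensorflow as tf

# Count the serialized tf.SequenceExample records in the training file.
num_records = sum(1 for _ in tf.python_io.tf_record_iterator('data/tfrecords'))
print('records: %d' % num_records)
```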
licenses(["notice"]) # Apache 2.0
package_group(
name = "internal",
packages = [
"//next_frame_prediction/...",
],
)
package(default_visibility = [":internal"])
py_library(
name = "model",
srcs = ["model.py"],
)
py_library(
name = "reader",
srcs = ["reader.py"],
)
py_binary(
name = "train",
srcs = ["train.py"],
deps = [
":model",
":reader",
],
)
py_binary(
name = "eval",
srcs = ["eval.py"],
deps = [
":model",
":reader",
],
)
py_binary(
name = "example_gen",
srcs = ["example_gen.py"],
)
py_binary(
name = "sprites_gen",
srcs = ["sprites_gen.py"],
)
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Eval Cross Convolutional Model."""
import io
import os
import sys
import time
import numpy as np
import tensorflow as tf
import model as cross_conv_model
import reader
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('log_root', '/tmp/moving_obj', 'The root dir of output.')
tf.flags.DEFINE_string('data_filepattern',
'est',
'training data file pattern.')
tf.flags.DEFINE_integer('batch_size', 1, 'Batch size.')
tf.flags.DEFINE_integer('image_size', 64, 'Image height and width.')
tf.flags.DEFINE_float('norm_scale', 1.0, 'Normalize the original image')
tf.flags.DEFINE_float('scale', 10.0,
'Scale the image after norm_scale and move the diff '
'to the positive realm.')
tf.flags.DEFINE_integer('sequence_length', 2, 'tf.SequenceExample length.')
tf.flags.DEFINE_integer('eval_batch_count', 100,
                        'Number of batches to average the result over.')
tf.flags.DEFINE_bool('l2_loss', True, 'If true, include l2_loss.')
tf.flags.DEFINE_bool('reconstr_loss', False, 'If true, include reconstr_loss.')
tf.flags.DEFINE_bool('kl_loss', True, 'If true, include KL loss.')
slim = tf.contrib.slim
def _Eval():
params = dict()
params['batch_size'] = FLAGS.batch_size
params['seq_len'] = FLAGS.sequence_length
params['image_size'] = FLAGS.image_size
params['is_training'] = False
params['norm_scale'] = FLAGS.norm_scale
params['scale'] = FLAGS.scale
params['l2_loss'] = FLAGS.l2_loss
params['reconstr_loss'] = FLAGS.reconstr_loss
params['kl_loss'] = FLAGS.kl_loss
eval_dir = os.path.join(FLAGS.log_root, 'eval')
images = reader.ReadInput(
FLAGS.data_filepattern, shuffle=False, params=params)
images *= params['scale']
  # Increasing the value makes training much faster.
image_diff_list = reader.SequenceToImageAndDiff(images)
model = cross_conv_model.CrossConvModel(image_diff_list, params)
model.Build()
summary_writer = tf.summary.FileWriter(eval_dir)
saver = tf.train.Saver()
sess = tf.Session('', config=tf.ConfigProto(allow_soft_placement=True))
tf.train.start_queue_runners(sess)
while True:
time.sleep(60)
try:
ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
except tf.errors.OutOfRangeError as e:
sys.stderr.write('Cannot restore checkpoint: %s\n' % e)
continue
if not (ckpt_state and ckpt_state.model_checkpoint_path):
sys.stderr.write('No model to eval yet at %s\n' % FLAGS.log_root)
continue
sys.stderr.write('Loading checkpoint %s\n' %
ckpt_state.model_checkpoint_path)
saver.restore(sess, ckpt_state.model_checkpoint_path)
# Use the empirical distribution of z from training set.
if not tf.gfile.Exists(os.path.join(FLAGS.log_root, 'z_mean.npy')):
sys.stderr.write('No z at %s\n' % FLAGS.log_root)
continue
with tf.gfile.Open(os.path.join(FLAGS.log_root, 'z_mean.npy')) as f:
sample_z_mean = np.load(io.BytesIO(f.read()))
with tf.gfile.Open(
os.path.join(FLAGS.log_root, 'z_stddev_log.npy')) as f:
sample_z_stddev_log = np.load(io.BytesIO(f.read()))
total_loss = 0.0
for _ in xrange(FLAGS.eval_batch_count):
loss_val, total_steps, summaries = sess.run(
[model.loss, model.global_step, model.summary_op],
feed_dict={model.z_mean: sample_z_mean,
model.z_stddev_log: sample_z_stddev_log})
total_loss += loss_val
summary_writer.add_summary(summaries, total_steps)
sys.stderr.write('steps: %d, loss: %f\n' %
(total_steps, total_loss / FLAGS.eval_batch_count))
def main(_):
_Eval()
if __name__ == '__main__':
tf.app.run()
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generate examples of two objects moving in different directions."""
import random
import sys
import numpy as np
import tensorflow as tf
tf.flags.DEFINE_string('out_file', '',
'Output file for the tfrecords.')
def _add_object(obj_type, image, image2, xpos, ypos):
"""Add a moving obj to two consecutive images."""
obj_size = random.randint(8, 10)
channel = random.randint(0, 2)
move = random.randint(6, 10)
obj = np.zeros([obj_size, obj_size, 3])
if obj_type == 'rectangle':
xpos2 = xpos + move
ypos2 = ypos
for i in xrange(obj_size):
obj[i, 0:i+1, channel] = [1.0 for _ in xrange(i+1)]
elif obj_type == 'square':
xpos2 = xpos
ypos2 = ypos + move
obj[:, :, channel] = 1.0
for x in xrange(obj_size):
for y in xrange(obj_size):
if obj[x, y, channel] == 1.0:
image[xpos+x, ypos+y, channel] = 1.0
image2[xpos2+x, ypos2+y, channel] = 1.0
def _images_to_example(image, image2):
"""Convert two consecutive images to SequenceExample."""
example = tf.SequenceExample()
feature_list = example.feature_lists.feature_list['moving_objs']
feature = feature_list.feature.add()
feature.float_list.value.extend(np.reshape(image, [-1]).tolist())
feature = feature_list.feature.add()
feature.float_list.value.extend(np.reshape(image2, [-1]).tolist())
return example
def generate_input():
"""Generate tfrecords."""
writer = tf.python_io.TFRecordWriter(tf.flags.FLAGS.out_file)
writer2 = tf.python_io.TFRecordWriter(tf.flags.FLAGS.out_file + '_test')
examples = []
for xpos in xrange(0, 40, 3):
for ypos in xrange(0, 40, 3):
for xpos2 in xrange(0, 40, 3):
for ypos2 in xrange(0, 40, 3):
image = np.zeros([64, 64, 3])
image2 = np.zeros([64, 64, 3])
_add_object('rectangle', image, image2, xpos, ypos)
_add_object('square', image, image2, xpos2, ypos2)
examples.append(_images_to_example(image, image2))
  sys.stderr.write('Finished generating examples.\n')
random.shuffle(examples)
for count, ex in enumerate(examples):
if count % 10 == 0:
writer2.write(ex.SerializeToString())
else:
writer.write(ex.SerializeToString())
def main(_):
generate_input()
if __name__ == '__main__':
tf.app.run()
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Cross Convolutional Model.
https://arxiv.org/pdf/1607.02586v1.pdf
"""
import math
import sys
import tensorflow as tf
slim = tf.contrib.slim
class CrossConvModel(object):
def __init__(self, image_diff_list, params):
"""Constructor.
Args:
image_diff_list: A list of (image, diff) tuples, with shape
[batch_size, image_size, image_size, 3] and image_sizes as
[32, 64, 128, 256].
params: Dict of parameters.
"""
self.images = [i for (i, _) in image_diff_list]
# Move the diff to the positive realm.
self.diffs = [(d + params['scale']) / 2 for (i, d) in image_diff_list]
self.params = params
def Build(self):
with tf.device('/gpu:0'):
with slim.arg_scope([slim.conv2d],
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
normalizer_params={'is_training':
self.params['is_training']}):
self._BuildMotionKernel()
encoded_images = self._BuildImageEncoder()
cross_conved_images = self._CrossConv(encoded_images)
self._BuildImageDecoder(cross_conved_images)
self._BuildLoss()
image = self.images[1]
diff = self.diffs[1]
self.global_step = tf.Variable(0, name='global_step', trainable=False)
if self.params['is_training']:
self._BuildTrainOp()
diff = diff * 2.0 - self.params['scale']
diff_output = self.diff_output * 2.0 - self.params['scale']
concat_image = tf.concat(
1, [image, image + diff_output, image + diff, diff_output])
tf.summary.image('origin_predict_expect_predictdiff', concat_image)
self.summary_op = tf.summary.merge_all()
return self.loss
def _BuildTrainOp(self):
lrn_rate = tf.maximum(
0.01, # min_lr_rate.
tf.train.exponential_decay(
self.params['learning_rate'], self.global_step, 10000, 0.5))
tf.summary.scalar('learning rate', lrn_rate)
optimizer = tf.train.GradientDescentOptimizer(lrn_rate)
self.train_op = slim.learning.create_train_op(
self.loss, optimizer, global_step=self.global_step)
def _BuildLoss(self):
    # 1. reconstr_loss doesn't seem to do better than l2 loss.
    # 2. Only works when using reduce_mean. reduce_sum doesn't work.
    # 3. It seems kl loss doesn't play an important role.
self.loss = 0
with tf.variable_scope('loss'):
if self.params['l2_loss']:
l2_loss = tf.reduce_mean(tf.square(self.diff_output - self.diffs[1]))
tf.summary.scalar('l2_loss', l2_loss)
self.loss += l2_loss
if self.params['reconstr_loss']:
reconstr_loss = (-tf.reduce_mean(
self.diffs[1] * (1e-10 + self.diff_output) +
(1-self.diffs[1]) * tf.log(1e-10 + 1 - self.diff_output)))
reconstr_loss = tf.check_numerics(reconstr_loss, 'reconstr_loss')
tf.summary.scalar('reconstr_loss', reconstr_loss)
self.loss += reconstr_loss
if self.params['kl_loss']:
kl_loss = (0.5 * tf.reduce_mean(
tf.square(self.z_mean) + tf.square(self.z_stddev) -
2 * self.z_stddev_log - 1))
tf.summary.scalar('kl_loss', kl_loss)
self.loss += kl_loss
tf.summary.scalar('loss', self.loss)
def _BuildMotionKernel(self):
image = self.images[-2]
diff = self.diffs[-2]
shape = image.get_shape().as_list()
assert shape[1] == shape[2] and shape[1] == 128
batch_size = shape[0]
net = tf.concat(3, [image, diff])
with tf.variable_scope('motion_encoder'):
with slim.arg_scope([slim.conv2d], padding='VALID'):
net = slim.conv2d(net, 96, [5, 5], stride=1)
net = slim.max_pool2d(net, [2, 2])
net = slim.conv2d(net, 96, [5, 5], stride=1)
net = slim.max_pool2d(net, [2, 2])
net = slim.conv2d(net, 128, [5, 5], stride=1)
net = slim.conv2d(net, 128, [5, 5], stride=1)
net = slim.max_pool2d(net, [2, 2])
net = slim.conv2d(net, 256, [4, 4], stride=1)
net = slim.conv2d(net, 256, [3, 3], stride=1)
z = tf.reshape(net, shape=[batch_size, -1])
self.z_mean, self.z_stddev_log = tf.split(
split_dim=1, num_split=2, value=z)
self.z_stddev = tf.exp(self.z_stddev_log)
epsilon = tf.random_normal(
self.z_mean.get_shape().as_list(), 0, 1, dtype=tf.float32)
kernel = self.z_mean + tf.multiply(self.z_stddev, epsilon)
width = int(math.sqrt(kernel.get_shape().as_list()[1] // 128))
kernel = tf.reshape(kernel, [batch_size, width, width, 128])
with tf.variable_scope('kernel_decoder'):
with slim.arg_scope([slim.conv2d], padding='SAME'):
kernel = slim.conv2d(kernel, 128, [5, 5], stride=1)
self.kernel = slim.conv2d(kernel, 128, [5, 5], stride=1)
sys.stderr.write('kernel shape: %s\n' % kernel.get_shape())
def _BuildImageEncoder(self):
feature_maps = []
for (i, image) in enumerate(self.images):
with tf.variable_scope('image_encoder_%d' % i):
with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='SAME'):
net = slim.conv2d(image, 64, [5, 5], stride=1)
net = slim.conv2d(net, 64, [5, 5], stride=1)
net = slim.max_pool2d(net, [5, 5])
net = slim.conv2d(net, 64, [5, 5], stride=1)
net = slim.conv2d(net, 32, [5, 5], stride=1)
net = slim.max_pool2d(net, [2, 2])
sys.stderr.write('image_conv shape: %s\n' % net.get_shape())
feature_maps.append(net)
return feature_maps
def _CrossConvHelper(self, encoded_image, kernel):
"""Cross Convolution.
    The encoded image and kernel are of the same shape, namely
    [batch_size, image_size, image_size, channels]. They are split into
    [image_size, image_size] image squares and [kernel_size, kernel_size]
    kernel squares, and each kernel square is used to convolve the
    corresponding image square.
    """
images = tf.expand_dims(encoded_image, 0)
kernels = tf.expand_dims(kernel, 3)
return tf.nn.depthwise_conv2d(images, kernels, [1, 1, 1, 1], 'SAME')
def _CrossConv(self, encoded_images):
"""Apply the motion kernel on the encoded_images."""
cross_conved_images = []
kernels = tf.split(split_dim=3, num_split=4, value=self.kernel)
for (i, encoded_image) in enumerate(encoded_images):
with tf.variable_scope('cross_conv_%d' % i):
kernel = kernels[i]
encoded_image = tf.unstack(encoded_image, axis=0)
kernel = tf.unstack(kernel, axis=0)
assert len(encoded_image) == len(kernel)
assert len(encoded_image) == self.params['batch_size']
conved_image = []
for j in xrange(len(encoded_image)):
conved_image.append(self._CrossConvHelper(
encoded_image[j], kernel[j]))
cross_conved_images.append(tf.concat(0, conved_image))
sys.stderr.write('cross_conved shape: %s\n' %
cross_conved_images[-1].get_shape())
return cross_conved_images
def _Deconv(self, net, out_filters, kernel_size, stride):
shape = net.get_shape().as_list()
in_filters = shape[3]
kernel_shape = [kernel_size, kernel_size, out_filters, in_filters]
weights = tf.get_variable(
name='weights',
shape=kernel_shape,
dtype=tf.float32,
initializer=tf.truncated_normal_initializer(stddev=0.01))
out_height = shape[1] * stride
out_width = shape[2] * stride
batch_size = shape[0]
output_shape = [batch_size, out_height, out_width, out_filters]
net = tf.nn.conv2d_transpose(net, weights, output_shape,
[1, stride, stride, 1], padding='SAME')
slim.batch_norm(net)
return net
def _BuildImageDecoder(self, cross_conved_images):
"""Decode the cross_conved feature maps into the predicted images."""
nets = []
for i, cross_conved_image in enumerate(cross_conved_images):
with tf.variable_scope('image_decoder_%d' % i):
stride = 64 / cross_conved_image.get_shape().as_list()[1]
# TODO(xpan): Alternative solution for upsampling?
nets.append(self._Deconv(
cross_conved_image, 64, kernel_size=3, stride=stride))
net = tf.concat(3, nets)
net = slim.conv2d(net, 128, [9, 9], padding='SAME', stride=1)
net = slim.conv2d(net, 128, [1, 1], padding='SAME', stride=1)
net = slim.conv2d(net, 3, [1, 1], padding='SAME', stride=1)
self.diff_output = net
sys.stderr.write('diff_output shape: %s\n' % self.diff_output.get_shape())
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Read image sequence."""
import tensorflow as tf
def SequenceToImageAndDiff(images):
"""Convert image sequence batch into image and diff batch.
Each image pair is converted to the first image and their diff.
Batch size will increase if sequence length is larger than 2.
Args:
images: Image sequence with shape
[batch_size, seq_len, image_size, image_size, channel]
Returns:
the list of (image, diff) tuples with shape
[batch_size2, image_size, image_size, channel]. image_sizes are
[32, 64, 128, 256].
"""
image_diff_list = []
image_seq = tf.unstack(images, axis=1)
for size in [32, 64, 128, 256]:
resized_images = [
tf.image.resize_images(i, [size, size]) for i in image_seq]
diffs = []
for i in xrange(0, len(resized_images)-1):
diffs.append(resized_images[i+1] - resized_images[i])
image_diff_list.append(
(tf.concat(0, resized_images[:-1]), tf.concat(0, diffs)))
return image_diff_list
def ReadInput(data_filepattern, shuffle, params):
"""Read the tf.SequenceExample tfrecord files.
Args:
data_filepattern: tf.SequenceExample tfrecord filepattern.
shuffle: Whether to shuffle the examples.
params: parameter dict.
Returns:
image sequence batch [batch_size, seq_len, image_size, image_size, channel].
"""
image_size = params['image_size']
filenames = tf.gfile.Glob(data_filepattern)
filename_queue = tf.train.string_input_producer(filenames, shuffle=shuffle)
reader = tf.TFRecordReader()
_, example = reader.read(filename_queue)
  feature_spec = {
      'moving_objs': tf.FixedLenSequenceFeature(
          shape=[image_size * image_size * 3], dtype=tf.float32)}
  _, features = tf.parse_single_sequence_example(
      example, sequence_features=feature_spec)
moving_objs = tf.reshape(
features['moving_objs'], [params['seq_len'], image_size, image_size, 3])
if shuffle:
examples = tf.train.shuffle_batch(
[moving_objs],
batch_size=params['batch_size'],
num_threads=64,
capacity=params['batch_size'] * 100,
min_after_dequeue=params['batch_size'] * 4)
else:
examples = tf.train.batch([moving_objs],
batch_size=params['batch_size'],
num_threads=16,
capacity=params['batch_size'])
examples /= params['norm_scale']
return examples