Commit 68a18b70 authored by Toby Boyd, committed by GitHub
Browse files

Merge pull request #1 from tensorflow/master

update to tensorflow/model master
parents bc70271a 2c4fea8d
...@@ -60,7 +60,7 @@ def maybe_download(directory, filename, url): ...@@ -60,7 +60,7 @@ def maybe_download(directory, filename, url):
print "Downloading %s to %s" % (url, filepath) print "Downloading %s to %s" % (url, filepath)
filepath, _ = urllib.request.urlretrieve(url, filepath) filepath, _ = urllib.request.urlretrieve(url, filepath)
statinfo = os.stat(filepath) statinfo = os.stat(filepath)
print "Succesfully downloaded", filename, statinfo.st_size, "bytes" print "Successfully downloaded", filename, statinfo.st_size, "bytes"
return filepath return filepath
......
...@@ -223,7 +223,7 @@ def list_join(a): ...@@ -223,7 +223,7 @@ def list_join(a):
def group_by_max(table, number): def group_by_max(table, number):
#computes the most frequently occuring entry in a column #computes the most frequently occurring entry in a column
answer = [] answer = []
for i in range(len(table)): for i in range(len(table)):
temp = [] temp = []
......
...@@ -121,21 +121,21 @@ class Graph(): ...@@ -121,21 +121,21 @@ class Graph():
if (self.utility.FLAGS.rnn_dropout > 0.0): if (self.utility.FLAGS.rnn_dropout > 0.0):
question_hidden = question_hidden * rnn_dropout_mask question_hidden = question_hidden * rnn_dropout_mask
hidden_vectors.append(tf.expand_dims(question_hidden, 0)) hidden_vectors.append(tf.expand_dims(question_hidden, 0))
hidden_vectors = tf.concat(0, hidden_vectors) hidden_vectors = tf.concat(axis=0, values=hidden_vectors)
return question_hidden, hidden_vectors return question_hidden, hidden_vectors
def history_recurrent_step(self, curr_hprev, hprev): def history_recurrent_step(self, curr_hprev, hprev):
#A single RNN step for controller or history RNN #A single RNN step for controller or history RNN
return tf.tanh( return tf.tanh(
tf.matmul( tf.matmul(
tf.concat(1, [hprev, curr_hprev]), self.params[ tf.concat(axis=1, values=[hprev, curr_hprev]), self.params[
"history_recurrent"])) + self.params["history_recurrent_bias"] "history_recurrent"])) + self.params["history_recurrent_bias"]
def question_number_softmax(self, hidden_vectors): def question_number_softmax(self, hidden_vectors):
#Attention on question to decide the question number to be passed to comparison ops #Attention on question to decide the question number to be passed to comparison ops
def compute_ans(op_embedding, comparison): def compute_ans(op_embedding, comparison):
op_embedding = tf.expand_dims(op_embedding, 0) op_embedding = tf.expand_dims(op_embedding, 0)
#dot product of operation embedding with hidden state to the left of the number occurence #dot product of operation embedding with hidden state to the left of the number occurrence
first = tf.transpose( first = tf.transpose(
tf.matmul(op_embedding, tf.matmul(op_embedding,
tf.transpose( tf.transpose(
...@@ -150,13 +150,13 @@ class Graph(): ...@@ -150,13 +150,13 @@ class Graph():
tf.expand_dims( tf.expand_dims(
tf.transpose(self.batch_ordinal_question_one), 2 tf.transpose(self.batch_ordinal_question_one), 2
), [1, 1, self.utility.FLAGS.embedding_dims]), 0)))) ), [1, 1, self.utility.FLAGS.embedding_dims]), 0))))
question_number_softmax = tf.nn.softmax(tf.concat(1, [first, second])) question_number_softmax = tf.nn.softmax(tf.concat(axis=1, values=[first, second]))
if (self.mode == "test"): if (self.mode == "test"):
cond = tf.equal(question_number_softmax, cond = tf.equal(question_number_softmax,
tf.reshape( tf.reshape(
tf.reduce_max(question_number_softmax, 1), tf.reduce_max(question_number_softmax, 1),
[self.batch_size, 1])) [self.batch_size, 1]))
question_number_softmax = tf.select( question_number_softmax = tf.where(
cond, cond,
tf.fill(tf.shape(question_number_softmax), 1.0), tf.fill(tf.shape(question_number_softmax), 1.0),
tf.fill(tf.shape(question_number_softmax), 0.0)) tf.fill(tf.shape(question_number_softmax), 0.0))
...@@ -164,7 +164,7 @@ class Graph(): ...@@ -164,7 +164,7 @@ class Graph():
self.data_type) self.data_type)
ans = tf.reshape( ans = tf.reshape(
tf.reduce_sum(question_number_softmax * tf.concat( tf.reduce_sum(question_number_softmax * tf.concat(
1, [self.batch_question_number, self.batch_question_number_one]), axis=1, values=[self.batch_question_number, self.batch_question_number_one]),
1), [self.batch_size, 1]) 1), [self.batch_size, 1])
return ans return ans
...@@ -225,7 +225,7 @@ class Graph(): ...@@ -225,7 +225,7 @@ class Graph():
column_controller_vector = nn_utils.apply_dropout( column_controller_vector = nn_utils.apply_dropout(
column_controller_vector, self.utility.FLAGS.dropout, self.mode) column_controller_vector, self.utility.FLAGS.dropout, self.mode)
self.full_column_hidden_vectors = tf.concat( self.full_column_hidden_vectors = tf.concat(
1, [self.column_hidden_vectors, self.word_column_hidden_vectors]) axis=1, values=[self.column_hidden_vectors, self.word_column_hidden_vectors])
self.full_column_hidden_vectors += self.summary_text_entry_embeddings self.full_column_hidden_vectors += self.summary_text_entry_embeddings
self.full_column_hidden_vectors = nn_utils.apply_dropout( self.full_column_hidden_vectors = nn_utils.apply_dropout(
self.full_column_hidden_vectors, self.utility.FLAGS.dropout, self.mode) self.full_column_hidden_vectors, self.utility.FLAGS.dropout, self.mode)
...@@ -258,7 +258,7 @@ class Graph(): ...@@ -258,7 +258,7 @@ class Graph():
temp_ans.append(curr_prob) temp_ans.append(curr_prob)
else: else:
temp_ans.append(tf.zeros_like(curr_prob)) temp_ans.append(tf.zeros_like(curr_prob))
temp_ans = tf.transpose(tf.concat(0, temp_ans)) temp_ans = tf.transpose(tf.concat(axis=0, values=temp_ans))
answer += temp_ans answer += temp_ans
return answer return answer
...@@ -266,7 +266,7 @@ class Graph(): ...@@ -266,7 +266,7 @@ class Graph():
#converts soft selection to hard selection. used at test time #converts soft selection to hard selection. used at test time
cond = tf.equal( cond = tf.equal(
softmax, tf.reshape(tf.reduce_max(softmax, 1), [self.batch_size, 1])) softmax, tf.reshape(tf.reduce_max(softmax, 1), [self.batch_size, 1]))
softmax = tf.select( softmax = tf.where(
cond, tf.fill(tf.shape(softmax), 1.0), tf.fill(tf.shape(softmax), 0.0)) cond, tf.fill(tf.shape(softmax), 1.0), tf.fill(tf.shape(softmax), 0.0))
softmax = tf.cast(softmax, self.data_type) softmax = tf.cast(softmax, self.data_type)
return softmax return softmax
...@@ -297,7 +297,7 @@ class Graph(): ...@@ -297,7 +297,7 @@ class Graph():
curr_prob = curr_prob * tf.expand_dims((1 - sum_prob), 2) curr_prob = curr_prob * tf.expand_dims((1 - sum_prob), 2)
curr_prob = curr_prob * tf.expand_dims( curr_prob = curr_prob * tf.expand_dims(
tf.cast((1 - sum_prob) > 0.0, self.data_type), 2) tf.cast((1 - sum_prob) > 0.0, self.data_type), 2)
answer = tf.select(select_mask, curr_prob, answer) answer = tf.where(select_mask, curr_prob, answer)
sum_prob += tf.reduce_sum(curr_prob, 2) sum_prob += tf.reduce_sum(curr_prob, 2)
return answer return answer
...@@ -335,11 +335,11 @@ class Graph(): ...@@ -335,11 +335,11 @@ class Graph():
1) #BS * max_elements 1) #BS * max_elements
select_min = tf.reduce_sum(init_min * select_full_column_softmax, select_min = tf.reduce_sum(init_min * select_full_column_softmax,
1) #BS * max_elements 1) #BS * max_elements
select_prev = tf.concat(1, [ select_prev = tf.concat(axis=1, values=[
tf.slice(select, [0, 1], [self.batch_size, self.max_elements - 1]), tf.slice(select, [0, 1], [self.batch_size, self.max_elements - 1]),
tf.cast(tf.zeros([self.batch_size, 1]), self.data_type) tf.cast(tf.zeros([self.batch_size, 1]), self.data_type)
]) ])
select_next = tf.concat(1, [ select_next = tf.concat(axis=1, values=[
tf.cast(tf.zeros([self.batch_size, 1]), self.data_type), tf.slice( tf.cast(tf.zeros([self.batch_size, 1]), self.data_type), tf.slice(
select, [0, 0], [self.batch_size, self.max_elements - 1]) select, [0, 0], [self.batch_size, self.max_elements - 1])
]) ])
...@@ -352,11 +352,11 @@ class Graph(): ...@@ -352,11 +352,11 @@ class Graph():
length_content = 1 length_content = 1
length_select = 13 length_select = 13
length_print = 1 length_print = 1
values = tf.concat(1, [count]) values = tf.concat(axis=1, values=[count])
softmax_content = tf.slice(softmax, [0, 0], softmax_content = tf.slice(softmax, [0, 0],
[self.batch_size, length_content]) [self.batch_size, length_content])
#compute scalar output #compute scalar output
output = tf.reduce_sum(tf.mul(softmax_content, values), 1) output = tf.reduce_sum(tf.multiply(softmax_content, values), 1)
#compute lookup answer #compute lookup answer
softmax_print = tf.slice(softmax, [0, length_content + length_select], softmax_print = tf.slice(softmax, [0, length_content + length_select],
[self.batch_size, length_print]) [self.batch_size, length_print])
...@@ -384,7 +384,7 @@ class Graph(): ...@@ -384,7 +384,7 @@ class Graph():
] ]
select = tf.reduce_sum( select = tf.reduce_sum(
tf.tile(tf.expand_dims(softmax_select, 2), [1, 1, self.max_elements]) * tf.tile(tf.expand_dims(softmax_select, 2), [1, 1, self.max_elements]) *
tf.concat(1, select_lists), 1) tf.concat(axis=1, values=select_lists), 1)
select = select * self.select_whole_mask select = select * self.select_whole_mask
return output, select return output, select
...@@ -396,11 +396,11 @@ class Graph(): ...@@ -396,11 +396,11 @@ class Graph():
self.batch_question_attention_mask) #batch_size * embedding_dims self.batch_question_attention_mask) #batch_size * embedding_dims
controller_vector = tf.nn.relu( controller_vector = tf.nn.relu(
tf.matmul(hprev, self.params["controller_prev"]) + tf.matmul( tf.matmul(hprev, self.params["controller_prev"]) + tf.matmul(
tf.concat(1, [question_embedding, attention_vector]), self.params[ tf.concat(axis=1, values=[question_embedding, attention_vector]), self.params[
"controller"])) "controller"]))
column_controller_vector = tf.nn.relu( column_controller_vector = tf.nn.relu(
tf.matmul(hprev, self.params["column_controller_prev"]) + tf.matmul( tf.matmul(hprev, self.params["column_controller_prev"]) + tf.matmul(
tf.concat(1, [question_embedding, attention_vector]), self.params[ tf.concat(axis=1, values=[question_embedding, attention_vector]), self.params[
"column_controller"])) "column_controller"]))
controller_vector = nn_utils.apply_dropout( controller_vector = nn_utils.apply_dropout(
controller_vector, self.utility.FLAGS.dropout, self.mode) controller_vector, self.utility.FLAGS.dropout, self.mode)
...@@ -413,7 +413,7 @@ class Graph(): ...@@ -413,7 +413,7 @@ class Graph():
tf.matmul(tf.transpose(self.params_unit), tf.transpose(softmax))) tf.matmul(tf.transpose(self.params_unit), tf.transpose(softmax)))
column_controller_vector = tf.nn.relu( column_controller_vector = tf.nn.relu(
tf.matmul( tf.matmul(
tf.concat(1, [ tf.concat(axis=1, values=[
column_controller_vector, weighted_op_representation column_controller_vector, weighted_op_representation
]), self.params["break_conditional"])) ]), self.params["break_conditional"]))
full_column_softmax = self.compute_column_softmax(column_controller_vector, full_column_softmax = self.compute_column_softmax(column_controller_vector,
...@@ -429,7 +429,7 @@ class Graph(): ...@@ -429,7 +429,7 @@ class Graph():
def compute_lookup_error(self, val): def compute_lookup_error(self, val):
#computes lookup error. #computes lookup error.
cond = tf.equal(self.batch_print_answer, val) cond = tf.equal(self.batch_print_answer, val)
inter = tf.select( inter = tf.where(
cond, self.init_print_error, cond, self.init_print_error,
tf.tile( tf.tile(
tf.reshape(tf.constant(1e10, self.data_type), [1, 1, 1]), [ tf.reshape(tf.constant(1e10, self.data_type), [1, 1, 1]), [
...@@ -450,12 +450,12 @@ class Graph(): ...@@ -450,12 +450,12 @@ class Graph():
def error_computation(self): def error_computation(self):
#computes the error of each example in a batch #computes the error of each example in a batch
math_error = 0.5 * tf.square(tf.sub(self.scalar_output, self.batch_answer)) math_error = 0.5 * tf.square(tf.subtract(self.scalar_output, self.batch_answer))
#scale math error #scale math error
math_error = math_error / self.rows math_error = math_error / self.rows
math_error = tf.minimum(math_error, self.utility.FLAGS.max_math_error * math_error = tf.minimum(math_error, self.utility.FLAGS.max_math_error *
tf.ones(tf.shape(math_error), self.data_type)) tf.ones(tf.shape(math_error), self.data_type))
self.init_print_error = tf.select( self.init_print_error = tf.where(
self.batch_gold_select, -1 * tf.log(self.batch_lookup_answer + 1e-300 + self.batch_gold_select, -1 * tf.log(self.batch_lookup_answer + 1e-300 +
self.invert_select_full_mask), -1 * self.invert_select_full_mask), -1 *
tf.log(1 - self.batch_lookup_answer)) * self.select_full_mask tf.log(1 - self.batch_lookup_answer)) * self.select_full_mask
...@@ -466,24 +466,24 @@ class Graph(): ...@@ -466,24 +466,24 @@ class Graph():
print_error += self.compute_lookup_error(val + 0.0) print_error += self.compute_lookup_error(val + 0.0)
print_error = print_error * self.utility.FLAGS.print_cost / self.num_entries print_error = print_error * self.utility.FLAGS.print_cost / self.num_entries
if (self.mode == "train"): if (self.mode == "train"):
error = tf.select( error = tf.where(
tf.logical_and( tf.logical_and(
tf.not_equal(self.batch_answer, 0.0), tf.not_equal(self.batch_answer, 0.0),
tf.not_equal( tf.not_equal(
tf.reduce_sum(tf.reduce_sum(self.batch_print_answer, 1), 1), tf.reduce_sum(tf.reduce_sum(self.batch_print_answer, 1), 1),
0.0)), 0.0)),
self.soft_min(math_error, print_error), self.soft_min(math_error, print_error),
tf.select( tf.where(
tf.not_equal(self.batch_answer, 0.0), math_error, print_error)) tf.not_equal(self.batch_answer, 0.0), math_error, print_error))
else: else:
error = tf.select( error = tf.where(
tf.logical_and( tf.logical_and(
tf.equal(self.scalar_output, 0.0), tf.equal(self.scalar_output, 0.0),
tf.equal( tf.equal(
tf.reduce_sum(tf.reduce_sum(self.batch_lookup_answer, 1), 1), tf.reduce_sum(tf.reduce_sum(self.batch_lookup_answer, 1), 1),
0.0)), 0.0)),
tf.ones_like(math_error), tf.ones_like(math_error),
tf.select( tf.where(
tf.equal(self.scalar_output, 0.0), print_error, math_error)) tf.equal(self.scalar_output, 0.0), print_error, math_error))
return error return error
...@@ -558,7 +558,7 @@ class Graph(): ...@@ -558,7 +558,7 @@ class Graph():
input_col = tf.reduce_sum( input_col = tf.reduce_sum(
tf.expand_dims(soft_column_softmax, 2) * tf.expand_dims(soft_column_softmax, 2) *
self.full_column_hidden_vectors, 1) self.full_column_hidden_vectors, 1)
history_input = tf.concat(1, [input_op, input_col]) history_input = tf.concat(axis=1, values=[input_op, input_col])
history_input = nn_utils.apply_dropout( history_input = nn_utils.apply_dropout(
history_input, self.utility.FLAGS.dropout, self.mode) history_input, self.utility.FLAGS.dropout, self.mode)
hprev = self.history_recurrent_step(history_input, hprev) hprev = self.history_recurrent_step(history_input, hprev)
...@@ -567,7 +567,7 @@ class Graph(): ...@@ -567,7 +567,7 @@ class Graph():
self.scalar_output = output self.scalar_output = output
error = self.error_computation() error = self.error_computation()
cond = tf.less(error, 0.0001, name="cond") cond = tf.less(error, 0.0001, name="cond")
correct_add = tf.select( correct_add = tf.where(
cond, tf.fill(tf.shape(cond), 1.0), tf.fill(tf.shape(cond), 0.0)) cond, tf.fill(tf.shape(cond), 1.0), tf.fill(tf.shape(cond), 0.0))
correct = tf.reduce_sum(correct_add) correct = tf.reduce_sum(correct_add)
error = error / batch_size error = error / batch_size
...@@ -579,11 +579,11 @@ class Graph(): ...@@ -579,11 +579,11 @@ class Graph():
#Sets mask variables and performs batch processing #Sets mask variables and performs batch processing
self.batch_gold_select = self.batch_print_answer > 0.0 self.batch_gold_select = self.batch_print_answer > 0.0
self.full_column_mask = tf.concat( self.full_column_mask = tf.concat(
1, [self.batch_number_column_mask, self.batch_word_column_mask]) axis=1, values=[self.batch_number_column_mask, self.batch_word_column_mask])
self.full_processed_column = tf.concat( self.full_processed_column = tf.concat(
1, axis=1,
[self.batch_processed_number_column, self.batch_processed_word_column]) values=[self.batch_processed_number_column, self.batch_processed_word_column])
self.full_processed_sorted_index_column = tf.concat(1, [ self.full_processed_sorted_index_column = tf.concat(axis=1, values=[
self.batch_processed_sorted_index_number_column, self.batch_processed_sorted_index_number_column,
self.batch_processed_sorted_index_word_column self.batch_processed_sorted_index_word_column
]) ])
...@@ -603,7 +603,7 @@ class Graph(): ...@@ -603,7 +603,7 @@ class Graph():
tf.equal(self.batch_word_column_entry_mask, tf.equal(self.batch_word_column_entry_mask,
self.utility.dummy_token_id)), self.data_type) self.utility.dummy_token_id)), self.data_type)
self.select_full_mask = tf.concat( self.select_full_mask = tf.concat(
1, [self.select_mask, self.select_word_mask]) axis=1, values=[self.select_mask, self.select_word_mask])
self.select_whole_mask = tf.maximum( self.select_whole_mask = tf.maximum(
tf.reshape( tf.reshape(
tf.slice(self.select_mask, [0, 0, 0], tf.slice(self.select_mask, [0, 0, 0],
...@@ -614,7 +614,7 @@ class Graph(): ...@@ -614,7 +614,7 @@ class Graph():
[self.batch_size, 1, self.max_elements]), [self.batch_size, 1, self.max_elements]),
[self.batch_size, self.max_elements])) [self.batch_size, self.max_elements]))
self.invert_select_full_mask = tf.cast( self.invert_select_full_mask = tf.cast(
tf.concat(1, [ tf.concat(axis=1, values=[
tf.equal(self.batch_number_column, self.utility.FLAGS.pad_int), tf.equal(self.batch_number_column, self.utility.FLAGS.pad_int),
tf.equal(self.batch_word_column_entry_mask, tf.equal(self.batch_word_column_entry_mask,
self.utility.dummy_token_id) self.utility.dummy_token_id)
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
...@@ -12,17 +12,11 @@ Authors: Xin Pan (Github: panyx0718), Anelia Angelova ...@@ -12,17 +12,11 @@ Authors: Xin Pan (Github: panyx0718), Anelia Angelova
<b>Results:</b> <b>Results:</b>
<left>
![Sample1](g3doc/cross_conv.png) ![Sample1](g3doc/cross_conv.png)
</left>
<left>
![Sample2](g3doc/cross_conv2.png) ![Sample2](g3doc/cross_conv2.png)
</left>
<left>
![Loss](g3doc/cross_conv3.png) ![Loss](g3doc/cross_conv3.png)
</left>
<b>Prerequisite:</b> <b>Prerequisite:</b>
...@@ -40,7 +34,7 @@ to tf.SequenceExample. ...@@ -40,7 +34,7 @@ to tf.SequenceExample.
<b>How to run:</b> <b>How to run:</b>
```shell ```shell
ls -R $ ls -R
.: .:
data next_frame_prediction WORKSPACE data next_frame_prediction WORKSPACE
...@@ -58,18 +52,18 @@ cross_conv2.png cross_conv3.png cross_conv.png ...@@ -58,18 +52,18 @@ cross_conv2.png cross_conv3.png cross_conv.png
# Build everything. # Build everything.
bazel build -c opt next_frame_prediction/... $ bazel build -c opt next_frame_prediction/...
# The following example runs the generated 2d objects. # The following example runs the generated 2d objects.
# For Sprites dataset, image_size should be 60, norm_scale should be 255.0. # For Sprites dataset, image_size should be 60, norm_scale should be 255.0.
# Batch size is normally 16~64, depending on your memory size. # Batch size is normally 16~64, depending on your memory size.
#
# Run training. # Run training.
bazel-bin/next_frame_prediction/cross_conv/train \ $ bazel-bin/next_frame_prediction/cross_conv/train \
--batch_size=1 \ --batch_size=1 \
--data_filepattern=data/tfrecords \ --data_filepattern=data/tfrecords \
--image_size=64 \ --image_size=64 \
--log_root=/tmp/predict --log_root=/tmp/predict
step: 1, loss: 24.428671 step: 1, loss: 24.428671
step: 2, loss: 19.211605 step: 2, loss: 19.211605
...@@ -81,11 +75,11 @@ step: 7, loss: 1.747665 ...@@ -81,11 +75,11 @@ step: 7, loss: 1.747665
step: 8, loss: 1.572436 step: 8, loss: 1.572436
step: 9, loss: 1.586816 step: 9, loss: 1.586816
step: 10, loss: 1.434191 step: 10, loss: 1.434191
#
# Run eval. # Run eval.
bazel-bin/next_frame_prediction/cross_conv/eval \ $ bazel-bin/next_frame_prediction/cross_conv/eval \
--batch_size=1 \ --batch_size=1 \
--data_filepattern=data/tfrecords_test \ --data_filepattern=data/tfrecords_test \
--image_size=64 \ --image_size=64 \
--log_root=/tmp/predict --log_root=/tmp/predict
``` ```
...@@ -65,7 +65,7 @@ class CrossConvModel(object): ...@@ -65,7 +65,7 @@ class CrossConvModel(object):
diff = diff * 2.0 - self.params['scale'] diff = diff * 2.0 - self.params['scale']
diff_output = self.diff_output * 2.0 - self.params['scale'] diff_output = self.diff_output * 2.0 - self.params['scale']
concat_image = tf.concat( concat_image = tf.concat(
1, [image, image + diff_output, image + diff, diff_output]) axis=1, values=[image, image + diff_output, image + diff, diff_output])
tf.summary.image('origin_predict_expect_predictdiff', concat_image) tf.summary.image('origin_predict_expect_predictdiff', concat_image)
self.summary_op = tf.summary.merge_all() self.summary_op = tf.summary.merge_all()
return self.loss return self.loss
...@@ -113,7 +113,7 @@ class CrossConvModel(object): ...@@ -113,7 +113,7 @@ class CrossConvModel(object):
assert shape[1] == shape[2] and shape[1] == 128 assert shape[1] == shape[2] and shape[1] == 128
batch_size = shape[0] batch_size = shape[0]
net = tf.concat(3, [image, diff]) net = tf.concat(axis=3, values=[image, diff])
with tf.variable_scope('motion_encoder'): with tf.variable_scope('motion_encoder'):
with slim.arg_scope([slim.conv2d], padding='VALID'): with slim.arg_scope([slim.conv2d], padding='VALID'):
net = slim.conv2d(net, 96, [5, 5], stride=1) net = slim.conv2d(net, 96, [5, 5], stride=1)
...@@ -128,7 +128,7 @@ class CrossConvModel(object): ...@@ -128,7 +128,7 @@ class CrossConvModel(object):
z = tf.reshape(net, shape=[batch_size, -1]) z = tf.reshape(net, shape=[batch_size, -1])
self.z_mean, self.z_stddev_log = tf.split( self.z_mean, self.z_stddev_log = tf.split(
split_dim=1, num_split=2, value=z) axis=1, num_or_size_splits=2, value=z)
self.z_stddev = tf.exp(self.z_stddev_log) self.z_stddev = tf.exp(self.z_stddev_log)
epsilon = tf.random_normal( epsilon = tf.random_normal(
...@@ -174,7 +174,7 @@ class CrossConvModel(object): ...@@ -174,7 +174,7 @@ class CrossConvModel(object):
def _CrossConv(self, encoded_images): def _CrossConv(self, encoded_images):
"""Apply the motion kernel on the encoded_images.""" """Apply the motion kernel on the encoded_images."""
cross_conved_images = [] cross_conved_images = []
kernels = tf.split(split_dim=3, num_split=4, value=self.kernel) kernels = tf.split(axis=3, num_or_size_splits=4, value=self.kernel)
for (i, encoded_image) in enumerate(encoded_images): for (i, encoded_image) in enumerate(encoded_images):
with tf.variable_scope('cross_conv_%d' % i): with tf.variable_scope('cross_conv_%d' % i):
kernel = kernels[i] kernel = kernels[i]
...@@ -187,7 +187,7 @@ class CrossConvModel(object): ...@@ -187,7 +187,7 @@ class CrossConvModel(object):
for j in xrange(len(encoded_image)): for j in xrange(len(encoded_image)):
conved_image.append(self._CrossConvHelper( conved_image.append(self._CrossConvHelper(
encoded_image[j], kernel[j])) encoded_image[j], kernel[j]))
cross_conved_images.append(tf.concat(0, conved_image)) cross_conved_images.append(tf.concat(axis=0, values=conved_image))
sys.stderr.write('cross_conved shape: %s\n' % sys.stderr.write('cross_conved shape: %s\n' %
cross_conved_images[-1].get_shape()) cross_conved_images[-1].get_shape())
return cross_conved_images return cross_conved_images
...@@ -224,7 +224,7 @@ class CrossConvModel(object): ...@@ -224,7 +224,7 @@ class CrossConvModel(object):
nets.append(self._Deconv( nets.append(self._Deconv(
cross_conved_image, 64, kernel_size=3, stride=stride)) cross_conved_image, 64, kernel_size=3, stride=stride))
net = tf.concat(3, nets) net = tf.concat(axis=3, values=nets)
net = slim.conv2d(net, 128, [9, 9], padding='SAME', stride=1) net = slim.conv2d(net, 128, [9, 9], padding='SAME', stride=1)
net = slim.conv2d(net, 128, [1, 1], padding='SAME', stride=1) net = slim.conv2d(net, 128, [1, 1], padding='SAME', stride=1)
net = slim.conv2d(net, 3, [1, 1], padding='SAME', stride=1) net = slim.conv2d(net, 3, [1, 1], padding='SAME', stride=1)
......
...@@ -42,7 +42,7 @@ def SequenceToImageAndDiff(images): ...@@ -42,7 +42,7 @@ def SequenceToImageAndDiff(images):
for i in xrange(0, len(resized_images)-1): for i in xrange(0, len(resized_images)-1):
diffs.append(resized_images[i+1] - resized_images[i]) diffs.append(resized_images[i+1] - resized_images[i])
image_diff_list.append( image_diff_list.append(
(tf.concat(0, resized_images[:-1]), tf.concat(0, diffs))) (tf.concat(axis=0, values=resized_images[:-1]), tf.concat(axis=0, values=diffs)))
return image_diff_list return image_diff_list
......
...@@ -332,7 +332,7 @@ def masked_conv_aff_coupling(input_, mask_in, dim, name, ...@@ -332,7 +332,7 @@ def masked_conv_aff_coupling(input_, mask_in, dim, name,
residual_blocks=residual_blocks, residual_blocks=residual_blocks,
bottleneck=bottleneck, skip=skip) bottleneck=bottleneck, skip=skip)
mask = tf.mod(mask_channel + mask, 2) mask = tf.mod(mask_channel + mask, 2)
res = tf.split(res, 2, 3) res = tf.split(axis=3, num_or_size_splits=2, value=res)
shift, log_rescaling = res[-2], res[-1] shift, log_rescaling = res[-2], res[-1]
scale = variable_on_cpu( scale = variable_on_cpu(
"rescaling_scale", [], "rescaling_scale", [],
...@@ -486,9 +486,9 @@ def conv_ch_aff_coupling(input_, dim, name, ...@@ -486,9 +486,9 @@ def conv_ch_aff_coupling(input_, dim, name,
scope.reuse_variables() scope.reuse_variables()
if change_bottom: if change_bottom:
input_, canvas = tf.split(input_, 2, 3) input_, canvas = tf.split(axis=3, num_or_size_splits=2, value=input_)
else: else:
canvas, input_ = tf.split(input_, 2, 3) canvas, input_ = tf.split(axis=3, num_or_size_splits=2, value=input_)
shape = input_.get_shape().as_list() shape = input_.get_shape().as_list()
batch_size = shape[0] batch_size = shape[0]
height = shape[1] height = shape[1]
...@@ -509,7 +509,7 @@ def conv_ch_aff_coupling(input_, dim, name, ...@@ -509,7 +509,7 @@ def conv_ch_aff_coupling(input_, dim, name,
train=train, weight_norm=weight_norm, train=train, weight_norm=weight_norm,
residual_blocks=residual_blocks, residual_blocks=residual_blocks,
bottleneck=bottleneck, skip=skip) bottleneck=bottleneck, skip=skip)
shift, log_rescaling = tf.split(res, 2, 3) shift, log_rescaling = tf.split(axis=3, num_or_size_splits=2, value=res)
scale = variable_on_cpu( scale = variable_on_cpu(
"scale", [], "scale", [],
tf.constant_initializer(1.)) tf.constant_initializer(1.))
...@@ -570,9 +570,9 @@ def conv_ch_add_coupling(input_, dim, name, ...@@ -570,9 +570,9 @@ def conv_ch_add_coupling(input_, dim, name,
scope.reuse_variables() scope.reuse_variables()
if change_bottom: if change_bottom:
input_, canvas = tf.split(input_, 2, 3) input_, canvas = tf.split(axis=3, num_or_size_splits=2, value=input_)
else: else:
canvas, input_ = tf.split(input_, 2, 3) canvas, input_ = tf.split(axis=3, num_or_size_splits=2, value=input_)
shape = input_.get_shape().as_list() shape = input_.get_shape().as_list()
channels = shape[3] channels = shape[3]
res = input_ res = input_
...@@ -736,8 +736,8 @@ def rec_masked_conv_coupling(input_, hps, scale_idx, n_scale, ...@@ -736,8 +736,8 @@ def rec_masked_conv_coupling(input_, hps, scale_idx, n_scale,
log_diff_1 = log_diff[:, :, :, :channels] log_diff_1 = log_diff[:, :, :, :channels]
log_diff_2 = log_diff[:, :, :, channels:] log_diff_2 = log_diff[:, :, :, channels:]
else: else:
res_1, res_2 = tf.split(res, 2, 3) res_1, res_2 = tf.split(axis=3, num_or_size_splits=2, value=res)
log_diff_1, log_diff_2 = tf.split(log_diff, 2, 3) log_diff_1, log_diff_2 = tf.split(axis=3, num_or_size_splits=2, value=log_diff)
res_1, inc_log_diff = rec_masked_conv_coupling( res_1, inc_log_diff = rec_masked_conv_coupling(
input_=res_1, hps=hps, scale_idx=scale_idx + 1, n_scale=n_scale, input_=res_1, hps=hps, scale_idx=scale_idx + 1, n_scale=n_scale,
use_batch_norm=use_batch_norm, weight_norm=weight_norm, use_batch_norm=use_batch_norm, weight_norm=weight_norm,
...@@ -798,8 +798,8 @@ def rec_masked_deconv_coupling(input_, hps, scale_idx, n_scale, ...@@ -798,8 +798,8 @@ def rec_masked_deconv_coupling(input_, hps, scale_idx, n_scale,
log_diff_1 = log_diff[:, :, :, :channels] log_diff_1 = log_diff[:, :, :, :channels]
log_diff_2 = log_diff[:, :, :, channels:] log_diff_2 = log_diff[:, :, :, channels:]
else: else:
res_1, res_2 = tf.split(res, 2, 3) res_1, res_2 = tf.split(axis=3, num_or_size_splits=2, value=res)
log_diff_1, log_diff_2 = tf.split(log_diff, 2, 3) log_diff_1, log_diff_2 = tf.split(axis=3, num_or_size_splits=2, value=log_diff)
res_1, log_diff_1 = rec_masked_deconv_coupling( res_1, log_diff_1 = rec_masked_deconv_coupling(
input_=res_1, hps=hps, input_=res_1, hps=hps,
scale_idx=scale_idx + 1, n_scale=n_scale, scale_idx=scale_idx + 1, n_scale=n_scale,
...@@ -1305,7 +1305,7 @@ class RealNVP(object): ...@@ -1305,7 +1305,7 @@ class RealNVP(object):
z_lost = z_complete z_lost = z_complete
for scale_idx in xrange(hps.n_scale - 1): for scale_idx in xrange(hps.n_scale - 1):
z_lost = squeeze_2x2_ordered(z_lost) z_lost = squeeze_2x2_ordered(z_lost)
z_lost, _ = tf.split(z_lost, 2, 3) z_lost, _ = tf.split(axis=3, num_or_size_splits=2, value=z_lost)
z_compressed = z_lost z_compressed = z_lost
z_noisy = z_lost z_noisy = z_lost
for _ in xrange(scale_idx + 1): for _ in xrange(scale_idx + 1):
......
...@@ -99,8 +99,8 @@ def conv_layer(input_, ...@@ -99,8 +99,8 @@ def conv_layer(input_,
filter_size[1] - input_.get_shape().as_list()[2], filter_size[1] - input_.get_shape().as_list()[2],
input_.get_shape().as_list()[3] input_.get_shape().as_list()[3]
]) ])
res = tf.concat(1, [pad_1, res]) res = tf.concat(axis=1, values=[pad_1, res])
res = tf.concat(2, [pad_2, res]) res = tf.concat(axis=2, values=[pad_2, res])
res = tf.nn.conv2d( res = tf.nn.conv2d(
input=res, input=res,
filter=weights, filter=weights,
...@@ -139,8 +139,8 @@ def depool_2x2(input_, stride=2): ...@@ -139,8 +139,8 @@ def depool_2x2(input_, stride=2):
channels = shape[3] channels = shape[3]
res = tf.reshape(input_, [batch_size, height, 1, width, 1, channels]) res = tf.reshape(input_, [batch_size, height, 1, width, 1, channels])
res = tf.concat( res = tf.concat(
2, [res, tf.zeros([batch_size, height, stride - 1, width, 1, channels])]) axis=2, values=[res, tf.zeros([batch_size, height, stride - 1, width, 1, channels])])
res = tf.concat(4, [ res = tf.concat(axis=4, values=[
res, tf.zeros([batch_size, height, stride, width, stride - 1, channels]) res, tf.zeros([batch_size, height, stride, width, stride - 1, channels])
]) ])
res = tf.reshape(res, [batch_size, stride * height, stride * width, channels]) res = tf.reshape(res, [batch_size, stride * height, stride * width, channels])
...@@ -158,11 +158,11 @@ def batch_random_flip(input_): ...@@ -158,11 +158,11 @@ def batch_random_flip(input_):
height = shape[1] height = shape[1]
width = shape[2] width = shape[2]
channels = shape[3] channels = shape[3]
res = tf.split(0, batch_size, input_) res = tf.split(axis=0, num_or_size_splits=batch_size, value=input_)
res = [elem[0, :, :, :] for elem in res] res = [elem[0, :, :, :] for elem in res]
res = [tf.image.random_flip_left_right(elem) for elem in res] res = [tf.image.random_flip_left_right(elem) for elem in res]
res = [tf.reshape(elem, [1, height, width, channels]) for elem in res] res = [tf.reshape(elem, [1, height, width, channels]) for elem in res]
res = tf.concat(0, res) res = tf.concat(axis=0, values=res)
return res return res
...@@ -175,7 +175,7 @@ def as_one_hot(input_, n_indices): ...@@ -175,7 +175,7 @@ def as_one_hot(input_, n_indices):
n_elem = numpy.prod(shape) n_elem = numpy.prod(shape)
indices = tf.range(n_elem) indices = tf.range(n_elem)
indices = tf.cast(indices, tf.int64) indices = tf.cast(indices, tf.int64)
indices_input = tf.concat(0, [indices, tf.reshape(input_, [-1])]) indices_input = tf.concat(axis=0, values=[indices, tf.reshape(input_, [-1])])
indices_input = tf.reshape(indices_input, [2, -1]) indices_input = tf.reshape(indices_input, [2, -1])
indices_input = tf.transpose(indices_input) indices_input = tf.transpose(indices_input)
res = tf.sparse_to_dense( res = tf.sparse_to_dense(
......
...@@ -23,7 +23,7 @@ https://arxiv.org/pdf/1605.07146v1.pdf ...@@ -23,7 +23,7 @@ https://arxiv.org/pdf/1605.07146v1.pdf
<b>Settings:</b> <b>Settings:</b>
* Random split 50k training set into 45k/5k train/eval split. * Random split 50k training set into 45k/5k train/eval split.
* Pad to 36x36 and random crop. Horizontal flip. Per-image whitenting. * Pad to 36x36 and random crop. Horizontal flip. Per-image whitening.
* Momentum optimizer 0.9. * Momentum optimizer 0.9.
* Learning rate schedule: 0.1 (40k), 0.01 (60k), 0.001 (>60k). * Learning rate schedule: 0.1 (40k), 0.01 (60k), 0.001 (>60k).
* L2 weight decay: 0.002. * L2 weight decay: 0.002.
...@@ -31,13 +31,9 @@ https://arxiv.org/pdf/1605.07146v1.pdf ...@@ -31,13 +31,9 @@ https://arxiv.org/pdf/1605.07146v1.pdf
<b>Results:</b> <b>Results:</b>
<left>
![Precisions](g3doc/cifar_resnet.gif) ![Precisions](g3doc/cifar_resnet.gif)
</left>
<left>
![Precisions Legends](g3doc/cifar_resnet_legends.gif)
</left>
![Precisions Legends](g3doc/cifar_resnet_legends.gif)
CIFAR-10 Model|Best Precision|Steps CIFAR-10 Model|Best Precision|Steps
--------------|--------------|------ --------------|--------------|------
...@@ -69,40 +65,40 @@ curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binar ...@@ -69,40 +65,40 @@ curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binar
<b>How to run:</b> <b>How to run:</b>
```shell ```shell
# cd to the your workspace. # cd to the models repository and run with bash. Expected command output shown.
# It contains an empty WORKSPACE file, resnet codes and cifar10 dataset. # The directory should contain an empty WORKSPACE file, the resnet code, and the cifar10 dataset.
# Note: User can split 5k from train set for eval set. # Note: The user can split 5k from train set for eval set.
ls -R $ ls -R
.: .:
cifar10 resnet WORKSPACE cifar10 resnet WORKSPACE
./cifar10: ./cifar10:
data_batch_1.bin data_batch_2.bin data_batch_3.bin data_batch_4.bin data_batch_1.bin data_batch_2.bin data_batch_3.bin data_batch_4.bin
data_batch_5.bin test_batch.bin data_batch_5.bin test_batch.bin
./resnet: ./resnet:
BUILD cifar_input.py g3doc README.md resnet_main.py resnet_model.py BUILD cifar_input.py g3doc README.md resnet_main.py resnet_model.py
# Build everything for GPU. # Build everything for GPU.
bazel build -c opt --config=cuda resnet/... $ bazel build -c opt --config=cuda resnet/...
# Train the model. # Train the model.
bazel-bin/resnet/resnet_main --train_data_path=cifar10/data_batch* \ $ bazel-bin/resnet/resnet_main --train_data_path=cifar10/data_batch* \
--log_root=/tmp/resnet_model \ --log_root=/tmp/resnet_model \
--train_dir=/tmp/resnet_model/train \ --train_dir=/tmp/resnet_model/train \
--dataset='cifar10' \ --dataset='cifar10' \
--num_gpus=1 --num_gpus=1
# While the model is training, you can also check on its progress using tensorboard: # While the model is training, you can also check on its progress using tensorboard:
tensorboard --logdir=/tmp/resnet_model $ tensorboard --logdir=/tmp/resnet_model
# Evaluate the model. # Evaluate the model.
# Avoid running on the same GPU as the training job at the same time, # Avoid running on the same GPU as the training job at the same time,
# otherwise, you might run out of memory. # otherwise, you might run out of memory.
bazel-bin/resnet/resnet_main --eval_data_path=cifar10/test_batch.bin \ $ bazel-bin/resnet/resnet_main --eval_data_path=cifar10/test_batch.bin \
--log_root=/tmp/resnet_model \ --log_root=/tmp/resnet_model \
--eval_dir=/tmp/resnet_model/test \ --eval_dir=/tmp/resnet_model/test \
--mode=eval \ --mode=eval \
--dataset='cifar10' \ --dataset='cifar10' \
--num_gpus=0 --num_gpus=0
``` ```
...@@ -85,7 +85,7 @@ class ResNet(object): ...@@ -85,7 +85,7 @@ class ResNet(object):
# comparably good performance. # comparably good performance.
# https://arxiv.org/pdf/1605.07146v1.pdf # https://arxiv.org/pdf/1605.07146v1.pdf
# filters = [16, 160, 320, 640] # filters = [16, 160, 320, 640]
# Update hps.num_residual_units to 9 # Update hps.num_residual_units to 4
with tf.variable_scope('unit_1_0'): with tf.variable_scope('unit_1_0'):
x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]), x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
...@@ -128,7 +128,7 @@ class ResNet(object): ...@@ -128,7 +128,7 @@ class ResNet(object):
def _build_train_op(self): def _build_train_op(self):
"""Build training specific ops for the graph.""" """Build training specific ops for the graph."""
self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32) self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
tf.summary.scalar('learning rate', self.lrn_rate) tf.summary.scalar('learning_rate', self.lrn_rate)
trainable_variables = tf.trainable_variables() trainable_variables = tf.trainable_variables()
grads = tf.gradients(self.cost, trainable_variables) grads = tf.gradients(self.cost, trainable_variables)
......
/bazel-bin
/bazel-ci_build-cache
/bazel-genfiles
/bazel-out
/bazel-skip_thoughts
/bazel-testlogs
/bazel-tf
*.pyc
# Skip-Thought Vectors
This is a TensorFlow implementation of the model described in:
Jamie Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel,
Antonio Torralba, Raquel Urtasun, Sanja Fidler.
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf).
*In NIPS, 2015.*
## Contact
***Code author:*** Chris Shallue
***Pull requests and issues:*** @cshallue
## Contents
* [Model Overview](#model-overview)
* [Getting Started](#getting-started)
* [Install Required Packages](#install-required-packages)
* [Download Pretrained Models (Optional)](#download-pretrained-models-optional)
* [Training a Model](#training-a-model)
* [Prepare the Training Data](#prepare-the-training-data)
* [Run the Training Script](#run-the-training-script)
* [Track Training Progress](#track-training-progress)
* [Expanding the Vocabulary](#expanding-the-vocabulary)
* [Overview](#overview)
* [Preparation](#preparation)
* [Run the Vocabulary Expansion Script](#run-the-vocabulary-expansion-script)
* [Evaluating a Model](#evaluating-a-model)
* [Overview](#overview-1)
* [Preparation](#preparation-1)
* [Run the Evaluation Tasks](#run-the-evaluation-tasks)
* [Encoding Sentences](#encoding-sentences)
## Model overview
The *Skip-Thoughts* model is a sentence encoder. It learns to encode input
sentences into a fixed-dimensional vector representation that is useful for many
tasks, for example to detect paraphrases or to classify whether a product review
is positive or negative. See the
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf)
paper for details of the model architecture and more example applications.
A trained *Skip-Thoughts* model will encode similar sentences near each other
in the embedding vector space. The following examples show the nearest neighbor by
cosine similarity of some sentences from the
[movie review dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/).
| Input sentence | Nearest Neighbor |
|----------------|------------------|
| Simplistic, silly and tedious. | Trite, banal, cliched, mostly inoffensive. |
| Not so much farcical as sour. | Not only unfunny, but downright repellent. |
| A sensitive and astute first feature by Anne-Sophie Birot. | Absorbing character study by André Turpin . |
| An enthralling, entertaining feature. | A slick, engrossing melodrama. |
## Getting Started
### Install Required Packages
First ensure that you have installed the following required packages:
* **Bazel** ([instructions](http://bazel.build/docs/install.html))
* **TensorFlow** ([instructions](https://www.tensorflow.org/install/))
* **NumPy** ([instructions](http://www.scipy.org/install.html))
* **scikit-learn** ([instructions](http://scikit-learn.org/stable/install.html))
* **Natural Language Toolkit (NLTK)**
* First install NLTK ([instructions](http://www.nltk.org/install.html))
* Then install the NLTK data ([instructions](http://www.nltk.org/data.html))
* **gensim** ([instructions](https://radimrehurek.com/gensim/install.html))
* Only required if you will be expanding your vocabulary with the [word2vec](https://code.google.com/archive/p/word2vec/) model.
### Download Pretrained Models (Optional)
You can download model checkpoints pretrained on the
[BookCorpus](http://yknzhu.wixsite.com/mbweb) dataset in the following
configurations:
* Unidirectional RNN encoder ("uni-skip" in the paper)
* Bidirectional RNN encoder ("bi-skip" in the paper)
```shell
# Directory to download the pretrained models to.
PRETRAINED_MODELS_DIR="${HOME}/skip_thoughts/pretrained/"
mkdir -p ${PRETRAINED_MODELS_DIR}
cd ${PRETRAINED_MODELS_DIR}
# Download and extract the unidirectional model.
wget "http://download.tensorflow.org/models/skip_thoughts_uni_2017_02_02.tar.gz"
tar -xvf skip_thoughts_uni_2017_02_02.tar.gz
rm skip_thoughts_uni_2017_02_02.tar.gz
# Download and extract the bidirectional model.
wget "http://download.tensorflow.org/models/skip_thoughts_bi_2017_02_16.tar.gz"
tar -xvf skip_thoughts_bi_2017_02_16.tar.gz
rm skip_thoughts_bi_2017_02_16.tar.gz
```
You can now skip to the sections [Evaluating a Model](#evaluating-a-model) and
[Encoding Sentences](#encoding-sentences).
## Training a Model
### Prepare the Training Data
To train a model you will need to provide training data in TFRecord format. The
TFRecord format consists of a set of sharded files containing serialized
`tf.Example` protocol buffers. Each `tf.Example` proto contains three
sentences:
* `encode`: The sentence to encode.
* `decode_pre`: The sentence preceding `encode` in the original text.
* `decode_post`: The sentence following `encode` in the original text.
Each sentence is a list of words. During preprocessing, a dictionary is created
that assigns each word in the vocabulary to an integer-valued id. Each sentence
is encoded as a list of integer word ids in the `tf.Example` protos.
We have provided a script to preprocess any set of text-files into this format.
You may wish to use the [BookCorpus](http://yknzhu.wixsite.com/mbweb) dataset.
Note that the preprocessing script may take **12 hours** or more to complete
on this large dataset.
```shell
# Comma-separated list of globs matching the input files. The format of
# the input files is assumed to be a list of newline-separated sentences, where
# each sentence is already tokenized.
INPUT_FILES="${HOME}/skip_thoughts/bookcorpus/*.txt"
# Location to save the preprocessed training and validation data.
DATA_DIR="${HOME}/skip_thoughts/data"
# Build the preprocessing script.
cd tensorflow-models/skip_thoughts
bazel build -c opt //skip_thoughts/data:preprocess_dataset
# Run the preprocessing script.
bazel-bin/skip_thoughts/data/preprocess_dataset \
--input_files=${INPUT_FILES} \
--output_dir=${DATA_DIR}
```
When the script finishes you will find 100 training files and 1 validation file
in `DATA_DIR`. The files will match the patterns `train-?????-of-00100` and
`validation-00000-of-00001` respectively.
The script will also produce a file named `vocab.txt`. The format of this file
is a list of newline-separated words where the word id is the corresponding
0-based line index. Words are sorted by descending order of frequency in the input
data. Only the top 20,000 words are assigned unique ids; all other words are
assigned the "unknown id" of 1 in the processed data.
### Run the Training Script
Execute the following commands to start the training script. By default it will
run for 500k steps (around 9 days on a GeForce GTX 1080 GPU).
```shell
# Directory containing the preprocessed data.
DATA_DIR="${HOME}/skip_thoughts/data"
# Directory to save the model.
MODEL_DIR="${HOME}/skip_thoughts/model"
# Build the model.
cd tensorflow-models/skip_thoughts
bazel build -c opt //skip_thoughts/...
# Run the training script.
bazel-bin/skip_thoughts/train \
--input_file_pattern="${DATA_DIR}/train-?????-of-00100" \
--train_dir="${MODEL_DIR}/train"
```
### Track Training Progress
Optionally, you can run the `track_perplexity` script in a separate process.
This will log per-word perplexity on the validation set which allows training
progress to be monitored on
[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
Note that you may run out of memory if you run this script on the same GPU
as the training script. You can set the environment variable
`CUDA_VISIBLE_DEVICES=""` to force the script to run on CPU. If it runs too
slowly on CPU, you can decrease the value of `--num_eval_examples`.
```shell
DATA_DIR="${HOME}/skip_thoughts/data"
MODEL_DIR="${HOME}/skip_thoughts/model"
# Ignore GPU devices (only necessary if your GPU is currently memory
# constrained, for example, by running the training script).
export CUDA_VISIBLE_DEVICES=""
# Run the evaluation script. This will run in a loop, periodically loading the
# latest model checkpoint file and computing evaluation metrics.
bazel-bin/skip_thoughts/track_perplexity \
--input_file_pattern="${DATA_DIR}/validation-?????-of-00001" \
--checkpoint_dir="${MODEL_DIR}/train" \
--eval_dir="${MODEL_DIR}/val" \
--num_eval_examples=50000
```
If you started the `track_perplexity` script, run a
[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard)
server in a separate process for real-time monitoring of training summaries and
validation perplexity.
```shell
MODEL_DIR="${HOME}/skip_thoughts/model"
# Run a TensorBoard server.
tensorboard --logdir="${MODEL_DIR}"
```
## Expanding the Vocabulary
### Overview
The vocabulary generated by the preprocessing script contains only 20,000 words
which is insufficient for many tasks. For example, a sentence from Wikipedia
might contain nouns that do not appear in this vocabulary.
A solution to this problem described in the
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf)
paper is to learn a mapping that transfers word representations from one model to
another. This idea is based on the "Translation Matrix" method from the paper
[Exploiting Similarities Among Languages for Machine Translation](https://arxiv.org/abs/1309.4168).
Specifically, we will load the word embeddings from a trained *Skip-Thoughts*
model and from a trained [word2vec model](https://arxiv.org/pdf/1301.3781.pdf)
(which has a much larger vocabulary). We will train a linear regression model
without regularization to learn a linear mapping from the word2vec embedding
space to the *Skip-Thoughts* embedding space. We will then apply the linear
model to all words in the word2vec vocabulary, yielding vectors in the
*Skip-Thoughts* word embedding space for the union of the two vocabularies.
The linear regression task is to learn a parameter matrix *W* to minimize
*|| X - Y \* W ||<sup>2</sup>*, where *X* is a matrix of *Skip-Thoughts*
embeddings of shape `[num_words, dim1]`, *Y* is a matrix of word2vec embeddings
of shape `[num_words, dim2]`, and *W* is a matrix of shape `[dim2, dim1]`.
### Preparation
First you will need to download and unpack a pretrained
[word2vec model](https://arxiv.org/pdf/1301.3781.pdf) from
[this website](https://code.google.com/archive/p/word2vec/)
([direct download link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)).
This model was trained on the Google News dataset (about 100 billion words).
Also ensure that you have already [installed gensim](https://radimrehurek.com/gensim/install.html).
### Run the Vocabulary Expansion Script
```shell
# Path to checkpoint file or a directory containing checkpoint files (the script
# will select the most recent).
CHECKPOINT_PATH="${HOME}/skip_thoughts/model/train"
# Vocabulary file generated by the preprocessing script.
SKIP_THOUGHTS_VOCAB="${HOME}/skip_thoughts/data/vocab.txt"
# Path to downloaded word2vec model.
WORD2VEC_MODEL="${HOME}/skip_thoughts/googlenews/GoogleNews-vectors-negative300.bin"
# Output directory.
EXP_VOCAB_DIR="${HOME}/skip_thoughts/exp_vocab"
# Build the vocabulary expansion script.
cd tensorflow-models/skip_thoughts
bazel build -c opt //skip_thoughts:vocabulary_expansion
# Run the vocabulary expansion script.
bazel-bin/skip_thoughts/vocabulary_expansion \
--skip_thoughts_model=${CHECKPOINT_PATH} \
--skip_thoughts_vocab=${SKIP_THOUGHTS_VOCAB} \
--word2vec_model=${WORD2VEC_MODEL} \
--output_dir=${EXP_VOCAB_DIR}
```
## Evaluating a Model
### Overview
The model can be evaluated using the benchmark tasks described in the
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf)
paper. The following tasks are supported (refer to the paper for full details):
* **SICK** semantic relatedness task.
* **MSRP** (Microsoft Research Paraphrase Corpus) paraphrase detection task.
* Binary classification tasks:
* **MR** movie review sentiment task.
* **CR** customer product review task.
* **SUBJ** subjectivity/objectivity task.
* **MPQA** opinion polarity task.
* **TREC** question-type classification task.
### Preparation
You will need to clone or download the
[skip-thoughts GitHub repository](https://github.com/ryankiros/skip-thoughts) by
[ryankiros](https://github.com/ryankiros) (the first author of the Skip-Thoughts
paper):
```shell
# Folder to clone the repository to.
ST_KIROS_DIR="${HOME}/skip_thoughts/skipthoughts_kiros"
# Clone the repository.
git clone git@github.com:ryankiros/skip-thoughts.git "${ST_KIROS_DIR}/skipthoughts"
# Make the package importable.
export PYTHONPATH="${ST_KIROS_DIR}/:${PYTHONPATH}"
```
You will also need to download the data needed for each evaluation task. See the
instructions [here](https://github.com/ryankiros/skip-thoughts).
For example, the CR (customer review) dataset is found [here](http://nlp.stanford.edu/~sidaw/home/projects:nbsvm). For this task we want the
files `custrev.pos` and `custrev.neg`.
### Run the Evaluation Tasks
In the following example we will evaluate a unidirectional model ("uni-skip" in
the paper) on the CR task. To use a bidirectional model ("bi-skip" in the
paper), simply pass the flags `--bi_vocab_file`, `--bi_embeddings_file` and
`--bi_checkpoint_path` instead. To use the "combine-skip" model described in the
paper you will need to pass both the unidirectional and bidirectional flags.
```shell
# Path to checkpoint file or a directory containing checkpoint files (the script
# will select the most recent).
CHECKPOINT_PATH="${HOME}/skip_thoughts/model/train"
# Vocabulary file generated by the vocabulary expansion script.
VOCAB_FILE="${HOME}/skip_thoughts/exp_vocab/vocab.txt"
# Embeddings file generated by the vocabulary expansion script.
EMBEDDINGS_FILE="${HOME}/skip_thoughts/exp_vocab/embeddings.npy"
# Directory containing files custrev.pos and custrev.neg.
EVAL_DATA_DIR="${HOME}/skip_thoughts/eval_data"
# Build the evaluation script.
cd tensorflow-models/skip_thoughts
bazel build -c opt //skip_thoughts:evaluate
# Run the evaluation script.
bazel-bin/skip_thoughts/evaluate \
--eval_task=CR \
--data_dir=${EVAL_DATA_DIR} \
--uni_vocab_file=${VOCAB_FILE} \
--uni_embeddings_file=${EMBEDDINGS_FILE} \
--uni_checkpoint_path=${CHECKPOINT_PATH}
```
Output:
```python
[0.82539682539682535, 0.84084880636604775, 0.83023872679045096,
0.86206896551724133, 0.83554376657824936, 0.85676392572944293,
0.84084880636604775, 0.83023872679045096, 0.85145888594164454,
0.82758620689655171]
```
The output is a list of accuracies of 10 cross-validation classification models.
To get a single number, simply take the average:
```python
ipython # Launch iPython.
In [0]:
import numpy as np
np.mean([0.82539682539682535, 0.84084880636604775, 0.83023872679045096,
0.86206896551724133, 0.83554376657824936, 0.85676392572944293,
0.84084880636604775, 0.83023872679045096, 0.85145888594164454,
0.82758620689655171])
Out [0]: 0.84009936423729525
```
## Encoding Sentences
In this example we will encode data from the
[movie review dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/)
(specifically the [sentence polarity dataset v1.0](https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz)).
```python
ipython # Launch iPython.
In [0]:
# Imports.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os.path
import scipy.spatial.distance as sd
from skip_thoughts import configuration
from skip_thoughts import encoder_manager
In [1]:
# Set paths to the model.
VOCAB_FILE = "/path/to/vocab.txt"
EMBEDDING_MATRIX_FILE = "/path/to/embeddings.npy"
CHECKPOINT_PATH = "/path/to/model.ckpt-9999"
# The following directory should contain files rt-polarity.neg and
# rt-polarity.pos.
MR_DATA_DIR = "/dir/containing/mr/data"
In [2]:
# Set up the encoder. Here we are using a single unidirectional model.
# To use a bidirectional model as well, call load_model() again with
# configuration.model_config(bidirectional_encoder=True) and paths to the
# bidirectional model's files. The encoder will use the concatenation of
# all loaded models.
encoder = encoder_manager.EncoderManager()
encoder.load_model(configuration.model_config(),
vocabulary_file=VOCAB_FILE,
embedding_matrix_file=EMBEDDING_MATRIX_FILE,
checkpoint_path=CHECKPOINT_PATH)
In [3]:
# Load the movie review dataset.
data = []
with open(os.path.join(MR_DATA_DIR, 'rt-polarity.neg'), 'rb') as f:
data.extend([line.decode('latin-1').strip() for line in f])
with open(os.path.join(MR_DATA_DIR, 'rt-polarity.pos'), 'rb') as f:
data.extend([line.decode('latin-1').strip() for line in f])
In [4]:
# Generate Skip-Thought Vectors for each sentence in the dataset.
encodings = encoder.encode(data)
In [5]:
# Define a helper function to generate nearest neighbors.
def get_nn(ind, num=10):
encoding = encodings[ind]
scores = sd.cdist([encoding], encodings, "cosine")[0]
sorted_ids = np.argsort(scores)
print("Sentence:")
print("", data[ind])
print("\nNearest neighbors:")
for i in range(1, num + 1):
print(" %d. %s (%.3f)" %
(i, data[sorted_ids[i]], scores[sorted_ids[i]]))
In [6]:
# Compute nearest neighbors of the first sentence in the dataset.
get_nn(0)
```
Output:
```
Sentence:
simplistic , silly and tedious .
Nearest neighbors:
1. trite , banal , cliched , mostly inoffensive . (0.247)
2. banal and predictable . (0.253)
3. witless , pointless , tasteless and idiotic . (0.272)
4. loud , silly , stupid and pointless . (0.295)
5. grating and tedious . (0.299)
6. idiotic and ugly . (0.330)
7. black-and-white and unrealistic . (0.335)
8. hopelessly inane , humorless and under-inspired . (0.335)
9. shallow , noisy and pretentious . (0.340)
10. . . . unlikable , uninteresting , unfunny , and completely , utterly inept . (0.346)
```
# Bazel build targets for the skip_thoughts package.

# Targets are visible only to the ":internal" package group unless they
# override default_visibility.
package(default_visibility = [":internal"])
licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
# Package group covering //skip_thoughts and every package beneath it.
package_group(
name = "internal",
packages = [
"//skip_thoughts/...",
],
)
# Library target wrapping configuration.py.
py_library(
name = "configuration",
srcs = ["configuration.py"],
srcs_version = "PY2AND3",
)
# Library target wrapping skip_thoughts_model.py; depends on the gru_cell and
# input_ops targets in //skip_thoughts/ops.
py_library(
name = "skip_thoughts_model",
srcs = ["skip_thoughts_model.py"],
srcs_version = "PY2AND3",
deps = [
"//skip_thoughts/ops:gru_cell",
"//skip_thoughts/ops:input_ops",
],
)
# Test target for skip_thoughts_model_test.py, declared with size "large".
# NOTE(review): unlike the other targets, no srcs_version is set here — confirm
# that is intentional.
py_test(
name = "skip_thoughts_model_test",
size = "large",
srcs = ["skip_thoughts_model_test.py"],
deps = [
":configuration",
":skip_thoughts_model",
],
)
# Binary target for train.py.
py_binary(
name = "train",
srcs = ["train.py"],
srcs_version = "PY2AND3",
deps = [
":configuration",
":skip_thoughts_model",
],
)
# Binary target for track_perplexity.py.
py_binary(
name = "track_perplexity",
srcs = ["track_perplexity.py"],
srcs_version = "PY2AND3",
deps = [
":configuration",
":skip_thoughts_model",
],
)
# Binary target for vocabulary_expansion.py.
# NOTE(review): no deps are declared — confirm the script imports none of the
# local libraries.
py_binary(
name = "vocabulary_expansion",
srcs = ["vocabulary_expansion.py"],
srcs_version = "PY2AND3",
)
# Library target wrapping skip_thoughts_encoder.py.
py_library(
name = "skip_thoughts_encoder",
srcs = ["skip_thoughts_encoder.py"],
srcs_version = "PY2AND3",
deps = [
":skip_thoughts_model",
"//skip_thoughts/data:special_words",
],
)
# Library target wrapping encoder_manager.py.
py_library(
name = "encoder_manager",
srcs = ["encoder_manager.py"],
srcs_version = "PY2AND3",
deps = [
":skip_thoughts_encoder",
],
)
# Binary target for evaluate.py.
# NOTE(review): the configuration dependency is written as the absolute label
# "//skip_thoughts:configuration" here, while other targets use ":configuration".
py_binary(
name = "evaluate",
srcs = ["evaluate.py"],
srcs_version = "PY2AND3",
deps = [
":encoder_manager",
"//skip_thoughts:configuration",
],
)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Default configuration for model architecture and training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class _HParams(object):
  """Empty attribute container for configuration parameters.

  Instances carry no predefined fields; the config factory functions in this
  module attach one attribute per parameter.
  """
def model_config(input_file_pattern=None,
                 input_queue_capacity=640000,
                 num_input_reader_threads=1,
                 shuffle_input_data=True,
                 uniform_init_scale=0.1,
                 vocab_size=20000,
                 batch_size=128,
                 word_embedding_dim=620,
                 bidirectional_encoder=False,
                 encoder_dim=2400):
  """Builds an object holding the model architecture parameters.

  Every argument is stored unchanged as an attribute of the same name on the
  returned object.

  Args:
    input_file_pattern: File pattern of sharded TFRecord files containing
      tf.Example protobufs.
    input_queue_capacity: Number of examples to keep in the input queue.
    num_input_reader_threads: Number of threads for prefetching input
      tf.Examples.
    shuffle_input_data: Whether to shuffle the input data.
    uniform_init_scale: Scale of random uniform initializer.
    vocab_size: Number of unique words in the vocab.
    batch_size: Batch size (training and evaluation only).
    word_embedding_dim: Word embedding dimension.
    bidirectional_encoder: Whether to use a bidirectional or unidirectional
      encoder RNN.
    encoder_dim: Number of output dimensions of the sentence encoder.

  Returns:
    An object containing model configuration parameters.
  """
  # (attribute name, value) pairs, listed in the order they are attached.
  settings = (
      ("input_file_pattern", input_file_pattern),
      ("input_queue_capacity", input_queue_capacity),
      ("num_input_reader_threads", num_input_reader_threads),
      ("shuffle_input_data", shuffle_input_data),
      ("uniform_init_scale", uniform_init_scale),
      ("vocab_size", vocab_size),
      ("batch_size", batch_size),
      ("word_embedding_dim", word_embedding_dim),
      ("bidirectional_encoder", bidirectional_encoder),
      ("encoder_dim", encoder_dim),
  )
  hparams = _HParams()
  for attr_name, attr_value in settings:
    setattr(hparams, attr_name, attr_value)
  return hparams
def training_config(learning_rate=0.0008,
                    learning_rate_decay_factor=0.5,
                    learning_rate_decay_steps=400000,
                    number_of_steps=500000,
                    clip_gradient_norm=5.0,
                    save_model_secs=600,
                    save_summaries_secs=600):
  """Builds an object holding the training parameters.

  Every argument is stored unchanged as an attribute of the same name on the
  returned object.

  Args:
    learning_rate: Initial learning rate.
    learning_rate_decay_factor: If > 0, the learning rate decay factor.
    learning_rate_decay_steps: The number of steps before the learning rate
      decays by learning_rate_decay_factor.
    number_of_steps: The total number of training steps to run. Passing None
      will cause the training script to run indefinitely.
    clip_gradient_norm: If not None, then clip gradients to this value.
    save_model_secs: How often (in seconds) to save model checkpoints.
    save_summaries_secs: How often (in seconds) to save model summaries.

  Returns:
    An object containing training configuration parameters.

  Raises:
    ValueError: If learning_rate_decay_factor is set and
      learning_rate_decay_steps is unset.
  """
  # A decay factor is meaningless without a decay interval; reject that
  # combination before building anything.
  if learning_rate_decay_factor:
    if not learning_rate_decay_steps:
      raise ValueError(
          "learning_rate_decay_factor requires learning_rate_decay_steps.")

  # (attribute name, value) pairs, listed in the order they are attached.
  settings = (
      ("learning_rate", learning_rate),
      ("learning_rate_decay_factor", learning_rate_decay_factor),
      ("learning_rate_decay_steps", learning_rate_decay_steps),
      ("number_of_steps", number_of_steps),
      ("clip_gradient_norm", clip_gradient_norm),
      ("save_model_secs", save_model_secs),
      ("save_summaries_secs", save_summaries_secs),
  )
  hparams = _HParams()
  for attr_name, attr_value in settings:
    setattr(hparams, attr_name, attr_value)
  return hparams
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment