Commit b7523ee5 authored by Ivan Bogatyy
parents 66723d7d 2c6d74b7
@@ -21,6 +1,7 @@ To propose a model for inclusion please submit a pull request.
- [next_frame_prediction](next_frame_prediction): probabilistic future frame synthesis via cross convolutional networks.
- [real_nvp](real_nvp): density estimation using real-valued non-volume preserving (real NVP) transformations.
- [resnet](resnet): deep and wide residual networks.
- [skip_thoughts](skip_thoughts): recurrent neural network sentence-to-vector encoder.
- [slim](slim): image classification models in TF-Slim.
- [street](street): identify the name of a street (in France) from an image using a Deep RNN.
- [swivel](swivel): the Swivel algorithm for generating word embeddings.
......
@@ -4,7 +4,7 @@ import sklearn.preprocessing as prep
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from autoencoder.autoencoder_models.DenoisingAutoencoder import AdditiveGaussianNoiseAutoencoder
from autoencoder_models.DenoisingAutoencoder import AdditiveGaussianNoiseAutoencoder
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)
@@ -45,7 +45,6 @@ for epoch in range(training_epochs):
# Display logs per epoch step
if epoch % display_step == 0:
print "Epoch:", '%04d' % (epoch + 1), \
"cost=", "{:.9f}".format(avg_cost)
print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
print "Total cost: " + str(autoencoder.calc_total_cost(X_test))
print("Total cost: " + str(autoencoder.calc_total_cost(X_test)))
@@ -4,7 +4,7 @@ import sklearn.preprocessing as prep
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from autoencoder.autoencoder_models.Autoencoder import Autoencoder
from autoencoder_models.Autoencoder import Autoencoder
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)
@@ -44,7 +44,6 @@ for epoch in range(training_epochs):
# Display logs per epoch step
if epoch % display_step == 0:
print "Epoch:", '%04d' % (epoch + 1), \
"cost=", "{:.9f}".format(avg_cost)
print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
print "Total cost: " + str(autoencoder.calc_total_cost(X_test))
print("Total cost: " + str(autoencoder.calc_total_cost(X_test)))
@@ -4,7 +4,7 @@ import sklearn.preprocessing as prep
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from autoencoder.autoencoder_models.DenoisingAutoencoder import MaskingNoiseAutoencoder
from autoencoder_models.DenoisingAutoencoder import MaskingNoiseAutoencoder
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)
@@ -43,7 +43,6 @@ for epoch in range(training_epochs):
avg_cost += cost / n_samples * batch_size
if epoch % display_step == 0:
print "Epoch:", '%04d' % (epoch + 1), \
"cost=", "{:.9f}".format(avg_cost)
print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
print "Total cost: " + str(autoencoder.calc_total_cost(X_test))
print("Total cost: " + str(autoencoder.calc_total_cost(X_test)))
import numpy as np
import tensorflow as tf
def xavier_init(fan_in, fan_out, constant = 1):
    # Xavier (Glorot) uniform initialization: sample weights from
    # U(-sqrt(6 / (fan_in + fan_out)), +sqrt(6 / (fan_in + fan_out))).
    low = -constant * np.sqrt(6.0 / (fan_in + fan_out))
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out),
                             minval = low, maxval = high,
                             dtype = tf.float32)
@@ -4,7 +4,7 @@ import sklearn.preprocessing as prep
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from autoencoder.autoencoder_models.VariationalAutoencoder import VariationalAutoencoder
from autoencoder_models.VariationalAutoencoder import VariationalAutoencoder
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)
@@ -47,7 +47,6 @@ for epoch in range(training_epochs):
# Display logs per epoch step
if epoch % display_step == 0:
print "Epoch:", '%04d' % (epoch + 1), \
"cost=", "{:.9f}".format(avg_cost)
print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
print "Total cost: " + str(autoencoder.calc_total_cost(X_test))
print("Total cost: " + str(autoencoder.calc_total_cost(X_test)))
import tensorflow as tf
import numpy as np
import autoencoder.Utils
class Autoencoder(object):
@@ -28,7 +26,8 @@ class Autoencoder(object):
def _initialize_weights(self):
all_weights = dict()
all_weights['w1'] = tf.Variable(autoencoder.Utils.xavier_init(self.n_input, self.n_hidden))
all_weights['w1'] = tf.get_variable("w1", shape=[self.n_input, self.n_hidden],
initializer=tf.contrib.layers.xavier_initializer())
all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32))
all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32))
@@ -46,7 +45,7 @@ class Autoencoder(object):
def generate(self, hidden = None):
if hidden is None:
hidden = np.random.normal(size=self.weights["b1"])
hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
return self.sess.run(self.reconstruction, feed_dict={self.hidden: hidden})
def reconstruct(self, X):
......
import tensorflow as tf
import numpy as np
import autoencoder.Utils
class AdditiveGaussianNoiseAutoencoder(object):
def __init__(self, n_input, n_hidden, transfer_function = tf.nn.softplus, optimizer = tf.train.AdamOptimizer(),
@@ -31,7 +28,8 @@ class AdditiveGaussianNoiseAutoencoder(object):
def _initialize_weights(self):
all_weights = dict()
all_weights['w1'] = tf.Variable(autoencoder.Utils.xavier_init(self.n_input, self.n_hidden))
all_weights['w1'] = tf.get_variable("w1", shape=[self.n_input, self.n_hidden],
initializer=tf.contrib.layers.xavier_initializer())
all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype = tf.float32))
all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype = tf.float32))
all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype = tf.float32))
@@ -53,9 +51,9 @@ class AdditiveGaussianNoiseAutoencoder(object):
self.scale: self.training_scale
})
def generate(self, hidden = None):
def generate(self, hidden=None):
if hidden is None:
hidden = np.random.normal(size = self.weights["b1"])
hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
return self.sess.run(self.reconstruction, feed_dict = {self.hidden: hidden})
def reconstruct(self, X):
@@ -98,7 +96,8 @@ class MaskingNoiseAutoencoder(object):
def _initialize_weights(self):
all_weights = dict()
all_weights['w1'] = tf.Variable(autoencoder.Utils.xavier_init(self.n_input, self.n_hidden))
all_weights['w1'] = tf.get_variable("w1", shape=[self.n_input, self.n_hidden],
initializer=tf.contrib.layers.xavier_initializer())
all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype = tf.float32))
all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype = tf.float32))
all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype = tf.float32))
@@ -115,9 +114,9 @@ class MaskingNoiseAutoencoder(object):
def transform(self, X):
return self.sess.run(self.hidden, feed_dict = {self.x: X, self.keep_prob: 1.0})
def generate(self, hidden = None):
def generate(self, hidden=None):
if hidden is None:
hidden = np.random.normal(size = self.weights["b1"])
hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
return self.sess.run(self.reconstruction, feed_dict = {self.hidden: hidden})
def reconstruct(self, X):
......
import tensorflow as tf
import numpy as np
import autoencoder.Utils
class VariationalAutoencoder(object):
@@ -36,8 +35,10 @@ class VariationalAutoencoder(object):
def _initialize_weights(self):
all_weights = dict()
all_weights['w1'] = tf.Variable(autoencoder.Utils.xavier_init(self.n_input, self.n_hidden))
all_weights['log_sigma_w1'] = tf.Variable(autoencoder.Utils.xavier_init(self.n_input, self.n_hidden))
all_weights['w1'] = tf.get_variable("w1", shape=[self.n_input, self.n_hidden],
initializer=tf.contrib.layers.xavier_initializer())
all_weights['log_sigma_w1'] = tf.get_variable("log_sigma_w1", shape=[self.n_input, self.n_hidden],
initializer=tf.contrib.layers.xavier_initializer())
all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
all_weights['log_sigma_b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32))
......
@@ -37,9 +37,7 @@ Full text available at: http://arxiv.org/abs/1609.06647
The *Show and Tell* model is a deep neural network that learns how to describe
the content of images. For example:
<center>
![Example captions](g3doc/example_captions.jpg)
</center>
### Architecture
@@ -66,9 +64,7 @@ learned during training.
The following diagram illustrates the model architecture.
<center>
![Show and Tell Architecture](g3doc/show_and_tell_architecture.png)
</center>
In this diagram, \{*s*<sub>0</sub>, *s*<sub>1</sub>, ..., *s*<sub>*N*-1</sub>\}
are the words of the caption and \{*w*<sub>*e*</sub>*s*<sub>0</sub>,
@@ -137,8 +133,7 @@ Each caption is a list of words. During preprocessing, a dictionary is created
that assigns each word in the vocabulary to an integer-valued id. Each caption
is encoded as a list of integer word ids in the `tf.SequenceExample` protos.
We have provided a script to download and preprocess the [MSCOCO]
(http://mscoco.org/) image captioning data set into this format. Downloading
We have provided a script to download and preprocess the [MSCOCO](http://mscoco.org/) image captioning data set into this format. Downloading
and preprocessing the data may take several hours depending on your network and
computer speed. Please be patient.
@@ -266,8 +261,7 @@ tensorboard --logdir="${MODEL_DIR}"
### Fine Tune the Inception v3 Model
Your model will already be able to generate reasonable captions after the first
phase of training. Try it out! (See [Generating Captions]
(#generating-captions)).
phase of training. Try it out! (See [Generating Captions](#generating-captions)).
You can further improve the performance of the model by running a
second training phase to jointly fine-tune the parameters of the *Inception v3*
@@ -337,6 +331,4 @@ expected.
Here is the image:
<center>
![Surfer](g3doc/COCO_val2014_000000224477.jpg)
</center>
@@ -261,7 +261,12 @@ def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
label = labels[i]
text = texts[i]
image_buffer, height, width = _process_image(filename, coder)
try:
image_buffer, height, width = _process_image(filename, coder)
except Exception as e:
print(e)
print('SKIPPED: Unexpected error while decoding %s.' % filename)
continue
example = _convert_to_example(filename, image_buffer, label,
text, height, width)
......
@@ -128,7 +128,7 @@ class ResNet(object):
def _build_train_op(self):
"""Build training specific ops for the graph."""
self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
tf.summary.scalar('learning rate', self.lrn_rate)
tf.summary.scalar('learning_rate', self.lrn_rate)
trainable_variables = tf.trainable_variables()
grads = tf.gradients(self.cost, trainable_variables)
......
/bazel-bin
/bazel-ci_build-cache
/bazel-genfiles
/bazel-out
/bazel-skip_thoughts
/bazel-testlogs
/bazel-tf
*.pyc
# Skip-Thought Vectors
This is a TensorFlow implementation of the model described in:
Jamie Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel,
Antonio Torralba, Raquel Urtasun, Sanja Fidler.
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf).
*In NIPS, 2015.*
## Contact
***Code author:*** Chris Shallue
***Pull requests and issues:*** @cshallue
## Contents
* [Model Overview](#model-overview)
* [Getting Started](#getting-started)
* [Install Required Packages](#install-required-packages)
* [Download Pretrained Models (Optional)](#download-pretrained-models-optional)
* [Training a Model](#training-a-model)
* [Prepare the Training Data](#prepare-the-training-data)
* [Run the Training Script](#run-the-training-script)
* [Track Training Progress](#track-training-progress)
* [Expanding the Vocabulary](#expanding-the-vocabulary)
* [Overview](#overview)
* [Preparation](#preparation)
* [Run the Vocabulary Expansion Script](#run-the-vocabulary-expansion-script)
* [Evaluating a Model](#evaluating-a-model)
* [Overview](#overview-1)
* [Preparation](#preparation-1)
* [Run the Evaluation Tasks](#run-the-evaluation-tasks)
* [Encoding Sentences](#encoding-sentences)
## Model overview
The *Skip-Thoughts* model is a sentence encoder. It learns to encode input
sentences into a fixed-dimensional vector representation that is useful for many
tasks, for example to detect paraphrases or to classify whether a product review
is positive or negative. See the
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf)
paper for details of the model architecture and more example applications.
A trained *Skip-Thoughts* model will encode similar sentences near each other
in the embedding vector space. The following examples show the nearest neighbor,
by cosine similarity, of some sentences from the
[movie review dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/).
| Input sentence | Nearest Neighbor |
|----------------|------------------|
| Simplistic, silly and tedious. | Trite, banal, cliched, mostly inoffensive. |
| Not so much farcical as sour. | Not only unfunny, but downright repellent. |
| A sensitive and astute first feature by Anne-Sophie Birot. | Absorbing character study by André Turpin . |
| An enthralling, entertaining feature. | A slick, engrossing melodrama. |
## Getting Started
### Install Required Packages
First ensure that you have installed the following required packages:
* **Bazel** ([instructions](http://bazel.build/docs/install.html))
* **TensorFlow** ([instructions](https://www.tensorflow.org/install/))
* **NumPy** ([instructions](http://www.scipy.org/install.html))
* **scikit-learn** ([instructions](http://scikit-learn.org/stable/install.html))
* **Natural Language Toolkit (NLTK)**
* First install NLTK ([instructions](http://www.nltk.org/install.html))
* Then install the NLTK data ([instructions](http://www.nltk.org/data.html))
* **gensim** ([instructions](https://radimrehurek.com/gensim/install.html))
* Only required if you will be expanding your vocabulary with the [word2vec](https://code.google.com/archive/p/word2vec/) model.
### Download Pretrained Models (Optional)
You can download model checkpoints pretrained on the
[BookCorpus](http://yknzhu.wixsite.com/mbweb) dataset in the following
configurations:
* Unidirectional RNN encoder ("uni-skip" in the paper)
* Bidirectional RNN encoder ("bi-skip" in the paper)
```shell
# Directory to download the pretrained models to.
PRETRAINED_MODELS_DIR="${HOME}/skip_thoughts/pretrained/"
mkdir -p ${PRETRAINED_MODELS_DIR}
cd ${PRETRAINED_MODELS_DIR}
# Download and extract the unidirectional model.
wget "http://download.tensorflow.org/models/skip_thoughts_uni_2017_02_02.tar.gz"
tar -xvf skip_thoughts_uni_2017_02_02.tar.gz
rm skip_thoughts_uni_2017_02_02.tar.gz
# Download and extract the bidirectional model.
wget "http://download.tensorflow.org/models/skip_thoughts_bi_2017_02_16.tar.gz"
tar -xvf skip_thoughts_bi_2017_02_16.tar.gz
rm skip_thoughts_bi_2017_02_16.tar.gz
```
You can now skip to the sections [Evaluating a Model](#evaluating-a-model) and
[Encoding Sentences](#encoding-sentences).
## Training a Model
### Prepare the Training Data
To train a model you will need to provide training data in TFRecord format. The
TFRecord format consists of a set of sharded files containing serialized
`tf.Example` protocol buffers. Each `tf.Example` proto contains three
sentences:
* `encode`: The sentence to encode.
* `decode_pre`: The sentence preceding `encode` in the original text.
* `decode_post`: The sentence following `encode` in the original text.
Each sentence is a list of words. During preprocessing, a dictionary is created
that assigns each word in the vocabulary to an integer-valued id. Each sentence
is encoded as a list of integer word ids in the `tf.Example` protos.
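To make the format concrete, here is a minimal sketch of one such serialized
`tf.Example`, built by hand. The `encode`, `decode_pre` and `decode_post`
feature names mirror the sentence roles listed above, but the exact keys and
preprocessing steps are those of the provided script, so treat this purely as
an illustration of the format.

```python
import tensorflow as tf

def _int64_feature(values):
  # Wrap a list of integer word ids in a tf.train.Feature.
  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

# Hypothetical word ids for three consecutive sentences.
example = tf.train.Example(features=tf.train.Features(feature={
    "encode": _int64_feature([12, 7, 53, 2]),       # the sentence to encode
    "decode_pre": _int64_feature([4, 19, 8]),       # the preceding sentence
    "decode_post": _int64_feature([6, 31, 10, 2]),  # the following sentence
}))

# The sharded TFRecord files contain serialized protos like this one.
serialized = example.SerializeToString()
```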
We have provided a script to preprocess any set of text files into this format.
You may wish to use the [BookCorpus](http://yknzhu.wixsite.com/mbweb) dataset.
Note that the preprocessing script may take **12 hours** or more to complete
on this large dataset.
```shell
# Comma-separated list of globs matching the input files. The format of
# the input files is assumed to be a list of newline-separated sentences, where
# each sentence is already tokenized.
INPUT_FILES="${HOME}/skip_thoughts/bookcorpus/*.txt"
# Location to save the preprocessed training and validation data.
DATA_DIR="${HOME}/skip_thoughts/data"
# Build the preprocessing script.
bazel build -c opt skip_thoughts/data/preprocess_dataset
# Run the preprocessing script.
bazel-bin/skip_thoughts/data/preprocess_dataset \
--input_files=${INPUT_FILES} \
--output_dir=${DATA_DIR}
```
When the script finishes you will find 100 training files and 1 validation file
in `DATA_DIR`. The files will match the patterns `train-?????-of-00100` and
`validation-00000-of-00001` respectively.
The script will also produce a file named `vocab.txt`. The format of this file
is a list of newline-separated words where the word id is the corresponding
0-based line index. Words are sorted in descending order of frequency in the
input data. Only the top 20,000 words are assigned unique ids; all other words
are assigned the "unknown id" of 1 in the processed data.
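As a rough illustration (not part of the provided tooling), the word-to-id
lookup implied by this file can be rebuilt in a few lines:

```python
# Build a word -> id mapping from vocab.txt; a word's id is its 0-based line index.
with open("vocab.txt") as f:
  vocab = {word.strip(): idx for idx, word in enumerate(f)}

unk_id = 1  # words outside the top 20,000 map to the "unknown id" of 1
ids = [vocab.get(w, unk_id) for w in "the movie was transcendent".split()]
```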
### Run the Training Script
Execute the following commands to start the training script. By default it will
run for 500k steps (around 9 days on a GeForce GTX 1080 GPU).
```shell
# Directory containing the preprocessed data.
DATA_DIR="${HOME}/skip_thoughts/data"
# Directory to save the model.
MODEL_DIR="${HOME}/skip_thoughts/model"
# Build the model.
bazel build -c opt skip_thoughts/...
# Run the training script.
bazel-bin/skip_thoughts/train \
--input_file_pattern="${DATA_DIR}/train-?????-of-00100" \
--train_dir="${MODEL_DIR}/train"
```
### Track Training Progress
Optionally, you can run the `track_perplexity` script in a separate process.
This will log per-word perplexity on the validation set which allows training
progress to be monitored on
[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
Note that you may run out of memory if you run this script on the same GPU
as the training script. You can set the environment variable
`CUDA_VISIBLE_DEVICES=""` to force the script to run on CPU. If it runs too
slowly on CPU, you can decrease the value of `--num_eval_examples`.
```shell
DATA_DIR="${HOME}/skip_thoughts/data"
MODEL_DIR="${HOME}/skip_thoughts/model"
# Ignore GPU devices (only necessary if your GPU is currently memory
# constrained, for example, by running the training script).
export CUDA_VISIBLE_DEVICES=""
# Run the evaluation script. This will run in a loop, periodically loading the
# latest model checkpoint file and computing evaluation metrics.
bazel-bin/skip_thoughts/track_perplexity \
--input_file_pattern="${DATA_DIR}/validation-?????-of-00001" \
--checkpoint_dir="${MODEL_DIR}/train" \
--eval_dir="${MODEL_DIR}/val" \
--num_eval_examples=50000
```
If you started the `track_perplexity` script, run a
[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard)
server in a separate process for real-time monitoring of training summaries and
validation perplexity.
```shell
MODEL_DIR="${HOME}/skip_thoughts/model"
# Run a TensorBoard server.
tensorboard --logdir="${MODEL_DIR}"
```
## Expanding the Vocabulary
### Overview
The vocabulary generated by the preprocessing script contains only 20,000 words,
which is insufficient for many tasks. For example, a sentence from Wikipedia
might contain nouns that do not appear in this vocabulary.
A solution to this problem, described in the
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf)
paper, is to learn a mapping that transfers word representations from one model to
another. This idea is based on the "Translation Matrix" method from the paper
[Exploiting Similarities Among Languages for Machine Translation](https://arxiv.org/abs/1309.4168).
Specifically, we will load the word embeddings from a trained *Skip-Thoughts*
model and from a trained [word2vec model](https://arxiv.org/pdf/1301.3781.pdf)
(which has a much larger vocabulary). We will train a linear regression model
without regularization to learn a linear mapping from the word2vec embedding
space to the *Skip-Thoughts* embedding space. We will then apply the linear
model to all words in the word2vec vocabulary, yielding vectors in the
*Skip-Thoughts* word embedding space for the union of the two vocabularies.
The linear regression task is to learn a parameter matrix *W* to minimize
*|| X - Y \* W ||<sup>2</sup>*, where *X* is a matrix of *Skip-Thoughts*
embeddings of shape `[num_words, dim1]`, *Y* is a matrix of word2vec embeddings
of shape `[num_words, dim2]`, and *W* is a matrix of shape `[dim2, dim1]`.
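The sketch below shows this regression with plain NumPy, assuming `X` and `Y`
are already row-aligned on the words common to both vocabularies; the sizes are
illustrative only, and the actual expansion is performed by the script in the
next section.

```python
import numpy as np

num_shared, dim1, dim2 = 10000, 620, 300  # illustrative sizes only

# Rows of X and Y correspond to the same words in the two vocabularies.
X = np.random.randn(num_shared, dim1).astype(np.float32)  # Skip-Thoughts word embeddings
Y = np.random.randn(num_shared, dim2).astype(np.float32)  # word2vec embeddings

# Ordinary least squares: find W of shape [dim2, dim1] minimizing ||X - Y W||^2.
W, _, _, _ = np.linalg.lstsq(Y, X, rcond=None)

# Applying W to any word2vec vector places it in the Skip-Thoughts embedding space.
expanded = np.dot(Y, W)
```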
### Preparation
First you will need to download and unpack a pretrained
[word2vec model](https://arxiv.org/pdf/1301.3781.pdf) from
[this website](https://code.google.com/archive/p/word2vec/)
([direct download link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)).
This model was trained on the Google News dataset (about 100 billion words).
Also ensure that you have already [installed gensim](https://radimrehurek.com/gensim/install.html).
### Run the Vocabulary Expansion Script
```shell
# Path to checkpoint file or a directory containing checkpoint files (the script
# will select the most recent).
CHECKPOINT_PATH="${HOME}/skip_thoughts/model/train"
# Vocabulary file generated by the preprocessing script.
SKIP_THOUGHTS_VOCAB="${HOME}/skip_thoughts/data/vocab.txt"
# Path to downloaded word2vec model.
WORD2VEC_MODEL="${HOME}/skip_thoughts/googlenews/GoogleNews-vectors-negative300.bin"
# Output directory.
EXP_VOCAB_DIR="${HOME}/skip_thoughts/exp_vocab"
# Build the vocabulary expansion script.
bazel build -c opt skip_thoughts/vocabulary_expansion
# Run the vocabulary expansion script.
bazel-bin/skip_thoughts/vocabulary_expansion \
--skip_thoughts_model=${CHECKPOINT_PATH} \
--skip_thoughts_vocab=${SKIP_THOUGHTS_VOCAB} \
--word2vec_model=${WORD2VEC_MODEL} \
--output_dir=${EXP_VOCAB_DIR}
```
## Evaluating a Model
### Overview
The model can be evaluated using the benchmark tasks described in the
[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf)
paper. The following tasks are supported (refer to the paper for full details):
* **SICK** semantic relatedness task.
* **MSRP** (Microsoft Research Paraphrase Corpus) paraphrase detection task.
* Binary classification tasks:
* **MR** movie review sentiment task.
* **CR** customer product review task.
* **SUBJ** subjectivity/objectivity task.
* **MPQA** opinion polarity task.
* **TREC** question-type classification task.
### Preparation
You will need to clone or download the
[skip-thoughts GitHub repository](https://github.com/ryankiros/skip-thoughts) by
[ryankiros](https://github.com/ryankiros) (the first author of the Skip-Thoughts
paper):
```shell
# Folder to clone the repository to.
ST_KIROS_DIR="${HOME}/skip_thoughts/skipthoughts_kiros"
# Clone the repository.
git clone git@github.com:ryankiros/skip-thoughts.git "${ST_KIROS_DIR}/skipthoughts"
# Make the package importable.
export PYTHONPATH="${ST_KIROS_DIR}/:${PYTHONPATH}"
```
You will also need to download the data needed for each evaluation task. See the
instructions [here](https://github.com/ryankiros/skip-thoughts).
For example, the CR (customer review) dataset is found [here](http://nlp.stanford.edu/~sidaw/home/projects:nbsvm). For this task we want the
files `custrev.pos` and `custrev.neg`.
### Run the Evaluation Tasks
In the following example we will evaluate a unidirectional model ("uni-skip" in
the paper) on the CR task. To use a bidirectional model ("bi-skip" in the
paper), simply pass the flags `--bi_vocab_file`, `--bi_embeddings_file` and
`--bi_checkpoint_path` instead. To use the "combine-skip" model described in the
paper you will need to pass both the unidirectional and bidirectional flags.
```shell
# Path to checkpoint file or a directory containing checkpoint files (the script
# will select the most recent).
CHECKPOINT_PATH="${HOME}/skip_thoughts/model/train"
# Vocabulary file generated by the vocabulary expansion script.
VOCAB_FILE="${HOME}/skip_thoughts/exp_vocab/vocab.txt"
# Embeddings file generated by the vocabulary expansion script.
EMBEDDINGS_FILE="${HOME}/skip_thoughts/exp_vocab/embeddings.npy"
# Directory containing files custrev.pos and custrev.neg.
EVAL_DATA_DIR="${HOME}/skip_thoughts/eval_data"
# Build the evaluation script.
bazel build -c opt skip_thoughts/evaluate
# Run the evaluation script.
bazel-bin/skip_thoughts/evaluate \
--eval_task=CR \
--data_dir=${EVAL_DATA_DIR} \
--uni_vocab_file=${VOCAB_FILE} \
--uni_embeddings_file=${EMBEDDINGS_FILE} \
--uni_checkpoint_path=${CHECKPOINT_PATH}
```
Output:
```python
[0.82539682539682535, 0.84084880636604775, 0.83023872679045096,
0.86206896551724133, 0.83554376657824936, 0.85676392572944293,
0.84084880636604775, 0.83023872679045096, 0.85145888594164454,
0.82758620689655171]
```
The output is a list of accuracies of 10 cross-validation classification models.
To get a single number, simply take the average:
```python
ipython # Launch iPython.
In [0]:
import numpy as np
np.mean([0.82539682539682535, 0.84084880636604775, 0.83023872679045096,
0.86206896551724133, 0.83554376657824936, 0.85676392572944293,
0.84084880636604775, 0.83023872679045096, 0.85145888594164454,
0.82758620689655171])
Out [0]: 0.84009936423729525
```
## Encoding Sentences
In this example we will encode data from the
[movie review dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/)
(specifically the [sentence polarity dataset v1.0](https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz)).
```python
ipython # Launch iPython.
In [0]:
# Imports.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os.path
import scipy.spatial.distance as sd
from skip_thoughts import configuration
from skip_thoughts import encoder_manager
In [1]:
# Set paths to the model.
VOCAB_FILE = "/path/to/vocab.txt"
EMBEDDING_MATRIX_FILE = "/path/to/embeddings.npy"
CHECKPOINT_PATH = "/path/to/model.ckpt-9999"
# The following directory should contain files rt-polarity.neg and
# rt-polarity.pos.
MR_DATA_DIR = "/dir/containing/mr/data"
In [2]:
# Set up the encoder. Here we are using a single unidirectional model.
# To use a bidirectional model as well, call load_model() again with
# configuration.model_config(bidirectional_encoder=True) and paths to the
# bidirectional model's files. The encoder will use the concatenation of
# all loaded models.
encoder = encoder_manager.EncoderManager()
encoder.load_model(configuration.model_config(),
vocabulary_file=VOCAB_FILE,
embedding_matrix_file=EMBEDDING_MATRIX_FILE,
checkpoint_path=CHECKPOINT_PATH)
In [3]:
# Load the movie review dataset.
data = []
with open(os.path.join(MR_DATA_DIR, 'rt-polarity.neg'), 'rb') as f:
  data.extend([line.decode('latin-1').strip() for line in f])
with open(os.path.join(MR_DATA_DIR, 'rt-polarity.pos'), 'rb') as f:
  data.extend([line.decode('latin-1').strip() for line in f])
In [4]:
# Generate Skip-Thought Vectors for each sentence in the dataset.
encodings = encoder.encode(data)
In [5]:
# Define a helper function to generate nearest neighbors.
def get_nn(ind, num=10):
  encoding = encodings[ind]
  scores = sd.cdist([encoding], encodings, "cosine")[0]
  sorted_ids = np.argsort(scores)
  print("Sentence:")
  print("", data[ind])
  print("\nNearest neighbors:")
  for i in range(1, num + 1):
    print(" %d. %s (%.3f)" %
          (i, data[sorted_ids[i]], scores[sorted_ids[i]]))
In [6]:
# Compute nearest neighbors of the first sentence in the dataset.
get_nn(0)
```
Output:
```
Sentence:
simplistic , silly and tedious .
Nearest neighbors:
1. trite , banal , cliched , mostly inoffensive . (0.247)
2. banal and predictable . (0.253)
3. witless , pointless , tasteless and idiotic . (0.272)
4. loud , silly , stupid and pointless . (0.295)
5. grating and tedious . (0.299)
6. idiotic and ugly . (0.330)
7. black-and-white and unrealistic . (0.335)
8. hopelessly inane , humorless and under-inspired . (0.335)
9. shallow , noisy and pretentious . (0.340)
10. . . . unlikable , uninteresting , unfunny , and completely , utterly inept . (0.346)
```
package(default_visibility = [":internal"])
licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
package_group(
name = "internal",
packages = [
"//skip_thoughts/...",
],
)
py_library(
name = "configuration",
srcs = ["configuration.py"],
srcs_version = "PY2AND3",
)
py_library(
name = "skip_thoughts_model",
srcs = ["skip_thoughts_model.py"],
srcs_version = "PY2AND3",
deps = [
"//skip_thoughts/ops:gru_cell",
"//skip_thoughts/ops:input_ops",
],
)
py_test(
name = "skip_thoughts_model_test",
size = "large",
srcs = ["skip_thoughts_model_test.py"],
deps = [
":configuration",
":skip_thoughts_model",
],
)
py_binary(
name = "train",
srcs = ["train.py"],
srcs_version = "PY2AND3",
deps = [
":configuration",
":skip_thoughts_model",
],
)
py_binary(
name = "track_perplexity",
srcs = ["track_perplexity.py"],
srcs_version = "PY2AND3",
deps = [
":configuration",
":skip_thoughts_model",
],
)
py_binary(
name = "vocabulary_expansion",
srcs = ["vocabulary_expansion.py"],
srcs_version = "PY2AND3",
)
py_library(
name = "skip_thoughts_encoder",
srcs = ["skip_thoughts_encoder.py"],
srcs_version = "PY2AND3",
deps = [
":skip_thoughts_model",
"//skip_thoughts/data:special_words",
],
)
py_library(
name = "encoder_manager",
srcs = ["encoder_manager.py"],
srcs_version = "PY2AND3",
deps = [
":skip_thoughts_encoder",
],
)
py_binary(
name = "evaluate",
srcs = ["evaluate.py"],
srcs_version = "PY2AND3",
deps = [
":encoder_manager",
"//skip_thoughts:configuration",
],
)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Default configuration for model architecture and training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class _HParams(object):
  """Wrapper for configuration parameters."""
  pass


def model_config(input_file_pattern=None,
                 input_queue_capacity=640000,
                 num_input_reader_threads=1,
                 shuffle_input_data=True,
                 uniform_init_scale=0.1,
                 vocab_size=20000,
                 batch_size=128,
                 word_embedding_dim=620,
                 bidirectional_encoder=False,
                 encoder_dim=2400):
  """Creates a model configuration object.

  Args:
    input_file_pattern: File pattern of sharded TFRecord files containing
      tf.Example protobufs.
    input_queue_capacity: Number of examples to keep in the input queue.
    num_input_reader_threads: Number of threads for prefetching input
      tf.Examples.
    shuffle_input_data: Whether to shuffle the input data.
    uniform_init_scale: Scale of random uniform initializer.
    vocab_size: Number of unique words in the vocab.
    batch_size: Batch size (training and evaluation only).
    word_embedding_dim: Word embedding dimension.
    bidirectional_encoder: Whether to use a bidirectional or unidirectional
      encoder RNN.
    encoder_dim: Number of output dimensions of the sentence encoder.

  Returns:
    An object containing model configuration parameters.
  """
  config = _HParams()
  config.input_file_pattern = input_file_pattern
  config.input_queue_capacity = input_queue_capacity
  config.num_input_reader_threads = num_input_reader_threads
  config.shuffle_input_data = shuffle_input_data
  config.uniform_init_scale = uniform_init_scale
  config.vocab_size = vocab_size
  config.batch_size = batch_size
  config.word_embedding_dim = word_embedding_dim
  config.bidirectional_encoder = bidirectional_encoder
  config.encoder_dim = encoder_dim
  return config


def training_config(learning_rate=0.0008,
                    learning_rate_decay_factor=0.5,
                    learning_rate_decay_steps=400000,
                    number_of_steps=500000,
                    clip_gradient_norm=5.0,
                    save_model_secs=600,
                    save_summaries_secs=600):
  """Creates a training configuration object.

  Args:
    learning_rate: Initial learning rate.
    learning_rate_decay_factor: If > 0, the learning rate decay factor.
    learning_rate_decay_steps: The number of steps before the learning rate
      decays by learning_rate_decay_factor.
    number_of_steps: The total number of training steps to run. Passing None
      will cause the training script to run indefinitely.
    clip_gradient_norm: If not None, then clip gradients to this value.
    save_model_secs: How often (in seconds) to save model checkpoints.
    save_summaries_secs: How often (in seconds) to save model summaries.

  Returns:
    An object containing training configuration parameters.

  Raises:
    ValueError: If learning_rate_decay_factor is set and
      learning_rate_decay_steps is unset.
  """
  if learning_rate_decay_factor and not learning_rate_decay_steps:
    raise ValueError(
        "learning_rate_decay_factor requires learning_rate_decay_steps.")

  config = _HParams()
  config.learning_rate = learning_rate
  config.learning_rate_decay_factor = learning_rate_decay_factor
  config.learning_rate_decay_steps = learning_rate_decay_steps
  config.number_of_steps = number_of_steps
  config.clip_gradient_norm = clip_gradient_norm
  config.save_model_secs = save_model_secs
  config.save_summaries_secs = save_summaries_secs
  return config
package(default_visibility = ["//skip_thoughts:internal"])
licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
py_library(
name = "special_words",
srcs = ["special_words.py"],
srcs_version = "PY2AND3",
deps = [],
)
py_binary(
name = "preprocess_dataset",
srcs = [
"preprocess_dataset.py",
],
srcs_version = "PY2AND3",
deps = [
":special_words",
],
)