Commit 68a18b70 authored by Toby Boyd's avatar Toby Boyd Committed by GitHub
Browse files

Merge pull request #1 from tensorflow/master

update to tensorflow/model master
parents bc70271a 2c4fea8d
......@@ -77,7 +77,7 @@ def _eval_once(saver, summary_writer, top_1_op, top_5_op, summary_op):
# /my-favorite-path/imagenet_train/model.ckpt-0,
# extract global_step from it.
global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
print('Succesfully loaded model from %s at step=%s.' %
print('Successfully loaded model from %s at step=%s.' %
(ckpt.model_checkpoint_path, global_step))
else:
print('No checkpoint file found')
......@@ -158,10 +158,10 @@ def evaluate(dataset):
saver = tf.train.Saver(variables_to_restore)
# Build the summary operation based on the TF collection of Summaries.
summary_op = tf.merge_all_summaries()
summary_op = tf.summary.merge_all()
graph_def = tf.get_default_graph().as_graph_def()
summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir,
summary_writer = tf.summary.FileWriter(FLAGS.eval_dir,
graph_def=graph_def)
while True:
......
......@@ -115,7 +115,7 @@ def loss(logits, labels, batch_size=None):
# shape [FLAGS.batch_size, num_classes].
sparse_labels = tf.reshape(labels, [batch_size, 1])
indices = tf.reshape(tf.range(batch_size), [batch_size, 1])
concated = tf.concat(1, [indices, sparse_labels])
concated = tf.concat(axis=1, values=[indices, sparse_labels])
num_classes = logits[0].get_shape()[-1].value
dense_labels = tf.sparse_to_dense(concated,
[batch_size, num_classes],
......@@ -147,8 +147,8 @@ def _activation_summary(x):
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
tf.contrib.deprecated.histogram_summary(tensor_name + '/activations', x)
tf.contrib.deprecated.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
tf.summary.histogram(tensor_name + '/activations', x)
tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
def _activation_summaries(endpoints):
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A library to train Inception using multiple GPU's with synchronous updates.
"""A library to train Inception using multiple GPUs with synchronous updates.
"""
from __future__ import absolute_import
from __future__ import division
......@@ -83,7 +83,7 @@ def _tower_loss(images, labels, num_classes, scope, reuse_variables=None):
"""Calculate the total loss on a single tower running the ImageNet model.
We perform 'batch splitting'. This means that we cut up a batch across
multiple GPU's. For instance, if the batch size = 32 and num_gpus = 2,
multiple GPUs. For instance, if the batch size = 32 and num_gpus = 2,
then each tower will operate on an batch of 16 images.
Args:
......@@ -132,8 +132,8 @@ def _tower_loss(images, labels, num_classes, scope, reuse_variables=None):
loss_name = re.sub('%s_[0-9]*/' % inception.TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name +' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
tf.summary.scalar(loss_name +' (raw)', l)
tf.summary.scalar(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
......@@ -166,7 +166,7 @@ def _average_gradients(tower_grads):
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.concat(axis=0, values=grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
......@@ -223,8 +223,8 @@ def train(dataset):
num_classes = dataset.num_classes() + 1
# Split the batch of images and labels for towers.
images_splits = tf.split(0, FLAGS.num_gpus, images)
labels_splits = tf.split(0, FLAGS.num_gpus, labels)
images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)
# Calculate the gradients for each model tower.
tower_grads = []
......@@ -268,20 +268,20 @@ def train(dataset):
summaries.extend(input_summaries)
# Add a summary to track the learning rate.
summaries.append(tf.scalar_summary('learning_rate', lr))
summaries.append(tf.summary.scalar('learning_rate', lr))
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
summaries.append(
tf.histogram_summary(var.op.name + '/gradients', grad))
tf.summary.histogram(var.op.name + '/gradients', grad))
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Add histograms for trainable variables.
for var in tf.trainable_variables():
summaries.append(tf.histogram_summary(var.op.name, var))
summaries.append(tf.summary.histogram(var.op.name, var))
# Track the moving averages of all trainable variables.
# Note that we maintain a "double-average" of the BatchNormalization
......@@ -290,7 +290,7 @@ def train(dataset):
variable_averages = tf.train.ExponentialMovingAverage(
inception.MOVING_AVERAGE_DECAY, global_step)
# Another possiblility is to use tf.slim.get_variables().
# Another possibility is to use tf.slim.get_variables().
variables_to_average = (tf.trainable_variables() +
tf.moving_average_variables())
variables_averages_op = variable_averages.apply(variables_to_average)
......@@ -301,10 +301,10 @@ def train(dataset):
batchnorm_updates_op)
# Create a saver.
saver = tf.train.Saver(tf.all_variables())
saver = tf.train.Saver(tf.global_variables())
# Build the summary operation from the last tower summaries.
summary_op = tf.merge_summary(summaries)
summary_op = tf.summary.merge(summaries)
# Build an initialization operation to run below.
init = tf.global_variables_initializer()
......@@ -329,9 +329,9 @@ def train(dataset):
# Start the queue runners.
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.train.SummaryWriter(
summary_writer = tf.summary.FileWriter(
FLAGS.train_dir,
graph_def=sess.graph.as_graph_def(add_shapes=True))
graph=sess.graph)
for step in range(FLAGS.max_steps):
start_time = time.time()
......
......@@ -319,7 +319,7 @@ their use, consider the following example.
def MyNewOp(inputs):
varA = ...
varB = ...
outputs = tf.mul(varA, inputs) + varB
outputs = tf.multiply(varA, inputs) + varB
return outputs
```
......@@ -445,15 +445,15 @@ defined with just the following snippet:
```python
with arg_scope([slim.ops.conv2d, slim.ops.fc], stddev=0.01, weight_decay=0.0005):
net = slim.ops.repeat_op(1, inputs, slim.ops.conv2d, 64, [3, 3], scope='conv1')
net = slim.ops.repeat_op(2, inputs, slim.ops.conv2d, 64, [3, 3], scope='conv1')
net = slim.ops.max_pool(net, [2, 2], scope='pool1')
net = slim.ops.repeat_op(1, net, slim.ops.conv2d, 128, [3, 3], scope='conv2')
net = slim.ops.repeat_op(2, net, slim.ops.conv2d, 128, [3, 3], scope='conv2')
net = slim.ops.max_pool(net, [2, 2], scope='pool2')
net = slim.ops.repeat_op(2, net, slim.ops.conv2d, 256, [3, 3], scope='conv3')
net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 256, [3, 3], scope='conv3')
net = slim.ops.max_pool(net, [2, 2], scope='pool3')
net = slim.ops.repeat_op(2, net, slim.ops.conv2d, 512, [3, 3], scope='conv4')
net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv4')
net = slim.ops.max_pool(net, [2, 2], scope='pool4')
net = slim.ops.repeat_op(2, net, slim.ops.conv2d, 512, [3, 3], scope='conv5')
net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv5')
net = slim.ops.max_pool(net, [2, 2], scope='pool5')
net = slim.ops.flatten(net, scope='flatten5')
net = slim.ops.fc(net, 4096, scope='fc6')
......
......@@ -122,7 +122,7 @@ def inception_v3(inputs,
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 32, [1, 1])
net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool])
end_points['mixed_35x35x256a'] = net
# mixed_1: 35 x 35 x 288.
with tf.variable_scope('mixed_35x35x288a'):
......@@ -138,7 +138,7 @@ def inception_v3(inputs,
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool])
end_points['mixed_35x35x288a'] = net
# mixed_2: 35 x 35 x 288.
with tf.variable_scope('mixed_35x35x288b'):
......@@ -154,7 +154,7 @@ def inception_v3(inputs,
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool])
end_points['mixed_35x35x288b'] = net
# mixed_3: 17 x 17 x 768.
with tf.variable_scope('mixed_17x17x768a'):
......@@ -167,7 +167,7 @@ def inception_v3(inputs,
stride=2, padding='VALID')
with tf.variable_scope('branch_pool'):
branch_pool = ops.max_pool(net, [3, 3], stride=2, padding='VALID')
net = tf.concat([branch3x3, branch3x3dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch3x3, branch3x3dbl, branch_pool])
end_points['mixed_17x17x768a'] = net
# mixed4: 17 x 17 x 768.
with tf.variable_scope('mixed_17x17x768b'):
......@@ -186,7 +186,7 @@ def inception_v3(inputs,
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool])
end_points['mixed_17x17x768b'] = net
# mixed_5: 17 x 17 x 768.
with tf.variable_scope('mixed_17x17x768c'):
......@@ -205,7 +205,7 @@ def inception_v3(inputs,
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool])
end_points['mixed_17x17x768c'] = net
# mixed_6: 17 x 17 x 768.
with tf.variable_scope('mixed_17x17x768d'):
......@@ -224,7 +224,7 @@ def inception_v3(inputs,
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool])
end_points['mixed_17x17x768d'] = net
# mixed_7: 17 x 17 x 768.
with tf.variable_scope('mixed_17x17x768e'):
......@@ -243,7 +243,7 @@ def inception_v3(inputs,
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool])
end_points['mixed_17x17x768e'] = net
# Auxiliary Head logits
aux_logits = tf.identity(end_points['mixed_17x17x768e'])
......@@ -276,7 +276,7 @@ def inception_v3(inputs,
stride=2, padding='VALID')
with tf.variable_scope('branch_pool'):
branch_pool = ops.max_pool(net, [3, 3], stride=2, padding='VALID')
net = tf.concat([branch3x3, branch7x7x3, branch_pool], 3)
net = tf.concat(axis=3, values=[branch3x3, branch7x7x3, branch_pool])
end_points['mixed_17x17x1280a'] = net
# mixed_9: 8 x 8 x 2048.
with tf.variable_scope('mixed_8x8x2048a'):
......@@ -284,17 +284,17 @@ def inception_v3(inputs,
branch1x1 = ops.conv2d(net, 320, [1, 1])
with tf.variable_scope('branch3x3'):
branch3x3 = ops.conv2d(net, 384, [1, 1])
branch3x3 = tf.concat([ops.conv2d(branch3x3, 384, [1, 3]),
ops.conv2d(branch3x3, 384, [3, 1])], 3)
branch3x3 = tf.concat(axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]),
ops.conv2d(branch3x3, 384, [3, 1])])
with tf.variable_scope('branch3x3dbl'):
branch3x3dbl = ops.conv2d(net, 448, [1, 1])
branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
branch3x3dbl = tf.concat([ops.conv2d(branch3x3dbl, 384, [1, 3]),
ops.conv2d(branch3x3dbl, 384, [3, 1])], 3)
branch3x3dbl = tf.concat(axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]),
ops.conv2d(branch3x3dbl, 384, [3, 1])])
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
net = tf.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch3x3, branch3x3dbl, branch_pool])
end_points['mixed_8x8x2048a'] = net
# mixed_10: 8 x 8 x 2048.
with tf.variable_scope('mixed_8x8x2048b'):
......@@ -302,17 +302,17 @@ def inception_v3(inputs,
branch1x1 = ops.conv2d(net, 320, [1, 1])
with tf.variable_scope('branch3x3'):
branch3x3 = ops.conv2d(net, 384, [1, 1])
branch3x3 = tf.concat([ops.conv2d(branch3x3, 384, [1, 3]),
ops.conv2d(branch3x3, 384, [3, 1])], 3)
branch3x3 = tf.concat(axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]),
ops.conv2d(branch3x3, 384, [3, 1])])
with tf.variable_scope('branch3x3dbl'):
branch3x3dbl = ops.conv2d(net, 448, [1, 1])
branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
branch3x3dbl = tf.concat([ops.conv2d(branch3x3dbl, 384, [1, 3]),
ops.conv2d(branch3x3dbl, 384, [3, 1])], 3)
branch3x3dbl = tf.concat(axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]),
ops.conv2d(branch3x3dbl, 384, [3, 1])])
with tf.variable_scope('branch_pool'):
branch_pool = ops.avg_pool(net, [3, 3])
branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
net = tf.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], 3)
net = tf.concat(axis=3, values=[branch1x1, branch3x3, branch3x3dbl, branch_pool])
end_points['mixed_8x8x2048b'] = net
# Final pooling and prediction
with tf.variable_scope('logits'):
......
......@@ -15,7 +15,7 @@
"""Contains convenience wrappers for typical Neural Network TensorFlow layers.
Additionally it maintains a collection with update_ops that need to be
updated after the ops have been computed, for exmaple to update moving means
updated after the ops have been computed, for example to update moving means
and moving variances of batch_norm.
Ops that have different behavior during training or eval have an is_training
......@@ -331,9 +331,9 @@ def one_hot_encoding(labels, num_classes, scope=None):
batch_size = labels.get_shape()[0]
indices = tf.expand_dims(tf.range(0, batch_size), 1)
labels = tf.cast(tf.expand_dims(labels, 1), indices.dtype)
concated = tf.concat([indices, labels], 1)
concated = tf.concat(axis=1, values=[indices, labels])
onehot_labels = tf.sparse_to_dense(
concated, tf.pack([batch_size, num_classes]), 1.0, 0.0)
concated, tf.stack([batch_size, num_classes]), 1.0, 0.0)
onehot_labels.set_shape([batch_size, num_classes])
return onehot_labels
......
......@@ -21,8 +21,6 @@ from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
from inception.slim import ops
from inception.slim import scopes
from inception.slim import variables
......@@ -420,7 +418,7 @@ class DropoutTest(tf.test.TestCase):
with self.test_session():
images = tf.random_uniform((5, height, width, 3), seed=1)
output = ops.dropout(images)
self.assertEquals(output.op.name, 'Dropout/dropout/mul_1')
self.assertEquals(output.op.name, 'Dropout/dropout/mul')
output.get_shape().assert_is_compatible_with(images.get_shape())
def testCreateDropoutNoTraining(self):
......@@ -601,8 +599,7 @@ class BatchNormTest(tf.test.TestCase):
output = ops.batch_norm(images, decay=0.1)
update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION)
with tf.control_dependencies(update_ops):
barrier = tf.no_op(name='gradient_barrier')
output = control_flow_ops.with_dependencies([barrier], output)
output = tf.identity(output)
# Initialize all variables
sess.run(tf.global_variables_initializer())
moving_mean = variables.get_variables('BatchNorm/moving_mean')[0]
......@@ -631,8 +628,7 @@ class BatchNormTest(tf.test.TestCase):
output = ops.batch_norm(images, decay=0.1, is_training=False)
update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION)
with tf.control_dependencies(update_ops):
barrier = tf.no_op(name='gradient_barrier')
output = control_flow_ops.with_dependencies([barrier], output)
output = tf.identity(output)
# Initialize all variables
sess.run(tf.global_variables_initializer())
moving_mean = variables.get_variables('BatchNorm/moving_mean')[0]
......@@ -665,8 +661,7 @@ class BatchNormTest(tf.test.TestCase):
output = ops.batch_norm(images, decay=0.1, is_training=False)
update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION)
with tf.control_dependencies(update_ops):
barrier = tf.no_op(name='gradient_barrier')
output = control_flow_ops.with_dependencies([barrier], output)
output = tf.identity(output)
# Initialize all variables
sess.run(tf.global_variables_initializer())
moving_mean = variables.get_variables('BatchNorm/moving_mean')[0]
......
......@@ -240,7 +240,7 @@ def global_step(device=''):
# Get the device for the variable.
with tf.device(variable_device(device, 'global_step')):
return tf.get_variable('global_step', shape=[], dtype=tf.int64,
initializer=tf.zeros_initializer,
initializer=tf.zeros_initializer(),
trainable=False, collections=collections)
......
Code for the Memory Module as described
in "Learning to Remember Rare Events" by
Lukasz Kaiser, Ofir Nachum, Aurko Roy, and Samy Bengio
published as a conference paper at ICLR 2017.
Requirements:
* TensorFlow (see tensorflow.org for how to install)
* Some basic command-line utilities (git, unzip).
Description:
The general memory module is located in memory.py.
Some code is provided to see the memory module in
action on the standard Omniglot dataset.
Download and setup the dataset using data_utils.py
and then run the training script train.py
(see example commands below).
Note that the structure and parameters of the model
are optimized for the data preparation as provided.
Quick Start:
First download and set-up Omniglot data by running
```
python data_utils.py
```
Then run the training script:
```
python train.py --memory_size=8192 \
--batch_size=16 --validation_length=50 \
--episode_width=5 --episode_length=30
```
The first validation batch may look like this (although it is noisy):
```
0-shot: 0.040, 1-shot: 0.404, 2-shot: 0.516, 3-shot: 0.604,
4-shot: 0.656, 5-shot: 0.684
```
At step 500 you may see something like this:
```
0-shot: 0.036, 1-shot: 0.836, 2-shot: 0.900, 3-shot: 0.940,
4-shot: 0.944, 5-shot: 0.916
```
At step 4000 you may see something like this:
```
0-shot: 0.044, 1-shot: 0.960, 2-shot: 1.000, 3-shot: 0.988,
4-shot: 0.972, 5-shot: 0.992
```
Maintained by Ofir Nachum (ofirnachum) and
Lukasz Kaiser (lukaszkaiser).
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
"""Data loading and other utilities.
Use this file to first copy over and pre-process the Omniglot dataset.
Simply call
python data_utils.py
"""
import cPickle as pickle
import logging
import os
import subprocess
import numpy as np
from scipy.misc import imresize
from scipy.misc import imrotate
from scipy.ndimage import imread
import tensorflow as tf
MAIN_DIR = ''
REPO_LOCATION = 'https://github.com/brendenlake/omniglot.git'
REPO_DIR = os.path.join(MAIN_DIR, 'omniglot')
DATA_DIR = os.path.join(REPO_DIR, 'python')
TRAIN_DIR = os.path.join(DATA_DIR, 'images_background')
TEST_DIR = os.path.join(DATA_DIR, 'images_evaluation')
DATA_FILE_FORMAT = os.path.join(MAIN_DIR, '%s_omni.pkl')
TRAIN_ROTATIONS = True # augment training data with rotations
TEST_ROTATIONS = False # augment testing data with rotations
IMAGE_ORIGINAL_SIZE = 105
IMAGE_NEW_SIZE = 28
def get_data():
"""Get data in form suitable for episodic training.
Returns:
Train and test data as dictionaries mapping
label to list of examples.
"""
with tf.gfile.GFile(DATA_FILE_FORMAT % 'train') as f:
processed_train_data = pickle.load(f)
with tf.gfile.GFile(DATA_FILE_FORMAT % 'test') as f:
processed_test_data = pickle.load(f)
train_data = {}
test_data = {}
for data, processed_data in zip([train_data, test_data],
[processed_train_data, processed_test_data]):
for image, label in zip(processed_data['images'],
processed_data['labels']):
if label not in data:
data[label] = []
data[label].append(image.reshape([-1]).astype('float32'))
intersection = set(train_data.keys()) & set(test_data.keys())
assert not intersection, 'Train and test data intersect.'
ok_num_examples = [len(ll) == 20 for _, ll in train_data.iteritems()]
assert all(ok_num_examples), 'Bad number of examples in train data.'
ok_num_examples = [len(ll) == 20 for _, ll in test_data.iteritems()]
assert all(ok_num_examples), 'Bad number of examples in test data.'
logging.info('Number of labels in train data: %d.', len(train_data))
logging.info('Number of labels in test data: %d.', len(test_data))
return train_data, test_data
def crawl_directory(directory, augment_with_rotations=False,
first_label=0):
"""Crawls data directory and returns stuff."""
label_idx = first_label
images = []
labels = []
info = []
# traverse root directory
for root, _, files in os.walk(directory):
logging.info('Reading files from %s', root)
fileflag = 0
for file_name in files:
full_file_name = os.path.join(root, file_name)
img = imread(full_file_name, flatten=True)
for i, angle in enumerate([0, 90, 180, 270]):
if not augment_with_rotations and i > 0:
break
images.append(imrotate(img, angle))
labels.append(label_idx + i)
info.append(full_file_name)
fileflag = 1
if fileflag:
label_idx += 4 if augment_with_rotations else 1
return images, labels, info
def resize_images(images, new_width, new_height):
"""Resize images to new dimensions."""
resized_images = np.zeros([images.shape[0], new_width, new_height],
dtype=np.float32)
for i in range(images.shape[0]):
resized_images[i, :, :] = imresize(images[i, :, :],
[new_width, new_height],
interp='bilinear',
mode=None)
return resized_images
def write_datafiles(directory, write_file,
resize=True, rotate=False,
new_width=IMAGE_NEW_SIZE, new_height=IMAGE_NEW_SIZE,
first_label=0):
"""Load and preprocess images from a directory and write them to a file.
Args:
directory: Directory of alphabet sub-directories.
write_file: Filename to write to.
resize: Whether to resize the images.
rotate: Whether to augment the dataset with rotations.
new_width: New resize width.
new_height: New resize height.
first_label: Label to start with.
Returns:
Number of new labels created.
"""
# these are the default sizes for Omniglot:
imgwidth = IMAGE_ORIGINAL_SIZE
imgheight = IMAGE_ORIGINAL_SIZE
logging.info('Reading the data.')
images, labels, info = crawl_directory(directory,
augment_with_rotations=rotate,
first_label=first_label)
images_np = np.zeros([len(images), imgwidth, imgheight], dtype=np.bool)
labels_np = np.zeros([len(labels)], dtype=np.uint32)
for i in xrange(len(images)):
images_np[i, :, :] = images[i]
labels_np[i] = labels[i]
if resize:
logging.info('Resizing images.')
resized_images = resize_images(images_np, new_width, new_height)
logging.info('Writing resized data in float32 format.')
data = {'images': resized_images,
'labels': labels_np,
'info': info}
with tf.gfile.GFile(write_file, 'w') as f:
pickle.dump(data, f)
else:
logging.info('Writing original sized data in boolean format.')
data = {'images': images_np,
'labels': labels_np,
'info': info}
with tf.gfile.GFile(write_file, 'w') as f:
pickle.dump(data, f)
return len(np.unique(labels_np))
def maybe_download_data():
"""Download Omniglot repo if it does not exist."""
if os.path.exists(REPO_DIR):
logging.info('It appears that Git repo already exists.')
else:
logging.info('It appears that Git repo does not exist.')
logging.info('Cloning now.')
subprocess.check_output('git clone %s' % REPO_LOCATION, shell=True)
if os.path.exists(TRAIN_DIR):
logging.info('It appears that train data has already been unzipped.')
else:
logging.info('It appears that train data has not been unzipped.')
logging.info('Unzipping now.')
subprocess.check_output('unzip %s.zip -d %s' % (TRAIN_DIR, DATA_DIR),
shell=True)
if os.path.exists(TEST_DIR):
logging.info('It appears that test data has already been unzipped.')
else:
logging.info('It appears that test data has not been unzipped.')
logging.info('Unzipping now.')
subprocess.check_output('unzip %s.zip -d %s' % (TEST_DIR, DATA_DIR),
shell=True)
def preprocess_omniglot():
"""Download and prepare raw Omniglot data.
Downloads the data from GitHub if it does not exist.
Then load the images, augment with rotations if desired.
Resize the images and write them to a pickle file.
"""
maybe_download_data()
directory = TRAIN_DIR
write_file = DATA_FILE_FORMAT % 'train'
num_labels = write_datafiles(
directory, write_file, resize=True, rotate=TRAIN_ROTATIONS,
new_width=IMAGE_NEW_SIZE, new_height=IMAGE_NEW_SIZE)
directory = TEST_DIR
write_file = DATA_FILE_FORMAT % 'test'
write_datafiles(directory, write_file, resize=True, rotate=TEST_ROTATIONS,
new_width=IMAGE_NEW_SIZE, new_height=IMAGE_NEW_SIZE,
first_label=num_labels)
def main(unused_argv):
logging.basicConfig(level=logging.INFO)
preprocess_omniglot()
if __name__ == '__main__':
tf.app.run()
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
"""Memory module for storing "nearest neighbors".
Implements a key-value memory for generalized one-shot learning
as described in the paper
"Learning to Remember Rare Events"
by Lukasz Kaiser, Ofir Nachum, Aurko Roy, Samy Bengio,
published as a conference paper at ICLR 2017.
"""
import numpy as np
import tensorflow as tf
class Memory(object):
"""Memory module."""
def __init__(self, key_dim, memory_size, vocab_size,
choose_k=256, alpha=0.1, correct_in_top=1, age_noise=8.0,
var_cache_device='', nn_device=''):
self.key_dim = key_dim
self.memory_size = memory_size
self.vocab_size = vocab_size
self.choose_k = min(choose_k, memory_size)
self.alpha = alpha
self.correct_in_top = correct_in_top
self.age_noise = age_noise
self.var_cache_device = var_cache_device # Variables are cached here.
self.nn_device = nn_device # Device to perform nearest neighbour matmul.
caching_device = var_cache_device if var_cache_device else None
self.update_memory = tf.constant(True) # Can be fed "false" if needed.
self.mem_keys = tf.get_variable(
'memkeys', [self.memory_size, self.key_dim], trainable=False,
initializer=tf.random_uniform_initializer(-0.0, 0.0),
caching_device=caching_device)
self.mem_vals = tf.get_variable(
'memvals', [self.memory_size], dtype=tf.int32, trainable=False,
initializer=tf.constant_initializer(0, tf.int32),
caching_device=caching_device)
self.mem_age = tf.get_variable(
'memage', [self.memory_size], dtype=tf.float32, trainable=False,
initializer=tf.constant_initializer(0.0), caching_device=caching_device)
self.recent_idx = tf.get_variable(
'recent_idx', [self.vocab_size], dtype=tf.int32, trainable=False,
initializer=tf.constant_initializer(0, tf.int32))
# variable for projecting query vector into memory key
self.query_proj = tf.get_variable(
'memory_query_proj', [self.key_dim, self.key_dim], dtype=tf.float32,
initializer=tf.truncated_normal_initializer(0, 0.01),
caching_device=caching_device)
def get(self):
return self.mem_keys, self.mem_vals, self.mem_age, self.recent_idx
def set(self, k, v, a, r=None):
return tf.group(
self.mem_keys.assign(k),
self.mem_vals.assign(v),
self.mem_age.assign(a),
(self.recent_idx.assign(r) if r is not None else tf.group()))
def clear(self):
return tf.variables_initializer([self.mem_keys, self.mem_vals, self.mem_age,
self.recent_idx])
def get_hint_pool_idxs(self, normalized_query):
"""Get small set of idxs to compute nearest neighbor queries on.
This is an expensive look-up on the whole memory that is used to
avoid more expensive operations later on.
Args:
normalized_query: A Tensor of shape [None, key_dim].
Returns:
A Tensor of shape [None, choose_k] of indices in memory
that are closest to the queries.
"""
# look up in large memory, no gradients
with tf.device(self.nn_device):
similarities = tf.matmul(tf.stop_gradient(normalized_query),
self.mem_keys, transpose_b=True, name='nn_mmul')
_, hint_pool_idxs = tf.nn.top_k(
tf.stop_gradient(similarities), k=self.choose_k, name='nn_topk')
return hint_pool_idxs
def make_update_op(self, upd_idxs, upd_keys, upd_vals,
batch_size, use_recent_idx, intended_output):
"""Function that creates all the update ops."""
mem_age_incr = self.mem_age.assign_add(tf.ones([self.memory_size],
dtype=tf.float32))
with tf.control_dependencies([mem_age_incr]):
mem_age_upd = tf.scatter_update(
self.mem_age, upd_idxs, tf.zeros([batch_size], dtype=tf.float32))
mem_key_upd = tf.scatter_update(
self.mem_keys, upd_idxs, upd_keys)
mem_val_upd = tf.scatter_update(
self.mem_vals, upd_idxs, upd_vals)
if use_recent_idx:
recent_idx_upd = tf.scatter_update(
self.recent_idx, intended_output, upd_idxs)
else:
recent_idx_upd = tf.group()
return tf.group(mem_age_upd, mem_key_upd, mem_val_upd, recent_idx_upd)
def query(self, query_vec, intended_output, use_recent_idx=True):
"""Queries memory for nearest neighbor.
Args:
query_vec: A batch of vectors to query (embedding of input to model).
intended_output: The values that would be the correct output of the
memory.
use_recent_idx: Whether to always insert at least one instance of a
correct memory fetch.
Returns:
A tuple (result, mask, teacher_loss).
result: The result of the memory look up.
mask: The affinity of the query to the result.
teacher_loss: The loss for training the memory module.
"""
batch_size = tf.shape(query_vec)[0]
output_given = intended_output is not None
# prepare query for memory lookup
query_vec = tf.matmul(query_vec, self.query_proj)
normalized_query = tf.nn.l2_normalize(query_vec, dim=1)
hint_pool_idxs = self.get_hint_pool_idxs(normalized_query)
if output_given and use_recent_idx: # add at least one correct memory
most_recent_hint_idx = tf.gather(self.recent_idx, intended_output)
hint_pool_idxs = tf.concat(
axis=1,
values=[hint_pool_idxs, tf.expand_dims(most_recent_hint_idx, 1)])
choose_k = tf.shape(hint_pool_idxs)[1]
with tf.device(self.var_cache_device):
# create small memory and look up with gradients
my_mem_keys = tf.stop_gradient(tf.gather(self.mem_keys, hint_pool_idxs,
name='my_mem_keys_gather'))
similarities = tf.matmul(tf.expand_dims(normalized_query, 1),
my_mem_keys, adjoint_b=True, name='batch_mmul')
hint_pool_sims = tf.squeeze(similarities, [1], name='hint_pool_sims')
hint_pool_mem_vals = tf.gather(self.mem_vals, hint_pool_idxs,
name='hint_pool_mem_vals')
# Calculate softmax mask on the top-k if requested.
# Softmax temperature. Say we have K elements at dist x and one at (x+a).
# Softmax of the last is e^tm(x+a)/Ke^tm*x + e^tm(x+a) = e^tm*a/K+e^tm*a.
# To make that 20% we'd need to have e^tm*a ~= 0.2K, so tm = log(0.2K)/a.
softmax_temp = max(1.0, np.log(0.2 * self.choose_k) / self.alpha)
mask = tf.nn.softmax(hint_pool_sims[:, :choose_k - 1] * softmax_temp)
# prepare hints from the teacher on hint pool
teacher_hints = tf.to_float(
tf.abs(tf.expand_dims(intended_output, 1) - hint_pool_mem_vals))
teacher_hints = 1.0 - tf.minimum(1.0, teacher_hints)
teacher_vals, teacher_hint_idxs = tf.nn.top_k(
hint_pool_sims * teacher_hints, k=1)
neg_teacher_vals, _ = tf.nn.top_k(
hint_pool_sims * (1 - teacher_hints), k=1)
# bring back idxs to full memory
teacher_idxs = tf.gather(
tf.reshape(hint_pool_idxs, [-1]),
teacher_hint_idxs[:, 0] + choose_k * tf.range(batch_size))
# zero-out teacher_vals if there are no hints
teacher_vals *= (
1 - tf.to_float(tf.equal(0.0, tf.reduce_sum(teacher_hints, 1))))
# prepare returned values
nearest_neighbor = tf.to_int32(
tf.argmax(hint_pool_sims[:, :choose_k - 1], 1))
no_teacher_idxs = tf.gather(
tf.reshape(hint_pool_idxs, [-1]),
nearest_neighbor + choose_k * tf.range(batch_size))
# we'll determine whether to do an update to memory based on whether
# memory was queried correctly
sliced_hints = tf.slice(teacher_hints, [0, 0], [-1, self.correct_in_top])
incorrect_memory_lookup = tf.equal(0.0, tf.reduce_sum(sliced_hints, 1))
# loss based on triplet loss
teacher_loss = (tf.nn.relu(neg_teacher_vals - teacher_vals + self.alpha)
- self.alpha)
with tf.device(self.var_cache_device):
result = tf.gather(self.mem_vals, tf.reshape(no_teacher_idxs, [-1]))
# prepare memory updates
update_keys = normalized_query
update_vals = intended_output
fetched_idxs = teacher_idxs # correctly fetched from memory
with tf.device(self.var_cache_device):
fetched_keys = tf.gather(self.mem_keys, fetched_idxs, name='fetched_keys')
fetched_vals = tf.gather(self.mem_vals, fetched_idxs, name='fetched_vals')
# do memory updates here
fetched_keys_upd = update_keys + fetched_keys # Momentum-like update
fetched_keys_upd = tf.nn.l2_normalize(fetched_keys_upd, dim=1)
# Randomize age a bit, e.g., to select different ones in parallel workers.
mem_age_with_noise = self.mem_age + tf.random_uniform(
[self.memory_size], - self.age_noise, self.age_noise)
_, oldest_idxs = tf.nn.top_k(mem_age_with_noise, k=batch_size, sorted=False)
with tf.control_dependencies([result]):
upd_idxs = tf.where(incorrect_memory_lookup,
oldest_idxs,
fetched_idxs)
# upd_idxs = tf.Print(upd_idxs, [upd_idxs], "UPD IDX", summarize=8)
upd_keys = tf.where(incorrect_memory_lookup,
update_keys,
fetched_keys_upd)
upd_vals = tf.where(incorrect_memory_lookup,
update_vals,
fetched_vals)
def make_update_op():
return self.make_update_op(upd_idxs, upd_keys, upd_vals,
batch_size, use_recent_idx, intended_output)
update_op = tf.cond(self.update_memory, make_update_op, tf.no_op)
with tf.control_dependencies([update_op]):
result = tf.identity(result)
mask = tf.identity(mask)
teacher_loss = tf.identity(teacher_loss)
return result, mask, tf.reduce_mean(teacher_loss)
class LSHMemory(Memory):
"""Memory employing locality sensitive hashing.
Note: Not fully tested.
"""
def __init__(self, key_dim, memory_size, vocab_size,
choose_k=256, alpha=0.1, correct_in_top=1, age_noise=8.0,
var_cache_device='', nn_device='',
num_hashes=None, num_libraries=None):
super(LSHMemory, self).__init__(
key_dim, memory_size, vocab_size,
choose_k=choose_k, alpha=alpha, correct_in_top=1, age_noise=age_noise,
var_cache_device=var_cache_device, nn_device=nn_device)
self.num_libraries = num_libraries or int(self.choose_k ** 0.5)
self.num_per_hash_slot = max(1, self.choose_k // self.num_libraries)
self.num_hashes = (num_hashes or
int(np.log2(self.memory_size / self.num_per_hash_slot)))
self.num_hashes = min(max(self.num_hashes, 1), 20)
self.num_hash_slots = 2 ** self.num_hashes
# hashing vectors
self.hash_vecs = [
tf.get_variable(
'hash_vecs%d' % i, [self.num_hashes, self.key_dim],
dtype=tf.float32, trainable=False,
initializer=tf.truncated_normal_initializer(0, 1))
for i in xrange(self.num_libraries)]
# map representing which hash slots map to which mem keys
self.hash_slots = [
tf.get_variable(
'hash_slots%d' % i, [self.num_hash_slots, self.num_per_hash_slot],
dtype=tf.int32, trainable=False,
initializer=tf.random_uniform_initializer(maxval=self.memory_size,
dtype=tf.int32))
for i in xrange(self.num_libraries)]
def get(self): # not implemented
return self.mem_keys, self.mem_vals, self.mem_age, self.recent_idx
def set(self, k, v, a, r=None): # not implemented
return tf.group(
self.mem_keys.assign(k),
self.mem_vals.assign(v),
self.mem_age.assign(a),
(self.recent_idx.assign(r) if r is not None else tf.group()))
def clear(self):
return tf.variables_initializer([self.mem_keys, self.mem_vals, self.mem_age,
self.recent_idx] + self.hash_slots)
def get_hash_slots(self, query):
"""Gets hashed-to buckets for batch of queries.
Args:
query: 2-d Tensor of query vectors.
Returns:
A list of hashed-to buckets for each hash function.
"""
binary_hash = [
tf.less(tf.matmul(query, self.hash_vecs[i], transpose_b=True), 0)
for i in xrange(self.num_libraries)]
hash_slot_idxs = [
tf.reduce_sum(
tf.to_int32(binary_hash[i]) *
tf.constant([[2 ** i for i in xrange(self.num_hashes)]],
dtype=tf.int32), 1)
for i in xrange(self.num_libraries)]
return hash_slot_idxs
def get_hint_pool_idxs(self, normalized_query):
"""Get small set of idxs to compute nearest neighbor queries on.
This is an expensive look-up on the whole memory that is used to
avoid more expensive operations later on.
Args:
normalized_query: A Tensor of shape [None, key_dim].
Returns:
A Tensor of shape [None, choose_k] of indices in memory
that are closest to the queries.
"""
# get hash of query vecs
hash_slot_idxs = self.get_hash_slots(normalized_query)
# grab mem idxs in the hash slots
hint_pool_idxs = [
tf.maximum(tf.minimum(
tf.gather(self.hash_slots[i], idxs),
self.memory_size - 1), 0)
for i, idxs in enumerate(hash_slot_idxs)]
return tf.concat(axis=1, values=hint_pool_idxs)
def make_update_op(self, upd_idxs, upd_keys, upd_vals,
batch_size, use_recent_idx, intended_output):
"""Function that creates all the update ops."""
base_update_op = super(LSHMemory, self).make_update_op(
upd_idxs, upd_keys, upd_vals,
batch_size, use_recent_idx, intended_output)
# compute hash slots to be updated
hash_slot_idxs = self.get_hash_slots(upd_keys)
# make updates
update_ops = []
with tf.control_dependencies([base_update_op]):
for i, slot_idxs in enumerate(hash_slot_idxs):
# for each slot, choose which entry to replace
entry_idx = tf.random_uniform([batch_size],
maxval=self.num_per_hash_slot,
dtype=tf.int32)
entry_mul = 1 - tf.one_hot(entry_idx, self.num_per_hash_slot,
dtype=tf.int32)
entry_add = (tf.expand_dims(upd_idxs, 1) *
tf.one_hot(entry_idx, self.num_per_hash_slot,
dtype=tf.int32))
mul_op = tf.scatter_mul(self.hash_slots[i], slot_idxs, entry_mul)
with tf.control_dependencies([mul_op]):
add_op = tf.scatter_add(self.hash_slots[i], slot_idxs, entry_add)
update_ops.append(add_op)
return tf.group(*update_ops)
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
"""Model using memory component.
The model embeds images using a standard CNN architecture.
These embeddings are used as keys to the memory component,
which returns nearest neighbors.
"""
import tensorflow as tf
import memory
FLAGS = tf.flags.FLAGS
class BasicClassifier(object):
def __init__(self, output_dim):
self.output_dim = output_dim
def core_builder(self, memory_val, x, y):
del x, y
y_pred = memory_val
loss = 0.0
return loss, y_pred
class LeNet(object):
"""Standard CNN architecture."""
def __init__(self, image_size, num_channels, hidden_dim):
self.image_size = image_size
self.num_channels = num_channels
self.hidden_dim = hidden_dim
self.matrix_init = tf.truncated_normal_initializer(stddev=0.1)
self.vector_init = tf.constant_initializer(0.0)
def core_builder(self, x):
"""Embeds x using standard CNN architecture.
Args:
x: Batch of images as a 2-d Tensor [batch_size, -1].
Returns:
A 2-d Tensor [batch_size, hidden_dim] of embedded images.
"""
ch1 = 32 * 2 # number of channels in 1st layer
ch2 = 64 * 2 # number of channels in 2nd layer
conv1_weights = tf.get_variable('conv1_w',
[3, 3, self.num_channels, ch1],
initializer=self.matrix_init)
conv1_biases = tf.get_variable('conv1_b', [ch1],
initializer=self.vector_init)
conv1a_weights = tf.get_variable('conv1a_w',
[3, 3, ch1, ch1],
initializer=self.matrix_init)
conv1a_biases = tf.get_variable('conv1a_b', [ch1],
initializer=self.vector_init)
conv2_weights = tf.get_variable('conv2_w', [3, 3, ch1, ch2],
initializer=self.matrix_init)
conv2_biases = tf.get_variable('conv2_b', [ch2],
initializer=self.vector_init)
conv2a_weights = tf.get_variable('conv2a_w', [3, 3, ch2, ch2],
initializer=self.matrix_init)
conv2a_biases = tf.get_variable('conv2a_b', [ch2],
initializer=self.vector_init)
# fully connected
fc1_weights = tf.get_variable(
'fc1_w', [self.image_size // 4 * self.image_size // 4 * ch2,
self.hidden_dim], initializer=self.matrix_init)
fc1_biases = tf.get_variable('fc1_b', [self.hidden_dim],
initializer=self.vector_init)
# define model
x = tf.reshape(x,
[-1, self.image_size, self.image_size, self.num_channels])
batch_size = tf.shape(x)[0]
conv1 = tf.nn.conv2d(x, conv1_weights,
strides=[1, 1, 1, 1], padding='SAME')
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))
conv1 = tf.nn.conv2d(relu1, conv1a_weights,
strides=[1, 1, 1, 1], padding='SAME')
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1a_biases))
pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
conv2 = tf.nn.conv2d(pool1, conv2_weights,
strides=[1, 1, 1, 1], padding='SAME')
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))
conv2 = tf.nn.conv2d(relu2, conv2a_weights,
strides=[1, 1, 1, 1], padding='SAME')
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2a_biases))
pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
reshape = tf.reshape(pool2, [batch_size, -1])
hidden = tf.matmul(reshape, fc1_weights) + fc1_biases
return hidden
class Model(object):
"""Model for coordinating between CNN embedder and Memory module."""
def __init__(self, input_dim, output_dim, rep_dim, memory_size, vocab_size,
learning_rate=0.0001, use_lsh=False):
self.input_dim = input_dim
self.output_dim = output_dim
self.rep_dim = rep_dim
self.memory_size = memory_size
self.vocab_size = vocab_size
self.learning_rate = learning_rate
self.use_lsh = use_lsh
self.embedder = self.get_embedder()
self.memory = self.get_memory()
self.classifier = self.get_classifier()
self.global_step = tf.contrib.framework.get_or_create_global_step()
def get_embedder(self):
return LeNet(int(self.input_dim ** 0.5), 1, self.rep_dim)
def get_memory(self):
cls = memory.LSHMemory if self.use_lsh else memory.Memory
return cls(self.rep_dim, self.memory_size, self.vocab_size)
def get_classifier(self):
return BasicClassifier(self.output_dim)
def core_builder(self, x, y, keep_prob, use_recent_idx=True):
embeddings = self.embedder.core_builder(x)
if keep_prob < 1.0:
embeddings = tf.nn.dropout(embeddings, keep_prob)
memory_val, _, teacher_loss = self.memory.query(
embeddings, y, use_recent_idx=use_recent_idx)
loss, y_pred = self.classifier.core_builder(memory_val, x, y)
return loss + teacher_loss, y_pred
def train(self, x, y):
loss, _ = self.core_builder(x, y, keep_prob=0.3)
gradient_ops = self.training_ops(loss)
return loss, gradient_ops
def eval(self, x, y):
_, y_preds = self.core_builder(x, y, keep_prob=1.0,
use_recent_idx=False)
return y_preds
def get_xy_placeholders(self):
return (tf.placeholder(tf.float32, [None, self.input_dim]),
tf.placeholder(tf.int32, [None]))
def setup(self):
"""Sets up all components of the computation graph."""
self.x, self.y = self.get_xy_placeholders()
with tf.variable_scope('core', reuse=None):
self.loss, self.gradient_ops = self.train(self.x, self.y)
with tf.variable_scope('core', reuse=True):
self.y_preds = self.eval(self.x, self.y)
# setup memory "reset" ops
(self.mem_keys, self.mem_vals,
self.mem_age, self.recent_idx) = self.memory.get()
self.mem_keys_reset = tf.placeholder(self.mem_keys.dtype,
tf.identity(self.mem_keys).shape)
self.mem_vals_reset = tf.placeholder(self.mem_vals.dtype,
tf.identity(self.mem_vals).shape)
self.mem_age_reset = tf.placeholder(self.mem_age.dtype,
tf.identity(self.mem_age).shape)
self.recent_idx_reset = tf.placeholder(self.recent_idx.dtype,
tf.identity(self.recent_idx).shape)
self.mem_reset_op = self.memory.set(self.mem_keys_reset,
self.mem_vals_reset,
self.mem_age_reset,
None)
def training_ops(self, loss):
opt = self.get_optimizer()
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
return opt.apply_gradients(zip(clipped_gradients, params),
global_step=self.global_step)
def get_optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.learning_rate,
epsilon=1e-4)
def one_step(self, sess, x, y):
outputs = [self.loss, self.gradient_ops]
return sess.run(outputs, feed_dict={self.x: x, self.y: y})
def episode_step(self, sess, x, y, clear_memory=False):
"""Performs training steps on episodic input.
Args:
sess: A Tensorflow Session.
x: A list of batches of images defining the episode.
y: A list of batches of labels corresponding to x.
clear_memory: Whether to clear the memory before the episode.
Returns:
List of losses the same length as the episode.
"""
outputs = [self.loss, self.gradient_ops]
if clear_memory:
self.clear_memory(sess)
losses = []
for xx, yy in zip(x, y):
out = sess.run(outputs, feed_dict={self.x: xx, self.y: yy})
loss = out[0]
losses.append(loss)
return losses
def predict(self, sess, x, y=None):
"""Predict the labels on a single batch of examples.
Args:
sess: A Tensorflow Session.
x: A batch of images.
y: The labels for the images in x.
This allows for updating the memory.
Returns:
Predicted y.
"""
cur_memory = sess.run([self.mem_keys, self.mem_vals,
self.mem_age])
outputs = [self.y_preds]
if y is None:
ret = sess.run(outputs, feed_dict={self.x: x})
else:
ret = sess.run(outputs, feed_dict={self.x: x, self.y: y})
sess.run([self.mem_reset_op],
feed_dict={self.mem_keys_reset: cur_memory[0],
self.mem_vals_reset: cur_memory[1],
self.mem_age_reset: cur_memory[2]})
return ret
def episode_predict(self, sess, x, y, clear_memory=False):
"""Predict the labels on an episode of examples.
Args:
sess: A Tensorflow Session.
x: A list of batches of images.
y: A list of labels for the images in x.
This allows for updating the memory.
clear_memory: Whether to clear the memory before the episode.
Returns:
List of predicted y.
"""
cur_memory = sess.run([self.mem_keys, self.mem_vals,
self.mem_age])
if clear_memory:
self.clear_memory(sess)
outputs = [self.y_preds]
y_preds = []
for xx, yy in zip(x, y):
out = sess.run(outputs, feed_dict={self.x: xx, self.y: yy})
y_pred = out[0]
y_preds.append(y_pred)
sess.run([self.mem_reset_op],
feed_dict={self.mem_keys_reset: cur_memory[0],
self.mem_vals_reset: cur_memory[1],
self.mem_age_reset: cur_memory[2]})
return y_preds
def clear_memory(self, sess):
sess.run([self.memory.clear()])
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
r"""Script for training model.
Simple command to get up and running:
python train.py --memory_size=8192 \
--batch_size=16 --validation_length=50 \
--episode_width=5 --episode_length=30
"""
import logging
import os
import random
import numpy as np
import tensorflow as tf
import data_utils
import model
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_integer('rep_dim', 128,
'dimension of keys to use in memory')
tf.flags.DEFINE_integer('episode_length', 100, 'length of episode')
tf.flags.DEFINE_integer('episode_width', 5,
'number of distinct labels in a single episode')
tf.flags.DEFINE_integer('memory_size', None, 'number of slots in memory. '
'Leave as None to default to episode length')
tf.flags.DEFINE_integer('batch_size', 16, 'batch size')
tf.flags.DEFINE_integer('num_episodes', 100000, 'number of training episodes')
tf.flags.DEFINE_integer('validation_frequency', 20,
'every so many training episodes, '
'assess validation accuracy')
tf.flags.DEFINE_integer('validation_length', 10,
'number of episodes to use to compute '
'validation accuracy')
tf.flags.DEFINE_integer('seed', 888, 'random seed for training sampling')
tf.flags.DEFINE_string('save_dir', '', 'directory to save model to')
tf.flags.DEFINE_bool('use_lsh', False,
'use locality-sensitive hashing '
'(NOTE: not fully tested)')
class Trainer(object):
"""Class that takes care of training, validating, and checkpointing model."""
def __init__(self, train_data, valid_data, input_dim, output_dim=None):
self.train_data = train_data
self.valid_data = valid_data
self.input_dim = input_dim
self.rep_dim = FLAGS.rep_dim
self.episode_length = FLAGS.episode_length
self.episode_width = FLAGS.episode_width
self.batch_size = FLAGS.batch_size
self.memory_size = (self.episode_length * self.batch_size
if FLAGS.memory_size is None else FLAGS.memory_size)
self.use_lsh = FLAGS.use_lsh
self.output_dim = (output_dim if output_dim is not None
else self.episode_width)
def get_model(self):
# vocab size is the number of distinct values that
# could go into the memory key-value storage
vocab_size = self.episode_width * self.batch_size
return model.Model(
self.input_dim, self.output_dim, self.rep_dim, self.memory_size,
vocab_size, use_lsh=self.use_lsh)
def sample_episode_batch(self, data,
episode_length, episode_width, batch_size):
"""Generates a random batch for training or validation.
Structures each element of the batch as an 'episode'.
Each episode contains episode_length examples and
episode_width distinct labels.
Args:
data: A dictionary mapping label to list of examples.
episode_length: Number of examples in each episode.
episode_width: Distinct number of labels in each episode.
batch_size: Batch size (number of episodes).
Returns:
A tuple (x, y) where x is a list of batches of examples
with size episode_length and y is a list of batches of labels.
"""
episodes_x = [[] for _ in xrange(episode_length)]
episodes_y = [[] for _ in xrange(episode_length)]
assert len(data) >= episode_width
keys = data.keys()
for b in xrange(batch_size):
episode_labels = random.sample(keys, episode_width)
remainder = episode_length % episode_width
remainders = [0] * (episode_width - remainder) + [1] * remainder
episode_x = [
random.sample(data[lab],
r + (episode_length - remainder) / episode_width)
for lab, r in zip(episode_labels, remainders)]
episode = sum([[(x, i, ii) for ii, x in enumerate(xx)]
for i, xx in enumerate(episode_x)], [])
random.shuffle(episode)
# Arrange episode so that each distinct label is seen before moving to
# 2nd showing
episode.sort(key=lambda elem: elem[2])
assert len(episode) == episode_length
for i in xrange(episode_length):
episodes_x[i].append(episode[i][0])
episodes_y[i].append(episode[i][1] + b * episode_width)
return ([np.array(xx).astype('float32') for xx in episodes_x],
[np.array(yy).astype('int32') for yy in episodes_y])
def compute_correct(self, ys, y_preds):
return np.mean(np.equal(y_preds, np.array(ys)))
def individual_compute_correct(self, y, y_pred):
return y_pred == y
def run(self):
"""Performs training.
Trains a model using episodic training.
Every so often, runs some evaluations on validation data.
"""
train_data, valid_data = self.train_data, self.valid_data
input_dim, output_dim = self.input_dim, self.output_dim
rep_dim, episode_length = self.rep_dim, self.episode_length
episode_width, memory_size = self.episode_width, self.memory_size
batch_size = self.batch_size
train_size = len(train_data)
valid_size = len(valid_data)
logging.info('train_size (number of labels) %d', train_size)
logging.info('valid_size (number of labels) %d', valid_size)
logging.info('input_dim %d', input_dim)
logging.info('output_dim %d', output_dim)
logging.info('rep_dim %d', rep_dim)
logging.info('episode_length %d', episode_length)
logging.info('episode_width %d', episode_width)
logging.info('memory_size %d', memory_size)
logging.info('batch_size %d', batch_size)
assert all(len(v) >= float(episode_length) / episode_width
for v in train_data.itervalues())
assert all(len(v) >= float(episode_length) / episode_width
for v in valid_data.itervalues())
output_dim = episode_width
self.model = self.get_model()
self.model.setup()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(max_to_keep=10)
ckpt = None
if FLAGS.save_dir:
ckpt = tf.train.get_checkpoint_state(FLAGS.save_dir)
if ckpt and ckpt.model_checkpoint_path:
logging.info('restoring from %s', ckpt.model_checkpoint_path)
saver.restore(sess, ckpt.model_checkpoint_path)
logging.info('starting now')
losses = []
random.seed(FLAGS.seed)
np.random.seed(FLAGS.seed)
for i in xrange(FLAGS.num_episodes):
x, y = self.sample_episode_batch(
train_data, episode_length, episode_width, batch_size)
outputs = self.model.episode_step(sess, x, y, clear_memory=True)
loss = outputs
losses.append(loss)
if i % FLAGS.validation_frequency == 0:
logging.info('episode batch %d, avg train loss %f',
i, np.mean(losses))
losses = []
# validation
correct = []
correct_by_shot = dict((k, []) for k in xrange(self.episode_width + 1))
for _ in xrange(FLAGS.validation_length):
x, y = self.sample_episode_batch(
valid_data, episode_length, episode_width, 1)
outputs = self.model.episode_predict(
sess, x, y, clear_memory=True)
y_preds = outputs
correct.append(self.compute_correct(np.array(y), y_preds))
# compute per-shot accuracies
seen_counts = [[0] * episode_width for _ in xrange(batch_size)]
# loop over episode steps
for yy, yy_preds in zip(y, y_preds):
# loop over batch examples
for k, (yyy, yyy_preds) in enumerate(zip(yy, yy_preds)):
yyy, yyy_preds = int(yyy), int(yyy_preds)
count = seen_counts[k][yyy % self.episode_width]
if count in correct_by_shot:
correct_by_shot[count].append(
self.individual_compute_correct(yyy, yyy_preds))
seen_counts[k][yyy % self.episode_width] = count + 1
logging.info('validation overall accuracy %f', np.mean(correct))
logging.info('%d-shot: %.3f, ' * (self.episode_width + 1),
*sum([[k, np.mean(correct_by_shot[k])]
for k in xrange(self.episode_width + 1)], []))
if saver and FLAGS.save_dir:
saved_file = saver.save(sess,
os.path.join(FLAGS.save_dir, 'model.ckpt'),
global_step=self.model.global_step)
logging.info('saved model to %s', saved_file)
def main(unused_argv):
train_data, valid_data = data_utils.get_data()
trainer = Trainer(train_data, valid_data, data_utils.IMAGE_NEW_SIZE ** 2)
trainer.run()
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
tf.app.run()
......@@ -73,7 +73,7 @@ LSTM-8192-2048 (50\% Dropout) | 32.2 | 3.3
<b>How To Run</b>
Pre-requesite:
Prerequisites:
* Install TensorFlow.
* Install Bazel.
......@@ -97,7 +97,7 @@ Pre-requesite:
[link](http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt)
* test dataset: link
[link](http://download.tensorflow.org/models/LM_LSTM_CNN/test/news.en.heldout-00000-of-00050)
* It is recommended to run on modern desktop instead of laptop.
* It is recommended to run on a modern desktop instead of a laptop.
```shell
# 1. Clone the code to your workspace.
......@@ -105,7 +105,7 @@ Pre-requesite:
# 3. Create an empty WORKSPACE file in your workspace.
# 4. Create an empty output directory in your workspace.
# Example directory structure below:
ls -R
$ ls -R
.:
data lm_1b output WORKSPACE
......@@ -121,13 +121,13 @@ BUILD data_utils.py lm_1b_eval.py README.md
./output:
# Build the codes.
bazel build -c opt lm_1b/...
$ bazel build -c opt lm_1b/...
# Run sample mode:
bazel-bin/lm_1b/lm_1b_eval --mode sample \
--prefix "I love that I" \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--ckpt 'data/ckpt-*'
$ bazel-bin/lm_1b/lm_1b_eval --mode sample \
--prefix "I love that I" \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--ckpt 'data/ckpt-*'
...(omitted some TensorFlow output)
I love
I love that
......@@ -138,11 +138,11 @@ I love that I find that amazing
...(omitted)
# Run eval mode:
bazel-bin/lm_1b/lm_1b_eval --mode eval \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--input_data data/news.en.heldout-00000-of-00050 \
--ckpt 'data/ckpt-*'
$ bazel-bin/lm_1b/lm_1b_eval --mode eval \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--input_data data/news.en.heldout-00000-of-00050 \
--ckpt 'data/ckpt-*'
...(omitted some TensorFlow output)
Loaded step 14108582.
# perplexity is high initially because words without context are harder to
......@@ -166,28 +166,28 @@ Eval Step: 4531, Average Perplexity: 29.285674.
...(omitted. At convergence, it should be around 30.)
# Run dump_emb mode:
bazel-bin/lm_1b/lm_1b_eval --mode dump_emb \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--ckpt 'data/ckpt-*' \
--save_dir output
$ bazel-bin/lm_1b/lm_1b_eval --mode dump_emb \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--ckpt 'data/ckpt-*' \
--save_dir output
...(omitted some TensorFlow output)
Finished softmax weights
Finished word embedding 0/793471
Finished word embedding 1/793471
Finished word embedding 2/793471
...(omitted)
ls output/
$ ls output/
embeddings_softmax.npy ...
# Run dump_lstm_emb mode:
bazel-bin/lm_1b/lm_1b_eval --mode dump_lstm_emb \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--ckpt 'data/ckpt-*' \
--sentence "I love who I am ." \
--save_dir output
ls output/
$ bazel-bin/lm_1b/lm_1b_eval --mode dump_lstm_emb \
--pbtxt data/graph-2016-09-10.pbtxt \
--vocab_file data/vocab-2016-09-10.txt \
--ckpt 'data/ckpt-*' \
--sentence "I love who I am ." \
--save_dir output
$ ls output/
lstm_emb_step_0.npy lstm_emb_step_2.npy lstm_emb_step_4.npy
lstm_emb_step_6.npy lstm_emb_step_1.npy lstm_emb_step_3.npy
lstm_emb_step_5.npy
......
......@@ -19,6 +19,7 @@ import os
import sys
import numpy as np
from six.moves import xrange
import tensorflow as tf
from google.protobuf import text_format
......@@ -83,7 +84,7 @@ def _LoadModel(gd_file, ckpt_file):
with tf.Graph().as_default():
sys.stderr.write('Recovering graph.\n')
with tf.gfile.FastGFile(gd_file, 'r') as f:
s = f.read()
s = f.read().decode()
gd = tf.GraphDef()
text_format.Merge(s, gd)
......@@ -230,7 +231,7 @@ def _DumpEmb(vocab):
sys.stderr.write('Finished softmax weights\n')
all_embs = np.zeros([vocab.size, 1024])
for i in range(vocab.size):
for i in xrange(vocab.size):
input_dict = {t['inputs_in']: inputs,
t['targets_in']: targets,
t['target_weights_in']: weights}
......
......@@ -58,7 +58,7 @@ def _letter_to_number(letter):
def namignizer_iterator(names, counts, batch_size, num_steps, epoch_size):
"""Takes a list of names and counts like those output from read_names, and
makes an iterator yielding a batch_size by num_steps array of random names
separated by an end of name token. The names are choosen randomly according
separated by an end of name token. The names are chosen randomly according
to their counts. The batch may end mid-name
Args:
......
......@@ -37,11 +37,14 @@ class NamignizerModel(object):
self._weights = tf.placeholder(tf.float32, [batch_size * num_steps])
# lstm for our RNN cell (GRU supported too)
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
if is_training and config.keep_prob < 1:
lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
lstm_cell, output_keep_prob=config.keep_prob)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)
lstm_cells = []
for layer in range(config.num_layers):
lstm_cell = tf.contrib.rnn.BasicLSTMCell(size, forget_bias=0.0)
if is_training and config.keep_prob < 1:
lstm_cell = tf.contrib.rnn.DropoutWrapper(
lstm_cell, output_keep_prob=config.keep_prob)
lstm_cells.append(lstm_cell)
cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)
self._initial_state = cell.zero_state(batch_size, tf.float32)
......@@ -61,11 +64,11 @@ class NamignizerModel(object):
(cell_output, state) = cell(inputs[:, time_step, :], state)
outputs.append(cell_output)
output = tf.reshape(tf.concat(1, outputs), [-1, size])
output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, size])
softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
softmax_b = tf.get_variable("softmax_b", [vocab_size])
logits = tf.matmul(output, softmax_w) + softmax_b
loss = tf.nn.seq2seq.sequence_loss_by_example(
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
[logits],
[tf.reshape(self._targets, [-1])],
[self._weights])
......@@ -77,7 +80,7 @@ class NamignizerModel(object):
self._activations = tf.nn.softmax(logits)
# ability to save the model
self.saver = tf.train.Saver(tf.all_variables())
self.saver = tf.train.Saver(tf.global_variables())
if not is_training:
return
......
......@@ -14,7 +14,7 @@
"""A library showing off sequence recognition and generation with the simple
example of names.
We use recurrent neural nets to learn complex functions able to recogize and
We use recurrent neural nets to learn complex functions able to recognize and
generate sequences of a given form. This can be used for natural language
syntax recognition, dynamically generating maps or puzzles and of course
baby name generation.
......@@ -122,7 +122,6 @@ def run_epoch(session, m, names, counts, epoch_size, eval_op, verbose=False):
cost, _ = session.run([m.cost, eval_op],
{m.input_data: x,
m.targets: y,
m.initial_state: m.initial_state.eval(),
m.weights: np.ones(m.batch_size * m.num_steps)})
costs += cost
iters += m.num_steps
......@@ -201,7 +200,6 @@ def namignize(names, checkpoint_path, config):
cost, loss, _ = session.run([m.cost, m.loss, tf.no_op()],
{m.input_data: x,
m.targets: y,
m.initial_state: m.initial_state.eval(),
m.weights: np.concatenate((
np.ones(len(name)), np.zeros(m.batch_size * m.num_steps - len(name))))})
......@@ -234,7 +232,6 @@ def namignator(checkpoint_path, config):
activations, final_state, _ = session.run([m.activations, m.final_state, tf.no_op()],
{m.input_data: np.zeros((1, 1)),
m.targets: np.zeros((1, 1)),
m.initial_state: m.initial_state.eval(),
m.weights: np.ones(1)})
# sample from our softmax activations
......@@ -254,9 +251,9 @@ def namignator(checkpoint_path, config):
if __name__ == "__main__":
# train("data/SmallNames.txt", "model/namignizer", SmallConfig)
train("data/SmallNames.txt", "model/namignizer", SmallConfig)
# namignize(["mary", "ida", "gazorbazorb", "mmmhmm", "bob"],
# tf.train.latest_checkpoint("model"), SmallConfig)
namignize(["mary", "ida", "gazorbazorb", "mmmhmm", "bob"],
tf.train.latest_checkpoint("model"), SmallConfig)
# namignator(tf.train.latest_checkpoint("model"), SmallConfig)
namignator(tf.train.latest_checkpoint("model"), SmallConfig)
# NeuralGPU
Code for the Neural GPU model described in [[http://arxiv.org/abs/1511.08228]].
The extended version was described in [[https://arxiv.org/abs/1610.08613]].
Code for the Neural GPU model described in http://arxiv.org/abs/1511.08228.
The extended version was described in https://arxiv.org/abs/1610.08613.
Requirements:
* TensorFlow (see tensorflow.org for how to install)
......
......@@ -36,7 +36,7 @@ def conv_linear(args, kw, kh, nin, nout, rate, do_bias, bias_start, prefix):
if len(args) == 1:
arg = args[0]
else:
arg = tf.concat(args, 3)
arg = tf.concat(axis=3, values=args)
res = tf.nn.convolution(arg, k, dilation_rate=(rate, 1), padding="SAME")
if not do_bias: return res
with tf.device("/cpu:0"):
......@@ -71,14 +71,14 @@ def place_at14(decided, selected, it):
"""Place selected at it-th coordinate of decided, dim=1 of 4."""
slice1 = decided[:, :it, :, :]
slice2 = decided[:, it + 1:, :, :]
return tf.concat([slice1, selected, slice2], 1)
return tf.concat(axis=1, values=[slice1, selected, slice2])
def place_at13(decided, selected, it):
"""Place selected at it-th coordinate of decided, dim=1 of 3."""
slice1 = decided[:, :it, :]
slice2 = decided[:, it + 1:, :]
return tf.concat([slice1, selected, slice2], 1)
return tf.concat(axis=1, values=[slice1, selected, slice2])
def tanh_cutoff(x, cutoff):
......@@ -211,7 +211,7 @@ def reorder_beam(beam_size, batch_size, beam_val, output, is_first,
# beam_val is [batch_size x beam_size]; let b = batch_size * beam_size
# decided is len x b x a x b
# output is b x out_size; step is b x len x a x b;
outputs = tf.split(tf.nn.log_softmax(output), beam_size, 0)
outputs = tf.split(axis=0, num_or_size_splits=beam_size, value=tf.nn.log_softmax(output))
all_beam_vals, all_beam_idx = [], []
beam_range = 1 if is_first else beam_size
for i in xrange(beam_range):
......@@ -221,9 +221,9 @@ def reorder_beam(beam_size, batch_size, beam_val, output, is_first,
cur_beam_val], "GREPO", summarize=8)
all_beam_vals.append(top_out + tf.expand_dims(cur_beam_val, 1))
all_beam_idx.append(top_out_idx)
all_beam_idx = tf.reshape(tf.transpose(tf.concat(all_beam_idx, 1), [1, 0]),
all_beam_idx = tf.reshape(tf.transpose(tf.concat(axis=1, values=all_beam_idx), [1, 0]),
[-1])
top_beam, top_beam_idx = tf.nn.top_k(tf.concat(all_beam_vals, 1), k=beam_size)
top_beam, top_beam_idx = tf.nn.top_k(tf.concat(axis=1, values=all_beam_vals), k=beam_size)
top_beam_idx = tf.Print(top_beam_idx, [top_beam, top_beam_idx],
"GREP", summarize=8)
reordered = [[] for _ in xrange(len(tensors_to_reorder) + 1)]
......@@ -236,8 +236,8 @@ def reorder_beam(beam_size, batch_size, beam_val, output, is_first,
reordered[0].append(tf.gather(output, which_beam))
for i, t in enumerate(tensors_to_reorder):
reordered[i + 1].append(tf.gather(t, which_beam))
new_tensors = [tf.concat(t, 0) for t in reordered]
top_out_idx = tf.concat(top_out_idx, 0)
new_tensors = [tf.concat(axis=0, values=t) for t in reordered]
top_out_idx = tf.concat(axis=0, values=top_out_idx)
return (top_beam, new_tensors[0], top_out_idx, new_tensors[1:])
......@@ -266,9 +266,9 @@ class NeuralGPU(object):
self.input = tf.placeholder(tf.int32, name="inp")
self.target = tf.placeholder(tf.int32, name="tgt")
self.prev_step = tf.placeholder(tf.float32, name="prev_step")
gpu_input = tf.split(self.input, num_gpus, 0)
gpu_target = tf.split(self.target, num_gpus, 0)
gpu_prev_step = tf.split(self.prev_step, num_gpus, 0)
gpu_input = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.input)
gpu_target = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.target)
gpu_prev_step = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.prev_step)
batch_size = tf.shape(gpu_input[0])[0]
if backward:
......@@ -410,7 +410,7 @@ class NeuralGPU(object):
out_write = output_ta.write(it, output_l[:batch_size, :, :, :])
output = tf.gather(target_emb_weights, out)
output = tf.reshape(output, [-1, 1, nmaps])
output = tf.concat([output] * height, 1)
output = tf.concat(axis=1, values=[output] * height)
tgt = tgts[it, :, :, :]
selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling),
lambda: output, lambda: tgt)
......@@ -419,7 +419,7 @@ class NeuralGPU(object):
out_idx = place_at13(
out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it)
if mem_size > 0:
mem = tf.concat([mem] * height, 2)
mem = tf.concat(axis=2, values=[mem] * height)
dec_write = place_at14(dec_write, mem, it_incr)
return (step, dec_write, out_write, mloss + mem_loss, nupd_in + nupd,
out_idx, beam_cost)
......@@ -459,7 +459,7 @@ class NeuralGPU(object):
gpu_targets_tn)
embedded_targets_tn = tf.transpose(
embedded_targets_tn, [2, 0, 1, 3]) # len x b x 1 x nmaps
embedded_targets_tn = tf.concat([embedded_targets_tn] * height, 2)
embedded_targets_tn = tf.concat(axis=2, values=[embedded_targets_tn] * height)
# First image comes from start by applying convolution and adding 0s.
start = tf.transpose(start, [0, 2, 1, 3]) # Now b x len x h x vec_s
......@@ -478,8 +478,10 @@ class NeuralGPU(object):
# This is just for running a baseline RNN seq2seq model.
if do_rnn:
self.after_enc_step.append(step) # Not meaningful here, but needed.
lstm_cell = tf.contrib.rnn.BasicLSTMCell(height * nmaps)
cell = tf.contrib.rnn.MultiRNNCell([lstm_cell] * nconvs)
def lstm_cell():
return tf.contrib.rnn.BasicLSTMCell(height * nmaps)
cell = tf.contrib.rnn.MultiRNNCell(
[lstm_cell() for _ in range(nconvs)])
with tf.variable_scope("encoder"):
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
cell, tf.reshape(step, [batch_size, length, height * nmaps]),
......@@ -505,7 +507,7 @@ class NeuralGPU(object):
attn_res = attention_query(attn_q, tf.get_variable(
"attn_v", [height * nmaps],
initializer=tf.random_uniform_initializer(-0.1, 0.1)))
concatenated = tf.reshape(tf.concat([cell_inp, attn_res], 1),
concatenated = tf.reshape(tf.concat(axis=1, values=[cell_inp, attn_res]),
[batch_size, 2 * height * nmaps])
cell_inp = tf.layers.dense(
concatenated, height * nmaps, name="attn_merge")
......@@ -519,14 +521,14 @@ class NeuralGPU(object):
res = tf.gather(target_emb_weights, res)
res *= tf.expand_dims(mask[:, 0], 1)
output = tf.layers.dense(
tf.concat([output, res], 1), height * nmaps, name="rnnmem")
tf.concat(axis=1, values=[output, res]), height * nmaps, name="rnnmem")
return new_state, output, mem_loss
# pylint: enable=cell-var-from-loop
gpu_targets = tf.squeeze(gpu_target[gpu], [1]) # b x len
gpu_tgt_trans = tf.transpose(gpu_targets, [1, 0])
dec_zero = tf.zeros([batch_size, 1], dtype=tf.int32)
dec_inp = tf.concat([dec_zero, gpu_targets], 1)
dec_inp = tf.concat(axis=1, values=[dec_zero, gpu_targets])
dec_inp = dec_inp[:, :length]
embedded_dec_inp = tf.gather(target_emb_weights, dec_inp)
embedded_dec_inp_proj = tf.layers.dense(
......@@ -573,9 +575,9 @@ class NeuralGPU(object):
height, vec_size])
# Prepare for beam search.
tgts = tf.concat([embedded_targets_tn] * beam_size, 1)
tgts = tf.concat(axis=1, values=[embedded_targets_tn] * beam_size)
beam_cost = tf.zeros([batch_size, beam_size])
step = tf.concat([step] * beam_size, 0)
step = tf.concat(axis=0, values=[step] * beam_size)
# First step hard-coded.
step, decided_t, output_ta, mem_loss, nupd, oi, bc = dec_step(
step, 0, 0, decided_t, output_ta, tgts, 0.0, 0, out_idx,
......@@ -654,7 +656,7 @@ class NeuralGPU(object):
% (gpu, time.time() - start_time))
self.updates = []
self.after_enc_step = tf.concat(self.after_enc_step, 0) # Concat GPUs.
self.after_enc_step = tf.concat(axis=0, values=self.after_enc_step) # Concat GPUs.
if backward:
tf.get_variable_scope()._reuse = False
tf.get_variable_scope().set_caching_device(None)
......@@ -667,10 +669,10 @@ class NeuralGPU(object):
self.losses = [gpu_avg([gpu_losses[g][i] for g in xrange(num_gpus)])
for i in xrange(len(gpu_losses[0]))]
self.out_idx = tf.concat(gpu_out_idx, 0)
self.out_idx = tf.concat(axis=0, values=gpu_out_idx)
self.grad_norms = [gpu_avg([gpu_grad_norms[g][i] for g in xrange(num_gpus)])
for i in xrange(len(gpu_grad_norms[0]))]
self.outputs = [tf.concat([gpu_outputs[g] for g in xrange(num_gpus)], 1)]
self.outputs = [tf.concat(axis=1, values=[gpu_outputs[g] for g in xrange(num_gpus)])]
self.quantize_op = quantize_weights_op(512, 8)
if backward:
self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment