Commit dff0f0c1 authored by Alexander Gorban

Merge branch 'master' of github.com:tensorflow/models

parents da341f70 36203f09
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Image/Mask decoder used while pretraining the network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
_FEATURE_MAP_SIZE = 8
def _postprocess_im(images):
"""Performs post-processing for the images returned from conv net.
Transforms the value from [-1, 1] to [0, 1].
"""
return (images + 1) * 0.5
def model(identities, poses, params, is_training):
"""Decoder model to get image and mask from latent embedding."""
del is_training
f_dim = params.f_dim
fc_dim = params.fc_dim
with slim.arg_scope(
[slim.fully_connected, slim.conv2d_transpose],
weights_initializer=tf.truncated_normal_initializer(stddev=0.02, seed=1)):
# Concatenate the identity and pose units
h0 = tf.concat([identities, poses], 1)
h0 = slim.fully_connected(h0, fc_dim, activation_fn=tf.nn.relu)
h1 = slim.fully_connected(h0, fc_dim, activation_fn=tf.nn.relu)
# Mask decoder
dec_m0 = slim.fully_connected(
h1, (_FEATURE_MAP_SIZE**2) * f_dim * 2, activation_fn=tf.nn.relu)
dec_m0 = tf.reshape(
dec_m0, [-1, _FEATURE_MAP_SIZE, _FEATURE_MAP_SIZE, f_dim * 2])
dec_m1 = slim.conv2d_transpose(
dec_m0, f_dim, [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_m2 = slim.conv2d_transpose(
dec_m1, int(f_dim / 2), [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_m3 = slim.conv2d_transpose(
dec_m2, 1, [5, 5], stride=2, activation_fn=tf.nn.sigmoid)
# Image decoder
dec_i0 = slim.fully_connected(
h1, (_FEATURE_MAP_SIZE**2) * f_dim * 4, activation_fn=tf.nn.relu)
dec_i0 = tf.reshape(
dec_i0, [-1, _FEATURE_MAP_SIZE, _FEATURE_MAP_SIZE, f_dim * 4])
dec_i1 = slim.conv2d_transpose(
dec_i0, f_dim * 2, [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_i2 = slim.conv2d_transpose(
dec_i1, f_dim * 2, [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_i3 = slim.conv2d_transpose(
dec_i2, 3, [5, 5], stride=2, activation_fn=tf.nn.tanh)
outputs = dict()
outputs['images'] = _postprocess_im(dec_i3)
outputs['masks'] = dec_m3
return outputs
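# Illustrative usage sketch (not part of the original file). `_Params` is a
# hypothetical stand-in for the real hyperparameter object; it only needs to
# expose f_dim and fc_dim here. Three stride-2 transposed convolutions
# upsample the 8x8 feature map by 8x, so the outputs are 64x64.
if __name__ == '__main__':
  import collections
  _Params = collections.namedtuple('Params', ['f_dim', 'fc_dim'])
  _identities = tf.zeros([4, 512])
  _poses = tf.zeros([4, 512])
  _outputs = model(_identities, _poses, _Params(f_dim=64, fc_dim=1024),
                   is_training=True)
  # _outputs['images']: [4, 64, 64, 3]; _outputs['masks']: [4, 64, 64, 1]
  print(_outputs['images'].get_shape(), _outputs['masks'].get_shape())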
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Creates rotator network model.
This model performs the out-of-plane rotations given input image and action.
The action is either no-op, rotate clockwise or rotate counter-clockwise.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def bilinear(input_x, input_y, output_size):
"""Define the bilinear transformation layer."""
shape_x = input_x.get_shape().as_list()
shape_y = input_y.get_shape().as_list()
weights_initializer = tf.truncated_normal_initializer(stddev=0.02,
seed=1)
biases_initializer = tf.constant_initializer(0.0)
matrix = tf.get_variable("Matrix", [shape_x[1], shape_y[1], output_size],
tf.float32, initializer=weights_initializer)
bias = tf.get_variable("Bias", [output_size],
initializer=biases_initializer)
# Add to GraphKeys.MODEL_VARIABLES
tf.contrib.framework.add_model_variable(matrix)
tf.contrib.framework.add_model_variable(bias)
# Define the transformation
h0 = tf.matmul(input_x, tf.reshape(matrix,
[shape_x[1], shape_y[1]*output_size]))
h0 = tf.reshape(h0, [-1, shape_y[1], output_size])
h1 = tf.tile(tf.reshape(input_y, [-1, shape_y[1], 1]),
[1, 1, output_size])
h1 = tf.multiply(h0, h1)
return tf.reduce_sum(h1, 1) + bias
def model(poses, actions, params, is_training):
"""Model for performing rotation."""
del is_training # Unused
return bilinear(poses, actions, params.z_dim)
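# Reference check (illustrative): the bilinear layer above computes
# out[b, o] = sum_{i, j} x[b, i] * M[i, j, o] * y[b, j] + bias[o],
# i.e. a NumPy einsum('bi,ijo,bj->bo', x, M, y) + bias.
if __name__ == '__main__':
  import numpy as np
  _x = np.random.randn(2, 3).astype(np.float32)     # e.g. pose embeddings
  _y = np.random.randn(2, 4).astype(np.float32)     # e.g. one-hot actions
  _m = np.random.randn(3, 4, 5).astype(np.float32)  # the `Matrix` variable
  _b = np.zeros(5, np.float32)                      # the `Bias` variable
  _out = np.einsum('bi,ijo,bj->bo', _x, _m, _y) + _b
  print(_out.shape)  # (2, 5)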
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training decoder as used in PTN (NIPS16)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
@tf.contrib.framework.add_arg_scope
def conv3d_transpose(inputs,
num_outputs,
kernel_size,
stride=1,
padding='SAME',
activation_fn=tf.nn.relu,
weights_initializer=tf.contrib.layers.xavier_initializer(),
biases_initializer=tf.zeros_initializer(),
reuse=None,
trainable=True,
scope=None):
"""Wrapper for conv3d_transpose layer.
This function wraps the tf.conv3d_transpose with basic non-linearity.
Tt creates a variable called `weights`, representing the kernel,
that is convoled with the input. A second varibale called `biases'
is added to the result of operation.
"""
with tf.variable_scope(
scope, 'Conv3d_transpose', [inputs], reuse=reuse):
dtype = inputs.dtype.base_dtype
kernel_d, kernel_h, kernel_w = kernel_size[0:3]
num_filters_in = inputs.get_shape()[4]
weights_shape = [kernel_d, kernel_h, kernel_w, num_outputs, num_filters_in]
weights = tf.get_variable('weights',
shape=weights_shape,
dtype=dtype,
initializer=weights_initializer,
trainable=trainable)
tf.contrib.framework.add_model_variable(weights)
input_shape = inputs.get_shape().as_list()
batch_size = input_shape[0]
depth = input_shape[1]
height = input_shape[2]
width = input_shape[3]
def get_deconv_dim(dim_size, stride_size):
# Only support padding='SAME'.
if isinstance(dim_size, tf.Tensor):
dim_size = tf.multiply(dim_size, stride_size)
elif dim_size is not None:
dim_size *= stride_size
return dim_size
out_depth = get_deconv_dim(depth, stride)
out_height = get_deconv_dim(height, stride)
out_width = get_deconv_dim(width, stride)
out_shape = [batch_size, out_depth, out_height, out_width, num_outputs]
outputs = tf.nn.conv3d_transpose(inputs, weights, out_shape,
[1, stride, stride, stride, 1],
padding=padding)
outputs.set_shape(out_shape)
if biases_initializer is not None:
biases = tf.get_variable('biases',
shape=[num_outputs,],
dtype=dtype,
initializer=biases_initializer,
trainable=trainable)
tf.contrib.framework.add_model_variable(biases)
outputs = tf.nn.bias_add(outputs, biases)
if activation_fn:
outputs = activation_fn(outputs)
return outputs
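# Shape example (illustrative): with padding='SAME' and stride=2, an input of
# shape [batch, 4, 4, 4, in_ch] yields [batch, 8, 8, 8, num_outputs], since
# get_deconv_dim multiplies every spatial dimension by the stride.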
def model(identities, params, is_training):
"""Model transforming embedding to voxels."""
del is_training # Unused
f_dim = params.f_dim
  # Please refer to the original implementation: github.com/xcyan/nips16_PTN
  # This TF replication uses a slightly different architecture.
with slim.arg_scope(
[slim.fully_connected, conv3d_transpose],
weights_initializer=tf.truncated_normal_initializer(stddev=0.02, seed=1)):
h0 = slim.fully_connected(
identities, 4 * 4 * 4 * f_dim * 8, activation_fn=tf.nn.relu)
h1 = tf.reshape(h0, [-1, 4, 4, 4, f_dim * 8])
h1 = conv3d_transpose(
h1, f_dim * 4, [4, 4, 4], stride=2, activation_fn=tf.nn.relu)
h2 = conv3d_transpose(
h1, int(f_dim * 3 / 2), [5, 5, 5], stride=2, activation_fn=tf.nn.relu)
h3 = conv3d_transpose(
h2, 1, [6, 6, 6], stride=2, activation_fn=tf.nn.sigmoid)
return h3
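# Shape walk-through (illustrative): identities [N, z_dim] -> fully connected
# -> [N, 4*4*4*f_dim*8] -> reshape [N, 4, 4, 4, f_dim*8] -> three stride-2
# conv3d_transpose layers: [N, 8, 8, 8, f_dim*4] -> [N, 16, 16, 16,
# int(f_dim*3/2)] -> [N, 32, 32, 32, 1], a sigmoid occupancy grid matching
# the default vox_size of 32.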
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains training plan for the Rotator model (Pretraining in NIPS16)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
from tensorflow import app
import model_rotator as model
flags = tf.app.flags
slim = tf.contrib.slim
flags.DEFINE_string('inp_dir', '',
'Directory path containing the input data (tfrecords).')
flags.DEFINE_string(
'dataset_name', 'shapenet_chair',
'Dataset name that is to be used for training and evaluation.')
flags.DEFINE_integer('z_dim', 512, '')
flags.DEFINE_integer('a_dim', 3, '')
flags.DEFINE_integer('f_dim', 64, '')
flags.DEFINE_integer('fc_dim', 1024, '')
flags.DEFINE_integer('num_views', 24, 'Num of viewpoints in the input data.')
flags.DEFINE_integer('image_size', 64,
'Input images dimension (pixels) - width & height.')
flags.DEFINE_integer('step_size', 1, 'Steps to take for rotation in pretraining.')
flags.DEFINE_integer('batch_size', 32, 'Batch size for training.')
flags.DEFINE_string('encoder_name', 'ptn_encoder',
'Name of the encoder network being used.')
flags.DEFINE_string('decoder_name', 'ptn_im_decoder',
'Name of the decoder network being used.')
flags.DEFINE_string('rotator_name', 'ptn_rotator',
'Name of the rotator network being used.')
# Save options
flags.DEFINE_string('checkpoint_dir', '/tmp/ptn_train/',
'Directory path for saving trained models and other data.')
flags.DEFINE_string('model_name', 'deeprotator_pretrain',
'Name of the model used in naming the TF job. Must be different for each run.')
flags.DEFINE_string('init_model', None,
'Checkpoint path of the model to initialize with.')
flags.DEFINE_integer('save_every', 1000,
'Average period of steps after which we save a model.')
# Optimization
flags.DEFINE_float('image_weight', 10, 'Weighting factor for image loss.')
flags.DEFINE_float('mask_weight', 1, 'Weighting factor for mask loss.')
flags.DEFINE_float('learning_rate', 0.0001, 'Learning rate.')
flags.DEFINE_float('weight_decay', 0.001, 'Weight decay parameter while training.')
flags.DEFINE_float('clip_gradient_norm', 0, 'Gradient clip norm; leave 0 for no gradient clipping.')
flags.DEFINE_integer('max_number_of_steps', 320000, 'Maximum number of steps for training.')
# Summary
flags.DEFINE_integer('save_summaries_secs', 15, 'Seconds interval for dumping TF summaries.')
flags.DEFINE_integer('save_interval_secs', 60 * 5, 'Seconds interval to save models.')
# Distribution
flags.DEFINE_string('master', '', 'The address of the tensorflow master if running distributed.')
flags.DEFINE_bool('sync_replicas', False, 'Whether to sync gradients between replicas for optimizer.')
flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas (train tasks).')
flags.DEFINE_integer('backup_workers', 0, 'Number of backup workers.')
flags.DEFINE_integer('ps_tasks', 0, 'Number of ps tasks.')
flags.DEFINE_integer('task', 0,
                     'Task identifier flag to be set for each task running in a distributed setup. Task number 0 '
'will be chosen as the chief.')
FLAGS = flags.FLAGS
def main(_):
train_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name, 'train')
save_image_dir = os.path.join(train_dir, 'images')
if not os.path.exists(train_dir):
os.makedirs(train_dir)
if not os.path.exists(save_image_dir):
os.makedirs(save_image_dir)
g = tf.Graph()
with g.as_default():
with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
global_step = slim.get_or_create_global_step()
##########
## data ##
##########
train_data = model.get_inputs(
FLAGS.inp_dir,
FLAGS.dataset_name,
'train',
FLAGS.batch_size,
FLAGS.image_size,
is_training=True)
inputs = model.preprocess(train_data, FLAGS.step_size)
###########
## model ##
###########
model_fn = model.get_model_fn(FLAGS, is_training=True)
outputs = model_fn(inputs)
##########
## loss ##
##########
task_loss = model.get_loss(inputs, outputs, FLAGS)
regularization_loss = model.get_regularization_loss(
['encoder', 'rotator', 'decoder'], FLAGS)
loss = task_loss + regularization_loss
###############
## optimizer ##
###############
optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
if FLAGS.sync_replicas:
optimizer = tf.train.SyncReplicasOptimizer(
optimizer,
          replicas_to_aggregate=FLAGS.worker_replicas - FLAGS.backup_workers,
total_num_replicas=FLAGS.worker_replicas)
##############
## train_op ##
##############
train_op = model.get_train_op_for_scope(
loss, optimizer, ['encoder', 'rotator', 'decoder'], FLAGS)
###########
## saver ##
###########
saver = tf.train.Saver(max_to_keep=np.minimum(5,
FLAGS.worker_replicas + 1))
if FLAGS.task == 0:
val_data = model.get_inputs(
FLAGS.inp_dir,
FLAGS.dataset_name,
'val',
FLAGS.batch_size,
FLAGS.image_size,
is_training=False)
val_inputs = model.preprocess(val_data, FLAGS.step_size)
# Note: don't compute loss here
reused_model_fn = model.get_model_fn(
FLAGS, is_training=False, reuse=True)
val_outputs = reused_model_fn(val_inputs)
with tf.device(tf.DeviceSpec(device_type='CPU')):
if FLAGS.step_size == 1:
vis_input_images = val_inputs['images_0'] * 255.0
vis_output_images = val_inputs['images_1'] * 255.0
vis_pred_images = val_outputs['images_1'] * 255.0
vis_pred_masks = (val_outputs['masks_1'] * (-1) + 1) * 255.0
else:
rep_times = int(np.ceil(32.0 / float(FLAGS.step_size)))
vis_list_1 = []
vis_list_2 = []
vis_list_3 = []
vis_list_4 = []
for j in xrange(rep_times):
for k in xrange(FLAGS.step_size):
                vis_input_image = val_inputs['images_0'][j]
vis_output_image = val_inputs['images_%d' % (k + 1)][j]
vis_pred_image = val_outputs['images_%d' % (k + 1)][j]
vis_pred_mask = val_outputs['masks_%d' % (k + 1)][j]
vis_list_1.append(tf.expand_dims(vis_input_image, 0))
vis_list_2.append(tf.expand_dims(vis_output_image, 0))
vis_list_3.append(tf.expand_dims(vis_pred_image, 0))
vis_list_4.append(tf.expand_dims(vis_pred_mask, 0))
vis_list_1 = tf.reshape(
tf.stack(vis_list_1), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 3
])
vis_list_2 = tf.reshape(
tf.stack(vis_list_2), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 3
])
vis_list_3 = tf.reshape(
tf.stack(vis_list_3), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 3
])
vis_list_4 = tf.reshape(
tf.stack(vis_list_4), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 1
])
vis_input_images = vis_list_1 * 255.0
vis_output_images = vis_list_2 * 255.0
vis_pred_images = vis_list_3 * 255.0
vis_pred_masks = (vis_list_4 * (-1) + 1) * 255.0
write_disk_op = model.write_disk_grid(
global_step=global_step,
summary_freq=FLAGS.save_every,
log_dir=save_image_dir,
input_images=vis_input_images,
output_images=vis_output_images,
pred_images=vis_pred_images,
pred_masks=vis_pred_masks)
with tf.control_dependencies([write_disk_op]):
train_op = tf.identity(train_op)
#############
## init_fn ##
#############
      init_fn = model.get_init_fn(['encoder', 'rotator', 'decoder'], FLAGS)
##############
## training ##
##############
slim.learning.train(
train_op=train_op,
logdir=train_dir,
init_fn=init_fn,
master=FLAGS.master,
is_chief=(FLAGS.task == 0),
number_of_steps=FLAGS.max_number_of_steps,
saver=saver,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs)
if __name__ == '__main__':
app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains training plan for the Im2vox model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
from tensorflow import app
import model_ptn
flags = tf.app.flags
slim = tf.contrib.slim
flags.DEFINE_string('inp_dir',
'',
'Directory path containing the input data (tfrecords).')
flags.DEFINE_string(
'dataset_name', 'shapenet_chair',
'Dataset name that is to be used for training and evaluation.')
flags.DEFINE_integer('z_dim', 512, '')
flags.DEFINE_integer('f_dim', 64, '')
flags.DEFINE_integer('fc_dim', 1024, '')
flags.DEFINE_integer('num_views', 24, 'Num of viewpoints in the input data.')
flags.DEFINE_integer('image_size', 64,
'Input images dimension (pixels) - width & height.')
flags.DEFINE_integer('vox_size', 32, 'Voxel prediction dimension.')
flags.DEFINE_integer('step_size', 24, 'Steps to take in rotation to fetch viewpoints.')
flags.DEFINE_integer('batch_size', 1, 'Batch size while training.')
flags.DEFINE_float('focal_length', 0.866, 'Focal length parameter used in perspective projection.')
flags.DEFINE_float('focal_range', 1.732, 'Focal range parameter used in perspective projection.')
flags.DEFINE_string('encoder_name', 'ptn_encoder',
'Name of the encoder network being used.')
flags.DEFINE_string('decoder_name', 'ptn_vox_decoder',
'Name of the decoder network being used.')
flags.DEFINE_string('projector_name', 'perspective_projector',
'Name of the projector network being used.')
# Save options
flags.DEFINE_string('checkpoint_dir', '/tmp/ptn_train/',
'Directory path for saving trained models and other data.')
flags.DEFINE_string('model_name', 'ptn_finetune',
'Name of the model used in naming the TF job. Must be different for each run.')
flags.DEFINE_string('init_model', None,
'Checkpoint path of the model to initialize with.')
flags.DEFINE_integer('save_every', 1000,
'Average period of steps after which we save a model.')
# Optimization
flags.DEFINE_float('proj_weight', 10, 'Weighting factor for projection loss.')
flags.DEFINE_float('volume_weight', 0, 'Weighting factor for volume loss.')
flags.DEFINE_float('viewpoint_weight', 1, 'Weighting factor for viewpoint loss.')
flags.DEFINE_float('learning_rate', 0.0001, 'Learning rate.')
flags.DEFINE_float('weight_decay', 0.001, 'Weight decay parameter while training.')
flags.DEFINE_float('clip_gradient_norm', 0, 'Gradient clip norm; leave 0 for no gradient clipping.')
flags.DEFINE_integer('max_number_of_steps', 10000, 'Maximum number of steps for training.')
# Summary
flags.DEFINE_integer('save_summaries_secs', 15, 'Seconds interval for dumping TF summaries.')
flags.DEFINE_integer('save_interval_secs', 60 * 5, 'Seconds interval to save models.')
# Scheduling
flags.DEFINE_string('master', '', 'The address of the tensorflow master.')
flags.DEFINE_bool('sync_replicas', False, 'Whether to sync gradients between replicas for optimizer.')
flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas (train tasks).')
flags.DEFINE_integer('backup_workers', 0, 'Number of backup workers.')
flags.DEFINE_integer('ps_tasks', 0, 'Number of ps tasks.')
flags.DEFINE_integer('task', 0,
                     'Task identifier flag to be set for each task running in a distributed setup. Task number 0 '
'will be chosen as the chief.')
FLAGS = flags.FLAGS
def main(_):
train_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name, 'train')
save_image_dir = os.path.join(train_dir, 'images')
if not os.path.exists(train_dir):
os.makedirs(train_dir)
if not os.path.exists(save_image_dir):
os.makedirs(save_image_dir)
g = tf.Graph()
with g.as_default():
with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
global_step = slim.get_or_create_global_step()
###########
## model ##
###########
model = model_ptn.model_PTN(FLAGS)
##########
## data ##
##########
train_data = model.get_inputs(
FLAGS.inp_dir,
FLAGS.dataset_name,
'train',
FLAGS.batch_size,
FLAGS.image_size,
FLAGS.vox_size,
is_training=True)
inputs = model.preprocess(train_data, FLAGS.step_size)
##############
## model_fn ##
##############
model_fn = model.get_model_fn(
is_training=True, reuse=False, run_projection=True)
outputs = model_fn(inputs)
##################
## train_scopes ##
##################
if FLAGS.init_model:
train_scopes = ['decoder']
init_scopes = ['encoder']
else:
train_scopes = ['encoder', 'decoder']
##########
## loss ##
##########
task_loss = model.get_loss(inputs, outputs)
regularization_loss = model.get_regularization_loss(train_scopes)
loss = task_loss + regularization_loss
###############
## optimizer ##
###############
optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
if FLAGS.sync_replicas:
optimizer = tf.train.SyncReplicasOptimizer(
optimizer,
          replicas_to_aggregate=FLAGS.worker_replicas - FLAGS.backup_workers,
total_num_replicas=FLAGS.worker_replicas)
##############
## train_op ##
##############
train_op = model.get_train_op_for_scope(loss, optimizer, train_scopes)
###########
## saver ##
###########
saver = tf.train.Saver(max_to_keep=np.minimum(5,
FLAGS.worker_replicas + 1))
if FLAGS.task == 0:
params = FLAGS
params.batch_size = params.num_views
params.step_size = 1
model.set_params(params)
val_data = model.get_inputs(
params.inp_dir,
params.dataset_name,
'val',
params.batch_size,
params.image_size,
params.vox_size,
is_training=False)
val_inputs = model.preprocess(val_data, params.step_size)
# Note: don't compute loss here
reused_model_fn = model.get_model_fn(is_training=False, reuse=True)
val_outputs = reused_model_fn(val_inputs)
with tf.device(tf.DeviceSpec(device_type='CPU')):
vis_input_images = val_inputs['images_1'] * 255.0
vis_gt_projs = (val_outputs['masks_1'] * (-1) + 1) * 255.0
vis_pred_projs = (val_outputs['projs_1'] * (-1) + 1) * 255.0
vis_gt_projs = tf.concat([vis_gt_projs] * 3, axis=3)
vis_pred_projs = tf.concat([vis_pred_projs] * 3, axis=3)
# rescale
new_size = [FLAGS.image_size] * 2
vis_gt_projs = tf.image.resize_nearest_neighbor(
vis_gt_projs, new_size)
vis_pred_projs = tf.image.resize_nearest_neighbor(
vis_pred_projs, new_size)
# flip
# vis_gt_projs = utils.image_flipud(vis_gt_projs)
# vis_pred_projs = utils.image_flipud(vis_pred_projs)
# vis_gt_projs is of shape [batch, height, width, channels]
write_disk_op = model.write_disk_grid(
global_step=global_step,
log_dir=save_image_dir,
input_images=vis_input_images,
gt_projs=vis_gt_projs,
pred_projs=vis_pred_projs,
input_voxels=val_inputs['voxels'],
output_voxels=val_outputs['voxels_1'])
with tf.control_dependencies([write_disk_op]):
train_op = tf.identity(train_op)
#############
## init_fn ##
#############
if FLAGS.init_model:
init_fn = model.get_init_fn(init_scopes)
else:
init_fn = None
##############
## training ##
##############
slim.learning.train(
train_op=train_op,
logdir=train_dir,
init_fn=init_fn,
master=FLAGS.master,
is_chief=(FLAGS.task == 0),
number_of_steps=FLAGS.max_number_of_steps,
saver=saver,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs)
if __name__ == '__main__':
app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import StringIO
from matplotlib import pylab as p
# axes3d is being used implicitly for visualization.
from mpl_toolkits.mplot3d import axes3d as p3 # pylint:disable=unused-import
import numpy as np
from PIL import Image
from skimage import measure
import tensorflow as tf
def save_image(inp_array, image_file):
"""Function that dumps the image to disk."""
inp_array = np.clip(inp_array, 0, 255).astype(np.uint8)
image = Image.fromarray(inp_array)
buf = StringIO.StringIO()
image.save(buf, format='JPEG')
with open(image_file, 'w') as f:
f.write(buf.getvalue())
return None
def image_flipud(images):
"""Function that flip (up-down) the np image."""
quantity = images.get_shape().as_list()[0]
image_list = []
for k in xrange(quantity):
image_list.append(tf.image.flip_up_down(images[k, :, :, :]))
outputs = tf.stack(image_list)
return outputs
def resize_image(inp_array, new_height, new_width):
"""Function that resize the np image."""
inp_array = np.clip(inp_array, 0, 255).astype(np.uint8)
image = Image.fromarray(inp_array)
# Reverse order
image = image.resize((new_width, new_height))
return np.array(image)
def display_voxel(points, vis_size=128):
"""Function to display 3D voxel."""
try:
data = visualize_voxel_spectral(points, vis_size)
except ValueError:
data = visualize_voxel_scatter(points, vis_size)
return data
def visualize_voxel_spectral(points, vis_size=128):
"""Function to visualize voxel (spectral)."""
points = np.rint(points)
points = np.swapaxes(points, 0, 2)
fig = p.figure(figsize=(1, 1), dpi=vis_size)
verts, faces = measure.marching_cubes(points, 0, spacing=(0.1, 0.1, 0.1))
ax = fig.add_subplot(111, projection='3d')
ax.plot_trisurf(
verts[:, 0], verts[:, 1], faces, verts[:, 2], cmap='Spectral_r', lw=0.1)
ax.set_axis_off()
fig.tight_layout(pad=0)
fig.canvas.draw()
data = np.fromstring(
fig.canvas.tostring_rgb(), dtype=np.uint8, sep='').reshape(
vis_size, vis_size, 3)
p.close('all')
return data
def visualize_voxel_scatter(points, vis_size=128):
"""Function to visualize voxel (scatter)."""
points = np.rint(points)
points = np.swapaxes(points, 0, 2)
fig = p.figure(figsize=(1, 1), dpi=vis_size)
ax = fig.add_subplot(111, projection='3d')
x = []
y = []
z = []
(x_dimension, y_dimension, z_dimension) = points.shape
for i in range(x_dimension):
for j in range(y_dimension):
for k in range(z_dimension):
if points[i, j, k]:
x.append(i)
y.append(j)
z.append(k)
ax.scatter3D(x, y, z)
ax.set_axis_off()
fig.tight_layout(pad=0)
fig.canvas.draw()
data = np.fromstring(
fig.canvas.tostring_rgb(), dtype=np.uint8, sep='').reshape(
vis_size, vis_size, 3)
p.close('all')
return data
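# Usage sketch (illustrative): render a random occupancy grid and dump it to
# disk. display_voxel falls back to the scatter plot when marching cubes
# raises a ValueError.
if __name__ == '__main__':
  _voxels = (np.random.rand(32, 32, 32) > 0.95).astype(np.float32)
  save_image(display_voxel(_voxels, vis_size=128), '/tmp/voxel_vis.jpg')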
# Module networks for question answering on a knowledge graph
This code repository contains a TensorFlow model for question answering on a
knowledge graph with end-to-end module networks. The original paper describing
end-to-end module networks is as follows.
R. Hu, J. Andreas, M. Rohrbach, T. Darrell, K. Saenko, *Learning to Reason:
End-to-End Module Networks for Visual Question Answering*. in arXiv preprint
arXiv:1704.05526, 2017. ([PDF](https://arxiv.org/pdf/1704.05526.pdf))
```
@article{hu2017learning,
title={Learning to Reason: End-to-End Module Networks for Visual Question Answering},
author={Hu, Ronghang and Andreas, Jacob and Rohrbach, Marcus and Darrell, Trevor and Saenko, Kate},
journal={arXiv preprint arXiv:1704.05526},
year={2017}
}
```
The code in this repository is based on the original
[implementation](https://github.com/ronghanghu/n2nmn) for this paper.
## Requirements
1. Install TensorFlow 1.0.0. Follow the [official
guide](https://www.tensorflow.org/install/). Please note that newer or older
versions of TensorFlow may fail to work due to incompatibility with
TensorFlow Fold.
2. Install TensorFlow Fold. Follow the
[setup instructions](https://github.com/tensorflow/fold/blob/master/tensorflow_fold/g3doc/setup.md).
   TensorFlow Fold only supports the Linux platform. We have not tested
   the code on other platforms.
## Data
1. Download the [MetaQA dataset](https://goo.gl/f3AmcY). Click the button
   `MetaQA` and then click `Download` in the drop-down list. Extract the zip
   file after the download completes. Read the documentation there for dataset
   details.
2. Move the `MetaQA` folder to the root directory of this repository.
## How to use this code
We provide an experiment folder `exp_1_hop`, which applies the implemented model
to the 1-hop vanilla dataset in MetaQA. More experiment folders are coming soon.
Currently, we provide code for training with ground truth layout, and testing
the saved model. Configurations can be modified in `config.py`. They can also be
set via command line parameters.
To train the model:
```
python exp_1_hop/train_gt_layout.py
```
To test a saved model (the snapshot name must be provided):
```
python exp_1_hop/test.py --snapshot_name 00010000
```
## Model introduction
1. In this model, we store the knowledge graph in a key-value based memory. For
each knowledge graph edge (subject, relation, object), we use the (subject,
relation) as the key and the object as the value.
2. All entities and relations are embedded as fixed-dimension vectors. These
embeddings are also end-to-end learned.
3. Neural modules can separately operate on either the key side or the value
side.
4. The attention is shared between keys and corresponding values.
5. The answer output is based on the attention-weighted sum over keys or
   values, depending on the output module; a minimal sketch of this key-value
   attention follows below.
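
The sketch below illustrates points 1, 4, and 5 with NumPy. The array names
and dimensions are illustrative only, not this repository's API.

```
import numpy as np

# Toy key-value memory: one slot per KB edge (subject, relation, object),
# with key = embed(subject, relation) and value = embed(object).
n_edges, d = 1000, 128
keys = np.random.randn(n_edges, d).astype(np.float32)
values = np.random.randn(n_edges, d).astype(np.float32)
query = np.random.randn(d).astype(np.float32)  # question representation

# Attention is computed against the keys (softmax over edges)...
logits = keys.dot(query)
att = np.exp(logits - logits.max())
att /= att.sum()

# ...and shared with the corresponding values: the answer readout is the
# attention-weighted sum over the values.
readout = values.T.dot(att)  # shape [d]
```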
## Contact
Authors: Yuyu Zhang, Xin Pan
Pull requests and issues: @yuyuz
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import argparse
import os
def str2bool(v):
return v.lower() in ('true', '1')
def add_argument_group(name):
arg = parser.add_argument_group(name)
arg_lists.append(arg)
return arg
def get_config():
config, unparsed = parser.parse_known_args()
return config, unparsed
arg_lists = []
parser = argparse.ArgumentParser()
work_dir = os.path.abspath(os.path.join(__file__, '../../'))
net_arg = add_argument_group('Network')
net_arg.add_argument('--lstm_dim', type=int, default=128)
net_arg.add_argument('--num_layers', type=int, default=1)
net_arg.add_argument('--embed_dim_txt', type=int, default=128)
net_arg.add_argument('--embed_dim_nmn', type=int, default=128)
net_arg.add_argument(
'--T_encoder', type=int, default=0) # will be updated when reading data
net_arg.add_argument('--T_decoder', type=int, default=5)
train_arg = add_argument_group('Training')
train_arg.add_argument('--train_tag', type=str, default='n2nmn')
train_arg.add_argument('--batch_size', type=int, default=128)
train_arg.add_argument('--max_iter', type=int, default=1000000)
train_arg.add_argument('--weight_decay', type=float, default=1e-5)
train_arg.add_argument('--baseline_decay', type=float, default=0.99)
train_arg.add_argument('--max_grad_norm', type=float, default=10)
train_arg.add_argument('--random_seed', type=int, default=123)
data_arg = add_argument_group('Data')
data_path = work_dir + '/MetaQA/'
data_arg.add_argument('--KB_file', type=str, default=data_path + 'kb.txt')
data_arg.add_argument(
'--data_dir', type=str, default=data_path + '1-hop/vanilla/')
data_arg.add_argument('--train_data_file', type=str, default='qa_train.txt')
data_arg.add_argument('--dev_data_file', type=str, default='qa_dev.txt')
data_arg.add_argument('--test_data_file', type=str, default='qa_test.txt')
exp_arg = add_argument_group('Experiment')
exp_path = work_dir + '/exp_1_hop/'
exp_arg.add_argument('--exp_dir', type=str, default=exp_path)
log_arg = add_argument_group('Log')
log_arg.add_argument('--log_dir', type=str, default='logs')
log_arg.add_argument('--log_interval', type=int, default=1000)
log_arg.add_argument('--num_log_samples', type=int, default=3)
log_arg.add_argument(
'--log_level', type=str, default='INFO', choices=['INFO', 'DEBUG', 'WARN'])
io_arg = add_argument_group('IO')
io_arg.add_argument('--model_dir', type=str, default='model')
io_arg.add_argument('--snapshot_interval', type=int, default=1000)
io_arg.add_argument('--output_dir', type=str, default='output')
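# Typical usage from an experiment script (see exp_1_hop/train_gt_layout.py
# and exp_1_hop/test.py):
#   from config import get_config
#   config, unparsed = get_config()
#   print(config.lstm_dim, config.data_dir)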
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import sys
sys.path.append(os.path.abspath(os.path.join(__file__, '../../')))
import numpy as np
import tensorflow as tf
from config import get_config
from model_n2nmn.assembler import Assembler
from model_n2nmn.model import Model
from util.data_reader import DataReader
from util.data_reader import SampleBuilder
from util.misc import prepare_dirs_and_logger
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('snapshot_name', '00001000', 'snapshot file name')
def main(_):
config = prepare_dirs_and_logger(config_raw)
rng = np.random.RandomState(config.random_seed)
tf.set_random_seed(config.random_seed)
config.rng = rng
config.module_names = ['_key_find', '_key_filter', '_val_desc', '<eos>']
config.gt_layout_tokens = ['_key_find', '_key_filter', '_val_desc', '<eos>']
assembler = Assembler(config)
sample_builder = SampleBuilder(config)
config = sample_builder.config # update T_encoder according to data
data_test = sample_builder.data_all['test']
data_reader_test = DataReader(
config, data_test, assembler, shuffle=False, one_pass=True)
num_vocab_txt = len(sample_builder.dict_all)
num_vocab_nmn = len(assembler.module_names)
num_choices = len(sample_builder.dict_all)
# Network inputs
text_seq_batch = tf.placeholder(tf.int32, [None, None])
seq_len_batch = tf.placeholder(tf.int32, [None])
# The model
model = Model(
config,
sample_builder.kb,
text_seq_batch,
seq_len_batch,
num_vocab_txt=num_vocab_txt,
num_vocab_nmn=num_vocab_nmn,
EOS_idx=assembler.EOS_idx,
num_choices=num_choices,
decoder_sampling=False)
compiler = model.compiler
scores = model.scores
sess = tf.Session()
sess.run(tf.global_variables_initializer())
snapshot_file = os.path.join(config.model_dir, FLAGS.snapshot_name)
tf.logging.info('Snapshot file: %s' % snapshot_file)
snapshot_saver = tf.train.Saver()
snapshot_saver.restore(sess, snapshot_file)
# Evaluation metrics
num_questions = len(data_test.Y)
tf.logging.info('# of test questions: %d' % num_questions)
answer_correct = 0
layout_correct = 0
layout_valid = 0
for batch in data_reader_test.batches():
# set up input and output tensors
h = sess.partial_run_setup(
fetches=[model.predicted_tokens, scores],
feeds=[text_seq_batch, seq_len_batch, compiler.loom_input_tensor])
# Part 1: Generate module layout
tokens = sess.partial_run(
h,
fetches=model.predicted_tokens,
feed_dict={
text_seq_batch: batch['input_seq_batch'],
seq_len_batch: batch['seq_len_batch']
})
# Compute accuracy of the predicted layout
gt_tokens = batch['gt_layout_batch']
layout_correct += np.sum(
np.all(
np.logical_or(tokens == gt_tokens, gt_tokens == assembler.EOS_idx),
axis=0))
# Assemble the layout tokens into network structure
expr_list, expr_validity_array = assembler.assemble(tokens)
layout_valid += np.sum(expr_validity_array)
labels = batch['ans_label_batch']
# Build TensorFlow Fold input for NMN
expr_feed = compiler.build_feed_dict(expr_list)
    # Part 2: Run the NMN and compute answer scores
scores_val = sess.partial_run(h, scores, feed_dict=expr_feed)
# Compute accuracy
predictions = np.argmax(scores_val, axis=1)
answer_correct += np.sum(
np.logical_and(expr_validity_array, predictions == labels))
answer_accuracy = answer_correct * 1.0 / num_questions
layout_accuracy = layout_correct * 1.0 / num_questions
layout_validity = layout_valid * 1.0 / num_questions
tf.logging.info('test answer accuracy = %f, '
'test layout accuracy = %f, '
'test layout validity = %f' %
(answer_accuracy, layout_accuracy, layout_validity))
if __name__ == '__main__':
config_raw, unparsed = get_config()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
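# Note on the two-phase execution above (illustrative): tf.Session.partial_run
# lets Part 1 (layout prediction) and Part 2 (NMN scoring) share one graph
# execution, so the Fold loom input built from Part 1's predicted tokens can
# be fed into the same step. A minimal standalone sketch:
#   a = tf.placeholder(tf.float32, [])
#   b = tf.placeholder(tf.float32, [])
#   c = a + 1.0
#   d = c * b
#   h = sess.partial_run_setup(fetches=[c, d], feeds=[a, b])
#   c_val = sess.partial_run(h, c, feed_dict={a: 1.0})  # phase 1
#   d_val = sess.partial_run(h, d, feed_dict={b: 2.0})  # phase 2 -> 4.0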
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import sys
sys.path.append(os.path.abspath(os.path.join(__file__, '../../')))
import numpy as np
import tensorflow as tf
from config import get_config
from model_n2nmn.assembler import Assembler
from model_n2nmn.model import Model
from util.data_reader import DataReader
from util.data_reader import SampleBuilder
from util.misc import prepare_dirs_and_logger
from util.misc import save_config
from util.misc import show_all_variables
def main(_):
config = prepare_dirs_and_logger(config_raw)
save_config(config)
rng = np.random.RandomState(config.random_seed)
tf.set_random_seed(config.random_seed)
config.rng = rng
config.module_names = ['_key_find', '_key_filter', '_val_desc', '<eos>']
config.gt_layout_tokens = ['_key_find', '_key_filter', '_val_desc', '<eos>']
assembler = Assembler(config)
sample_builder = SampleBuilder(config)
config = sample_builder.config # update T_encoder according to data
data_train = sample_builder.data_all['train']
data_reader_train = DataReader(
config, data_train, assembler, shuffle=True, one_pass=False)
num_vocab_txt = len(sample_builder.dict_all)
num_vocab_nmn = len(assembler.module_names)
num_choices = len(sample_builder.dict_all)
# Network inputs
text_seq_batch = tf.placeholder(tf.int32, [None, None])
seq_len_batch = tf.placeholder(tf.int32, [None])
ans_label_batch = tf.placeholder(tf.int32, [None])
use_gt_layout = tf.constant(True, dtype=tf.bool)
gt_layout_batch = tf.placeholder(tf.int32, [None, None])
# The model for training
model = Model(
config,
sample_builder.kb,
text_seq_batch,
seq_len_batch,
num_vocab_txt=num_vocab_txt,
num_vocab_nmn=num_vocab_nmn,
EOS_idx=assembler.EOS_idx,
num_choices=num_choices,
decoder_sampling=True,
use_gt_layout=use_gt_layout,
gt_layout_batch=gt_layout_batch)
compiler = model.compiler
scores = model.scores
log_seq_prob = model.log_seq_prob
# Loss function
softmax_loss_per_sample = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=scores, labels=ans_label_batch)
  # The final per-sample loss: softmax loss for valid expressions and
  # invalid_expr_loss for invalid ones. Here all expressions are valid
  # (ground-truth layouts), so it is just the softmax loss.
  final_loss_per_sample = softmax_loss_per_sample
avg_sample_loss = tf.reduce_mean(final_loss_per_sample)
seq_likelihood_loss = tf.reduce_mean(-log_seq_prob)
total_training_loss = seq_likelihood_loss + avg_sample_loss
total_loss = total_training_loss + config.weight_decay * model.l2_reg
# Train with Adam optimizer
solver = tf.train.AdamOptimizer()
gradients = solver.compute_gradients(total_loss)
# Clip gradient by L2 norm
gradients = [(tf.clip_by_norm(g, config.max_grad_norm), v)
for g, v in gradients]
solver_op = solver.apply_gradients(gradients)
# Training operation
with tf.control_dependencies([solver_op]):
train_step = tf.constant(0)
# Write summary to TensorBoard
log_writer = tf.summary.FileWriter(config.log_dir, tf.get_default_graph())
loss_ph = tf.placeholder(tf.float32, [])
entropy_ph = tf.placeholder(tf.float32, [])
accuracy_ph = tf.placeholder(tf.float32, [])
summary_train = [
tf.summary.scalar('avg_sample_loss', loss_ph),
tf.summary.scalar('entropy', entropy_ph),
tf.summary.scalar('avg_accuracy', accuracy_ph)
]
log_step_train = tf.summary.merge(summary_train)
# Training
sess = tf.Session()
sess.run(tf.global_variables_initializer())
snapshot_saver = tf.train.Saver(max_to_keep=None) # keep all snapshots
show_all_variables()
avg_accuracy = 0
accuracy_decay = 0.99
for n_iter, batch in enumerate(data_reader_train.batches()):
if n_iter >= config.max_iter:
break
# set up input and output tensors
h = sess.partial_run_setup(
fetches=[
model.predicted_tokens, model.entropy_reg, scores, avg_sample_loss,
train_step
],
feeds=[
text_seq_batch, seq_len_batch, gt_layout_batch,
compiler.loom_input_tensor, ans_label_batch
])
# Part 1: Generate module layout
tokens, entropy_reg_val = sess.partial_run(
h,
fetches=(model.predicted_tokens, model.entropy_reg),
feed_dict={
text_seq_batch: batch['input_seq_batch'],
seq_len_batch: batch['seq_len_batch'],
gt_layout_batch: batch['gt_layout_batch']
})
# Assemble the layout tokens into network structure
expr_list, expr_validity_array = assembler.assemble(tokens)
# all exprs should be valid (since they are ground-truth)
assert np.all(expr_validity_array)
labels = batch['ans_label_batch']
# Build TensorFlow Fold input for NMN
expr_feed = compiler.build_feed_dict(expr_list)
expr_feed[ans_label_batch] = labels
# Part 2: Run NMN and learning steps
scores_val, avg_sample_loss_val, _ = sess.partial_run(
h, fetches=(scores, avg_sample_loss, train_step), feed_dict=expr_feed)
# Compute accuracy
predictions = np.argmax(scores_val, axis=1)
accuracy = np.mean(
np.logical_and(expr_validity_array, predictions == labels))
avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy)
# Add to TensorBoard summary
if (n_iter + 1) % config.log_interval == 0:
tf.logging.info('iter = %d\n\t'
'loss = %f, accuracy (cur) = %f, '
'accuracy (avg) = %f, entropy = %f' %
(n_iter + 1, avg_sample_loss_val, accuracy, avg_accuracy,
-entropy_reg_val))
summary = sess.run(
fetches=log_step_train,
feed_dict={
loss_ph: avg_sample_loss_val,
entropy_ph: -entropy_reg_val,
accuracy_ph: avg_accuracy
})
log_writer.add_summary(summary, n_iter + 1)
# Save snapshot
if (n_iter + 1) % config.snapshot_interval == 0:
snapshot_file = os.path.join(config.model_dir, '%08d' % (n_iter + 1))
snapshot_saver.save(sess, snapshot_file, write_meta_graph=False)
tf.logging.info('Snapshot saved to %s' % snapshot_file)
tf.logging.info('Run finished.')
if __name__ == '__main__':
config_raw, unparsed = get_config()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
# the number of attention inputs to each module
_module_input_num = {
'_key_find': 0,
'_key_filter': 1,
'_val_desc': 1}
_module_output_type = {
'_key_find': 'att',
'_key_filter': 'att',
'_val_desc': 'ans'
}
INVALID_EXPR = 'INVALID_EXPR'
class Assembler:
def __init__(self, config):
# read the module list, and record the index of each module and <eos>
self.module_names = config.module_names
# find the index of <eos>
for n_s in range(len(self.module_names)):
if self.module_names[n_s] == '<eos>':
self.EOS_idx = n_s
break
# build a dictionary from module name to token index
self.name2idx_dict = {
name: n_s
for n_s, name in enumerate(self.module_names)
}
def module_list2tokens(self, module_list, max_len=None):
layout_tokens = [self.name2idx_dict[name] for name in module_list]
if max_len is not None:
if len(module_list) >= max_len:
raise ValueError('Not enough time steps to add <eos>')
layout_tokens += [self.EOS_idx] * (max_len - len(module_list))
return layout_tokens
def _layout_tokens2str(self, layout_tokens):
return ' '.join([self.module_names[idx] for idx in layout_tokens])
def _invalid_expr(self, layout_tokens, error_str):
return {
'module': INVALID_EXPR,
'expr_str': self._layout_tokens2str(layout_tokens),
'error': error_str
}
def _assemble_layout_tokens(self, layout_tokens, batch_idx):
    # Every module takes a time_idx as an index into the LSTM hidden states
    # (even if it does not need it, like _and) and a module-specific number
    # of attention inputs. The output type can be either attention or answer.
#
# The final assembled expression for each instance is as follows:
# expr_type :=
# {'module': '_find', 'output_type': 'att', 'time_idx': idx}
# | {'module': '_relocate', 'output_type': 'att', 'time_idx': idx,
# 'inputs_0': <expr_type>}
# | {'module': '_and', 'output_type': 'att', 'time_idx': idx,
# 'inputs_0': <expr_type>, 'inputs_1': <expr_type>)}
# | {'module': '_describe', 'output_type': 'ans', 'time_idx': idx,
# 'inputs_0': <expr_type>}
# | {'module': INVALID_EXPR, 'expr_str': '...', 'error': '...',
# 'assembly_loss': <float32>} (for invalid expressions)
#
# A valid layout must contain <eos>. Assembly fails if it doesn't.
if not np.any(layout_tokens == self.EOS_idx):
return self._invalid_expr(layout_tokens, 'cannot find <eos>')
# Decoding Reverse Polish Notation with a stack
decoding_stack = []
for t in range(len(layout_tokens)):
# decode a module/operation
module_idx = layout_tokens[t]
if module_idx == self.EOS_idx:
break
module_name = self.module_names[module_idx]
expr = {
'module': module_name,
'output_type': _module_output_type[module_name],
'time_idx': t,
'batch_idx': batch_idx
}
input_num = _module_input_num[module_name]
      # Check if there are enough inputs on the stack
      if len(decoding_stack) < input_num:
        # Invalid expression: not enough inputs.
return self._invalid_expr(layout_tokens,
'not enough input for ' + module_name)
# Get the input from stack
for n_input in range(input_num - 1, -1, -1):
stack_top = decoding_stack.pop()
if stack_top['output_type'] != 'att':
# Invalid expression. Input must be attention
return self._invalid_expr(layout_tokens,
'input incompatible for ' + module_name)
expr['input_%d' % n_input] = stack_top
decoding_stack.append(expr)
# After decoding the reverse polish expression, there should be exactly
# one expression in the stack
if len(decoding_stack) != 1:
return self._invalid_expr(
layout_tokens,
'final stack size not equal to 1 (%d remains)' % len(decoding_stack))
result = decoding_stack[0]
# The result type should be answer, not attention
if result['output_type'] != 'ans':
return self._invalid_expr(layout_tokens,
'result type must be ans, not att')
return result
def assemble(self, layout_tokens_batch):
# layout_tokens_batch is a numpy array with shape [max_dec_len, batch_size],
# containing module tokens and <eos>, in Reverse Polish Notation.
_, batch_size = layout_tokens_batch.shape
expr_list = [
self._assemble_layout_tokens(layout_tokens_batch[:, batch_i], batch_i)
for batch_i in range(batch_size)
]
expr_validity = np.array(
[expr['module'] != INVALID_EXPR for expr in expr_list], np.bool)
return expr_list, expr_validity
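# Usage sketch (illustrative; `_Config` is a hypothetical stand-in for the
# real config object). The ground-truth layout used in this repository,
# [_key_find, _key_filter, _val_desc, <eos>], is in Reverse Polish Notation:
# _key_find pushes an attention, _key_filter pops one and pushes another,
# and _val_desc pops it and emits the answer scores.
if __name__ == '__main__':
  class _Config(object):
    module_names = ['_key_find', '_key_filter', '_val_desc', '<eos>']
  _asm = Assembler(_Config())
  _tokens = np.array([_asm.module_list2tokens(
      ['_key_find', '_key_filter', '_val_desc'], max_len=4)]).T
  _exprs, _valid = _asm.assemble(_tokens)
  print(_exprs[0]['module'], _valid)  # _val_desc [ True]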
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import tensorflow as tf
import tensorflow_fold as td
from model_n2nmn import netgen_att
from model_n2nmn import assembler
from model_n2nmn.modules import Modules
class Model:
def __init__(self,
config,
kb,
text_seq_batch,
seq_length_batch,
num_vocab_txt,
num_vocab_nmn,
EOS_idx,
num_choices,
decoder_sampling,
use_gt_layout=None,
gt_layout_batch=None,
scope='neural_module_network',
reuse=None):
with tf.variable_scope(scope, reuse=reuse):
# Part 1: Seq2seq RNN to generate module layout tokens
embedding_mat = tf.get_variable(
'embedding_mat', [num_vocab_txt, config.embed_dim_txt],
initializer=tf.contrib.layers.xavier_initializer())
with tf.variable_scope('layout_generation'):
att_seq2seq = netgen_att.AttentionSeq2Seq(
config, text_seq_batch, seq_length_batch, num_vocab_txt,
num_vocab_nmn, EOS_idx, decoder_sampling, embedding_mat,
use_gt_layout, gt_layout_batch)
self.att_seq2seq = att_seq2seq
predicted_tokens = att_seq2seq.predicted_tokens
token_probs = att_seq2seq.token_probs
word_vecs = att_seq2seq.word_vecs
neg_entropy = att_seq2seq.neg_entropy
self.atts = att_seq2seq.atts
self.predicted_tokens = predicted_tokens
self.token_probs = token_probs
self.word_vecs = word_vecs
self.neg_entropy = neg_entropy
# log probability of each generated sequence
self.log_seq_prob = tf.reduce_sum(tf.log(token_probs), axis=0)
# Part 2: Neural Module Network
with tf.variable_scope('layout_execution'):
modules = Modules(config, kb, word_vecs, num_choices, embedding_mat)
self.modules = modules
# Recursion of modules
att_shape = [len(kb)]
# Forward declaration of module recursion
att_expr_decl = td.ForwardDeclaration(td.PyObjectType(),
td.TensorType(att_shape))
# _key_find
case_key_find = td.Record([('time_idx', td.Scalar(dtype='int32')),
('batch_idx', td.Scalar(dtype='int32'))])
case_key_find = case_key_find >> td.ScopedLayer(
modules.KeyFindModule, name_or_scope='KeyFindModule')
# _key_filter
case_key_filter = td.Record([('input_0', att_expr_decl()),
('time_idx', td.Scalar('int32')),
('batch_idx', td.Scalar('int32'))])
case_key_filter = case_key_filter >> td.ScopedLayer(
modules.KeyFilterModule, name_or_scope='KeyFilterModule')
recursion_cases = td.OneOf(
td.GetItem('module'),
{'_key_find': case_key_find,
'_key_filter': case_key_filter})
att_expr_decl.resolve_to(recursion_cases)
# _val_desc: output scores for choice (for valid expressions)
predicted_scores = td.Record([('input_0', recursion_cases),
('time_idx', td.Scalar('int32')),
('batch_idx', td.Scalar('int32'))])
predicted_scores = predicted_scores >> td.ScopedLayer(
modules.ValDescribeModule, name_or_scope='ValDescribeModule')
# For invalid expressions, define a dummy answer
# so that all answers have the same form
INVALID = assembler.INVALID_EXPR
dummy_scores = td.Void() >> td.FromTensor(
np.zeros(num_choices, np.float32))
output_scores = td.OneOf(
td.GetItem('module'),
{'_val_desc': predicted_scores,
INVALID: dummy_scores})
# compile and get the output scores
self.compiler = td.Compiler.create(output_scores)
self.scores = self.compiler.output_tensors[0]
# Regularization: Entropy + L2
self.entropy_reg = tf.reduce_mean(neg_entropy)
module_weights = [
v for v in tf.trainable_variables()
if (scope in v.op.name and v.op.name.endswith('weights'))
]
self.l2_reg = tf.add_n([tf.nn.l2_loss(v) for v in module_weights])
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
class Modules:
def __init__(self, config, kb, word_vecs, num_choices, embedding_mat):
self.config = config
self.embedding_mat = embedding_mat
# kb has shape [N_kb, 3]
self.kb = kb
self.embed_keys_e, self.embed_keys_r, self.embed_vals_e = self.embed_kb()
# word_vecs has shape [T_decoder, N, D_txt]
self.word_vecs = word_vecs
self.num_choices = num_choices
def embed_kb(self):
keys_e, keys_r, vals_e = [], [], []
for idx_sub, idx_rel, idx_obj in self.kb:
keys_e.append(idx_sub)
keys_r.append(idx_rel)
vals_e.append(idx_obj)
embed_keys_e = tf.nn.embedding_lookup(self.embedding_mat, keys_e)
embed_keys_r = tf.nn.embedding_lookup(self.embedding_mat, keys_r)
embed_vals_e = tf.nn.embedding_lookup(self.embedding_mat, vals_e)
return embed_keys_e, embed_keys_r, embed_vals_e
def _slice_word_vecs(self, time_idx, batch_idx):
# this callable will be wrapped into a td.Function
# In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
# time is highest dim in word_vecs
joint_index = tf.stack([time_idx, batch_idx], axis=1)
return tf.gather_nd(self.word_vecs, joint_index)
# All the layers are wrapped with td.ScopedLayer
def KeyFindModule(self,
time_idx,
batch_idx,
scope='KeyFindModule',
reuse=None):
# In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
text_param = self._slice_word_vecs(time_idx, batch_idx)
# Mapping: embed_keys_e x text_param -> att
# Input:
# embed_keys_e: [N_kb, D_txt]
# text_param: [N, D_txt]
# Output:
# att: [N, N_kb]
#
    # Implementation:
    #   1. Inner product (matmul) between text_param and each key embedding
    #   2. L2-normalization over the KB axis
with tf.variable_scope(scope, reuse=reuse):
m = tf.matmul(text_param, self.embed_keys_e, transpose_b=True)
att = tf.nn.l2_normalize(m, dim=1)
return att
def KeyFilterModule(self,
input_0,
time_idx,
batch_idx,
scope='KeyFilterModule',
reuse=None):
att_0 = input_0
text_param = self._slice_word_vecs(time_idx, batch_idx)
# Mapping: and(embed_keys_r x text_param, att) -> att
# Input:
# embed_keys_r: [N_kb, D_txt]
# text_param: [N, D_txt]
# att_0: [N, N_kb]
# Output:
# att: [N, N_kb]
#
    # Implementation:
    #   1. Inner product (matmul) between text_param and each relation embedding
    #   2. L2-normalization over the KB axis
    #   3. Take the elementwise min with the input attention
with tf.variable_scope(scope, reuse=reuse):
m = tf.matmul(text_param, self.embed_keys_r, transpose_b=True)
att_1 = tf.nn.l2_normalize(m, dim=1)
att = tf.minimum(att_0, att_1)
return att
def ValDescribeModule(self,
input_0,
time_idx,
batch_idx,
scope='ValDescribeModule',
reuse=None):
att = input_0
# Mapping: att -> answer probs
# Input:
# embed_vals_e: [N_kb, D_txt]
# att: [N, N_kb]
# embedding_mat: [self.num_choices, D_txt]
# Output:
# answer_scores: [N, self.num_choices]
#
# Implementation:
# 1. Attention-weighted sum over values
# 2. Compute cosine similarity scores between the weighted sum and
# each candidate answer
with tf.variable_scope(scope, reuse=reuse):
# weighted_sum has shape [N, D_txt]
weighted_sum = tf.matmul(att, self.embed_vals_e)
# scores has shape [N, self.num_choices]
scores = tf.matmul(
weighted_sum,
tf.nn.l2_normalize(self.embedding_mat, dim=1),
transpose_b=True)
return scores
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
from util.nn import fc_layer as fc
def _get_lstm_cell(num_layers, lstm_dim):
cell_list = [
tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
for _ in range(num_layers)
]
cell = tf.contrib.rnn.MultiRNNCell(cell_list, state_is_tuple=True)
return cell
class AttentionSeq2Seq:
def __init__(self,
config,
text_seq_batch,
seq_length_batch,
num_vocab_txt,
num_vocab_nmn,
EOS_token,
decoder_sampling,
embedding_mat,
use_gt_layout=None,
gt_layout_batch=None,
scope='encoder_decoder',
reuse=None):
self.T_decoder = config.T_decoder
self.encoder_num_vocab = num_vocab_txt
self.encoder_embed_dim = config.embed_dim_txt
self.decoder_num_vocab = num_vocab_nmn
self.decoder_embed_dim = config.embed_dim_nmn
self.lstm_dim = config.lstm_dim
self.num_layers = config.num_layers
self.EOS_token = EOS_token
self.decoder_sampling = decoder_sampling
self.embedding_mat = embedding_mat
with tf.variable_scope(scope, reuse=reuse):
self._build_encoder(text_seq_batch, seq_length_batch)
self._build_decoder(use_gt_layout, gt_layout_batch)
def _build_encoder(self,
text_seq_batch,
seq_length_batch,
scope='encoder',
reuse=None):
lstm_dim = self.lstm_dim
num_layers = self.num_layers
with tf.variable_scope(scope, reuse=reuse):
T = tf.shape(text_seq_batch)[0]
N = tf.shape(text_seq_batch)[1]
self.T_encoder = T
self.N = N
# text_seq has shape [T, N] and embedded_seq has shape [T, N, D]
embedded_seq = tf.nn.embedding_lookup(self.embedding_mat, text_seq_batch)
self.embedded_input_seq = embedded_seq
# The RNN
cell = _get_lstm_cell(num_layers, lstm_dim)
# encoder_outputs has shape [T, N, lstm_dim]
encoder_outputs, encoder_states = tf.nn.dynamic_rnn(
cell,
embedded_seq,
seq_length_batch,
dtype=tf.float32,
time_major=True,
scope='lstm')
self.encoder_outputs = encoder_outputs
self.encoder_states = encoder_states
# transform the encoder outputs for further attention alignments;
# encoder_h_transformed below has shape [T, N, lstm_dim]
encoder_h_transformed = fc(
'encoder_h_transform',
tf.reshape(encoder_outputs, [-1, lstm_dim]),
output_dim=lstm_dim)
encoder_h_transformed = tf.reshape(encoder_h_transformed,
[T, N, lstm_dim])
self.encoder_h_transformed = encoder_h_transformed
# seq_not_finished is a shape [T, N, 1] tensor, where
# seq_not_finished[t, n, 0] is 1 if sequence n is not yet finished at
# time t, and 0 otherwise
seq_not_finished = tf.less(
tf.range(T)[:, tf.newaxis, tf.newaxis],
seq_length_batch[:, tf.newaxis])
seq_not_finished = tf.cast(seq_not_finished, tf.float32)
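# Broadcasting example (hypothetical sizes): with T=4 and
# seq_length_batch = [2, 4], tf.range(T)[:, tf.newaxis, tf.newaxis] has
# shape [4, 1, 1] and seq_length_batch[:, tf.newaxis] has shape [2, 1];
# tf.less broadcasts them to a [4, 2, 1] mask whose first column is
# [1, 1, 0, 0] (sequence 0 finishes at t=2).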
self.seq_not_finished = seq_not_finished
def _build_decoder(self,
use_gt_layout,
gt_layout_batch,
scope='decoder',
reuse=None):
# The main difference from before is that the decoder now takes another
# input (the attention) when computing the next step.
# T_max is the maximum length of the decoded sequence (including <eos>).
#
# This function is for decoding only; it performs greedy search or sampling.
# The first input is <go> (its embedding vector) and the subsequent inputs
# are the outputs from the previous time step.
# num_vocab does not include <go>.
#
# use_gt_layout is None or a bool tensor, and gt_layout_batch is a tensor
# with shape [T_max, N].
# If use_gt_layout is not None, then when use_gt_layout is true, predict
# exactly the tokens in gt_layout_batch, regardless of actual probability.
# Otherwise, if sampling is True, sample from the token probabilities;
# if sampling is False, do greedy decoding (beam size 1).
N = self.N
encoder_states = self.encoder_states
T_max = self.T_decoder
lstm_dim = self.lstm_dim
num_layers = self.num_layers
EOS_token = self.EOS_token
sampling = self.decoder_sampling
with tf.variable_scope(scope, reuse=reuse):
embedding_mat = tf.get_variable(
'embedding_mat', [self.decoder_num_vocab, self.decoder_embed_dim])
# we use a separate embedding for <go>, as it is only used at the
# beginning of the sequence
go_embedding = tf.get_variable('go_embedding',
[1, self.decoder_embed_dim])
with tf.variable_scope('att_prediction'):
v = tf.get_variable('v', [lstm_dim])
W_a = tf.get_variable(
'weights', [lstm_dim, lstm_dim],
initializer=tf.contrib.layers.xavier_initializer())
b_a = tf.get_variable(
'biases', lstm_dim, initializer=tf.constant_initializer(0.))
# The parameters to predict the next token
with tf.variable_scope('token_prediction'):
W_y = tf.get_variable(
'weights', [lstm_dim * 2, self.decoder_num_vocab],
initializer=tf.contrib.layers.xavier_initializer())
b_y = tf.get_variable(
'biases',
self.decoder_num_vocab,
initializer=tf.constant_initializer(0.))
# Attentional decoding
# Loop function is called at time t BEFORE the cell execution at time t,
# and its next_input is used as the input at time t (not t+1)
# c.f. https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn
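# Timing sketch: loop_fn(time=0) sees cell_output=None and emits the <go>
# embedding consumed at step 0; loop_fn(time=t) for t > 0 sees the cell
# output from step t-1, predicts token t-1 from it, and emits that token's
# embedding as the input consumed at step t.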
mask_range = tf.reshape(
tf.range(self.decoder_num_vocab, dtype=tf.int32), [1, -1])
all_eos_pred = EOS_token * tf.ones([N], tf.int32)
all_one_prob = tf.ones([N], tf.float32)
all_zero_entropy = tf.zeros([N], tf.float32)
if use_gt_layout is not None:
gt_layout_mult = tf.cast(use_gt_layout, tf.int32)
pred_layout_mult = 1 - gt_layout_mult
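# Mixing example: gt_layout_mult is 1 and pred_layout_mult is 0 when
# use_gt_layout is true, so inside loop_fn below
# predicted_token = gt_token * 1 + model_token * 0 = gt_token,
# and the reverse when use_gt_layout is false.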
def loop_fn(time, cell_output, cell_state, loop_state):
if cell_output is None: # time == 0
next_cell_state = encoder_states
next_input = tf.tile(go_embedding, [N, 1])
else: # time > 0
next_cell_state = cell_state
# compute the attention map over the input sequence
# att_raw has shape [T, N, 1]
att_raw = tf.reduce_sum(
tf.tanh(
tf.nn.xw_plus_b(cell_output, W_a, b_a) +
self.encoder_h_transformed) * v,
axis=2,
keep_dims=True)
# softmax along the first dimension (T), masked to timesteps that are
# not yet finished
# att has shape [T, N, 1]
att = tf.nn.softmax(att_raw, dim=0) * self.seq_not_finished
att = att / tf.reduce_sum(att, axis=0, keep_dims=True)
# d2 has shape [N, lstm_dim]
d2 = tf.reduce_sum(att * self.encoder_outputs, axis=0)
# token_scores has shape [N, num_vocab]
token_scores = tf.nn.xw_plus_b(
tf.concat([cell_output, d2], axis=1), W_y, b_y)
# predict the next token (behavior depending on parameters)
if sampling:
# predicted_token has shape [N]
predicted_token = tf.cast(
tf.reshape(tf.multinomial(token_scores, 1), [-1]), tf.int32)
else:
# predicted_token has shape [N]
predicted_token = tf.cast(tf.argmax(token_scores, 1), tf.int32)
if use_gt_layout is not None:
predicted_token = (gt_layout_batch[time - 1] * gt_layout_mult +
predicted_token * pred_layout_mult)
# token_prob has shape [N]: the probability of the predicted token.
# Although token_prob is not needed for predicting the next token,
# it is needed in the output (for policy gradient training).
# mask has shape [N, num_vocab]
mask = tf.equal(mask_range, tf.reshape(predicted_token, [-1, 1]))
all_token_probs = tf.nn.softmax(token_scores)
token_prob = tf.reduce_sum(
all_token_probs * tf.cast(mask, tf.float32), axis=1)
neg_entropy = tf.reduce_sum(
all_token_probs * tf.log(all_token_probs), axis=1)
# is_eos_predicted is a [N] bool tensor, indicating whether
# <eos> has already been predicted previously in each sequence
is_eos_predicted = loop_state[2]
predicted_token_old = predicted_token
# if <eos> has already been predicted, now predict <eos> with
# prob 1
predicted_token = tf.where(is_eos_predicted, all_eos_pred,
predicted_token)
token_prob = tf.where(is_eos_predicted, all_one_prob, token_prob)
neg_entropy = tf.where(is_eos_predicted, all_zero_entropy,
neg_entropy)
is_eos_predicted = tf.logical_or(is_eos_predicted,
tf.equal(predicted_token_old,
EOS_token))
# the prediction comes from the cell output at the previous
# timestep (t - 1); feed its embedding as the input at timestep t
next_input = tf.nn.embedding_lookup(embedding_mat, predicted_token)
elements_finished = tf.greater_equal(time, T_max)
# loop_state is a 5-tuple, representing
# 1) the predicted_tokens
# 2) the prob of predicted_tokens
# 3) whether <eos> has already been predicted
# 4) the negative entropy of policy (accumulated across timesteps)
# 5) the attention
if loop_state is None: # time == 0
# Write the predicted token into the output
predicted_token_array = tf.TensorArray(
dtype=tf.int32, size=T_max, infer_shape=False)
token_prob_array = tf.TensorArray(
dtype=tf.float32, size=T_max, infer_shape=False)
att_array = tf.TensorArray(
dtype=tf.float32, size=T_max, infer_shape=False)
next_loop_state = (predicted_token_array, token_prob_array, tf.zeros(
[N], dtype=tf.bool), tf.zeros([N], dtype=tf.float32), att_array)
else: # time > 0
t_write = time - 1
next_loop_state = (
loop_state[0].write(t_write, predicted_token),
loop_state[1].write(t_write, token_prob),
is_eos_predicted,
loop_state[3] + neg_entropy,
loop_state[4].write(t_write, att))
return (elements_finished, next_input, next_cell_state, cell_output,
next_loop_state)
# The RNN
cell = _get_lstm_cell(num_layers, lstm_dim)
_, _, decodes_ta = tf.nn.raw_rnn(cell, loop_fn, scope='lstm')
predicted_tokens = decodes_ta[0].stack()
token_probs = decodes_ta[1].stack()
neg_entropy = decodes_ta[3]
# atts has shape [T_decoder, T_encoder, N, 1]
atts = decodes_ta[4].stack()
self.atts = atts
# word_vecs has shape [T_decoder, N, D]
word_vecs = tf.reduce_sum(atts * self.embedded_input_seq, axis=1)
predicted_tokens.set_shape([None, None])
token_probs.set_shape([None, None])
neg_entropy.set_shape([None])
word_vecs.set_shape([None, None, self.encoder_embed_dim])
self.predicted_tokens = predicted_tokens
self.token_probs = token_probs
self.neg_entropy = neg_entropy
self.word_vecs = word_vecs
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from collections import namedtuple
try:
from Queue import Queue # Python 2
except ImportError:
from queue import Queue # Python 3
import re
import threading
import numpy as np
import tensorflow as tf
Data = namedtuple('Data', ['X', 'Y', 'MultiYs', 'qid'])
class SampleBuilder:
def __init__(self, config):
self.config = config
self.kb_raw = self.read_kb()
self.data_raw = self.read_raw_data()
# dictionary of entities, normal words, and relations
self.dict_all = self.gen_dict()
self.reverse_dict_all = dict(
zip(self.dict_all.values(), self.dict_all.keys()))
tf.logging.info('size of dict: %d' % len(self.dict_all))
self.kb = self.build_kb()
self.data_all = self.build_samples()
def read_kb(self):
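# Each line of the KB file is expected to hold one 'subject|relation|object'
# triple, e.g. (hypothetical) 'paris|capital_of|france'.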
kb_raw = []
for line in open(self.config.KB_file):
sub, rel, obj = line.strip().split('|')
kb_raw.append((sub, rel, obj))
tf.logging.info('# of KB records: %d' % len(kb_raw))
return kb_raw
def read_raw_data(self):
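# Each data line is expected to be 'question\tanswers', where answers are
# '|'-separated and entities in the question appear in square brackets,
# e.g. (hypothetical) 'where is [paris]\tfrance'.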
data = dict()
for name in self.config.data_files:
raw = []
tf.logging.info(
'Reading data file {}'.format(self.config.data_files[name]))
for line in open(self.config.data_files[name]):
question, answers = line.strip().split('\t')
question = question.replace('],', ']') # ignore ',' in the template
raw.append((question, answers))
data[name] = raw
return data
def build_kb(self):
tf.logging.info('Indexing KB...')
kb = []
for sub, rel, obj in self.kb_raw:
kb.append([self.dict_all[sub], self.dict_all[rel], self.dict_all[obj]])
return kb
def gen_dict(self):
s = set()
for sub, rel, obj in self.kb_raw:
s.add(sub)
s.add(rel)
s.add(obj)
for name in self.data_raw:
for question, answers in self.data_raw[name]:
normal = re.split(r'\[[^\]]+\]', question)
for phrase in normal:
for word in phrase.split():
s.add(word)
s = sorted(s) # sort so the word-to-id mapping is deterministic across runs
d = {word: idx for idx, word in enumerate(s)}
return d
def build_samples(self):
def map_entity_idx(text):
entities = re.findall(r'\[[^\]]+\]', text)
for entity in entities:
entity = entity[1:-1]
index = self.dict_all[entity]
text = text.replace('[%s]' % entity, '@%d' % index)
return text
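# Worked example (hypothetical): with self.dict_all['paris'] == 17,
# map_entity_idx turns 'where is [paris]' into 'where is @17'; the '@17'
# token is decoded back to index 17 when building qdata below.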
data_all = dict()
for name in self.data_raw:
X, Y, MultiYs, qid = [], [], [], []
for i, (question, answers) in enumerate(self.data_raw[name]):
qdata, labels = [], []
question = map_entity_idx(question)
for word in question.split():
if word[0] == '@':
qdata.append(int(word[1:]))
else:
qdata.append(self.dict_all[word])
for answer in answers.split('|'):
labels.append(self.dict_all[answer])
if len(qdata) > self.config.T_encoder:
self.config.T_encoder = len(qdata)
for label in labels:
X.append(qdata)
Y.append(label)
MultiYs.append(set(labels))
qid.append(i)
data_all[name] = Data(X=X, Y=Y, MultiYs=MultiYs, qid=qid)
return data_all
def _run_prefetch(prefetch_queue, batch_loader, data, shuffle, one_pass,
config):
assert len(data.X) == len(data.Y) == len(data.MultiYs) == len(data.qid)
num_samples = len(data.X)
batch_size = config.batch_size
n_sample = 0
fetch_order = config.rng.permutation(num_samples)
while True:
sample_ids = fetch_order[n_sample:n_sample + batch_size]
batch = batch_loader.load_one_batch(sample_ids)
prefetch_queue.put(batch, block=True)
n_sample += len(sample_ids)
if n_sample >= num_samples:
if one_pass:
prefetch_queue.put(None, block=True)
n_sample = 0
if shuffle:
fetch_order = config.rng.permutation(num_samples)
class DataReader:
def __init__(self,
config,
data,
assembler,
shuffle=True,
one_pass=False,
prefetch_num=10):
self.config = config
self.data = data
self.assembler = assembler
self.batch_loader = BatchLoader(self.config,
self.data, self.assembler)
self.shuffle = shuffle
self.one_pass = one_pass
self.prefetch_queue = Queue(maxsize=prefetch_num)
self.prefetch_thread = threading.Thread(target=_run_prefetch,
args=(self.prefetch_queue,
self.batch_loader, self.data,
self.shuffle, self.one_pass,
self.config))
self.prefetch_thread.daemon = True
self.prefetch_thread.start()
def batches(self):
while True:
if self.prefetch_queue.empty():
tf.logging.warning('Waiting for data loading (IO is slow)...')
batch = self.prefetch_queue.get(block=True)
if batch is None:
assert self.one_pass
tf.logging.info('One pass finished!')
return # end the generator (raising StopIteration here breaks under PEP 479)
yield batch
class BatchLoader:
def __init__(self, config,
data, assembler):
self.config = config
self.data = data
self.assembler = assembler
self.T_encoder = config.T_encoder
self.T_decoder = config.T_decoder
tf.logging.info('T_encoder: %d' % self.T_encoder)
tf.logging.info('T_decoder: %d' % self.T_decoder)
tf.logging.info('batch size: %d' % self.config.batch_size)
self.gt_layout_tokens = config.gt_layout_tokens
def load_one_batch(self, sample_ids):
actual_batch_size = len(sample_ids)
input_seq_batch = np.zeros((self.T_encoder, actual_batch_size), np.int32)
seq_len_batch = np.zeros(actual_batch_size, np.int32)
ans_label_batch = np.zeros(actual_batch_size, np.int32)
ans_set_labels_list = [None] * actual_batch_size
question_id_list = [None] * actual_batch_size
gt_layout_batch = np.zeros((self.T_decoder, actual_batch_size), np.int32)
for batch_i in range(actual_batch_size):
idx = sample_ids[batch_i]
seq_len = len(self.data.X[idx])
seq_len_batch[batch_i] = seq_len
input_seq_batch[:seq_len, batch_i] = self.data.X[idx]
ans_label_batch[batch_i] = self.data.Y[idx]
ans_set_labels_list[batch_i] = self.data.MultiYs[idx]
question_id_list[batch_i] = self.data.qid[idx]
gt_layout_batch[:, batch_i] = self.assembler.module_list2tokens(
self.gt_layout_tokens, self.T_decoder)
batch = dict(input_seq_batch=input_seq_batch,
seq_len_batch=seq_len_batch,
ans_label_batch=ans_label_batch,
gt_layout_batch=gt_layout_batch,
ans_set_labels_list=ans_set_labels_list,
question_id_list=question_id_list)
return batch
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from datetime import datetime
import json
import logging
import os
import tensorflow as tf
import tensorflow.contrib.slim as slim
def prepare_dirs_and_logger(config):
formatter = logging.Formatter('%(asctime)s:%(levelname)s::%(message)s')
logger = logging.getLogger('tensorflow')
for hdlr in logger.handlers:
logger.removeHandler(hdlr)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(tf.logging.INFO)
config.log_dir = os.path.join(config.exp_dir, config.log_dir,
config.train_tag)
config.model_dir = os.path.join(config.exp_dir, config.model_dir,
config.train_tag)
config.output_dir = os.path.join(config.exp_dir, config.output_dir,
config.train_tag)
for path in [
config.log_dir, config.model_dir, config.output_dir
]:
if not os.path.exists(path):
os.makedirs(path)
config.data_files = {
'train': os.path.join(config.data_dir, config.train_data_file),
'dev': os.path.join(config.data_dir, config.dev_data_file),
'test': os.path.join(config.data_dir, config.test_data_file)
}
return config
def get_time():
return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
def show_all_variables():
model_vars = tf.trainable_variables()
slim.model_analyzer.analyze_vars(model_vars, print_info=True)
def save_config(config):
param_path = os.path.join(config.model_dir, 'params.json')
tf.logging.info('log dir: %s' % config.log_dir)
tf.logging.info('model dir: %s' % config.model_dir)
tf.logging.info('param path: %s' % param_path)
tf.logging.info('output dir: %s' % config.output_dir)
with open(param_path, 'w') as f:
f.write(json.dumps(config.__dict__, indent=4, sort_keys=True))
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
def fc_layer(name,
bottom,
output_dim,
bias_term=True,
weights_initializer=None,
biases_initializer=None,
reuse=None):
# flatten bottom input
shape = bottom.get_shape().as_list()
input_dim = 1
for d in shape[1:]:
input_dim *= d
flat_bottom = tf.reshape(bottom, [-1, input_dim])
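# Flattening example (hypothetical shapes): a [N, 4, 4, 8] input becomes
# [N, 128], since input_dim = 4 * 4 * 8 = 128, before the matmul below.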
# weights and biases variables
with tf.variable_scope(name, reuse=reuse):
# initialize the variables
if weights_initializer is None:
weights_initializer = tf.contrib.layers.xavier_initializer()
if bias_term and biases_initializer is None:
biases_initializer = tf.constant_initializer(0.)
# weights has shape [input_dim, output_dim]
weights = tf.get_variable(
'weights', [input_dim, output_dim], initializer=weights_initializer)
if bias_term:
biases = tf.get_variable(
'biases', output_dim, initializer=biases_initializer)
if not reuse:
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
tf.nn.l2_loss(weights))
if bias_term:
fc = tf.nn.xw_plus_b(flat_bottom, weights, biases)
else:
fc = tf.matmul(flat_bottom, weights)
return fc
# REINFORCing Concrete with REBAR
*Implementation of REBAR (and other closely related methods) as described
in "REBAR: Low-variance, unbiased gradient estimates for discrete latent variable models" by
George Tucker, Andriy Mnih, Chris J. Maddison, Dieterich Lawson, and Jascha Sohl-Dickstein ([https://arxiv.org/abs/1703.07370](https://arxiv.org/abs/1703.07370)).*
Learning in models with discrete latent variables is challenging due to high variance gradient estimators. Generally, approaches have relied on control variates to reduce the variance of the REINFORCE estimator. Recent work ([Jang et al. 2016](https://arxiv.org/abs/1611.01144); [Maddison et al. 2016](https://arxiv.org/abs/1611.00712)) has taken a different approach, introducing a continuous relaxation of discrete variables to produce low-variance, but biased, gradient estimates. In this work, we combine the two approaches through a novel control variate that produces low-variance, unbiased gradient estimates. Then, we introduce a novel continuous relaxation and show that the tightness of the relaxation can be adapted online, removing it as a hyperparameter. We show state-of-the-art variance reduction on several benchmark generative modeling tasks, generally leading to faster convergence to a better final log likelihood.
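As a toy illustration of the estimator being controlled (this is not code from this repository, and the objective `f` is made up), the score-function (REINFORCE) gradient for a single Bernoulli latent, with a constant baseline as the simplest control variate, can be sketched as:
```
# Toy sketch: d/dtheta E_{b ~ Bernoulli(sigmoid(theta))}[f(b)] via REINFORCE.
import numpy as np

def reinforce_grad(theta, f, n_samples=100000, baseline=0.0, seed=0):
  rng = np.random.RandomState(seed)
  p = 1.0 / (1.0 + np.exp(-theta))             # p = sigmoid(theta)
  b = (rng.rand(n_samples) < p).astype(np.float64)
  score = b - p   # d/dtheta log Bernoulli(b; sigmoid(theta))
  # Unbiased for any constant baseline, since E[score] = 0.
  return np.mean((f(b) - baseline) * score)

f = lambda b: (b - 0.499) ** 2                 # a hypothetical objective
print(reinforce_grad(0.0, f))                  # plain REINFORCE
print(reinforce_grad(0.0, f, baseline=0.25))   # with a constant control variate
```
REBAR goes further by using the continuous relaxation itself to build an unbiased control variate, rather than a constant baseline.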
REBAR applied to multilayer sigmoid belief networks is implemented in rebar.py, and rebar_train.py provides a training/evaluation setup. As a comparison, we also implemented the following methods:
* [NVIL](https://arxiv.org/abs/1402.0030)
* [MuProp](https://arxiv.org/abs/1511.05176)
* [Gumbel-Softmax](https://arxiv.org/abs/1611.01144)
The code is not optimized and some computation is repeated for ease of
implementation. We hope that this code will be a useful starting point for future research in this area.
## Quick Start:
Requirements:
* TensorFlow (see tensorflow.org for how to install)
* MNIST dataset
* Omniglot dataset
First, fill in the dataset URLs in the download_data.py script, like so:
```
MNIST_URL = 'http://yann.lecun.com/exdb/mnist'
MNIST_BINARIZED_URL = 'http://www.cs.toronto.edu/~larocheh/public/datasets/binarized_mnist'
OMNIGLOT_URL = 'https://github.com/yburda/iwae/raw/master/datasets/OMNIGLOT/chardata.mat'
```
Then run the script to download the data:
```
python download_data.py
```
Then run the training script:
```
python rebar_train.py --hparams="model=SBNDynamicRebar,learning_rate=0.0003,n_layer=2,task=sbn"
```
and you should see something like:
```
Step 2084: [-231.026474 0.3711713 1. 1.06934261 1.07023323
1.02173257 1.02171052 1. 1. 1. 1. ]
-3.6465678215
Step 4168: [-156.86795044 0.3097114 1. 1.03964758 1.03936625
1.02627242 1.02629256 1. 1. 1. 1. ]
-4.42727231979
Step 6252: [-143.4650116 0.26153237 1. 1.03633797 1.03600132
1.02639604 1.02639794 1. 1. 1. 1. ]
-4.85577583313
Step 8336: [-137.65275574 0.22313026 1. 1.03467286 1.03428006
1.02336085 1.02335203 0.99999988 1. 0.99999988
1. ]
-4.95563364029
```
The first number in the list is the log likelihood lower bound and the number
after the list is the log of the variance of the gradient estimator. The rest of
the numbers are for debugging.
We can also compare the variance between methods:
```
python rebar_train.py \
--hparams="model=SBNTrackGradVariances,learning_rate=0.0003,n_layer=2,task=omni"
```
and you should see something like:
```
Step 959: [ -2.60478699e+02 3.84281784e-01 6.31126612e-02 3.27319391e-02
6.13379292e-03 1.98278503e-04 1.96425783e-04 8.83973844e-04
8.70995224e-04 -inf]
('DynamicREBAR', -3.725339889526367)
('MuProp', -0.033569782972335815)
('NVIL', 2.7640280723571777)
('REBAR', -3.539274215698242)
('SimpleMuProp', -0.040744658559560776)
Step 1918: [ -2.06948471e+02 3.35904926e-01 5.20901568e-03 7.81541676e-05
2.06885766e-03 1.08521657e-04 1.07351625e-04 2.30646547e-04
2.26554010e-04 -8.22885323e+00]
('DynamicREBAR', -3.864381790161133)
('MuProp', -0.7183765172958374)
('NVIL', 2.266523599624634)
('REBAR', -3.662022113800049)
('SimpleMuProp', -0.7071359157562256)
```
where the tuples show the log of the variance of the gradient estimators.
The training script has a number of hyperparameter configuration flags
(defaults in parentheses); an example invocation follows the list:
* task (sbn): one of {sbn, sp, omni}, which correspond to MNIST generative
modeling, structured prediction on MNIST, and Omniglot generative modeling,
respectively
* model (SBNGumbel): one of {SBN, SBNNVIL, SBNMuProp, SBNSimpleMuProp,
SBNRebar, SBNDynamicRebar, SBNGumbel, SBNTrackGradVariances}. DynamicRebar
automatically adjusts the temperature, whereas Rebar and Gumbel-Softmax
require tuning the temperature. The models named after a method use that
method to estimate the gradients (SBN refers to REINFORCE).
SBNTrackGradVariances runs multiple methods and follows a single
optimization trajectory
* n_hidden (200): number of hidden nodes per layer
* n_layer (1): number of layers in the model
* nonlinear (false): if true, use two tanh layers between each stochastic
layer; otherwise use a linear layer
* learning_rate (0.001): learning rate
* temperature (0.5): temperature hyperparameter (for DynamicRebar, this is the initial
value of the temperature)
* n_samples (1): number of samples used to compute the gradient estimator (for the
experiments in the paper, set to 1)
* batch_size (24): batch size
* muprop_relaxation (true): if true, use the new relaxation described in the
paper; otherwise use the Concrete/Gumbel-Softmax relaxation
* dynamic_b (false): if true, dynamically binarize the training set. This
increases the effective training set size and reduces overfitting, though
the result is not a standard benchmark dataset
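For example, a run combining several of the flags above (the values here are only illustrative) could look like:
```
python rebar_train.py \
  --hparams="model=SBNDynamicRebar,task=omni,n_layer=2,n_hidden=200,nonlinear=true,learning_rate=0.0003,batch_size=24"
```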
Maintained by George Tucker (gjt@google.com, github user: gjtucker).