Commit dff0f0c1 authored by Alexander Gorban

Merge branch 'master' of github.com:tensorflow/models

parents da341f70 36203f09
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Image/Mask decoder used while pretraining the network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
_FEATURE_MAP_SIZE = 8
def _postprocess_im(images):
"""Performs post-processing for the images returned from conv net.
Transforms the value from [-1, 1] to [0, 1].
"""
return (images + 1) * 0.5
def model(identities, poses, params, is_training):
"""Decoder model to get image and mask from latent embedding."""
del is_training
f_dim = params.f_dim
fc_dim = params.fc_dim
with slim.arg_scope(
[slim.fully_connected, slim.conv2d_transpose],
weights_initializer=tf.truncated_normal_initializer(stddev=0.02, seed=1)):
# Concatenate the identity and pose units
h0 = tf.concat([identities, poses], 1)
h0 = slim.fully_connected(h0, fc_dim, activation_fn=tf.nn.relu)
h1 = slim.fully_connected(h0, fc_dim, activation_fn=tf.nn.relu)
# Mask decoder
dec_m0 = slim.fully_connected(
h1, (_FEATURE_MAP_SIZE**2) * f_dim * 2, activation_fn=tf.nn.relu)
dec_m0 = tf.reshape(
dec_m0, [-1, _FEATURE_MAP_SIZE, _FEATURE_MAP_SIZE, f_dim * 2])
dec_m1 = slim.conv2d_transpose(
dec_m0, f_dim, [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_m2 = slim.conv2d_transpose(
dec_m1, int(f_dim / 2), [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_m3 = slim.conv2d_transpose(
dec_m2, 1, [5, 5], stride=2, activation_fn=tf.nn.sigmoid)
# Image decoder
dec_i0 = slim.fully_connected(
h1, (_FEATURE_MAP_SIZE**2) * f_dim * 4, activation_fn=tf.nn.relu)
dec_i0 = tf.reshape(
dec_i0, [-1, _FEATURE_MAP_SIZE, _FEATURE_MAP_SIZE, f_dim * 4])
dec_i1 = slim.conv2d_transpose(
dec_i0, f_dim * 2, [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_i2 = slim.conv2d_transpose(
dec_i1, f_dim * 2, [5, 5], stride=2, activation_fn=tf.nn.relu)
dec_i3 = slim.conv2d_transpose(
dec_i2, 3, [5, 5], stride=2, activation_fn=tf.nn.tanh)
outputs = dict()
outputs['images'] = _postprocess_im(dec_i3)
outputs['masks'] = dec_m3
return outputs
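# Illustrative usage sketch (not part of the original file). `_Params` is a
# hypothetical stand-in for the real hyperparameter object; it only needs to
# expose f_dim and fc_dim here. Three stride-2 transposed convolutions
# upsample the 8x8 feature map by 8x, so the outputs are 64x64.
if __name__ == '__main__':
  import collections
  _Params = collections.namedtuple('Params', ['f_dim', 'fc_dim'])
  _identities = tf.zeros([4, 512])
  _poses = tf.zeros([4, 512])
  _outputs = model(_identities, _poses, _Params(f_dim=64, fc_dim=1024),
                   is_training=True)
  # _outputs['images']: [4, 64, 64, 3]; _outputs['masks']: [4, 64, 64, 1]
  print(_outputs['images'].get_shape(), _outputs['masks'].get_shape())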
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Creates rotator network model.
This model performs the out-of-plane rotations given input image and action.
The action is either no-op, rotate clockwise or rotate counter-clockwise.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def bilinear(input_x, input_y, output_size):
"""Define the bilinear transformation layer."""
shape_x = input_x.get_shape().as_list()
shape_y = input_y.get_shape().as_list()
weights_initializer = tf.truncated_normal_initializer(stddev=0.02,
seed=1)
biases_initializer = tf.constant_initializer(0.0)
matrix = tf.get_variable("Matrix", [shape_x[1], shape_y[1], output_size],
tf.float32, initializer=weights_initializer)
bias = tf.get_variable("Bias", [output_size],
initializer=biases_initializer)
# Add to GraphKeys.MODEL_VARIABLES
tf.contrib.framework.add_model_variable(matrix)
tf.contrib.framework.add_model_variable(bias)
# Define the transformation
h0 = tf.matmul(input_x, tf.reshape(matrix,
[shape_x[1], shape_y[1]*output_size]))
h0 = tf.reshape(h0, [-1, shape_y[1], output_size])
h1 = tf.tile(tf.reshape(input_y, [-1, shape_y[1], 1]),
[1, 1, output_size])
h1 = tf.multiply(h0, h1)
return tf.reduce_sum(h1, 1) + bias
def model(poses, actions, params, is_training):
"""Model for performing rotation."""
del is_training # Unused
return bilinear(poses, actions, params.z_dim)
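# Reference check (illustrative): the bilinear layer above computes
# out[b, o] = sum_{i, j} x[b, i] * M[i, j, o] * y[b, j] + bias[o],
# i.e. a NumPy einsum('bi,ijo,bj->bo', x, M, y) + bias.
if __name__ == '__main__':
  import numpy as np
  _x = np.random.randn(2, 3).astype(np.float32)     # e.g. pose embeddings
  _y = np.random.randn(2, 4).astype(np.float32)     # e.g. one-hot actions
  _m = np.random.randn(3, 4, 5).astype(np.float32)  # the `Matrix` variable
  _b = np.zeros(5, np.float32)                      # the `Bias` variable
  _out = np.einsum('bi,ijo,bj->bo', _x, _m, _y) + _b
  print(_out.shape)  # (2, 5)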
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training decoder as used in PTN (NIPS16)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
@tf.contrib.framework.add_arg_scope
def conv3d_transpose(inputs,
num_outputs,
kernel_size,
stride=1,
padding='SAME',
activation_fn=tf.nn.relu,
weights_initializer=tf.contrib.layers.xavier_initializer(),
biases_initializer=tf.zeros_initializer(),
reuse=None,
trainable=True,
scope=None):
"""Wrapper for conv3d_transpose layer.
This function wraps the tf.conv3d_transpose with basic non-linearity.
Tt creates a variable called `weights`, representing the kernel,
that is convoled with the input. A second varibale called `biases'
is added to the result of operation.
"""
with tf.variable_scope(
scope, 'Conv3d_transpose', [inputs], reuse=reuse):
dtype = inputs.dtype.base_dtype
kernel_d, kernel_h, kernel_w = kernel_size[0:3]
num_filters_in = inputs.get_shape()[4]
weights_shape = [kernel_d, kernel_h, kernel_w, num_outputs, num_filters_in]
weights = tf.get_variable('weights',
shape=weights_shape,
dtype=dtype,
initializer=weights_initializer,
trainable=trainable)
tf.contrib.framework.add_model_variable(weights)
input_shape = inputs.get_shape().as_list()
batch_size = input_shape[0]
depth = input_shape[1]
height = input_shape[2]
width = input_shape[3]
def get_deconv_dim(dim_size, stride_size):
# Only support padding='SAME'.
if isinstance(dim_size, tf.Tensor):
dim_size = tf.multiply(dim_size, stride_size)
elif dim_size is not None:
dim_size *= stride_size
return dim_size
out_depth = get_deconv_dim(depth, stride)
out_height = get_deconv_dim(height, stride)
out_width = get_deconv_dim(width, stride)
out_shape = [batch_size, out_depth, out_height, out_width, num_outputs]
outputs = tf.nn.conv3d_transpose(inputs, weights, out_shape,
[1, stride, stride, stride, 1],
padding=padding)
outputs.set_shape(out_shape)
if biases_initializer is not None:
biases = tf.get_variable('biases',
shape=[num_outputs,],
dtype=dtype,
initializer=biases_initializer,
trainable=trainable)
tf.contrib.framework.add_model_variable(biases)
outputs = tf.nn.bias_add(outputs, biases)
if activation_fn:
outputs = activation_fn(outputs)
return outputs
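# Shape example (illustrative): with padding='SAME' and stride=2, an input of
# shape [batch, 4, 4, 4, in_ch] yields [batch, 8, 8, 8, num_outputs], since
# get_deconv_dim multiplies every spatial dimension by the stride.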
def model(identities, params, is_training):
"""Model transforming embedding to voxels."""
del is_training # Unused
f_dim = params.f_dim
  # Please refer to the original implementation: github.com/xcyan/nips16_PTN
  # This TF replication uses a slightly different architecture.
with slim.arg_scope(
[slim.fully_connected, conv3d_transpose],
weights_initializer=tf.truncated_normal_initializer(stddev=0.02, seed=1)):
h0 = slim.fully_connected(
identities, 4 * 4 * 4 * f_dim * 8, activation_fn=tf.nn.relu)
h1 = tf.reshape(h0, [-1, 4, 4, 4, f_dim * 8])
h1 = conv3d_transpose(
h1, f_dim * 4, [4, 4, 4], stride=2, activation_fn=tf.nn.relu)
h2 = conv3d_transpose(
h1, int(f_dim * 3 / 2), [5, 5, 5], stride=2, activation_fn=tf.nn.relu)
h3 = conv3d_transpose(
h2, 1, [6, 6, 6], stride=2, activation_fn=tf.nn.sigmoid)
return h3
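# Shape walk-through (illustrative): identities [N, z_dim] -> fully connected
# -> [N, 4*4*4*f_dim*8] -> reshape [N, 4, 4, 4, f_dim*8] -> three stride-2
# conv3d_transpose layers: [N, 8, 8, 8, f_dim*4] -> [N, 16, 16, 16,
# int(f_dim*3/2)] -> [N, 32, 32, 32, 1], a sigmoid occupancy grid matching
# the default vox_size of 32.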
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains training plan for the Rotator model (Pretraining in NIPS16)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
from tensorflow import app
import model_rotator as model
flags = tf.app.flags
slim = tf.contrib.slim
flags.DEFINE_string('inp_dir', '',
'Directory path containing the input data (tfrecords).')
flags.DEFINE_string(
'dataset_name', 'shapenet_chair',
'Dataset name that is to be used for training and evaluation.')
flags.DEFINE_integer('z_dim', 512, '')
flags.DEFINE_integer('a_dim', 3, '')
flags.DEFINE_integer('f_dim', 64, '')
flags.DEFINE_integer('fc_dim', 1024, '')
flags.DEFINE_integer('num_views', 24, 'Num of viewpoints in the input data.')
flags.DEFINE_integer('image_size', 64,
'Input images dimension (pixels) - width & height.')
flags.DEFINE_integer('step_size', 1, 'Steps to take for rotation in pretraining.')
flags.DEFINE_integer('batch_size', 32, 'Batch size for training.')
flags.DEFINE_string('encoder_name', 'ptn_encoder',
'Name of the encoder network being used.')
flags.DEFINE_string('decoder_name', 'ptn_im_decoder',
'Name of the decoder network being used.')
flags.DEFINE_string('rotator_name', 'ptn_rotator',
'Name of the rotator network being used.')
# Save options
flags.DEFINE_string('checkpoint_dir', '/tmp/ptn_train/',
'Directory path for saving trained models and other data.')
flags.DEFINE_string('model_name', 'deeprotator_pretrain',
'Name of the model used in naming the TF job. Must be different for each run.')
flags.DEFINE_string('init_model', None,
'Checkpoint path of the model to initialize with.')
flags.DEFINE_integer('save_every', 1000,
'Average period of steps after which we save a model.')
# Optimization
flags.DEFINE_float('image_weight', 10, 'Weighting factor for image loss.')
flags.DEFINE_float('mask_weight', 1, 'Weighting factor for mask loss.')
flags.DEFINE_float('learning_rate', 0.0001, 'Learning rate.')
flags.DEFINE_float('weight_decay', 0.001, 'Weight decay parameter while training.')
flags.DEFINE_float('clip_gradient_norm', 0, 'Gradient clip norm; leave 0 for no gradient clipping.')
flags.DEFINE_integer('max_number_of_steps', 320000, 'Maximum number of steps for training.')
# Summary
flags.DEFINE_integer('save_summaries_secs', 15, 'Seconds interval for dumping TF summaries.')
flags.DEFINE_integer('save_interval_secs', 60 * 5, 'Seconds interval to save models.')
# Distribution
flags.DEFINE_string('master', '', 'The address of the tensorflow master if running distributed.')
flags.DEFINE_bool('sync_replicas', False, 'Whether to sync gradients between replicas for optimizer.')
flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas (train tasks).')
flags.DEFINE_integer('backup_workers', 0, 'Number of backup workers.')
flags.DEFINE_integer('ps_tasks', 0, 'Number of ps tasks.')
flags.DEFINE_integer('task', 0,
                     'Task identifier flag to be set for each task running in a distributed setup. Task number 0 '
'will be chosen as the chief.')
FLAGS = flags.FLAGS
def main(_):
train_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name, 'train')
save_image_dir = os.path.join(train_dir, 'images')
if not os.path.exists(train_dir):
os.makedirs(train_dir)
if not os.path.exists(save_image_dir):
os.makedirs(save_image_dir)
g = tf.Graph()
with g.as_default():
with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
global_step = slim.get_or_create_global_step()
##########
## data ##
##########
train_data = model.get_inputs(
FLAGS.inp_dir,
FLAGS.dataset_name,
'train',
FLAGS.batch_size,
FLAGS.image_size,
is_training=True)
inputs = model.preprocess(train_data, FLAGS.step_size)
###########
## model ##
###########
model_fn = model.get_model_fn(FLAGS, is_training=True)
outputs = model_fn(inputs)
##########
## loss ##
##########
task_loss = model.get_loss(inputs, outputs, FLAGS)
regularization_loss = model.get_regularization_loss(
['encoder', 'rotator', 'decoder'], FLAGS)
loss = task_loss + regularization_loss
###############
## optimizer ##
###############
optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
if FLAGS.sync_replicas:
optimizer = tf.train.SyncReplicasOptimizer(
optimizer,
          replicas_to_aggregate=FLAGS.worker_replicas - FLAGS.backup_workers,
total_num_replicas=FLAGS.worker_replicas)
##############
## train_op ##
##############
train_op = model.get_train_op_for_scope(
loss, optimizer, ['encoder', 'rotator', 'decoder'], FLAGS)
###########
## saver ##
###########
saver = tf.train.Saver(max_to_keep=np.minimum(5,
FLAGS.worker_replicas + 1))
if FLAGS.task == 0:
val_data = model.get_inputs(
FLAGS.inp_dir,
FLAGS.dataset_name,
'val',
FLAGS.batch_size,
FLAGS.image_size,
is_training=False)
val_inputs = model.preprocess(val_data, FLAGS.step_size)
# Note: don't compute loss here
reused_model_fn = model.get_model_fn(
FLAGS, is_training=False, reuse=True)
val_outputs = reused_model_fn(val_inputs)
with tf.device(tf.DeviceSpec(device_type='CPU')):
if FLAGS.step_size == 1:
vis_input_images = val_inputs['images_0'] * 255.0
vis_output_images = val_inputs['images_1'] * 255.0
vis_pred_images = val_outputs['images_1'] * 255.0
vis_pred_masks = (val_outputs['masks_1'] * (-1) + 1) * 255.0
else:
rep_times = int(np.ceil(32.0 / float(FLAGS.step_size)))
vis_list_1 = []
vis_list_2 = []
vis_list_3 = []
vis_list_4 = []
for j in xrange(rep_times):
for k in xrange(FLAGS.step_size):
                vis_input_image = val_inputs['images_0'][j]
vis_output_image = val_inputs['images_%d' % (k + 1)][j]
vis_pred_image = val_outputs['images_%d' % (k + 1)][j]
vis_pred_mask = val_outputs['masks_%d' % (k + 1)][j]
vis_list_1.append(tf.expand_dims(vis_input_image, 0))
vis_list_2.append(tf.expand_dims(vis_output_image, 0))
vis_list_3.append(tf.expand_dims(vis_pred_image, 0))
vis_list_4.append(tf.expand_dims(vis_pred_mask, 0))
vis_list_1 = tf.reshape(
tf.stack(vis_list_1), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 3
])
vis_list_2 = tf.reshape(
tf.stack(vis_list_2), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 3
])
vis_list_3 = tf.reshape(
tf.stack(vis_list_3), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 3
])
vis_list_4 = tf.reshape(
tf.stack(vis_list_4), [
rep_times * FLAGS.step_size, FLAGS.image_size,
FLAGS.image_size, 1
])
vis_input_images = vis_list_1 * 255.0
vis_output_images = vis_list_2 * 255.0
vis_pred_images = vis_list_3 * 255.0
vis_pred_masks = (vis_list_4 * (-1) + 1) * 255.0
write_disk_op = model.write_disk_grid(
global_step=global_step,
summary_freq=FLAGS.save_every,
log_dir=save_image_dir,
input_images=vis_input_images,
output_images=vis_output_images,
pred_images=vis_pred_images,
pred_masks=vis_pred_masks)
with tf.control_dependencies([write_disk_op]):
train_op = tf.identity(train_op)
#############
## init_fn ##
#############
      init_fn = model.get_init_fn(['encoder', 'rotator', 'decoder'], FLAGS)
##############
## training ##
##############
slim.learning.train(
train_op=train_op,
logdir=train_dir,
init_fn=init_fn,
master=FLAGS.master,
is_chief=(FLAGS.task == 0),
number_of_steps=FLAGS.max_number_of_steps,
saver=saver,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs)
if __name__ == '__main__':
app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains training plan for the Im2vox model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
from tensorflow import app
import model_ptn
flags = tf.app.flags
slim = tf.contrib.slim
flags.DEFINE_string('inp_dir',
'',
'Directory path containing the input data (tfrecords).')
flags.DEFINE_string(
'dataset_name', 'shapenet_chair',
'Dataset name that is to be used for training and evaluation.')
flags.DEFINE_integer('z_dim', 512, '')
flags.DEFINE_integer('f_dim', 64, '')
flags.DEFINE_integer('fc_dim', 1024, '')
flags.DEFINE_integer('num_views', 24, 'Num of viewpoints in the input data.')
flags.DEFINE_integer('image_size', 64,
'Input images dimension (pixels) - width & height.')
flags.DEFINE_integer('vox_size', 32, 'Voxel prediction dimension.')
flags.DEFINE_integer('step_size', 24, 'Steps to take in rotation to fetch viewpoints.')
flags.DEFINE_integer('batch_size', 1, 'Batch size while training.')
flags.DEFINE_float('focal_length', 0.866, 'Focal length parameter used in perspective projection.')
flags.DEFINE_float('focal_range', 1.732, 'Focal range parameter used in perspective projection.')
flags.DEFINE_string('encoder_name', 'ptn_encoder',
'Name of the encoder network being used.')
flags.DEFINE_string('decoder_name', 'ptn_vox_decoder',
'Name of the decoder network being used.')
flags.DEFINE_string('projector_name', 'perspective_projector',
'Name of the projector network being used.')
# Save options
flags.DEFINE_string('checkpoint_dir', '/tmp/ptn_train/',
'Directory path for saving trained models and other data.')
flags.DEFINE_string('model_name', 'ptn_finetune',
'Name of the model used in naming the TF job. Must be different for each run.')
flags.DEFINE_string('init_model', None,
'Checkpoint path of the model to initialize with.')
flags.DEFINE_integer('save_every', 1000,
'Average period of steps after which we save a model.')
# Optimization
flags.DEFINE_float('proj_weight', 10, 'Weighting factor for projection loss.')
flags.DEFINE_float('volume_weight', 0, 'Weighting factor for volume loss.')
flags.DEFINE_float('viewpoint_weight', 1, 'Weighting factor for viewpoint loss.')
flags.DEFINE_float('learning_rate', 0.0001, 'Learning rate.')
flags.DEFINE_float('weight_decay', 0.001, 'Weight decay parameter while training.')
flags.DEFINE_float('clip_gradient_norm', 0, 'Gradient clip norm; leave 0 for no gradient clipping.')
flags.DEFINE_integer('max_number_of_steps', 10000, 'Maximum number of steps for training.')
# Summary
flags.DEFINE_integer('save_summaries_secs', 15, 'Seconds interval for dumping TF summaries.')
flags.DEFINE_integer('save_interval_secs', 60 * 5, 'Seconds interval to save models.')
# Scheduling
flags.DEFINE_string('master', '', 'The address of the tensorflow master.')
flags.DEFINE_bool('sync_replicas', False, 'Whether to sync gradients between replicas for optimizer.')
flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas (train tasks).')
flags.DEFINE_integer('backup_workers', 0, 'Number of backup workers.')
flags.DEFINE_integer('ps_tasks', 0, 'Number of ps tasks.')
flags.DEFINE_integer('task', 0,
                     'Task identifier flag to be set for each task running in a distributed setup. Task number 0 '
'will be chosen as the chief.')
FLAGS = flags.FLAGS
def main(_):
train_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name, 'train')
save_image_dir = os.path.join(train_dir, 'images')
if not os.path.exists(train_dir):
os.makedirs(train_dir)
if not os.path.exists(save_image_dir):
os.makedirs(save_image_dir)
g = tf.Graph()
with g.as_default():
with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
global_step = slim.get_or_create_global_step()
###########
## model ##
###########
model = model_ptn.model_PTN(FLAGS)
##########
## data ##
##########
train_data = model.get_inputs(
FLAGS.inp_dir,
FLAGS.dataset_name,
'train',
FLAGS.batch_size,
FLAGS.image_size,
FLAGS.vox_size,
is_training=True)
inputs = model.preprocess(train_data, FLAGS.step_size)
##############
## model_fn ##
##############
model_fn = model.get_model_fn(
is_training=True, reuse=False, run_projection=True)
outputs = model_fn(inputs)
##################
## train_scopes ##
##################
if FLAGS.init_model:
train_scopes = ['decoder']
init_scopes = ['encoder']
else:
train_scopes = ['encoder', 'decoder']
##########
## loss ##
##########
task_loss = model.get_loss(inputs, outputs)
regularization_loss = model.get_regularization_loss(train_scopes)
loss = task_loss + regularization_loss
###############
## optimizer ##
###############
optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
if FLAGS.sync_replicas:
optimizer = tf.train.SyncReplicasOptimizer(
optimizer,
          replicas_to_aggregate=FLAGS.worker_replicas - FLAGS.backup_workers,
total_num_replicas=FLAGS.worker_replicas)
##############
## train_op ##
##############
train_op = model.get_train_op_for_scope(loss, optimizer, train_scopes)
###########
## saver ##
###########
saver = tf.train.Saver(max_to_keep=np.minimum(5,
FLAGS.worker_replicas + 1))
if FLAGS.task == 0:
params = FLAGS
params.batch_size = params.num_views
params.step_size = 1
model.set_params(params)
val_data = model.get_inputs(
params.inp_dir,
params.dataset_name,
'val',
params.batch_size,
params.image_size,
params.vox_size,
is_training=False)
val_inputs = model.preprocess(val_data, params.step_size)
# Note: don't compute loss here
reused_model_fn = model.get_model_fn(is_training=False, reuse=True)
val_outputs = reused_model_fn(val_inputs)
with tf.device(tf.DeviceSpec(device_type='CPU')):
vis_input_images = val_inputs['images_1'] * 255.0
vis_gt_projs = (val_outputs['masks_1'] * (-1) + 1) * 255.0
vis_pred_projs = (val_outputs['projs_1'] * (-1) + 1) * 255.0
vis_gt_projs = tf.concat([vis_gt_projs] * 3, axis=3)
vis_pred_projs = tf.concat([vis_pred_projs] * 3, axis=3)
# rescale
new_size = [FLAGS.image_size] * 2
vis_gt_projs = tf.image.resize_nearest_neighbor(
vis_gt_projs, new_size)
vis_pred_projs = tf.image.resize_nearest_neighbor(
vis_pred_projs, new_size)
# flip
# vis_gt_projs = utils.image_flipud(vis_gt_projs)
# vis_pred_projs = utils.image_flipud(vis_pred_projs)
# vis_gt_projs is of shape [batch, height, width, channels]
write_disk_op = model.write_disk_grid(
global_step=global_step,
log_dir=save_image_dir,
input_images=vis_input_images,
gt_projs=vis_gt_projs,
pred_projs=vis_pred_projs,
input_voxels=val_inputs['voxels'],
output_voxels=val_outputs['voxels_1'])
with tf.control_dependencies([write_disk_op]):
train_op = tf.identity(train_op)
#############
## init_fn ##
#############
if FLAGS.init_model:
init_fn = model.get_init_fn(init_scopes)
else:
init_fn = None
##############
## training ##
##############
slim.learning.train(
train_op=train_op,
logdir=train_dir,
init_fn=init_fn,
master=FLAGS.master,
is_chief=(FLAGS.task == 0),
number_of_steps=FLAGS.max_number_of_steps,
saver=saver,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs)
if __name__ == '__main__':
app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import StringIO
from matplotlib import pylab as p
# axes3d is being used implicitly for visualization.
from mpl_toolkits.mplot3d import axes3d as p3 # pylint:disable=unused-import
import numpy as np
from PIL import Image
from skimage import measure
import tensorflow as tf
def save_image(inp_array, image_file):
"""Function that dumps the image to disk."""
inp_array = np.clip(inp_array, 0, 255).astype(np.uint8)
image = Image.fromarray(inp_array)
buf = StringIO.StringIO()
image.save(buf, format='JPEG')
with open(image_file, 'w') as f:
f.write(buf.getvalue())
return None
def image_flipud(images):
"""Function that flip (up-down) the np image."""
quantity = images.get_shape().as_list()[0]
image_list = []
for k in xrange(quantity):
image_list.append(tf.image.flip_up_down(images[k, :, :, :]))
outputs = tf.stack(image_list)
return outputs
def resize_image(inp_array, new_height, new_width):
"""Function that resize the np image."""
inp_array = np.clip(inp_array, 0, 255).astype(np.uint8)
image = Image.fromarray(inp_array)
# Reverse order
image = image.resize((new_width, new_height))
return np.array(image)
def display_voxel(points, vis_size=128):
"""Function to display 3D voxel."""
try:
data = visualize_voxel_spectral(points, vis_size)
except ValueError:
data = visualize_voxel_scatter(points, vis_size)
return data
def visualize_voxel_spectral(points, vis_size=128):
"""Function to visualize voxel (spectral)."""
points = np.rint(points)
points = np.swapaxes(points, 0, 2)
fig = p.figure(figsize=(1, 1), dpi=vis_size)
verts, faces = measure.marching_cubes(points, 0, spacing=(0.1, 0.1, 0.1))
ax = fig.add_subplot(111, projection='3d')
ax.plot_trisurf(
verts[:, 0], verts[:, 1], faces, verts[:, 2], cmap='Spectral_r', lw=0.1)
ax.set_axis_off()
fig.tight_layout(pad=0)
fig.canvas.draw()
data = np.fromstring(
fig.canvas.tostring_rgb(), dtype=np.uint8, sep='').reshape(
vis_size, vis_size, 3)
p.close('all')
return data
def visualize_voxel_scatter(points, vis_size=128):
"""Function to visualize voxel (scatter)."""
points = np.rint(points)
points = np.swapaxes(points, 0, 2)
fig = p.figure(figsize=(1, 1), dpi=vis_size)
ax = fig.add_subplot(111, projection='3d')
x = []
y = []
z = []
(x_dimension, y_dimension, z_dimension) = points.shape
for i in range(x_dimension):
for j in range(y_dimension):
for k in range(z_dimension):
if points[i, j, k]:
x.append(i)
y.append(j)
z.append(k)
ax.scatter3D(x, y, z)
ax.set_axis_off()
fig.tight_layout(pad=0)
fig.canvas.draw()
data = np.fromstring(
fig.canvas.tostring_rgb(), dtype=np.uint8, sep='').reshape(
vis_size, vis_size, 3)
p.close('all')
return data
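# Usage sketch (illustrative): render a random occupancy grid and dump it to
# disk. display_voxel falls back to the scatter plot when marching cubes
# raises a ValueError.
if __name__ == '__main__':
  _voxels = (np.random.rand(32, 32, 32) > 0.95).astype(np.float32)
  save_image(display_voxel(_voxels, vis_size=128), '/tmp/voxel_vis.jpg')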
# Module networks for question answering on a knowledge graph
This code repository contains a TensorFlow model for question answering on a
knowledge graph with end-to-end module networks. The original paper describing
end-to-end module networks is as follows.
R. Hu, J. Andreas, M. Rohrbach, T. Darrell, K. Saenko, *Learning to Reason:
End-to-End Module Networks for Visual Question Answering*. in arXiv preprint
arXiv:1704.05526, 2017. ([PDF](https://arxiv.org/pdf/1704.05526.pdf))
```
@article{hu2017learning,
title={Learning to Reason: End-to-End Module Networks for Visual Question Answering},
author={Hu, Ronghang and Andreas, Jacob and Rohrbach, Marcus and Darrell, Trevor and Saenko, Kate},
journal={arXiv preprint arXiv:1704.05526},
year={2017}
}
```
The code in this repository is based on the original
[implementation](https://github.com/ronghanghu/n2nmn) for this paper.
## Requirements
1. Install TensorFlow 1.0.0. Follow the [official
guide](https://www.tensorflow.org/install/). Please note that newer or older
versions of TensorFlow may fail to work due to incompatibility with
TensorFlow Fold.
2. Install TensorFlow Fold. Follow the
[setup instructions](https://github.com/tensorflow/fold/blob/master/tensorflow_fold/g3doc/setup.md).
   TensorFlow Fold only supports the Linux platform. We have not tested
   the code on other platforms.
## Data
1. Download the [MetaQA dataset](https://goo.gl/f3AmcY). Click the button
   `MetaQA` and then click `Download` in the drop-down list. Extract the zip
   file after the download completes. Read the documentation there for dataset
   details.
2. Move the `MetaQA` folder to the root directory of this repository.
## How to use this code
We provide an experiment folder `exp_1_hop`, which applies the implemented model
to the 1-hop vanilla dataset in MetaQA. More experiment folders are coming soon.
Currently, we provide code for training with ground truth layout, and testing
the saved model. Configurations can be modified in `config.py`. They can also be
set via command line parameters.
To train the model:
```
python exp_1_hop/train_gt_layout.py
```
To test a saved model (the snapshot name must be provided):
```
python exp_1_hop/test.py --snapshot_name 00010000
```
## Model introduction
1. In this model, we store the knowledge graph in a key-value based memory. For
each knowledge graph edge (subject, relation, object), we use the (subject,
relation) as the key and the object as the value.
2. All entities and relations are embedded as fixed-dimension vectors. These
embeddings are also end-to-end learned.
3. Neural modules can separately operate on either the key side or the value
side.
4. The attention is shared between keys and corresponding values.
5. The answer output is based on the attention-weighted sum over keys or
   values, depending on the output module; a minimal sketch of this key-value
   attention follows below.
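
The sketch below illustrates points 1, 4, and 5 with NumPy. The array names
and dimensions are illustrative only, not this repository's API.

```
import numpy as np

# Toy key-value memory: one slot per KB edge (subject, relation, object),
# with key = embed(subject, relation) and value = embed(object).
n_edges, d = 1000, 128
keys = np.random.randn(n_edges, d).astype(np.float32)
values = np.random.randn(n_edges, d).astype(np.float32)
query = np.random.randn(d).astype(np.float32)  # question representation

# Attention is computed against the keys (softmax over edges)...
logits = keys.dot(query)
att = np.exp(logits - logits.max())
att /= att.sum()

# ...and shared with the corresponding values: the answer readout is the
# attention-weighted sum over the values.
readout = values.T.dot(att)  # shape [d]
```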
## Contact
Authors: Yuyu Zhang, Xin Pan
Pull requests and issues: @yuyuz
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import argparse
import os
def str2bool(v):
return v.lower() in ('true', '1')
def add_argument_group(name):
arg = parser.add_argument_group(name)
arg_lists.append(arg)
return arg
def get_config():
config, unparsed = parser.parse_known_args()
return config, unparsed
arg_lists = []
parser = argparse.ArgumentParser()
work_dir = os.path.abspath(os.path.join(__file__, '../../'))
net_arg = add_argument_group('Network')
net_arg.add_argument('--lstm_dim', type=int, default=128)
net_arg.add_argument('--num_layers', type=int, default=1)
net_arg.add_argument('--embed_dim_txt', type=int, default=128)
net_arg.add_argument('--embed_dim_nmn', type=int, default=128)
net_arg.add_argument(
'--T_encoder', type=int, default=0) # will be updated when reading data
net_arg.add_argument('--T_decoder', type=int, default=5)
train_arg = add_argument_group('Training')
train_arg.add_argument('--train_tag', type=str, default='n2nmn')
train_arg.add_argument('--batch_size', type=int, default=128)
train_arg.add_argument('--max_iter', type=int, default=1000000)
train_arg.add_argument('--weight_decay', type=float, default=1e-5)
train_arg.add_argument('--baseline_decay', type=float, default=0.99)
train_arg.add_argument('--max_grad_norm', type=float, default=10)
train_arg.add_argument('--random_seed', type=int, default=123)
data_arg = add_argument_group('Data')
data_path = work_dir + '/MetaQA/'
data_arg.add_argument('--KB_file', type=str, default=data_path + 'kb.txt')
data_arg.add_argument(
'--data_dir', type=str, default=data_path + '1-hop/vanilla/')
data_arg.add_argument('--train_data_file', type=str, default='qa_train.txt')
data_arg.add_argument('--dev_data_file', type=str, default='qa_dev.txt')
data_arg.add_argument('--test_data_file', type=str, default='qa_test.txt')
exp_arg = add_argument_group('Experiment')
exp_path = work_dir + '/exp_1_hop/'
exp_arg.add_argument('--exp_dir', type=str, default=exp_path)
log_arg = add_argument_group('Log')
log_arg.add_argument('--log_dir', type=str, default='logs')
log_arg.add_argument('--log_interval', type=int, default=1000)
log_arg.add_argument('--num_log_samples', type=int, default=3)
log_arg.add_argument(
'--log_level', type=str, default='INFO', choices=['INFO', 'DEBUG', 'WARN'])
io_arg = add_argument_group('IO')
io_arg.add_argument('--model_dir', type=str, default='model')
io_arg.add_argument('--snapshot_interval', type=int, default=1000)
io_arg.add_argument('--output_dir', type=str, default='output')
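# Typical usage from an experiment script (see exp_1_hop/train_gt_layout.py
# and exp_1_hop/test.py):
#   from config import get_config
#   config, unparsed = get_config()
#   print(config.lstm_dim, config.data_dir)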
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import sys
sys.path.append(os.path.abspath(os.path.join(__file__, '../../')))
import numpy as np
import tensorflow as tf
from config import get_config
from model_n2nmn.assembler import Assembler
from model_n2nmn.model import Model
from util.data_reader import DataReader
from util.data_reader import SampleBuilder
from util.misc import prepare_dirs_and_logger
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('snapshot_name', '00001000', 'snapshot file name')
def main(_):
config = prepare_dirs_and_logger(config_raw)
rng = np.random.RandomState(config.random_seed)
tf.set_random_seed(config.random_seed)
config.rng = rng
config.module_names = ['_key_find', '_key_filter', '_val_desc', '<eos>']
config.gt_layout_tokens = ['_key_find', '_key_filter', '_val_desc', '<eos>']
assembler = Assembler(config)
sample_builder = SampleBuilder(config)
config = sample_builder.config # update T_encoder according to data
data_test = sample_builder.data_all['test']
data_reader_test = DataReader(
config, data_test, assembler, shuffle=False, one_pass=True)
num_vocab_txt = len(sample_builder.dict_all)
num_vocab_nmn = len(assembler.module_names)
num_choices = len(sample_builder.dict_all)
# Network inputs
text_seq_batch = tf.placeholder(tf.int32, [None, None])
seq_len_batch = tf.placeholder(tf.int32, [None])
# The model
model = Model(
config,
sample_builder.kb,
text_seq_batch,
seq_len_batch,
num_vocab_txt=num_vocab_txt,
num_vocab_nmn=num_vocab_nmn,
EOS_idx=assembler.EOS_idx,
num_choices=num_choices,
decoder_sampling=False)
compiler = model.compiler
scores = model.scores
sess = tf.Session()
sess.run(tf.global_variables_initializer())
snapshot_file = os.path.join(config.model_dir, FLAGS.snapshot_name)
tf.logging.info('Snapshot file: %s' % snapshot_file)
snapshot_saver = tf.train.Saver()
snapshot_saver.restore(sess, snapshot_file)
# Evaluation metrics
num_questions = len(data_test.Y)
tf.logging.info('# of test questions: %d' % num_questions)
answer_correct = 0
layout_correct = 0
layout_valid = 0
for batch in data_reader_test.batches():
# set up input and output tensors
h = sess.partial_run_setup(
fetches=[model.predicted_tokens, scores],
feeds=[text_seq_batch, seq_len_batch, compiler.loom_input_tensor])
# Part 1: Generate module layout
tokens = sess.partial_run(
h,
fetches=model.predicted_tokens,
feed_dict={
text_seq_batch: batch['input_seq_batch'],
seq_len_batch: batch['seq_len_batch']
})
# Compute accuracy of the predicted layout
gt_tokens = batch['gt_layout_batch']
layout_correct += np.sum(
np.all(
np.logical_or(tokens == gt_tokens, gt_tokens == assembler.EOS_idx),
axis=0))
# Assemble the layout tokens into network structure
expr_list, expr_validity_array = assembler.assemble(tokens)
layout_valid += np.sum(expr_validity_array)
labels = batch['ans_label_batch']
# Build TensorFlow Fold input for NMN
expr_feed = compiler.build_feed_dict(expr_list)
    # Part 2: Run the NMN and compute answer scores
scores_val = sess.partial_run(h, scores, feed_dict=expr_feed)
# Compute accuracy
predictions = np.argmax(scores_val, axis=1)
answer_correct += np.sum(
np.logical_and(expr_validity_array, predictions == labels))
answer_accuracy = answer_correct * 1.0 / num_questions
layout_accuracy = layout_correct * 1.0 / num_questions
layout_validity = layout_valid * 1.0 / num_questions
tf.logging.info('test answer accuracy = %f, '
'test layout accuracy = %f, '
'test layout validity = %f' %
(answer_accuracy, layout_accuracy, layout_validity))
if __name__ == '__main__':
config_raw, unparsed = get_config()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
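# Note on the two-phase execution above (illustrative): tf.Session.partial_run
# lets Part 1 (layout prediction) and Part 2 (NMN scoring) share one graph
# execution, so the Fold loom input built from Part 1's predicted tokens can
# be fed into the same step. A minimal standalone sketch:
#   a = tf.placeholder(tf.float32, [])
#   b = tf.placeholder(tf.float32, [])
#   c = a + 1.0
#   d = c * b
#   h = sess.partial_run_setup(fetches=[c, d], feeds=[a, b])
#   c_val = sess.partial_run(h, c, feed_dict={a: 1.0})  # phase 1
#   d_val = sess.partial_run(h, d, feed_dict={b: 2.0})  # phase 2 -> 4.0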
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import sys
sys.path.append(os.path.abspath(os.path.join(__file__, '../../')))
import numpy as np
import tensorflow as tf
from config import get_config
from model_n2nmn.assembler import Assembler
from model_n2nmn.model import Model
from util.data_reader import DataReader
from util.data_reader import SampleBuilder
from util.misc import prepare_dirs_and_logger
from util.misc import save_config
from util.misc import show_all_variables
def main(_):
config = prepare_dirs_and_logger(config_raw)
save_config(config)
rng = np.random.RandomState(config.random_seed)
tf.set_random_seed(config.random_seed)
config.rng = rng
config.module_names = ['_key_find', '_key_filter', '_val_desc', '<eos>']
config.gt_layout_tokens = ['_key_find', '_key_filter', '_val_desc', '<eos>']
assembler = Assembler(config)
sample_builder = SampleBuilder(config)
config = sample_builder.config # update T_encoder according to data
data_train = sample_builder.data_all['train']
data_reader_train = DataReader(
config, data_train, assembler, shuffle=True, one_pass=False)
num_vocab_txt = len(sample_builder.dict_all)
num_vocab_nmn = len(assembler.module_names)
num_choices = len(sample_builder.dict_all)
# Network inputs
text_seq_batch = tf.placeholder(tf.int32, [None, None])
seq_len_batch = tf.placeholder(tf.int32, [None])
ans_label_batch = tf.placeholder(tf.int32, [None])
use_gt_layout = tf.constant(True, dtype=tf.bool)
gt_layout_batch = tf.placeholder(tf.int32, [None, None])
# The model for training
model = Model(
config,
sample_builder.kb,
text_seq_batch,
seq_len_batch,
num_vocab_txt=num_vocab_txt,
num_vocab_nmn=num_vocab_nmn,
EOS_idx=assembler.EOS_idx,
num_choices=num_choices,
decoder_sampling=True,
use_gt_layout=use_gt_layout,
gt_layout_batch=gt_layout_batch)
compiler = model.compiler
scores = model.scores
log_seq_prob = model.log_seq_prob
# Loss function
softmax_loss_per_sample = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=scores, labels=ans_label_batch)
  # The final per-sample loss: softmax loss for valid expressions and
  # invalid_expr_loss for invalid ones. Here all expressions are valid
  # (ground-truth layouts), so it is just the softmax loss.
  final_loss_per_sample = softmax_loss_per_sample
avg_sample_loss = tf.reduce_mean(final_loss_per_sample)
seq_likelihood_loss = tf.reduce_mean(-log_seq_prob)
total_training_loss = seq_likelihood_loss + avg_sample_loss
total_loss = total_training_loss + config.weight_decay * model.l2_reg
# Train with Adam optimizer
solver = tf.train.AdamOptimizer()
gradients = solver.compute_gradients(total_loss)
# Clip gradient by L2 norm
gradients = [(tf.clip_by_norm(g, config.max_grad_norm), v)
for g, v in gradients]
solver_op = solver.apply_gradients(gradients)
# Training operation
with tf.control_dependencies([solver_op]):
train_step = tf.constant(0)
# Write summary to TensorBoard
log_writer = tf.summary.FileWriter(config.log_dir, tf.get_default_graph())
loss_ph = tf.placeholder(tf.float32, [])
entropy_ph = tf.placeholder(tf.float32, [])
accuracy_ph = tf.placeholder(tf.float32, [])
summary_train = [
tf.summary.scalar('avg_sample_loss', loss_ph),
tf.summary.scalar('entropy', entropy_ph),
tf.summary.scalar('avg_accuracy', accuracy_ph)
]
log_step_train = tf.summary.merge(summary_train)
# Training
sess = tf.Session()
sess.run(tf.global_variables_initializer())
snapshot_saver = tf.train.Saver(max_to_keep=None) # keep all snapshots
show_all_variables()
avg_accuracy = 0
accuracy_decay = 0.99
for n_iter, batch in enumerate(data_reader_train.batches()):
if n_iter >= config.max_iter:
break
# set up input and output tensors
h = sess.partial_run_setup(
fetches=[
model.predicted_tokens, model.entropy_reg, scores, avg_sample_loss,
train_step
],
feeds=[
text_seq_batch, seq_len_batch, gt_layout_batch,
compiler.loom_input_tensor, ans_label_batch
])
# Part 1: Generate module layout
tokens, entropy_reg_val = sess.partial_run(
h,
fetches=(model.predicted_tokens, model.entropy_reg),
feed_dict={
text_seq_batch: batch['input_seq_batch'],
seq_len_batch: batch['seq_len_batch'],
gt_layout_batch: batch['gt_layout_batch']
})
# Assemble the layout tokens into network structure
expr_list, expr_validity_array = assembler.assemble(tokens)
# all exprs should be valid (since they are ground-truth)
assert np.all(expr_validity_array)
labels = batch['ans_label_batch']
# Build TensorFlow Fold input for NMN
expr_feed = compiler.build_feed_dict(expr_list)
expr_feed[ans_label_batch] = labels
# Part 2: Run NMN and learning steps
scores_val, avg_sample_loss_val, _ = sess.partial_run(
h, fetches=(scores, avg_sample_loss, train_step), feed_dict=expr_feed)
# Compute accuracy
predictions = np.argmax(scores_val, axis=1)
accuracy = np.mean(
np.logical_and(expr_validity_array, predictions == labels))
avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy)
# Add to TensorBoard summary
if (n_iter + 1) % config.log_interval == 0:
tf.logging.info('iter = %d\n\t'
'loss = %f, accuracy (cur) = %f, '
'accuracy (avg) = %f, entropy = %f' %
(n_iter + 1, avg_sample_loss_val, accuracy, avg_accuracy,
-entropy_reg_val))
summary = sess.run(
fetches=log_step_train,
feed_dict={
loss_ph: avg_sample_loss_val,
entropy_ph: -entropy_reg_val,
accuracy_ph: avg_accuracy
})
log_writer.add_summary(summary, n_iter + 1)
# Save snapshot
if (n_iter + 1) % config.snapshot_interval == 0:
snapshot_file = os.path.join(config.model_dir, '%08d' % (n_iter + 1))
snapshot_saver.save(sess, snapshot_file, write_meta_graph=False)
tf.logging.info('Snapshot saved to %s' % snapshot_file)
tf.logging.info('Run finished.')
if __name__ == '__main__':
config_raw, unparsed = get_config()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
# the number of attention inputs to each module
_module_input_num = {
'_key_find': 0,
'_key_filter': 1,
'_val_desc': 1}
_module_output_type = {
'_key_find': 'att',
'_key_filter': 'att',
'_val_desc': 'ans'
}
INVALID_EXPR = 'INVALID_EXPR'
class Assembler:
def __init__(self, config):
# read the module list, and record the index of each module and <eos>
self.module_names = config.module_names
# find the index of <eos>
for n_s in range(len(self.module_names)):
if self.module_names[n_s] == '<eos>':
self.EOS_idx = n_s
break
# build a dictionary from module name to token index
self.name2idx_dict = {
name: n_s
for n_s, name in enumerate(self.module_names)
}
def module_list2tokens(self, module_list, max_len=None):
layout_tokens = [self.name2idx_dict[name] for name in module_list]
if max_len is not None:
if len(module_list) >= max_len:
raise ValueError('Not enough time steps to add <eos>')
layout_tokens += [self.EOS_idx] * (max_len - len(module_list))
return layout_tokens
def _layout_tokens2str(self, layout_tokens):
return ' '.join([self.module_names[idx] for idx in layout_tokens])
def _invalid_expr(self, layout_tokens, error_str):
return {
'module': INVALID_EXPR,
'expr_str': self._layout_tokens2str(layout_tokens),
'error': error_str
}
def _assemble_layout_tokens(self, layout_tokens, batch_idx):
    # Every module takes a time_idx as an index into the LSTM hidden states
    # (even if it does not need it, like _and) and a module-specific number
    # of attention inputs. The output type can be either attention or answer.
#
# The final assembled expression for each instance is as follows:
# expr_type :=
# {'module': '_find', 'output_type': 'att', 'time_idx': idx}
# | {'module': '_relocate', 'output_type': 'att', 'time_idx': idx,
# 'inputs_0': <expr_type>}
# | {'module': '_and', 'output_type': 'att', 'time_idx': idx,
# 'inputs_0': <expr_type>, 'inputs_1': <expr_type>)}
# | {'module': '_describe', 'output_type': 'ans', 'time_idx': idx,
# 'inputs_0': <expr_type>}
# | {'module': INVALID_EXPR, 'expr_str': '...', 'error': '...',
# 'assembly_loss': <float32>} (for invalid expressions)
#
# A valid layout must contain <eos>. Assembly fails if it doesn't.
if not np.any(layout_tokens == self.EOS_idx):
return self._invalid_expr(layout_tokens, 'cannot find <eos>')
# Decoding Reverse Polish Notation with a stack
decoding_stack = []
for t in range(len(layout_tokens)):
# decode a module/operation
module_idx = layout_tokens[t]
if module_idx == self.EOS_idx:
break
module_name = self.module_names[module_idx]
expr = {
'module': module_name,
'output_type': _module_output_type[module_name],
'time_idx': t,
'batch_idx': batch_idx
}
input_num = _module_input_num[module_name]
      # Check if there are enough inputs on the stack
      if len(decoding_stack) < input_num:
        # Invalid expression: not enough inputs.
return self._invalid_expr(layout_tokens,
'not enough input for ' + module_name)
# Get the input from stack
for n_input in range(input_num - 1, -1, -1):
stack_top = decoding_stack.pop()
if stack_top['output_type'] != 'att':
# Invalid expression. Input must be attention
return self._invalid_expr(layout_tokens,
'input incompatible for ' + module_name)
expr['input_%d' % n_input] = stack_top
decoding_stack.append(expr)
# After decoding the reverse polish expression, there should be exactly
# one expression in the stack
if len(decoding_stack) != 1:
return self._invalid_expr(
layout_tokens,
'final stack size not equal to 1 (%d remains)' % len(decoding_stack))
result = decoding_stack[0]
# The result type should be answer, not attention
if result['output_type'] != 'ans':
return self._invalid_expr(layout_tokens,
'result type must be ans, not att')
return result
def assemble(self, layout_tokens_batch):
# layout_tokens_batch is a numpy array with shape [max_dec_len, batch_size],
# containing module tokens and <eos>, in Reverse Polish Notation.
_, batch_size = layout_tokens_batch.shape
expr_list = [
self._assemble_layout_tokens(layout_tokens_batch[:, batch_i], batch_i)
for batch_i in range(batch_size)
]
expr_validity = np.array(
[expr['module'] != INVALID_EXPR for expr in expr_list], np.bool)
return expr_list, expr_validity
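# Usage sketch (illustrative; `_Config` is a hypothetical stand-in for the
# real config object). The ground-truth layout used in this repository,
# [_key_find, _key_filter, _val_desc, <eos>], is in Reverse Polish Notation:
# _key_find pushes an attention, _key_filter pops one and pushes another,
# and _val_desc pops it and emits the answer scores.
if __name__ == '__main__':
  class _Config(object):
    module_names = ['_key_find', '_key_filter', '_val_desc', '<eos>']
  _asm = Assembler(_Config())
  _tokens = np.array([_asm.module_list2tokens(
      ['_key_find', '_key_filter', '_val_desc'], max_len=4)]).T
  _exprs, _valid = _asm.assemble(_tokens)
  print(_exprs[0]['module'], _valid)  # _val_desc [ True]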
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import tensorflow as tf
import tensorflow_fold as td
from model_n2nmn import netgen_att
from model_n2nmn import assembler
from model_n2nmn.modules import Modules
class Model:
def __init__(self,
config,
kb,
text_seq_batch,
seq_length_batch,
num_vocab_txt,
num_vocab_nmn,
EOS_idx,
num_choices,
decoder_sampling,
use_gt_layout=None,
gt_layout_batch=None,
scope='neural_module_network',
reuse=None):
with tf.variable_scope(scope, reuse=reuse):
# Part 1: Seq2seq RNN to generate module layout tokens
embedding_mat = tf.get_variable(
'embedding_mat', [num_vocab_txt, config.embed_dim_txt],
initializer=tf.contrib.layers.xavier_initializer())
with tf.variable_scope('layout_generation'):
att_seq2seq = netgen_att.AttentionSeq2Seq(
config, text_seq_batch, seq_length_batch, num_vocab_txt,
num_vocab_nmn, EOS_idx, decoder_sampling, embedding_mat,
use_gt_layout, gt_layout_batch)
self.att_seq2seq = att_seq2seq
predicted_tokens = att_seq2seq.predicted_tokens
token_probs = att_seq2seq.token_probs
word_vecs = att_seq2seq.word_vecs
neg_entropy = att_seq2seq.neg_entropy
self.atts = att_seq2seq.atts
self.predicted_tokens = predicted_tokens
self.token_probs = token_probs
self.word_vecs = word_vecs
self.neg_entropy = neg_entropy
# log probability of each generated sequence
self.log_seq_prob = tf.reduce_sum(tf.log(token_probs), axis=0)
# Part 2: Neural Module Network
with tf.variable_scope('layout_execution'):
modules = Modules(config, kb, word_vecs, num_choices, embedding_mat)
self.modules = modules
# Recursion of modules
att_shape = [len(kb)]
# Forward declaration of module recursion
att_expr_decl = td.ForwardDeclaration(td.PyObjectType(),
td.TensorType(att_shape))
# _key_find
case_key_find = td.Record([('time_idx', td.Scalar(dtype='int32')),
('batch_idx', td.Scalar(dtype='int32'))])
case_key_find = case_key_find >> td.ScopedLayer(
modules.KeyFindModule, name_or_scope='KeyFindModule')
# _key_filter
case_key_filter = td.Record([('input_0', att_expr_decl()),
('time_idx', td.Scalar('int32')),
('batch_idx', td.Scalar('int32'))])
case_key_filter = case_key_filter >> td.ScopedLayer(
modules.KeyFilterModule, name_or_scope='KeyFilterModule')
recursion_cases = td.OneOf(
td.GetItem('module'),
{'_key_find': case_key_find,
'_key_filter': case_key_filter})
att_expr_decl.resolve_to(recursion_cases)
# _val_desc: output scores for choice (for valid expressions)
predicted_scores = td.Record([('input_0', recursion_cases),
('time_idx', td.Scalar('int32')),
('batch_idx', td.Scalar('int32'))])
predicted_scores = predicted_scores >> td.ScopedLayer(
modules.ValDescribeModule, name_or_scope='ValDescribeModule')
# For invalid expressions, define a dummy answer
# so that all answers have the same form
INVALID = assembler.INVALID_EXPR
dummy_scores = td.Void() >> td.FromTensor(
np.zeros(num_choices, np.float32))
output_scores = td.OneOf(
td.GetItem('module'),
{'_val_desc': predicted_scores,
INVALID: dummy_scores})
# compile and get the output scores
self.compiler = td.Compiler.create(output_scores)
self.scores = self.compiler.output_tensors[0]
# Regularization: Entropy + L2
self.entropy_reg = tf.reduce_mean(neg_entropy)
module_weights = [
v for v in tf.trainable_variables()
if (scope in v.op.name and v.op.name.endswith('weights'))
]
self.l2_reg = tf.add_n([tf.nn.l2_loss(v) for v in module_weights])
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
class Modules:
def __init__(self, config, kb, word_vecs, num_choices, embedding_mat):
self.config = config
self.embedding_mat = embedding_mat
# kb has shape [N_kb, 3]
self.kb = kb
self.embed_keys_e, self.embed_keys_r, self.embed_vals_e = self.embed_kb()
# word_vecs has shape [T_decoder, N, D_txt]
self.word_vecs = word_vecs
self.num_choices = num_choices
def embed_kb(self):
keys_e, keys_r, vals_e = [], [], []
for idx_sub, idx_rel, idx_obj in self.kb:
keys_e.append(idx_sub)
keys_r.append(idx_rel)
vals_e.append(idx_obj)
embed_keys_e = tf.nn.embedding_lookup(self.embedding_mat, keys_e)
embed_keys_r = tf.nn.embedding_lookup(self.embedding_mat, keys_r)
embed_vals_e = tf.nn.embedding_lookup(self.embedding_mat, vals_e)
return embed_keys_e, embed_keys_r, embed_vals_e
def _slice_word_vecs(self, time_idx, batch_idx):
# this callable will be wrapped into a td.Function
# In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
# time is highest dim in word_vecs
joint_index = tf.stack([time_idx, batch_idx], axis=1)
return tf.gather_nd(self.word_vecs, joint_index)
# All the layers are wrapped with td.ScopedLayer
def KeyFindModule(self,
time_idx,
batch_idx,
scope='KeyFindModule',
reuse=None):
# In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
text_param = self._slice_word_vecs(time_idx, batch_idx)
# Mapping: embed_keys_e x text_param -> att
# Input:
# embed_keys_e: [N_kb, D_txt]
# text_param: [N, D_txt]
# Output:
# att: [N, N_kb]
#
    # Implementation:
    #   1. Inner product (matmul) between text_param and each key embedding
    #   2. L2-normalization over the KB axis
with tf.variable_scope(scope, reuse=reuse):
m = tf.matmul(text_param, self.embed_keys_e, transpose_b=True)
att = tf.nn.l2_normalize(m, dim=1)
return att
def KeyFilterModule(self,
input_0,
time_idx,
batch_idx,
scope='KeyFilterModule',
reuse=None):
att_0 = input_0
text_param = self._slice_word_vecs(time_idx, batch_idx)
# Mapping: and(embed_keys_r x text_param, att) -> att
# Input:
# embed_keys_r: [N_kb, D_txt]
# text_param: [N, D_txt]
# att_0: [N, N_kb]
# Output:
# att: [N, N_kb]
#
    # Implementation:
    #   1. Inner product (matmul) between text_param and each relation embedding
    #   2. L2-normalization over the KB axis
    #   3. Take the elementwise min with the input attention
with tf.variable_scope(scope, reuse=reuse):
m = tf.matmul(text_param, self.embed_keys_r, transpose_b=True)
att_1 = tf.nn.l2_normalize(m, dim=1)
att = tf.minimum(att_0, att_1)
return att
def ValDescribeModule(self,
input_0,
time_idx,
batch_idx,
scope='ValDescribeModule',
reuse=None):
att = input_0
# Mapping: att -> answer probs
# Input:
# embed_vals_e: [N_kb, D_txt]
# att: [N, N_kb]
# embedding_mat: [self.num_choices, D_txt]
# Output:
# answer_scores: [N, self.num_choices]
#
# Implementation:
# 1. Attention-weighted sum over values
# 2. Compute cosine similarity scores between the weighted sum and
# each candidate answer
with tf.variable_scope(scope, reuse=reuse):
# weighted_sum has shape [N, D_txt]
weighted_sum = tf.matmul(att, self.embed_vals_e)
# scores has shape [N, self.num_choices]
scores = tf.matmul(
weighted_sum,
tf.nn.l2_normalize(self.embedding_mat, dim=1),
transpose_b=True)
return scores
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
from util.nn import fc_layer as fc
def _get_lstm_cell(num_layers, lstm_dim):
cell_list = [
tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
for _ in range(num_layers)
]
cell = tf.contrib.rnn.MultiRNNCell(cell_list, state_is_tuple=True)
return cell
class AttentionSeq2Seq:
def __init__(self,
config,
text_seq_batch,
seq_length_batch,
num_vocab_txt,
num_vocab_nmn,
EOS_token,
decoder_sampling,
embedding_mat,
use_gt_layout=None,
gt_layout_batch=None,
scope='encoder_decoder',
reuse=None):
self.T_decoder = config.T_decoder
self.encoder_num_vocab = num_vocab_txt
self.encoder_embed_dim = config.embed_dim_txt
self.decoder_num_vocab = num_vocab_nmn
self.decoder_embed_dim = config.embed_dim_nmn
self.lstm_dim = config.lstm_dim
self.num_layers = config.num_layers
self.EOS_token = EOS_token
self.decoder_sampling = decoder_sampling
self.embedding_mat = embedding_mat
with tf.variable_scope(scope, reuse=reuse):
self._build_encoder(text_seq_batch, seq_length_batch)
self._build_decoder(use_gt_layout, gt_layout_batch)
def _build_encoder(self,
text_seq_batch,
seq_length_batch,
scope='encoder',
reuse=None):
lstm_dim = self.lstm_dim
num_layers = self.num_layers
with tf.variable_scope(scope, reuse=reuse):
T = tf.shape(text_seq_batch)[0]
N = tf.shape(text_seq_batch)[1]
self.T_encoder = T
self.N = N
# text_seq has shape [T, N] and embedded_seq has shape [T, N, D]
embedded_seq = tf.nn.embedding_lookup(self.embedding_mat, text_seq_batch)
self.embedded_input_seq = embedded_seq
# The RNN
cell = _get_lstm_cell(num_layers, lstm_dim)
# encoder_outputs has shape [T, N, lstm_dim]
encoder_outputs, encoder_states = tf.nn.dynamic_rnn(
cell,
embedded_seq,
seq_length_batch,
dtype=tf.float32,
time_major=True,
scope='lstm')
self.encoder_outputs = encoder_outputs
self.encoder_states = encoder_states
# transform the encoder outputs for further attention alignments;
# encoder_h_transformed below has shape [T, N, lstm_dim]
encoder_h_transformed = fc(
'encoder_h_transform',
tf.reshape(encoder_outputs, [-1, lstm_dim]),
output_dim=lstm_dim)
encoder_h_transformed = tf.reshape(encoder_h_transformed,
[T, N, lstm_dim])
self.encoder_h_transformed = encoder_h_transformed
# seq_not_finished is a shape [T, N, 1] tensor, where
# seq_not_finished[t, n, 0] is 1 if sequence n is not yet finished at
# time t, and 0 otherwise
seq_not_finished = tf.less(
tf.range(T)[:, tf.newaxis, tf.newaxis],
seq_length_batch[:, tf.newaxis])
seq_not_finished = tf.cast(seq_not_finished, tf.float32)
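# Broadcasting example (hypothetical sizes): with T=4 and
# seq_length_batch = [2, 4], tf.range(T)[:, tf.newaxis, tf.newaxis] has
# shape [4, 1, 1] and seq_length_batch[:, tf.newaxis] has shape [2, 1];
# tf.less broadcasts them to a [4, 2, 1] mask whose first column is
# [1, 1, 0, 0] (sequence 0 finishes at t=2).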
self.seq_not_finished = seq_not_finished
def _build_decoder(self,
use_gt_layout,
gt_layout_batch,
scope='decoder',
reuse=None):
# The main difference from before is that the decoder now takes another
# input (the attention) when computing the next step.
# T_max is the maximum length of the decoded sequence (including <eos>).
#
# This function is for decoding only; it performs greedy search or sampling.
# The first input is <go> (its embedding vector) and the subsequent inputs
# are the outputs from the previous time step.
# num_vocab does not include <go>.
#
# use_gt_layout is None or a bool tensor, and gt_layout_batch is a tensor
# with shape [T_max, N].
# If use_gt_layout is not None, then when use_gt_layout is true, predict
# exactly the tokens in gt_layout_batch, regardless of actual probability.
# Otherwise, if sampling is True, sample from the token probabilities;
# if sampling is False, do greedy decoding (beam size 1).
N = self.N
encoder_states = self.encoder_states
T_max = self.T_decoder
lstm_dim = self.lstm_dim
num_layers = self.num_layers
EOS_token = self.EOS_token
sampling = self.decoder_sampling
with tf.variable_scope(scope, reuse=reuse):
embedding_mat = tf.get_variable(
'embedding_mat', [self.decoder_num_vocab, self.decoder_embed_dim])
# we use a separate embedding for <go>, as it is only used at the
# beginning of the sequence
go_embedding = tf.get_variable('go_embedding',
[1, self.decoder_embed_dim])
with tf.variable_scope('att_prediction'):
v = tf.get_variable('v', [lstm_dim])
W_a = tf.get_variable(
'weights', [lstm_dim, lstm_dim],
initializer=tf.contrib.layers.xavier_initializer())
b_a = tf.get_variable(
'biases', lstm_dim, initializer=tf.constant_initializer(0.))
# The parameters to predict the next token
with tf.variable_scope('token_prediction'):
W_y = tf.get_variable(
'weights', [lstm_dim * 2, self.decoder_num_vocab],
initializer=tf.contrib.layers.xavier_initializer())
b_y = tf.get_variable(
'biases',
self.decoder_num_vocab,
initializer=tf.constant_initializer(0.))
# Attentional decoding
# Loop function is called at time t BEFORE the cell execution at time t,
# and its next_input is used as the input at time t (not t+1)
# c.f. https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn
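# Timing sketch: loop_fn(time=0) sees cell_output=None and emits the <go>
# embedding consumed at step 0; loop_fn(time=t) for t > 0 sees the cell
# output from step t-1, predicts token t-1 from it, and emits that token's
# embedding as the input consumed at step t.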
mask_range = tf.reshape(
tf.range(self.decoder_num_vocab, dtype=tf.int32), [1, -1])
all_eos_pred = EOS_token * tf.ones([N], tf.int32)
all_one_prob = tf.ones([N], tf.float32)
all_zero_entropy = tf.zeros([N], tf.float32)
if use_gt_layout is not None:
gt_layout_mult = tf.cast(use_gt_layout, tf.int32)
pred_layout_mult = 1 - gt_layout_mult
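# Mixing example: gt_layout_mult is 1 and pred_layout_mult is 0 when
# use_gt_layout is true, so inside loop_fn below
# predicted_token = gt_token * 1 + model_token * 0 = gt_token,
# and the reverse when use_gt_layout is false.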
def loop_fn(time, cell_output, cell_state, loop_state):
if cell_output is None: # time == 0
next_cell_state = encoder_states
next_input = tf.tile(go_embedding, [N, 1])
else: # time > 0
next_cell_state = cell_state
# compute the attention map over the input sequence
# att_raw has shape [T, N, 1]
att_raw = tf.reduce_sum(
tf.tanh(
tf.nn.xw_plus_b(cell_output, W_a, b_a) +
self.encoder_h_transformed) * v,
axis=2,
keep_dims=True)
# softmax along the first dimension (T), masked to timesteps that are
# not yet finished
# att has shape [T, N, 1]
att = tf.nn.softmax(att_raw, dim=0) * self.seq_not_finished
att = att / tf.reduce_sum(att, axis=0, keep_dims=True)
# d2 has shape [N, lstm_dim]
d2 = tf.reduce_sum(att * self.encoder_outputs, axis=0)
# token_scores has shape [N, num_vocab]
token_scores = tf.nn.xw_plus_b(
tf.concat([cell_output, d2], axis=1), W_y, b_y)
# predict the next token (behavior depending on parameters)
if sampling:
# predicted_token has shape [N]
predicted_token = tf.cast(
tf.reshape(tf.multinomial(token_scores, 1), [-1]), tf.int32)
else:
# predicted_token has shape [N]
predicted_token = tf.cast(tf.argmax(token_scores, 1), tf.int32)
if use_gt_layout is not None:
predicted_token = (gt_layout_batch[time - 1] * gt_layout_mult +
predicted_token * pred_layout_mult)
# token_prob has shape [N]: the probability of the predicted token.
# Although token_prob is not needed for predicting the next token,
# it is needed in the output (for policy gradient training).
# mask has shape [N, num_vocab]
mask = tf.equal(mask_range, tf.reshape(predicted_token, [-1, 1]))
all_token_probs = tf.nn.softmax(token_scores)
token_prob = tf.reduce_sum(
all_token_probs * tf.cast(mask, tf.float32), axis=1)
neg_entropy = tf.reduce_sum(
all_token_probs * tf.log(all_token_probs), axis=1)
# is_eos_predicted is a [N] bool tensor, indicating whether
# <eos> has already been predicted previously in each sequence
is_eos_predicted = loop_state[2]
predicted_token_old = predicted_token
# if <eos> has already been predicted, now predict <eos> with
# prob 1
predicted_token = tf.where(is_eos_predicted, all_eos_pred,
predicted_token)
token_prob = tf.where(is_eos_predicted, all_one_prob, token_prob)
neg_entropy = tf.where(is_eos_predicted, all_zero_entropy,
neg_entropy)
is_eos_predicted = tf.logical_or(is_eos_predicted,
tf.equal(predicted_token_old,
EOS_token))
# the prediction comes from the cell output at the previous
# timestep (t - 1); feed its embedding as the input at timestep t
next_input = tf.nn.embedding_lookup(embedding_mat, predicted_token)
elements_finished = tf.greater_equal(time, T_max)
# loop_state is a 5-tuple, representing
# 1) the predicted_tokens
# 2) the prob of predicted_tokens
# 3) whether <eos> has already been predicted
# 4) the negative entropy of policy (accumulated across timesteps)
# 5) the attention
if loop_state is None: # time == 0
# Write the predicted token into the output
predicted_token_array = tf.TensorArray(
dtype=tf.int32, size=T_max, infer_shape=False)
token_prob_array = tf.TensorArray(
dtype=tf.float32, size=T_max, infer_shape=False)
att_array = tf.TensorArray(
dtype=tf.float32, size=T_max, infer_shape=False)
next_loop_state = (predicted_token_array, token_prob_array, tf.zeros(
[N], dtype=tf.bool), tf.zeros([N], dtype=tf.float32), att_array)
else: # time > 0
t_write = time - 1
next_loop_state = (
loop_state[0].write(t_write, predicted_token),
loop_state[1].write(t_write, token_prob),
is_eos_predicted,
loop_state[3] + neg_entropy,
loop_state[4].write(t_write, att))
return (elements_finished, next_input, next_cell_state, cell_output,
next_loop_state)
# The RNN
cell = _get_lstm_cell(num_layers, lstm_dim)
_, _, decodes_ta = tf.nn.raw_rnn(cell, loop_fn, scope='lstm')
predicted_tokens = decodes_ta[0].stack()
token_probs = decodes_ta[1].stack()
neg_entropy = decodes_ta[3]
# atts has shape [T_decoder, T_encoder, N, 1]
atts = decodes_ta[4].stack()
self.atts = atts
# word_vecs has shape [T_decoder, N, D]
word_vecs = tf.reduce_sum(atts * self.embedded_input_seq, axis=1)
predicted_tokens.set_shape([None, None])
token_probs.set_shape([None, None])
neg_entropy.set_shape([None])
word_vecs.set_shape([None, None, self.encoder_embed_dim])
self.predicted_tokens = predicted_tokens
self.token_probs = token_probs
self.neg_entropy = neg_entropy
self.word_vecs = word_vecs
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from collections import namedtuple
try:
from Queue import Queue # Python 2
except ImportError:
from queue import Queue # Python 3
import re
import threading
import numpy as np
import tensorflow as tf
Data = namedtuple('Data', ['X', 'Y', 'MultiYs', 'qid'])
class SampleBuilder:
def __init__(self, config):
self.config = config
self.kb_raw = self.read_kb()
self.data_raw = self.read_raw_data()
# dictionary of entities, normal words, and relations
self.dict_all = self.gen_dict()
self.reverse_dict_all = dict(
zip(self.dict_all.values(), self.dict_all.keys()))
tf.logging.info('size of dict: %d' % len(self.dict_all))
self.kb = self.build_kb()
self.data_all = self.build_samples()
def read_kb(self):
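# Each line of the KB file is expected to hold one 'subject|relation|object'
# triple, e.g. (hypothetical) 'paris|capital_of|france'.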
kb_raw = []
for line in open(self.config.KB_file):
sub, rel, obj = line.strip().split('|')
kb_raw.append((sub, rel, obj))
tf.logging.info('# of KB records: %d' % len(kb_raw))
return kb_raw
def read_raw_data(self):
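# Each data line is expected to be 'question\tanswers', where answers are
# '|'-separated and entities in the question appear in square brackets,
# e.g. (hypothetical) 'where is [paris]\tfrance'.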
data = dict()
for name in self.config.data_files:
raw = []
tf.logging.info(
'Reading data file {}'.format(self.config.data_files[name]))
for line in open(self.config.data_files[name]):
question, answers = line.strip().split('\t')
question = question.replace('],', ']') # ignore ',' in the template
raw.append((question, answers))
data[name] = raw
return data
def build_kb(self):
tf.logging.info('Indexing KB...')
kb = []
for sub, rel, obj in self.kb_raw:
kb.append([self.dict_all[sub], self.dict_all[rel], self.dict_all[obj]])
return kb
def gen_dict(self):
s = set()
for sub, rel, obj in self.kb_raw:
s.add(sub)
s.add(rel)
s.add(obj)
for name in self.data_raw:
for question, answers in self.data_raw[name]:
normal = re.split(r'\[[^\]]+\]', question)
for phrase in normal:
for word in phrase.split():
s.add(word)
s = sorted(s) # sort so the word-to-id mapping is deterministic across runs
d = {word: idx for idx, word in enumerate(s)}
return d
def build_samples(self):
def map_entity_idx(text):
entities = re.findall(r'\[[^\]]+\]', text)
for entity in entities:
entity = entity[1:-1]
index = self.dict_all[entity]
text = text.replace('[%s]' % entity, '@%d' % index)
return text
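# Worked example (hypothetical): with self.dict_all['paris'] == 17,
# map_entity_idx turns 'where is [paris]' into 'where is @17'; the '@17'
# token is decoded back to index 17 when building qdata below.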
data_all = dict()
for name in self.data_raw:
X, Y, MultiYs, qid = [], [], [], []
for i, (question, answers) in enumerate(self.data_raw[name]):
qdata, labels = [], []
question = map_entity_idx(question)
for word in question.split():
if word[0] == '@':
qdata.append(int(word[1:]))
else:
qdata.append(self.dict_all[word])
for answer in answers.split('|'):
labels.append(self.dict_all[answer])
if len(qdata) > self.config.T_encoder:
self.config.T_encoder = len(qdata)
for label in labels:
X.append(qdata)
Y.append(label)
MultiYs.append(set(labels))
qid.append(i)
data_all[name] = Data(X=X, Y=Y, MultiYs=MultiYs, qid=qid)
return data_all
def _run_prefetch(prefetch_queue, batch_loader, data, shuffle, one_pass,
config):
assert len(data.X) == len(data.Y) == len(data.MultiYs) == len(data.qid)
num_samples = len(data.X)
batch_size = config.batch_size
n_sample = 0
fetch_order = config.rng.permutation(num_samples)
while True:
sample_ids = fetch_order[n_sample:n_sample + batch_size]
batch = batch_loader.load_one_batch(sample_ids)
prefetch_queue.put(batch, block=True)
n_sample += len(sample_ids)
if n_sample >= num_samples:
if one_pass:
prefetch_queue.put(None, block=True)
n_sample = 0
if shuffle:
fetch_order = config.rng.permutation(num_samples)
class DataReader:
def __init__(self,
config,
data,
assembler,
shuffle=True,
one_pass=False,
prefetch_num=10):
self.config = config
self.data = data
self.assembler = assembler
self.batch_loader = BatchLoader(self.config,
self.data, self.assembler)
self.shuffle = shuffle
self.one_pass = one_pass
self.prefetch_queue = Queue(maxsize=prefetch_num)
self.prefetch_thread = threading.Thread(target=_run_prefetch,
args=(self.prefetch_queue,
self.batch_loader, self.data,
self.shuffle, self.one_pass,
self.config))
self.prefetch_thread.daemon = True
self.prefetch_thread.start()
def batches(self):
while True:
if self.prefetch_queue.empty():
tf.logging.warning('Waiting for data loading (IO is slow)...')
batch = self.prefetch_queue.get(block=True)
if batch is None:
assert self.one_pass
tf.logging.info('One pass finished!')
return # end the generator (raising StopIteration here breaks under PEP 479)
yield batch
class BatchLoader:
def __init__(self, config,
data, assembler):
self.config = config
self.data = data
self.assembler = assembler
self.T_encoder = config.T_encoder
self.T_decoder = config.T_decoder
tf.logging.info('T_encoder: %d' % self.T_encoder)
tf.logging.info('T_decoder: %d' % self.T_decoder)
tf.logging.info('batch size: %d' % self.config.batch_size)
self.gt_layout_tokens = config.gt_layout_tokens
def load_one_batch(self, sample_ids):
actual_batch_size = len(sample_ids)
input_seq_batch = np.zeros((self.T_encoder, actual_batch_size), np.int32)
seq_len_batch = np.zeros(actual_batch_size, np.int32)
ans_label_batch = np.zeros(actual_batch_size, np.int32)
ans_set_labels_list = [None] * actual_batch_size
question_id_list = [None] * actual_batch_size
gt_layout_batch = np.zeros((self.T_decoder, actual_batch_size), np.int32)
for batch_i in range(actual_batch_size):
idx = sample_ids[batch_i]
seq_len = len(self.data.X[idx])
seq_len_batch[batch_i] = seq_len
input_seq_batch[:seq_len, batch_i] = self.data.X[idx]
ans_label_batch[batch_i] = self.data.Y[idx]
ans_set_labels_list[batch_i] = self.data.MultiYs[idx]
question_id_list[batch_i] = self.data.qid[idx]
gt_layout_batch[:, batch_i] = self.assembler.module_list2tokens(
self.gt_layout_tokens, self.T_decoder)
batch = dict(input_seq_batch=input_seq_batch,
seq_len_batch=seq_len_batch,
ans_label_batch=ans_label_batch,
gt_layout_batch=gt_layout_batch,
ans_set_labels_list=ans_set_labels_list,
question_id_list=question_id_list)
return batch
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from datetime import datetime
import json
import logging
import os
import tensorflow as tf
import tensorflow.contrib.slim as slim
def prepare_dirs_and_logger(config):
formatter = logging.Formatter('%(asctime)s:%(levelname)s::%(message)s')
logger = logging.getLogger('tensorflow')
for hdlr in logger.handlers:
logger.removeHandler(hdlr)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(tf.logging.INFO)
config.log_dir = os.path.join(config.exp_dir, config.log_dir,
config.train_tag)
config.model_dir = os.path.join(config.exp_dir, config.model_dir,
config.train_tag)
config.output_dir = os.path.join(config.exp_dir, config.output_dir,
config.train_tag)
for path in [
config.log_dir, config.model_dir, config.output_dir
]:
if not os.path.exists(path):
os.makedirs(path)
config.data_files = {
'train': os.path.join(config.data_dir, config.train_data_file),
'dev': os.path.join(config.data_dir, config.dev_data_file),
'test': os.path.join(config.data_dir, config.test_data_file)
}
return config
def get_time():
return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
def show_all_variables():
model_vars = tf.trainable_variables()
slim.model_analyzer.analyze_vars(model_vars, print_info=True)
def save_config(config):
param_path = os.path.join(config.model_dir, 'params.json')
tf.logging.info('log dir: %s' % config.log_dir)
tf.logging.info('model dir: %s' % config.model_dir)
tf.logging.info('param path: %s' % param_path)
tf.logging.info('output dir: %s' % config.output_dir)
with open(param_path, 'w') as f:
f.write(json.dumps(config.__dict__, indent=4, sort_keys=True))
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
def fc_layer(name,
bottom,
output_dim,
bias_term=True,
weights_initializer=None,
biases_initializer=None,
reuse=None):
# flatten bottom input
shape = bottom.get_shape().as_list()
input_dim = 1
for d in shape[1:]:
input_dim *= d
flat_bottom = tf.reshape(bottom, [-1, input_dim])
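# Flattening example (hypothetical shapes): a [N, 4, 4, 8] input becomes
# [N, 128], since input_dim = 4 * 4 * 8 = 128, before the matmul below.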
# weights and biases variables
with tf.variable_scope(name, reuse=reuse):
# initialize the variables
if weights_initializer is None:
weights_initializer = tf.contrib.layers.xavier_initializer()
if bias_term and biases_initializer is None:
biases_initializer = tf.constant_initializer(0.)
# weights has shape [input_dim, output_dim]
weights = tf.get_variable(
'weights', [input_dim, output_dim], initializer=weights_initializer)
if bias_term:
biases = tf.get_variable(
'biases', output_dim, initializer=biases_initializer)
if not reuse:
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
tf.nn.l2_loss(weights))
if bias_term:
fc = tf.nn.xw_plus_b(flat_bottom, weights, biases)
else:
fc = tf.matmul(flat_bottom, weights)
return fc
# REINFORCing Concrete with REBAR
*Implementation of REBAR (and other closely related methods) as described
in "REBAR: Low-variance, unbiased gradient estimates for discrete latent variable models" by
George Tucker, Andriy Mnih, Chris J. Maddison, Dieterich Lawson, and Jascha Sohl-Dickstein ([https://arxiv.org/abs/1703.07370](https://arxiv.org/abs/1703.07370)).*
Learning in models with discrete latent variables is challenging due to high variance gradient estimators. Generally, approaches have relied on control variates to reduce the variance of the REINFORCE estimator. Recent work ([Jang et al. 2016](https://arxiv.org/abs/1611.01144); [Maddison et al. 2016](https://arxiv.org/abs/1611.00712)) has taken a different approach, introducing a continuous relaxation of discrete variables to produce low-variance, but biased, gradient estimates. In this work, we combine the two approaches through a novel control variate that produces low-variance, unbiased gradient estimates. Then, we introduce a novel continuous relaxation and show that the tightness of the relaxation can be adapted online, removing it as a hyperparameter. We show state-of-the-art variance reduction on several benchmark generative modeling tasks, generally leading to faster convergence to a better final log likelihood.
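As a toy illustration of the estimator being controlled (this is not code from this repository, and the objective `f` is made up), the score-function (REINFORCE) gradient for a single Bernoulli latent, with a constant baseline as the simplest control variate, can be sketched as:
```
# Toy sketch: d/dtheta E_{b ~ Bernoulli(sigmoid(theta))}[f(b)] via REINFORCE.
import numpy as np

def reinforce_grad(theta, f, n_samples=100000, baseline=0.0, seed=0):
  rng = np.random.RandomState(seed)
  p = 1.0 / (1.0 + np.exp(-theta))             # p = sigmoid(theta)
  b = (rng.rand(n_samples) < p).astype(np.float64)
  score = b - p   # d/dtheta log Bernoulli(b; sigmoid(theta))
  # Unbiased for any constant baseline, since E[score] = 0.
  return np.mean((f(b) - baseline) * score)

f = lambda b: (b - 0.499) ** 2                 # a hypothetical objective
print(reinforce_grad(0.0, f))                  # plain REINFORCE
print(reinforce_grad(0.0, f, baseline=0.25))   # with a constant control variate
```
REBAR goes further by using the continuous relaxation itself to build an unbiased control variate, rather than a constant baseline.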
REBAR applied to multilayer sigmoid belief networks is implemented in rebar.py, and rebar_train.py provides a training/evaluation setup. As a comparison, we also implemented the following methods:
* [NVIL](https://arxiv.org/abs/1402.0030)
* [MuProp](https://arxiv.org/abs/1511.05176)
* [Gumbel-Softmax](https://arxiv.org/abs/1611.01144)
The code is not optimized and some computation is repeated for ease of
implementation. We hope that this code will be a useful starting point for future research in this area.
## Quick Start:
Requirements:
* TensorFlow (see tensorflow.org for how to install)
* MNIST dataset
* Omniglot dataset
First, fill in the dataset URLs in the download_data.py script, like so:
```
MNIST_URL = 'http://yann.lecun.com/exdb/mnist'
MNIST_BINARIZED_URL = 'http://www.cs.toronto.edu/~larocheh/public/datasets/binarized_mnist'
OMNIGLOT_URL = 'https://github.com/yburda/iwae/raw/master/datasets/OMNIGLOT/chardata.mat'
```
Then run the script to download the data:
```
python download_data.py
```
Then run the training script:
```
python rebar_train.py --hparams="model=SBNDynamicRebar,learning_rate=0.0003,n_layer=2,task=sbn"
```
and you should see something like:
```
Step 2084: [-231.026474 0.3711713 1. 1.06934261 1.07023323
1.02173257 1.02171052 1. 1. 1. 1. ]
-3.6465678215
Step 4168: [-156.86795044 0.3097114 1. 1.03964758 1.03936625
1.02627242 1.02629256 1. 1. 1. 1. ]
-4.42727231979
Step 6252: [-143.4650116 0.26153237 1. 1.03633797 1.03600132
1.02639604 1.02639794 1. 1. 1. 1. ]
-4.85577583313
Step 8336: [-137.65275574 0.22313026 1. 1.03467286 1.03428006
1.02336085 1.02335203 0.99999988 1. 0.99999988
1. ]
-4.95563364029
```
The first number in the list is the log likelihood lower bound and the number
after the list is the log of the variance of the gradient estimator. The rest of
the numbers are for debugging.
We can also compare the variance between methods:
```
python rebar_train.py \
--hparams="model=SBNTrackGradVariances,learning_rate=0.0003,n_layer=2,task=omni"
```
and you should see something like:
```
Step 959: [ -2.60478699e+02 3.84281784e-01 6.31126612e-02 3.27319391e-02
6.13379292e-03 1.98278503e-04 1.96425783e-04 8.83973844e-04
8.70995224e-04 -inf]
('DynamicREBAR', -3.725339889526367)
('MuProp', -0.033569782972335815)
('NVIL', 2.7640280723571777)
('REBAR', -3.539274215698242)
('SimpleMuProp', -0.040744658559560776)
Step 1918: [ -2.06948471e+02 3.35904926e-01 5.20901568e-03 7.81541676e-05
2.06885766e-03 1.08521657e-04 1.07351625e-04 2.30646547e-04
2.26554010e-04 -8.22885323e+00]
('DynamicREBAR', -3.864381790161133)
('MuProp', -0.7183765172958374)
('NVIL', 2.266523599624634)
('REBAR', -3.662022113800049)
('SimpleMuProp', -0.7071359157562256)
```
where the tuples show the log of the variance of the gradient estimators.
The training script has a number of hyperparameter configuration flags
(defaults in parentheses); an example invocation follows the list:
* task (sbn): one of {sbn, sp, omni}, which correspond to MNIST generative
modeling, structured prediction on MNIST, and Omniglot generative modeling,
respectively
* model (SBNGumbel): one of {SBN, SBNNVIL, SBNMuProp, SBNSimpleMuProp,
SBNRebar, SBNDynamicRebar, SBNGumbel, SBNTrackGradVariances}. DynamicRebar
automatically adjusts the temperature, whereas Rebar and Gumbel-Softmax
require tuning the temperature. The models named after a method use that
method to estimate the gradients (SBN refers to REINFORCE).
SBNTrackGradVariances runs multiple methods and follows a single
optimization trajectory
* n_hidden (200): number of hidden nodes per layer
* n_layer (1): number of layers in the model
* nonlinear (false): if true, use two tanh layers between each stochastic
layer; otherwise use a linear layer
* learning_rate (0.001): learning rate
* temperature (0.5): temperature hyperparameter (for DynamicRebar, this is the initial
value of the temperature)
* n_samples (1): number of samples used to compute the gradient estimator (for the
experiments in the paper, set to 1)
* batch_size (24): batch size
* muprop_relaxation (true): if true, use the new relaxation described in the
paper; otherwise use the Concrete/Gumbel-Softmax relaxation
* dynamic_b (false): if true, dynamically binarize the training set. This
increases the effective training set size and reduces overfitting, though
the result is not a standard benchmark dataset
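For example, a run combining several of the flags above (the values here are only illustrative) could look like:
```
python rebar_train.py \
  --hparams="model=SBNDynamicRebar,task=omni,n_layer=2,n_hidden=200,nonlinear=true,learning_rate=0.0003,batch_size=24"
```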
Maintained by George Tucker (gjt@google.com, github user: gjtucker).