Merge branch 'master' into patch-6

78ddf6eb · cclauss · GitHub · 50cb0365 · 1f34fcaf · 78ddf6eb
Unverified Commit 78ddf6eb authored Jan 26, 2018 by cclauss Committed by GitHub Jan 26, 2018
20 changed files
--- a/research/brain_coder/single_task/pg_agent_test.py
+++ b/research/brain_coder/single_task/pg_agent_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""Tests for pg_agent."""
+
+from collections import Counter
+
+from absl import logging
+import numpy as np
+from six.moves import xrange
+import tensorflow as tf
+
+from common import utils  # brain coder
+from single_task import data  # brain coder
+from single_task import defaults  # brain coder
+from single_task import misc  # brain coder
+from single_task import pg_agent as agent_lib  # brain coder
+from single_task import pg_train  # brain coder
+
+
+# Symmetric mean absolute percentage error (SMAPE).
+# https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error
+def smape(a, b):
+  """Returns the symmetric relative error between two scalars.
+
+  Computes 2 * |a - b| / (|a| + |b|). Using absolute values in the denominator
+  keeps the result in [0, 2] for inputs of any sign and avoids dividing by zero
+  when `a == -b`. For non-negative inputs this is identical to dividing by
+  `a + b`.
+
+  Args:
+    a: First scalar value.
+    b: Second scalar value.
+
+  Returns:
+    Float error in [0, 2]. Two zeros are treated as identical, giving 0.0.
+  """
+  denominator = float(abs(a) + abs(b))
+  if denominator == 0.0:
+    # Both inputs are exactly zero, so there is no error to report.
+    return 0.0
+  return 2.0 * abs(a - b) / denominator
+
+
+def onehot(dim, num_dims):
+  """Builds a float32 vector of length `num_dims` that is 1 at index `dim`.
+
+  Args:
+    dim: Index of the single non-zero entry.
+    num_dims: Length of the returned vector.
+
+  Returns:
+    1-D np.float32 array, all zeros except for a 1 at position `dim`.
+  """
+  indicator = np.zeros(num_dims, dtype=np.float32)
+  indicator[dim] = 1
+  return indicator
+
+
+def random_sequence(max_length, num_tokens, eos=0):
+  """Samples a random token sequence that ends with `eos`.
+
+  The number of non-EOS tokens is drawn from [1, max_length - 2] (the upper
+  bound of np.random.randint is exclusive), and each of those tokens is drawn
+  from [1, num_tokens - 1].
+
+  Args:
+    max_length: Controls the sampled length as described above.
+    num_tokens: Exclusive upper bound on the sampled token values.
+    eos: Value appended as the final element.
+
+  Returns:
+    1-D numpy array holding the sampled tokens followed by `eos`.
+  """
+  num_body_tokens = np.random.randint(1, max_length - 1)
+  body_tokens = np.random.randint(1, num_tokens, num_body_tokens)
+  return np.append(body_tokens, eos)
+
+
+def repeat_and_pad(v, rep, total_len):
+  """Returns `v` repeated `rep` times, followed by 0.0 padding.
+
+  Args:
+    v: Value to repeat.
+    rep: Number of copies of `v` at the front of the list.
+    total_len: Target length; `total_len - rep` zeros are appended.
+
+  Returns:
+    A Python list. No padding is added when `rep >= total_len`.
+  """
+  padding = [0.0] * (total_len - rep)
+  return [v] * rep + padding
+
+
+class AgentTest(tf.test.TestCase):
+
+  def testProcessEpisodes(self):
+    """Checks `process_episodes` outputs on a small hand-written batch.
+
+    The reward function assigns each character of the code string its character
+    code as reward. The returns and targets (computed with a2c=True) are then
+    compared against hard-coded expected values.
+    """
+    num_episodes = 3
+
+    def char_code_reward_fn(code_string):
+      per_step_rewards = [float(ord(ch)) for ch in code_string]
+      return misc.RewardInfo(
+          episode_rewards=per_step_rewards,
+          input_case=[],
+          correct_output=[],
+          code_output=[],
+          input_type=misc.IOType.integer,
+          output_type=misc.IOType.integer,
+          reason='none')
+
+    rl_batch = data.RLBatch(
+        reward_fns=[char_code_reward_fn] * num_episodes,
+        batch_size=num_episodes,
+        good_reward=10.0)
+    actions = np.asarray([
+        [4, 5, 3, 6, 8, 1, 0, 0],
+        [1, 2, 3, 4, 0, 0, 0, 0],
+        [8, 7, 6, 5, 4, 3, 2, 1]], dtype=np.int32)
+    values = np.asarray([
+        [0, 1, 2, 1, 0, 1, 1, 0],
+        [0, 2, 1, 2, 1, 0, 0, 0],
+        [0, 1, 1, 0, 0, 0, 1, 1]], dtype=np.float32)
+    lengths = np.asarray([7, 5, 8], dtype=np.int32)
+
+    expected_returns = [
+        [473.0, 428.0, 337.0, 294.0, 201.0, 157.0, 95.0, 0.0],
+        [305.0, 243.0, 183.0, 140.0, 95.0, 0.0, 0.0, 0.0],
+        [484.0, 440.0, 394.0, 301.0, 210.0, 165.0, 122.0, 62.0]]
+    expected_targets = [
+        [473.0, 427.0, 335.0, 293.0, 201.0, 156.0, 94.0, 0.0],
+        [305.0, 241.0, 182.0, 138.0, 94.0, 0.0, 0.0, 0.0],
+        [484.0, 439.0, 393.0, 301.0, 210.0, 165.0, 121.0, 61.0]]
+
+    scores = agent_lib.compute_rewards(rl_batch, actions, lengths)
+    targets, returns = agent_lib.process_episodes(
+        scores.batch_rewards, lengths, a2c=True, batch_values=values)
+
+    self.assertEqual(expected_returns, returns.tolist())
+    self.assertEqual(expected_targets, targets.tolist())
+
+  def testVarUpdates(self):
+    """Tests that variables get updated as expected.
+
+    For the RL update, check that gradients are non-zero and that the global
+    model gets updated.
+    """
+    # Plain SGD with lr=1.0 keeps the expected update simple: the check below
+    # compares each new global value against `var_before - grad * lr`.
+    config = defaults.default_config_with_updates(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",eos_token=True,optimizer="sgd",lr=1.0)')
+    lr = config.agent.lr
+
+    tf.reset_default_graph()
+    trainer = pg_train.AsyncTrainer(
+        config, task_id=0, ps_tasks=0, num_workers=1)
+    # Initializer for every global variable created under the 'global' scope.
+    global_init_op = tf.variables_initializer(
+        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
+    with tf.Session() as sess:
+      sess.run(global_init_op)  # Initialize global copy.
+      trainer.initialize(sess)
+      model = trainer.model
+      # Snapshot parameter values before the update step.
+      global_vars = sess.run(trainer.global_model.trainable_variables)
+      local_vars = sess.run(model.trainable_variables)
+
+      # Make sure names match.
+      g_prefix = 'global/'
+      l_prefix = 'local/'
+      for g, l in zip(trainer.global_model.trainable_variables,
+                      model.trainable_variables):
+        self.assertEqual(g.name[len(g_prefix):], l.name[len(l_prefix):])
+
+      # Assert that shapes and values are the same between global and local
+      # models.
+      for g, l in zip(global_vars, local_vars):
+        self.assertEqual(g.shape, l.shape)
+        self.assertTrue(np.array_equal(g, l))
+
+      # Make all gradients dense tensors.
+      # Only values of existing keys are rebound here, so the dict does not
+      # change size while it is being iterated.
+      for param, grad in model.gradients_dict.items():
+        if isinstance(grad, tf.IndexedSlices):
+          # Converts to dense tensor.
+          model.gradients_dict[param] = tf.multiply(grad, 1.0)
+
+      # Perform update.
+      results = model.update_step(
+          sess, trainer.data_manager.sample_rl_batch(), trainer.train_op,
+          trainer.global_step, return_gradients=True)
+      grads_dict = results.gradients_dict
+      for grad in grads_dict.values():
+        self.assertIsNotNone(grad)
+        self.assertTrue(np.count_nonzero(grad) > 0)
+      global_update = sess.run(trainer.global_model.trainable_variables)
+      # Gradients are looked up by the local tf.Variable, while `var_after` is
+      # read from the global model.
+      for tf_var, var_before, var_after in zip(
+          model.trainable_variables, local_vars, global_update):
+        # Check that the params were updated.
+        self.assertTrue(np.allclose(
+            var_after,
+            var_before - grads_dict[tf_var] * lr))
+
+      # Test that global to local sync works.
+      sess.run(trainer.sync_op)
+      global_vars = sess.run(trainer.global_model.trainable_variables)
+      local_vars = sess.run(model.trainable_variables)
+      for l, g in zip(local_vars, global_vars):
+        self.assertTrue(np.allclose(l, g))
+
+  def testMonteCarloGradients(self):
+    """Test Monte Carlo estimate of REINFORCE gradient.
+
+    Test that the Monte Carlo estimate of the REINFORCE gradient is
+    approximately equal to the true gradient. We compute the true gradient for a
+    toy environment with a very small action space.
+
+    Similar to section 5 of https://arxiv.org/pdf/1505.00521.pdf.
+    """
+    # Test may have different outcome on different machines due to different
+    # rounding behavior of float arithmetic.
+    tf.reset_default_graph()
+    tf.set_random_seed(12345678987654321)
+    np.random.seed(1294024302)
+    max_length = 2
+    num_tokens = misc.bf_num_tokens()
+    eos = misc.BF_EOS_INT
+    self.assertEqual(0, eos)
+
+    def sequence_iterator(max_length):
+      """Iterates through all sequences up to the given length."""
+      yield [eos]
+      for a in xrange(1, num_tokens):
+        if max_length > 1:
+          for sub_seq in sequence_iterator(max_length - 1):
+            yield [a] + sub_seq
+        else:
+          yield [a]
+    actions = list(sequence_iterator(max_length))
+
+    # This batch contains all possible episodes up to max_length.
+    actions_batch = utils.stack_pad(actions, 0)
+    lengths_batch = [len(s) for s in actions]
+
+    reward_map = {tuple(a): np.random.randint(-1, 7) for a in actions_batch}
+    # reward_map = {tuple(a): np.random.normal(3, 1)
+    #               for a in actions_batch}  # normal distribution
+    # reward_map = {tuple(a): 1.0
+    #               for a in actions_batch}  # expected reward is 1
+
+    n = 100000  # MC sample size.
+    config = defaults.default_config_with_updates(
+        'env=c(task="print"),'
+        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
+        'entropy_beta=0.0,topk_loss_hparam=0.0,regularizer=0.0,'
+        'policy_lstm_sizes=[10],eos_token=True),'
+        'batch_size='+str(n)+',timestep_limit='+str(max_length))
+
+    dtype = tf.float64
+    trainer = pg_train.AsyncTrainer(
+        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
+    model = trainer.model
+    actions_ph = model.actions
+    lengths_ph = model.adjusted_lengths
+    multipliers_ph = model.policy_multipliers
+
+    global_init_op = tf.variables_initializer(
+        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
+    with tf.Session() as sess, sess.graph.as_default():
+      sess.run(global_init_op)  # Initialize global copy.
+      trainer.initialize(sess)
+
+      # Compute exact gradients.
+      # exact_grads = sum(P(a) * grad(log P(a)) * R(a) for a in actions_batch)
+      true_loss_unnormalized = 0.0
+      exact_grads = [np.zeros(v.shape) for v in model.trainable_variables]
+      episode_probs_map = {}
+      grads_map = {}
+      for a, length in zip(actions_batch, lengths_batch):
+        episode_key = tuple(a)
+        grads_result, probs_result, loss = sess.run(
+            [model.dense_unclipped_grads, model.chosen_probs, model.loss],
+            {actions_ph: [a],
+             lengths_ph: [length],
+             multipliers_ph: [
+                 repeat_and_pad(reward_map[episode_key], length, max_length)]})
+        # Take product over time axis.
+        episode_probs_result = np.prod(probs_result[0, :length])
+        for var_idx in range(len(exact_grads)):
+          exact_grads[var_idx] += grads_result[var_idx] * episode_probs_result
+        episode_probs_map[episode_key] = episode_probs_result
+        grads_map[episode_key] = grads_result
+        true_loss_unnormalized += loss
+      # Normalize loss. Since each episode is fed into the model one at a time,
+      # normalization needs to be done manually.
+      true_loss = true_loss_unnormalized / float(len(actions_batch))
+
+      # Compute Monte Carlo gradients.
+      # E_a~P[grad(log P(a)) R(a)] is approx. eq. to
+      # sum(grad(log P(a)) R(a) for a in actions_sampled_from_P) / n
+      # where len(actions_sampled_from_P) == n.
+      #
+      # In other words, sample from the policy and compute the gradients of the
+      # log probs weighted by the returns. This will exercise the agent code.
+      sampled_actions, sampled_lengths = sess.run(
+          [model.sampled_tokens, model.episode_lengths])
+      pi_multipliers = [
+          repeat_and_pad(reward_map[tuple(a)], l, max_length)
+          for a, l in zip(sampled_actions, sampled_lengths)]
+      mc_grads_unnormalized, sampled_probs, mc_loss_unnormalized = sess.run(
+          [model.dense_unclipped_grads, model.chosen_probs, model.loss],
+          {actions_ph: sampled_actions,
+           multipliers_ph: pi_multipliers,
+           lengths_ph: sampled_lengths})
+      # Loss is already normalized across the minibatch, so no normalization
+      # is needed.
+      mc_grads = mc_grads_unnormalized
+      mc_loss = mc_loss_unnormalized
+
+    # Make sure true loss and MC loss are similar.
+    loss_error = smape(true_loss, mc_loss)
+    self.assertLess(loss_error, 0.15, msg='actual: %s' % loss_error)
+
+    # Check that probs computed for episodes sampled from the model are the same
+    # as the recorded true probs.
+    for sample_idx in range(100):
+      acs = tuple(sampled_actions[sample_idx].tolist())
+      sampled_prob = np.prod(
+          sampled_probs[sample_idx, :sampled_lengths[sample_idx]])
+      self.assertTrue(np.isclose(episode_probs_map[acs], sampled_prob))
+
+    # Make sure MC estimates of true probs are close.
+    counter = Counter(tuple(e) for e in sampled_actions)
+    # `items()` works on both Python 2 and 3; `iteritems()` is Python 2 only.
+    for acs, count in counter.items():
+      mc_prob = count / float(len(sampled_actions))
+      true_prob = episode_probs_map[acs]
+      error = smape(mc_prob, true_prob)
+      self.assertLess(
+          error, 0.15,
+          msg='actual: %s; count: %s; mc_prob: %s; true_prob: %s'
+          % (error, count, mc_prob, true_prob))
+
+    # Manually recompute MC gradients and make sure they match MC gradients
+    # computed in TF.
+    mc_grads_recompute = [np.zeros(v.shape) for v in model.trainable_variables]
+    for sample_idx in range(n):
+      acs = tuple(sampled_actions[sample_idx].tolist())
+      # Separate index name so the sample index is not shadowed.
+      for var_idx in range(len(mc_grads_recompute)):
+        mc_grads_recompute[var_idx] += grads_map[acs][var_idx]
+    for var_idx in range(len(mc_grads_recompute)):
+      self.assertTrue(
+          np.allclose(mc_grads[var_idx], mc_grads_recompute[var_idx] / n))
+
+    # Check angle between gradients as fraction of pi.
+    for index in range(len(mc_grads)):
+      v1 = mc_grads[index].reshape(-1)
+      v2 = exact_grads[index].reshape(-1)
+      # angle = arccos(v1 . v2 / (|v1|*|v2|))
+      cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
+      # Rounding can push the cosine of nearly parallel vectors slightly past
+      # 1, which would make arccos return NaN and fail the check spuriously.
+      angle_rad = np.arccos(np.clip(cosine, -1.0, 1.0))
+      angle_frac = angle_rad / np.pi
+      logging.info('angle / pi: %s', angle_frac)
+      self.assertLess(angle_frac, 0.02, msg='actual: %s' % angle_frac)
+    # Check norms.
+    for index in range(len(mc_grads)):
+      v1_norm = np.linalg.norm(mc_grads[index].reshape(-1))
+      v2_norm = np.linalg.norm(exact_grads[index].reshape(-1))
+      error = smape(v1_norm, v2_norm)
+      self.assertLess(error, 0.02, msg='actual: %s' % error)
+
+    # Check expected rewards.
+    # E_a~P[R(a)] approx eq sum(P(a) * R(a) for a in actions)
+    mc_expected_reward = np.mean(
+        [reward_map[tuple(a)] for a in sampled_actions])
+    exact_expected_reward = np.sum(
+        [episode_probs_map[k] * reward_map[k] for k in reward_map])
+    error = smape(mc_expected_reward, exact_expected_reward)
+    # Report the reward error itself, not the last gradient angle.
+    self.assertLess(error, 0.005, msg='actual: %s' % error)
+
+  def testNumericalGradChecking(self):
+    """Compares finite-difference gradients with the model's own gradients.
+
+    Each parameter element is perturbed by +/- epsilon, `model.pi_loss` is
+    re-evaluated, and the central-difference estimate is compared against the
+    values fetched from `model.dense_unclipped_grads` on the same feed.
+    """
+    # Similar to
+    # http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization.
+    epsilon = 1e-4
+    eos = misc.BF_EOS_INT
+    self.assertEqual(0, eos)
+    config = defaults.default_config_with_updates(
+        'env=c(task="print"),'
+        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
+        'entropy_beta=0.0,topk_loss_hparam=0.0,policy_lstm_sizes=[10],'
+        'eos_token=True),'
+        'batch_size=64')
+    dtype = tf.float64
+    tf.reset_default_graph()
+    tf.set_random_seed(12345678987654321)
+    np.random.seed(1294024302)
+    trainer = pg_train.AsyncTrainer(
+        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
+    model = trainer.model
+    actions_ph = model.actions
+    lengths_ph = model.adjusted_lengths
+    multipliers_ph = model.policy_multipliers
+    loss = model.pi_loss
+    global_init_op = tf.variables_initializer(
+        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
+
+    # For every trainable variable, build a flat placeholder and an assign_add
+    # op that adds the (reshaped) placeholder to the variable. These are used
+    # below to nudge one parameter element at a time.
+    assign_add_placeholders = [None] * len(model.trainable_variables)
+    assign_add_ops = [None] * len(model.trainable_variables)
+    param_shapes = [None] * len(model.trainable_variables)
+    for i, param in enumerate(model.trainable_variables):
+      param_shapes[i] = param.get_shape().as_list()
+      assign_add_placeholders[i] = tf.placeholder(dtype,
+                                                  np.prod(param_shapes[i]))
+      assign_add_ops[i] = param.assign_add(
+          tf.reshape(assign_add_placeholders[i], param_shapes[i]))
+
+    with tf.Session() as sess:
+      sess.run(global_init_op)  # Initialize global copy.
+      trainer.initialize(sess)
+
+      # Fixed batch of 16 random episodes with all multipliers set to 1.
+      actions_raw = [random_sequence(10, 9) for _ in xrange(16)]
+      actions_batch = utils.stack_pad(actions_raw, 0)
+      lengths_batch = [len(l) for l in actions_raw]
+      feed = {actions_ph: actions_batch,
+              multipliers_ph: np.ones_like(actions_batch),
+              lengths_ph: lengths_batch}
+
+      estimated_grads = [None] * len(model.trainable_variables)
+      for i, param in enumerate(model.trainable_variables):
+        param_size = np.prod(param_shapes[i])
+        estimated_grads[i] = np.zeros(param_size, dtype=np.float64)
+        for index in xrange(param_size):
+          e = onehot(index, param_size) * epsilon
+          # Shift the element by +epsilon and evaluate the loss.
+          sess.run(assign_add_ops[i],
+                   {assign_add_placeholders[i]: e})
+          j_plus = sess.run(loss, feed)
+          # Shift by -2*epsilon (net -epsilon) and evaluate again.
+          sess.run(assign_add_ops[i],
+                   {assign_add_placeholders[i]: -2 * e})
+          j_minus = sess.run(loss, feed)
+          # Shift by +epsilon to undo the perturbation.
+          sess.run(assign_add_ops[i],
+                   {assign_add_placeholders[i]: e})
+          # Central difference estimate of d(loss)/d(param element).
+          estimated_grads[i][index] = (j_plus - j_minus) / (2 * epsilon)
+        estimated_grads[i] = estimated_grads[i].reshape(param_shapes[i])
+
+      analytic_grads = sess.run(model.dense_unclipped_grads, feed)
+
+      # NOTE(review): the first variable's gradients are skipped by the [1:]
+      # slices; the reason is not visible in this file -- confirm.
+      for g1, g2 in zip(estimated_grads[1:], analytic_grads[1:]):
+        logging.info('norm (g1-g2): %s', np.abs(g1 - g2).mean())
+        self.assertTrue(np.allclose(g1, g2))
+
+
+# Hand control to the TensorFlow test runner when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/pg_train.py
+++ b/research/brain_coder/single_task/pg_train.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+r"""Train RL agent on coding tasks."""
+
+import contextlib
+import cPickle
+import cProfile
+import marshal
+import os
+import time
+
+from absl import flags
+from absl import logging
+import tensorflow as tf
+
+# internal session lib import
+
+from single_task import data  # brain coder
+from single_task import defaults  # brain coder
+from single_task import pg_agent as agent_lib  # brain coder
+from single_task import results_lib  # brain coder
+
+
+# Command-line flags for this module. `FLAGS.stop_on_success` is read by
+# AsyncTrainer below.
+FLAGS = flags.FLAGS
+flags.DEFINE_string(
+    'master', '',
+    'URL of the TensorFlow master to use.')
+flags.DEFINE_integer(
+    'ps_tasks', 0,
+    'Number of parameter server tasks. Only set to 0 for '
+    'single worker training.')
+flags.DEFINE_integer(
+    'summary_interval', 10,
+    'How often to write summaries.')
+flags.DEFINE_integer(
+    'summary_tasks', 16,
+    'If greater than 0 only tasks 0 through summary_tasks - 1 '
+    'will write summaries. If 0, all tasks will write '
+    'summaries.')
+flags.DEFINE_bool(
+    'stop_on_success', True,
+    'If True, training will stop as soon as a solution is found. '
+    'If False, training will continue indefinitely until another '
+    'stopping condition is reached.')
+flags.DEFINE_bool(
+    'do_profiling', False,
+    'If True, cProfile profiler will run and results will be '
+    'written to logdir. WARNING: Results will not be written if '
+    'the code crashes. Make sure it exits successfully.')
+flags.DEFINE_integer('model_v', 0, 'Model verbosity level.')
+flags.DEFINE_bool(
+    'delayed_graph_cleanup', True,
+    'If true, container for n-th run will not be reset until the (n+1)-th run '
+    'is complete. This greatly reduces the chance that a worker is still '
+    'using the n-th container when it is cleared.')
+
+
+def define_tuner_hparam_space(hparam_space_type):
+  """Builds the discrete hparam grid for the given search space type.
+
+  Args:
+    hparam_space_type: One of 'pg', 'pg-topk', 'topk' or 'is'.
+
+  Returns:
+    Dict mapping hparam name to the list of discrete values to try.
+
+  Raises:
+    ValueError: If `hparam_space_type` is not one of the supported types.
+  """
+  if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
+    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
+
+  # Only the pure 'topk' space additionally tries a zero entropy bonus.
+  entropy_betas = [0.005, 0.01, 0.05, 0.10]
+  if hparam_space_type == 'topk':
+    entropy_betas = [0.0] + entropy_betas
+
+  # Learning rate and entropy regularization are tuned in every space.
+  hparam_space = {}
+  hparam_space['lr'] = [1e-5, 1e-4, 1e-3]
+  hparam_space['entropy_beta'] = entropy_betas
+
+  if hparam_space_type == 'is':
+    # importance sampling tuning will be enabled.
+    hparam_space['replay_temperature'] = [0.25, 0.5, 1.0, 2.0]
+    hparam_space['alpha'] = [0.5, 0.75, 63/64.]
+  elif hparam_space_type in ('topk', 'pg-topk'):
+    # topk tuning will be enabled.
+    hparam_space['topk'] = [10]
+    hparam_space['topk_loss_hparam'] = [1.0, 10.0, 50.0, 200.0]
+
+  return hparam_space
+
+
+def write_hparams_to_config(config, hparams, hparam_space_type):
+  """Copies tuner-chosen hparams onto `config.agent`.
+
+  Args:
+    config: Config object; its `agent` attribute receives the values.
+    hparams: Object exposing the tuned values as attributes.
+    hparam_space_type: One of 'pg', 'pg-topk', 'topk' or 'is'. Selects which
+        attributes beyond `lr` and `entropy_beta` are copied.
+
+  Raises:
+    ValueError: If `hparam_space_type` is not one of the supported types.
+  """
+  if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
+    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
+
+  field_names = ['lr', 'entropy_beta']
+  if hparam_space_type in ('topk', 'pg-topk'):
+    # topk tuning will be enabled.
+    field_names += ['topk', 'topk_loss_hparam']
+  elif hparam_space_type == 'is':
+    # importance sampling tuning will be enabled.
+    field_names += ['replay_temperature', 'alpha']
+
+  for field_name in field_names:
+    tuned_value = getattr(hparams, field_name)
+    setattr(config.agent, field_name, tuned_value)
+
+
+def make_initialized_variable(value, name, shape=None, dtype=tf.float32):
+  """Creates a non-trainable variable that starts out at a constant value.
+
+  Args:
+    value: Constant the variable is initialized to.
+    name: Name of the variable in the TF graph.
+    shape: Shape of the variable. None produces a scalar.
+    dtype: TF dtype of the variable. Defaults to tf.float32.
+
+  Returns:
+    tf.Variable instance obtained from tf.get_variable with trainable=False.
+  """
+  scalar_shape = []
+  variable_shape = scalar_shape if shape is None else shape
+  constant_init = tf.constant_initializer(value)
+  return tf.get_variable(
+      name=name,
+      shape=variable_shape,
+      initializer=constant_init,
+      dtype=dtype,
+      trainable=False)
+
+
+class AsyncTrainer(object):
+  """Manages graph creation and training.
+
+  This async trainer creates a global model on the parameter server, and a local
+  model (for this worker). Gradient updates are sent to the global model, and
+  the updated weights are synced to the local copy.
+  """
+
+  def __init__(self, config, task_id, ps_tasks, num_workers, is_chief=True,
+               summary_writer=None,
+               dtype=tf.float32,
+               summary_interval=1,
+               run_number=0,
+               logging_dir='/tmp', model_v=0):
+    """Builds the global and local models plus the sync and train ops.
+
+    Args:
+      config: Config object passed to data.DataManager and agent_lib.LMAgent.
+      task_id: Id of this worker. Used in the worker device string and in the
+          names of the per-task log and buffer files.
+      ps_tasks: Number of parameter server tasks. When 0, `task_id` must be 0
+          and `num_workers` must be 1.
+      num_workers: Number of workers; only validated by assertions here.
+      is_chief: Stored on the instance; other methods use it to gate
+          chief-only work.
+      summary_writer: Stored on the instance for writing summaries.
+      dtype: TF dtype passed to both LMAgent instances.
+      summary_interval: Number of local steps between summary writes.
+      run_number: Passed to data.DataManager and used as the initial value of
+          the `run_number` variable.
+      logging_dir: Directory in which the solutions, replay buffer and top-k
+          buffer file paths are placed.
+      model_v: Passed to the local LMAgent as `verbose_level`.
+    """
+    self.config = config
+    # Reads the global FLAGS: code simplification is enabled only when
+    # training does not stop on the first solution.
+    self.data_manager = data.DataManager(
+        config, run_number=run_number,
+        do_code_simplification=not FLAGS.stop_on_success)
+    self.task_id = task_id
+    self.ps_tasks = ps_tasks
+    self.is_chief = is_chief
+    if ps_tasks == 0:
+      assert task_id == 0, 'No parameter servers specified. Expecting 1 task.'
+      assert num_workers == 1, (
+          'No parameter servers specified. Expecting 1 task.')
+      worker_device = '/job:localhost/replica:%d/task:0/cpu:0' % task_id
+      # worker_device = '/cpu:0'
+      # ps_device = '/cpu:0'
+    else:
+      assert num_workers > 0, 'There must be at least 1 training worker.'
+      worker_device = '/job:worker/replica:%d/task:0/cpu:0' % task_id
+      # ps_device = '/job:ps/replica:0/task:0/cpu:0'
+    logging.info('worker_device: %s', worker_device)
+
+    # Per-task file paths inside `logging_dir`.
+    logging_file = os.path.join(
+        logging_dir, 'solutions_%d.txt' % task_id)
+    experience_replay_file = os.path.join(
+        logging_dir, 'replay_buffer_%d.pickle' % task_id)
+    self.topk_file = os.path.join(
+        logging_dir, 'topk_buffer_%d.pickle' % task_id)
+
+    tf.get_variable_scope().set_use_resource(True)
+
+    # global model
+    with tf.device(tf.train.replica_device_setter(ps_tasks,
+                                                  ps_device='/job:ps/replica:0',
+                                                  worker_device=worker_device)):
+      with tf.variable_scope('global'):
+        global_model = agent_lib.LMAgent(config, dtype=dtype, is_local=False)
+        global_params_dict = {p.name: p
+                              for p in global_model.sync_variables}
+        self.global_model = global_model
+        self.global_step = make_initialized_variable(
+            0, 'global_step', dtype=tf.int64)
+
+        # Best reward seen so far, plus a flag recording that a new best was
+        # just assigned.
+        self.global_best_reward = make_initialized_variable(
+            -10.0, 'global_best_reward', dtype=tf.float64)
+        self.is_best_model = make_initialized_variable(
+            False, 'is_best_model', dtype=tf.bool)
+        self.reset_is_best_model = self.is_best_model.assign(False)
+        self.global_best_reward_placeholder = tf.placeholder(
+            tf.float64, [], name='global_best_reward_placeholder')
+        self.assign_global_best_reward_op = tf.group(
+            self.global_best_reward.assign(
+                self.global_best_reward_placeholder),
+            self.is_best_model.assign(True))
+        def assign_global_best_reward_fn(session, reward):
+          # Both values are rounded to 10 decimals before comparing. The read
+          # and the assign are separate session.run calls.
+          reward = round(reward, 10)
+          best_reward = round(session.run(self.global_best_reward), 10)
+          is_best = reward > best_reward
+          if is_best:
+            session.run(self.assign_global_best_reward_op,
+                        {self.global_best_reward_placeholder: reward})
+          return is_best
+        self.assign_global_best_reward_fn = assign_global_best_reward_fn
+
+        # Any worker will set to true when it finds a solution.
+        self.found_solution_flag = make_initialized_variable(
+            False, 'found_solution_flag', dtype=tf.bool)
+        self.found_solution_op = self.found_solution_flag.assign(True)
+
+        self.run_number = make_initialized_variable(
+            run_number, 'run_number', dtype=tf.int32)
+
+        # Store a solution when found.
+        self.code_solution_variable = tf.get_variable(
+            'code_solution', [], tf.string,
+            initializer=tf.constant_initializer(''))
+        self.code_solution_ph = tf.placeholder(
+            tf.string, [], name='code_solution_ph')
+        self.code_solution_assign_op = self.code_solution_variable.assign(
+            self.code_solution_ph)
+        def assign_code_solution_fn(session, code_solution_string):
+          session.run(self.code_solution_assign_op,
+                      {self.code_solution_ph: code_solution_string})
+        self.assign_code_solution_fn = assign_code_solution_fn
+
+        # Count all programs sampled from policy. This does not include
+        # programs sampled from replay buffer.
+        # This equals NPE (number of programs executed). Only programs sampled
+        # from the policy need to be executed.
+        self.program_count = make_initialized_variable(
+            0, 'program_count', dtype=tf.int64)
+
+    # local model
+    with tf.device(worker_device):
+      with tf.variable_scope('local'):
+        self.model = model = agent_lib.LMAgent(
+            config,
+            task_id=task_id,
+            logging_file=logging_file,
+            experience_replay_file=experience_replay_file,
+            dtype=dtype,
+            global_best_reward_fn=self.assign_global_best_reward_fn,
+            found_solution_op=self.found_solution_op,
+            assign_code_solution_fn=self.assign_code_solution_fn,
+            program_count=self.program_count,
+            stop_on_success=FLAGS.stop_on_success,
+            verbose_level=model_v)
+        local_params = model.trainable_variables
+        local_params_dict = {p.name: p for p in local_params}
+
+    # Pull global params to local model.
+    def _global_to_local_scope(name):
+      assert name.startswith('global/')
+      # name[6:] keeps the '/' that follows 'global', so 'global/x' becomes
+      # 'local/x'.
+      return 'local' + name[6:]
+    # Maps each local variable to the global variable of the same name.
+    sync_dict = {
+        local_params_dict[_global_to_local_scope(p_name)]: p
+        for p_name, p in global_params_dict.items()}
+    self.sync_op = tf.group(*[v_local.assign(v_global)
+                              for v_local, v_global
+                              in sync_dict.items()])
+
+    # Pair local gradients with global params.
+    grad_var_dict = {
+        gradient: sync_dict[local_var]
+        for local_var, gradient in model.gradients_dict.items()}
+
+    # local model
+    model.make_summary_ops()  # Don't put summaries under 'local' scope.
+    with tf.variable_scope('local'):
+      # Gradients come from the local model but are applied to the global
+      # variables they were paired with above.
+      self.train_op = model.optimizer.apply_gradients(
+          grad_var_dict.items(), global_step=self.global_step)
+      self.local_init_op = tf.variables_initializer(
+          tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
+                            tf.get_variable_scope().name))
+
+    self.local_step = 0
+    self.last_summary_time = time.time()
+    self.summary_interval = summary_interval
+    self.summary_writer = summary_writer
+    # -1 marks "not fetched yet"; initialize() fills these in.
+    self.cached_global_step = -1
+    self.cached_global_npe = -1
+
+    logging.info('summary_interval: %d', self.summary_interval)
+
+    # Load top-k buffer.
+    # NOTE(review): cPickle.loads can execute arbitrary code from the file;
+    # this assumes `topk_file` is only ever written by this trainer -- confirm.
+    if self.model.top_episodes is not None and tf.gfile.Exists(self.topk_file):
+      try:
+        with tf.gfile.FastGFile(self.topk_file, 'r') as f:
+          self.model.top_episodes = cPickle.loads(f.read())
+        logging.info(
+            'Loaded top-k buffer from disk with %d items. Location: "%s"',
+            len(self.model.top_episodes), self.topk_file)
+      except (cPickle.UnpicklingError, EOFError) as e:
+        # An unreadable file is deleted so later saves start from scratch.
+        logging.warn(
+            'Failed to load existing top-k buffer from disk. Removing bad file.'
+            '\nLocation: "%s"\nException: %s', self.topk_file, str(e))
+        tf.gfile.Remove(self.topk_file)
+
+  def initialize(self, session):
+    """Initializes local variables, syncs weights and caches global counters.
+
+    Args:
+      session: tf.Session instance used to run the ops.
+    """
+    session.run(self.local_init_op)
+    session.run(self.sync_op)
+    # Remember the current global step and program count.
+    global_step, global_npe = session.run(
+        [self.global_step, self.program_count])
+    self.cached_global_step = global_step
+    self.cached_global_npe = global_npe
+
+  def update_global_model(self, session):
+    """Run an update step.
+
+    1) Asynchronously copy global weights to local model.
+    2) Call into local model's update_step method, which does the following:
+        a) Sample batch of programs from policy.
+        b) Compute rewards.
+        c) Compute gradients and update the global model asynchronously.
+    3) Write tensorboard summaries to disk.
+
+    Args:
+      session: tf.Session instance.
+    """
+    session.run(self.sync_op)  # Copy weights from global to local.
+
+    with session.as_default():
+      result = self.model.update_step(
+          session, self.data_manager.sample_rl_batch(), self.train_op,
+          self.global_step)
+      global_step = result.global_step
+      global_npe = result.global_npe
+      summaries = result.summaries_list
+    self.cached_global_step = global_step
+    self.cached_global_npe = global_npe
+    self.local_step += 1
+
+    if self.summary_writer and self.local_step % self.summary_interval == 0:
+      # Always build a fresh list: a tuple has no `append`, and appending to
+      # the list owned by `result` would modify the caller-visible result.
+      if isinstance(summaries, (tuple, list)):
+        summaries = list(summaries)
+      else:
+        summaries = [summaries]
+      summaries.append(self._local_step_summary())
+      if self.is_chief:
+        # Only the chief reports the shared global statistics.
+        (global_best_reward,
+         found_solution_flag,
+         program_count) = session.run(
+             [self.global_best_reward,
+              self.found_solution_flag,
+              self.program_count])
+        chief_scalars = (
+            ('model/best_reward', global_best_reward),
+            ('model/solution_found', int(found_solution_flag)),
+            ('model/program_count', program_count))
+        for tag, scalar in chief_scalars:
+          summaries.append(
+              tf.Summary(
+                  value=[tf.Summary.Value(tag=tag, simple_value=scalar)]))
+      for s in summaries:
+        self.summary_writer.add_summary(s, global_step)
+      self.last_summary_time = time.time()
+
+  def _local_step_summary(self):
+    """Builds a summary with the local step rate and the local step count.
+
+    The rate is `summary_interval` divided by the seconds elapsed since
+    `last_summary_time`.
+
+    Returns:
+      tf.Summary with the tags 'local_step/per_sec' and 'local_step/step'.
+    """
+    elapsed_secs = time.time() - self.last_summary_time
+    steps_per_sec = self.summary_interval / float(elapsed_secs)
+    rate_value = tf.Summary.Value(
+        tag='local_step/per_sec', simple_value=steps_per_sec)
+    step_value = tf.Summary.Value(
+        tag='local_step/step', simple_value=self.local_step)
+    return tf.Summary(value=[rate_value, step_value])
+
+  def maybe_save_best_model(self, session, saver, checkpoint_file):
+    """Saves a checkpoint if the `is_best_model` flag is set (chief only).
+
+    Args:
+      session: tf.Session instance.
+      saver: Object whose `save(session, checkpoint_file)` writes the
+          checkpoint.
+      checkpoint_file: Destination passed to `saver.save`.
+    """
+    if not self.is_chief:
+      return
+    if not session.run(self.is_best_model):
+      return
+    logging.info('Saving best model to "%s"', checkpoint_file)
+    saver.save(session, checkpoint_file)
+    # Clear the flag so the same model is not saved again.
+    session.run(self.reset_is_best_model)
+
+  def save_replay_buffer(self):
+    """Persist the experience replay buffer, if the model has one.
+
+    Intended to be called periodically so that training can recover if jobs
+    go down. Does nothing when `self.model.experience_replay` is None.
+    """
+    replay = self.model.experience_replay
+    if replay is None:
+      return
+    logging.info('Saving experience replay buffer to "%s".',
+                 replay.save_file)
+    replay.incremental_save(True)
+
+  def delete_replay_buffer(self):
+    """Remove the experience replay buffer file, if the model has a buffer.
+
+    Intended to be called at the end of training to clean up, since the
+    replay buffer can get very large. Does nothing when
+    `self.model.experience_replay` is None.
+    """
+    replay = self.model.experience_replay
+    if replay is None:
+      return
+    buffer_path = replay.save_file
+    logging.info('Deleting experience replay buffer at "%s".', buffer_path)
+    tf.gfile.Remove(buffer_path)
+
+  def save_topk_buffer(self):
+    """Pickle the top-k buffer to `self.topk_file`, if the model has one.
+
+    Intended to be called periodically so that training can recover if jobs
+    go down. Does nothing when `self.model.top_episodes` is None.
+    """
+    top_episodes = self.model.top_episodes
+    if top_episodes is None:
+      return
+    logging.info('Saving top-k buffer to "%s".', self.topk_file)
+    # Mode 'w' replaces whatever an earlier save wrote.
+    with tf.gfile.FastGFile(self.topk_file, 'w') as topk_writer:
+      topk_writer.write(cPickle.dumps(top_episodes))
+
+
+@contextlib.contextmanager
+def managed_session(sv, master='', config=None,
+                    start_standard_services=True,
+                    close_summary_writer=True,
+                    max_wait_secs=7200):
+  """Context manager yielding a session obtained from supervisor `sv`.
+
+  On exit the supervisor is always stopped and the session is closed.
+
+  Args:
+    sv: Supervisor-like object; `prepare_or_wait_for_session`, `request_stop`
+        and `stop` are called on it.
+    master: Forwarded to `sv.prepare_or_wait_for_session`.
+    config: Forwarded to `sv.prepare_or_wait_for_session`.
+    start_standard_services: Forwarded to `sv.prepare_or_wait_for_session`.
+    close_summary_writer: Forwarded to `sv.stop` on exit.
+    max_wait_secs: Forwarded to `sv.prepare_or_wait_for_session`.
+
+  Yields:
+    The session returned by `sv.prepare_or_wait_for_session`.
+
+  Raises:
+    tf.errors.DeadlineExceededError: Re-raised unchanged. Any other exception
+        is handed to `sv.request_stop` instead of being re-raised here.
+  """
+  # Same as Supervisor.managed_session, but with configurable timeout.
+  try:
+    sess = sv.prepare_or_wait_for_session(
+        master=master, config=config,
+        start_standard_services=start_standard_services,
+        max_wait_secs=max_wait_secs)
+    yield sess
+  except tf.errors.DeadlineExceededError:
+    # Let the caller see the timeout; this clause must precede the broad
+    # handler below, which would otherwise absorb it.
+    raise
+  except Exception as e:  # pylint: disable=broad-except
+    sv.request_stop(e)
+  finally:
+    try:
+      # Request all the threads to stop and wait for them to do so.  Any
+      # exception raised by the threads is raised again from stop().
+      # NOTE(review): the original comment here described passing
+      # stop_grace_period_secs for blocked enqueue/dequeue threads, but no
+      # such argument is passed to stop() below.  Those threads
+      # will be stopped when we close the session further down.
+      sv.stop(close_summary_writer=close_summary_writer)
+    finally:
+      # Close the session to finish up all pending calls.  We do not care
+      # about exceptions raised when closing.  This takes care of
+      # blocked enqueue/dequeue calls.
+      try:
+        # If prepare_or_wait_for_session raised, `sess` was never bound; the
+        # resulting NameError is swallowed by the handler below as well.
+        sess.close()
+      except Exception:  # pylint: disable=broad-except
+        # Silently ignore exceptions raised by close().
+        pass
+
+
+def train(config, is_chief, tuner=None, run_dir=None, run_number=0,
+          results_writer=None):
+  """Run training loop.
+
+  Builds an `AsyncTrainer` and a `tf.train.Supervisor`, then repeatedly calls
+  `trainer.update_global_model` inside a managed session until one of the
+  stopping conditions is met. The session is re-created and training resumed
+  when `tf.errors.AbortedError` is raised.
+
+  Args:
+    config: config_lib.Config instance containing global config (agent and env).
+    is_chief: True if this worker is chief. Chief worker manages writing some
+        data to disk and initialization of the global model.
+    tuner: A tuner instance. If not tuning, leave as None.
+    run_dir: Directory where all data for this run will be written. If None,
+        run_dir = FLAGS.logdir. Set this argument when doing multiple runs.
+    run_number: Which run is this.
+    results_writer: Manages writing training results to disk. Results are a
+        dict of metric names and values.
+
+  Returns:
+    The trainer object used to run training updates.
+  """
+  logging.info('Will run asynchronous training.')
+
+  if run_dir is None:
+    run_dir = FLAGS.logdir
+  train_dir = os.path.join(run_dir, 'train')
+  best_model_checkpoint = os.path.join(train_dir, 'best.ckpt')
+  events_dir = '%s/events_%d' % (run_dir, FLAGS.task_id)
+  logging.info('Events directory: %s', events_dir)
+
+  logging_dir = os.path.join(run_dir, 'logs')
+  if not tf.gfile.Exists(logging_dir):
+    tf.gfile.MakeDirs(logging_dir)
+  status_file = os.path.join(logging_dir, 'status.txt')
+
+  # Only workers with task_id below FLAGS.summary_tasks write summaries.
+  if FLAGS.summary_tasks and FLAGS.task_id < FLAGS.summary_tasks:
+    summary_writer = tf.summary.FileWriter(events_dir)
+  else:
+    summary_writer = None
+
+  # Profile when requested.
+  # NOTE(review): the original comment said "Only profile task 0", but no
+  # task_id check is made here.
+  if FLAGS.do_profiling:
+    logging.info('Profiling enabled')
+    profiler = cProfile.Profile()
+    profiler.enable()
+  else:
+    profiler = None
+
+  trainer = AsyncTrainer(
+      config, FLAGS.task_id, FLAGS.ps_tasks, FLAGS.num_workers,
+      is_chief=is_chief,
+      summary_interval=FLAGS.summary_interval,
+      summary_writer=summary_writer,
+      logging_dir=logging_dir,
+      run_number=run_number,
+      model_v=FLAGS.model_v)
+
+  # Only variables whose names start with 'global' are initialized by the
+  # Supervisor's init op and written to checkpoints.
+  variables_to_save = [v for v in tf.global_variables()
+                       if v.name.startswith('global')]
+  global_init_op = tf.variables_initializer(variables_to_save)
+  saver = tf.train.Saver(variables_to_save)
+
+  var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
+                               tf.get_variable_scope().name)
+  logging.info('Trainable vars:')
+  for v in var_list:
+    logging.info('  %s, %s, %s', v.name, v.device, v.get_shape())
+
+  logging.info('All vars:')
+  for v in tf.global_variables():
+    logging.info('  %s, %s, %s', v.name, v.device, v.get_shape())
+
+  def init_fn(unused_sess):
+    logging.info('No checkpoint found. Initialized global params.')
+
+  sv = tf.train.Supervisor(is_chief=is_chief,
+                           logdir=train_dir,
+                           saver=saver,
+                           summary_op=None,
+                           init_op=global_init_op,
+                           init_fn=init_fn,
+                           summary_writer=summary_writer,
+                           ready_op=tf.report_uninitialized_variables(
+                               variables_to_save),
+                           ready_for_local_init_op=None,
+                           global_step=trainer.global_step,
+                           save_model_secs=30,
+                           save_summaries_secs=30)
+
+  # Add a thread that periodically checks if this Trial should stop
+  # based on an early stopping policy.
+  if tuner:
+    sv.Loop(60, tuner.check_for_stop, (sv.coord,))
+
+  last_replay_save_time = time.time()
+
+  # Placeholder so the final log line has a value even if no session starts.
+  global_step = -1
+  logging.info(
+      'Starting session. '
+      'If this hangs, we\'re mostly likely waiting to connect '
+      'to the parameter server. One common cause is that the parameter '
+      'server DNS name isn\'t resolving yet, or is misspecified.')
+  should_retry = True
+  supervisor_deadline_exceeded = False
+  # Each iteration opens a fresh managed session; the loop repeats only when
+  # an AbortedError handler below sets should_retry back to True.
+  while should_retry:
+    try:
+      with managed_session(
+          sv, FLAGS.master, max_wait_secs=60) as session, session.as_default():
+        should_retry = False
+        do_training = True
+
+        try:
+          trainer.initialize(session)
+          if session.run(trainer.run_number) != run_number:
+            # If we loaded existing model from disk, and the saved run number is
+            # different, throw an exception.
+            raise RuntimeError(
+                'Expecting to be on run %d, but is actually on run %d. '
+                'run_dir: "%s"'
+                % (run_number, session.run(trainer.run_number), run_dir))
+          global_step = trainer.cached_global_step
+          logging.info('Starting training at step=%d', global_step)
+          while do_training:
+            trainer.update_global_model(session)
+
+            if is_chief:
+              trainer.maybe_save_best_model(
+                  session, saver, best_model_checkpoint)
+            global_step = trainer.cached_global_step
+            global_npe = trainer.cached_global_npe
+
+            # Save the replay and top-k buffers at most once every 30 seconds.
+            if time.time() - last_replay_save_time >= 30:
+              trainer.save_replay_buffer()
+              trainer.save_topk_buffer()
+              last_replay_save_time = time.time()
+
+            # Stopping conditions.
+            if tuner and tuner.should_trial_stop():
+              logging.info('Tuner requested early stopping. Finishing.')
+              do_training = False
+            if is_chief and FLAGS.stop_on_success:
+              found_solution = session.run(trainer.found_solution_flag)
+              if found_solution:
+                do_training = False
+                logging.info('Solution found. Finishing.')
+            if FLAGS.max_npe and global_npe >= FLAGS.max_npe:
+              # Max NPE (number of programs executed) reached.
+              logging.info('Max NPE reached. Finishing.')
+              do_training = False
+            if sv.should_stop():
+              logging.info('Supervisor issued stop. Finishing.')
+              do_training = False
+
+        except tf.errors.NotFoundError:
+          # Catch "Error while reading resource variable".
+          # The chief worker likely destroyed the container, so do not retry.
+          logging.info('Caught NotFoundError. Quitting.')
+          do_training = False
+          should_retry = False
+          # Leaves the retry loop, so the results block below is skipped.
+          break
+        except tf.errors.InternalError as e:
+          # Catch "Invalid variable reference."
+          if str(e).startswith('Invalid variable reference.'):
+            # The chief worker likely destroyed the container, so do not
+            # retry.
+            logging.info(
+                'Caught "InternalError: Invalid variable reference.". '
+                'Quitting.')
+            do_training = False
+            should_retry = False
+            # Leaves the retry loop, so the results block below is skipped.
+            break
+          else:
+            # Pass exception through.
+            raise
+
+        # Exited training loop. Write results to disk.
+        if is_chief and results_writer:
+          assert not should_retry
+          with tf.gfile.FastGFile(status_file, 'w') as f:
+            f.write('done')
+          # Note: this overwrites the cached `global_step` with the value
+          # read from the session.
+          (program_count,
+           found_solution,
+           code_solution,
+           best_reward,
+           global_step) = session.run(
+               [trainer.program_count,
+                trainer.found_solution_flag,
+                trainer.code_solution_variable,
+                trainer.global_best_reward,
+                trainer.global_step])
+          results_dict = {
+              'max_npe': FLAGS.max_npe,
+              'batch_size': config.batch_size,
+              'max_batches': FLAGS.max_npe // config.batch_size,
+              'npe': program_count,
+              'max_global_repetitions': FLAGS.num_repetitions,
+              'max_local_repetitions': FLAGS.num_repetitions,
+              'code_solution': code_solution,
+              'best_reward': best_reward,
+              'num_batches': global_step,
+              'found_solution': found_solution,
+              'task': trainer.data_manager.task_name,
+              'global_rep': run_number}
+          logging.info('results_dict: %s', results_dict)
+          results_writer.append(results_dict)
+
+    except tf.errors.AbortedError:
+      # Catch "Graph handle is not found" error due to preempted jobs.
+      logging.info('Caught AbortedError. Retying.')
+      should_retry = True
+    except tf.errors.DeadlineExceededError:
+      # managed_session re-raises this when waiting for the session times out.
+      supervisor_deadline_exceeded = True
+      should_retry = False
+
+  if is_chief:
+    logging.info('This is chief worker. Stopping all workers.')
+    sv.stop()
+
+  if supervisor_deadline_exceeded:
+    logging.info('Supervisor timed out. Quitting.')
+  else:
+    logging.info('Reached %s steps. Worker stopped.', global_step)
+
+  # Dump profiling.
+  """
+  How to use profiling data.
+
+  Download the profiler dump to your local machine, say to PROF_FILE_PATH.
+  In a separate script, run something like the following:
+
+  import pstats
+  p = pstats.Stats(PROF_FILE_PATH)
+  p.strip_dirs().sort_stats('cumtime').print_stats()
+
+  This will sort by 'cumtime', which "is the cumulative time spent in this and
+  all subfunctions (from invocation till exit)."
+  https://docs.python.org/2/library/profile.html#instant-user-s-manual
+  """  # pylint: disable=pointless-string-statement
+  if profiler:
+    prof_file = os.path.join(run_dir, 'task_%d.prof' % FLAGS.task_id)
+    logging.info('Done profiling.\nDumping to "%s".', prof_file)
+    profiler.create_stats()
+    with tf.gfile.Open(prof_file, 'w') as f:
+      f.write(marshal.dumps(profiler.stats))
+
+  return trainer
+
+
+def run_training(config=None, tuner=None, logdir=None, trial_name=None,
+                 is_chief=True):
+  """Do all training runs.
+
+  This is the top level training function for policy gradient based models.
+  Run this from the main function.
+
+  Repetitions are run one after another until the number of results on disk
+  reaches FLAGS.num_repetitions, so an interrupted experiment resumes from
+  the first repetition that has no result yet.
+
+  Args:
+    config: config_lib.Config instance containing global config (agent and
+        environment hparams). If None, config will be parsed from FLAGS.config.
+    tuner: A tuner instance. Leave as None if not tuning.
+    logdir: Parent directory where all data from all runs will be written. If
+        None, FLAGS.logdir will be used.
+    trial_name: If tuning, set this to a unique string that identifies this
+        trial. If `tuner` is not None, this also must be set.
+    is_chief: True if this worker is the chief.
+
+  Returns:
+    List of results dicts which were written to disk. Each training run gets a
+    results dict. Results dict contains metrics, i.e. (name, value) pairs which
+    give information about the training run.
+
+  Raises:
+    ValueError: If results dicts read from disk contain invalid data.
+  """
+  if not config:
+    # If custom config is not given, get it from flags.
+    config = defaults.default_config_with_updates(FLAGS.config)
+  if not logdir:
+    logdir = FLAGS.logdir
+  if not tf.gfile.Exists(logdir):
+    tf.gfile.MakeDirs(logdir)
+  assert FLAGS.num_repetitions > 0
+  results = results_lib.Results(logdir)
+  results_list, _ = results.read_all()
+
+  logging.info('Starting experiment. Directory: "%s"', logdir)
+
+  # When resuming, refuse to continue with settings that differ from the ones
+  # recorded by the first finished repetition.
+  if results_list:
+    if results_list[0]['max_npe'] != FLAGS.max_npe:
+      # Format the message here: exceptions do not apply %-formatting to
+      # extra positional arguments the way logging calls do.
+      raise ValueError(
+          'Cannot resume training. Max-NPE changed. Was %s, now %s'
+          % (results_list[0]['max_npe'], FLAGS.max_npe))
+    if results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
+      raise ValueError(
+          'Cannot resume training. Number of repetitions changed. Was %s, '
+          'now %s'
+          % (results_list[0]['max_global_repetitions'],
+             FLAGS.num_repetitions))
+
+  while len(results_list) < FLAGS.num_repetitions:
+    run_number = len(results_list)
+    rep_container_name = trial_name if trial_name else 'container'
+    # With several repetitions, each one gets its own sub-directory and its
+    # own container name.
+    if FLAGS.num_repetitions > 1:
+      rep_dir = os.path.join(logdir, 'run_%d' % run_number)
+      rep_container_name = rep_container_name + '_run_' + str(run_number)
+    else:
+      rep_dir = logdir
+
+    logging.info(
+        'Starting repetition %d (%d out of %d)', run_number, run_number + 1,
+        FLAGS.num_repetitions)
+
+    # Train will write result to disk.
+    with tf.container(rep_container_name):
+      trainer = train(config, is_chief, tuner, rep_dir, run_number, results)
+    logging.info('Done training.')
+
+    if is_chief:
+      # Destroy current container immediately (clears current graph).
+      logging.info('Clearing shared variables.')
+      tf.Session.reset(FLAGS.master, containers=[rep_container_name])
+      logging.info('Shared variables cleared.')
+
+      # Delete replay buffer on disk.
+      assert trainer
+      trainer.delete_replay_buffer()
+    else:
+      # Give chief worker time to clean up.
+      sleep_sec = 30.0
+      logging.info('Sleeping for %s sec.', sleep_sec)
+      time.sleep(sleep_sec)
+    tf.reset_default_graph()
+    logging.info('Default graph reset.')
+
+    # Expecting that train wrote new result to disk before returning.
+    results_list, _ = results.read_all()
+  return results_list
--- a/research/brain_coder/single_task/pg_train_test.py
+++ b/research/brain_coder/single_task/pg_train_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""Tests for pg_train.
+
+These tests exercise code paths available through configuration options.
+Training will be run for just a few steps with the goal being to check that
+nothing crashes.
+"""
+
+from absl import flags
+import tensorflow as tf
+
+from single_task import defaults  # brain coder
+from single_task import run  # brain coder
+
+FLAGS = flags.FLAGS
+
+
+class TrainTest(tf.test.TestCase):
+  """Smoke tests that train briefly under different agent configurations."""
+
+  # Every test config shares the same environment and size limits; only the
+  # agent section in between differs.
+  _ENV_PREFIX = 'env=c(task="reverse"),'
+  _LIMITS_SUFFIX = 'timestep_limit=90,batch_size=64'
+
+  def RunTrainingSteps(self, config_string, num_steps=10):
+    """Run a few training steps with the given config.
+
+    Just check that nothing crashes.
+
+    Args:
+      config_string: Config encoded in a string. See
+          $REPO_PATH/common/config_lib.py
+      num_steps: Number of training steps to run. Defaults to 10.
+    """
+    parsed_config = defaults.default_config_with_updates(config_string)
+    flag_overrides = [
+        ('master', ''),
+        # Program budget equal to `num_steps` batches.
+        ('max_npe', num_steps * parsed_config.batch_size),
+        ('summary_interval', 1),
+        ('logdir', tf.test.get_temp_dir()),
+        ('config', config_string),
+    ]
+    for flag_name, flag_value in flag_overrides:
+      setattr(FLAGS, flag_name, flag_value)
+    tf.reset_default_graph()
+    run.main(None)
+
+  def _RunWithAgent(self, agent_section):
+    """Run training steps with `agent_section` placed between the shared parts."""
+    self.RunTrainingSteps(
+        self._ENV_PREFIX + agent_section + self._LIMITS_SUFFIX)
+
+  def testVanillaPolicyGradient(self):
+    self._RunWithAgent('agent=c(algorithm="pg"),')
+
+  def testVanillaPolicyGradient_VariableLengthSequences(self):
+    self._RunWithAgent('agent=c(algorithm="pg",eos_token=False),')
+
+  def testVanillaActorCritic(self):
+    self._RunWithAgent('agent=c(algorithm="pg",ema_baseline_decay=0.0),')
+
+  def testPolicyGradientWithTopK(self):
+    self._RunWithAgent(
+        'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10),')
+
+  def testVanillaActorCriticWithTopK(self):
+    self._RunWithAgent(
+        'agent=c(algorithm="pg",ema_baseline_decay=0.0,topk_loss_hparam=1.0,'
+        'topk=10),')
+
+  def testPolicyGradientWithTopK_VariableLengthSequences(self):
+    self._RunWithAgent(
+        'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10,eos_token=False),')
+
+  def testPolicyGradientWithImportanceSampling(self):
+    self._RunWithAgent('agent=c(algorithm="pg",alpha=0.5),')
+
+
+# Run the tests in this file when it is executed directly.
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/results_lib.py
+++ b/research/brain_coder/single_task/results_lib.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""Results object manages distributed reading and writing of results to disk."""
+
+import ast
+from collections import namedtuple
+import os
+import re
+from six.moves import xrange
+import tensorflow as tf
+
+
+# Progress record for one results shard: how many repetitions it has written,
+# how many it declares it must write, and whether it is finished.
+_SHARD_STATS_FIELDS = (
+    'num_local_reps_completed', 'max_local_reps', 'finished')
+ShardStats = namedtuple('ShardStats', _SHARD_STATS_FIELDS)
+
+
+def ge_non_zero(a, b):
+  """Return True when `b` is positive and `a` is at least `b`."""
+  return 0 < b <= a
+
+
+def get_shard_id(file_name):
+  """Parse the integer shard id out of a results file name.
+
+  The id is the text between the last underscore and the '.txt' extension,
+  e.g. 'experiment_results_7.txt' gives 7.
+
+  Args:
+    file_name: File name ending in '.txt' (case-insensitive).
+
+  Returns:
+    The shard id as an int.
+  """
+  extension = file_name[-4:]
+  assert extension.lower() == '.txt'
+  stem = file_name[:-4]
+  # rfind returns -1 when there is no underscore, so the whole stem is used.
+  return int(stem[stem.rfind('_') + 1:])
+
+
+class Results(object):
+  """Manages reading and writing training results to disk asynchronously.
+
+  Each worker writes to its own file, so that there are no race conditions when
+  writing happens. However any worker may read any file, as is the case for
+  `read_all`. Writes are expected to be atomic so that workers will never
+  read incomplete data, and this is likely to be the case on Unix systems.
+  Reading out of date data is fine, as workers calling `read_all` will wait
+  until data from every worker has been written before proceeding.
+  """
+  # Shard file name pattern; `{0}` is replaced by the shard id.
+  file_template = 'experiment_results_{0}.txt'
+  # Matches file names produced by `file_template`. It is only used to filter
+  # directory listings; the id itself is parsed by `get_shard_id`.
+  search_regex = r'^experiment_results_([0-9])+\.txt$'
+
+  def __init__(self, log_dir, shard_id=0):
+    """Construct `Results` instance.
+
+    Args:
+      log_dir: Where to write results files.
+      shard_id: Unique id for this file (i.e. shard). Each worker that will
+          be writing results should use a different shard id. If there are
+          N shards, each shard should be numbered 0 through N-1.
+    """
+    # Use different files for workers so that they can write to disk async.
+    assert 0 <= shard_id
+    self.file_name = self.file_template.format(shard_id)
+    self.log_dir = log_dir
+    self.results_file = os.path.join(self.log_dir, self.file_name)
+
+  def append(self, metrics):
+    """Append results to results list on disk.
+
+    Args:
+      metrics: Results to record. Written as `str(metrics)` on its own line,
+          so it must round-trip through `ast.literal_eval` to be readable.
+    """
+    with tf.gfile.FastGFile(self.results_file, 'a') as writer:
+      writer.write(str(metrics) + '\n')
+
+  def read_this_shard(self):
+    """Read only from this shard."""
+    return self._read_shard(self.results_file)
+
+  def _read_shard(self, results_file):
+    """Read only from the given shard file.
+
+    Args:
+      results_file: Path of the shard file to read.
+
+    Returns:
+      List with one parsed entry per line of the file, or an empty list if the
+      file does not exist.
+    """
+    try:
+      with tf.gfile.FastGFile(results_file, 'r') as reader:
+        results = [ast.literal_eval(entry) for entry in reader]
+    except tf.errors.NotFoundError:
+      # No results written to disk yet. Return empty list.
+      return []
+    return results
+
+  def _get_max_local_reps(self, shard_results):
+    """Get maximum number of repetitions the given shard needs to complete.
+
+    Worker working on each shard needs to complete a certain number of runs
+    before it finishes. This method will return that number so that we can
+    determine which shards are still not done.
+
+    We assume that workers are including a 'max_local_repetitions' value in
+    their results, which should be the total number of repetitions it needs to
+    run.
+
+    Args:
+      shard_results: List of results dicts read from one shard on disk. Each
+          dict maps metric names to values.
+
+    Returns:
+      Maximum number of repetitions the given shard needs to complete, or 0 if
+      `shard_results` is empty.
+    """
+    mlrs = [r['max_local_repetitions'] for r in shard_results]
+    if not mlrs:
+      return 0
+    for n in mlrs[1:]:
+      assert n == mlrs[0], 'Some reps have different max rep.'
+    return mlrs[0]
+
+  def read_all(self, num_shards=None):
+    """Read results across all shards, i.e. get global results list.
+
+    Args:
+      num_shards: (optional) specifies total number of shards. If the caller
+          wants information about which shards are incomplete, provide this
+          argument (so that shards which have yet to be created are still
+          counted as incomplete shards). Otherwise, no information about
+          incomplete shards will be returned.
+
+    Returns:
+      aggregate: Global list of results (across all shards).
+      shard_stats: List of ShardStats instances, one for each shard. Or None if
+          `num_shards` is None.
+    """
+    try:
+      all_children = tf.gfile.ListDirectory(self.log_dir)
+    except tf.errors.NotFoundError:
+      # The log directory does not exist yet, so nothing has been written.
+      if num_shards is None:
+        return [], None
+      # Report every expected shard as empty and unfinished. These are the
+      # same stats the code below computes for a shard without results, so
+      # callers always receive ShardStats instances as documented.
+      empty_shard = ShardStats(
+          num_local_reps_completed=0, max_local_reps=0, finished=False)
+      return [], [empty_shard] * num_shards
+    shard_ids = {
+        get_shard_id(fname): fname
+        for fname in all_children if re.search(self.search_regex, fname)}
+
+    if num_shards is None:
+      # No shard bookkeeping requested: concatenate whatever files exist.
+      aggregate = []
+      shard_stats = None
+      for results_file in shard_ids.values():
+        aggregate.extend(self._read_shard(
+            os.path.join(self.log_dir, results_file)))
+    else:
+      # Shards without a file on disk are treated as empty.
+      results_per_shard = []
+      for shard_id in xrange(num_shards):
+        if shard_id in shard_ids:
+          results_per_shard.append(self._read_shard(
+              os.path.join(self.log_dir, shard_ids[shard_id])))
+        else:
+          results_per_shard.append([])
+
+      # Compute shard stats.
+      shard_stats = []
+      for shard_results in results_per_shard:
+        max_local_reps = self._get_max_local_reps(shard_results)
+        shard_stats.append(ShardStats(
+            num_local_reps_completed=len(shard_results),
+            max_local_reps=max_local_reps,
+            finished=ge_non_zero(len(shard_results), max_local_reps)))
+
+      # Compute aggregate.
+      aggregate = [
+          r for shard_results in results_per_shard for r in shard_results]
+
+    return aggregate, shard_stats
--- a/research/brain_coder/single_task/results_lib_test.py
+++ b/research/brain_coder/single_task/results_lib_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""Tests for results_lib."""
+
+import contextlib
+import os
+import shutil
+import tempfile
+from six.moves import xrange
+import tensorflow as tf
+
+from single_task import results_lib  # brain coder
+
+
+@contextlib.contextmanager
+def temporary_directory(suffix='', prefix='tmp', base_path=None):
+  """A context manager to create a temporary directory and clean up on exit.
+
+  The parameters are the same ones expected by tempfile.mkdtemp.
+  The directory will be securely and atomically created.
+  Everything under it will be removed when exiting the context.
+
+  Args:
+    suffix: optional suffix.
+    prefix: optional prefix.
+    base_path: the base path under which to create the temporary directory.
+  Yields:
+    The absolute path of the new temporary directory.
+  """
+  temp_dir_path = tempfile.mkdtemp(suffix, prefix, base_path)
+  try:
+    yield temp_dir_path
+  finally:
+    try:
+      shutil.rmtree(temp_dir_path)
+    except OSError:
+      # shutil.rmtree refuses to operate on a symbolic link. Detect that case
+      # by inspecting the path rather than the exception text: exceptions
+      # have no `.message` attribute on Python 3.
+      if os.path.islink(temp_dir_path):
+        # Means we received a symlink from mkdtemp.
+        # Also means must clean up the symlink instead.
+        os.unlink(temp_dir_path)
+      else:
+        raise
+
+
+def freeze(dictionary):
+  """Convert dict to hashable frozenset.
+
+  Args:
+    dictionary: Dict whose keys and values are all hashable.
+
+  Returns:
+    A frozenset of the dict's (key, value) pairs.
+  """
+  # `items()` exists on both Python 2 and 3; `iteritems()` is Python 2 only.
+  return frozenset(dictionary.items())
+
+
+class ResultsLibTest(tf.test.TestCase):
+  """Round-trip tests for writing and reading results through `Results`."""
+
+  def testResults(self):
+    """Entries appended to a single shard are read back in order."""
+    first_entry = {'foo': 1.5, 'bar': 2.5, 'baz': 0}
+    second_entry = {'foo': 5.5, 'bar': -1, 'baz': 2}
+    with temporary_directory() as logdir:
+      results_obj = results_lib.Results(logdir)
+      # Nothing has been written yet.
+      self.assertEqual(results_obj.read_this_shard(), [])
+      for entry in (first_entry, second_entry):
+        results_obj.append(entry)
+      self.assertEqual(
+          results_obj.read_this_shard(), [first_entry, second_entry])
+
+  def testShardedResults(self):
+    """`read_all` gathers the entries written by every shard."""
+    num_shards = 4
+    expected_entries = [
+        {'foo': i, 'bar': 1 + i * 2} for i in xrange(num_shards)]
+    with temporary_directory() as logdir:
+      shards = [
+          results_lib.Results(logdir, shard_id=i) for i in xrange(num_shards)]
+      for shard, entry in zip(shards, expected_entries):
+        shard.append(entry)
+      results_list, _ = shards[0].read_all()
+
+      # Check results. Order does not matter here.
+      self.assertEqual(
+          {freeze(r) for r in results_list},
+          {freeze(entry) for entry in expected_entries})
+
+
+# Run the tests in this file when it is executed directly.
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/run.py
+++ b/research/brain_coder/single_task/run.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+r"""Run training.
+
+Choose training algorithm and task(s) and follow these examples.
+
+Run synchronous policy gradient training locally:
+
+CONFIG="agent=c(algorithm='pg'),env=c(task='reverse')"
+OUT_DIR="/tmp/bf_pg_local"
+rm -rf $OUT_DIR
+bazel run -c opt single_task:run -- \
+    --alsologtostderr \
+    --config="$CONFIG" \
+    --max_npe=0 \
+    --logdir="$OUT_DIR" \
+    --summary_interval=1 \
+    --model_v=0
+learning/brain/tensorboard/tensorboard.sh --port 12345 --logdir "$OUT_DIR"
+
+
+Run genetic algorithm locally:
+
+CONFIG="agent=c(algorithm='ga'),env=c(task='reverse')"
+OUT_DIR="/tmp/bf_ga_local"
+rm -rf $OUT_DIR
+bazel run -c opt single_task:run -- \
+    --alsologtostderr \
+    --config="$CONFIG" \
+    --max_npe=0 \
+    --logdir="$OUT_DIR"
+
+
+Run uniform random search locally:
+
+CONFIG="agent=c(algorithm='rand'),env=c(task='reverse')"
+OUT_DIR="/tmp/bf_rand_local"
+rm -rf $OUT_DIR
+bazel run -c opt single_task:run -- \
+    --alsologtostderr \
+    --config="$CONFIG" \
+    --max_npe=0 \
+    --logdir="$OUT_DIR"
+"""
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from single_task import defaults  # brain coder
+from single_task import ga_train  # brain coder
+from single_task import pg_train  # brain coder
+
+FLAGS = flags.FLAGS
+
+# Command-line flags defined by this module.
+flags.DEFINE_string(name='config', default='', help='Configuration.')
+flags.DEFINE_string(
+    name='logdir',
+    default=None,
+    help='Absolute path where to write results.')
+flags.DEFINE_integer(name='task_id', default=0, help='ID for this worker.')
+flags.DEFINE_integer(
+    name='num_workers', default=1, help='How many workers there are.')
+flags.DEFINE_integer(
+    name='max_npe',
+    default=0,
+    help=(
+        'NPE = number of programs executed. Maximum number of programs to execute '
+        'in each run. Training will complete when this threshold is reached. Set '
+        'to 0 for unlimited training.'))
+flags.DEFINE_integer(
+    name='num_repetitions',
+    default=1,
+    help=(
+        'Number of times the same experiment will be run (globally across all '
+        'workers). Each run is independent.'))
+flags.DEFINE_string(
+    name='log_level',
+    default='INFO',
+    help=(
+        'The threshold for what messages will be logged. One of DEBUG, INFO, WARN, '
+        'ERROR, or FATAL.'))
+
+
+# Registry of training algorithms, keyed by the algorithm name used in the
+# config. Each value is the module that implements training and tuning for
+# that algorithm (see the docstring of `get_namespace` for what the module
+# must provide).
+#
+# Steps to register a new algorithm:
+# 1) In the BUILD file, add the algorithm as a dependency of this build rule.
+# 2) Import the algorithm's module at the top of this file.
+# 3) Add an entry for it below.
+ALGORITHM_REGISTRATION = {}
+ALGORITHM_REGISTRATION['pg'] = pg_train
+# 'ga' and 'rand' are both handled by the same module.
+ALGORITHM_REGISTRATION['ga'] = ga_train
+ALGORITHM_REGISTRATION['rand'] = ga_train
+
+
+def get_namespace(config_string):
+  """Look up the module registered for the algorithm named in the config.
+
+  To support a new algorithm type, register a module that provides these
+  functions:
+    run_training: Run the main training loop.
+    define_tuner_hparam_space: Return the hparam tuning space for the algo.
+    write_hparams_to_config: Helper for tuning. Write hparams chosen for tuning
+        to the Config object.
+  pg_train.py and ga_train.py show the expected signatures and
+  implementations.
+
+  Args:
+    config_string: String representation of a Config object. It is parsed
+        into a Config to find out which algorithm was requested.
+
+  Returns:
+    algorithm_namespace: The module registered for the configured algorithm.
+    config: The Config object obtained by parsing `config_string`.
+
+  Raises:
+    ValueError: If config.agent.algorithm is not a registered algorithm.
+  """
+  config = defaults.default_config_with_updates(config_string)
+  algorithm = config.agent.algorithm
+  if algorithm in ALGORITHM_REGISTRATION:
+    return ALGORITHM_REGISTRATION[algorithm], config
+  raise ValueError('Unknown algorithm type "%s"' % (algorithm,))
+
+
+def main(argv):
+  """Validate flags, then run training for the configured algorithm.
+
+  Args:
+    argv: Unused command-line arguments.
+
+  Raises:
+    ValueError: If the logdir flag is unset, or if the num_workers / task_id
+        flags are out of range.
+  """
+  del argv  # Unused.
+
+  logging.set_verbosity(FLAGS.log_level)
+
+  # Flag validators only run when flags are parsed or assigned. By the time
+  # main() runs the flags have already been parsed, so this registration
+  # alone never rejects a missing --logdir; check the value explicitly too.
+  flags.mark_flag_as_required('logdir')
+  if FLAGS.logdir is None:
+    raise ValueError('logdir flag must be specified.')
+  if FLAGS.num_workers <= 0:
+    raise ValueError('num_workers flag must be greater than 0.')
+  if FLAGS.task_id < 0:
+    raise ValueError('task_id flag must be greater than or equal to 0.')
+  if FLAGS.task_id >= FLAGS.num_workers:
+    raise ValueError(
+        'task_id flag must be strictly less than num_workers flag.')
+
+  ns, _ = get_namespace(FLAGS.config)
+  # The worker with task_id 0 acts as the chief.
+  ns.run_training(is_chief=FLAGS.task_id == 0)
+
+
+if __name__ == '__main__':
+  # Hand main() to app.run when this file is executed as a script.
+  app.run(main)
--- a/research/brain_coder/single_task/run_eval_tasks.py
+++ b/research/brain_coder/single_task/run_eval_tasks.py
+#!/usr/bin/env python
+r"""This script can launch any eval experiments from the paper.
+
+This is a script. Run with python, not bazel.
+
+Usage:
+./single_task/run_eval_tasks.py \
+    --exp EXP --desc DESC [--tuning_tasks] [--iclr_tasks] \
+    [--regression_tests] [--task TASK] [--tasks TASK1 TASK2 ...]
+
+where EXP is one of the keys in `experiments`,
+and DESC is a string description of the set of experiments (such as "v0")
+
+Set only one of these flags:
+--tuning_tasks flag only runs tuning tasks.
+--iclr_tasks flag only runs the tasks included in the paper.
+--regression_tests flag runs tasks which function as regression tests.
+--task flag manually selects a single task to run.
+--tasks flag takes a custom list of tasks.
+
+Other flags:
+--reps N specifies N repetitions per experiment. Default is 25.
+--training_replicas R specifies that R workers will be launched to train one
+    task (for neural network algorithms). These workers will update a global
+    model stored on a parameter server. Defaults to 1. If R > 1, a parameter
+    server will also be launched.
+
+
+Run everything:
+exps=( pg-20M pg-topk-20M topk-20M ga-20M rand-20M )
+BIN_DIR="single_task"
+for exp in "${exps[@]}"
+do
+  ./$BIN_DIR/run_eval_tasks.py \
+      --exp "$exp" --iclr_tasks
+done
+"""
+
+from __future__ import print_function
+
+import argparse
+from collections import namedtuple
+import subprocess
+
+
+# Per-task settings. `length` is forwarded to make_run_cmd as `code_length`.
+S = namedtuple('S', ['length'])
+default_length = 100
+
+
+# Task names launched when --iclr_tasks is given.
+iclr_tasks = [
+    'reverse', 'remove-char', 'count-char', 'add', 'bool-logic', 'print-hello',
+    'echo-twice', 'echo-thrice', 'copy-reverse', 'zero-cascade', 'cascade',
+    'shift-left', 'shift-right', 'riffle', 'unriffle', 'middle-char',
+    'remove-last', 'remove-last-two', 'echo-alternating', 'echo-half', 'length',
+    'echo-second-seq', 'echo-nth-seq', 'substring', 'divide-2', 'dedup']
+
+
+# Task names launched when --regression_tests is given.
+regression_test_tasks = ['reverse', 'test-hill-climb']
+
+
+# Settings for one experiment; see make_experiment_settings.
+E = namedtuple('E', 'name method_type config simplify batch_size max_npe')
+
+
+def make_experiment_settings(name, **kwargs):
+  """Build an `E` whose method type and NPE budget are decoded from `name`.
+
+  `name` has the form '<method_type>-<max_npe>', split at the LAST dash, e.g.
+  'pg-topk-20M' gives method_type 'pg-topk' and max_npe 20000000. The budget
+  may use the (case-insensitive) suffixes K, M and G.
+
+  Args:
+    name: Experiment name.
+    **kwargs: Remaining `E` fields (config, simplify, batch_size).
+
+  Returns:
+    The populated `E` namedtuple.
+
+  Raises:
+    ValueError: If `name` has no dash or the budget is not an integer.
+  """
+  dash = name.rindex('-')
+  method_type = name[:dash]
+  npe_text = name[dash + 1:]
+  assert method_type
+  assert npe_text
+  # Expand SI suffixes by textual substitution with the matching zeros.
+  digits = npe_text.upper()
+  for suffix, num_zeros in (('K', 3), ('M', 6), ('G', 9)):
+    digits = digits.replace(suffix, '0' * num_zeros)
+  return E(
+      name=name, method_type=method_type, max_npe=int(digits), **kwargs)
+
+
+# All known experiments. Each name encodes '<method_type>-<max_npe>' (see
+# make_experiment_settings); `config` is spliced into the agent config string
+# built by make_run_cmd.
+experiments_set = {
+    make_experiment_settings(
+        'pg-20M',
+        config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.0,topk=0,'
+               'pi_loss_hparam=1.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'pg-topk-20M',
+        config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=50.0,topk=10,'
+               'pi_loss_hparam=1.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'topk-20M',
+        config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
+               'pi_loss_hparam=0.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'topk-0ent-20M',
+        config='entropy_beta=0.000,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
+               'pi_loss_hparam=0.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'ga-20M',
+        config='crossover_rate=0.95,mutation_rate=0.15',
+        simplify=False,
+        batch_size=100),  # Population size.
+    make_experiment_settings(
+        'rand-20M',
+        config='',
+        simplify=False,
+        batch_size=1),
+    make_experiment_settings(
+        'simpl-500M',
+        config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.5,topk=10,'
+               'pi_loss_hparam=1.0,alpha=0.0',
+        simplify=True,
+        batch_size=64),
+}
+
+# Lookup by experiment name; the --exp argument must be one of these keys.
+experiments = {e.name: e for e in experiments_set}
+
+
+# pylint: disable=redefined-outer-name
+def parse_args(extra_args=()):
+  """Parse arguments and extract task and experiment info.
+
+  Task selection flags are checked in this order and the first one set wins:
+  --tasks, --task, --tuning_tasks, --iclr_tasks, --regression_tests.
+
+  Args:
+    extra_args: Iterable of (arg_string, default) pairs. Each is registered as
+        an additional command-line argument with that default. No type is
+        applied, so a value given on the command line arrives as a string.
+
+  Returns:
+    args: The parsed argparse namespace.
+    tasks: Dict mapping each selected task name to its `S` settings.
+    experiment_settings: The entry of `experiments` selected by --exp.
+
+  Raises:
+    ValueError: If --exp is not a key of `experiments`, or if no task
+        selection flag was given.
+  """
+  parser = argparse.ArgumentParser(description='Run all eval tasks.')
+  parser.add_argument('--exp', required=True)
+  parser.add_argument('--tuning_tasks', action='store_true')
+  parser.add_argument('--iclr_tasks', action='store_true')
+  parser.add_argument('--regression_tests', action='store_true')
+  parser.add_argument('--desc', default='v0')
+  parser.add_argument('--reps', type=int, default=25)
+  parser.add_argument('--task')
+  parser.add_argument('--tasks', nargs='+')
+  for arg_string, default in extra_args:
+    parser.add_argument(arg_string, default=default)
+  args = parser.parse_args()
+
+  print('Running experiment: %s' % (args.exp,))
+  if args.desc:
+    print('Extra description: "%s"' % (args.desc,))
+  if args.exp not in experiments:
+    raise ValueError('Experiment name is not valid')
+  experiment_name = args.exp
+  experiment_settings = experiments[experiment_name]
+  assert experiment_settings.name == experiment_name
+
+  if args.tasks:
+    print('Launching tasks from args: %s' % (args.tasks,))
+    task_names = args.tasks
+  elif args.task:
+    print('Launching single task "%s"' % args.task)
+    task_names = [args.task]
+  elif args.tuning_tasks:
+    print('Only running tuning tasks')
+    task_names = ['reverse-tune', 'remove-char-tune']
+  elif args.iclr_tasks:
+    print('Running eval tasks from ICLR paper.')
+    task_names = iclr_tasks
+  elif args.regression_tests:
+    task_names = regression_test_tasks
+  else:
+    # Previously this fell through and failed on the unbound name `tasks`.
+    raise ValueError(
+        'No tasks selected. Set one of --tasks, --task, --tuning_tasks, '
+        '--iclr_tasks or --regression_tests.')
+  tasks = {name: S(length=default_length) for name in task_names}
+  # list() keeps the output readable on Python 3, where keys() is a view.
+  print('Tasks: %s' % (list(tasks),))
+
+  print('reps = %d' % (int(args.reps),))
+
+  return args, tasks, experiment_settings
+
+
+def run(command_string):
+  """Run `command_string` through the shell and wait for it to finish.
+
+  The exit status returned by subprocess.call is discarded, so a failing
+  command does not stop the caller.
+
+  Args:
+    command_string: Full shell command line to execute.
+  """
+  subprocess.call(command_string, shell=True)
+
+
+if __name__ == '__main__':
+  LAUNCH_TRAINING_COMMAND = 'single_task/launch_training.sh'
+  COMPILE_COMMAND = 'bazel build -c opt single_task:run.par'
+
+  args, tasks, experiment_settings = parse_args(
+      extra_args=(('--training_replicas', 1),))
+  # The flag is registered without a type, so it is the int default when
+  # omitted but a string when passed on the command line. Normalize it once
+  # so that `--training_replicas 1` does not request a parameter server.
+  training_replicas = int(args.training_replicas)
+
+  if experiment_settings.method_type in (
+      'pg', 'pg-topk', 'topk', 'topk-0ent', 'simpl'):
+    # Runs PG and TopK.
+
+    def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
+                     batch_size, do_simplify, custom_config_str):
+      """Constructs terminal command for launching NN based algorithms.
+
+      The arguments to this function will be used to create config for the
+      experiment.
+
+      Args:
+        job_name: Name of the job to launch. Should uniquely identify this
+            experiment run.
+        task: Name of the coding task to solve.
+        max_npe: Maximum number of programs executed. An integer.
+        num_reps: Number of times to run the experiment. An integer.
+        code_length: Maximum allowed length of synthesized code.
+        batch_size: Minibatch size for gradient descent.
+        do_simplify: Whether to run the experiment in code simplification mode.
+            A bool.
+        custom_config_str: Additional config for the model config string.
+
+      Returns:
+        The terminal command that launches the specified experiment.
+      """
+      config = """
+        env=c(task='{0}',correct_syntax=False),
+        agent=c(
+          algorithm='pg',
+          policy_lstm_sizes=[35,35],value_lstm_sizes=[35,35],
+          grad_clip_threshold=50.0,param_init_factor=0.5,regularizer=0.0,
+          softmax_tr=1.0,optimizer='rmsprop',ema_baseline_decay=0.99,
+          eos_token={3},{4}),
+        timestep_limit={1},batch_size={2}
+      """.replace(' ', '').replace('\n', '').format(
+          task, code_length, batch_size, do_simplify, custom_config_str)
+      # A parameter server is only launched with more than one replica.
+      num_ps = 0 if training_replicas == 1 else 1
+      return (
+          r'{0} --job_name={1} --config="{2}" --max_npe={3} '
+          '--num_repetitions={4} --num_workers={5} --num_ps={6} '
+          '--stop_on_success={7}'
+          .format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
+                  training_replicas, num_ps, str(not do_simplify).lower()))
+
+  else:
+    # Runs GA and Rand.
+    assert experiment_settings.method_type in ('ga', 'rand')
+
+    def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
+                     batch_size, do_simplify, custom_config_str):
+      """Constructs terminal command for launching GA or uniform random search.
+
+      The arguments to this function will be used to create config for the
+      experiment.
+
+      Args:
+        job_name: Name of the job to launch. Should uniquely identify this
+            experiment run.
+        task: Name of the coding task to solve.
+        max_npe: Maximum number of programs executed. An integer.
+        num_reps: Number of times to run the experiment. An integer.
+        code_length: Maximum allowed length of synthesized code.
+        batch_size: Batch size written to the config. For GA the experiment
+            table uses this as the population size.
+        do_simplify: Whether to run the experiment in code simplification mode.
+            A bool. Must be False for these algorithms.
+        custom_config_str: Additional config for the model config string.
+
+      Returns:
+        The terminal command that launches the specified experiment.
+      """
+      assert not do_simplify
+      if custom_config_str:
+        custom_config_str = ',' + custom_config_str
+      config = """
+        env=c(task='{0}',correct_syntax=False),
+        agent=c(
+          algorithm='{4}'
+          {3}),
+        timestep_limit={1},batch_size={2}
+      """.replace(' ', '').replace('\n', '').format(
+          task, code_length, batch_size, custom_config_str,
+          experiment_settings.method_type)
+      num_workers = num_reps  # Do each rep in parallel.
+      return (
+          r'{0} --job_name={1} --config="{2}" --max_npe={3} '
+          '--num_repetitions={4} --num_workers={5} --num_ps={6} '
+          '--stop_on_success={7}'
+          .format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
+                  num_workers, 0, str(not do_simplify).lower()))
+
+  print('Compiling...')
+  run(COMPILE_COMMAND)
+
+  print('Launching %d coding tasks...' % len(tasks))
+  # items() works on both Python 2 and 3; iteritems() is Python 2 only.
+  for task, task_settings in tasks.items():
+    name = 'bf_rl_iclr'
+    desc = '{0}.{1}_{2}'.format(args.desc, experiment_settings.name, task)
+    job_name = '{}.{}'.format(name, desc)
+    print('Job name: %s' % job_name)
+    # Simplification experiments are always run once.
+    reps = int(args.reps) if not experiment_settings.simplify else 1
+    run_cmd = make_run_cmd(
+        job_name, task, experiment_settings.max_npe, reps,
+        task_settings.length, experiment_settings.batch_size,
+        experiment_settings.simplify,
+        experiment_settings.config)
+    print('Running command:\n' + run_cmd)
+    run(run_cmd)
+
+  print('Done.')
+# pylint: enable=redefined-outer-name
--- a/research/brain_coder/single_task/test_tasks.py
+++ b/research/brain_coder/single_task/test_tasks.py
+"""Tasks that test correctness of algorithms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import xrange
+from common import reward as reward_lib  # brain coder
+from single_task import misc  # brain coder
+
+
+class BasicTaskManager(object):
+  """Adapts a token-level reward function so it can score code strings."""
+
+  def __init__(self, reward_fn):
+    # `reward_fn` is called with a token sequence and must return a
+    # (reward, correct) pair.
+    self.good_reward = 1.0
+    self.reward_fn = reward_fn
+
+  def _score_string(self, string):
+    """Score one code string and package the result as a misc.RewardInfo."""
+    tokens = misc.bf_string_to_tokens(string)
+    reward, is_correct = self.reward_fn(tokens)
+    # The whole reward is placed on the final step; earlier steps get zero.
+    step_rewards = [0.0] * (len(string) - 1)
+    step_rewards.append(reward)
+    if is_correct:
+      reason = 'correct'
+    else:
+      reason = 'wrong'
+    return misc.RewardInfo(
+        episode_rewards=step_rewards,
+        input_case=None,
+        correct_output=None,
+        code_output=tokens,
+        input_type=None,
+        output_type=misc.IOType.integer,
+        reason=reason)
+
+  def rl_batch(self, batch_size):
+    """Return a list holding the scoring function `batch_size` times."""
+    scorer = self._score_string
+    return [scorer] * batch_size
+
+
+class Trie(object):
+  """Trie for sequences.
+
+  Nodes are nested dicts keyed by sequence element, so elements must be
+  hashable. The end of an inserted sequence is marked with the `EOS` key;
+  elements equal to `EOS` would be mistaken for that marker.
+  """
+  EOS = ()
+
+  def __init__(self):
+    self.trie = {}
+
+  def insert(self, sequence):
+    """Add `sequence` to the trie."""
+    d = self.trie
+    for e in sequence:
+      if e not in d:
+        d[e] = {}
+      d = d[e]
+    d[self.EOS] = True   # Terminate sequence.
+
+  def prefix_match(self, sequence):
+    """Return prefix of `sequence` which exists in the trie.
+
+    Args:
+      sequence: List of elements to look up.
+
+    Returns:
+      prefix: The longest leading part of `sequence` that follows a path in
+          the trie. This is `sequence` itself on a complete match.
+      complete: True only if all of `sequence` matched and an inserted
+          sequence ends exactly there.
+    """
+    d = self.trie
+    index = 0
+    # EOS is appended so that the final step tests for a terminated sequence.
+    for i, e in enumerate(sequence + [self.EOS]):
+      index = i
+      if e in d:
+        d = d[e]
+        if e == self.EOS:
+          return sequence, True
+      else:
+        break
+    return sequence[:index], False
+
+  def next_choices(self, sequence):
+    """Return the list of elements that can follow `sequence` in the trie.
+
+    The list contains `EOS` when an inserted sequence ends at `sequence`.
+
+    Raises:
+      ValueError: If `sequence` is not a path in the trie.
+    """
+    d = self.trie
+    for e in sequence:
+      if e in d:
+        d = d[e]
+      else:
+        raise ValueError('Sequence not a prefix: %s' % (sequence,))
+    # Return a real list: on Python 3 keys() is a view, which never compares
+    # equal to a list such as [Trie.EOS].
+    return list(d.keys())
+
+
+class HillClimbingTask(object):
+  """Simple task that tests reward hill climbing ability.
+
+  There are a set of paths (sequences of tokens) which are rewarded. The total
+  reward for a path is proportional to its length, so the longest path is the
+  target. Shorter paths can be dead ends.
+  """
+
+  def __init__(self):
+    # Paths are sequences of sub-sequences. Here we form unique sub-sequences
+    # out of 3 arbitrary ints. We use sub-sequences instead of single entities
+    # to make the task harder by making the episodes last longer, i.e. more
+    # for the agent to remember.
+    a = (1, 2, 3)
+    b = (4, 5, 6)
+    c = (7, 8, 7)
+    d = (6, 5, 4)
+    e = (3, 2, 1)
+    f = (8, 5, 1)
+    g = (6, 4, 2)
+    h = (1, 8, 3)
+    self.paths = Trie()
+    self.paths.insert([a, b, h])
+    self.paths.insert([a, b, c, d, e, f, g, h])
+    self.paths.insert([a, b, c, d, e, b, a])
+    self.paths.insert([a, b, g, h])
+    self.paths.insert([a, e, f, g])
+    # The longest path is the only sequence reported as correct.
+    self.correct_sequence = misc.flatten([a, b, c, d, e, f, g, h])
+
+    def distance_fn(a, b):
+      # Sum of per-token distances over the overlapping part, plus a fixed
+      # penalty of 4 for every token by which the lengths differ.
+      len_diff = abs(len(a) - len(b))
+      return sum(reward_lib.mod_abs_diff(ai - 1, bi - 1, 8)
+                 for ai, bi in zip(a, b)) + len_diff * 4  # 8 / 2 = 4
+    self.distance_fn = distance_fn
+
+  def __call__(self, actions):
+    """Compute the reward for an action sequence.
+
+    Args:
+      actions: Sequence of integer tokens. Tokens <= 0 are dropped before
+          scoring.
+
+    Returns:
+      reward: One point per matched 3-token sub-sequence, adjusted by partial
+          credit or a penalty for the first unmatched sub-sequence.
+      correct: True only if the actions equal the target sequence.
+    """
+    actions = [a for a in actions if a > 0]
+    sequence = [tuple(actions[i: i + 3]) for i in xrange(0, len(actions), 3)]
+    prefix, complete = self.paths.prefix_match(sequence)
+    if complete:
+      return float(len(prefix)), actions == self.correct_sequence
+    if len(prefix) == len(sequence):
+      return float(len(prefix)), False
+    next_pred = sequence[len(prefix)]
+    choices = self.paths.next_choices(prefix)
+    # Compare as a list so this also holds when next_choices returns a dict
+    # view (Python 3), which is never equal to a list.
+    if list(choices) == [self.paths.EOS]:
+      # The path already ended; penalize each extra token by 1/3.
+      return (len(prefix) - len(next_pred) / 3.0), False
+    min_dist = min(self.distance_fn(c, next_pred) for c in choices)
+    # +1 reward for each element in the sequence correct, plus fraction towards
+    # closest next element.
+    # Maximum distance possible is num_actions * base / 2 = 3 * 8 / 2 = 12
+    return (len(prefix) + (1 - min_dist / 12.0)), False
--- a/research/brain_coder/single_task/test_tasks_test.py
+++ b/research/brain_coder/single_task/test_tasks_test.py
+"""Tests for test_tasks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+from single_task import misc  # brain coder
+from single_task import test_tasks  # brain coder
+
+
+def get_reward(reward_fn, candidate):
+  """Return the summed episode rewards `reward_fn` gives to `candidate`.
+
+  `candidate` is a token list; it is converted to a code string before being
+  passed to `reward_fn`.
+  """
+  code_string = misc.bf_tokens_to_string(candidate)
+  reward_info = reward_fn(code_string)
+  return sum(reward_info.episode_rewards)
+
+
+class TestTasksTest(tf.test.TestCase):
+  """Checks the rewards produced by the tasks in test_tasks."""
+
+  def testHillClimbingTask(self):
+    """Rewards for partial, complete and overshooting paths are as expected."""
+    task = test_tasks.BasicTaskManager(test_tasks.HillClimbingTask())
+    reward_fns = task.rl_batch(1)
+    reward_fn = reward_fns[0]
+    # Incomplete or wrong first sub-sequence: fractional reward below 1.
+    self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 0]), 8 / 12.))
+    self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 2, 0]), 11 / 12.))
+    # Each fully matched sub-sequence is worth one point.
+    self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 3, 0]), 1.0))
+    self.assertTrue(
+        np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 2, 0]), 1. + 8 / 12.))
+    self.assertTrue(
+        np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 0]), 2.0))
+    self.assertTrue(
+        np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 0]), 3.0))
+    self.assertTrue(
+        np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 0]), 3.0))
+    # Extra tokens after a finished (dead-end) path reduce the reward.
+    self.assertTrue(
+        np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 1, 0]),
+                   3.0 - 4 / 12.))
+    self.assertTrue(
+        np.isclose(
+            get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 1, 1, 1, 1, 0]),
+            2.0))
+    self.assertTrue(
+        np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 3, 0]),
+                   3.0 + 1 / 12.))
+    # The full target path of 8 sub-sequences earns the maximum reward.
+    self.assertTrue(
+        np.isclose(
+            get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1,
+                                   8, 5, 1, 6, 4, 2, 1, 8, 3, 0]),
+            8.0))
+    # Overshooting the target path is penalized as well.
+    self.assertTrue(
+        np.isclose(
+            get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1,
+                                   8, 5, 1, 6, 4, 2, 1, 8, 3, 1, 1, 0]),
+            8.0 - 8 / 12.))
+    self.assertTrue(
+        np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3,
+                                          2, 1, 8, 5, 1, 6, 4, 2, 1, 8, 3, 1, 1,
+                                          1, 1, 1, 1, 1, 0]),
+                   7.0))
+
+
+if __name__ == '__main__':
+  # Run the test cases in this file when it is executed as a script.
+  tf.test.main()
--- a/research/brain_coder/single_task/tune.py
+++ b/research/brain_coder/single_task/tune.py
+r"""Run grid search.
+
+Look at launch_tuning.sh for details on how to tune at scale.
+
+Usage example:
+Tune with one worker on the local machine.
+
+CONFIG="agent=c(algorithm='pg'),"
+CONFIG+="env=c(task_cycle=['reverse-tune', 'remove-tune'])"
+HPARAM_SPACE_TYPE="pg"
+OUT_DIR="/tmp/bf_pg_tune"
+MAX_NPE=5000000
+NUM_REPETITIONS=50
+rm -rf $OUT_DIR
+mkdir $OUT_DIR
+bazel run -c opt single_task:tune -- \
+    --alsologtostderr \
+    --config="$CONFIG" \
+    --max_npe="$MAX_NPE" \
+    --num_repetitions="$NUM_REPETITIONS" \
+    --logdir="$OUT_DIR" \
+    --summary_interval=1 \
+    --model_v=0 \
+    --hparam_space="$HPARAM_SPACE_TYPE" \
+    --tuner_id=0 \
+    --num_tuners=1 \
+    2>&1 >"$OUT_DIR/tuner_0.log"
+learning/brain/tensorboard/tensorboard.sh --port 12345 --logdir "$OUT_DIR"
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+import os
+
+from absl import app
+from absl import flags
+from absl import logging
+import numpy as np
+from six.moves import xrange
+import tensorflow as tf
+
+from single_task import defaults  # brain coder
+from single_task import run as run_lib  # brain coder
+
+# Flags specific to tuning. Other flags read in this file (e.g. task_id,
+# logdir, config, max_npe) are not defined here; presumably they come from an
+# imported module -- verify before running this file in isolation.
+FLAGS = flags.FLAGS
+flags.DEFINE_integer(
+    'tuner_id', 0,
+    'The unique ID for this tuning worker.')
+flags.DEFINE_integer(
+    'num_tuners', 1,
+    'How many tuners are there.')
+flags.DEFINE_string(
+    'hparam_space', 'default',
+    'String name which denotes the hparam space to tune over. This is '
+    'algorithm dependent.')
+flags.DEFINE_string(
+    'fixed_hparams', '',
+    'HParams string. Used to fix hparams during tuning.')
+flags.DEFINE_float(
+    'success_rate_objective_weight', 1.0,
+    'How much to weight success rate vs num programs seen. By default, only '
+    'success rate is optimized (this is the setting used in the paper).')
+
+
+def parse_hparams_string(hparams_str):
+  """Parse a comma-separated 'name=value' string into a dict.
+
+  Each value is evaluated with ast.literal_eval, so it must be a Python
+  literal (e.g. 0.1, 10, 'rmsprop'). Empty or whitespace-only terms are
+  skipped. Terms are split on commas first, so values which themselves
+  contain a comma are not supported.
+
+  Args:
+    hparams_str: String such as "lr=0.1,topk=10". May be empty.
+
+  Returns:
+    Dict mapping each hparam name to its parsed value.
+
+  Raises:
+    ValueError: If a non-empty term has no '=', or its value is not a literal.
+    SyntaxError: If a value is not valid Python syntax.
+  """
+  hparams = {}
+  for term in hparams_str.split(','):
+    term = term.strip()
+    if not term:
+      continue
+    # Split on the first '=' only, so string values may contain '='.
+    name, value = term.split('=', 1)
+    # Strip the value: literal_eval can reject leading whitespace.
+    hparams[name.strip()] = ast.literal_eval(value.strip())
+  return hparams
+
+
+def int_to_multibase(n, bases):
+  """Decompose `n` into mixed-radix digits, least significant digit first.
+
+  Args:
+    n: Integer to decompose.
+    bases: Radix of each digit position, in order.
+
+  Returns:
+    List with one digit per entry of `bases`. Whatever is left of `n` after
+    the last base has been divided out is dropped.
+  """
+  digits = []
+  remaining = n
+  for base in bases:
+    remaining, digit = divmod(remaining, base)
+    digits.append(digit)
+  return digits
+
+
+def hparams_for_index(index, tuning_space):
+  """Return the hparams at position `index` of the tuning grid.
+
+  Hparam names are visited in sorted order and `index` is decoded with
+  int_to_multibase, using the number of candidate values of each hparam as
+  that position's base.
+
+  Args:
+    index: Integer index of the grid point.
+    tuning_space: Dict mapping hparam name to a sequence of candidate values.
+
+  Returns:
+    A tf.contrib.training.HParams holding one value per hparam.
+  """
+  names = sorted(tuning_space.keys())
+  sizes = [len(tuning_space[name]) for name in names]
+  chosen = {}
+  for name, position in zip(names, int_to_multibase(index, sizes)):
+    chosen[name] = tuning_space[name][position]
+  return tf.contrib.training.HParams(**chosen)
+
+
+def trial_range_for_tuner(tuning_space_size, num_tuners, tuner_id):
+  """Return the contiguous block of trial indices owned by one tuner.
+
+  Trials are split as evenly as possible; the first
+  `tuning_space_size % num_tuners` tuners each take one extra trial.
+
+  Args:
+    tuning_space_size: Total number of trials.
+    num_tuners: Number of tuners sharing the trials. Must be > 0.
+    tuner_id: Index of this tuner, in [0, num_tuners).
+
+  Returns:
+    starting_trial_id: Global index of this tuner's first trial.
+    num_local_trials: Number of trials this tuner runs.
+  """
+  base_trials, remainder = divmod(tuning_space_size, num_tuners)
+  # Every earlier tuner ran `base_trials` trials, and the first `remainder`
+  # of them ran one more. The offset must use the un-incremented base count.
+  starting_trial_id = base_trials * tuner_id + min(remainder, tuner_id)
+  num_local_trials = base_trials + (1 if tuner_id < remainder else 0)
+  return starting_trial_id, num_local_trials
+
+
+def run_tuner_loop(ns):
+  """Run tuning loop for this worker.
+
+  The grid returned by `ns.define_tuner_hparam_space` (with any hparams from
+  the fixed_hparams flag pinned to a single value) is split into contiguous
+  index ranges, one per tuner. Every trial in this tuner's range is trained,
+  and the chief worker appends the trial's metrics to `tuning_results.txt` in
+  the trial directory.
+
+  Args:
+    ns: Algorithm namespace providing define_tuner_hparam_space,
+        write_hparams_to_config and run_training.
+  """
+  is_chief = FLAGS.task_id == 0
+  tuning_space = ns.define_tuner_hparam_space(
+      hparam_space_type=FLAGS.hparam_space)
+  fixed_hparams = parse_hparams_string(FLAGS.fixed_hparams)
+  # items() works on both Python 2 and 3; iteritems() is Python 2 only.
+  for name, value in fixed_hparams.items():
+    tuning_space[name] = [value]
+  tuning_space_size = np.prod([len(values) for values in tuning_space.values()])
+
+  starting_trial_id, num_local_trials = trial_range_for_tuner(
+      tuning_space_size, FLAGS.num_tuners, FLAGS.tuner_id)
+
+  logging.info('tuning_space_size: %d', tuning_space_size)
+  logging.info('num_local_trials: %d', num_local_trials)
+  logging.info('starting_trial_id: %d', starting_trial_id)
+
+  for local_trial_index in xrange(num_local_trials):
+    trial_config = defaults.default_config_with_updates(FLAGS.config)
+    global_trial_index = local_trial_index + starting_trial_id
+    trial_name = 'trial_' + str(global_trial_index)
+    trial_dir = os.path.join(FLAGS.logdir, trial_name)
+    hparams = hparams_for_index(global_trial_index, tuning_space)
+    ns.write_hparams_to_config(
+        trial_config, hparams, hparam_space_type=FLAGS.hparam_space)
+
+    results_list = ns.run_training(
+        config=trial_config, tuner=None, logdir=trial_dir, is_chief=is_chief,
+        trial_name=trial_name)
+
+    if not is_chief:
+      # Only chief worker needs to write tuning results to disk.
+      continue
+
+    objective, metrics = compute_tuning_objective(
+        results_list, hparams, trial_name, num_trials=tuning_space_size)
+    logging.info('metrics:\n%s', metrics)
+    logging.info('objective: %s', objective)
+    logging.info('programs_seen_fraction: %s',
+                 metrics['programs_seen_fraction'])
+    logging.info('success_rate: %s', metrics['success_rate'])
+    logging.info('success_rate_objective_weight: %s',
+                 FLAGS.success_rate_objective_weight)
+
+    tuning_results_file = os.path.join(trial_dir, 'tuning_results.txt')
+    with tf.gfile.FastGFile(tuning_results_file, 'a') as writer:
+      writer.write(str(metrics) + '\n')
+
+    logging.info('Trial %s complete.', trial_name)
+
+
+def compute_tuning_objective(results_list, hparams, trial_name, num_trials):
+  """Compute tuning objective and metrics given results and trial information.
+
+  Args:
+    results_list: List of results dicts read from disk. These are written by
+        workers. The keys read here are 'found_solution', 'npe',
+        'num_batches', 'task' and 'best_reward'.
+    hparams: tf.contrib.training.HParams instance containing the hparams used
+        in this trial (only the hparams which are being tuned).
+    trial_name: Name of this trial. Used to create a trial directory.
+    num_trials: Total number of trials that need to be run. This is saved in the
+        metrics dict for future reference.
+
+  Returns:
+    objective: The objective computed for this trial. Choose the hparams for the
+        trial with the largest objective value.
+    metrics: Information about this trial. A dict.
+
+  Raises:
+    ZeroDivisionError: If `results_list` is empty.
+  """
+  found_solution = [r['found_solution'] for r in results_list]
+  successful_program_counts = [
+      r['npe'] for r in results_list if r['found_solution']]
+
+  success_rate = sum(found_solution) / float(len(results_list))
+
+  max_programs = FLAGS.max_npe  # Per run.
+  # Runs without a solution are charged the full per-run budget.
+  all_program_counts = [
+      r['npe'] if r['found_solution'] else max_programs
+      for r in results_list]
+  programs_seen_fraction = (
+      float(sum(all_program_counts))
+      / (max_programs * len(all_program_counts)))
+
+  # min/max/avg stats are over successful runs.
+  # NOTE(review): 'avg_programs' is np.mean of an empty list (nan) when no run
+  # succeeded; the per-task stats below are guarded, this one is not.
+  metrics = {
+      'num_runs': len(results_list),
+      'num_succeeded': sum(found_solution),
+      'success_rate': success_rate,
+      'programs_seen_fraction': programs_seen_fraction,
+      'avg_programs': np.mean(successful_program_counts),
+      'max_possible_programs_per_run': max_programs,
+      'global_step': sum([r['num_batches'] for r in results_list]),
+      'hparams': hparams.values(),
+      'trial_name': trial_name,
+      'num_trials': num_trials}
+
+  # Report stats per tasks.
+  tasks = [r['task'] for r in results_list]
+  for task in set(tasks):
+    task_list = [r for r in results_list if r['task'] == task]
+    found_solution = [r['found_solution'] for r in task_list]
+    successful_rewards = [
+        r['best_reward'] for r in task_list
+        if r['found_solution']]
+    successful_num_batches = [
+        r['num_batches']
+        for r in task_list if r['found_solution']]
+    successful_program_counts = [
+        r['npe'] for r in task_list if r['found_solution']]
+    metrics_append = {
+        task + '__num_runs': len(task_list),
+        task + '__num_succeeded': sum(found_solution),
+        task + '__success_rate': (
+            sum(found_solution) / float(len(task_list)))}
+    metrics.update(metrics_append)
+    # min/max/avg are only reported for tasks with at least one success.
+    if any(found_solution):
+      # NOTE(review): '__avg_reward' stores the median, not the mean --
+      # confirm this is intended.
+      metrics_append = {
+          task + '__min_reward': min(successful_rewards),
+          task + '__max_reward': max(successful_rewards),
+          task + '__avg_reward': np.median(successful_rewards),
+          task + '__min_programs': min(successful_program_counts),
+          task + '__max_programs': max(successful_program_counts),
+          task + '__avg_programs': np.mean(successful_program_counts),
+          task + '__min_batches': min(successful_num_batches),
+          task + '__max_batches': max(successful_num_batches),
+          task + '__avg_batches': np.mean(successful_num_batches)}
+      metrics.update(metrics_append)
+
+  # Objective will be maximized.
+  # Maximize success rate, minimize num programs seen.
+  # Max objective is always 1.
+  weight = FLAGS.success_rate_objective_weight
+  objective = (
+      weight * success_rate
+      + (1 - weight) * (1 - programs_seen_fraction))
+  metrics['objective'] = objective
+
+  return objective, metrics
+
+
+def main(argv):
+  """Validate the worker and tuner flags, then run this worker's tuning loop.
+
+  Args:
+    argv: Positional command-line arguments. Unused.
+
+  Raises:
+    ValueError: If the logdir flag is empty, or if any of the num_workers,
+        task_id, num_tuners or tuner_id flags is out of range.
+  """
+  del argv
+
+  logging.set_verbosity(FLAGS.log_level)
+
+  def flag_violations():
+    # Lazily yields (violated, message) pairs, so each flag is only read once
+    # every earlier check has passed.
+    yield not FLAGS.logdir, 'logdir flag must be provided.'
+    yield FLAGS.num_workers <= 0, 'num_workers flag must be greater than 0.'
+    yield (FLAGS.task_id < 0,
+           'task_id flag must be greater than or equal to 0.')
+    yield (FLAGS.task_id >= FLAGS.num_workers,
+           'task_id flag must be strictly less than num_workers flag.')
+    yield FLAGS.num_tuners <= 0, 'num_tuners flag must be greater than 0.'
+    yield (FLAGS.tuner_id < 0,
+           'tuner_id flag must be greater than or equal to 0.')
+    yield (FLAGS.tuner_id >= FLAGS.num_tuners,
+           'tuner_id flag must be strictly less than num_tuners flag.')
+
+  for violated, message in flag_violations():
+    if violated:
+      raise ValueError(message)
+
+  ns, _ = run_lib.get_namespace(FLAGS.config)
+  run_tuner_loop(ns)
+
+
+if __name__ == '__main__':
+  # Hand main() to app.run when this file is executed as a script.
+  app.run(main)
--- a/research/cognitive_mapping_and_planning/scripts/script_env_vis.py
+++ b/research/cognitive_mapping_and_planning/scripts/script_env_vis.py
@@ -30,8 +30,8 @@ from tensorflow.python.platform import flags
 import datasets.nav_env_config as nec
 import datasets.nav_env as nav_env
 import cv2
-from datasets import factory 
-import render.swiftshader_renderer as renderer 
+from datasets import factory
+import render.swiftshader_renderer as renderer

 SwiftshaderRenderer = renderer.SwiftshaderRenderer
 VisualNavigationEnv = nav_env.VisualNavigationEnv
@@ -53,10 +53,10 @@ def get_args():
  navtask.camera_param.width = sz
  navtask.task_params.img_height = sz
  navtask.task_params.img_width = sz
-  
+
  # navtask.task_params.semantic_task.class_map_names = ['chair', 'door', 'table']
  # navtask.task_params.type = 'to_nearest_obj_acc'
-  
+
  logging.info('navtask: %s', navtask)
  return navtask

@@ -90,12 +90,12 @@ def walk_through(b):

  root = tk.Tk()
  image = b.render_nodes(b.task.nodes[[current_node],:])[0]
-  print image.shape
+  print(image.shape)
  image = image.astype(np.uint8)
  im = Image.fromarray(image)
  im = ImageTk.PhotoImage(im)
  panel = tk.Label(root, image=im)
- 
+
  map_size = b.traversible.shape
  sc = np.max(map_size)/256.
  loc = np.array([[map_size[1]/2., map_size[0]/2.]])
@@ -128,15 +128,15 @@ def walk_through(b):
    global current_node
    current_node = b.take_action([current_node], [3], 1)[0][0]
    refresh()
-  
+
  def right_key(event):
    global current_node
    current_node = b.take_action([current_node], [1], 1)[0][0]
    refresh()

  def quit(event):
-    root.destroy() 
-  
+    root.destroy()
+
  panel_overhead.grid(row=4, column=5, rowspan=1, columnspan=1,
                      sticky=tk.W+tk.E+tk.N+tk.S)
  panel.bind('<Left>', left_key)
@@ -150,19 +150,19 @@ def walk_through(b):

 def simple_window():
  root = tk.Tk()
-  
+
  image = np.zeros((128, 128, 3), dtype=np.uint8)
  image[32:96, 32:96, 0] = 255
  im = Image.fromarray(image)
  im = ImageTk.PhotoImage(im)
-  
+
  image = np.zeros((128, 128, 3), dtype=np.uint8)
  image[32:96, 32:96, 1] = 255
  im2 = Image.fromarray(image)
  im2 = ImageTk.PhotoImage(im2)
-  
+
  panel = tk.Label(root, image=im)
-  
+
  def left_key(event):
    panel.configure(image=im2)
    panel.image = im2
@@ -176,7 +176,7 @@ def simple_window():
  panel.bind('q', quit)
  panel.focus_set()
  panel.pack(side = "bottom", fill = "both", expand = "yes")
-  root.mainloop() 
+  root.mainloop()

 def main(_):
  b = load_building(FLAGS.dataset_name, FLAGS.building_name)

--- a/research/cognitive_mapping_and_planning/scripts/script_plot_trajectory.py
+++ b/research/cognitive_mapping_and_planning/scripts/script_plot_trajectory.py
@@ -17,7 +17,7 @@ r"""
 Code for plotting trajectories in the top view, and also plot first person views
 from saved trajectories. Does not run the network but only loads the mesh data
 to plot the view points.
-  CUDA_VISIBLE_DEVICES=0 LD_LIBRARY_PATH=/opt/cuda-8.0/lib64:/opt/cudnnv51/lib64 
+  CUDA_VISIBLE_DEVICES=0 LD_LIBRARY_PATH=/opt/cuda-8.0/lib64:/opt/cudnnv51/lib64
  PYTHONPATH='.' PYOPENGL_PLATFORM=egl python scripts/script_plot_trajectory.py \
      --first_person --num_steps 40 \
      --config_name cmp.lmap_Msc.clip5.sbpd_d_r2r \
@@ -36,13 +36,13 @@ from tensorflow.contrib import slim
 import cv2
 import logging
 from tensorflow.python.platform import gfile
-from tensorflow.python.platform import app 
-from tensorflow.python.platform import flags 
+from tensorflow.python.platform import app
+from tensorflow.python.platform import flags

 from datasets import nav_env
-import scripts.script_nav_agent_release as sna 
+import scripts.script_nav_agent_release as sna
 import src.file_utils as fu
-from src import graph_utils 
+from src import graph_utils
 from src import utils
 FLAGS = flags.FLAGS

@@ -95,7 +95,7 @@ def _compute_hardness():
    # Initialize the agent.
    init_env_state = e.reset(rng_data)

-    gt_dist_to_goal = [e.episode.dist_to_goal[0][j][s] 
+    gt_dist_to_goal = [e.episode.dist_to_goal[0][j][s]
                       for j, s in enumerate(e.episode.start_node_ids)]

    for j in range(args.navtask.task_params.batch_size):
@@ -120,15 +120,15 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
  out_dir = os.path.join(out_dir, FLAGS.config_name+_get_suffix_str(),
                         FLAGS.imset)
  fu.makedirs(out_dir)
-  
+
  # Load the model so that we can render.
  plt.set_cmap('gray')
  samples_per_action = 8; wait_at_action = 0;
-  
+
  Writer = animation.writers['mencoder']
-  writer = Writer(fps=3*(samples_per_action+wait_at_action), 
+  writer = Writer(fps=3*(samples_per_action+wait_at_action),
                  metadata=dict(artist='anonymous'), bitrate=1800)
-  
+
  args = sna.get_args_for_config(FLAGS.config_name + '+bench_'+FLAGS.imset)
  args.navtask.logdir = None
  navtask_ = copy.deepcopy(args.navtask)
@@ -142,10 +142,10 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
  R = lambda: nav_env.get_multiplexer_class(navtask_, 0)
  R = R()
  b = R.buildings[0]
-  
+
  f = [0 for _ in range(wait_at_action)] + \
      [float(_)/samples_per_action for _ in range(samples_per_action)];
-  
+
  # Generate things for it to render.
  inds_to_do = []
  inds_to_do += [1, 4, 10] #1291, 1268, 1273, 1289, 1302, 1426, 1413, 1449, 1399, 1390]
@@ -163,7 +163,7 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
    # axes = [ax]
    for ax in axes:
      ax.set_axis_off()
-    
+
    node_ids = dt['all_node_ids'][i, :, 0]*1
    # Prune so that last node is not repeated more than 3 times?
    if np.all(node_ids[-4:] == node_ids[-1]):
@@ -185,7 +185,7 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
    node_ids_all = np.reshape(node_ids_all[:-1,:], -1)
    perturbs_all = np.reshape(perturbs_all, [-1, 4])
    imgs = b.render_nodes(b.task.nodes[node_ids_all,:], perturb=perturbs_all)
-  
+
    # Get action at each node.
    actions = []
    _, action_to_nodes = b.get_feasible_actions(node_ids)
@@ -193,7 +193,7 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
      action_to_node = action_to_nodes[j]
      node_to_action = dict(zip(action_to_node.values(), action_to_node.keys()))
      actions.append(node_to_action[node_ids[j+1]])
-    
+
    def init_fn():
      return fig,
    gt_dist_to_goal = []
@@ -205,8 +205,8 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
      img = imgs[j]; ax = axes[0]; ax.clear(); ax.set_axis_off();
      img = img.astype(np.uint8); ax.imshow(img);
      tt = ax.set_title(
-          "First Person View\n" + 
-          "Top corners show diagnostics (distance, agents' action) not input to agent.", 
+          "First Person View\n" +
+          "Top corners show diagnostics (distance, agents' action) not input to agent.",
          fontsize=12)
      plt.setp(tt, color='white')

@@ -218,9 +218,9 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
          fontsize=20, color='red',
          transform=ax.transAxes, alpha=1.0)
      t.set_bbox(dict(color='white', alpha=0.85, pad=-0.1))
-      
+
      # Action to take.
-      action_latex = ['$\odot$ ', '$\curvearrowright$ ', '$\curvearrowleft$ ', '$\Uparrow$ ']
+      action_latex = [r'$\odot$ ', r'$\curvearrowright$ ', r'$\curvearrowleft$ ', r'$\Uparrow$ ']
      t = ax.text(0.99, 0.99, action_latex[actions[step_number]],
          horizontalalignment='right',
          verticalalignment='top',
@@ -256,7 +256,7 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
      locs = np.expand_dims(locs, axis=0)
      ax.plot(locs[:,0], locs[:,1], 'r.', alpha=1.0, linewidth=0, markersize=4)
      tt = ax.set_title('Trajectory in topview', fontsize=14)
-      plt.setp(tt, color='white') 
+      plt.setp(tt, color='white')
      return fig,

    line_ani = animation.FuncAnimation(fig, worker,
@@ -265,7 +265,7 @@ def plot_trajectory_first_person(dt, orig_maps, out_dir):
    tmp_file_name = 'tmp.mp4'
    line_ani.save(tmp_file_name, writer=writer, savefig_kwargs={'facecolor':'black'})
    out_file_name = os.path.join(out_dir, 'vis_{:04d}.mp4'.format(i))
-    print out_file_name
+    print(out_file_name)

    if fu.exists(out_file_name):
      gfile.Remove(out_file_name)
@@ -280,12 +280,12 @@ def plot_trajectory(dt, hardness, orig_maps, out_dir):
  out_file = os.path.join(out_dir, 'all_locs_at_t.pkl')
  dt['hardness'] = hardness
  utils.save_variables(out_file, dt.values(), dt.keys(), overwrite=True)
-  
+
  #Plot trajectories onto the maps
  plt.set_cmap('gray')
  for i in range(4000):
    goal_loc = dt['all_goal_locs'][i, :, :]
-    locs = np.concatenate((dt['all_locs'][i,:,:], 
+    locs = np.concatenate((dt['all_locs'][i,:,:],
                           dt['all_locs'][i,:,:]), axis=0)
    xymin = np.minimum(np.min(goal_loc, axis=0), np.min(locs, axis=0))
    xymax = np.maximum(np.max(goal_loc, axis=0), np.max(locs, axis=0))
@@ -305,35 +305,35 @@ def plot_trajectory(dt, hardness, orig_maps, out_dir):
    uniq = np.array(uniq)
    all_locs = all_locs[uniq, :]

-    ax.plot(dt['all_locs'][i, 0, 0], 
+    ax.plot(dt['all_locs'][i, 0, 0],
            dt['all_locs'][i, 0, 1], 'b.', markersize=24)
-    ax.plot(dt['all_goal_locs'][i, 0, 0], 
+    ax.plot(dt['all_goal_locs'][i, 0, 0],
            dt['all_goal_locs'][i, 0, 1], 'g*', markersize=19)
    ax.plot(all_locs[:,0], all_locs[:,1], 'r', alpha=0.4, linewidth=2)
    ax.scatter(all_locs[:,0], all_locs[:,1],
-               c=5+np.arange(all_locs.shape[0])*1./all_locs.shape[0], 
+               c=5+np.arange(all_locs.shape[0])*1./all_locs.shape[0],
               cmap='Reds', s=30, linewidth=0)
    ax.imshow(orig_maps, origin='lower', vmin=-1.0, vmax=2.0, aspect='equal')
    ax.set_xlim([xy1[0], xy2[0]])
    ax.set_ylim([xy1[1], xy2[1]])
-    
+
    file_name = os.path.join(out_dir, 'trajectory_{:04d}.png'.format(i))
-    print file_name
-    with fu.fopen(file_name, 'w') as f: 
+    print(file_name)
+    with fu.fopen(file_name, 'w') as f:
      plt.savefig(f)
    plt.close(fig)
-  
+

 def main(_):
  a = _load_trajectory()
  h_dists, gt_dists, orig_maps = _compute_hardness()
  hardness = 1.-h_dists*1./ gt_dists
-  
+
  if FLAGS.top_view:
    plot_trajectory(a, hardness, orig_maps, out_dir=FLAGS.out_dir)

  if FLAGS.first_person:
    plot_trajectory_first_person(a, orig_maps, out_dir=FLAGS.out_dir)
-  
+
 if __name__ == '__main__':
  app.run()
--- a/research/cognitive_mapping_and_planning/src/utils.py
+++ b/research/cognitive_mapping_and_planning/src/utils.py
@@ -17,6 +17,7 @@ r"""Generaly Utilities.
 """

 import numpy as np, cPickle, os, time
+from six.moves import xrange
 import src.file_utils as fu
 import logging

@@ -93,12 +94,12 @@ def tic_toc_print(interval, string):
  global tic_toc_print_time_old
  if 'tic_toc_print_time_old' not in globals():
    tic_toc_print_time_old = time.time()
-    print string
+    print(string)
  else:
    new_time = time.time()
    if new_time - tic_toc_print_time_old > interval:
      tic_toc_print_time_old = new_time;
-      print string
+      print(string)

 def mkdir_if_missing(output_dir):
  if not fu.exists(output_dir):
@@ -126,7 +127,7 @@ def load_variables(pickle_file_name):
 def voc_ap(rec, prec):
  rec = rec.reshape((-1,1))
  prec = prec.reshape((-1,1))
-  z = np.zeros((1,1)) 
+  z = np.zeros((1,1))
  o = np.ones((1,1))
  mrec = np.vstack((z, rec, o))
  mpre = np.vstack((z, prec, z))
@@ -165,4 +166,3 @@ def calc_pr(gt, out, wt=None):

  ap = voc_ap(rec, prec)
  return ap, rec, prec
-
--- a/research/compression/entropy_coder/core/entropy_coder_single.py
+++ b/research/compression/entropy_coder/core/entropy_coder_single.py
@@ -58,7 +58,7 @@ def main(_):
  #iteration = FLAGS.iteration

  if not tf.gfile.Exists(FLAGS.input_codes):
-    print '\nInput codes not found.\n'
+    print('\nInput codes not found.\n')
    return

  with tf.gfile.FastGFile(FLAGS.input_codes, 'rb') as code_file:

--- a/research/compression/entropy_coder/core/entropy_coder_train.py
+++ b/research/compression/entropy_coder/core/entropy_coder_train.py
@@ -171,7 +171,7 @@ def train():
              'code_length': model.average_code_length
          }
          np_tensors = sess.run(tf_tensors, feed_dict=feed_dict)
-          print np_tensors['code_length']
+          print(np_tensors['code_length'])

      sv.Stop()


--- a/research/compression/entropy_coder/dataset/gen_synthetic_dataset.py
+++ b/research/compression/entropy_coder/dataset/gen_synthetic_dataset.py
@@ -18,6 +18,7 @@
 import os

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import synthetic_model

--- a/research/compression/entropy_coder/dataset/synthetic_model.py
+++ b/research/compression/entropy_coder/dataset/synthetic_model.py
@@ -16,6 +16,7 @@
 """Binary code sample generator."""

 import numpy as np
+from six.moves import xrange


 _CRC_LINE = [

--- a/research/compression/entropy_coder/lib/block_util.py
+++ b/research/compression/entropy_coder/lib/block_util.py
@@ -21,6 +21,7 @@ from __future__ import unicode_literals
 import math

 import numpy as np
+import six
 import tensorflow as tf


@@ -39,7 +40,7 @@ class RsqrtInitializer(object):
        1.0 / sqrt(product(shape[dims]))
      **kwargs: Extra keyword arguments to pass to tf.truncated_normal.
    """
-    if isinstance(dims, (int, long)):
+    if isinstance(dims, six.integer_types):
      self._dims = [dims]
    else:
      self._dims = dims
@@ -73,7 +74,7 @@ class RectifierInitializer(object):
        sqrt(scale / product(shape[dims])).
      **kwargs: Extra keyword arguments to pass to tf.truncated_normal.
    """
-    if isinstance(dims, (int, long)):
+    if isinstance(dims, six.integer_types):
      self._dims = [dims]
    else:
      self._dims = dims

--- a/research/compression/entropy_coder/lib/blocks_masked_conv2d.py
+++ b/research/compression/entropy_coder/lib/blocks_masked_conv2d.py
@@ -16,6 +16,7 @@
 """Define some typical masked 2D convolutions."""

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import block_util

--- a/research/compression/entropy_coder/lib/blocks_masked_conv2d_test.py
+++ b/research/compression/entropy_coder/lib/blocks_masked_conv2d_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import unicode_literals

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import blocks_masked_conv2d