Merge pull request #3126 from danabo/master

Open source release of Brain Coder.

Merge pull request #3126 from danabo/master
Open source release of Brain Coder.
61822dab · Lukasz Kaiser · GitHub · 54babf62 · a00f7e2b · 61822dab
Unverified Commit 61822dab authored Jan 08, 2018 by Lukasz Kaiser Committed by GitHub Jan 08, 2018
20 changed files
--- a/research/brain_coder/single_task/aggregate_tuning_results.py
+++ b/research/brain_coder/single_task/aggregate_tuning_results.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+r"""After running tuning, use this script to aggregate the results.
+Usage:
+OUT_DIR="<my_tuning_dir>"
+bazel run -c opt single_task:aggregate_tuning_results -- \
+    --alsologtostderr \
+    --tuning_dir="$OUT_DIR"
+"""
+import ast
+import os
+from absl import app
+from absl import flags
+import tensorflow as tf
+FLAGS = flags.FLAGS
+flags.DEFINE_string(
+    'tuning_dir', '',
+    'Absolute path where results tuning trial folders are found.')
+def main(argv):
+  del argv  # Unused.
+  try:
+    trial_dirs = tf.gfile.ListDirectory(FLAGS.tuning_dir)
+  except tf.errors.NotFoundError:
+    print('Tuning directory %s does not exist.' % (FLAGS.tuning_dir,))
+    return
+  metrics = []
+  for trial_dir in trial_dirs:
+    tuning_results_file = os.path.join(
+        FLAGS.tuning_dir, trial_dir, 'tuning_results.txt')
+    if tf.gfile.Exists(tuning_results_file):
+      with tf.gfile.FastGFile(tuning_results_file, 'r') as reader:
+        for line in reader:
+          metrics.append(ast.literal_eval(line.replace(': nan,', ': 0.0,')))
+  if not metrics:
+    print('No trials found.')
+    return
+  num_trials = [m['num_trials'] for m in metrics]
+  assert all(n == num_trials[0] for n in num_trials)
+  num_trials = num_trials[0]
+  print('Found %d completed trials out of %d' % (len(metrics), num_trials))
+  # Sort by objective descending.
+  sorted_trials = sorted(metrics, key=lambda m: -m['objective'])
+  for i, metrics in enumerate(sorted_trials):
+    hparams = metrics['hparams']
+    keys = sorted(hparams.keys())
+    print(
+        str(i).ljust(4) + ': '
+        + '{0:.2f}'.format(metrics['objective']).ljust(10)
+        + '['
+        + ','.join(['{}={}'.format(k, hparams[k]).ljust(24) for k in keys])
+        + ']')
+if __name__ == '__main__':
+  app.run(main)
--- a/research/brain_coder/single_task/code_tasks.py
+++ b/research/brain_coder/single_task/code_tasks.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tasks for RL."""
+import abc
+import copy
+import itertools
+import random
+from absl import logging
+import numpy as np
+from common import bf  # brain coder
+from common import reward as r  # brain coder
+from single_task import misc  # brain coder
+from single_task import test_tasks  # brain coder
+MAX_EXECUTION_STEPS = 5000
+def make_task(task_name, override_kwargs=None, max_code_length=100,
+              require_correct_syntax=False,
+              do_code_simplification=False,
+              correct_bonus=2.0, code_length_bonus=1.0):
+  """Make tasks with setting from paper."""
+  logging.info('Making paper-config task.')
+  n = 16  # Number of test cases.
+  task_mapping = {
+      'print-hello': (
+          PrintTask, dict(base=27, fixed_string=[8, 5, 12, 12, 15])),
+      'print': (PrintIntTask, dict(base=256, fixed_string=[1, 2, 3, 4, 5])),
+      'echo': (EchoTask, dict(base=27, min_length=1, max_length=6)),
+      'remove-char': (
+          RemoveCharTask, dict(base=256, n=n, min_len=1, max_len=6)),
+      'reverse': (
+          ReverseTask, dict(base=256, n=n, min_len=1, max_len=6)),
+      'reverse-tune': (
+          ReverseTaskV2, dict(base=256, reward_type='static-bylen')),
+      'remove-char-tune': (RemoveCharTaskV2, dict(base=27)),
+      'prefix': (CommonPrefixTask, dict(base=27)),
+      'find': (FindSubStrTask, dict(base=27)),
+      'sort3': (SortFixedTaskV2, dict(base=27, n=150, length=3)),
+      'count-char': (CountCharTaskV2, dict(n=n, max_len=6)),
+      'bool-logic': (BooleanLogicTask, dict()),
+      'add': (AddTask, dict(n=9)),
+      'echo-twice': (EchoTwiceTask, dict(n=n)),
+      'echo-thrice': (EchoThriceTask, dict(n=n)),
+      'copy-reverse': (CopyReverseTask, dict(n=n)),
+      'zero-cascade': (EchoZeroCascadeTask, dict(n=n)),
+      'cascade': (EchoCascadeTask, dict(n=n)),
+      'shift-left': (ShiftLeftTask, dict(n=n)),
+      'shift-right': (ShiftRightTask, dict(n=n)),
+      'riffle': (RiffleTask, dict(n=n)),
+      'unriffle': (UnriffleTask, dict(n=n)),
+      'middle-char': (MiddleCharTask, dict(n=n)),
+      'remove-last': (RemoveLastTask, dict(n=n)),
+      'remove-last-two': (RemoveLastTwoTask, dict(n=n)),
+      'echo-alternating': (EchoAlternatingTask, dict(n=n)),
+      'echo-half': (EchoHalfTask, dict(n=n)),
+      'length': (LengthTask, dict(n=n)),
+      'echo-second-seq': (EchoSecondSequenceTask, dict(n=n)),
+      'echo-nth-seq': (EchoNthSequenceTask, dict(n=n)),
+      'substring': (SubstringTask, dict(n=n)),
+      'divide-2': (Divide2Task, dict(n=n)),
+      'dedup': (DedupTask, dict(n=n)),
+      'remove-target-char': (RemoveTargetCharTask, dict(n=n)),
+      'list-index': (ListIndexTask, dict(n=n)),
+      'fib': (FibonacciTask, dict()),
+      'count-down': (BottlesOfBeerTask, dict()),
+      'split': (SplitTask, dict()),
+      'trim-left': (TrimLeftTask, dict()),
+      'circle-route': (
+          JudgeRouteCircleTask, dict(n=100, max_len=32)),
+      'multiply': (MultiplyTask, dict(n=100)),
+      'divmod': (DivModTask, dict(n=100)),
+  }
+  if task_name not in task_mapping:
+    # Test tasks.
+    if task_name == 'test-hill-climb':
+      return test_tasks.BasicTaskManager(test_tasks.HillClimbingTask())
+    raise ValueError('Unknown task type "%s"' % task_name)
+  task_cls, kwargs = task_mapping[task_name]
+  if override_kwargs:
+    if not isinstance(override_kwargs, dict):
+      raise ValueError(
+          'override_kwargs must be a dict, got: %s', override_kwargs)
+    kwargs.update(override_kwargs)
+  task = task_cls(**kwargs)
+  reward_fn = r.absolute_distance_reward
+  # reward_fn = r.absolute_mod_distance_reward
+  # reward_fn = r.absolute_log_distance_reward
+  logging.info('Using reward function: %s', reward_fn.__name__)
+  # We want reward with and without code simplification to be scaled the same
+  # way. Without code simplification, give the maximum code length bonus
+  # every time.
+  min_code_length = 0.0 if do_code_simplification else max_code_length
+  return MultiIOTaskManager(
+      task=task, correct_bonus=correct_bonus,
+      code_length_bonus=code_length_bonus,
+      max_code_length=max_code_length, min_code_length=min_code_length,
+      reward_fn=reward_fn, require_correct_syntax=require_correct_syntax)
+def concat(lists):
+  if not lists:
+    return []
+  l = lists[0]
+  for k in lists[1:]:
+    l += k
+  return l
+def concat_join(lists, sep):
+  if not lists:
+    return []
+  l = lists[0]
+  for k in lists[1:]:
+    l += [sep] + k
+  return l
+def clipped_linear(x, x0, y0, slope, y_range):
+  min_y, max_y = y_range
+  return min(max(slope * (x - x0) + y0, min_y), max_y)
+class MultiIOTaskManager(object):
+  """Supports tasks which test the code with multiple I/O examples."""
+  def __init__(self, task, max_code_length=32, min_code_length=0,
+               max_execution_steps=MAX_EXECUTION_STEPS, correct_bonus=1.0,
+               code_length_bonus=1.0, failure_reward=-2.0, reward_fn=None,
+               require_correct_syntax=False):
+    assert isinstance(task, BaseTask)
+    self.task = task
+    self.max_code_length = max_code_length
+    self.min_code_length = min_code_length
+    self.max_execution_steps = max_execution_steps
+    self.require_correct_syntax = require_correct_syntax
+    self.correct_bonus = correct_bonus
+    self.code_length_bonus = code_length_bonus
+    self.failure_reward = failure_reward
+    self.time_penalty = (
+        1.0 / (max_code_length - min_code_length)
+        if max_code_length > min_code_length else 0.0)
+    if reward_fn is None:
+      self.reward_fn = r.absolute_distance_reward
+    else:
+      self.reward_fn = reward_fn
+    self.input_type = (
+        task.input_type if hasattr(task, 'input_type') else misc.IOType.integer)
+    self.output_type = (
+        task.output_type if hasattr(task, 'output_type')
+        else misc.IOType.integer)
+    self._compute_best_reward()
+  def _compute_best_reward(self):
+    io_seqs = self.task.make_io_set()
+    reward = 0.0
+    for _, output_seq in io_seqs:
+      reward += self.reward_fn(output_seq, output_seq, self.task.base)
+      reward += self.correct_bonus
+      reward += self.code_length_bonus  # Bonus for shortest code.
+    self.best_reward = reward
+    self.good_reward = 0.75 * reward
+    logging.info('Known best reward: %.4f', self.best_reward)
+  def _score_batch(self, code_strings):
+    return [self._score_code(code) for code in code_strings]
+  def _score_code(self, code):
+    """Run test cases on code and compute reward.
+    Args:
+      code: A single BF code string.
+    Returns:
+      misc.RewardInfo namedtuple instance containing reward and code execution
+          information, including inputs, expected outputs, code outputs, input
+          and output types, and reason for the reward obtained.
+    """
+    # Get list of 2-tuples, each containing an input sequence and an output
+    # sequence.
+    io_seqs = self.task.make_io_set()
+    terminal_reward = 0.0
+    results = []
+    reason = 'correct'
+    for input_seq, output_seq in io_seqs:
+      eval_result = bf.evaluate(
+          code, input_buffer=input_seq, timeout=0.1,
+          max_steps=self.max_execution_steps,
+          base=self.task.base,
+          require_correct_syntax=self.require_correct_syntax)
+      result, success = eval_result.output, eval_result.success
+      if not success:
+        # Code execution timed out.
+        terminal_reward = self.failure_reward
+        results = []
+        reason = eval_result.failure_reason
+        break
+      else:
+        terminal_reward += self.reward_fn(result, output_seq, self.task.base)
+        if result == output_seq:
+          terminal_reward += self.correct_bonus  # Bonus for correct answer.
+          # Only add additional reward for shorter code. Subtracting reward
+          # interferes with the main objective. Only optimize for length once
+          # any solution is found.
+          if self.min_code_length == self.max_code_length:
+            terminal_reward += self.code_length_bonus
+          else:
+            terminal_reward += self.code_length_bonus * clipped_linear(
+                x=len(code), x0=self.min_code_length, y0=1.0,
+                slope=-self.time_penalty, y_range=(0.0, 1.0))
+          # reason remains 'correct' if it is already
+        elif reason == 'correct':
+          reason = 'wrong'
+      results.append(result)
+    # Return list of rewards, one for each char in the code. All are 0 except
+    # for the terminal reward.
+    terminal_reward /= self.best_reward
+    return misc.RewardInfo(
+        episode_rewards=[0.0] * (len(code) - 1) + [terminal_reward],
+        input_case=misc.IOTuple(i for i, o in io_seqs),
+        correct_output=misc.IOTuple(o for i, o in io_seqs),
+        code_output=misc.IOTuple(results),
+        input_type=self.input_type,
+        output_type=self.output_type,
+        reason=reason)
+  def rl_batch(self, batch_size):
+    """Produces list of reward functions. One for each program in the batch."""
+    return [self._score_code] * batch_size
+def conditional_overwrite(current_value, new_value, allowed_overwrite_values):
+  if current_value in allowed_overwrite_values:
+    return new_value
+  return current_value
+class BaseTask(object):
+  """A coding task.
+  All coding tasks should inherit this class.
+  """
+  __metaclass__ = abc.ABCMeta
+  def __init__(self, base=256):
+    self.base = base  # All tasks must set the integer base that the expect.
+  @abc.abstractmethod
+  def make_io_set(self):
+    """Generate a set of test cases for the task.
+    Returns:
+      List of tuples, where each tuple is (input_case, output_case).
+      input_case and output_case are lists of integers.
+    """
+    pass
+# ==============================================================================
+# ICLR tasks.
+# ==============================================================================
+class PrintTask(BaseTask):
+  """Print string coding task.
+  Code needs to output a fixed string (given as a hyperparameter to the
+  task constructor). Program input is ignored.
+  """
+  def __init__(self, base, fixed_string=None):
+    super(type(self), self).__init__()
+    self.base = base  # base includes EOS
+    self.eos = 0
+    if fixed_string:
+      self.fixed_string = fixed_string
+    else:
+      self.fixed_string = [1, 2, 3, 0]  # ABC<EOS>
+    self.min_length = self.max_length = len(self.fixed_string)
+  def make_io_set(self):
+    return [(list(), list(self.fixed_string))]
+class RemoveCharTaskV2(BaseTask):
+  """Remove character coding task (version 2).
+  Code needs to pipe input to output, but with all the 'A' (value 1) chars
+  removed. 'A' appears exactly once in each input.
+  Test cases are hard-coded.
+  """
+  def __init__(self, base):
+    super(type(self), self).__init__()
+    self.base = base
+    self.eos = 0
+    self.remove_char = 1
+    assert base >= 27
+  def make_io_set(self):
+    rm = self.remove_char
+    return [
+        ([rm, 0], [0]),
+        ([20, rm, 0], [20, 0]),
+        ([rm, 13, 0], [13, 0]),
+        ([6, rm, 17, 0], [6, 17, 0]),
+        ([rm, 11, 24, 0], [11, 24, 0]),
+        ([2, 16, 21, rm, 0], [2, 16, 21, 0]),
+        ([18, rm, 12, 26, 7, 0], [18, 12, 26, 7, 0]),
+        ([9, 10, 22, rm, 4, 0], [9, 10, 22, 4, 0])]
+class RemoveCharTask(BaseTask):
+  """Remove character coding task.
+  Code needs to pipe input to output, but with all the 'A' (value 1) chars
+  removed. 'A' appears at least once in each input.
+  Test cases are dynamically generated, allowing for the number of test cases
+  to be a hyperparameter.
+  """
+  def __init__(self, base, n, min_len, max_len):
+    super(type(self), self).__init__()
+    self.base = base
+    self.eos = 0
+    self.remove_char = 1
+    assert base >= 27
+    self._io_pairs = self._make_io_examples(n, min_len, max_len)
+  def _make_io_examples(self, n, min_len, max_len):
+    """Generate test cases for the task."""
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = []
+    for _ in xrange(n):
+      length = rand.randrange(min_len, max_len + 1)
+      rm_char_pos = rand.randrange(0, length)
+      input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
+      input_seq[rm_char_pos] = self.remove_char
+      output_seq = list(input_seq)
+      del output_seq[rm_char_pos]
+      output_seq.append(0)
+      io_examples.append((input_seq, output_seq))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class ReverseTaskV2(BaseTask):
+  """Reverse string coding task (version 2).
+  Code needs to pipe input to output, but in reverse order.
+  Stochastic test case = new test case randomly generated for every run of
+  `make_io_set`, i.e. different test cases every time code is scored.
+  Task supports different types of test cases:
+    rand-one: Code is scored on one stochastic test case.
+    rand-many: Code is scored on 5 stochastic test cases.
+    static-bylen: Code is scored on 5 static test cases. There is one test
+        case for string lengths 1 through 5.
+    rand-bylen: Code is scored on 5 stochastic test cases, where there is one
+        test case for string lengths 1 through 5.
+  """
+  def __init__(self, base, reward_type):
+    super(type(self), self).__init__()
+    self.base = base  # base includes EOS
+    assert base >= 27
+    self.eos = 0
+    self.io_pair_fn = {
+        # One random example at a time.
+        'rand-one': lambda: self._io_rand(1),
+        # K randomy examples at a time (any lengths).
+        'rand-many': lambda: self._io_rand(5),
+        # Static examples, one for each length.
+        'static-bylen': self._io_static_by_len,
+        # Random examples, one for each length.
+        'rand-bylen': self._io_rand_by_len}[reward_type]
+  def _make_io_examples(self, sequences):
+    outputs = [list(i) for i in sequences]
+    for o in outputs:
+      o.reverse()
+      o.append(0)
+    inputs = [i + [0] for i in sequences]
+    return zip(inputs, outputs)
+  def _io_rand(self, k):
+    inputs = [(np.random.choice(26, random.randrange(1, 6)) + 1).tolist()
+              for _ in xrange(k)]
+    return self._make_io_examples(inputs)
+  def _io_rand_by_len(self, k=5):
+    inputs = [(np.random.choice(26, length) + 1).tolist()
+              for length in xrange(1, k + 1)]
+    return self._make_io_examples(inputs)
+  def _io_static_by_len(self):
+    return [
+        ([7, 0], [7, 0]),
+        ([6, 2, 0], [2, 6, 0]),
+        ([5, 1, 10, 0], [10, 1, 5, 0]),
+        ([8, 6, 5, 15, 0], [15, 5, 6, 8, 0]),
+        ([10, 12, 5, 2, 7, 0], [7, 2, 5, 12, 10, 0])]
+  def make_io_set(self):
+    return self.io_pair_fn()
+class ReverseTask(BaseTask):
+  """Reverse string coding task.
+  Code needs to pipe input to output, but in reverse order.
+  Test cases are dynamically generated, allowing for the number of test cases
+  to be a hyperparameter.
+  """
+  def __init__(self, base, n, min_len, max_len):
+    super(type(self), self).__init__()
+    self.base = base  # base includes EOS
+    assert base >= 27
+    self.eos = 0
+    self._io_pairs = self._make_io_examples(n, min_len, max_len)
+  def _make_io_examples(self, n, min_len, max_len):
+    """Generate test cases for the task."""
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = []
+    for _ in xrange(n):
+      length = rand.randrange(min_len, max_len + 1)
+      input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
+      output_seq = list(input_seq)
+      output_seq.reverse()
+      output_seq.append(0)
+      io_examples.append((input_seq, output_seq))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class CommonPrefixTask(BaseTask):
+  """Common prefix coding task.
+  Code needs to output the common prefix between two input lists. Input lists
+  are variable length, where each list ends with a 0. A common prefix is a
+  sequence which both lists start with.
+  """
+  def __init__(self, base):
+    super(type(self), self).__init__()
+    assert base >= 27
+    self.base = base
+    self.eos = 0
+  def make_io_set(self):
+    return [
+        ([12, 24, 18, 0, 12, 5, 0], [12, 0]),
+        ([1, 2, 3, 0, 1, 2, 17, 14, 0], [1, 2, 0]),
+        ([15, 2, 1, 9, 2, 0, 15, 2, 1, 25, 8, 14, 0], [15, 2, 1, 0]),
+        ([14, 9, 7, 8, 6, 16, 0, 14, 9, 7, 8, 8, 6, 8, 26, 0],
+         [14, 9, 7, 8, 0]),
+        ([12, 4, 16, 22, 1, 17, 0, 12, 4, 16, 22, 1, 8, 10, 0],
+         [12, 4, 16, 22, 1, 0])]
+class CountCharTask(BaseTask):
+  def __init__(self):
+    super(type(self), self).__init__()
+    self.base = 27
+    self.eos = 0
+    self.char = 1
+    self.input_type = misc.IOType.string
+    self.output_type = misc.IOType.integer
+  def make_io_set(self):
+    return [
+        ([10, 0], [0]),
+        ([1, 0], [1]),
+        ([1, 1, 0], [2]),
+        ([11, 1, 0], [1]),
+        ([1, 24, 0], [1]),
+        ([13, 6, 0], [0]),
+        ([9, 2, 7, 0], [0]),
+        ([1, 24, 11, 0], [1]),
+        ([19, 1, 1, 0], [2]),
+        ([1, 6, 1, 0], [2]),
+        ([22, 16, 17, 9, 0], [0]),
+        ([1, 1, 1, 19, 0], [3]),
+        ([1, 1, 1, 1, 0], [4]),
+        ([9, 4, 19, 11, 5, 0], [0]),
+        ([24, 11, 26, 1, 15, 0], [1]),
+        ([1, 1, 20, 1, 1, 0], [4]),
+        ([1, 1, 1, 1, 1, 0], [5])]
+class CountCharTaskV2(BaseTask):
+  """Count char coding task (version 2).
+  Code must output the number of occurances of character 'A' (value 1) in an
+  input string.
+  Test cases are dynamically generated, allowing for the number of test cases
+  to be a hyperparameter.
+  """
+  def __init__(self, n, max_len):
+    super(type(self), self).__init__()
+    self.base = 27
+    self.eos = 0
+    self.char = 1
+    self.other_chars = [c for c in xrange(self.base)
+                        if c not in (self.eos, self.char)]
+    self.input_type = misc.IOType.string
+    self.output_type = misc.IOType.integer
+    self._io_pairs = self._make_io_examples(n, max_len)
+  def _make_io_examples(self, n, max_len):
+    """Generate test cases for the task."""
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = []
+    io_examples.append(([10, 0], [0]))
+    io_examples.append(([1, 0], [1]))
+    io_examples.append(([1, 1, 0], [2]))
+    io_examples.append(([9, 4, 19, 11, 5, 0], [0]))
+    io_examples.append(([24, 11, 26, 1, 15, 0], [1]))
+    for _ in xrange(n - 5):
+      length = rand.randrange(2, max_len + 1)
+      num_chars = rand.randrange(0, max_len + 1)
+      input_seq = [self.char] * num_chars + [0] * (length - num_chars)
+      rand.shuffle(input_seq)
+      for i in xrange(len(input_seq)):
+        if not input_seq[i]:
+          input_seq[i] = self.other_chars[rand.randrange(len(self.other_chars))]
+      output_seq = [num_chars]
+      io_examples.append((input_seq, output_seq))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class AddTask(BaseTask):
+  """Addition coding task.
+  Code needs to read in two integers and output their sum mod the BF base,
+  followed by a terminating 0.
+  """
+  def __init__(self, n=16):
+    super(type(self), self).__init__()
+    self.base = 256
+    self.input_type = misc.IOType.integer
+    self.output_type = misc.IOType.integer
+    self._io_pairs = self._make_io_examples(n)
+  def _make_io_examples(self, n):
+    """Generate test cases for the task."""
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = [
+        ([4, 0], [4, 0]),
+        ([0, 5], [5, 0]),
+        ([1, 2], [3, 0]),
+        ([67, 21], [88, 0]),
+        ([55, 56], [111, 0]),
+        ([128, 33], [161, 0]),
+        ([221, 251], [216, 0]),
+        ([130, 127], [1, 0]),
+        ([255, 1], [0, 0])]
+    extra_examples = max(n - len(io_examples), 0)
+    for _ in xrange(extra_examples):
+      a = rand.randrange(256)
+      b = rand.randrange(256)
+      input_seq = [a, b]
+      output_seq = [(a + b) % 256, 0]
+      io_examples.append((input_seq, output_seq))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class BooleanLogicTask(BaseTask):
+  """Boolean logic (truth table) coding task.
+  Code needs to memorize a boolean truth table. Specifically, it must encode a
+  mapping from triple of bools to a single bool.
+  """
+  def __init__(self):
+    super(type(self), self).__init__()
+    self.base = 2
+    self.input_type = misc.IOType.boolean
+    self.output_type = misc.IOType.boolean
+    # X(~Z) + (~Y)(~Z) + (~X)YZ
+    self._truth_fn = (
+        lambda x, y, z:  # pylint: disable=g-long-lambda
+        (x and not z) or (not y and not z) or (not x and y and z))
+    self._test_cases = [
+        ([x, y, z], [int(self._truth_fn(x, y, z))])
+        for x, y, z in itertools.product(range(2), range(2), range(2))]
+  def make_io_set(self):
+    return copy.deepcopy(self._test_cases)
+# ------------------------------------------------------------------------------
+# The following tasks are generated from known BF solutions. This guarantees
+# that each task can be solved within the maximum code length, and maximum
+# execution steps.
+# ------------------------------------------------------------------------------
+def default_input_fn_factory(min_length=1, max_length=6, base=256):
+  def _input_gen(rand):
+    l = rand.randrange(min_length, max_length + 1)
+    return [rand.randrange(base) for _ in xrange(l)]
+  return _input_gen
+class KnownCodeBaseTask(BaseTask):
+  """These tasks generate their test cases from a known BF solution.
+  This ensures that each task has a solution which is under the max character
+  length, and that it solves the test cases under the max number of execution
+  steps.
+  """
+  def __init__(self, code_solution, make_input_fn, n=100, base=256,
+               max_steps=5000, seed=6849275409234):
+    super(KnownCodeBaseTask, self).__init__()
+    # Make sure known solution is less than the code length used in experiments.
+    assert len(code_solution) < 100
+    self.code_solution = code_solution
+    self.make_input_fn = make_input_fn
+    self.n = n
+    self.base = base
+    self.max_steps = max_steps
+    self.seed = seed
+    self._test_cases = list(self._test_case_generator(code_solution))
+  def _test_case_generator(self, code_solution):
+    rand = random.Random(self.seed)
+    for _ in xrange(self.n):
+      input_case = self.make_input_fn(rand)
+      result = bf.evaluate(
+          code_solution, input_buffer=input_case, max_steps=self.max_steps,
+          base=self.base, require_correct_syntax=False)
+      if not result.success:
+        raise RuntimeError(
+            'Program must succeed. Failed on input: %s' % input_case)
+      yield input_case, result.output
+  def make_io_set(self):
+    return copy.deepcopy(self._test_cases)
+class EchoTwiceTask(KnownCodeBaseTask):
+  """Echo twice."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,.[>,.]<[<]>[.>].',
+        default_input_fn_factory(),
+        **kwargs)
+class EchoThriceTask(KnownCodeBaseTask):
+  """Echo three times."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,.[>,.]<[<]>[.>].<[<]>[.>].',
+        default_input_fn_factory(),
+        **kwargs)
+class CopyReverseTask(KnownCodeBaseTask):
+  """Echo forwards, backwards, and then forwards again."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,.[>,.]<[.<].>[.>].',
+        default_input_fn_factory(),
+        **kwargs)
+class EchoZeroCascadeTask(KnownCodeBaseTask):
+  """Print k-th char with k zeros inbetween (1-indexed)."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        ',[.>[->+>.<<]>+[-<+>]<<,]',
+        default_input_fn_factory(),
+        **kwargs)
+class EchoCascadeTask(KnownCodeBaseTask):
+  """Print k-th char k times (1-indexed)."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        ',>>+<<[>>[-<+>]<[->+<<.>]>+<<,].',
+        default_input_fn_factory(base=20),
+        **kwargs)
+class ShiftLeftTask(KnownCodeBaseTask):
+  """Circulate shift input left."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        ',>,[.,]<.,.',
+        default_input_fn_factory(),
+        **kwargs)
+class ShiftRightTask(KnownCodeBaseTask):
+  """Circular shift input right."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,[>,]<.[-]<[<]>[.>].',
+        default_input_fn_factory(),
+        **kwargs)
+class RiffleTask(KnownCodeBaseTask):
+  """Shuffle like a deck of cards.
+  For input of length N, output values in the following index order:
+  N-1, 0, N-2, 1, N-3, 2, ...
+  """
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,[>,]<[.[-]<[<]>.[-]>[>]<]',
+        default_input_fn_factory(base=20, max_length=8),
+        **kwargs)
+class UnriffleTask(KnownCodeBaseTask):
+  """Inverse of riffle."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,[>,[.[-]],]<[.<].',
+        default_input_fn_factory(base=20, max_length=8),
+        **kwargs)
+class MiddleCharTask(KnownCodeBaseTask):
+  """Print middle char if length is odd, or 0 if even."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,[>,]<<[[>]<[,<[<]>,>[>]][>]<<]>.',
+        default_input_fn_factory(max_length=10),
+        **kwargs)
+class RemoveLastTask(KnownCodeBaseTask):
+  """Remove last character."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        ',>,[[<.[-]>[-<+>]],].',
+        default_input_fn_factory(base=20),
+        **kwargs)
+class RemoveLastTwoTask(KnownCodeBaseTask):
+  """Remove last two characters."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        ',>,>,[[<<.[-]>[-<+>]>[-<+>]],].',
+        default_input_fn_factory(base=10),
+        **kwargs)
+class EchoAlternatingTask(KnownCodeBaseTask):
+  # Print even numbered chars first (0-indexed), then odd numbered chars
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>,[.,>,]<<[<]>[.>].',
+        default_input_fn_factory(base=20, max_length=8),
+        **kwargs)
+class EchoHalfTask(KnownCodeBaseTask):
+  """Echo only first half of the input (round down when odd lengthed)."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>>+>,[[<]>+[>],]<[<]>-[-[-<<+>]<[>]>]<<[->+<]>[[>]>.,<+[<]>-].',
+        default_input_fn_factory(base=20, max_length=9),
+        **kwargs)
+class LengthTask(KnownCodeBaseTask):
+  """Print length of the input sequence."""
+  def __init__(self, **kwargs):
+    super(type(self), self).__init__(
+        '>+>,[[<]>+[>],]<[<]>-.',
+        default_input_fn_factory(max_length=14),
+        **kwargs)
+class EchoSecondSequenceTask(KnownCodeBaseTask):
+  """Echo second sequence. Sequences are separated by 0."""
+  def __init__(self, **kwargs):
+    def echo_second_gen(rand):
+      l = rand.randrange(1, 6)
+      x = [rand.randrange(256) for _ in xrange(l)]
+      l = rand.randrange(1, 6)
+      y = [rand.randrange(256) for _ in xrange(l)]
+      return x + [0] + y + [0]
+    super(type(self), self).__init__(
+        ',[,],[.,].',
+        echo_second_gen,
+        **kwargs)
+class EchoNthSequenceTask(KnownCodeBaseTask):
+  """Echo n-th sequence (1-indexed). Sequences are separated by 0."""
+  def __init__(self, **kwargs):
+    def echo_nth_gen(rand):
+      k = rand.randrange(1, 7)
+      n = rand.randrange(1, k + 1)
+      x = []
+      for _ in xrange(k):
+        l = rand.randrange(0, 4)
+        x += [rand.randrange(256) for _ in xrange(l)] + [0]
+      return [n] + x
+    super(type(self), self).__init__(
+        ',-[->,[,]<],[.,].',
+        echo_nth_gen,
+        **kwargs)
+class SubstringTask(KnownCodeBaseTask):
+  """Echo substring.
+  First two inputs are i and l, where i is the starting index (0-indexed)
+  and l is the length of the substring.
+  """
+  def __init__(self, **kwargs):
+    def substring_gen(rand):
+      l = rand.randrange(2, 16)
+      i, j = sorted([rand.randrange(l), rand.randrange(l)])
+      n = j - i
+      x = [rand.randrange(256) for _ in xrange(l)] + [0]
+      return [i, n] + x
+    super(type(self), self).__init__(
+        '>,<,>[->,<]>,<<[->>.,<<]',
+        substring_gen,
+        **kwargs)
+class Divide2Task(KnownCodeBaseTask):
+  """Divide by 2 (integer floor division)."""
+  def __init__(self, **kwargs):
+    def int_input_gen(rand):
+      return [rand.randrange(256)]
+    super(type(self), self).__init__(
+        ',[-[->>+<]>[<]<]>>.',
+        int_input_gen,
+        **kwargs)
+class DedupTask(KnownCodeBaseTask):
+  """Deduplicate adjacent duplicate chars."""
+  def __init__(self, **kwargs):
+    def dedup_input_gen(rand):
+      np_random = np.random.RandomState(rand.randrange(2147483647))
+      num_unique = rand.randrange(1, 5)
+      unique = np_random.choice(6, num_unique, replace=False) + 1
+      return [v for v in unique for _ in xrange(rand.randrange(1, 5))] + [0]
+    super(type(self), self).__init__(
+        '>>,.[[-<+<+>>],[-<->]<[[-<->]<.>]<[->>+<<]>>]',
+        dedup_input_gen,
+        **kwargs)
+# ==============================================================================
+# Extra tasks.
+# ==============================================================================
+class PrintIntTask(BaseTask):
+  """Print integer coding task.
+  Code needs to output a fixed single value (given as a hyperparameter to the
+  task constructor). Program input is ignored.
+  """
+  def __init__(self, base, fixed_string):
+    super(type(self), self).__init__()
+    self.base = base
+    self.eos = 0
+    self.fixed_string = fixed_string
+    self.input_type = misc.IOType.integer
+    self.output_type = misc.IOType.integer
+  def make_io_set(self):
+    return [(list(), list(self.fixed_string))]
+class EchoTask(BaseTask):
+  """Echo string coding task.
+  Code needs to pipe input to putput (without any modifications).
+  """
+  def __init__(self, base, min_length=1, max_length=5):
+    super(type(self), self).__init__()
+    self.base = base  # base includes EOS
+    self.eos = 0
+    self.min_length = min_length
+    self.max_length = max_length
+    self._io_pairs = self._make_io_examples(25)
+  def _make_io_examples(self, n):
+    # Test cases are fixed, but varied.
+    np_random = np.random.RandomState(1234567890)
+    io_pairs = []
+    for _ in xrange(n):
+      length = np_random.randint(self.min_length, self.max_length + 1)
+      input_seq = np_random.randint(1, self.base, length).tolist() + [self.eos]
+      output_seq = list(input_seq)
+      io_pairs.append((input_seq, output_seq))
+    return io_pairs
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class JudgeRouteCircleTask(BaseTask):
+  """Judge route circle coding task.
+  Code needs to determine if the given route makes a closed loop.
+  Encoding: U = 1, R = 2, D = 3, L = 4.
+  Based on
+  https://leetcode.com/problems/judge-route-circle/description/
+  """
+  base = 256
+  input_type = misc.IOType.integer
+  output_type = misc.IOType.integer
+  def __init__(self, n, max_len=12):
+    super(type(self), self).__init__()
+    self.eos = 0
+    self._io_pairs = self._make_io_examples(n, max_len)
+    self.input_type = misc.IOType.integer
+    self.output_type = misc.IOType.integer
+  def _solve(self, input_seq):
+    assert input_seq[-1] == 0
+    pos = [0, 0]  # (x, y)
+    for move in input_seq[:-1]:
+      assert 0 < move <= 4
+      if move & 1 == 0:  # Left or Right.
+        pos[0] += 3 - move  # Add or subtract 1.
+      else:
+        pos[1] += 2 - move  # Add or subtract 1.
+    return [int(not pos[0] and not pos[1])]
+  def _make_io_examples(self, n, max_len):
+    """Generate test cases for the task."""
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = []
+    io_examples.append(([0], [1]))
+    io_examples.append(([4, 2, 0], [1]))
+    io_examples.append(([2, 4, 0], [1]))
+    io_examples.append(([3, 1, 0], [1]))
+    io_examples.append(([1, 3, 0], [1]))
+    io_examples.append(([1, 0], [0]))
+    io_examples.append(([2, 0], [0]))
+    io_examples.append(([3, 0], [0]))
+    io_examples.append(([4, 0], [0]))
+    for _ in xrange(n):
+      is_true = rand.randrange(2)
+      length = rand.randrange(1, max_len + 1)
+      if is_true:
+        # Make a true case.
+        length = (length >> 1) << 1  # Make even.
+        partition = (rand.randrange(length + 1) >> 1) << 1
+        a = partition >> 1
+        b = (length - partition) >> 1
+        counts = {1: a, 2: b, 3: a, 4: b}
+      else:
+        # Make a false case.
+        partitions = (
+            [0]
+            + sorted([rand.randrange(length + 1) for _ in range(3)])
+            + [length])
+        counts = {n: partitions[n] - partitions[n - 1] for n in range(1, 5)}
+        if counts[1] == counts[3] and counts[2] == counts[4]:
+          # By chance we sampled a true case. Make it false by exchanging
+          # one count between even and odd pairs.
+          base = 1 + 2 * rand.randrange(2)
+          a, b = (base, base + 1) if rand.randrange(2) else (base + 1, base)
+          if counts[a] == length or counts[b] == 0:
+            # If counts are at their extreme values, then swap who gets
+            # incremented and decremented.
+            a, b = b, a
+          counts[a] += 1
+          counts[b] -= 1
+          assert counts[a] <= length and counts[b] >= 0
+      assert sum(counts.values()) == length
+      input_seq = [n for n in xrange(1, 5) for _ in xrange(counts[n])]
+      rand.shuffle(input_seq)
+      input_seq += [0]
+      output_seq = self._solve(input_seq)
+      assert output_seq[0] == is_true
+      io_examples.append((input_seq, output_seq))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class MultiplyTask(BaseTask):
+  """Multiply coding task.
+  Code needs to multiple two ints.
+  Solution:
+  http://robl.co/brief-look-at-brainfuck/
+  ,>,><<[->[->+>+<<]>>[-<<+>>]<<<]>>.
+  """
+  base = 512
+  input_type = misc.IOType.integer
+  output_type = misc.IOType.integer
+  def __init__(self, n):
+    super(type(self), self).__init__()
+    self.eos = 0
+    self._io_pairs = self._make_io_examples(n)
+    self.input_type = misc.IOType.integer
+    self.output_type = misc.IOType.integer
+  def _factors(self, n):
+    return set(i for i in range(1, int(n**0.5) + 1) if n % i == 0)
+  def _make_io_examples(self, n):
+    """Generate test cases for the task."""
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = []
+    for _ in xrange(n):
+      n = rand.randrange(self.base)
+      if n == 0:
+        a, b = 0, rand.randrange(self.base)
+      else:
+        f = list(self._factors(n))
+        a = f[rand.randrange(len(f))]
+        b = n // a
+      if rand.randrange(2):
+        a, b = b, a
+      io_examples.append(([a, b], [n]))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class DivModTask(BaseTask):
+  """Divmod coding task.
+  Code needs to take the quotient and remainder of two ints.
+  Solution:
+  http://robl.co/brief-look-at-brainfuck/
+  ,>,><<[>[->+>+<<]>[-<<-[>]>>>[<[-<->]<[>]>>[[-]>>+<]>-<]<<]>>>+<<[-<<+>>]<<<]>
+  >>>>[-<<<<<+>>>>>]<<<<<.>.>
+  """
+  base = 512
+  input_type = misc.IOType.integer
+  output_type = misc.IOType.integer
+  def __init__(self, n):
+    super(type(self), self).__init__()
+    self.eos = 0
+    self._io_pairs = self._make_io_examples(n)
+    self.input_type = misc.IOType.integer
+    self.output_type = misc.IOType.integer
+  def _make_io_examples(self, n):
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = []
+    for _ in xrange(n):
+      n = rand.randrange(0, self.base)
+      k = rand.randrange(1, self.base)  # Divisor cannot be 0.
+      io_examples.append(([n, k], list(divmod(n, k))))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class FibonacciTask(BaseTask):
+  def __init__(self):
+    super(type(self), self).__init__()
+    self.base = 256
+    self.input_type = misc.IOType.integer
+    self.output_type = misc.IOType.integer
+  def make_io_set(self):
+    return [
+        ([0], [0, 1]),
+        ([1], [1, 1]),
+        ([2], [1, 2]),
+        ([3], [2, 3]),
+        ([4], [3, 5]),
+        ([5], [5, 8]),
+        ([6], [8, 13]),
+        ([7], [13, 21]),
+        ([8], [21, 34]),
+        ([9], [34, 55]),
+        ([10], [55, 89]),
+        ([11], [89, 144]),
+        ([12], [144, 233]),
+        ([13], [233, 121])]
+class FindSubStrTask(BaseTask):
+  """Find sub-string coding task.
+  Code needs to output a bool: True if the input string contains a hard-coded
+  substring, 'AB' (values [1, 2]).
+  """
+  def __init__(self, base):
+    super(type(self), self).__init__()
+    assert base >= 27
+    self.base = base
+    self.eos = 0
+    self.find_str = [1, 2]
+    self.input_type = misc.IOType.string
+    self.output_type = misc.IOType.boolean
+  def make_io_set(self):
+    return [
+        ([1, 1, 23, 0], [0]),
+        ([21, 3, 2, 0], [0]),
+        ([2, 1, 19, 0], [0]),
+        ([2, 24, 15, 3, 0], [0]),
+        ([24, 6, 10, 16, 4, 0], [0]),
+        ([1, 2, 12, 0], [1]),
+        ([7, 1, 2, 0], [1]),
+        ([1, 2, 11, 3, 0], [1]),
+        ([1, 1, 2, 18, 0], [1]),
+        ([7, 25, 1, 2, 0], [1]),
+        ([3, 1, 2, 11, 8, 0], [1]),
+        ([15, 16, 20, 1, 2, 0], [1])]
+class SortFixedTask(BaseTask):
+  """Sort list coding task.
+  Code needs to output a sorted input list. The task consists of lists of the
+  same length L, where L is provided to this task's constructor as a
+  hyperparameter.
+  """
+  def __init__(self, base, length=3):
+    super(type(self), self).__init__()
+    assert base >= 27
+    self.base = base
+    self.eos = 0
+    self.length = length
+    assert length == 3  # More lengths will be supported.
+  def make_io_set(self):
+    if self.length == 3:
+      return [
+          ([1, 20, 6], [1, 6, 20]),
+          ([13, 6, 7], [6, 7, 13]),
+          ([24, 2, 23], [2, 23, 24]),
+          ([16, 12, 3], [3, 12, 16]),
+          ([11, 24, 4], [4, 11, 24]),
+          ([10, 1, 19], [1, 10, 19])]
+class SortFixedTaskV2(BaseTask):
+  """Sort list coding task (version 2).
+  Code needs to output a sorted input list. The task consists of lists of the
+  same length L, where L is provided to this task's constructor as a
+  hyperparameter.
+  Test cases are dynamically generated, allowing for the number of test cases
+  to be a hyperparameter.
+  """
+  def __init__(self, base, n, length=3):
+    super(type(self), self).__init__()
+    assert base >= 27
+    self.base = base
+    self.eos = 0
+    self._io_pairs = self._make_io_examples(n, length)
+    self.input_type = misc.IOType.integer
+    self.output_type = misc.IOType.integer
+  def _make_io_examples(self, n, length):
+    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
+    io_examples = []
+    for _ in xrange(n):
+      input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
+      output_seq = sorted(input_seq)
+      io_examples.append((input_seq, output_seq))
+    return io_examples
+  def make_io_set(self):
+    return copy.deepcopy(self._io_pairs)
+class RemoveTargetCharTask(KnownCodeBaseTask):
+  """Remove target character from string, where first input is the target.
+  Target can appear multiple times.
+  """
+  def __init__(self, **kwargs):
+    def randrange_hole(rand, a, hole, b):
+      x = rand.randrange(a, b - 1)
+      if x >= hole:
+        return x + 1
+      return x
+    def remove_target_char_gen(rand):
+      char = rand.randrange(1, 6)
+      l = rand.randrange(1, 8)
+      input_seq = [randrange_hole(rand, 1, char, 256) for _ in xrange(l)]
+      idx = range(l)
+      rand.shuffle(idx)
+      num_targets = rand.randrange(0, l)
+      for pos in idx[:num_targets]:
+        input_seq[pos] = char
+      return [char] + input_seq + [0]
+    super(type(self), self).__init__(
+        ',>>>,[<<<[->+>+<<]>>[->->+<<]>[>[-<+>]<.[-]]>[-]<<<[-<+>]>>,].',
+        remove_target_char_gen,
+        **kwargs)
+class ListIndexTask(KnownCodeBaseTask):
+  """Echo i-th value in the given list."""
+  def __init__(self, **kwargs):
+    def array_index_gen(rand):
+      l = rand.randrange(1, 16)
+      i = rand.randrange(l)
+      return [i] + [rand.randrange(256) for _ in xrange(l)] + [0]
+    super(type(self), self).__init__(
+        ',[->,<]>,.',
+        array_index_gen,
+        **kwargs)
+# ==============================================================================
+# Tasks based on primaryobjects paper.
+# ==============================================================================
+def string2tokens(string):
+  return [ord(c) for c in string]
+def stringlist2tokens(strings):
+  return [string2tokens(string) for string in strings]
+def string2tokens_b27(string):
+  return [ord(c.lower()) - ord('a') + 1 for c in string]
+def stringlist2tokens_b27(strings):
+  return [string2tokens_b27(string) for string in strings]
+class BottlesOfBeerTask(BaseTask):
+  """Bottles of beer coding task.
+  This is a counting task. Code needs to read in an int N and then output
+  every int from N to 0, each separated by a 0.
+  """
+  base = 256
+  input_type = misc.IOType.integer
+  output_type = misc.IOType.integer
+  def make_io_set(self):
+    return [
+        ([1], [1, 0]),
+        ([2], [2, 0, 1, 0]),
+        ([3], [3, 0, 2, 0, 1, 0]),
+        ([4], [4, 0, 3, 0, 2, 0, 1, 0]),
+        ([5], [5, 0, 4, 0, 3, 0, 2, 0, 1, 0]),
+        ([6], [6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0])]
+class SplitTask(BaseTask):
+  """Split coding task.
+  Code needs to pipe input strings to output, but insert a 0 after every 3
+  characters. This is in essence splitting the string into intervals of length
+  3.
+  """
+  base = 28
+  input_type = misc.IOType.string
+  output_type = misc.IOType.integer
+  def _splicer(self, lst, insert, interval=3):
+    for i, item in enumerate(lst):
+      yield item
+      if (i + 1) % interval == 0 and i < len(lst) - 1:
+        yield insert
+  def __init__(self):
+    super(type(self), self).__init__()
+    inputs = stringlist2tokens_b27(
+        ['hello', 'orange', 'spaghetti', 'wins', 'one'])
+    targets = [list(self._splicer(i, 27)) for i in inputs]
+    self._test_cases = list(zip(inputs, targets))
+  def make_io_set(self):
+    return copy.deepcopy(self._test_cases)
+class TrimLeftTask(BaseTask):
+  """Trim left coding task.
+  Code needs to pipe input strings to output, but remove everything before the
+  first quotation char (").
+  """
+  base = 256
+  input_type = misc.IOType.integer
+  output_type = misc.IOType.integer
+  def __init__(self):
+    super(type(self), self).__init__()
+    inputs = stringlist2tokens(
+        ['a "inside" over', 'xy "test" rights', 'ca6 "foresting" service',
+         'abc"def"yz.', 'A"B"'])
+    targets = stringlist2tokens(
+        ['"inside" over', '"test" rights', '"foresting" service', '"def"yz.',
+         '"B"'])
+    self._test_cases = list(zip(inputs, targets))
+  def make_io_set(self):
+    return copy.deepcopy(self._test_cases)
--- a/research/brain_coder/single_task/code_tasks_test.py
+++ b/research/brain_coder/single_task/code_tasks_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for code_tasks."""
+import numpy as np
+import tensorflow as tf
+from single_task import code_tasks  # brain coder
+from single_task import defaults  # brain coder
+def pad(string, pad_length, pad_char):
+  return string + pad_char * (pad_length - len(string))
+class CodeTasksTest(tf.test.TestCase):
+  def assertClose(self, a, b):
+    self.assertTrue(
+        np.isclose(a, b, atol=1e-4),
+        'Expecting approximately equal values. Got: %s, %s' % (a, b))
+  def testMultiIOTaskManager(self):
+    maxlen = 100
+    padchr = '['
+    task = code_tasks.make_paper_task(
+        'print', timestep_limit=maxlen, do_code_simplification=False)
+    reward_fns = task.rl_batch(1)
+    r = reward_fns[0]
+    self.assertClose(
+        r(pad('++++++++.---.+++++++...', maxlen, padchr)).episode_rewards[-1],
+        0.2444)
+    self.assertClose(
+        r(pad('++++++++.---.+++++++..+++.',
+              maxlen, padchr)).episode_rewards[-1],
+        1.0)
+    task = code_tasks.make_paper_task(
+        'print', timestep_limit=maxlen, do_code_simplification=True)
+    reward_fns = task.rl_batch(1)
+    r = reward_fns[0]
+    self.assertClose(
+        r('++++++++.---.+++++++...').episode_rewards[-1],
+        0.2444)
+    self.assertClose(
+        r('++++++++.---.+++++++..+++.').episode_rewards[-1],
+        0.935)
+    self.assertClose(
+        r(pad('++++++++.---.+++++++..+++.',
+              maxlen, padchr)).episode_rewards[-1],
+        0.75)
+    task = code_tasks.make_paper_task(
+        'reverse', timestep_limit=maxlen, do_code_simplification=False)
+    reward_fns = task.rl_batch(1)
+    r = reward_fns[0]
+    self.assertClose(
+        r(pad('>,>,>,.<.<.<.', maxlen, padchr)).episode_rewards[-1],
+        0.1345)
+    self.assertClose(
+        r(pad(',[>,]+[,<.]', maxlen, padchr)).episode_rewards[-1],
+        1.0)
+    task = code_tasks.make_paper_task(
+        'reverse', timestep_limit=maxlen, do_code_simplification=True)
+    reward_fns = task.rl_batch(1)
+    r = reward_fns[0]
+    self.assertClose(r('>,>,>,.<.<.<.').episode_rewards[-1], 0.1324)
+    self.assertClose(r(',[>,]+[,<.]').episode_rewards[-1], 0.9725)
+    self.assertClose(
+        r(pad(',[>,]+[,<.]', maxlen, padchr)).episode_rewards[-1],
+        0.75)
+  def testMakeTask(self):
+    maxlen = 100
+    padchr = '['
+    config = defaults.default_config_with_updates(
+        'env=c(config_for_iclr=False,fixed_string=[8,5,12,12,15])')
+    task = code_tasks.make_task(config.env, 'print', timestep_limit=maxlen)
+    reward_fns = task.rl_batch(1)
+    r = reward_fns[0]
+    self.assertClose(
+        r('++++++++.---.+++++++...').episode_rewards[-1],
+        0.2444)
+    self.assertClose(
+        r('++++++++.---.+++++++..+++.').episode_rewards[-1],
+        0.935)
+    self.assertClose(
+        r(pad('++++++++.---.+++++++..+++.',
+              maxlen, padchr)).episode_rewards[-1],
+        0.75)
+  def testKnownCodeBaseTask(self):
+    maxlen = 100
+    padchr = '['
+    task = code_tasks.make_paper_task(
+        'shift-left', timestep_limit=maxlen, do_code_simplification=False)
+    reward_fns = task.rl_batch(1)
+    r = reward_fns[0]
+    self.assertClose(
+        r(pad(',>,[.,]<.,.', maxlen, padchr)).episode_rewards[-1],
+        1.0)
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/data.py
+++ b/research/brain_coder/single_task/data.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Manage data for pretraining and RL tasks."""
+import ast
+from collections import namedtuple
+from absl import logging
+from single_task import code_tasks  # brain coder
+RLBatch = namedtuple('RLBatch', ['reward_fns', 'batch_size', 'good_reward'])
+class DataManager(object):
+  """Interface between environment and model."""
+  def __init__(self, global_config, run_number=None,
+               do_code_simplification=False):
+    """Constructs a DataManager.
+    Args:
+      global_config: A config_lib.Config instance containing all config. See
+          config in defaults.py.
+      run_number: Which run this is (of the same experiment). This should be set
+          when a task cycle is defined in the config. A task cycle is a list of
+          tasks to cycle through repeatedly, and the selected task is a function
+          of the run number, i.e. 0-th run, 1-st run, 2-nd run, etc...
+          This can be None if only a single task is set in the config.
+      do_code_simplification: When global_config.env.config_for_iclr is True,
+          use this option to create code simplification (code golf) tasks, vs
+          fixed length coding tasks. If True, a task with code simplification
+          reward will be constructed.
+    Raises:
+      ValueError: If global_config.env.task and global_config.env.task_cycle
+          are both set, or both not set. Only one should be given.
+      ValueError: If global_config.env.task_cycle is set but run_number is None.
+    """
+    env_config = global_config.env
+    self.batch_size = global_config.batch_size
+    if env_config.task_cycle:
+      if env_config.task:
+        raise ValueError('Do not set both `task` and `task_cycle`.')
+      if run_number is None:
+        raise ValueError('Do not use task_cycle for single-run experiment.')
+      index = run_number % len(env_config.task_cycle)
+      self.task_name = env_config.task_cycle[index]
+      logging.info('run_number: %d,  task_cycle index: %d', run_number, index)
+      logging.info('task_cycle: %s', env_config.task_cycle)
+    elif env_config.task:
+      self.task_name = env_config.task
+    else:
+      raise ValueError('Either `task` or `task_cycle` must be set.')
+    logging.info('Task for this run: "%s"', self.task_name)
+    logging.info('config_for_iclr=True; do_code_simplification=%s',
+                 do_code_simplification)
+    self.rl_task = code_tasks.make_task(
+        task_name=self.task_name,
+        override_kwargs=ast.literal_eval(env_config.task_kwargs),
+        max_code_length=global_config.timestep_limit,
+        require_correct_syntax=env_config.correct_syntax,
+        do_code_simplification=do_code_simplification,
+        correct_bonus=env_config.task_manager_config.correct_bonus,
+        code_length_bonus=env_config.task_manager_config.code_length_bonus)
+  def sample_rl_batch(self):
+    """Create reward functions from the current task.
+    Returns:
+      RLBatch namedtuple instance, which holds functions and information for
+      a minibatch of episodes.
+      * reward_fns: A reward function for each episode. Maps code string to
+          reward.
+      * batch_size: Number of episodes in this minibatch.
+      * good_reward: Estimated threshold of rewards which indicate the algorithm
+          is starting to solve the task. This is a heuristic that tries to
+          reduce the amount of stuff written to disk.
+    """
+    reward_fns = self.rl_task.rl_batch(self.batch_size)
+    return RLBatch(
+        reward_fns=reward_fns,
+        batch_size=self.batch_size,
+        good_reward=self.rl_task.good_reward)
--- a/research/brain_coder/single_task/defaults.py
+++ b/research/brain_coder/single_task/defaults.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Default configuration for agent and environment."""
+from absl import logging
+from common import config_lib  # brain coder
+def default_config():
+  return config_lib.Config(
+      agent=config_lib.OneOf(
+          [config_lib.Config(
+              algorithm='pg',
+              policy_lstm_sizes=[35,35],
+              # Set value_lstm_sizes to None to share weights with policy.
+              value_lstm_sizes=[35,35],
+              obs_embedding_size=10,
+              grad_clip_threshold=10.0,
+              param_init_factor=1.0,
+              lr=5e-5,
+              pi_loss_hparam=1.0,
+              vf_loss_hparam=0.5,
+              entropy_beta=1e-2,
+              regularizer=0.0,
+              softmax_tr=1.0,  # Reciprocal temperature.
+              optimizer='rmsprop',  # 'adam', 'sgd', 'rmsprop'
+              topk=0,  # Top-k unique codes will be stored.
+              topk_loss_hparam=0.0,  # off policy loss multiplier.
+              # Uniformly sample this many episodes from topk buffer per batch.
+              # If topk is 0, this has no effect.
+              topk_batch_size=1,
+              # Exponential moving average baseline for REINFORCE.
+              # If zero, A2C is used.
+              # If non-zero, should be close to 1, like .99, .999, etc.
+              ema_baseline_decay=0.99,
+              # Whether agent can emit EOS token. If true, agent can emit EOS
+              # token which ends the episode early (ends the sequence).
+              # If false, agent must emit tokens until the timestep limit is
+              # reached. e.g. True means variable length code, False means fixed
+              # length code.
+              # WARNING: Making this false slows things down.
+              eos_token=False,
+              replay_temperature=1.0,
+              # Replay probability. 1 = always replay, 0 = always on policy.
+              alpha=0.0,
+              # Whether to normalize importance weights in each minibatch.
+              iw_normalize=True),
+           config_lib.Config(
+              algorithm='ga',
+              crossover_rate=0.99,
+              mutation_rate=0.086),
+           config_lib.Config(
+              algorithm='rand')],
+          algorithm='pg',
+      ),
+      env=config_lib.Config(
+          # If True, task-specific settings are not needed.
+          task='',  # 'print', 'echo', 'reverse', 'remove', ...
+          task_cycle=[],  # If non-empty, reptitions will cycle through tasks.
+          task_kwargs='{}',  # Python dict literal.
+          task_manager_config=config_lib.Config(
+              # Reward recieved per test case. These bonuses will be scaled
+              # based on how many test cases there are.
+              correct_bonus=2.0,  # Bonus for code getting correct answer.
+              code_length_bonus=1.0),  # Maximum bonus for short code.
+          correct_syntax=False,
+      ),
+      batch_size=64,
+      timestep_limit=32)
+def default_config_with_updates(config_string, do_logging=True):
+  if do_logging:
+    logging.info('Config string: "%s"', config_string)
+  config = default_config()
+  config.strict_update(config_lib.Config.parse(config_string))
+  if do_logging:
+    logging.info('Config:\n%s', config.pretty_str())
+  return config
--- a/research/brain_coder/single_task/ga_lib.py
+++ b/research/brain_coder/single_task/ga_lib.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Genetic algorithm for BF tasks.
+Inspired by https://github.com/primaryobjects/AI-Programmer.
+GA function code borrowed from https://github.com/DEAP/deap.
+"""
+from collections import namedtuple
+import random
+from absl import flags
+from absl import logging
+import numpy as np
+from common import bf  # brain coder
+from common import utils  # brain coder
+from single_task import misc  # brain coder
+FLAGS = flags.FLAGS
+# Saving reward of previous programs saves computation if a program appears
+# again.
+USE_REWARD_CACHE = True  # Disable this if GA is using up too much memory.
+GENES = bf.CHARS
+MAX_PROGRAM_STEPS = 500
+STEP_BONUS = True
+ALPHANUM_CHARS = (
+    ['_'] +
+    [chr(ord('a') + i_) for i_ in range(26)] +
+    [chr(ord('A') + i_) for i_ in range(26)] +
+    [chr(ord('0') + i_) for i_ in range(10)])
+Result = namedtuple(
+    'Result',
+    ['reward', 'inputs', 'code_outputs', 'target_outputs', 'type_in',
+     'type_out', 'base', 'correct'])
+class IOType(object):
+  string = 'string'
+  integer = 'integer'
+class CustomType(object):
+  def __init__(self, to_str_fn):
+    self.to_str_fn = to_str_fn
+  def __call__(self, obj):
+    return self.to_str_fn(obj)
+def tokens_list_repr(tokens, repr_type, base):
+  """Make human readable representation of program IO."""
+  if isinstance(repr_type, CustomType):
+    return repr_type(tokens)
+  elif repr_type == IOType.string:
+    chars = (
+        [ALPHANUM_CHARS[t] for t in tokens] if base < len(ALPHANUM_CHARS)
+        else [chr(t) for t in tokens])
+    return ''.join(chars)
+  elif repr_type == IOType.integer:
+    return str(tokens)
+  raise ValueError('No such representation type "%s"', repr_type)
+def io_repr(result):
+  """Make human readable representation of test cases."""
+  inputs = ','.join(
+      tokens_list_repr(tokens, result.type_in, result.base)
+      for tokens in result.inputs)
+  code_outputs = ','.join(
+      tokens_list_repr(tokens, result.type_out, result.base)
+      for tokens in result.code_outputs)
+  target_outputs = ','.join(
+      tokens_list_repr(tokens, result.type_out, result.base)
+      for tokens in result.target_outputs)
+  return inputs, target_outputs, code_outputs
+def make_task_eval_fn(task_manager):
+  """Returns a wrapper that converts an RL task into a GA task.
+  Args:
+    task_manager: Is a task manager object from code_tasks.py
+  Returns:
+    A function that takes as input a single list of a code chars, and outputs
+    a Result namedtuple instance containing the reward and information about
+    code execution.
+  """
+  def to_data_list(single_or_tuple):
+    if isinstance(single_or_tuple, misc.IOTuple):
+      return list(single_or_tuple)
+    return [single_or_tuple]
+  def to_ga_type(rl_type):
+    if rl_type == misc.IOType.string:
+      return IOType.string
+    return IOType.integer
+  # Wrapper function.
+  def evalbf(bf_chars):
+    result = task_manager._score_code(''.join(bf_chars))
+    reward = sum(result.episode_rewards)
+    correct = result.reason == 'correct'
+    return Result(
+        reward=reward,
+        inputs=to_data_list(result.input_case),
+        code_outputs=to_data_list(result.code_output),
+        target_outputs=to_data_list(result.correct_output),
+        type_in=to_ga_type(result.input_type),
+        type_out=to_ga_type(result.output_type),
+        correct=correct,
+        base=task_manager.task.base)
+  return evalbf
+def debug_str(individual, task_eval_fn):
+  res = task_eval_fn(individual)
+  input_str, target_output_str, code_output_str = io_repr(res)
+  return (
+      ''.join(individual) +
+      ' | ' + input_str +
+      ' | ' + target_output_str +
+      ' | ' + code_output_str +
+      ' | ' + str(res.reward) +
+      ' | ' + str(res.correct))
+def mutate_single(code_tokens, mutation_rate):
+  """Mutate a single code string.
+  Args:
+    code_tokens: A string/list/Individual of BF code chars. Must end with EOS
+        symbol '_'.
+    mutation_rate: Float between 0 and 1 which sets the probability of each char
+        being mutated.
+  Returns:
+    An Individual instance containing the mutated code string.
+  Raises:
+    ValueError: If `code_tokens` does not end with EOS symbol.
+  """
+  if len(code_tokens) <= 1:
+    return code_tokens
+  if code_tokens[-1] == '_':
+    # Do this check to ensure that the code strings have not been corrupted.
+    raise ValueError('`code_tokens` must end with EOS symbol.')
+  else:
+    cs = Individual(code_tokens)
+    eos = []
+  mutated = False
+  for pos in range(len(cs)):
+    if random.random() < mutation_rate:
+      mutated = True
+      new_char = GENES[random.randrange(len(GENES))]
+      x = random.random()
+      if x < 0.25 and pos != 0 and pos != len(cs) - 1:
+        # Insertion mutation.
+        if random.random() < 0.50:
+          # Shift up.
+          cs = cs[:pos] + [new_char] + cs[pos:-1]
+        else:
+          # Shift down.
+          cs = cs[1:pos] + [new_char] + cs[pos:]
+      elif x < 0.50:
+        # Deletion mutation.
+        if random.random() < 0.50:
+          # Shift down.
+          cs = cs[:pos] + cs[pos + 1:] + [new_char]
+        else:
+          # Shift up.
+          cs = [new_char] + cs[:pos] + cs[pos + 1:]
+      elif x < 0.75:
+        # Shift rotate mutation (position invariant).
+        if random.random() < 0.50:
+          # Shift down.
+          cs = cs[1:] + [cs[0]]
+        else:
+          # Shift up.
+          cs = [cs[-1]] + cs[:-1]
+      else:
+        # Replacement mutation.
+        cs = cs[:pos] + [new_char] + cs[pos + 1:]
+  assert len(cs) + len(eos) == len(code_tokens)
+  if mutated:
+    return Individual(cs + eos)
+  else:
+    return Individual(code_tokens)
+def crossover(parent1, parent2):
+  """Performs crossover mating between two code strings.
+  Crossover mating is where a random position is selected, and the chars
+  after that point are swapped. The resulting new code strings are returned.
+  Args:
+    parent1: First code string.
+    parent2: Second code string.
+  Returns:
+    A 2-tuple of children, i.e. the resulting code strings after swapping.
+  """
+  max_parent, min_parent = (
+      (parent1, parent2) if len(parent1) > len(parent2)
+      else (parent2, parent1))
+  pos = random.randrange(len(max_parent))
+  if pos >= len(min_parent):
+    child1 = max_parent[:pos]
+    child2 = min_parent + max_parent[pos:]
+  else:
+    child1 = max_parent[:pos] + min_parent[pos:]
+    child2 = min_parent[:pos] + max_parent[pos:]
+  return Individual(child1), Individual(child2)
+def _make_even(n):
+  """Return largest even integer less than or equal to `n`."""
+  return (n >> 1) << 1
+def mutate_and_crossover(population, mutation_rate, crossover_rate):
+  """Take a generational step over a population.
+  Transforms population of parents into population of children (of the same
+  size) via crossover mating and then mutation on the resulting children.
+  Args:
+    population: Parent population. A list of Individual objects.
+    mutation_rate: Probability of mutation. See `mutate_single`.
+    crossover_rate: Probability that two parents will mate.
+  Returns:
+    Child population. A list of Individual objects.
+  """
+  children = [None] * len(population)
+  for i in xrange(0, _make_even(len(population)), 2):
+    p1 = population[i]
+    p2 = population[i + 1]
+    if random.random() < crossover_rate:
+      p1, p2 = crossover(p1, p2)
+    c1 = mutate_single(p1, mutation_rate)
+    c2 = mutate_single(p2, mutation_rate)
+    children[i] = c1
+    children[i + 1] = c2
+  if children[-1] is None:
+    children[-1] = population[-1]
+  return children
+def ga_loop(population, cxpb, mutpb, ngen, task_eval_fn, halloffame=None,
+            checkpoint_writer=None):
+  """A bare bones genetic algorithm.
+  Similar to chapter 7 of Back, Fogel and Michalewicz, "Evolutionary
+  Computation 1 : Basic Algorithms and Operators", 2000.
+  Args:
+    population: A list of individuals.
+    cxpb: The probability of mating two individuals.
+    mutpb: The probability of mutating a gene.
+    ngen: The number of generation. Unlimited if zero.
+    task_eval_fn: A python function which maps an Individual to a Result
+        namedtuple.
+    halloffame: (optional) a utils.MaxUniquePriorityQueue object that will be
+        used to aggregate the best individuals found during search.
+    checkpoint_writer: (optional) an object that can save and load populations.
+        Needs to have `write`, `load`, and `has_checkpoint` methods. Used to
+        periodically save progress. In event of a restart, the population will
+        be loaded from disk.
+  Returns:
+    GaResult namedtuple instance. This contains information about the GA run,
+    including the resulting population, best reward (fitness) obtained, and
+    the best code string found.
+  """
+  has_checkpoint = False
+  if checkpoint_writer and checkpoint_writer.has_checkpoint():
+    try:
+      gen, population, halloffame = checkpoint_writer.load()
+    except EOFError:  # Data was corrupted. Start over.
+      pass
+    else:
+      has_checkpoint = True
+      logging.info(
+          'Loaded population from checkpoint. Starting at generation %d', gen)
+      # Evaluate the individuals with an invalid fitness
+      invalid_ind = [ind for ind in population if not ind.fitness.valid]
+      for ind in invalid_ind:
+        ind.fitness.values = task_eval_fn(ind).reward,
+      for _, ind in halloffame.iter_in_order():
+        ind.fitness.values = task_eval_fn(ind).reward,
+  if not has_checkpoint:
+    # Evaluate the individuals with an invalid fitness
+    invalid_ind = [ind for ind in population if not ind.fitness.valid]
+    for ind in invalid_ind:
+      ind.fitness.values = task_eval_fn(ind).reward,
+    if halloffame is not None:
+      for ind in population:
+        halloffame.push(ind.fitness.values, tuple(ind), ind)
+    logging.info('Initialized new population.')
+    gen = 1
+  pop_size = len(population)
+  program_reward_cache = {} if USE_REWARD_CACHE else None
+  # Begin the generational process
+  while ngen == 0 or gen <= ngen:
+    # Select the next generation individuals
+    offspring = roulette_selection(population, pop_size - len(halloffame))
+    # Vary the pool of individuals
+    # offspring = varAnd(offspring, toolbox, cxpb, mutpb)
+    offspring = mutate_and_crossover(
+        offspring, mutation_rate=mutpb, crossover_rate=cxpb)
+    # Evaluate the individuals with an invalid fitness
+    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
+    for ind in invalid_ind:
+      str_repr = ''.join(ind)
+      if program_reward_cache is not None and str_repr in program_reward_cache:
+        ind.fitness.values = (program_reward_cache[str_repr],)
+      else:
+        eval_result = task_eval_fn(ind)
+        ind.fitness.values = (eval_result.reward,)
+        if program_reward_cache is not None:
+          program_reward_cache[str_repr] = eval_result.reward
+    # Replace the current population by the offspring
+    population = list(offspring)
+    # Update the hall of fame with the generated individuals
+    if halloffame is not None:
+      for ind in population:
+        halloffame.push(ind.fitness.values, tuple(ind), ind)
+    # elitism
+    population.extend([ind for _, ind in halloffame.iter_in_order()])
+    if gen % 100 == 0:
+      top_code = '\n'.join([debug_str(ind, task_eval_fn)
+                            for ind in topk(population, k=4)])
+      logging.info('gen: %d\nNPE: %d\n%s\n\n', gen, gen * pop_size, top_code)
+      best_code = ''.join(halloffame.get_max()[1])
+      res = task_eval_fn(best_code)
+      # Write population and hall-of-fame to disk.
+      if checkpoint_writer:
+        checkpoint_writer.write(gen, population, halloffame)
+      if res.correct:
+        logging.info('Solution found:\n%s\nreward = %s\n',
+                     best_code, res.reward)
+        break
+    gen += 1
+  best_code = ''.join(halloffame.get_max()[1])
+  res = task_eval_fn(best_code)
+  return GaResult(
+      population=population, best_code=best_code, reward=res.reward,
+      solution_found=res.correct, generations=gen,
+      num_programs=gen * len(population),
+      max_generations=ngen, max_num_programs=ngen * len(population))
+GaResult = namedtuple(
+    'GaResult',
+    ['population', 'best_code', 'reward', 'generations', 'num_programs',
+     'solution_found', 'max_generations', 'max_num_programs'])
+def reward_conversion(reward):
+  """Convert real value into positive value."""
+  if reward <= 0:
+    return 0.05
+  return reward + 0.05
+def roulette_selection(population, k):
+  """Select `k` individuals with prob proportional to fitness.
+  Each of the `k` selections is independent.
+  Warning:
+    The roulette selection by definition cannot be used for minimization
+    or when the fitness can be smaller or equal to 0.
+  Args:
+    population: A list of Individual objects to select from.
+    k: The number of individuals to select.
+  Returns:
+    A list of selected individuals.
+  """
+  fitnesses = np.asarray(
+      [reward_conversion(ind.fitness.values[0])
+       for ind in population])
+  assert np.all(fitnesses > 0)
+  sum_fits = fitnesses.sum()
+  chosen = [None] * k
+  for i in xrange(k):
+    u = random.random() * sum_fits
+    sum_ = 0
+    for ind, fitness in zip(population, fitnesses):
+      sum_ += fitness
+      if sum_ > u:
+        chosen[i] = Individual(ind)
+        break
+    if not chosen[i]:
+      chosen[i] = Individual(population[-1])
+  return chosen
+def make_population(make_individual_fn, n):
+  return [make_individual_fn() for _ in xrange(n)]
+def best(population):
+  best_ind = None
+  for ind in population:
+    if best_ind is None or best_ind.fitness.values < ind.fitness.values:
+      best_ind = ind
+  return best_ind
+def topk(population, k):
+  q = utils.MaxUniquePriorityQueue(k)
+  for ind in population:
+    q.push(ind.fitness.values, tuple(ind), ind)
+  return [ind for _, ind in q.iter_in_order()]
+class Fitness(object):
+  def __init__(self):
+    self.values = ()
+  @property
+  def valid(self):
+    """Assess if a fitness is valid or not."""
+    return bool(self.values)
+class Individual(list):
+  def __init__(self, *args):
+    super(Individual, self).__init__(*args)
+    self.fitness = Fitness()
+def random_individual(genome_size):
+  return lambda: Individual(np.random.choice(GENES, genome_size).tolist())
--- a/research/brain_coder/single_task/ga_train.py
+++ b/research/brain_coder/single_task/ga_train.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Genetic algorithm for BF tasks.
+Also contains the uniform random search algorithm.
+Inspired by https://github.com/primaryobjects/AI-Programmer.
+GA function code borrowed from https://github.com/DEAP/deap.
+"""
+import cPickle
+import os
+import sys
+from time import sleep
+from absl import flags
+from absl import logging
+import numpy as np
+import tensorflow as tf
+from common import utils  # brain coder
+from single_task import data  # brain coder
+from single_task import defaults  # brain coder
+from single_task import ga_lib  # brain coder
+from single_task import results_lib  # brain coder
+FLAGS = flags.FLAGS
+def define_tuner_hparam_space(hparam_space_type):
+  """Define tunable hparams for grid search."""
+  if hparam_space_type != 'ga':
+    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
+  return {
+      'population_size': [10, 25, 50, 100, 500],
+      'crossover_rate': [0.2, 0.5, 0.7, 0.9, 0.95],
+      'mutation_rate': [0.01, 0.03, 0.05, 0.1, 0.15]}
+def write_hparams_to_config(config, hparams, hparam_space_type):
+  """Write hparams given by the tuner into the Config object."""
+  if hparam_space_type != 'ga':
+    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
+  config.batch_size = hparams.population_size
+  config.agent.crossover_rate = hparams.crossover_rate
+  config.agent.mutation_rate = hparams.mutation_rate
+class CheckpointWriter(object):
+  """Manages loading and saving GA populations to disk.
+  This object is used by the genetic algorithm to save progress periodically
+  so that a recent population can be loaded from disk in the event of a restart.
+  """
+  def __init__(self, checkpoint_dir, population_size):
+    self.checkpoint_file = os.path.join(checkpoint_dir, 'checkpoint.pickle')
+    self.population_size = population_size
+  def write(self, gen, population, halloffame):
+    """Write GA state to disk.
+    Overwrites previous saved state.
+    Args:
+      gen: Generation number.
+      population: List of Individual objects.
+      halloffame: Hall-of-fame buffer. Typically a priority queue.
+    """
+    raw = cPickle.dumps((gen, population, halloffame))
+    with tf.gfile.FastGFile(self.checkpoint_file, 'w') as f:
+      f.write(raw)
+  def load(self):
+    """Loads GA state from disk.
+    Loads whatever is on disk, which will be whatever the most recent call
+    to `write` wrote.
+    Returns:
+      gen: Generation number.
+      population: List of Individual objects.
+      halloffame: Hall-of-fame buffer. Typically a priority queue.
+    """
+    with tf.gfile.FastGFile(self.checkpoint_file, 'r') as f:
+      raw = f.read()
+    objs = cPickle.loads(raw)
+    # Validate data.
+    assert isinstance(objs, tuple) and len(objs) == 3, (
+        'Expecting a 3-tuple, but got %s instead.' % (objs,))
+    gen, population, halloffame = objs
+    assert isinstance(gen, int), (
+        'Expecting `gen` to be an integer, got %s' % (gen,))
+    assert (
+        isinstance(population, list)
+        and len(population) == self.population_size
+    ), (
+        'Expecting `population` to be a list with size %d, got %s'
+        % (self.population_size, population))
+    assert halloffame is None or len(halloffame) == 2, (
+        'Expecting hall-of-fame object to have length two, got length %d'
+        % len(halloffame))
+    logging.info('Loaded pop from checkpoint file: "%s".',
+                 self.checkpoint_file)
+    return gen, population, halloffame
+  def has_checkpoint(self):
+    """Checks if a checkpoint exists on disk, and if so returns True."""
+    return tf.gfile.Exists(self.checkpoint_file)
+def run_training(config=None, tuner=None, logdir=None, trial_name=None,  # pylint: disable=unused-argument
+                 is_chief=True):
+  """Do all training runs.
+  This is the top level training function for policy gradient based models.
+  Run this from the main function.
+  Args:
+    config: config_lib.Config instance containing global config (agent and
+        environment hparams). If None, config will be parsed from FLAGS.config.
+    tuner: (unused) A tuner instance. Leave as None if not tuning.
+    logdir: Parent directory where all data from all runs will be written. If
+        None, FLAGS.logdir will be used.
+    trial_name: (unused) If tuning, set this to a unique string that identifies
+        this trial. If `tuner` is not None, this also must be set.
+    is_chief: True if this worker is the chief.
+  Returns:
+    List of results dicts which were written to disk. Each training run gets a
+    results dict. Results dict contains metrics, i.e. (name, value) pairs which
+    give information about the training run.
+  Raises:
+    ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
+    ValueError: If results dicts read from disk contain invalid data.
+  """
+  if not config:
+    # If custom config is not given, get it from flags.
+    config = defaults.default_config_with_updates(FLAGS.config)
+  if not logdir:
+    logdir = FLAGS.logdir
+  if FLAGS.num_repetitions % FLAGS.num_workers != 0:
+    raise ValueError('Number of workers must divide number of repetitions')
+  num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
+  logging.info('Running %d reps globally.', FLAGS.num_repetitions)
+  logging.info('This worker will run %d local reps.', num_local_reps)
+  if FLAGS.max_npe:
+    max_generations = FLAGS.max_npe // config.batch_size
+    logging.info('Max samples per rep: %d', FLAGS.max_npe)
+    logging.info('Max generations per rep: %d', max_generations)
+  else:
+    max_generations = sys.maxint
+    logging.info('Running unlimited generations.')
+  assert FLAGS.num_workers > 0
+  logging.info('Starting experiment. Directory: "%s"', logdir)
+  results = results_lib.Results(logdir, FLAGS.task_id)
+  local_results_list = results.read_this_shard()
+  if local_results_list:
+    if local_results_list[0]['max_npe'] != FLAGS.max_npe:
+      raise ValueError(
+          'Cannot resume training. Max-NPE changed. Was %s, now %s',
+          local_results_list[0]['max_npe'], FLAGS.max_npe)
+    if local_results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
+      raise ValueError(
+          'Cannot resume training. Number of repetitions changed. Was %s, '
+          'now %s',
+          local_results_list[0]['max_global_repetitions'],
+          FLAGS.num_repetitions)
+  start_rep = len(local_results_list)
+  for rep in xrange(start_rep, num_local_reps):
+    global_rep = num_local_reps * FLAGS.task_id + rep
+    logging.info(
+        'Starting repetition: Rep = %d. (global rep = %d)',
+        rep, global_rep)
+    # Save data for each rep, like checkpoints, goes into separate folders.
+    run_dir = os.path.join(logdir, 'run_%d' % global_rep)
+    if not tf.gfile.IsDirectory(run_dir):
+      tf.gfile.MakeDirs(run_dir)
+    checkpoint_writer = CheckpointWriter(run_dir,
+                                         population_size=config.batch_size)
+    data_manager = data.DataManager(config, run_number=global_rep)
+    task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)
+    if config.agent.algorithm == 'rand':
+      logging.info('Running random search.')
+      assert FLAGS.max_npe
+      result = run_random_search(
+          FLAGS.max_npe, run_dir, task_eval_fn, config.timestep_limit)
+    else:
+      assert config.agent.algorithm == 'ga'
+      logging.info('Running genetic algorithm.')
+      pop = ga_lib.make_population(
+          ga_lib.random_individual(config.timestep_limit),
+          n=config.batch_size)
+      hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame.
+      result = ga_lib.ga_loop(
+          pop,
+          cxpb=config.agent.crossover_rate, mutpb=config.agent.mutation_rate,
+          task_eval_fn=task_eval_fn,
+          ngen=max_generations, halloffame=hof,
+          checkpoint_writer=checkpoint_writer)
+    logging.info('Finished rep. Num gens: %d', result.generations)
+    results_dict = {
+        'max_npe': FLAGS.max_npe,
+        'batch_size': config.batch_size,
+        'max_batches': FLAGS.max_npe // config.batch_size,
+        'npe': result.num_programs,
+        'max_global_repetitions': FLAGS.num_repetitions,
+        'max_local_repetitions': num_local_reps,
+        'code_solution': result.best_code if result.solution_found else '',
+        'best_reward': result.reward,
+        'num_batches': result.generations,
+        'found_solution': result.solution_found,
+        'task': data_manager.task_name,
+        'global_rep': global_rep}
+    logging.info('results_dict: %s', results_dict)
+    results.append(results_dict)
+  if is_chief:
+    logging.info(
+        'Worker is chief. Waiting for all workers to finish so that results '
+        'can be reported to the tuner.')
+    global_results_list, shard_stats = results.read_all(
+        num_shards=FLAGS.num_workers)
+    while not all(s.finished for s in shard_stats):
+      logging.info(
+          'Still waiting on these workers: %s',
+          ', '.join(
+              ['%d (%d reps left)'
+               % (i, s.max_local_reps - s.num_local_reps_completed)
+               for i, s in enumerate(shard_stats)
+               if not s.finished]))
+      sleep(60)
+      global_results_list, shard_stats = results.read_all(
+          num_shards=FLAGS.num_workers)
+    logging.info(
+        '%d results obtained. Chief worker is exiting the experiment.',
+        len(global_results_list))
+    return global_results_list
+def run_random_search(max_num_programs, checkpoint_dir, task_eval_fn,
+                      timestep_limit):
+  """Run uniform random search routine.
+  Randomly samples programs from a uniform distribution until either a valid
+  program is found, or the maximum NPE is reached. Results are written to disk
+  and returned.
+  Args:
+    max_num_programs: Maximum NPE (number of programs executed). If no solution
+        is found after this many programs are tried, the run is stopped and
+        considered a failure.
+    checkpoint_dir: Where to save state during the run.
+    task_eval_fn: Function that maps code string to result containing total
+        reward and info about success.
+    timestep_limit: Maximum length of code strings.
+  Returns:
+    ga_lib.GaResult namedtuple instance. This contains the best code and highest
+    reward found.
+  """
+  checkpoint_file = os.path.join(checkpoint_dir, 'random_search.txt')
+  num_programs_seen = 0
+  found_solution = False
+  best_code = ''
+  best_reward = 0.0
+  if tf.gfile.Exists(checkpoint_file):
+    try:
+      with tf.gfile.FastGFile(checkpoint_file, 'r') as f:
+        lines = list(f)
+        num_programs_seen = int(lines[0])
+        found_solution = bool(int(lines[1]))
+        if found_solution:
+          best_code = lines[2]
+          best_reward = float(lines[3])
+    except:  # pylint: disable=bare-except
+      pass
+  while not found_solution and num_programs_seen < max_num_programs:
+    if num_programs_seen % 1000 == 0:
+      logging.info('num_programs_seen = %d', num_programs_seen)
+      with tf.gfile.FastGFile(checkpoint_file, 'w') as f:
+        f.write(str(num_programs_seen) + '\n')
+        f.write(str(int(found_solution)) + '\n')
+    code = np.random.choice(ga_lib.GENES, timestep_limit).tolist()
+    res = task_eval_fn(code)
+    found_solution = res.correct
+    num_programs_seen += 1
+    if found_solution:
+      best_code = ''.join(code)
+      best_reward = res.reward
+  logging.info('num_programs_seen = %d', num_programs_seen)
+  logging.info('found solution: %s', found_solution)
+  with tf.gfile.FastGFile(checkpoint_file, 'w') as f:
+    f.write(str(num_programs_seen) + '\n')
+    f.write(str(int(found_solution)) + '\n')
+    if found_solution:
+      f.write(best_code + '\n')
+      f.write(str(best_reward) + '\n')
+  return ga_lib.GaResult(
+      population=[], best_code=best_code, reward=best_reward,
+      solution_found=found_solution, generations=num_programs_seen,
+      num_programs=num_programs_seen, max_generations=max_num_programs,
+      max_num_programs=max_num_programs)
--- a/research/brain_coder/single_task/ga_train_test.py
+++ b/research/brain_coder/single_task/ga_train_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for ga_train.
+Tests that ga runs for a few generations without crashing.
+"""
+from absl import flags
+import tensorflow as tf
+from single_task import defaults  # brain coder
+from single_task import run  # brain coder
+FLAGS = flags.FLAGS
+class GaTest(tf.test.TestCase):
+  def RunTrainingSteps(self, config_string, num_steps=10):
+    """Run a few training steps with the given config.
+    Just check that nothing crashes.
+    Args:
+      config_string: Config encoded in a string. See
+          $REPO_PATH/common/config_lib.py
+      num_steps: Number of training steps to run. Defaults to 10.
+    """
+    config = defaults.default_config_with_updates(config_string)
+    FLAGS.max_npe = num_steps * config.batch_size
+    FLAGS.logdir = tf.test.get_temp_dir()
+    FLAGS.config = config_string
+    run.main(None)
+  def testGeneticAlgorithm(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="ga"),'
+        'timestep_limit=40,batch_size=64')
+  def testUniformRandomSearch(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="rand"),'
+        'timestep_limit=40,batch_size=64')
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/launch_training.sh
+++ b/research/brain_coder/single_task/launch_training.sh
+#!/bin/bash
+# Launches training jobs.
+# Modify this file to launch workers with your prefered cloud API.
+# The following implementation runs each worker as a subprocess on the local
+# machine.
+MODELS_DIR="/tmp/models"
+# Get command line options.
+OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_workers:,num_ps:,max_npe:,num_repetitions:,stop_on_success:" -- "$@")
+if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
+eval set -- "$OPTS"
+JOB_NAME=""           # Name of the process and the logs directory.
+CONFIG=""             # Model and environment hparams.
+# NUM_WORKERS: Number of workers to launch for this training job. If using
+# neural networks, each worker will be 1 replica.
+NUM_WORKERS=1
+# NUM_PS: Number of parameter servers to launch for this training job. Only set
+# this if using neural networks. For 1 worker, no parameter servers are needed.
+# For more than 1 worker, at least 1 parameter server is needed to store the
+# global model.
+NUM_PS=0
+# MAX_NPE: Maximum number of programs executed. Training will quit once this
+# threshold is reached. If 0, the threshold is infinite.
+MAX_NPE=0
+NUM_REPETITIONS=1     # How many times to run this experiment.
+STOP_ON_SUCCESS=true  # Whether to halt training when a solution is found.
+# Parse options into variables.
+while true; do
+  case "$1" in
+    --job_name ) JOB_NAME="$2"; shift; shift ;;
+    --config ) CONFIG="$2"; shift; shift ;;
+    --num_workers ) NUM_WORKERS="$2"; shift; shift ;;
+    --num_ps ) NUM_PS="$2"; shift; shift ;;
+    --max_npe ) MAX_NPE="$2"; shift; shift ;;
+    --num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
+    --stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
+    -- ) shift; break ;;
+    * ) break ;;
+  esac
+done
+# Launch jobs.
+# TODO: multi-worker RL training
+LOGDIR="$MODELS_DIR/$JOB_NAME"
+mkdir -p $LOGDIR
+BIN_DIR="bazel-bin/single_task"
+for (( i=0; i<NUM_WORKERS; i++))
+do
+  # Expecting run.par to be built.
+  $BIN_DIR/run.par \
+      --alsologtostderr \
+      --config="$CONFIG" \
+      --logdir="$LOGDIR" \
+      --max_npe="$MAX_NPE" \
+      --num_repetitions="$NUM_REPETITIONS" \
+      --stop_on_success="$STOP_ON_SUCCESS" \
+      --task_id="$i" \
+      --num_workers="$NUM_WORKERS" \
+      --summary_tasks=1 \
+      2> "$LOGDIR/task_$i.log" &  # Run as subprocess
+  echo "Launched task $i. Logs: $LOGDIR/task_$i.log"
+done
+# Use "pidof run.par" to find jobs.
+# Kill with "pkill run.par"
--- a/research/brain_coder/single_task/launch_tuning.sh
+++ b/research/brain_coder/single_task/launch_tuning.sh
+#!/bin/bash
+# Launches tuning jobs.
+# Modify this file to launch workers with your prefered cloud API.
+# The following implementation runs each worker as a subprocess on the local
+# machine.
+MODELS_DIR="/tmp/models"
+# Get command line options.
+OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_tuners:,num_workers_per_tuner:,num_ps_per_tuner:,max_npe:,num_repetitions:,stop_on_success:,fixed_hparams:,hparam_space_type:" -- "$@")
+if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
+eval set -- "$OPTS"
+JOB_NAME=""           # Name of the process and the logs directory.
+CONFIG=""             # Model and environment hparams.
+# NUM_TUNERS: Number of tuning jobs to launch. Each tuning job can train a
+# hparam combination. So more tuners means more hparams tried in parallel.
+NUM_TUNERS=1
+# NUM_WORKERS_PER_TUNER: Number of workers to launch for each tuning job. If
+# using neural networks, each worker will be 1 replica.
+NUM_WORKERS_PER_TUNER=1
+# NUM_PS_PER_TUNER: Number of parameter servers to launch for this tuning job.
+# Only set this if using neural networks. For 1 worker per tuner, no parameter
+# servers are needed. For more than 1 worker per tuner, at least 1 parameter
+# server per tuner is needed to store the global model for each tuner.
+NUM_PS_PER_TUNER=0
+# MAX_NPE: Maximum number of programs executed. Training will quit once this
+# threshold is reached. If 0, the threshold is infinite.
+MAX_NPE=0
+NUM_REPETITIONS=25    # How many times to run this experiment.
+STOP_ON_SUCCESS=true  # Whether to halt training when a solution is found.
+# FIXED_HPARAMS: Hold hparams fixed in the grid search. This reduces the search
+# space.
+FIXED_HPARAMS=""
+# HPARAM_SPACE_TYPE: Specifies the hparam search space. See
+# `define_tuner_hparam_space` functions defined in pg_train.py and ga_train.py.
+HPARAM_SPACE_TYPE="pg"
+# Parse options into variables.
+while true; do
+  case "$1" in
+    --job_name ) JOB_NAME="$2"; shift; shift ;;
+    --config ) CONFIG="$2"; shift; shift ;;
+    --num_tuners ) NUM_TUNERS="$2"; shift; shift ;;
+    --num_workers_per_tuner ) NUM_WORKERS_PER_TUNER="$2"; shift; shift ;;
+    --num_ps_per_tuner ) NUM_PS_PER_TUNER="$2"; shift; shift ;;
+    --max_npe ) MAX_NPE="$2"; shift; shift ;;
+    --num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
+    --stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
+    --fixed_hparams ) FIXED_HPARAMS="$2"; shift; shift ;;
+    --hparam_space_type ) HPARAM_SPACE_TYPE="$2"; shift; shift ;;
+    -- ) shift; break ;;
+    * ) break ;;
+  esac
+done
+# Launch jobs.
+# TODO: multi-worker RL training
+LOGDIR="$MODELS_DIR/$JOB_NAME"
+mkdir -p $LOGDIR
+BIN_DIR="bazel-bin/single_task"
+for ((tuner=0;tuner<NUM_TUNERS;tuner+=1)); do
+  for ((i=0;i<NUM_WORKERS_PER_TUNER;i++)); do
+    # Expecting tune.par to be built.
+    echo "$LOGDIR"
+    $BIN_DIR/tune.par \
+        --alsologtostderr \
+        --config="$CONFIG" \
+        --logdir="$LOGDIR" \
+        --max_npe="$MAX_NPE" \
+        --num_repetitions="$NUM_REPETITIONS" \
+        --stop_on_success="$STOP_ON_SUCCESS" \
+        --summary_tasks=1 \
+        --hparam_space="$HPARAM_SPACE_TYPE" \
+        --fixed_hparams="$FIXED_HPARAMS" \
+        --tuner_id=$tuner \
+        --num_tuners=$NUM_TUNERS \
+        2> "$LOGDIR/tuner_$tuner.task_$i.log" &  # Run as subprocess
+    echo "Launched tuner $tuner, task $i. Logs: $LOGDIR/tuner_$tuner.task_$i.log"
+  done
+done
+# Use "pidof tune.par" to find jobs.
+# Kill with "pkill tune.par"
--- a/research/brain_coder/single_task/misc.py
+++ b/research/brain_coder/single_task/misc.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Utilities specific to this project."""
+from collections import namedtuple
+from six import string_types
+#####################
+# BF-lang utilities #
+#####################
+BF_EOS_INT = 0  # Also used as SOS (start of sequence).
+BF_EOS_CHAR = TEXT_EOS_CHAR = '_'
+BF_LANG_INTS = range(1, 9)
+BF_INT_TO_CHAR = [BF_EOS_CHAR, '>', '<', '+', '-', '[', ']', '.', ',']
+BF_CHAR_TO_INT = dict([(c, i) for i, c in enumerate(BF_INT_TO_CHAR)])
+RewardInfo = namedtuple('RewardInfo', ['episode_rewards', 'input_case',
+                                       'correct_output',
+                                       'code_output', 'reason', 'input_type',
+                                       'output_type'])
+class IOType(object):
+  string = 'string'
+  integer = 'integer'
+  boolean = 'boolean'
+class IOTuple(tuple):
+  pass
+def flatten(lst):
+  return [item for row in lst for item in row]
+def bf_num_tokens():
+  # BF tokens plus EOS.
+  return len(BF_INT_TO_CHAR)
+def bf_char2int(bf_char):
+  """Convert BF code char to int token."""
+  return BF_CHAR_TO_INT[bf_char]
+def bf_int2char(bf_int):
+  """Convert BF int token to code char."""
+  return BF_INT_TO_CHAR[bf_int]
+def bf_tokens_to_string(bf_tokens, truncate=True):
+  """Convert token list to code string. Will truncate at EOS token.
+  Args:
+    bf_tokens: Python list of ints representing the code string.
+    truncate: If true, the output string will end at the first EOS token.
+        If false, the entire token list is converted to string.
+  Returns:
+    String representation of the tokens.
+  Raises:
+    ValueError: If bf_tokens is not a python list.
+  """
+  if not isinstance(bf_tokens, list):
+    raise ValueError('Only python list supported here.')
+  if truncate:
+    try:
+      eos_index = bf_tokens.index(BF_EOS_INT)
+    except ValueError:
+      eos_index = len(bf_tokens)
+  else:
+    eos_index = len(bf_tokens)
+  return ''.join([BF_INT_TO_CHAR[t] for t in bf_tokens[:eos_index]])
+def bf_string_to_tokens(bf_string):
+  """Convert string to token list. Will strip and append EOS token."""
+  tokens = [BF_CHAR_TO_INT[char] for char in bf_string.strip()]
+  tokens.append(BF_EOS_INT)
+  return tokens
+def tokens_to_text(tokens):
+  """Convert token list to human readable text."""
+  return ''.join(
+      [TEXT_EOS_CHAR if t == 0 else chr(t - 1 + ord('A')) for t in tokens])
+###################################
+# Number representation utilities #
+###################################
+# https://en.wikipedia.org/wiki/Metric_prefix
+si_magnitudes = {
+    'k': 1e3,
+    'm': 1e6,
+    'g': 1e9}
+def si_to_int(s):
+  """Convert string ending with SI magnitude to int.
+  Examples: 5K ==> 5000, 12M ==> 12000000.
+  Args:
+    s: String in the form 'xx..xP' where x is a digit and P is an SI prefix.
+  Returns:
+    Integer equivalent to the string.
+  """
+  if isinstance(s, string_types) and s[-1].lower() in si_magnitudes.keys():
+    return int(int(s[:-1]) * si_magnitudes[s[-1].lower()])
+  return int(s)
+def int_to_si(n):
+  """Convert integer to string with SI magnitude.
+  `n` will be truncated.
+  Examples: 5432 ==> 5k, 12345678 ==> 12M
+  Args:
+    n: Integer to represent as a string.
+  Returns:
+    String representation of `n` containing SI magnitude.
+  """
+  m = abs(n)
+  sign = -1 if n < 0 else 1
+  if m < 1e3:
+    return str(n)
+  if m < 1e6:
+    return '{0}K'.format(sign*int(m / 1e3))
+  if m < 1e9:
+    return '{0}M'.format(sign*int(m / 1e6))
+  if m < 1e12:
+    return '{0}G'.format(sign*int(m / 1e9))
+  return str(m)
--- a/research/brain_coder/single_task/pg_agent.py
+++ b/research/brain_coder/single_task/pg_agent.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Language model agent.
+Agent outputs code in a sequence just like a language model. Can be trained
+as a language model or using RL, or a combination of the two.
+"""
+from collections import namedtuple
+from math import exp
+from math import log
+import time
+from absl import logging
+import numpy as np
+import tensorflow as tf
+from common import rollout as rollout_lib  # brain coder
+from common import utils  # brain coder
+from single_task import misc  # brain coder
+# Experiments in the ICLR 2018 paper used reduce_sum instead of reduce_mean for
+# some losses. We make all loses be batch_size independent, and multiply the
+# changed losses by 64, which was the fixed batch_size when the experiments
+# where run. The loss hyperparameters still match what is reported in the paper.
+MAGIC_LOSS_MULTIPLIER = 64
+def rshift_time(tensor_2d, fill=misc.BF_EOS_INT):
+  """Right shifts a 2D tensor along the time dimension (axis-1)."""
+  dim_0 = tf.shape(tensor_2d)[0]
+  fill_tensor = tf.fill([dim_0, 1], fill)
+  return tf.concat([fill_tensor, tensor_2d[:, :-1]], axis=1)
+def join(a, b):
+  # Concat a and b along 0-th dim.
+  if a is None or len(a) == 0:  # pylint: disable=g-explicit-length-test
+    return b
+  if b is None or len(b) == 0:  # pylint: disable=g-explicit-length-test
+    return a
+  return np.concatenate((a, b))
+def make_optimizer(kind, lr):
+  if kind == 'sgd':
+    return tf.train.GradientDescentOptimizer(lr)
+  elif kind == 'adam':
+    return tf.train.AdamOptimizer(lr)
+  elif kind == 'rmsprop':
+    return tf.train.RMSPropOptimizer(learning_rate=lr, decay=0.99)
+  else:
+    raise ValueError('Optimizer type "%s" not recognized.' % kind)
+class LinearWrapper(tf.contrib.rnn.RNNCell):
+  """RNNCell wrapper that adds a linear layer to the output."""
+  def __init__(self, cell, output_size, dtype=tf.float32, suppress_index=None):
+    self.cell = cell
+    self._output_size = output_size
+    self._dtype = dtype
+    self._suppress_index = suppress_index
+    self.smallest_float = -2.4e38
+  def __call__(self, inputs, state, scope=None):
+    with tf.variable_scope(type(self).__name__):
+      outputs, state = self.cell(inputs, state, scope=scope)
+      logits = tf.matmul(
+          outputs,
+          tf.get_variable('w_output',
+                          [self.cell.output_size, self.output_size],
+                          dtype=self._dtype))
+      if self._suppress_index is not None:
+        # Replace the target index with -inf, so that it never gets selected.
+        batch_size = tf.shape(logits)[0]
+        logits = tf.concat(
+            [logits[:, :self._suppress_index],
+             tf.fill([batch_size, 1], self.smallest_float),
+             logits[:, self._suppress_index + 1:]],
+            axis=1)
+    return logits, state
+  @property
+  def output_size(self):
+    return self._output_size
+  @property
+  def state_size(self):
+    return self.cell.state_size
+  def zero_state(self, batch_size, dtype):
+    return self.cell.zero_state(batch_size, dtype)
+UpdateStepResult = namedtuple(
+    'UpdateStepResult',
+    ['global_step', 'global_npe', 'summaries_list', 'gradients_dict'])
+class AttrDict(dict):
+  """Dict with attributes as keys.
+  https://stackoverflow.com/a/14620633
+  """
+  def __init__(self, *args, **kwargs):
+    super(AttrDict, self).__init__(*args, **kwargs)
+    self.__dict__ = self
+class LMAgent(object):
+  """Language model agent."""
+  action_space = misc.bf_num_tokens()
+  observation_space = misc.bf_num_tokens()
+  def __init__(self, global_config, task_id=0,
+               logging_file=None,
+               experience_replay_file=None,
+               global_best_reward_fn=None,
+               found_solution_op=None,
+               assign_code_solution_fn=None,
+               program_count=None,
+               do_iw_summaries=False,
+               stop_on_success=True,
+               dtype=tf.float32,
+               verbose_level=0,
+               is_local=True):
+    self.config = config = global_config.agent
+    self.logging_file = logging_file
+    self.experience_replay_file = experience_replay_file
+    self.task_id = task_id
+    self.verbose_level = verbose_level
+    self.global_best_reward_fn = global_best_reward_fn
+    self.found_solution_op = found_solution_op
+    self.assign_code_solution_fn = assign_code_solution_fn
+    self.parent_scope_name = tf.get_variable_scope().name
+    self.dtype = dtype
+    self.allow_eos_token = config.eos_token
+    self.stop_on_success = stop_on_success
+    self.pi_loss_hparam = config.pi_loss_hparam
+    self.vf_loss_hparam = config.vf_loss_hparam
+    self.is_local = is_local
+    self.top_reward = 0.0
+    self.embeddings_trainable = True
+    self.no_op = tf.no_op()
+    self.learning_rate = tf.constant(
+        config.lr, dtype=dtype, name='learning_rate')
+    self.initializer = tf.contrib.layers.variance_scaling_initializer(
+        factor=config.param_init_factor,
+        mode='FAN_AVG',
+        uniform=True,
+        dtype=dtype)  # TF's default initializer.
+    tf.get_variable_scope().set_initializer(self.initializer)
+    self.a2c = config.ema_baseline_decay == 0
+    if not self.a2c:
+      logging.info('Using exponential moving average REINFORCE baselines.')
+      self.ema_baseline_decay = config.ema_baseline_decay
+      self.ema_by_len = [0.0] * global_config.timestep_limit
+    else:
+      logging.info('Using advantage (a2c) with learned value function.')
+      self.ema_baseline_decay = 0.0
+      self.ema_by_len = None
+    # Top-k
+    if config.topk and config.topk_loss_hparam:
+      self.topk_loss_hparam = config.topk_loss_hparam
+      self.topk_batch_size = config.topk_batch_size
+      if self.topk_batch_size <= 0:
+        raise ValueError('topk_batch_size must be a positive integer. Got %s',
+                         self.topk_batch_size)
+      self.top_episodes = utils.MaxUniquePriorityQueue(config.topk)
+      logging.info('Made max-priorty-queue with capacity %d',
+                   self.top_episodes.capacity)
+    else:
+      self.top_episodes = None
+      self.topk_loss_hparam = 0.0
+      logging.info('No max-priorty-queue')
+    # Experience replay.
+    self.replay_temperature = config.replay_temperature
+    self.num_replay_per_batch = int(global_config.batch_size * config.alpha)
+    self.num_on_policy_per_batch = (
+        global_config.batch_size - self.num_replay_per_batch)
+    self.replay_alpha = (
+        self.num_replay_per_batch / float(global_config.batch_size))
+    logging.info('num_replay_per_batch: %d', self.num_replay_per_batch)
+    logging.info('num_on_policy_per_batch: %d', self.num_on_policy_per_batch)
+    logging.info('replay_alpha: %s', self.replay_alpha)
+    if self.num_replay_per_batch > 0:
+      # Train with off-policy episodes from replay buffer.
+      start_time = time.time()
+      self.experience_replay = utils.RouletteWheel(
+          unique_mode=True, save_file=experience_replay_file)
+      logging.info('Took %s sec to load replay buffer from disk.',
+                   int(time.time() - start_time))
+      logging.info('Replay buffer file location: "%s"',
+                   self.experience_replay.save_file)
+    else:
+      # Only train on-policy.
+      self.experience_replay = None
+    if program_count is not None:
+      self.program_count = program_count
+      self.program_count_add_ph = tf.placeholder(
+          tf.int64, [], 'program_count_add_ph')
+      self.program_count_add_op = self.program_count.assign_add(
+          self.program_count_add_ph)
+    ################################
+    # RL policy and value networks #
+    ################################
+    batch_size = global_config.batch_size
+    logging.info('batch_size: %d', batch_size)
+    self.policy_cell = LinearWrapper(
+        tf.contrib.rnn.MultiRNNCell(
+            [tf.contrib.rnn.BasicLSTMCell(cell_size)
+             for cell_size in config.policy_lstm_sizes]),
+        self.action_space,
+        dtype=dtype,
+        suppress_index=None if self.allow_eos_token else misc.BF_EOS_INT)
+    self.value_cell = LinearWrapper(
+        tf.contrib.rnn.MultiRNNCell(
+            [tf.contrib.rnn.BasicLSTMCell(cell_size)
+             for cell_size in config.value_lstm_sizes]),
+        1,
+        dtype=dtype)
+    obs_embedding_scope = 'obs_embed'
+    with tf.variable_scope(
+        obs_embedding_scope,
+        initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)):
+      obs_embeddings = tf.get_variable(
+          'embeddings',
+          [self.observation_space, config.obs_embedding_size],
+          dtype=dtype, trainable=self.embeddings_trainable)
+      self.obs_embeddings = obs_embeddings
+    ################################
+    # RL policy and value networks #
+    ################################
+    initial_state = tf.fill([batch_size], misc.BF_EOS_INT)
+    def loop_fn(loop_time, cell_output, cell_state, loop_state):
+      """Function called by tf.nn.raw_rnn to instantiate body of the while_loop.
+      See https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn for more
+      information.
+      When time is 0, and cell_output, cell_state, loop_state are all None,
+      `loop_fn` will create the initial input, internal cell state, and loop
+      state. When time > 0, `loop_fn` will operate on previous cell output,
+      state, and loop state.
+      Args:
+        loop_time: A scalar tensor holding the current timestep (zero based
+            counting).
+        cell_output: Output of the raw_rnn cell at the current timestep.
+        cell_state: Cell internal state at the current timestep.
+        loop_state: Additional loop state. These tensors were returned by the
+            previous call to `loop_fn`.
+      Returns:
+        elements_finished: Bool tensor of shape [batch_size] which marks each
+            sequence in the batch as being finished or not finished.
+        next_input: A tensor containing input to be fed into the cell at the
+            next timestep.
+        next_cell_state: Cell internal state to be fed into the cell at the
+            next timestep.
+        emit_output: Tensor to be added to the TensorArray returned by raw_rnn
+            as output from the while_loop.
+        next_loop_state: Additional loop state. These tensors will be fed back
+            into the next call to `loop_fn` as `loop_state`.
+      """
+      if cell_output is None:  # 0th time step.
+        next_cell_state = self.policy_cell.zero_state(batch_size, dtype)
+        elements_finished = tf.zeros([batch_size], tf.bool)
+        output_lengths = tf.ones([batch_size], dtype=tf.int32)
+        next_input = tf.gather(obs_embeddings, initial_state)
+        emit_output = None
+        next_loop_state = (
+            tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True),
+            output_lengths,
+            elements_finished
+        )
+      else:
+        scaled_logits = cell_output * config.softmax_tr  # Scale temperature.
+        prev_chosen, prev_output_lengths, prev_elements_finished = loop_state
+        next_cell_state = cell_state
+        chosen_outputs = tf.to_int32(tf.where(
+            tf.logical_not(prev_elements_finished),
+            tf.multinomial(logits=scaled_logits, num_samples=1)[:, 0],
+            tf.zeros([batch_size], dtype=tf.int64)))
+        elements_finished = tf.logical_or(
+            tf.equal(chosen_outputs, misc.BF_EOS_INT),
+            loop_time >= global_config.timestep_limit)
+        output_lengths = tf.where(
+            elements_finished,
+            prev_output_lengths,
+            # length includes EOS token. empty seq has len 1.
+            tf.tile(tf.expand_dims(loop_time + 1, 0), [batch_size])
+        )
+        next_input = tf.gather(obs_embeddings, chosen_outputs)
+        emit_output = scaled_logits
+        next_loop_state = (prev_chosen.write(loop_time - 1, chosen_outputs),
+                           output_lengths,
+                           tf.logical_or(prev_elements_finished,
+                                         elements_finished))
+      return (elements_finished, next_input, next_cell_state, emit_output,
+              next_loop_state)
+    with tf.variable_scope('policy'):
+      (decoder_outputs_ta,
+       _,  # decoder_state
+       (sampled_output_ta, output_lengths, _)) = tf.nn.raw_rnn(
+           cell=self.policy_cell,
+           loop_fn=loop_fn)
+    policy_logits = tf.transpose(decoder_outputs_ta.stack(), (1, 0, 2),
+                                 name='policy_logits')
+    sampled_tokens = tf.transpose(sampled_output_ta.stack(), (1, 0),
+                                  name='sampled_tokens')
+    # Add SOS to beginning of the sequence.
+    rshift_sampled_tokens = rshift_time(sampled_tokens, fill=misc.BF_EOS_INT)
+    # Initial state is 0, 2nd state is first token.
+    # Note: If value of last state is computed, this will be used as bootstrap.
+    if self.a2c:
+      with tf.variable_scope('value'):
+        value_output, _ = tf.nn.dynamic_rnn(
+            self.value_cell,
+            tf.gather(obs_embeddings, rshift_sampled_tokens),
+            sequence_length=output_lengths,
+            dtype=dtype)
+      value = tf.squeeze(value_output, axis=[2])
+    else:
+      value = tf.zeros([], dtype=dtype)
+    # for sampling actions from the agent, and which told tensors for doing
+    # gradient updates on the agent.
+    self.sampled_batch = AttrDict(
+        logits=policy_logits,
+        value=value,
+        tokens=sampled_tokens,
+        episode_lengths=output_lengths,
+        probs=tf.nn.softmax(policy_logits),
+        log_probs=tf.nn.log_softmax(policy_logits))
+    # adjusted_lengths can be less than the full length of each episode.
+    # Use this to train on only part of an episode (starting from t=0).
+    self.adjusted_lengths = tf.placeholder(
+        tf.int32, [None], name='adjusted_lengths')
+    self.policy_multipliers = tf.placeholder(
+        dtype,
+        [None, None],
+        name='policy_multipliers')
+    # Empirical value, i.e. discounted sum of observed future rewards from each
+    # time step in the episode.
+    self.empirical_values = tf.placeholder(
+        dtype,
+        [None, None],
+        name='empirical_values')
+    # Off-policy training. Just add supervised loss to the RL loss.
+    self.off_policy_targets = tf.placeholder(
+        tf.int32,
+        [None, None],
+        name='off_policy_targets')
+    self.off_policy_target_lengths = tf.placeholder(
+        tf.int32, [None], name='off_policy_target_lengths')
+    self.actions = tf.placeholder(tf.int32, [None, None], name='actions')
+    # Add SOS to beginning of the sequence.
+    inputs = rshift_time(self.actions, fill=misc.BF_EOS_INT)
+    with tf.variable_scope('policy', reuse=True):
+      logits, _ = tf.nn.dynamic_rnn(
+          self.policy_cell, tf.gather(obs_embeddings, inputs),
+          sequence_length=self.adjusted_lengths,
+          dtype=dtype)
+    if self.a2c:
+      with tf.variable_scope('value', reuse=True):
+        value_output, _ = tf.nn.dynamic_rnn(
+            self.value_cell,
+            tf.gather(obs_embeddings, inputs),
+            sequence_length=self.adjusted_lengths,
+            dtype=dtype)
+      value2 = tf.squeeze(value_output, axis=[2])
+    else:
+      value2 = tf.zeros([], dtype=dtype)
+    self.given_batch = AttrDict(
+        logits=logits,
+        value=value2,
+        tokens=sampled_tokens,
+        episode_lengths=self.adjusted_lengths,
+        probs=tf.nn.softmax(logits),
+        log_probs=tf.nn.log_softmax(logits))
+    # Episode masks.
+    max_episode_length = tf.shape(self.actions)[1]
+    # range_row shape: [1, max_episode_length]
+    range_row = tf.expand_dims(tf.range(max_episode_length), 0)
+    episode_masks = tf.cast(
+        tf.less(range_row, tf.expand_dims(self.given_batch.episode_lengths, 1)),
+        dtype=dtype)
+    episode_masks_3d = tf.expand_dims(episode_masks, 2)
+    # Length adjusted episodes.
+    self.a_probs = a_probs = self.given_batch.probs * episode_masks_3d
+    self.a_log_probs = a_log_probs = (
+        self.given_batch.log_probs * episode_masks_3d)
+    self.a_value = a_value = self.given_batch.value * episode_masks
+    self.a_policy_multipliers = a_policy_multipliers = (
+        self.policy_multipliers * episode_masks)
+    if self.a2c:
+      self.a_empirical_values = a_empirical_values = (
+          self.empirical_values * episode_masks)
+    # pi_loss is scalar
+    acs_onehot = tf.one_hot(self.actions, self.action_space, dtype=dtype)
+    self.acs_onehot = acs_onehot
+    chosen_masked_log_probs = acs_onehot * a_log_probs
+    pi_target = tf.expand_dims(a_policy_multipliers, -1)
+    pi_loss_per_step = chosen_masked_log_probs * pi_target  # Maximize.
+    self.pi_loss = pi_loss = (
+        -tf.reduce_mean(tf.reduce_sum(pi_loss_per_step, axis=[1, 2]), axis=0)
+        * MAGIC_LOSS_MULTIPLIER)  # Minimize.
+    assert len(self.pi_loss.shape) == 0  # pylint: disable=g-explicit-length-test
+    # shape: [batch_size, time]
+    self.chosen_log_probs = tf.reduce_sum(chosen_masked_log_probs, axis=2)
+    self.chosen_probs = tf.reduce_sum(acs_onehot * a_probs, axis=2)
+    # loss of value function
+    if self.a2c:
+      vf_loss_per_step = tf.square(a_value - a_empirical_values)
+      self.vf_loss = vf_loss = (
+          tf.reduce_mean(tf.reduce_sum(vf_loss_per_step, axis=1), axis=0)
+          * MAGIC_LOSS_MULTIPLIER)  # Minimize.
+      assert len(self.vf_loss.shape) == 0  # pylint: disable=g-explicit-length-test
+    else:
+      self.vf_loss = vf_loss = 0.0
+    # Maximize entropy regularizer
+    self.entropy = entropy = (
+        -tf.reduce_mean(
+            tf.reduce_sum(a_probs * a_log_probs, axis=[1, 2]), axis=0)
+        * MAGIC_LOSS_MULTIPLIER)  # Maximize
+    self.negentropy = -entropy  # Minimize negentropy.
+    assert len(self.negentropy.shape) == 0  # pylint: disable=g-explicit-length-test
+    # off-policy loss
+    self.offp_switch = tf.placeholder(dtype, [], name='offp_switch')
+    if self.top_episodes is not None:
+      # Add SOS to beginning of the sequence.
+      offp_inputs = tf.gather(obs_embeddings,
+                              rshift_time(self.off_policy_targets,
+                                          fill=misc.BF_EOS_INT))
+      with tf.variable_scope('policy', reuse=True):
+        offp_logits, _ = tf.nn.dynamic_rnn(
+            self.policy_cell, offp_inputs, self.off_policy_target_lengths,
+            dtype=dtype)  # shape: [batch_size, time, action_space]
+      topk_loss_per_step = tf.nn.sparse_softmax_cross_entropy_with_logits(
+          labels=self.off_policy_targets,
+          logits=offp_logits,
+          name='topk_loss_per_logit')
+      # Take mean over batch dimension so that the loss multiplier strength is
+      # independent of batch size. Sum over time dimension.
+      topk_loss = tf.reduce_mean(
+          tf.reduce_sum(topk_loss_per_step, axis=1), axis=0)
+      assert len(topk_loss.shape) == 0  # pylint: disable=g-explicit-length-test
+      self.topk_loss = topk_loss * self.offp_switch
+      logging.info('Including off policy loss.')
+    else:
+      self.topk_loss = topk_loss = 0.0
+    self.entropy_hparam = tf.constant(
+        config.entropy_beta, dtype=dtype, name='entropy_beta')
+    self.pi_loss_term = pi_loss * self.pi_loss_hparam
+    self.vf_loss_term = vf_loss * self.vf_loss_hparam
+    self.entropy_loss_term = self.negentropy * self.entropy_hparam
+    self.topk_loss_term = self.topk_loss_hparam * topk_loss
+    self.loss = (
+        self.pi_loss_term
+        + self.vf_loss_term
+        + self.entropy_loss_term
+        + self.topk_loss_term)
+    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
+                               tf.get_variable_scope().name)
+    self.trainable_variables = params
+    self.sync_variables = self.trainable_variables
+    non_embedding_params = [p for p in params
+                            if obs_embedding_scope not in p.name]
+    self.non_embedding_params = non_embedding_params
+    self.params = params
+    if config.regularizer:
+      logging.info('Adding L2 regularizer with scale %.2f.',
+                   config.regularizer)
+      self.regularizer = config.regularizer * sum(
+          tf.nn.l2_loss(w) for w in non_embedding_params)
+      self.loss += self.regularizer
+    else:
+      logging.info('Skipping regularizer.')
+      self.regularizer = 0.0
+    # Only build gradients graph for local model.
+    if self.is_local:
+      unclipped_grads = tf.gradients(self.loss, params)
+      self.dense_unclipped_grads = [
+          tf.convert_to_tensor(g) for g in unclipped_grads]
+      self.grads, self.global_grad_norm = tf.clip_by_global_norm(
+          unclipped_grads, config.grad_clip_threshold)
+      self.gradients_dict = dict(zip(params, self.grads))
+      self.optimizer = make_optimizer(config.optimizer, self.learning_rate)
+      self.all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
+                                             tf.get_variable_scope().name)
+    self.do_iw_summaries = do_iw_summaries
+    if self.do_iw_summaries:
+      b = None
+      self.log_iw_replay_ph = tf.placeholder(tf.float32, [b],
+                                             'log_iw_replay_ph')
+      self.log_iw_policy_ph = tf.placeholder(tf.float32, [b],
+                                             'log_iw_policy_ph')
+      self.log_prob_replay_ph = tf.placeholder(tf.float32, [b],
+                                               'log_prob_replay_ph')
+      self.log_prob_policy_ph = tf.placeholder(tf.float32, [b],
+                                               'log_prob_policy_ph')
+      self.log_norm_replay_weights_ph = tf.placeholder(
+          tf.float32, [b], 'log_norm_replay_weights_ph')
+      self.iw_summary_op = tf.summary.merge([
+          tf.summary.histogram('is/log_iw_replay', self.log_iw_replay_ph),
+          tf.summary.histogram('is/log_iw_policy', self.log_iw_policy_ph),
+          tf.summary.histogram('is/log_prob_replay', self.log_prob_replay_ph),
+          tf.summary.histogram('is/log_prob_policy', self.log_prob_policy_ph),
+          tf.summary.histogram(
+              'is/log_norm_replay_weights', self.log_norm_replay_weights_ph),
+      ])
+  def make_summary_ops(self):
+    """Construct summary ops for the model."""
+    # size = number of timesteps across entire batch. Number normalized by size
+    # will not be affected by the amount of padding at the ends of sequences
+    # in the batch.
+    size = tf.cast(
+        tf.reduce_sum(self.given_batch.episode_lengths), dtype=self.dtype)
+    offp_size = tf.cast(tf.reduce_sum(self.off_policy_target_lengths),
+                        dtype=self.dtype)
+    scope_prefix = self.parent_scope_name
+    def _remove_prefix(prefix, name):
+      assert name.startswith(prefix)
+      return name[len(prefix):]
+    # RL summaries.
+    self.rl_summary_op = tf.summary.merge(
+        [tf.summary.scalar('model/policy_loss', self.pi_loss / size),
+         tf.summary.scalar('model/value_loss', self.vf_loss / size),
+         tf.summary.scalar('model/topk_loss', self.topk_loss / offp_size),
+         tf.summary.scalar('model/entropy', self.entropy / size),
+         tf.summary.scalar('model/loss', self.loss / size),
+         tf.summary.scalar('model/grad_norm',
+                           tf.global_norm(self.grads)),
+         tf.summary.scalar('model/unclipped_grad_norm', self.global_grad_norm),
+         tf.summary.scalar('model/non_embedding_var_norm',
+                           tf.global_norm(self.non_embedding_params)),
+         tf.summary.scalar('hparams/entropy_beta', self.entropy_hparam),
+         tf.summary.scalar('hparams/topk_loss_hparam', self.topk_loss_hparam),
+         tf.summary.scalar('hparams/learning_rate', self.learning_rate),
+         tf.summary.scalar('model/trainable_var_norm',
+                           tf.global_norm(self.trainable_variables)),
+         tf.summary.scalar('loss/loss', self.loss),
+         tf.summary.scalar('loss/entropy', self.entropy_loss_term),
+         tf.summary.scalar('loss/vf', self.vf_loss_term),
+         tf.summary.scalar('loss/policy', self.pi_loss_term),
+         tf.summary.scalar('loss/offp', self.topk_loss_term)] +
+        [tf.summary.scalar(
+            'param_norms/' + _remove_prefix(scope_prefix + '/', p.name),
+            tf.norm(p))
+         for p in self.params] +
+        [tf.summary.scalar(
+            'grad_norms/' + _remove_prefix(scope_prefix + '/', p.name),
+            tf.norm(g))
+         for p, g in zip(self.params, self.grads)] +
+        [tf.summary.scalar(
+            'unclipped_grad_norms/' + _remove_prefix(scope_prefix + '/',
+                                                     p.name),
+            tf.norm(g))
+         for p, g in zip(self.params, self.dense_unclipped_grads)])
+    self.text_summary_placeholder = tf.placeholder(tf.string, shape=[])
+    self.rl_text_summary_op = tf.summary.text('rl',
+                                              self.text_summary_placeholder)
+  def _rl_text_summary(self, session, step, npe, tot_r, num_steps,
+                       input_case, code_output, code, reason):
+    """Logs summary about a single episode and creates a text_summary for TB.
+    Args:
+      session: tf.Session instance.
+      step: Global training step.
+      npe: Number of programs executed so far.
+      tot_r: Total reward.
+      num_steps: Number of timesteps in the episode (i.e. code length).
+      input_case: Inputs for test cases.
+      code_output: Outputs produced by running the code on the inputs.
+      code: String representation of the code.
+      reason: Reason for the reward assigned by the task.
+    Returns:
+      Serialized text summary data for tensorboard.
+    """
+    if not input_case:
+      input_case = ' '
+    if not code_output:
+      code_output = ' '
+    if not code:
+      code = ' '
+    text = (
+        'Tot R: **%.2f**;  Len: **%d**;  Reason: **%s**\n\n'
+        'Input: **`%s`**; Output: **`%s`**\n\nCode: **`%s`**'
+        % (tot_r, num_steps, reason, input_case, code_output, code))
+    text_summary = session.run(self.rl_text_summary_op,
+                               {self.text_summary_placeholder: text})
+    logging.info(
+        'Step %d.\t NPE: %d\t Reason: %s.\t Tot R: %.2f.\t Length: %d. '
+        '\tInput: %s \tOutput: %s \tProgram: %s',
+        step, npe, reason, tot_r, num_steps, input_case,
+        code_output, code)
+    return text_summary
+  def _rl_reward_summary(self, total_rewards):
+    """Create summary ops that report on episode rewards.
+    Creates summaries for average, median, max, and min rewards in the batch.
+    Args:
+      total_rewards: Tensor of shape [batch_size] containing the total reward
+          from each episode in the batch.
+    Returns:
+      tf.Summary op.
+    """
+    tr = np.asarray(total_rewards)
+    reward_summary = tf.Summary(value=[
+        tf.Summary.Value(
+            tag='reward/avg',
+            simple_value=np.mean(tr)),
+        tf.Summary.Value(
+            tag='reward/med',
+            simple_value=np.median(tr)),
+        tf.Summary.Value(
+            tag='reward/max',
+            simple_value=np.max(tr)),
+        tf.Summary.Value(
+            tag='reward/min',
+            simple_value=np.min(tr))])
+    return reward_summary
+  def _iw_summary(self, session, replay_iw, replay_log_probs,
+                  norm_replay_weights, on_policy_iw,
+                  on_policy_log_probs):
+    """Compute summaries for importance weights at a given batch.
+    Args:
+      session: tf.Session instance.
+      replay_iw: Importance weights for episodes from replay buffer.
+      replay_log_probs: Total log probabilities of the replay episodes under the
+          current policy.
+      norm_replay_weights: Normalized replay weights, i.e. values in `replay_iw`
+          divided by the total weight in the entire replay buffer. Note, this is
+          also the probability of selecting each episode from the replay buffer
+          (in a roulette wheel replay buffer).
+      on_policy_iw: Importance weights for episodes sampled from the current
+          policy.
+      on_policy_log_probs: Total log probabilities of the on-policy episodes
+          under the current policy.
+    Returns:
+      Serialized TF summaries. Use a summary writer to write these summaries to
+      disk.
+    """
+    return session.run(
+        self.iw_summary_op,
+        {self.log_iw_replay_ph: np.log(replay_iw),
+         self.log_iw_policy_ph: np.log(on_policy_iw),
+         self.log_norm_replay_weights_ph: np.log(norm_replay_weights),
+         self.log_prob_replay_ph: replay_log_probs,
+         self.log_prob_policy_ph: on_policy_log_probs})
+  def _compute_iw(self, policy_log_probs, replay_weights):
+    """Compute importance weights for a batch of episodes.
+    Arguments are iterables of length batch_size.
+    Args:
+      policy_log_probs: Log probability of each episode under the current
+          policy.
+      replay_weights: Weight of each episode in the replay buffer. 0 for
+          episodes not sampled from the replay buffer (i.e. sampled from the
+          policy).
+    Returns:
+      Numpy array of shape [batch_size] containing the importance weight for
+      each episode in the batch.
+    """
+    log_total_replay_weight = log(self.experience_replay.total_weight)
+    # importance weight
+    # = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
+    # = 1 / ((1-a) + a*q/p)
+    a = float(self.replay_alpha)
+    a_com = 1.0 - a  # compliment of a
+    importance_weights = np.asarray(
+        [1.0 / (a_com
+                + a * exp((log(replay_weight) - log_total_replay_weight)
+                          - log_p))
+         if replay_weight > 0 else 1.0 / a_com
+         for log_p, replay_weight
+         in zip(policy_log_probs, replay_weights)])
+    return importance_weights
+  def update_step(self, session, rl_batch, train_op, global_step_op,
+                  return_gradients=False):
+    """Perform gradient update on the model.
+    Args:
+      session: tf.Session instance.
+      rl_batch: RLBatch instance from data.py. Use DataManager to create a
+          RLBatch for each call to update_step. RLBatch contains a batch of
+          tasks.
+      train_op: A TF op which will perform the gradient update. LMAgent does not
+          own its training op, so that trainers can do distributed training
+          and construct a specialized training op.
+      global_step_op: A TF op which will return the current global step when
+          run (should not increment it).
+      return_gradients: If True, the gradients will be saved and returned from
+          this method call. This is useful for testing.
+    Returns:
+      Results from the update step in a UpdateStepResult namedtuple, including
+      global step, global NPE, serialized summaries, and optionally gradients.
+    """
+    assert self.is_local
+    # Do update for REINFORCE or REINFORCE + replay buffer.
+    if self.experience_replay is None:
+      # Train with on-policy REINFORCE.
+      # Sample new programs from the policy.
+      num_programs_from_policy = rl_batch.batch_size
+      (batch_actions,
+       batch_values,
+       episode_lengths) = session.run(
+           [self.sampled_batch.tokens, self.sampled_batch.value,
+            self.sampled_batch.episode_lengths])
+      if episode_lengths.size == 0:
+        # This should not happen.
+        logging.warn(
+            'Shapes:\n'
+            'batch_actions.shape: %s\n'
+            'batch_values.shape: %s\n'
+            'episode_lengths.shape: %s\n',
+            batch_actions.shape, batch_values.shape, episode_lengths.shape)
+      # Compute rewards.
+      code_scores = compute_rewards(
+          rl_batch, batch_actions, episode_lengths)
+      code_strings = code_scores.code_strings
+      batch_tot_r = code_scores.total_rewards
+      test_cases = code_scores.test_cases
+      code_outputs = code_scores.code_outputs
+      reasons = code_scores.reasons
+      # Process on-policy samples.
+      batch_targets, batch_returns = process_episodes(
+          code_scores.batch_rewards, episode_lengths, a2c=self.a2c,
+          baselines=self.ema_by_len,
+          batch_values=batch_values)
+      batch_policy_multipliers = batch_targets
+      batch_emp_values = batch_returns if self.a2c else [[]]
+      adjusted_lengths = episode_lengths
+      if self.top_episodes:
+        assert len(self.top_episodes) > 0  # pylint: disable=g-explicit-length-test
+        off_policy_targets = [
+            item for item, _
+            in self.top_episodes.random_sample(self.topk_batch_size)]
+        off_policy_target_lengths = [len(t) for t in off_policy_targets]
+        off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
+                                             dtype=np.int32)
+        offp_switch = 1
+      else:
+        off_policy_targets = [[0]]
+        off_policy_target_lengths = [1]
+        offp_switch = 0
+      fetches = {
+          'global_step': global_step_op,
+          'program_count': self.program_count,
+          'summaries': self.rl_summary_op,
+          'train_op': train_op,
+          'gradients': self.gradients_dict if return_gradients else self.no_op}
+      fetched = session.run(
+          fetches,
+          {self.actions: batch_actions,
+           self.empirical_values: batch_emp_values,
+           self.policy_multipliers: batch_policy_multipliers,
+           self.adjusted_lengths: adjusted_lengths,
+           self.off_policy_targets: off_policy_targets,
+           self.off_policy_target_lengths: off_policy_target_lengths,
+           self.offp_switch: offp_switch})
+      combined_adjusted_lengths = adjusted_lengths
+      combined_returns = batch_returns
+    else:
+      # Train with REINFORCE + off-policy replay buffer by using importance
+      # sampling.
+      # Sample new programs from the policy.
+      # Note: batch size is constant. A full batch will be sampled, but not all
+      # programs will be executed and added to the replay buffer. Those which
+      # are not executed will be discarded and not counted.
+      batch_actions, batch_values, episode_lengths, log_probs = session.run(
+          [self.sampled_batch.tokens, self.sampled_batch.value,
+           self.sampled_batch.episode_lengths, self.sampled_batch.log_probs])
+      if episode_lengths.size == 0:
+        # This should not happen.
+        logging.warn(
+            'Shapes:\n'
+            'batch_actions.shape: %s\n'
+            'batch_values.shape: %s\n'
+            'episode_lengths.shape: %s\n',
+            batch_actions.shape, batch_values.shape, episode_lengths.shape)
+      # Sample from experince replay buffer
+      empty_replay_buffer = (
+          self.experience_replay.is_empty()
+          if self.experience_replay is not None else True)
+      num_programs_from_replay_buff = (
+          self.num_replay_per_batch if not empty_replay_buffer else 0)
+      num_programs_from_policy = (
+          rl_batch.batch_size - num_programs_from_replay_buff)
+      if (not empty_replay_buffer) and num_programs_from_replay_buff:
+        result = self.experience_replay.sample_many(
+            num_programs_from_replay_buff)
+        experience_samples, replay_weights = zip(*result)
+        (replay_actions,
+         replay_rewards,
+         _,  # log probs
+         replay_adjusted_lengths) = zip(*experience_samples)
+        replay_batch_actions = utils.stack_pad(replay_actions, pad_axes=0,
+                                               dtype=np.int32)
+        # compute log probs for replay samples under current policy
+        all_replay_log_probs, = session.run(
+            [self.given_batch.log_probs],
+            {self.actions: replay_batch_actions,
+             self.adjusted_lengths: replay_adjusted_lengths})
+        replay_log_probs = [
+            np.choose(replay_actions[i], all_replay_log_probs[i, :l].T).sum()
+            for i, l in enumerate(replay_adjusted_lengths)]
+      else:
+        # Replay buffer is empty. Do not sample from it.
+        replay_actions = None
+        replay_policy_multipliers = None
+        replay_adjusted_lengths = None
+        replay_log_probs = None
+        replay_weights = None
+        replay_returns = None
+        on_policy_weights = [0] * num_programs_from_replay_buff
+      assert not self.a2c  # TODO(danabo): Support A2C with importance sampling.
+      # Compute rewards.
+      code_scores = compute_rewards(
+          rl_batch, batch_actions, episode_lengths,
+          batch_size=num_programs_from_policy)
+      code_strings = code_scores.code_strings
+      batch_tot_r = code_scores.total_rewards
+      test_cases = code_scores.test_cases
+      code_outputs = code_scores.code_outputs
+      reasons = code_scores.reasons
+      # Process on-policy samples.
+      p = num_programs_from_policy
+      batch_targets, batch_returns = process_episodes(
+          code_scores.batch_rewards, episode_lengths[:p], a2c=False,
+          baselines=self.ema_by_len)
+      batch_policy_multipliers = batch_targets
+      batch_emp_values = [[]]
+      on_policy_returns = batch_returns
+      # Process off-policy samples.
+      if (not empty_replay_buffer) and num_programs_from_replay_buff:
+        offp_batch_rewards = [
+            [0.0] * (l - 1) + [r]
+            for l, r in zip(replay_adjusted_lengths, replay_rewards)]
+        assert len(offp_batch_rewards) == num_programs_from_replay_buff
+        assert len(replay_adjusted_lengths) == num_programs_from_replay_buff
+        replay_batch_targets, replay_returns = process_episodes(
+            offp_batch_rewards, replay_adjusted_lengths, a2c=False,
+            baselines=self.ema_by_len)
+        # Convert 2D array back into ragged 2D list.
+        replay_policy_multipliers = [
+            replay_batch_targets[i, :l]
+            for i, l
+            in enumerate(
+                replay_adjusted_lengths[:num_programs_from_replay_buff])]
+      adjusted_lengths = episode_lengths[:num_programs_from_policy]
+      if self.top_episodes:
+        assert len(self.top_episodes) > 0  # pylint: disable=g-explicit-length-test
+        off_policy_targets = [
+            item for item, _
+            in self.top_episodes.random_sample(self.topk_batch_size)]
+        off_policy_target_lengths = [len(t) for t in off_policy_targets]
+        off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
+                                             dtype=np.int32)
+        offp_switch = 1
+      else:
+        off_policy_targets = [[0]]
+        off_policy_target_lengths = [1]
+        offp_switch = 0
+      # On-policy episodes.
+      if num_programs_from_policy:
+        separate_actions = [
+            batch_actions[i, :l]
+            for i, l in enumerate(adjusted_lengths)]
+        chosen_log_probs = [
+            np.choose(separate_actions[i], log_probs[i, :l].T)
+            for i, l in enumerate(adjusted_lengths)]
+        new_experiences = [
+            (separate_actions[i],
+             batch_tot_r[i],
+             chosen_log_probs[i].sum(), l)
+            for i, l in enumerate(adjusted_lengths)]
+        on_policy_policy_multipliers = [
+            batch_policy_multipliers[i, :l]
+            for i, l in enumerate(adjusted_lengths)]
+        (on_policy_actions,
+         _,  # rewards
+         on_policy_log_probs,
+         on_policy_adjusted_lengths) = zip(*new_experiences)
+      else:
+        new_experiences = []
+        on_policy_policy_multipliers = []
+        on_policy_actions = []
+        on_policy_log_probs = []
+        on_policy_adjusted_lengths = []
+      if (not empty_replay_buffer) and num_programs_from_replay_buff:
+        # Look for new experiences in replay buffer. Assign weight if an episode
+        # is in the buffer.
+        on_policy_weights = [0] * num_programs_from_policy
+        for i, cs in enumerate(code_strings):
+          if self.experience_replay.has_key(cs):
+            on_policy_weights[i] = self.experience_replay.get_weight(cs)
+      # Randomly select on-policy or off policy episodes to train on.
+      combined_actions = join(replay_actions, on_policy_actions)
+      combined_policy_multipliers = join(
+          replay_policy_multipliers, on_policy_policy_multipliers)
+      combined_adjusted_lengths = join(
+          replay_adjusted_lengths, on_policy_adjusted_lengths)
+      combined_returns = join(replay_returns, on_policy_returns)
+      combined_actions = utils.stack_pad(combined_actions, pad_axes=0)
+      combined_policy_multipliers = utils.stack_pad(combined_policy_multipliers,
+                                                    pad_axes=0)
+      # P
+      combined_on_policy_log_probs = join(replay_log_probs, on_policy_log_probs)
+      # Q
+      # Assume weight is zero for all sequences sampled from the policy.
+      combined_q_weights = join(replay_weights, on_policy_weights)
+      # Importance adjustment. Naive formulation:
+      # E_{x~p}[f(x)] ~= 1/N sum_{x~p}(f(x)) ~= 1/N sum_{x~q}(f(x) * p(x)/q(x)).
+      # p(x) is the policy, and q(x) is the off-policy distribution, i.e. replay
+      # buffer distribution. Importance weight w(x) = p(x) / q(x).
+      # Instead of sampling from the replay buffer only, we sample from a
+      # mixture distribution of the policy and replay buffer.
+      # We are sampling from the mixture a*q(x) + (1-a)*p(x), where 0 <= a <= 1.
+      # Thus the importance weight w(x) = p(x) / (a*q(x) + (1-a)*p(x))
+      # = 1 / ((1-a) + a*q(x)/p(x)) where q(x) is 0 for x sampled from the
+      #                             policy.
+      # Note: a = self.replay_alpha
+      if empty_replay_buffer:
+        # The replay buffer is empty.
+        # Do no gradient update this step. The replay buffer will have stuff in
+        # it next time.
+        combined_policy_multipliers *= 0
+      elif not num_programs_from_replay_buff:
+        combined_policy_multipliers = np.ones([len(combined_actions), 1],
+                                              dtype=np.float32)
+      else:
+        # If a < 1 compute importance weights
+        # importance weight
+        # = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
+        # = 1 / ((1-a) + a*q/p)
+        importance_weights = self._compute_iw(combined_on_policy_log_probs,
+                                              combined_q_weights)
+        if self.config.iw_normalize:
+          importance_weights *= (
+              float(rl_batch.batch_size) / importance_weights.sum())
+        combined_policy_multipliers *= importance_weights.reshape(-1, 1)
+      # Train on replay batch, top-k MLE.
+      assert self.program_count is not None
+      fetches = {
+          'global_step': global_step_op,
+          'program_count': self.program_count,
+          'summaries': self.rl_summary_op,
+          'train_op': train_op,
+          'gradients': self.gradients_dict if return_gradients else self.no_op}
+      fetched = session.run(
+          fetches,
+          {self.actions: combined_actions,
+           self.empirical_values: [[]],  # replay_emp_values,
+           self.policy_multipliers: combined_policy_multipliers,
+           self.adjusted_lengths: combined_adjusted_lengths,
+           self.off_policy_targets: off_policy_targets,
+           self.off_policy_target_lengths: off_policy_target_lengths,
+           self.offp_switch: offp_switch})
+      # Add to experience replay buffer.
+      self.experience_replay.add_many(
+          objs=new_experiences,
+          weights=[exp(r / self.replay_temperature) for r in batch_tot_r],
+          keys=code_strings)
+    # Update program count.
+    session.run(
+        [self.program_count_add_op],
+        {self.program_count_add_ph: num_programs_from_policy})
+    # Update EMA baselines on the mini-batch which we just did traning on.
+    if not self.a2c:
+      for i in xrange(rl_batch.batch_size):
+        episode_length = combined_adjusted_lengths[i]
+        empirical_returns = combined_returns[i, :episode_length]
+        for j in xrange(episode_length):
+          # Update ema_baselines in place.
+          self.ema_by_len[j] = (
+              self.ema_baseline_decay * self.ema_by_len[j]
+              + (1 - self.ema_baseline_decay) * empirical_returns[j])
+    global_step = fetched['global_step']
+    global_npe = fetched['program_count']
+    core_summaries = fetched['summaries']
+    summaries_list = [core_summaries]
+    if num_programs_from_policy:
+      s_i = 0
+      text_summary = self._rl_text_summary(
+          session,
+          global_step,
+          global_npe,
+          batch_tot_r[s_i],
+          episode_lengths[s_i], test_cases[s_i],
+          code_outputs[s_i], code_strings[s_i], reasons[s_i])
+      reward_summary = self._rl_reward_summary(batch_tot_r)
+      is_best = False
+      if self.global_best_reward_fn:
+        # Save best reward.
+        best_reward = np.max(batch_tot_r)
+        is_best = self.global_best_reward_fn(session, best_reward)
+      if self.found_solution_op is not None and 'correct' in reasons:
+        session.run(self.found_solution_op)
+        # Save program to disk for record keeping.
+        if self.stop_on_success:
+          solutions = [
+              {'code': code_strings[i], 'reward': batch_tot_r[i],
+               'npe': global_npe}
+              for i in xrange(len(reasons)) if reasons[i] == 'correct']
+        elif is_best:
+          solutions = [
+              {'code': code_strings[np.argmax(batch_tot_r)],
+               'reward': np.max(batch_tot_r),
+               'npe': global_npe}]
+        else:
+          solutions = []
+        if solutions:
+          if self.assign_code_solution_fn:
+            self.assign_code_solution_fn(session, solutions[0]['code'])
+          with tf.gfile.FastGFile(self.logging_file, 'a') as writer:
+            for solution_dict in solutions:
+              writer.write(str(solution_dict) + '\n')
+      max_i = np.argmax(batch_tot_r)
+      max_tot_r = batch_tot_r[max_i]
+      if max_tot_r >= self.top_reward:
+        if max_tot_r >= self.top_reward:
+          self.top_reward = max_tot_r
+        logging.info('Top code: r=%.2f, \t%s', max_tot_r, code_strings[max_i])
+      if self.top_episodes is not None:
+        self.top_episodes.push(
+            max_tot_r, tuple(batch_actions[max_i, :episode_lengths[max_i]]))
+      summaries_list += [text_summary, reward_summary]
+      if self.do_iw_summaries and not empty_replay_buffer:
+        # prob of replay samples under replay buffer sampling.
+        norm_replay_weights = [
+            w / self.experience_replay.total_weight
+            for w in replay_weights]
+        replay_iw = self._compute_iw(replay_log_probs, replay_weights)
+        on_policy_iw = self._compute_iw(on_policy_log_probs, on_policy_weights)
+        summaries_list.append(
+            self._iw_summary(
+                session, replay_iw, replay_log_probs, norm_replay_weights,
+                on_policy_iw, on_policy_log_probs))
+    return UpdateStepResult(
+        global_step=global_step,
+        global_npe=global_npe,
+        summaries_list=summaries_list,
+        gradients_dict=fetched['gradients'])
+def io_to_text(io_case, io_type):
+  if isinstance(io_case, misc.IOTuple):
+    # If there are many strings, join them with ','.
+    return ','.join([io_to_text(e, io_type) for e in io_case])
+  if io_type == misc.IOType.string:
+    # There is one string. Return it.
+    return misc.tokens_to_text(io_case)
+  if (io_type == misc.IOType.integer
+      or io_type == misc.IOType.boolean):
+    if len(io_case) == 1:
+      return str(io_case[0])
+    return str(io_case)
+CodeScoreInfo = namedtuple(
+    'CodeScoreInfo',
+    ['code_strings', 'batch_rewards', 'total_rewards', 'test_cases',
+     'code_outputs', 'reasons'])
+def compute_rewards(rl_batch, batch_actions, episode_lengths, batch_size=None):
+  """Compute rewards for each episode in the batch.
+  Args:
+    rl_batch: A data.RLBatch instance. This holds information about the task
+        each episode is solving, and a reward function for each episode.
+    batch_actions: Contains batch of episodes. Each sequence of actions will be
+        converted into a BF program and then scored. A numpy array of shape
+        [batch_size, max_sequence_length].
+    episode_lengths: The sequence length of each episode in the batch. Iterable
+        of length batch_size.
+    batch_size: (optional) number of programs to score. Use this to limit the
+        number of programs executed from this batch. For example, when doing
+        importance sampling some of the on-policy episodes will be discarded
+        and they should not be executed. `batch_size` can be less than or equal
+        to the size of the input batch.
+  Returns:
+    CodeScoreInfo namedtuple instance. This holds not just the computed rewards,
+    but additional information computed during code execution which can be used
+    for debugging and monitoring. this includes: BF code strings, test cases
+    the code was executed on, code outputs from those test cases, and reasons
+    for success or failure.
+  """
+  code_strings = [
+      ''.join([misc.bf_int2char(a) for a in action_sequence[:l]])
+      for action_sequence, l in zip(batch_actions, episode_lengths)]
+  if batch_size is None:
+    batch_size = len(code_strings)
+  else:
+    assert batch_size <= len(code_strings)
+    code_strings = code_strings[:batch_size]
+  if isinstance(rl_batch.reward_fns, (list, tuple)):
+    # reward_fns is a list of functions, same length as code_strings.
+    assert len(rl_batch.reward_fns) >= batch_size
+    r_fn_results = [
+        rl_batch.reward_fns[i](code_strings[i]) for i in xrange(batch_size)]
+  else:
+    # reward_fns is allowed to be one function which processes a batch of code
+    # strings. This is useful for efficiency and batch level computation.
+    r_fn_results = rl_batch.reward_fns(code_strings)
+  # Expecting that r_fn returns a list of rewards. Length of list equals
+  # length of the code string (including EOS char).
+  batch_rewards = [r.episode_rewards for r in r_fn_results]
+  total_rewards = [sum(b) for b in batch_rewards]
+  test_cases = [io_to_text(r.input_case, r.input_type) for r in r_fn_results]
+  code_outputs = [io_to_text(r.code_output, r.output_type)
+                  for r in r_fn_results]
+  reasons = [r.reason for r in r_fn_results]
+  return CodeScoreInfo(
+      code_strings=code_strings,
+      batch_rewards=batch_rewards,
+      total_rewards=total_rewards,
+      test_cases=test_cases,
+      code_outputs=code_outputs,
+      reasons=reasons)
+def process_episodes(
+    batch_rewards, episode_lengths, a2c=False, baselines=None,
+    batch_values=None):
+  """Compute REINFORCE targets.
+  REINFORCE here takes the form:
+  grad_t = grad[log(pi(a_t|c_t))*target_t]
+  where c_t is context: i.e. RNN state or environment state (or both).
+  Two types of targets are supported:
+  1) Advantage actor critic (a2c).
+  2) Vanilla REINFORCE with baseline.
+  Args:
+    batch_rewards: Rewards received in each episode in the batch. A numpy array
+        of shape [batch_size, max_sequence_length]. Note, these are per-timestep
+        rewards, not total reward.
+    episode_lengths: Length of each episode. An iterable of length batch_size.
+    a2c: A bool. Whether to compute a2c targets (True) or vanilla targets
+        (False).
+    baselines: If a2c is False, provide baselines for each timestep. This is a
+        list (or indexable container) of length max_time. Note: baselines are
+        shared across all episodes, which is why there is no batch dimension.
+        It is up to the caller to update baselines accordingly.
+    batch_values: If a2c is True, provide values computed by a value estimator.
+        A numpy array of shape [batch_size, max_sequence_length].
+  Returns:
+    batch_targets: REINFORCE targets for each episode and timestep. A numpy
+        array of shape [batch_size, max_sequence_length].
+    batch_returns: Returns computed for each episode and timestep. This is for
+        reference, and is not used in the REINFORCE gradient update (but was
+        used to compute the targets). A numpy array of shape
+        [batch_size, max_sequence_length].
+  """
+  num_programs = len(batch_rewards)
+  assert num_programs <= len(episode_lengths)
+  batch_returns = [None] * num_programs
+  batch_targets = [None] * num_programs
+  for i in xrange(num_programs):
+    episode_length = episode_lengths[i]
+    assert len(batch_rewards[i]) == episode_length
+    # Compute target for each timestep.
+    # If we are computing A2C:
+    #    target_t = advantage_t = R_t - V(c_t)
+    #    where V(c_t) is a learned value function (provided as `values`).
+    # Otherwise:
+    #    target_t = R_t - baselines[t]
+    #    where `baselines` are provided.
+    # In practice we use a more generalized formulation of advantage. See docs
+    # for `discounted_advantage_and_rewards`.
+    if a2c:
+      # Compute advantage.
+      assert batch_values is not None
+      episode_values = batch_values[i, :episode_length]
+      episode_rewards = batch_rewards[i]
+      emp_val, gen_adv = rollout_lib.discounted_advantage_and_rewards(
+          episode_rewards, episode_values, gamma=1.0, lambda_=1.0)
+      batch_returns[i] = emp_val
+      batch_targets[i] = gen_adv
+    else:
+      # Compute return for each timestep. See section 3 of
+      # https://arxiv.org/pdf/1602.01783.pdf
+      assert baselines is not None
+      empirical_returns = rollout_lib.discount(batch_rewards[i], gamma=1.0)
+      targets = [None] * episode_length
+      for j in xrange(episode_length):
+        targets[j] = empirical_returns[j] - baselines[j]
+      batch_returns[i] = empirical_returns
+      batch_targets[i] = targets
+  batch_returns = utils.stack_pad(batch_returns, 0)
+  if num_programs:
+    batch_targets = utils.stack_pad(batch_targets, 0)
+  else:
+    batch_targets = np.array([], dtype=np.float32)
+  return (batch_targets, batch_returns)
--- a/research/brain_coder/single_task/pg_agent_test.py
+++ b/research/brain_coder/single_task/pg_agent_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for pg_agent."""
+from collections import Counter
+from absl import logging
+import numpy as np
+import tensorflow as tf
+from common import utils  # brain coder
+from single_task import data  # brain coder
+from single_task import defaults  # brain coder
+from single_task import misc  # brain coder
+from single_task import pg_agent as agent_lib  # brain coder
+from single_task import pg_train  # brain coder
+# Symmetric mean absolute percentage error (SMAPE).
+# https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error
+def smape(a, b):
+  return 2.0 * abs(a - b) / float(a + b)
+def onehot(dim, num_dims):
+  value = np.zeros(num_dims, dtype=np.float32)
+  value[dim] = 1
+  return value
+def random_sequence(max_length, num_tokens, eos=0):
+  length = np.random.randint(1, max_length - 1)
+  return np.append(np.random.randint(1, num_tokens, length), eos)
+def repeat_and_pad(v, rep, total_len):
+  return [v] * rep + [0.0] * (total_len - rep)
+class AgentTest(tf.test.TestCase):
+  def testProcessEpisodes(self):
+    batch_size = 3
+    def reward_fn(code_string):
+      return misc.RewardInfo(
+          episode_rewards=[float(ord(c)) for c in code_string],
+          input_case=[],
+          correct_output=[],
+          code_output=[],
+          input_type=misc.IOType.integer,
+          output_type=misc.IOType.integer,
+          reason='none')
+    rl_batch = data.RLBatch(
+        reward_fns=[reward_fn for _ in range(batch_size)],
+        batch_size=batch_size,
+        good_reward=10.0)
+    batch_actions = np.asarray([
+        [4, 5, 3, 6, 8, 1, 0, 0],
+        [1, 2, 3, 4, 0, 0, 0, 0],
+        [8, 7, 6, 5, 4, 3, 2, 1]], dtype=np.int32)
+    batch_values = np.asarray([
+        [0, 1, 2, 1, 0, 1, 1, 0],
+        [0, 2, 1, 2, 1, 0, 0, 0],
+        [0, 1, 1, 0, 0, 0, 1, 1]], dtype=np.float32)
+    episode_lengths = np.asarray([7, 5, 8], dtype=np.int32)
+    scores = agent_lib.compute_rewards(
+        rl_batch, batch_actions, episode_lengths)
+    batch_targets, batch_returns = agent_lib.process_episodes(
+        scores.batch_rewards, episode_lengths, a2c=True,
+        batch_values=batch_values)
+    self.assertEqual(
+        [[473.0, 428.0, 337.0, 294.0, 201.0, 157.0, 95.0, 0.0],
+         [305.0, 243.0, 183.0, 140.0, 95.0, 0.0, 0.0, 0.0],
+         [484.0, 440.0, 394.0, 301.0, 210.0, 165.0, 122.0, 62.0]],
+        batch_returns.tolist())
+    self.assertEqual(
+        [[473.0, 427.0, 335.0, 293.0, 201.0, 156.0, 94.0, 0.0],
+         [305.0, 241.0, 182.0, 138.0, 94.0, 0.0, 0.0, 0.0],
+         [484.0, 439.0, 393.0, 301.0, 210.0, 165.0, 121.0, 61.0]],
+        batch_targets.tolist())
+  def testVarUpdates(self):
+    """Tests that variables get updated as expected.
+    For the RL update, check that gradients are non-zero and that the global
+    model gets updated.
+    """
+    config = defaults.default_config_with_updates(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",eos_token=True,optimizer="sgd",lr=1.0)')
+    lr = config.agent.lr
+    tf.reset_default_graph()
+    trainer = pg_train.AsyncTrainer(
+        config, task_id=0, ps_tasks=0, num_workers=1)
+    global_init_op = tf.variables_initializer(
+        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
+    with tf.Session() as sess:
+      sess.run(global_init_op)  # Initialize global copy.
+      trainer.initialize(sess)
+      model = trainer.model
+      global_vars = sess.run(trainer.global_model.trainable_variables)
+      local_vars = sess.run(model.trainable_variables)
+      # Make sure names match.
+      g_prefix = 'global/'
+      l_prefix = 'local/'
+      for g, l in zip(trainer.global_model.trainable_variables,
+                      model.trainable_variables):
+        self.assertEqual(g.name[len(g_prefix):], l.name[len(l_prefix):])
+      # Assert that shapes and values are the same between global and local
+      # models.
+      for g, l in zip(global_vars, local_vars):
+        self.assertEqual(g.shape, l.shape)
+        self.assertTrue(np.array_equal(g, l))
+      # Make all gradients dense tensors.
+      for param, grad in model.gradients_dict.items():
+        if isinstance(grad, tf.IndexedSlices):
+          # Converts to dense tensor.
+          model.gradients_dict[param] = tf.multiply(grad, 1.0)
+      # Perform update.
+      results = model.update_step(
+          sess, trainer.data_manager.sample_rl_batch(), trainer.train_op,
+          trainer.global_step, return_gradients=True)
+      grads_dict = results.gradients_dict
+      for grad in grads_dict.values():
+        self.assertIsNotNone(grad)
+        self.assertTrue(np.count_nonzero(grad) > 0)
+      global_update = sess.run(trainer.global_model.trainable_variables)
+      for tf_var, var_before, var_after in zip(
+          model.trainable_variables, local_vars, global_update):
+        # Check that the params were updated.
+        self.assertTrue(np.allclose(
+            var_after,
+            var_before - grads_dict[tf_var] * lr))
+      # Test that global to local sync works.
+      sess.run(trainer.sync_op)
+      global_vars = sess.run(trainer.global_model.trainable_variables)
+      local_vars = sess.run(model.trainable_variables)
+      for l, g in zip(local_vars, global_vars):
+        self.assertTrue(np.allclose(l, g))
+  def testMonteCarloGradients(self):
+    """Test Monte Carlo estimate of REINFORCE gradient.
+    Test that the Monte Carlo estimate of the REINFORCE gradient is
+    approximately equal to the true gradient. We compute the true gradient for a
+    toy environment with a very small action space.
+    Similar to section 5 of https://arxiv.org/pdf/1505.00521.pdf.
+    """
+    # Test may have different outcome on different machines due to different
+    # rounding behavior of float arithmetic.
+    tf.reset_default_graph()
+    tf.set_random_seed(12345678987654321)
+    np.random.seed(1294024302)
+    max_length = 2
+    num_tokens = misc.bf_num_tokens()
+    eos = misc.BF_EOS_INT
+    assert eos == 0
+    def sequence_iterator(max_length):
+      """Iterates through all sequences up to the given length."""
+      yield [eos]
+      for a in xrange(1, num_tokens):
+        if max_length > 1:
+          for sub_seq in sequence_iterator(max_length - 1):
+            yield [a] + sub_seq
+        else:
+          yield [a]
+    actions = list(sequence_iterator(max_length))
+    # This batch contains all possible episodes up to max_length.
+    actions_batch = utils.stack_pad(actions, 0)
+    lengths_batch = [len(s) for s in actions]
+    reward_map = {tuple(a): np.random.randint(-1, 7) for a in actions_batch}
+    # reward_map = {tuple(a): np.random.normal(3, 1)
+    #               for a in actions_batch}  # normal distribution
+    # reward_map = {tuple(a): 1.0
+    #               for a in actions_batch}  # expected reward is 1
+    n = 100000  # MC sample size.
+    config = defaults.default_config_with_updates(
+        'env=c(task="print"),'
+        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
+        'entropy_beta=0.0,topk_loss_hparam=0.0,regularizer=0.0,'
+        'policy_lstm_sizes=[10],eos_token=True),'
+        'batch_size='+str(n)+',timestep_limit='+str(max_length))
+    dtype = tf.float64
+    trainer = pg_train.AsyncTrainer(
+        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
+    model = trainer.model
+    actions_ph = model.actions
+    lengths_ph = model.adjusted_lengths
+    multipliers_ph = model.policy_multipliers
+    global_init_op = tf.variables_initializer(
+        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
+    with tf.Session() as sess, sess.graph.as_default():
+      sess.run(global_init_op)  # Initialize global copy.
+      trainer.initialize(sess)
+      # Compute exact gradients.
+      # exact_grads = sum(P(a) * grad(log P(a)) * R(a) for a in actions_batch)
+      true_loss_unnormalized = 0.0
+      exact_grads = [np.zeros(v.shape) for v in model.trainable_variables]
+      episode_probs_map = {}
+      grads_map = {}
+      for a_idx in xrange(len(actions_batch)):
+        a = actions_batch[a_idx]
+        grads_result, probs_result, loss = sess.run(
+            [model.dense_unclipped_grads, model.chosen_probs, model.loss],
+            {actions_ph: [a],
+             lengths_ph: [lengths_batch[a_idx]],
+             multipliers_ph: [
+                 repeat_and_pad(reward_map[tuple(a)],
+                                lengths_batch[a_idx],
+                                max_length)]})
+        # Take product over time axis.
+        episode_probs_result = np.prod(probs_result[0, :lengths_batch[a_idx]])
+        for i in range(0, len(exact_grads)):
+          exact_grads[i] += grads_result[i] * episode_probs_result
+        episode_probs_map[tuple(a)] = episode_probs_result
+        reward_map[tuple(a)] = reward_map[tuple(a)]
+        grads_map[tuple(a)] = grads_result
+        true_loss_unnormalized += loss
+      # Normalize loss. Since each episode is feed into the model one at a time,
+      # normalization needs to be done manually.
+      true_loss = true_loss_unnormalized / float(len(actions_batch))
+      # Compute Monte Carlo gradients.
+      # E_a~P[grad(log P(a)) R(a)] is aprox. eq. to
+      # sum(grad(log P(a)) R(a) for a in actions_sampled_from_P) / n
+      # where len(actions_sampled_from_P) == n.
+      #
+      # In other words, sample from the policy and compute the gradients of the
+      # log probs weighted by the returns. This will excersize the code in
+      # agent.py
+      sampled_actions, sampled_lengths = sess.run(
+          [model.sampled_tokens, model.episode_lengths])
+      pi_multipliers = [
+          repeat_and_pad(reward_map[tuple(a)], l, max_length)
+          for a, l in zip(sampled_actions, sampled_lengths)]
+      mc_grads_unnormalized, sampled_probs, mc_loss_unnormalized = sess.run(
+          [model.dense_unclipped_grads, model.chosen_probs, model.loss],
+          {actions_ph: sampled_actions,
+           multipliers_ph: pi_multipliers,
+           lengths_ph: sampled_lengths})
+      # Loss is already normalized across the minibatch, so no normalization
+      # is needed.
+      mc_grads = mc_grads_unnormalized
+      mc_loss = mc_loss_unnormalized
+    # Make sure true loss and MC loss are similar.
+    loss_error = smape(true_loss, mc_loss)
+    self.assertTrue(loss_error < 0.15, msg='actual: %s' % loss_error)
+    # Check that probs computed for episodes sampled from the model are the same
+    # as the recorded true probs.
+    for i in range(100):
+      acs = tuple(sampled_actions[i].tolist())
+      sampled_prob = np.prod(sampled_probs[i, :sampled_lengths[i]])
+      self.assertTrue(np.isclose(episode_probs_map[acs], sampled_prob))
+    # Make sure MC estimates of true probs are close.
+    counter = Counter(tuple(e) for e in sampled_actions)
+    for acs, count in counter.iteritems():
+      mc_prob = count / float(len(sampled_actions))
+      true_prob = episode_probs_map[acs]
+      error = smape(mc_prob, true_prob)
+      self.assertTrue(
+          error < 0.15,
+          msg='actual: %s; count: %s; mc_prob: %s; true_prob: %s'
+          % (error, count, mc_prob, true_prob))
+    # Manually recompute MC gradients and make sure they match MC gradients
+    # computed in TF.
+    mc_grads_recompute = [np.zeros(v.shape) for v in model.trainable_variables]
+    for i in range(n):
+      acs = tuple(sampled_actions[i].tolist())
+      for i in range(0, len(mc_grads_recompute)):
+        mc_grads_recompute[i] += grads_map[acs][i]
+    for i in range(0, len(mc_grads_recompute)):
+      self.assertTrue(np.allclose(mc_grads[i], mc_grads_recompute[i] / n))
+    # Check angle between gradients as fraction of pi.
+    for index in range(len(mc_grads)):
+      v1 = mc_grads[index].reshape(-1)
+      v2 = exact_grads[index].reshape(-1)
+      # angle = arccos(v1 . v2 / (|v1|*|v2|))
+      angle_rad = np.arccos(
+          np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
+      logging.info('angle / pi: %s', angle_rad / np.pi)
+      angle_frac = angle_rad / np.pi
+      self.assertTrue(angle_frac < 0.02, msg='actual: %s' % angle_frac)
+    # Check norms.
+    for index in range(len(mc_grads)):
+      v1_norm = np.linalg.norm(mc_grads[index].reshape(-1))
+      v2_norm = np.linalg.norm(exact_grads[index].reshape(-1))
+      error = smape(v1_norm, v2_norm)
+      self.assertTrue(error < 0.02, msg='actual: %s' % error)
+    # Check expected rewards.
+    # E_a~P[R(a)] approx eq sum(P(a) * R(a) for a in actions)
+    mc_expected_reward = np.mean(
+        [reward_map[tuple(a)] for a in sampled_actions])
+    exact_expected_reward = np.sum(
+        [episode_probs_map[k] * reward_map[k] for k in reward_map])
+    error = smape(mc_expected_reward, exact_expected_reward)
+    self.assertTrue(error < 0.005, msg='actual: %s' % angle_frac)
+  def testNumericalGradChecking(self):
+    # Similar to
+    # http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization.
+    epsilon = 1e-4
+    eos = misc.BF_EOS_INT
+    self.assertEqual(0, eos)
+    config = defaults.default_config_with_updates(
+        'env=c(task="print"),'
+        'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
+        'entropy_beta=0.0,topk_loss_hparam=0.0,policy_lstm_sizes=[10],'
+        'eos_token=True),'
+        'batch_size=64')
+    dtype = tf.float64
+    tf.reset_default_graph()
+    tf.set_random_seed(12345678987654321)
+    np.random.seed(1294024302)
+    trainer = pg_train.AsyncTrainer(
+        config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
+    model = trainer.model
+    actions_ph = model.actions
+    lengths_ph = model.adjusted_lengths
+    multipliers_ph = model.policy_multipliers
+    loss = model.pi_loss
+    global_init_op = tf.variables_initializer(
+        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
+    assign_add_placeholders = [None] * len(model.trainable_variables)
+    assign_add_ops = [None] * len(model.trainable_variables)
+    param_shapes = [None] * len(model.trainable_variables)
+    for i, param in enumerate(model.trainable_variables):
+      param_shapes[i] = param.get_shape().as_list()
+      assign_add_placeholders[i] = tf.placeholder(dtype,
+                                                  np.prod(param_shapes[i]))
+      assign_add_ops[i] = param.assign_add(
+          tf.reshape(assign_add_placeholders[i], param_shapes[i]))
+    with tf.Session() as sess:
+      sess.run(global_init_op)  # Initialize global copy.
+      trainer.initialize(sess)
+      actions_raw = [random_sequence(10, 9) for _ in xrange(16)]
+      actions_batch = utils.stack_pad(actions_raw, 0)
+      lengths_batch = [len(l) for l in actions_raw]
+      feed = {actions_ph: actions_batch,
+              multipliers_ph: np.ones_like(actions_batch),
+              lengths_ph: lengths_batch}
+      estimated_grads = [None] * len(model.trainable_variables)
+      for i, param in enumerate(model.trainable_variables):
+        param_size = np.prod(param_shapes[i])
+        estimated_grads[i] = np.zeros(param_size, dtype=np.float64)
+        for index in xrange(param_size):
+          e = onehot(index, param_size) * epsilon
+          sess.run(assign_add_ops[i],
+                   {assign_add_placeholders[i]: e})
+          j_plus = sess.run(loss, feed)
+          sess.run(assign_add_ops[i],
+                   {assign_add_placeholders[i]: -2 * e})
+          j_minus = sess.run(loss, feed)
+          sess.run(assign_add_ops[i],
+                   {assign_add_placeholders[i]: e})
+          estimated_grads[i][index] = (j_plus - j_minus) / (2 * epsilon)
+        estimated_grads[i] = estimated_grads[i].reshape(param_shapes[i])
+      analytic_grads = sess.run(model.dense_unclipped_grads, feed)
+      for g1, g2 in zip(estimated_grads[1:], analytic_grads[1:]):
+        logging.info('norm (g1-g2): %s', np.abs(g1 - g2).mean())
+        self.assertTrue(np.allclose(g1, g2))
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/pg_train.py
+++ b/research/brain_coder/single_task/pg_train.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+r"""Train RL agent on coding tasks."""
+import contextlib
+import cPickle
+import cProfile
+import marshal
+import os
+import time
+from absl import flags
+from absl import logging
+import tensorflow as tf
+# internal session lib import
+from single_task import data  # brain coder
+from single_task import defaults  # brain coder
+from single_task import pg_agent as agent_lib  # brain coder
+from single_task import results_lib  # brain coder
+FLAGS = flags.FLAGS
+flags.DEFINE_string(
+    'master', '',
+    'URL of the TensorFlow master to use.')
+flags.DEFINE_integer(
+    'ps_tasks', 0,
+    'Number of parameter server tasks. Only set to 0 for '
+    'single worker training.')
+flags.DEFINE_integer(
+    'summary_interval', 10,
+    'How often to write summaries.')
+flags.DEFINE_integer(
+    'summary_tasks', 16,
+    'If greater than 0 only tasks 0 through summary_tasks - 1 '
+    'will write summaries. If 0, all tasks will write '
+    'summaries.')
+flags.DEFINE_bool(
+    'stop_on_success', True,
+    'If True, training will stop as soon as a solution is found. '
+    'If False, training will continue indefinitely until another '
+    'stopping condition is reached.')
+flags.DEFINE_bool(
+    'do_profiling', False,
+    'If True, cProfile profiler will run and results will be '
+    'written to logdir. WARNING: Results will not be written if '
+    'the code crashes. Make sure it exists successfully.')
+flags.DEFINE_integer('model_v', 0, 'Model verbosity level.')
+flags.DEFINE_bool(
+    'delayed_graph_cleanup', True,
+    'If true, container for n-th run will not be reset until the (n+1)-th run '
+    'is complete. This greatly reduces the chance that a worker is still '
+    'using the n-th container when it is cleared.')
+def define_tuner_hparam_space(hparam_space_type):
+  """Define tunable hparams for grid search."""
+  if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
+    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
+  # Discrete hparam space is stored as a dict from hparam name to discrete
+  # values.
+  hparam_space = {}
+  if hparam_space_type in ('pg', 'pg-topk', 'is'):
+    # Add a floating point parameter named learning rate.
+    hparam_space['lr'] = [1e-5, 1e-4, 1e-3]
+    hparam_space['entropy_beta'] = [0.005, 0.01, 0.05, 0.10]
+  else:  # 'topk'
+    # Add a floating point parameter named learning rate.
+    hparam_space['lr'] = [1e-5, 1e-4, 1e-3]
+    hparam_space['entropy_beta'] = [0.0, 0.005, 0.01, 0.05, 0.10]
+  if hparam_space_type in ('topk', 'pg-topk'):
+    # topk tuning will be enabled.
+    hparam_space['topk'] = [10]
+    hparam_space['topk_loss_hparam'] = [1.0, 10.0, 50.0, 200.0]
+  elif hparam_space_type == 'is':
+    # importance sampling tuning will be enabled.
+    hparam_space['replay_temperature'] = [0.25, 0.5, 1.0, 2.0]
+    hparam_space['alpha'] = [0.5, 0.75, 63/64.]
+  return hparam_space
+def write_hparams_to_config(config, hparams, hparam_space_type):
+  """Write hparams given by the tuner into the Config object."""
+  if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
+    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
+  config.agent.lr = hparams.lr
+  config.agent.entropy_beta = hparams.entropy_beta
+  if hparam_space_type in ('topk', 'pg-topk'):
+    # topk tuning will be enabled.
+    config.agent.topk = hparams.topk
+    config.agent.topk_loss_hparam = hparams.topk_loss_hparam
+  elif hparam_space_type == 'is':
+    # importance sampling tuning will be enabled.
+    config.agent.replay_temperature = hparams.replay_temperature
+    config.agent.alpha = hparams.alpha
+def make_initialized_variable(value, name, shape=None, dtype=tf.float32):
+  """Create a tf.Variable with a constant initializer.
+  Args:
+    value: Constant value to initialize the variable with. This is the value
+        that the variable starts with.
+    name: Name of the variable in the TF graph.
+    shape: Shape of the variable. If None, variable will be a scalar.
+    dtype: Data type of the variable. Should be a TF dtype. Defaults to
+        tf.float32.
+  Returns:
+    tf.Variable instance.
+  """
+  if shape is None:
+    shape = []
+  return tf.get_variable(
+      name=name, shape=shape, initializer=tf.constant_initializer(value),
+      dtype=dtype, trainable=False)
+class AsyncTrainer(object):
+  """Manages graph creation and training.
+  This async trainer creates a global model on the parameter server, and a local
+  model (for this worker). Gradient updates are sent to the global model, and
+  the updated weights are synced to the local copy.
+  """
+  def __init__(self, config, task_id, ps_tasks, num_workers, is_chief=True,
+               summary_writer=None,
+               dtype=tf.float32,
+               summary_interval=1,
+               run_number=0,
+               logging_dir='/tmp', model_v=0):
+    self.config = config
+    self.data_manager = data.DataManager(
+        config, run_number=run_number,
+        do_code_simplification=not FLAGS.stop_on_success)
+    self.task_id = task_id
+    self.ps_tasks = ps_tasks
+    self.is_chief = is_chief
+    if ps_tasks == 0:
+      assert task_id == 0, 'No parameter servers specified. Expecting 1 task.'
+      assert num_workers == 1, (
+          'No parameter servers specified. Expecting 1 task.')
+      worker_device = '/job:localhost/replica:%d/task:0/cpu:0' % task_id
+      # worker_device = '/cpu:0'
+      # ps_device = '/cpu:0'
+    else:
+      assert num_workers > 0, 'There must be at least 1 training worker.'
+      worker_device = '/job:worker/replica:%d/task:0/cpu:0' % task_id
+      # ps_device = '/job:ps/replica:0/task:0/cpu:0'
+    logging.info('worker_device: %s', worker_device)
+    logging_file = os.path.join(
+        logging_dir, 'solutions_%d.txt' % task_id)
+    experience_replay_file = os.path.join(
+        logging_dir, 'replay_buffer_%d.pickle' % task_id)
+    self.topk_file = os.path.join(
+        logging_dir, 'topk_buffer_%d.pickle' % task_id)
+    tf.get_variable_scope().set_use_resource(True)
+    # global model
+    with tf.device(tf.train.replica_device_setter(ps_tasks,
+                                                  ps_device='/job:ps/replica:0',
+                                                  worker_device=worker_device)):
+      with tf.variable_scope('global'):
+        global_model = agent_lib.LMAgent(config, dtype=dtype, is_local=False)
+        global_params_dict = {p.name: p
+                              for p in global_model.sync_variables}
+        self.global_model = global_model
+        self.global_step = make_initialized_variable(
+            0, 'global_step', dtype=tf.int64)
+        self.global_best_reward = make_initialized_variable(
+            -10.0, 'global_best_reward', dtype=tf.float64)
+        self.is_best_model = make_initialized_variable(
+            False, 'is_best_model', dtype=tf.bool)
+        self.reset_is_best_model = self.is_best_model.assign(False)
+        self.global_best_reward_placeholder = tf.placeholder(
+            tf.float64, [], name='global_best_reward_placeholder')
+        self.assign_global_best_reward_op = tf.group(
+            self.global_best_reward.assign(
+                self.global_best_reward_placeholder),
+            self.is_best_model.assign(True))
+        def assign_global_best_reward_fn(session, reward):
+          reward = round(reward, 10)
+          best_reward = round(session.run(self.global_best_reward), 10)
+          is_best = reward > best_reward
+          if is_best:
+            session.run(self.assign_global_best_reward_op,
+                        {self.global_best_reward_placeholder: reward})
+          return is_best
+        self.assign_global_best_reward_fn = assign_global_best_reward_fn
+        # Any worker will set to true when it finds a solution.
+        self.found_solution_flag = make_initialized_variable(
+            False, 'found_solution_flag', dtype=tf.bool)
+        self.found_solution_op = self.found_solution_flag.assign(True)
+        self.run_number = make_initialized_variable(
+            run_number, 'run_number', dtype=tf.int32)
+        # Store a solution when found.
+        self.code_solution_variable = tf.get_variable(
+            'code_solution', [], tf.string,
+            initializer=tf.constant_initializer(''))
+        self.code_solution_ph = tf.placeholder(
+            tf.string, [], name='code_solution_ph')
+        self.code_solution_assign_op = self.code_solution_variable.assign(
+            self.code_solution_ph)
+        def assign_code_solution_fn(session, code_solution_string):
+          session.run(self.code_solution_assign_op,
+                      {self.code_solution_ph: code_solution_string})
+        self.assign_code_solution_fn = assign_code_solution_fn
+        # Count all programs sampled from policy. This does not include
+        # programs sampled from replay buffer.
+        # This equals NPE (number of programs executed). Only programs sampled
+        # from the policy need to be executed.
+        self.program_count = make_initialized_variable(
+            0, 'program_count', dtype=tf.int64)
+    # local model
+    with tf.device(worker_device):
+      with tf.variable_scope('local'):
+        self.model = model = agent_lib.LMAgent(
+            config,
+            task_id=task_id,
+            logging_file=logging_file,
+            experience_replay_file=experience_replay_file,
+            dtype=dtype,
+            global_best_reward_fn=self.assign_global_best_reward_fn,
+            found_solution_op=self.found_solution_op,
+            assign_code_solution_fn=self.assign_code_solution_fn,
+            program_count=self.program_count,
+            stop_on_success=FLAGS.stop_on_success,
+            verbose_level=model_v)
+        local_params = model.trainable_variables
+        local_params_dict = {p.name: p for p in local_params}
+    # Pull global params to local model.
+    def _global_to_local_scope(name):
+      assert name.startswith('global/')
+      return 'local' + name[6:]
+    sync_dict = {
+        local_params_dict[_global_to_local_scope(p_name)]: p
+        for p_name, p in global_params_dict.items()}
+    self.sync_op = tf.group(*[v_local.assign(v_global)
+                              for v_local, v_global
+                              in sync_dict.items()])
+    # Pair local gradients with global params.
+    grad_var_dict = {
+        gradient: sync_dict[local_var]
+        for local_var, gradient in model.gradients_dict.items()}
+    # local model
+    model.make_summary_ops()  # Don't put summaries under 'local' scope.
+    with tf.variable_scope('local'):
+      self.train_op = model.optimizer.apply_gradients(
+          grad_var_dict.items(), global_step=self.global_step)
+      self.local_init_op = tf.variables_initializer(
+          tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
+                            tf.get_variable_scope().name))
+    self.local_step = 0
+    self.last_summary_time = time.time()
+    self.summary_interval = summary_interval
+    self.summary_writer = summary_writer
+    self.cached_global_step = -1
+    self.cached_global_npe = -1
+    logging.info('summary_interval: %d', self.summary_interval)
+    # Load top-k buffer.
+    if self.model.top_episodes is not None and tf.gfile.Exists(self.topk_file):
+      try:
+        with tf.gfile.FastGFile(self.topk_file, 'r') as f:
+          self.model.top_episodes = cPickle.loads(f.read())
+        logging.info(
+            'Loaded top-k buffer from disk with %d items. Location: "%s"',
+            len(self.model.top_episodes), self.topk_file)
+      except (cPickle.UnpicklingError, EOFError) as e:
+        logging.warn(
+            'Failed to load existing top-k buffer from disk. Removing bad file.'
+            '\nLocation: "%s"\nException: %s', self.topk_file, str(e))
+        tf.gfile.Remove(self.topk_file)
+  def initialize(self, session):
+    """Run initialization ops."""
+    session.run(self.local_init_op)
+    session.run(self.sync_op)
+    self.cached_global_step, self.cached_global_npe = session.run(
+        [self.global_step, self.program_count])
+  def update_global_model(self, session):
+    """Run an update step.
+    1) Asynchronously copy global weights to local model.
+    2) Call into local model's update_step method, which does the following:
+        a) Sample batch of programs from policy.
+        b) Compute rewards.
+        c) Compute gradients and update the global model asynchronously.
+    3) Write tensorboard summaries to disk.
+    Args:
+      session: tf.Session instance.
+    """
+    session.run(self.sync_op)  # Copy weights from global to local.
+    with session.as_default():
+      result = self.model.update_step(
+          session, self.data_manager.sample_rl_batch(), self.train_op,
+          self.global_step)
+      global_step = result.global_step
+      global_npe = result.global_npe
+      summaries = result.summaries_list
+    self.cached_global_step = global_step
+    self.cached_global_npe = global_npe
+    self.local_step += 1
+    if self.summary_writer and self.local_step % self.summary_interval == 0:
+      if not isinstance(summaries, (tuple, list)):
+        summaries = [summaries]
+      summaries.append(self._local_step_summary())
+      if self.is_chief:
+        (global_best_reward,
+         found_solution_flag,
+         program_count) = session.run(
+             [self.global_best_reward,
+              self.found_solution_flag,
+              self.program_count])
+        summaries.append(
+            tf.Summary(
+                value=[tf.Summary.Value(
+                    tag='model/best_reward',
+                    simple_value=global_best_reward)]))
+        summaries.append(
+            tf.Summary(
+                value=[tf.Summary.Value(
+                    tag='model/solution_found',
+                    simple_value=int(found_solution_flag))]))
+        summaries.append(
+            tf.Summary(
+                value=[tf.Summary.Value(
+                    tag='model/program_count',
+                    simple_value=program_count)]))
+      for s in summaries:
+        self.summary_writer.add_summary(s, global_step)
+      self.last_summary_time = time.time()
+  def _local_step_summary(self):
+    """Compute number of local steps per time increment."""
+    dt = time.time() - self.last_summary_time
+    steps_per_time = self.summary_interval / float(dt)
+    return tf.Summary(value=[
+        tf.Summary.Value(
+            tag='local_step/per_sec',
+            simple_value=steps_per_time),
+        tf.Summary.Value(
+            tag='local_step/step',
+            simple_value=self.local_step)])
+  def maybe_save_best_model(self, session, saver, checkpoint_file):
+    """Check if this model got the highest reward and save to disk if so."""
+    if self.is_chief and session.run(self.is_best_model):
+      logging.info('Saving best model to "%s"', checkpoint_file)
+      saver.save(session, checkpoint_file)
+      session.run(self.reset_is_best_model)
+  def save_replay_buffer(self):
+    """Save replay buffer to disk.
+    Call this periodically so that training can recover if jobs go down.
+    """
+    if self.model.experience_replay is not None:
+      logging.info('Saving experience replay buffer to "%s".',
+                   self.model.experience_replay.save_file)
+      self.model.experience_replay.incremental_save(True)
+  def delete_replay_buffer(self):
+    """Delete replay buffer from disk.
+    Call this at the end of training to clean up. Replay buffer can get very
+    large.
+    """
+    if self.model.experience_replay is not None:
+      logging.info('Deleting experience replay buffer at "%s".',
+                   self.model.experience_replay.save_file)
+      tf.gfile.Remove(self.model.experience_replay.save_file)
+  def save_topk_buffer(self):
+    """Save top-k buffer to disk.
+    Call this periodically so that training can recover if jobs go down.
+    """
+    if self.model.top_episodes is not None:
+      logging.info('Saving top-k buffer to "%s".', self.topk_file)
+      # Overwrite previous data each time.
+      with tf.gfile.FastGFile(self.topk_file, 'w') as f:
+        f.write(cPickle.dumps(self.model.top_episodes))
+@contextlib.contextmanager
+def managed_session(sv, master='', config=None,
+                    start_standard_services=True,
+                    close_summary_writer=True,
+                    max_wait_secs=7200):
+  # Same as Supervisor.managed_session, but with configurable timeout.
+  try:
+    sess = sv.prepare_or_wait_for_session(
+        master=master, config=config,
+        start_standard_services=start_standard_services,
+        max_wait_secs=max_wait_secs)
+    yield sess
+  except tf.errors.DeadlineExceededError:
+    raise
+  except Exception as e:  # pylint: disable=broad-except
+    sv.request_stop(e)
+  finally:
+    try:
+      # Request all the threads to stop and wait for them to do so.  Any
+      # exception raised by the threads is raised again from stop().
+      # Passing stop_grace_period_secs is for blocked enqueue/dequeue
+      # threads which are not checking for `should_stop()`.  They
+      # will be stopped when we close the session further down.
+      sv.stop(close_summary_writer=close_summary_writer)
+    finally:
+      # Close the session to finish up all pending calls.  We do not care
+      # about exceptions raised when closing.  This takes care of
+      # blocked enqueue/dequeue calls.
+      try:
+        sess.close()
+      except Exception:  # pylint: disable=broad-except
+        # Silently ignore exceptions raised by close().
+        pass
+def train(config, is_chief, tuner=None, run_dir=None, run_number=0,
+          results_writer=None):
+  """Run training loop.
+  Args:
+    config: config_lib.Config instance containing global config (agent and env).
+    is_chief: True if this worker is chief. Chief worker manages writing some
+        data to disk and initialization of the global model.
+    tuner: A tuner instance. If not tuning, leave as None.
+    run_dir: Directory where all data for this run will be written. If None,
+        run_dir = FLAGS.logdir. Set this argument when doing multiple runs.
+    run_number: Which run is this.
+    results_writer: Managest writing training results to disk. Results are a
+        dict of metric names and values.
+  Returns:
+    The trainer object used to run training updates.
+  """
+  logging.info('Will run asynchronous training.')
+  if run_dir is None:
+    run_dir = FLAGS.logdir
+  train_dir = os.path.join(run_dir, 'train')
+  best_model_checkpoint = os.path.join(train_dir, 'best.ckpt')
+  events_dir = '%s/events_%d' % (run_dir, FLAGS.task_id)
+  logging.info('Events directory: %s', events_dir)
+  logging_dir = os.path.join(run_dir, 'logs')
+  if not tf.gfile.Exists(logging_dir):
+    tf.gfile.MakeDirs(logging_dir)
+  status_file = os.path.join(logging_dir, 'status.txt')
+  if FLAGS.summary_tasks and FLAGS.task_id < FLAGS.summary_tasks:
+    summary_writer = tf.summary.FileWriter(events_dir)
+  else:
+    summary_writer = None
+  # Only profile task 0.
+  if FLAGS.do_profiling:
+    logging.info('Profiling enabled')
+    profiler = cProfile.Profile()
+    profiler.enable()
+  else:
+    profiler = None
+  trainer = AsyncTrainer(
+      config, FLAGS.task_id, FLAGS.ps_tasks, FLAGS.num_workers,
+      is_chief=is_chief,
+      summary_interval=FLAGS.summary_interval,
+      summary_writer=summary_writer,
+      logging_dir=logging_dir,
+      run_number=run_number,
+      model_v=FLAGS.model_v)
+  variables_to_save = [v for v in tf.global_variables()
+                       if v.name.startswith('global')]
+  global_init_op = tf.variables_initializer(variables_to_save)
+  saver = tf.train.Saver(variables_to_save)
+  var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
+                               tf.get_variable_scope().name)
+  logging.info('Trainable vars:')
+  for v in var_list:
+    logging.info('  %s, %s, %s', v.name, v.device, v.get_shape())
+  logging.info('All vars:')
+  for v in tf.global_variables():
+    logging.info('  %s, %s, %s', v.name, v.device, v.get_shape())
+  def init_fn(unused_sess):
+    logging.info('No checkpoint found. Initialized global params.')
+  sv = tf.train.Supervisor(is_chief=is_chief,
+                           logdir=train_dir,
+                           saver=saver,
+                           summary_op=None,
+                           init_op=global_init_op,
+                           init_fn=init_fn,
+                           summary_writer=summary_writer,
+                           ready_op=tf.report_uninitialized_variables(
+                               variables_to_save),
+                           ready_for_local_init_op=None,
+                           global_step=trainer.global_step,
+                           save_model_secs=30,
+                           save_summaries_secs=30)
+  # Add a thread that periodically checks if this Trial should stop
+  # based on an early stopping policy.
+  if tuner:
+    sv.Loop(60, tuner.check_for_stop, (sv.coord,))
+  last_replay_save_time = time.time()
+  global_step = -1
+  logging.info(
+      'Starting session. '
+      'If this hangs, we\'re mostly likely waiting to connect '
+      'to the parameter server. One common cause is that the parameter '
+      'server DNS name isn\'t resolving yet, or is misspecified.')
+  should_retry = True
+  supervisor_deadline_exceeded = False
+  while should_retry:
+    try:
+      with managed_session(
+          sv, FLAGS.master, max_wait_secs=60) as session, session.as_default():
+        should_retry = False
+        do_training = True
+        try:
+          trainer.initialize(session)
+          if session.run(trainer.run_number) != run_number:
+            # If we loaded existing model from disk, and the saved run number is
+            # different, throw an exception.
+            raise RuntimeError(
+                'Expecting to be on run %d, but is actually on run %d. '
+                'run_dir: "%s"'
+                % (run_number, session.run(trainer.run_number), run_dir))
+          global_step = trainer.cached_global_step
+          logging.info('Starting training at step=%d', global_step)
+          while do_training:
+            trainer.update_global_model(session)
+            if is_chief:
+              trainer.maybe_save_best_model(
+                  session, saver, best_model_checkpoint)
+            global_step = trainer.cached_global_step
+            global_npe = trainer.cached_global_npe
+            if time.time() - last_replay_save_time >= 30:
+              trainer.save_replay_buffer()
+              trainer.save_topk_buffer()
+              last_replay_save_time = time.time()
+            # Stopping conditions.
+            if tuner and tuner.should_trial_stop():
+              logging.info('Tuner requested early stopping. Finishing.')
+              do_training = False
+            if is_chief and FLAGS.stop_on_success:
+              found_solution = session.run(trainer.found_solution_flag)
+              if found_solution:
+                do_training = False
+                logging.info('Solution found. Finishing.')
+            if FLAGS.max_npe and global_npe >= FLAGS.max_npe:
+              # Max NPE (number of programs executed) reached.
+              logging.info('Max NPE reached. Finishing.')
+              do_training = False
+            if sv.should_stop():
+              logging.info('Supervisor issued stop. Finishing.')
+              do_training = False
+        except tf.errors.NotFoundError:
+          # Catch "Error while reading resource variable".
+          # The chief worker likely destroyed the container, so do not retry.
+          logging.info('Caught NotFoundError. Quitting.')
+          do_training = False
+          should_retry = False
+          break
+        except tf.errors.InternalError as e:
+          # Catch "Invalid variable reference."
+          if str(e).startswith('Invalid variable reference.'):
+            # The chief worker likely destroyed the container, so do not
+            # retry.
+            logging.info(
+                'Caught "InternalError: Invalid variable reference.". '
+                'Quitting.')
+            do_training = False
+            should_retry = False
+            break
+          else:
+            # Pass exception through.
+            raise
+        # Exited training loop. Write results to disk.
+        if is_chief and results_writer:
+          assert not should_retry
+          with tf.gfile.FastGFile(status_file, 'w') as f:
+            f.write('done')
+          (program_count,
+           found_solution,
+           code_solution,
+           best_reward,
+           global_step) = session.run(
+               [trainer.program_count,
+                trainer.found_solution_flag,
+                trainer.code_solution_variable,
+                trainer.global_best_reward,
+                trainer.global_step])
+          results_dict = {
+              'max_npe': FLAGS.max_npe,
+              'batch_size': config.batch_size,
+              'max_batches': FLAGS.max_npe // config.batch_size,
+              'npe': program_count,
+              'max_global_repetitions': FLAGS.num_repetitions,
+              'max_local_repetitions': FLAGS.num_repetitions,
+              'code_solution': code_solution,
+              'best_reward': best_reward,
+              'num_batches': global_step,
+              'found_solution': found_solution,
+              'task': trainer.data_manager.task_name,
+              'global_rep': run_number}
+          logging.info('results_dict: %s', results_dict)
+          results_writer.append(results_dict)
+    except tf.errors.AbortedError:
+      # Catch "Graph handle is not found" error due to preempted jobs.
+      logging.info('Caught AbortedError. Retying.')
+      should_retry = True
+    except tf.errors.DeadlineExceededError:
+      supervisor_deadline_exceeded = True
+      should_retry = False
+  if is_chief:
+    logging.info('This is chief worker. Stopping all workers.')
+    sv.stop()
+  if supervisor_deadline_exceeded:
+    logging.info('Supervisor timed out. Quitting.')
+  else:
+    logging.info('Reached %s steps. Worker stopped.', global_step)
+  # Dump profiling.
+  """
+  How to use profiling data.
+  Download the profiler dump to your local machine, say to PROF_FILE_PATH.
+  In a separate script, run something like the following:
+  import pstats
+  p = pstats.Stats(PROF_FILE_PATH)
+  p.strip_dirs().sort_stats('cumtime').print_stats()
+  This will sort by 'cumtime', which "is the cumulative time spent in this and
+  all subfunctions (from invocation till exit)."
+  https://docs.python.org/2/library/profile.html#instant-user-s-manual
+  """  # pylint: disable=pointless-string-statement
+  if profiler:
+    prof_file = os.path.join(run_dir, 'task_%d.prof' % FLAGS.task_id)
+    logging.info('Done profiling.\nDumping to "%s".', prof_file)
+    profiler.create_stats()
+    with tf.gfile.Open(prof_file, 'w') as f:
+      f.write(marshal.dumps(profiler.stats))
+  return trainer
+def run_training(config=None, tuner=None, logdir=None, trial_name=None,
+                 is_chief=True):
+  """Do all training runs.
+  This is the top level training function for policy gradient based models.
+  Run this from the main function.
+  Args:
+    config: config_lib.Config instance containing global config (agent and
+        environment hparams). If None, config will be parsed from FLAGS.config.
+    tuner: A tuner instance. Leave as None if not tuning.
+    logdir: Parent directory where all data from all runs will be written. If
+        None, FLAGS.logdir will be used.
+    trial_name: If tuning, set this to a unique string that identifies this
+        trial. If `tuner` is not None, this also must be set.
+    is_chief: True if this worker is the chief.
+  Returns:
+    List of results dicts which were written to disk. Each training run gets a
+    results dict. Results dict contains metrics, i.e. (name, value) pairs which
+    give information about the training run.
+  Raises:
+    ValueError: If results dicts read from disk contain invalid data.
+  """
+  if not config:
+    # If custom config is not given, get it from flags.
+    config = defaults.default_config_with_updates(FLAGS.config)
+  if not logdir:
+    logdir = FLAGS.logdir
+  if not tf.gfile.Exists(logdir):
+    tf.gfile.MakeDirs(logdir)
+  assert FLAGS.num_repetitions > 0
+  results = results_lib.Results(logdir)
+  results_list, _ = results.read_all()
+  logging.info('Starting experiment. Directory: "%s"', logdir)
+  if results_list:
+    if results_list[0]['max_npe'] != FLAGS.max_npe:
+      raise ValueError(
+          'Cannot resume training. Max-NPE changed. Was %s, now %s',
+          results_list[0]['max_npe'], FLAGS.max_npe)
+    if results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
+      raise ValueError(
+          'Cannot resume training. Number of repetitions changed. Was %s, '
+          'now %s',
+          results_list[0]['max_global_repetitions'],
+          FLAGS.num_repetitions)
+  while len(results_list) < FLAGS.num_repetitions:
+    run_number = len(results_list)
+    rep_container_name = trial_name if trial_name else 'container'
+    if FLAGS.num_repetitions > 1:
+      rep_dir = os.path.join(logdir, 'run_%d' % run_number)
+      rep_container_name = rep_container_name + '_run_' + str(run_number)
+    else:
+      rep_dir = logdir
+    logging.info(
+        'Starting repetition %d (%d out of %d)', run_number, run_number + 1,
+        FLAGS.num_repetitions)
+    # Train will write result to disk.
+    with tf.container(rep_container_name):
+      trainer = train(config, is_chief, tuner, rep_dir, run_number, results)
+    logging.info('Done training.')
+    if is_chief:
+      # Destroy current container immediately (clears current graph).
+      logging.info('Clearing shared variables.')
+      tf.Session.reset(FLAGS.master, containers=[rep_container_name])
+      logging.info('Shared variables cleared.')
+      # Delete replay buffer on disk.
+      assert trainer
+      trainer.delete_replay_buffer()
+    else:
+      # Give chief worker time to clean up.
+      sleep_sec = 30.0
+      logging.info('Sleeping for %s sec.', sleep_sec)
+      time.sleep(sleep_sec)
+    tf.reset_default_graph()
+    logging.info('Default graph reset.')
+    # Expecting that train wrote new result to disk before returning.
+    results_list, _ = results.read_all()
+  return results_list
--- a/research/brain_coder/single_task/pg_train_test.py
+++ b/research/brain_coder/single_task/pg_train_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for pg_train.
+These tests excersize code paths available through configuration options.
+Training will be run for just a few steps with the goal being to check that
+nothing crashes.
+"""
+from absl import flags
+import tensorflow as tf
+from single_task import defaults  # brain coder
+from single_task import run  # brain coder
+FLAGS = flags.FLAGS
+class TrainTest(tf.test.TestCase):
+  def RunTrainingSteps(self, config_string, num_steps=10):
+    """Run a few training steps with the given config.
+    Just check that nothing crashes.
+    Args:
+      config_string: Config encoded in a string. See
+          $REPO_PATH/common/config_lib.py
+      num_steps: Number of training steps to run. Defaults to 10.
+    """
+    config = defaults.default_config_with_updates(config_string)
+    FLAGS.master = ''
+    FLAGS.max_npe = num_steps * config.batch_size
+    FLAGS.summary_interval = 1
+    FLAGS.logdir = tf.test.get_temp_dir()
+    FLAGS.config = config_string
+    tf.reset_default_graph()
+    run.main(None)
+  def testVanillaPolicyGradient(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg"),'
+        'timestep_limit=90,batch_size=64')
+  def testVanillaPolicyGradient_VariableLengthSequences(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",eos_token=False),'
+        'timestep_limit=90,batch_size=64')
+  def testVanillaActorCritic(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",ema_baseline_decay=0.0),'
+        'timestep_limit=90,batch_size=64')
+  def testPolicyGradientWithTopK(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10),'
+        'timestep_limit=90,batch_size=64')
+  def testVanillaActorCriticWithTopK(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",ema_baseline_decay=0.0,topk_loss_hparam=1.0,'
+        'topk=10),'
+        'timestep_limit=90,batch_size=64')
+  def testPolicyGradientWithTopK_VariableLengthSequences(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10,eos_token=False),'
+        'timestep_limit=90,batch_size=64')
+  def testPolicyGradientWithImportanceSampling(self):
+    self.RunTrainingSteps(
+        'env=c(task="reverse"),'
+        'agent=c(algorithm="pg",alpha=0.5),'
+        'timestep_limit=90,batch_size=64')
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/results_lib.py
+++ b/research/brain_coder/single_task/results_lib.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Results object manages distributed reading and writing of results to disk."""
+import ast
+from collections import namedtuple
+import os
+import re
+import tensorflow as tf
+ShardStats = namedtuple(
+    'ShardStats',
+    ['num_local_reps_completed', 'max_local_reps', 'finished'])
+def ge_non_zero(a, b):
+  return a >= b and b > 0
+def get_shard_id(file_name):
+  assert file_name[-4:].lower() == '.txt'
+  return int(file_name[file_name.rfind('_') + 1: -4])
+class Results(object):
+  """Manages reading and writing training results to disk asynchronously.
+  Each worker writes to its own file, so that there are no race conditions when
+  writing happens. However any worker may read any file, as is the case for
+  `read_all`. Writes are expected to be atomic so that workers will never
+  read incomplete data, and this is likely to be the case on Unix systems.
+  Reading out of date data is fine, as workers calling `read_all` will wait
+  until data from every worker has been written before proceeding.
+  """
+  file_template = 'experiment_results_{0}.txt'
+  search_regex = r'^experiment_results_([0-9])+\.txt$'
+  def __init__(self, log_dir, shard_id=0):
+    """Construct `Results` instance.
+    Args:
+      log_dir: Where to write results files.
+      shard_id: Unique id for this file (i.e. shard). Each worker that will
+          be writing results should use a different shard id. If there are
+          N shards, each shard should be numbered 0 through N-1.
+    """
+    # Use different files for workers so that they can write to disk async.
+    assert 0 <= shard_id
+    self.file_name = self.file_template.format(shard_id)
+    self.log_dir = log_dir
+    self.results_file = os.path.join(self.log_dir, self.file_name)
+  def append(self, metrics):
+    """Append results to results list on disk."""
+    with tf.gfile.FastGFile(self.results_file, 'a') as writer:
+      writer.write(str(metrics) + '\n')
+  def read_this_shard(self):
+    """Read only from this shard."""
+    return self._read_shard(self.results_file)
+  def _read_shard(self, results_file):
+    """Read only from the given shard file."""
+    try:
+      with tf.gfile.FastGFile(results_file, 'r') as reader:
+        results = [ast.literal_eval(entry) for entry in reader]
+    except tf.errors.NotFoundError:
+      # No results written to disk yet. Return empty list.
+      return []
+    return results
+  def _get_max_local_reps(self, shard_results):
+    """Get maximum number of repetitions the given shard needs to complete.
+    Worker working on each shard needs to complete a certain number of runs
+    before it finishes. This method will return that number so that we can
+    determine which shards are still not done.
+    We assume that workers are including a 'max_local_repetitions' value in
+    their results, which should be the total number of repetitions it needs to
+    run.
+    Args:
+      shard_results: Dict mapping metric names to values. This should be read
+          from a shard on disk.
+    Returns:
+      Maximum number of repetitions the given shard needs to complete.
+    """
+    mlrs = [r['max_local_repetitions'] for r in shard_results]
+    if not mlrs:
+      return 0
+    for n in mlrs[1:]:
+      assert n == mlrs[0], 'Some reps have different max rep.'
+    return mlrs[0]
+  def read_all(self, num_shards=None):
+    """Read results across all shards, i.e. get global results list.
+    Args:
+      num_shards: (optional) specifies total number of shards. If the caller
+          wants information about which shards are incomplete, provide this
+          argument (so that shards which have yet to be created are still
+          counted as incomplete shards). Otherwise, no information about
+          incomplete shards will be returned.
+    Returns:
+      aggregate: Global list of results (across all shards).
+      shard_stats: List of ShardStats instances, one for each shard. Or None if
+          `num_shards` is None.
+    """
+    try:
+      all_children = tf.gfile.ListDirectory(self.log_dir)
+    except tf.errors.NotFoundError:
+      if num_shards is None:
+        return [], None
+      return [], [[] for _ in xrange(num_shards)]
+    shard_ids = {
+        get_shard_id(fname): fname
+        for fname in all_children if re.search(self.search_regex, fname)}
+    if num_shards is None:
+      aggregate = []
+      shard_stats = None
+      for results_file in shard_ids.values():
+        aggregate.extend(self._read_shard(
+            os.path.join(self.log_dir, results_file)))
+    else:
+      results_per_shard = [None] * num_shards
+      for shard_id in xrange(num_shards):
+        if shard_id in shard_ids:
+          results_file = shard_ids[shard_id]
+          results_per_shard[shard_id] = self._read_shard(
+              os.path.join(self.log_dir, results_file))
+        else:
+          results_per_shard[shard_id] = []
+      # Compute shard stats.
+      shard_stats = []
+      for shard_results in results_per_shard:
+        max_local_reps = self._get_max_local_reps(shard_results)
+        shard_stats.append(ShardStats(
+            num_local_reps_completed=len(shard_results),
+            max_local_reps=max_local_reps,
+            finished=ge_non_zero(len(shard_results), max_local_reps)))
+      # Compute aggregate.
+      aggregate = [
+          r for shard_results in results_per_shard for r in shard_results]
+    return aggregate, shard_stats
--- a/research/brain_coder/single_task/results_lib_test.py
+++ b/research/brain_coder/single_task/results_lib_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for results_lib."""
+import contextlib
+import os
+import shutil
+import tempfile
+import tensorflow as tf
+from single_task import results_lib  # brain coder
+@contextlib.contextmanager
+def temporary_directory(suffix='', prefix='tmp', base_path=None):
+  """A context manager to create a temporary directory and clean up on exit.
+  The parameters are the same ones expected by tempfile.mkdtemp.
+  The directory will be securely and atomically created.
+  Everything under it will be removed when exiting the context.
+  Args:
+    suffix: optional suffix.
+    prefix: options prefix.
+    base_path: the base path under which to create the temporary directory.
+  Yields:
+    The absolute path of the new temporary directory.
+  """
+  temp_dir_path = tempfile.mkdtemp(suffix, prefix, base_path)
+  try:
+    yield temp_dir_path
+  finally:
+    try:
+      shutil.rmtree(temp_dir_path)
+    except OSError as e:
+      if e.message == 'Cannot call rmtree on a symbolic link':
+        # Interesting synthetic exception made up by shutil.rmtree.
+        # Means we received a symlink from mkdtemp.
+        # Also means must clean up the symlink instead.
+        os.unlink(temp_dir_path)
+      else:
+        raise
+def freeze(dictionary):
+  """Convert dict to hashable frozenset."""
+  return frozenset(dictionary.iteritems())
+class ResultsLibTest(tf.test.TestCase):
+  def testResults(self):
+    with temporary_directory() as logdir:
+      results_obj = results_lib.Results(logdir)
+      self.assertEqual(results_obj.read_this_shard(), [])
+      results_obj.append(
+          {'foo': 1.5, 'bar': 2.5, 'baz': 0})
+      results_obj.append(
+          {'foo': 5.5, 'bar': -1, 'baz': 2})
+      self.assertEqual(
+          results_obj.read_this_shard(),
+          [{'foo': 1.5, 'bar': 2.5, 'baz': 0},
+           {'foo': 5.5, 'bar': -1, 'baz': 2}])
+  def testShardedResults(self):
+    with temporary_directory() as logdir:
+      n = 4  # Number of shards.
+      results_objs = [
+          results_lib.Results(logdir, shard_id=i) for i in xrange(n)]
+      for i, robj in enumerate(results_objs):
+        robj.append({'foo': i, 'bar': 1 + i * 2})
+      results_list, _ = results_objs[0].read_all()
+      # Check results. Order does not matter here.
+      self.assertEqual(
+          set(freeze(r) for r in results_list),
+          set(freeze({'foo': i, 'bar': 1 + i * 2}) for i in xrange(n)))
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/single_task/run.py
+++ b/research/brain_coder/single_task/run.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+r"""Run training.
+Choose training algorithm and task(s) and follow these examples.
+Run synchronous policy gradient training locally:
+CONFIG="agent=c(algorithm='pg'),env=c(task='reverse')"
+OUT_DIR="/tmp/bf_pg_local"
+rm -rf $OUT_DIR
+bazel run -c opt single_task:run -- \
+    --alsologtostderr \
+    --config="$CONFIG" \
+    --max_npe=0 \
+    --logdir="$OUT_DIR" \
+    --summary_interval=1 \
+    --model_v=0
+learning/brain/tensorboard/tensorboard.sh --port 12345 --logdir "$OUT_DIR"
+Run genetic algorithm locally:
+CONFIG="agent=c(algorithm='ga'),env=c(task='reverse')"
+OUT_DIR="/tmp/bf_ga_local"
+rm -rf $OUT_DIR
+bazel run -c opt single_task:run -- \
+    --alsologtostderr \
+    --config="$CONFIG" \
+    --max_npe=0 \
+    --logdir="$OUT_DIR"
+Run uniform random search locally:
+CONFIG="agent=c(algorithm='rand'),env=c(task='reverse')"
+OUT_DIR="/tmp/bf_rand_local"
+rm -rf $OUT_DIR
+bazel run -c opt single_task:run -- \
+    --alsologtostderr \
+    --config="$CONFIG" \
+    --max_npe=0 \
+    --logdir="$OUT_DIR"
+"""
+from absl import app
+from absl import flags
+from absl import logging
+from single_task import defaults  # brain coder
+from single_task import ga_train  # brain coder
+from single_task import pg_train  # brain coder
+FLAGS = flags.FLAGS
+flags.DEFINE_string('config', '', 'Configuration.')
+flags.DEFINE_string(
+    'logdir', None, 'Absolute path where to write results.')
+flags.DEFINE_integer('task_id', 0, 'ID for this worker.')
+flags.DEFINE_integer('num_workers', 1, 'How many workers there are.')
+flags.DEFINE_integer(
+    'max_npe', 0,
+    'NPE = number of programs executed. Maximum number of programs to execute '
+    'in each run. Training will complete when this threshold is reached. Set '
+    'to 0 for unlimited training.')
+flags.DEFINE_integer(
+    'num_repetitions', 1,
+    'Number of times the same experiment will be run (globally across all '
+    'workers). Each run is independent.')
+flags.DEFINE_string(
+    'log_level', 'INFO',
+    'The threshold for what messages will be logged. One of DEBUG, INFO, WARN, '
+    'ERROR, or FATAL.')
+# To register an algorithm:
+# 1) Add dependency in the BUILD file to this build rule.
+# 2) Import the algorithm's module at the top of this file.
+# 3) Add a new entry in the following dict. The key is the algorithm name
+#    (used to select the algorithm in the config). The value is the module
+#    defining the expected functions for training and tuning. See the docstring
+#    for `get_namespace` for further details.
+ALGORITHM_REGISTRATION = {
+    'pg': pg_train,
+    'ga': ga_train,
+    'rand': ga_train,
+}
+def get_namespace(config_string):
+  """Get namespace for the selected algorithm.
+  Users who want to add additional algorithm types should modify this function.
+  The algorithm's namespace should contain the following functions:
+    run_training: Run the main training loop.
+    define_tuner_hparam_space: Return the hparam tuning space for the algo.
+    write_hparams_to_config: Helper for tuning. Write hparams chosen for tuning
+        to the Config object.
+  Look at pg_train.py and ga_train.py for function signatures and
+  implementations.
+  Args:
+    config_string: String representation of a Config object. This will get
+        parsed into a Config in order to determine what algorithm to use.
+  Returns:
+    algorithm_namespace: The module corresponding to the algorithm given in the
+        config.
+    config: The Config object resulting from parsing `config_string`.
+  Raises:
+    ValueError: If config.agent.algorithm is not one of the registered
+        algorithms.
+  """
+  config = defaults.default_config_with_updates(config_string)
+  if config.agent.algorithm not in ALGORITHM_REGISTRATION:
+    raise ValueError('Unknown algorithm type "%s"' % (config.agent.algorithm,))
+  else:
+    return ALGORITHM_REGISTRATION[config.agent.algorithm], config
+def main(argv):
+  del argv  # Unused.
+  logging.set_verbosity(FLAGS.log_level)
+  flags.mark_flag_as_required('logdir')
+  if FLAGS.num_workers <= 0:
+    raise ValueError('num_workers flag must be greater than 0.')
+  if FLAGS.task_id < 0:
+    raise ValueError('task_id flag must be greater than or equal to 0.')
+  if FLAGS.task_id >= FLAGS.num_workers:
+    raise ValueError(
+        'task_id flag must be strictly less than num_workers flag.')
+  ns, _ = get_namespace(FLAGS.config)
+  ns.run_training(is_chief=FLAGS.task_id == 0)
+if __name__ == '__main__':
+  app.run(main)
--- a/research/brain_coder/single_task/run_eval_tasks.py
+++ b/research/brain_coder/single_task/run_eval_tasks.py
+#!/usr/bin/env python
+from __future__ import print_function
+r"""This script can launch any eval experiments from the paper.
+This is a script. Run with python, not bazel.
+Usage:
+./single_task/run_eval_tasks.py \
+    --exp EXP --desc DESC [--tuning_tasks] [--iclr_tasks] [--task TASK] \
+    [--tasks TASK1 TASK2 ...]
+where EXP is one of the keys in `experiments`,
+and DESC is a string description of the set of experiments (such as "v0")
+Set only one of these flags:
+--tuning_tasks flag only runs tuning tasks.
+--iclr_tasks flag only runs the tasks included in the paper.
+--regression_tests flag runs tasks which function as regression tests.
+--task flag manually selects a single task to run.
+--tasks flag takes a custom list of tasks.
+Other flags:
+--reps N specifies N repetitions per experiment, Default is 25.
+--training_replicas R specifies that R workers will be launched to train one
+    task (for neural network algorithms). These workers will update a global
+    model stored on a parameter server. Defaults to 1. If R > 1, a parameter
+    server will also be launched.
+Run everything:
+exps=( pg-20M pg-topk-20M topk-20M ga-20M rand-20M )
+BIN_DIR="single_task"
+for exp in "${exps[@]}"
+do
+  ./$BIN_DIR/run_eval_tasks.py \
+      --exp "$exp" --iclr_tasks
+done
+"""
+import argparse
+from collections import namedtuple
+import subprocess
+S = namedtuple('S', ['length'])
+default_length = 100
+iclr_tasks = [
+    'reverse', 'remove-char', 'count-char', 'add', 'bool-logic', 'print-hello',
+    'echo-twice', 'echo-thrice', 'copy-reverse', 'zero-cascade', 'cascade',
+    'shift-left', 'shift-right', 'riffle', 'unriffle', 'middle-char',
+    'remove-last', 'remove-last-two', 'echo-alternating', 'echo-half', 'length',
+    'echo-second-seq', 'echo-nth-seq', 'substring', 'divide-2', 'dedup']
+regression_test_tasks = ['reverse', 'test-hill-climb']
+E = namedtuple(
+    'E',
+    ['name', 'method_type', 'config', 'simplify', 'batch_size', 'max_npe'])
+def make_experiment_settings(name, **kwargs):
+  # Unpack experiment info from name.
+  def split_last(string, char):
+    i = string.rindex(char)
+    return string[:i], string[i+1:]
+  def si_to_int(si_string):
+    return int(
+        si_string.upper().replace('K', '0'*3).replace('M', '0'*6)
+        .replace('G', '0'*9))
+  method_type, max_npe = split_last(name, '-')
+  assert method_type
+  assert max_npe
+  return E(
+      name=name, method_type=method_type, max_npe=si_to_int(max_npe), **kwargs)
+experiments_set = {
+    make_experiment_settings(
+        'pg-20M',
+        config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.0,topk=0,'
+               'pi_loss_hparam=1.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'pg-topk-20M',
+        config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=50.0,topk=10,'
+               'pi_loss_hparam=1.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'topk-20M',
+        config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
+               'pi_loss_hparam=0.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'topk-0ent-20M',
+        config='entropy_beta=0.000,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
+               'pi_loss_hparam=0.0,alpha=0.0',
+        simplify=False,
+        batch_size=64),
+    make_experiment_settings(
+        'ga-20M',
+        config='crossover_rate=0.95,mutation_rate=0.15',
+        simplify=False,
+        batch_size=100),  # Population size.
+    make_experiment_settings(
+        'rand-20M',
+        config='',
+        simplify=False,
+        batch_size=1),
+    make_experiment_settings(
+        'simpl-500M',
+        config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.5,topk=10,'
+               'pi_loss_hparam=1.0,alpha=0.0',
+        simplify=True,
+        batch_size=64),
+}
+experiments = {e.name: e for e in experiments_set}
+# pylint: disable=redefined-outer-name
+def parse_args(extra_args=()):
+  """Parse arguments and extract task and experiment info."""
+  parser = argparse.ArgumentParser(description='Run all eval tasks.')
+  parser.add_argument('--exp', required=True)
+  parser.add_argument('--tuning_tasks', action='store_true')
+  parser.add_argument('--iclr_tasks', action='store_true')
+  parser.add_argument('--regression_tests', action='store_true')
+  parser.add_argument('--desc', default='v0')
+  parser.add_argument('--reps', default=25)
+  parser.add_argument('--task')
+  parser.add_argument('--tasks', nargs='+')
+  for arg_string, default in extra_args:
+    parser.add_argument(arg_string, default=default)
+  args = parser.parse_args()
+  print('Running experiment: %s' % (args.exp,))
+  if args.desc:
+    print('Extra description: "%s"' % (args.desc,))
+  if args.exp not in experiments:
+    raise ValueError('Experiment name is not valid')
+  experiment_name = args.exp
+  experiment_settings = experiments[experiment_name]
+  assert experiment_settings.name == experiment_name
+  if args.tasks:
+    print('Launching tasks from args: %s' % (args.tasks,))
+    tasks = {t: S(length=default_length) for t in args.tasks}
+  elif args.task:
+    print('Launching single task "%s"' % args.task)
+    tasks = {args.task: S(length=default_length)}
+  elif args.tuning_tasks:
+    print('Only running tuning tasks')
+    tasks = {name: S(length=default_length)
+             for name in ['reverse-tune', 'remove-char-tune']}
+  elif args.iclr_tasks:
+    print('Running eval tasks from ICLR paper.')
+    tasks = {name: S(length=default_length) for name in iclr_tasks}
+  elif args.regression_tests:
+    tasks = {name: S(length=default_length) for name in regression_test_tasks}
+  print('Tasks: %s' % tasks.keys())
+  print('reps = %d' % (int(args.reps),))
+  return args, tasks, experiment_settings
+def run(command_string):
+  subprocess.call(command_string, shell=True)
+if __name__ == '__main__':
+  LAUNCH_TRAINING_COMMAND = 'single_task/launch_training.sh'
+  COMPILE_COMMAND = 'bazel build -c opt single_task:run.par'
+  args, tasks, experiment_settings = parse_args(
+      extra_args=(('--training_replicas', 1),))
+  if experiment_settings.method_type in (
+      'pg', 'pg-topk', 'topk', 'topk-0ent', 'simpl'):
+    # Runs PG and TopK.
+    def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
+                     batch_size, do_simplify, custom_config_str):
+      """Constructs terminal command for launching NN based algorithms.
+      The arguments to this function will be used to create config for the
+      experiment.
+      Args:
+        job_name: Name of the job to launch. Should uniquely identify this
+            experiment run.
+        task: Name of the coding task to solve.
+        max_npe: Maximum number of programs executed. An integer.
+        num_reps: Number of times to run the experiment. An integer.
+        code_length: Maximum allowed length of synthesized code.
+        batch_size: Minibatch size for gradient descent.
+        do_simplify: Whether to run the experiment in code simplification mode.
+            A bool.
+        custom_config_str: Additional config for the model config string.
+      Returns:
+        The terminal command that launches the specified experiment.
+      """
+      config = """
+        env=c(task='{0}',correct_syntax=False),
+        agent=c(
+          algorithm='pg',
+          policy_lstm_sizes=[35,35],value_lstm_sizes=[35,35],
+          grad_clip_threshold=50.0,param_init_factor=0.5,regularizer=0.0,
+          softmax_tr=1.0,optimizer='rmsprop',ema_baseline_decay=0.99,
+          eos_token={3},{4}),
+        timestep_limit={1},batch_size={2}
+      """.replace(' ', '').replace('\n', '').format(
+          task, code_length, batch_size, do_simplify, custom_config_str)
+      num_ps = 0 if args.training_replicas == 1 else 1
+      return (
+          r'{0} --job_name={1} --config="{2}" --max_npe={3} '
+          '--num_repetitions={4} --num_workers={5} --num_ps={6} '
+          '--stop_on_success={7}'
+          .format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
+                  args.training_replicas, num_ps, str(not do_simplify).lower()))
+  else:
+    # Runs GA and Rand.
+    assert experiment_settings.method_type in ('ga', 'rand')
+    def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
+                     batch_size, do_simplify, custom_config_str):
+      """Constructs terminal command for launching GA or uniform random search.
+      The arguments to this function will be used to create config for the
+      experiment.
+      Args:
+        job_name: Name of the job to launch. Should uniquely identify this
+            experiment run.
+        task: Name of the coding task to solve.
+        max_npe: Maximum number of programs executed. An integer.
+        num_reps: Number of times to run the experiment. An integer.
+        code_length: Maximum allowed length of synthesized code.
+        batch_size: Minibatch size for gradient descent.
+        do_simplify: Whether to run the experiment in code simplification mode.
+            A bool.
+        custom_config_str: Additional config for the model config string.
+      Returns:
+        The terminal command that launches the specified experiment.
+      """
+      assert not do_simplify
+      if custom_config_str:
+        custom_config_str = ',' + custom_config_str
+      config = """
+        env=c(task='{0}',correct_syntax=False),
+        agent=c(
+          algorithm='{4}'
+          {3}),
+        timestep_limit={1},batch_size={2}
+      """.replace(' ', '').replace('\n', '').format(
+          task, code_length, batch_size, custom_config_str,
+          experiment_settings.method_type)
+      num_workers = num_reps  # Do each rep in parallel.
+      return (
+          r'{0} --job_name={1} --config="{2}" --max_npe={3} '
+          '--num_repetitions={4} --num_workers={5} --num_ps={6} '
+          '--stop_on_success={7}'
+          .format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
+                  num_workers, 0, str(not do_simplify).lower()))
+  print('Compiling...')
+  run(COMPILE_COMMAND)
+  print('Launching %d coding tasks...' % len(tasks))
+  for task, task_settings in tasks.iteritems():
+    name = 'bf_rl_iclr'
+    desc = '{0}.{1}_{2}'.format(args.desc, experiment_settings.name, task)
+    job_name = '{}.{}'.format(name, desc)
+    print('Job name: %s' % job_name)
+    reps = int(args.reps) if not experiment_settings.simplify else 1
+    run_cmd = make_run_cmd(
+        job_name, task, experiment_settings.max_npe, reps,
+        task_settings.length, experiment_settings.batch_size,
+        experiment_settings.simplify,
+        experiment_settings.config)
+    print('Running command:\n' + run_cmd)
+    run(run_cmd)
+  print('Done.')
+# pylint: enable=redefined-outer-name
--- a/research/brain_coder/single_task/test_tasks.py
+++ b/research/brain_coder/single_task/test_tasks.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tasks that test correctness of algorithms."""
+from common import reward as reward_lib  # brain coder
+from single_task import misc  # brain coder
+class BasicTaskManager(object):
+  """Wraps a generic reward function."""
+  def __init__(self, reward_fn):
+    self.reward_fn = reward_fn
+    self.good_reward = 1.0
+  def _score_string(self, string):
+    actions = misc.bf_string_to_tokens(string)
+    reward, correct = self.reward_fn(actions)
+    return misc.RewardInfo(
+        episode_rewards=[0.0] * (len(string) - 1) + [reward],
+        input_case=None,
+        correct_output=None,
+        code_output=actions,
+        input_type=None,
+        output_type=misc.IOType.integer,
+        reason='correct' if correct else 'wrong')
+  def rl_batch(self, batch_size):
+    reward_fns = [self._score_string] * batch_size
+    return reward_fns
+class Trie(object):
+  """Trie for sequences."""
+  EOS = ()
+  def __init__(self):
+    self.trie = {}
+  def insert(self, sequence):
+    d = self.trie
+    for e in sequence:
+      if e not in d:
+        d[e] = {}
+      d = d[e]
+    d[self.EOS] = True   # Terminate sequence.
+  def prefix_match(self, sequence):
+    """Return prefix of `sequence` which exists in the trie."""
+    d = self.trie
+    index = 0
+    for i, e in enumerate(sequence + [self.EOS]):
+      index = i
+      if e in d:
+        d = d[e]
+        if e == self.EOS:
+          return sequence, True
+      else:
+        break
+    return sequence[:index], False
+  def next_choices(self, sequence):
+    d = self.trie
+    for e in sequence:
+      if e in d:
+        d = d[e]
+      else:
+        raise ValueError('Sequence not a prefix: %s' % (sequence,))
+    return d.keys()
+class HillClimbingTask(object):
+  """Simple task that tests reward hill climbing ability.
+  There are a set of paths (sequences of tokens) which are rewarded. The total
+  reward for a path is proportional to its length, so the longest path is the
+  target. Shorter paths can be dead ends.
+  """
+  def __init__(self):
+    # Paths are sequences of sub-sequences. Here we form unique sub-sequences
+    # out of 3 arbitrary ints. We use sub-sequences instead of single entities
+    # to make the task harder by making the episodes last longer, i.e. more
+    # for the agent to remember.
+    a = (1, 2, 3)
+    b = (4, 5, 6)
+    c = (7, 8, 7)
+    d = (6, 5, 4)
+    e = (3, 2, 1)
+    f = (8, 5, 1)
+    g = (6, 4, 2)
+    h = (1, 8, 3)
+    self.paths = Trie()
+    self.paths.insert([a, b, h])
+    self.paths.insert([a, b, c, d, e, f, g, h])
+    self.paths.insert([a, b, c, d, e, b, a])
+    self.paths.insert([a, b, g, h])
+    self.paths.insert([a, e, f, g])
+    self.correct_sequence = misc.flatten([a, b, c, d, e, f, g, h])
+    def distance_fn(a, b):
+      len_diff = abs(len(a) - len(b))
+      return sum(reward_lib.mod_abs_diff(ai - 1, bi - 1, 8)
+                 for ai, bi in zip(a, b)) + len_diff * 4  # 8 / 2 = 4
+    self.distance_fn = distance_fn
+  def __call__(self, actions):
+    # Compute reward for action sequence.
+    actions = [a for a in actions if a > 0]
+    sequence = [tuple(actions[i: i + 3]) for i in xrange(0, len(actions), 3)]
+    prefix, complete = self.paths.prefix_match(sequence)
+    if complete:
+      return float(len(prefix)), actions == self.correct_sequence
+    if len(prefix) == len(sequence):
+      return float(len(prefix)), False
+    next_pred = sequence[len(prefix)]
+    choices = self.paths.next_choices(prefix)
+    if choices == [()]:
+      return (len(prefix) - len(next_pred) / 3.0), False
+    min_dist = min(self.distance_fn(c, next_pred) for c in choices)
+    # +1 reward for each element in the sequence correct, plus fraction torwards
+    # closest next element.
+    # Maximum distance possible is num_actions * base / 2 = 3 * 8 / 2 = 12
+    return (len(prefix) + (1 - min_dist / 12.0)), False