Removing research/community models

f5fc733a · Byzantine · 09bc9f54 · 09bc9f54 · 09bc9f54 · 09bc9f54
Commit f5fc733a authored Feb 03, 2022 by Byzantine
20 changed files
--- a/research/brain_coder/common/reward.py
+++ b/research/brain_coder/common/reward.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Reward functions, distance functions, and reward managers."""
-
-from abc import ABCMeta
-from abc import abstractmethod
-from math import log
-
-
-# All sequences here are assumed to be lists of ints bounded
-# between 0 and `base`-1 (inclusive).
-
-
-#################################
-### Scalar Distance Functions ###
-#################################
-
-
-def abs_diff(a, b, base=0):
-  """Absolute value of difference between scalars.
-
-  abs_diff is symmetric, i.e. `a` and `b` are interchangeable.
-
-  Args:
-    a: First argument. An int.
-    b: Second argument. An int.
-    base: Dummy argument so that the argument signature matches other scalar
-        diff functions. abs_diff is the same in all bases.
-
-  Returns:
-    abs(a - b).
-  """
-  del base  # Unused.
-  return abs(a - b)
-
-
-def mod_abs_diff(a, b, base):
-  """Shortest distance between `a` and `b` in the modular integers base `base`.
-
-  The smallest distance between a and b is returned.
-  Example: mod_abs_diff(1, 99, 100) ==> 2. It is not 98.
-
-  mod_abs_diff is symmetric, i.e. `a` and `b` are interchangeable.
-
-  Args:
-    a: First argument. An int.
-    b: Second argument. An int.
-    base: The modulo base. A positive int.
-
-  Returns:
-    Shortest distance.
-  """
-  diff = abs(a - b)
-  if diff >= base:
-    diff %= base
-  return min(diff, (-diff) + base)
-
-
-###############################
-### List Distance Functions ###
-###############################
-
-
-def absolute_distance(pred, target, base, scalar_diff_fn=abs_diff):
-  """Asymmetric list distance function.
-
-  List distance is the sum of element-wise distances, like Hamming distance, but
-  where `pred` can be longer or shorter than `target`. For each position in both
-  `pred` and `target`, distance between those elements is computed with
-  `scalar_diff_fn`. For missing or extra elements in `pred`, the maximum
-  distance is assigned, which is equal to `base`.
-
-  Distance is 0 when `pred` and `target` are identical, and will be a positive
-  integer when they are not.
-
-  Args:
-    pred: Prediction list. Distance from this list is computed.
-    target: Target list. Distance to this list is computed.
-    base: The integer base to use. For example, a list of chars would use base
-        256.
-    scalar_diff_fn: Element-wise distance function.
-
-  Returns:
-    List distance between `pred` and `target`.
-  """
-  d = 0
-  for i, target_t in enumerate(target):
-    if i >= len(pred):
-      d += base  # A missing slot is worth the max distance.
-    else:
-      # Add element-wise distance for this slot.
-      d += scalar_diff_fn(pred[i], target_t, base)
-  if len(pred) > len(target):
-    # Each extra slot is worth the max distance.
-    d += (len(pred) - len(target)) * base
-  return d
-
-
-def log_absolute_distance(pred, target, base):
-  """Asymmetric list distance function that uses log distance.
-
-  A list distance which computes sum of element-wise distances, similar to
-  `absolute_distance`. Unlike `absolute_distance`, this scales the resulting
-  distance to be a float.
-
-  Element-wise distances are log-scale. Distance between two lists changes
-  relatively less for elements that are far apart, but changes a lot (goes to 0
-  faster) when values get close together.
-
-  Args:
-    pred: List of ints. Computes distance from this list to the target.
-    target: List of ints. This is the "correct" list which the prediction list
-        is trying to match.
-    base: Integer base.
-
-  Returns:
-    Float distance normalized so that when `pred` is at most as long as `target`
-    the distance is between 0.0 and 1.0. Distance grows unboundedly large
-    as `pred` grows past `target` in length.
-  """
-  if not target:
-    length_normalizer = 1.0
-    if not pred:
-      # Distance between [] and [] is 0.0 since they are equal.
-      return 0.0
-  else:
-    length_normalizer = float(len(target))
-  # max_dist is the maximum element-wise distance, before taking log and
-  # scaling. Since we use `mod_abs_diff`, it would be (base // 2), but we add
-  # 1 to it so that missing or extra positions get the maximum penalty.
-  max_dist = base // 2 + 1
-
-  # The log-distance will be scaled by a factor.
-  # Note: +1 is added to the numerator and denominator to avoid log(0). This
-  # only has a translational effect, i.e. log(dist + 1) / log(max_dist + 1).
-  factor = log(max_dist + 1)
-
-  d = 0.0  # Total distance to be computed.
-  for i, target_t in enumerate(target):
-    if i >= len(pred):
-      # Assign the max element-wise distance for missing positions. This is 1.0
-      # after scaling.
-      d += 1.0
-    else:
-      # Add the log-dist divided by a scaling factor.
-      d += log(mod_abs_diff(pred[i], target_t, base) + 1) / factor
-  if len(pred) > len(target):
-    # Add the max element-wise distance for each extra position.
-    # Since max dist after scaling is 1, this is just the difference in list
-    # lengths.
-    d += (len(pred) - len(target))
-  return d / length_normalizer  # Normalize again by the target length.
-
-
-########################
-### Reward Functions ###
-########################
-
-# Reward functions assign reward based on program output.
-# Warning: only use these functions as the terminal rewards in episodes, i.e.
-# for the "final" programs.
-
-
-def absolute_distance_reward(pred, target, base, scalar_diff_fn=abs_diff):
-  """Reward function based on absolute_distance function.
-
-  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
-  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
-  is not empty). Reward can go negative when `pred` is longer than `target`.
-
-  This is an asymmetric reward function, so which list is the prediction and
-  which is the target matters.
-
-  Args:
-    pred: Prediction sequence. This should be the sequence outputted by the
-        generated code. List of ints n, where 0 <= n < base.
-    target: Target sequence. The correct sequence that the generated code needs
-        to output. List of ints n, where 0 <= n < base.
-    base: Base of the computation.
-    scalar_diff_fn: Element-wise distance function.
-
-  Returns:
-    Reward computed based on `pred` and `target`. A float.
-  """
-  unit_dist = float(base * len(target))
-  if unit_dist == 0:
-    unit_dist = base
-  dist = absolute_distance(pred, target, base, scalar_diff_fn=scalar_diff_fn)
-  return (unit_dist - dist) / unit_dist
-
-
-def absolute_mod_distance_reward(pred, target, base):
-  """Same as `absolute_distance_reward` but `mod_abs_diff` scalar diff is used.
-
-  Args:
-    pred: Prediction sequence. This should be the sequence outputted by the
-        generated code. List of ints n, where 0 <= n < base.
-    target: Target sequence. The correct sequence that the generated code needs
-        to output. List of ints n, where 0 <= n < base.
-    base: Base of the computation.
-
-  Returns:
-    Reward computed based on `pred` and `target`. A float.
-  """
-  return absolute_distance_reward(pred, target, base, mod_abs_diff)
-
-
-def absolute_log_distance_reward(pred, target, base):
-  """Compute reward using `log_absolute_distance`.
-
-  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
-  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
-  is not empty). Reward can go negative when `pred` is longer than `target`.
-
-  This is an asymmetric reward function, so which list is the prediction and
-  which is the target matters.
-
-  This reward function has the nice property that much more reward is given
-  for getting the correct value (at each position) than for there being any
-  value at all. For example, in base 100, let's say pred = [1] * 1000
-  and target = [10] * 1000. A lot of reward would be given for being 80%
-  accurate (worst element-wise distance is 50, distances here are 9) using
-  `absolute_distance`. `log_absolute_distance` on the other hand will give
-  greater and greater reward increments the closer each predicted value gets to
-  the target. That makes the reward given for accuracy somewhat independent of
-  the base.
-
-  Args:
-    pred: Prediction sequence. This should be the sequence outputted by the
-        generated code. List of ints n, where 0 <= n < base.
-    target: Target sequence. The correct sequence that the generated code needs
-        to output. List of ints n, where 0 <= n < base.
-    base: Base of the computation.
-
-  Returns:
-    Reward computed based on `pred` and `target`. A float.
-  """
-  return 1.0 - log_absolute_distance(pred, target, base)
-
-
-#######################
-### Reward Managers ###
-#######################
-
-# Reward managers assign reward to many code attempts throughout an episode.
-
-
-class RewardManager(object):
-  """Reward managers administer reward across an episode.
-
-  Reward managers are used for "editor" environments. These are environments
-  where the agent has some way to edit its code over time, and run its code
-  many times in the same episode, so that it can make incremental improvements.
-
-  Reward managers are instantiated with a target sequence, which is the known
-  correct program output. The manager is called on the output from a proposed
-  code, and returns reward. If many proposal outputs are tried, reward may be
-  some stateful function that takes previous tries into account. This is done,
-  in part, so that an agent cannot accumulate unbounded reward just by trying
-  junk programs as often as possible. So reward managers should not give the
-  same reward twice if the next proposal is not better than the last.
-  """
-  __metaclass__ = ABCMeta
-
-  def __init__(self, target, base, distance_fn=absolute_distance):
-    self._target = list(target)
-    self._base = base
-    self._distance_fn = distance_fn
-
-  @abstractmethod
-  def __call__(self, sequence):
-    """Call this reward manager like a function to get reward.
-
-    Calls to reward manager are stateful, and will take previous sequences
-    into account. Repeated calls with the same sequence may produce different
-    rewards.
-
-    Args:
-      sequence: List of integers (each between 0 and base - 1). This is the
-          proposal sequence. Reward will be computed based on the distance
-          from this sequence to the target (distance function and target are
-          given in the constructor), as well as previous sequences tried during
-          the lifetime of this object.
-
-    Returns:
-      Float value. The reward received from this call.
-    """
-    return 0.0
-
-
-class DeltaRewardManager(RewardManager):
-  """Simple reward manager that assigns reward for the net change in distance.
-
-  Given some (possibly asymmetric) list distance function, gives reward for
-  relative changes in prediction distance to the target.
-
-  For example, if on the first call the distance is 3.0, the change in distance
-  is -3 (from starting distance of 0). That relative change will be scaled to
-  produce a negative reward for this step. On the next call, the distance is 2.0
-  which is a +1 change, and that will be scaled to give a positive reward.
-  If the final call has distance 0 (the target is achieved), that is another
-  positive change of +2. The total reward across all 3 calls is then 0, which is
-  the highest possible episode total.
-
-  Reward is scaled so that the maximum element-wise distance is worth 1.0.
-  Maximum total episode reward attainable is 0.
-  """
-
-  def __init__(self, target, base, distance_fn=absolute_distance):
-    super(DeltaRewardManager, self).__init__(target, base, distance_fn)
-    self._last_diff = 0
-
-  def _diff(self, seq):
-    return self._distance_fn(seq, self._target, self._base)
-
-  def _delta_reward(self, seq):
-    # Reward is relative to previous sequence diff.
-    # Reward is scaled so that maximum token difference is worth 1.0.
-    # Reward = (last_diff - this_diff) / self.base.
-    # Reward is positive if this sequence is closer to the target than the
-    # previous sequence, and negative if this sequence is further away.
-    diff = self._diff(seq)
-    reward = (self._last_diff - diff) / float(self._base)
-    self._last_diff = diff
-    return reward
-
-  def __call__(self, seq):
-    return self._delta_reward(seq)
-
-
-class FloorRewardManager(RewardManager):
-  """Assigns positive reward for each step taken closer to the target.
-
-  Given some (possibly asymmetric) list distance function, gives reward for
-  whenever a new episode minimum distance is reached. No reward is given if
-  the distance regresses to a higher value, so that the sum of rewards
-  for the episode is positive.
-
-  Reward is scaled so that the maximum element-wise distance is worth 1.0.
-  Maximum total episode reward attainable is len(target).
-
-  If the prediction sequence is longer than the target, a reward of -1 is given.
-  Subsequent predictions which are also longer get 0 reward. The -1 penalty
-  will be canceled out with a +1 reward when a prediction is given which is at
-  most the length of the target.
-  """
-
-  def __init__(self, target, base, distance_fn=absolute_distance):
-    super(FloorRewardManager, self).__init__(target, base, distance_fn)
-    self._last_diff = 0
-    self._min_diff = self._max_diff()
-    self._too_long_penality_given = False
-
-  def _max_diff(self):
-    return self._distance_fn([], self._target, self._base)
-
-  def _diff(self, seq):
-    return self._distance_fn(seq, self._target, self._base)
-
-  def _delta_reward(self, seq):
-    # Reward is only given if this sequence is closer to the target than any
-    # previous sequence.
-    # Reward is scaled so that maximum token difference is worth 1.0
-    # Reward = (min_diff - this_diff) / self.base
-    # Reward is always positive.
-    diff = self._diff(seq)
-    if diff < self._min_diff:
-      reward = (self._min_diff - diff) / float(self._base)
-      self._min_diff = diff
-    else:
-      reward = 0.0
-    return reward
-
-  def __call__(self, seq):
-    if len(seq) > len(self._target):  # Output is too long.
-      if not self._too_long_penality_given:
-        self._too_long_penality_given = True
-        reward = -1.0
-      else:
-        reward = 0.0  # Don't give this penalty more than once.
-      return reward
-
-    reward = self._delta_reward(seq)
-    if self._too_long_penality_given:
-      reward += 1.0  # Return the subtracted reward.
-      self._too_long_penality_given = False
-    return reward
-
--- a/research/brain_coder/common/reward_test.py
+++ b/research/brain_coder/common/reward_test.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Tests for common.reward."""
-
-from math import log
-import numpy as np
-import tensorflow as tf
-
-from common import reward  # brain coder
-
-
-class RewardTest(tf.test.TestCase):
-
-  def testAbsDiff(self):
-    self.assertEqual(5, reward.abs_diff(15, 20))
-    self.assertEqual(5, reward.abs_diff(20, 15))
-
-  def testModAbsDiff(self):
-    self.assertEqual(5, reward.mod_abs_diff(15, 20, 25))
-    self.assertEqual(5, reward.mod_abs_diff(20, 15, 25))
-    self.assertEqual(2, reward.mod_abs_diff(1, 24, 25))
-    self.assertEqual(2, reward.mod_abs_diff(24, 1, 25))
-
-    self.assertEqual(0, reward.mod_abs_diff(0, 0, 5))
-    self.assertEqual(1, reward.mod_abs_diff(0, 1, 5))
-    self.assertEqual(2, reward.mod_abs_diff(0, 2, 5))
-    self.assertEqual(2, reward.mod_abs_diff(0, 3, 5))
-    self.assertEqual(1, reward.mod_abs_diff(0, 4, 5))
-
-    self.assertEqual(0, reward.mod_abs_diff(-1, 4, 5))
-    self.assertEqual(1, reward.mod_abs_diff(-5, 4, 5))
-    self.assertEqual(1, reward.mod_abs_diff(-7, 4, 5))
-    self.assertEqual(1, reward.mod_abs_diff(13, 4, 5))
-    self.assertEqual(1, reward.mod_abs_diff(15, 4, 5))
-
-  def testAbsoluteDistance_AbsDiffMethod(self):
-    self.assertEqual(
-        4,
-        reward.absolute_distance([0], [4], 5, scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        0,
-        reward.absolute_distance([4], [4], 5, scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        0,
-        reward.absolute_distance([], [], 5, scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([1], [], 5, scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([], [1], 5, scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        0,
-        reward.absolute_distance([1, 2, 3], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        1,
-        reward.absolute_distance([1, 2, 4], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        1,
-        reward.absolute_distance([1, 2, 2], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([1, 2], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([1, 2, 3, 4], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.abs_diff))
-    self.assertEqual(
-        6,
-        reward.absolute_distance([4, 4, 4], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.abs_diff))
-
-  def testAbsoluteDistance_ModDiffMethod(self):
-    self.assertEqual(
-        1,
-        reward.absolute_distance([0], [4], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        0,
-        reward.absolute_distance([4], [4], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        0,
-        reward.absolute_distance([], [], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([1], [], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([], [1], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        0,
-        reward.absolute_distance([1, 2, 3], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        1,
-        reward.absolute_distance([1, 2, 4], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        1,
-        reward.absolute_distance([1, 2, 2], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([1, 2], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([1, 2, 3, 4], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-    self.assertEqual(
-        5,
-        reward.absolute_distance([4, 4, 4], [1, 2, 3], 5,
-                                 scalar_diff_fn=reward.mod_abs_diff))
-
-  def testLogAbsoluteDistance(self):
-    def log_diff(diff, base):
-      return log(diff + 1) / log(base // 2 + 2)
-
-    self.assertEqual(
-        log_diff(1, 5),
-        reward.log_absolute_distance([0], [4], 5))
-    self.assertEqual(
-        log_diff(2, 5),
-        reward.log_absolute_distance([1], [4], 5))
-    self.assertEqual(
-        log_diff(2, 5),
-        reward.log_absolute_distance([2], [4], 5))
-    self.assertEqual(
-        log_diff(1, 5),
-        reward.log_absolute_distance([3], [4], 5))
-    self.assertEqual(
-        log_diff(3, 5),  # max_dist = base // 2 + 1 = 3
-        reward.log_absolute_distance([], [4], 5))
-    self.assertEqual(
-        0 + log_diff(3, 5),  # max_dist = base // 2 + 1 = 3
-        reward.log_absolute_distance([4, 4], [4], 5))
-    self.assertEqual(
-        0,
-        reward.log_absolute_distance([4], [4], 5))
-    self.assertEqual(
-        0,
-        reward.log_absolute_distance([], [], 5))
-    self.assertEqual(
-        1,
-        reward.log_absolute_distance([1], [], 5))
-    self.assertEqual(
-        1,
-        reward.log_absolute_distance([], [1], 5))
-
-    self.assertEqual(
-        0,
-        reward.log_absolute_distance([1, 2, 3], [1, 2, 3], 5))
-    self.assertEqual(
-        log_diff(1, 5) / 3,  # divided by target length.
-        reward.log_absolute_distance([1, 2, 4], [1, 2, 3], 5))
-    self.assertEqual(
-        log_diff(1, 5) / 3,
-        reward.log_absolute_distance([1, 2, 2], [1, 2, 3], 5))
-    self.assertEqual(
-        log_diff(3, 5) / 3,  # max_dist
-        reward.log_absolute_distance([1, 2], [1, 2, 3], 5))
-    self.assertEqual(
-        log_diff(3, 5) / 3,  # max_dist
-        reward.log_absolute_distance([1, 2, 3, 4], [1, 2, 3], 5))
-    # Add log differences for each position.
-    self.assertEqual(
-        (log_diff(2, 5) + log_diff(2, 5) + log_diff(1, 5)) / 3,
-        reward.log_absolute_distance([4, 4, 4], [1, 2, 3], 5))
-
-  def testAbsoluteDistanceReward(self):
-    self.assertEqual(
-        1,
-        reward.absolute_distance_reward([1, 2, 3], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - 1 / (5 * 3.),  # 1 - distance / (base * target_len)
-        reward.absolute_distance_reward([1, 2, 4], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - 1 / (5 * 3.),
-        reward.absolute_distance_reward([1, 2, 2], [1, 2, 3], 5))
-    self.assertTrue(np.isclose(
-        1 - 5 / (5 * 3.),
-        reward.absolute_distance_reward([1, 2], [1, 2, 3], 5)))
-    self.assertTrue(np.isclose(
-        1 - 5 / (5 * 3.),
-        reward.absolute_distance_reward([1, 2, 3, 4], [1, 2, 3], 5)))
-    # Add absolute differences for each position.
-    self.assertEqual(
-        1 - (3 + 2 + 1) / (5 * 3.),
-        reward.absolute_distance_reward([4, 4, 4], [1, 2, 3], 5))
-    self.assertEqual(
-        1,
-        reward.absolute_distance_reward([], [], 5))
-
-  def testAbsoluteModDistanceReward(self):
-    self.assertEqual(
-        1,
-        reward.absolute_mod_distance_reward([1, 2, 3], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - 1 / (5 * 3.),  # 1 - distance / (base * target_len)
-        reward.absolute_mod_distance_reward([1, 2, 4], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - 1 / (5 * 3.),
-        reward.absolute_mod_distance_reward([1, 2, 2], [1, 2, 3], 5))
-    self.assertTrue(np.isclose(
-        1 - 5 / (5 * 3.),
-        reward.absolute_mod_distance_reward([1, 2], [1, 2, 3], 5)))
-    self.assertTrue(np.isclose(
-        1 - 5 / (5 * 3.),
-        reward.absolute_mod_distance_reward([1, 2, 3, 4], [1, 2, 3], 5)))
-    # Add modular differences for each position.
-    self.assertTrue(np.isclose(
-        1 - (2 + 2 + 1) / (5 * 3.),
-        reward.absolute_mod_distance_reward([4, 4, 4], [1, 2, 3], 5)))
-    self.assertTrue(np.isclose(
-        1 - (1 + 2 + 2) / (5 * 3.),
-        reward.absolute_mod_distance_reward([0, 1, 2], [4, 4, 4], 5)))
-    self.assertEqual(
-        1,
-        reward.absolute_mod_distance_reward([], [], 5))
-
-  def testAbsoluteLogDistanceReward(self):
-    def log_diff(diff, base):
-      return log(diff + 1) / log(base // 2 + 2)
-
-    self.assertEqual(
-        1,
-        reward.absolute_log_distance_reward([1, 2, 3], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - log_diff(1, 5) / 3,  # divided by target length.
-        reward.absolute_log_distance_reward([1, 2, 4], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - log_diff(1, 5) / 3,
-        reward.absolute_log_distance_reward([1, 2, 2], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - log_diff(3, 5) / 3,  # max_dist
-        reward.absolute_log_distance_reward([1, 2], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - log_diff(3, 5) / 3,  # max_dist
-        reward.absolute_log_distance_reward([1, 2, 3, 4], [1, 2, 3], 5))
-    # Add log differences for each position.
-    self.assertEqual(
-        1 - (log_diff(2, 5) + log_diff(2, 5) + log_diff(1, 5)) / 3,
-        reward.absolute_log_distance_reward([4, 4, 4], [1, 2, 3], 5))
-    self.assertEqual(
-        1 - (log_diff(1, 5) + log_diff(2, 5) + log_diff(2, 5)) / 3,
-        reward.absolute_log_distance_reward([0, 1, 2], [4, 4, 4], 5))
-    self.assertEqual(
-        1,
-        reward.absolute_log_distance_reward([], [], 5))
-
-  def testDeltaRewardManager(self):
-    reward_manager = reward.DeltaRewardManager(
-        [1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
-    self.assertEqual(-3, reward_manager([1]))
-    self.assertEqual(0, reward_manager([1]))
-    self.assertEqual(4 / 5., reward_manager([1, 3]))
-    self.assertEqual(-4 / 5, reward_manager([1]))
-    self.assertEqual(3, reward_manager([1, 2, 3, 4]))
-    self.assertEqual(-1, reward_manager([1, 2, 3]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4, 3]))
-    self.assertEqual(-1, reward_manager([1, 2, 3, 4, 3, 2]))
-    self.assertEqual(2, reward_manager([1, 2, 3, 4]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
-
-  def testFloorRewardMananger(self):
-    reward_manager = reward.FloorRewardManager(
-        [1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
-    self.assertEqual(1, reward_manager([1]))
-    self.assertEqual(0, reward_manager([1]))
-    self.assertEqual(4 / 5., reward_manager([1, 3]))
-    self.assertEqual(0, reward_manager([1]))
-    self.assertEqual(1 / 5., reward_manager([1, 2]))
-    self.assertEqual(0, reward_manager([0, 1]))
-    self.assertEqual(0, reward_manager([]))
-    self.assertEqual(0, reward_manager([1, 2]))
-    self.assertEqual(2, reward_manager([1, 2, 3, 4]))
-    self.assertEqual(0, reward_manager([1, 2, 3]))
-    self.assertEqual(-1, reward_manager([1, 2, 3, 4, 3]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4, 3, 2]))
-    self.assertEqual(1, reward_manager([1, 2, 3, 4]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
-
-    reward_manager = reward.FloorRewardManager(
-        [1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
-    self.assertEqual(1, reward_manager([1]))
-    self.assertEqual(-1, reward_manager([1, 0, 0, 0, 0, 0]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4, 0, 0]))
-    self.assertEqual(0, reward_manager([1, 2, 3, 4, 0]))
-    self.assertEqual(1, reward_manager([]))
-    self.assertEqual(0, reward_manager([]))
-    self.assertEqual(0, reward_manager([1]))
-    self.assertEqual(1, reward_manager([1, 2]))
-    self.assertEqual(-1, reward_manager([1, 2, 3, 4, 0, 0]))
-    self.assertEqual(0, reward_manager([1, 1, 1, 1, 1]))
-    self.assertEqual(1 + 2, reward_manager([1, 2, 3, 4]))
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/research/brain_coder/common/rollout.py
+++ b/research/brain_coder/common/rollout.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Utilities related to computing training batches from episode rollouts.
-
-Implementations here are based on code from Open AI:
-https://github.com/openai/universe-starter-agent/blob/master/a3c.py.
-"""
-
-from collections import namedtuple
-import numpy as np
-import scipy.signal
-
-from common import utils  # brain coder
-
-
-class Rollout(object):
-  """Holds a rollout for an episode.
-
-  A rollout is a record of the states observed in some environment and actions
-  taken by the agent to arrive at those states. Other information includes
-  rewards received after each action, values estimated for each state, whether
-  the rollout concluded the episode, and total reward received. Everything
-  should be given in time order.
-
-  At each time t, the agent sees state s_t, takes action a_t, and then receives
-  reward r_t. The agent may optionally estimate a state value V(s_t) for each
-  state.
-
-  For an episode of length T:
-  states = [s_0, ..., s_(T-1)]
-  actions = [a_0, ..., a_(T-1)]
-  rewards = [r_0, ..., r_(T-1)]
-  values = [V(s_0), ..., V(s_(T-1))]
-
-  Note that there is an extra state s_T observed after taking action a_(T-1),
-  but this is not included in the rollout.
-
-  Rollouts have a `terminated` attribute which is True when the rollout is
-  "finalized", i.e. it holds a full episode. terminated will be False when
-  time steps are still being added to it.
-  """
-
-  def __init__(self):
-    self.states = []
-    self.actions = []
-    self.rewards = []
-    self.values = []
-    self.total_reward = 0.0
-    self.terminated = False
-
-  def add(self, state, action, reward, value=0.0, terminated=False):
-    """Add the next timestep to this rollout.
-
-    Args:
-      state: The state observed at the start of this timestep.
-      action: The action taken after observing the given state.
-      reward: The reward received for taking the given action.
-      value: The value estimated for the given state.
-      terminated: Whether this timestep ends the episode.
-
-    Raises:
-      ValueError: If self.terminated is already True, meaning that the episode
-          has already ended.
-    """
-    if self.terminated:
-      raise ValueError(
-          'Trying to add timestep to an already terminal rollout.')
-    self.states += [state]
-    self.actions += [action]
-    self.rewards += [reward]
-    self.values += [value]
-    self.terminated = terminated
-    self.total_reward += reward
-
-  def add_many(self, states, actions, rewards, values=None, terminated=False):
-    """Add many timesteps to this rollout.
-
-    Arguments are the same as `add`, but are lists of equal size.
-
-    Args:
-      states: The states observed.
-      actions: The actions taken.
-      rewards: The rewards received.
-      values: The values estimated for the given states.
-      terminated: Whether this sequence ends the episode.
-
-    Raises:
-      ValueError: If the lengths of all the input lists are not equal.
-      ValueError: If self.terminated is already True, meaning that the episode
-          has already ended.
-    """
-    if len(states) != len(actions):
-      raise ValueError(
-          'Number of states and actions must be the same. Got %d states and '
-          '%d actions' % (len(states), len(actions)))
-    if len(states) != len(rewards):
-      raise ValueError(
-          'Number of states and rewards must be the same. Got %d states and '
-          '%d rewards' % (len(states), len(rewards)))
-    if values is not None and len(states) != len(values):
-      raise ValueError(
-          'Number of states and values must be the same. Got %d states and '
-          '%d values' % (len(states), len(values)))
-    if self.terminated:
-      raise ValueError(
-          'Trying to add timesteps to an already terminal rollout.')
-    self.states += states
-    self.actions += actions
-    self.rewards += rewards
-    self.values += values if values is not None else [0.0] * len(states)
-    self.terminated = terminated
-    self.total_reward += sum(rewards)
-
-  def extend(self, other):
-    """Append another rollout to this rollout."""
-    assert not self.terminated
-    self.states.extend(other.states)
-    self.actions.extend(other.actions)
-    self.rewards.extend(other.rewards)
-    self.values.extend(other.values)
-    self.terminated = other.terminated
-    self.total_reward += other.total_reward
-
-
-def discount(x, gamma):
-  """Computes the discounted sum of future values for every position in x.
-
-  Typical use is turning a sequence of rewards into the return (discounted sum
-  of rewards) at each timestep. See the definitions for return and REINFORCE
-  in section 3 of https://arxiv.org/pdf/1602.01783.pdf.
-
-  Writing g^k for gamma ** k, the input [x_0, ..., x_N] is mapped to
-  [x_0 + g^1 * x_1 + g^2 * x_2 + ... g^N * x_N,
-   x_1 + g^1 * x_2 + g^2 * x_3 + ... g^(N-1) * x_N,
-   ...,
-   x_(N-1) + g^1 * x_N,
-   x_N]
-
-  Args:
-    x: List of numbers [x_0, ..., x_N].
-    gamma: Float between 0 and 1 (inclusive). This is the discount factor.
-
-  Returns:
-    List of discounted sums.
-  """
-  # Running the recurrence y[n] = x_rev[n] + gamma * y[n - 1] over the
-  # time-reversed input accumulates the sums from the end of the sequence.
-  reversed_x = x[::-1]
-  numerator = [1]
-  denominator = [1, -gamma]
-  reversed_sums = scipy.signal.lfilter(
-      numerator, denominator, reversed_x, axis=0)
-  # Flip back so that index t holds the sum starting at timestep t.
-  return reversed_sums[::-1]
-
-
-def discounted_advantage_and_rewards(rewards, values, gamma, lambda_=1.0):
-  """Computes returns and generalized advantages for a single episode.
-
-  For an episode of length T, rewards = [r_0, ..., r_(T-1)], where r_t is
-  observed after taking action a_t at state s_t. The final state s_T yields no
-  reward because no action is taken there.
-
-  The return R_t is the discounted sum of future rewards from time t:
-  R_t = r_t + gamma * r_(t+1) + gamma**2 * r_(t+2) + ...
-        + gamma**(T-1-t) * r_(T-1)
-
-  With lambda_ = 1 the advantage is A(a_t, s_t) = R_t - V(s_t), where V(s_t)
-  comes from `values`. Returns are needed by all REINFORCE algorithms; the
-  advantage is used by the advantage actor critic variant. See algorithm S3 in
-  https://arxiv.org/pdf/1602.01783.pdf.
-
-  `lambda_` controls the bias-variance tradeoff of the advantage estimate. See
-  "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438.
-  lambda_ = 1 reduces to the regular advantage; 0 <= lambda_ < 1 trades off
-  variance for bias, with lambda_ = 0 being the most biased.
-
-  Bootstrapping: if the episode did not end in a terminal state, the return
-  cannot be computed from rewards alone. It can be estimated from the value of
-  the last state s_T, passed as one extra item at the end of `values`:
-  values = [V(s_0), ..., V(s_(T-1)), V(s_T)].
-
-  Args:
-    rewards: List of observed rewards [r_0, ..., r_(T-1)].
-    values: List of estimated values [V(s_0), ..., V(s_(T-1))] with an optional
-        extra V(s_T) item.
-    gamma: Discount factor. Number between 0 and 1. 1 means no discount.
-        If not 1, gamma is typically near 1, like 0.99.
-    lambda_: Bias-variance tradeoff factor. Between 0 and 1.
-
-  Returns:
-    empirical_values: Returns at each timestep.
-    generalized_advantage: Advantages at each timestep.
-
-  Raises:
-    ValueError: If shapes of `rewards` and `values` are not rank 1.
-    ValueError: If len(values) not in (len(rewards), len(rewards) + 1).
-  """
-  rewards = np.asarray(rewards, dtype=np.float32)
-  values = np.asarray(values, dtype=np.float32)
-  if rewards.ndim != 1:
-    raise ValueError('Single episode only. rewards must be rank 1.')
-  if values.ndim != 1:
-    raise ValueError('Single episode only. values must be rank 1.')
-
-  num_extra_values = len(values) - len(rewards)
-  if num_extra_values not in (0, 1):
-    raise ValueError('values should contain the same number of items or one '
-                     'more item than rewards')
-
-  if num_extra_values == 1:
-    # Bootstrapping: the trailing value stands in for the rewards that would
-    # follow the final state. It is discounted like a reward and then dropped
-    # from the result.
-    bootstrapped_rewards = np.append(rewards, values[-1])
-    empirical_values = discount(bootstrapped_rewards, gamma)[:-1]
-  else:
-    # No bootstrapping: pad with a zero value for the state after the last
-    # action so that values[1:] lines up with rewards below.
-    values = np.append(values, 0)
-    empirical_values = discount(rewards, gamma)
-
-  # One-step residuals: r_t + gamma * V(s_(t+1)) - V(s_t).
-  td_residuals = rewards + gamma * values[1:] - values[:-1]
-  generalized_advantage = discount(td_residuals, gamma * lambda_)
-
-  # empirical_values is the discounted sum of rewards into the future.
-  # generalized_advantage is the target for each policy update.
-  return empirical_values, generalized_advantage
-
-
-"""Batch holds a minibatch of episodes.
-
-Let bi = batch_index, i.e. the index of each episode in the minibatch.
-Let t = time.
-
-Attributes:
-  states: States for each timestep in each episode. Indexed by states[bi, t].
-  actions: Actions for each timestep in each episode. Indexed by actions[bi, t].
-  discounted_adv: Advantages (computed by discounted_advantage_and_rewards)
-      for each timestep in each episode. Indexed by discounted_adv[bi, t].
-  discounted_r: Returns (discounted sum of rewards computed by
-      discounted_advantage_and_rewards) for each timestep in each episode.
-      Indexed by discounted_r[bi, t].
-  total_rewards: Total reward for each episode, i.e. sum of rewards across all
-      timesteps (not discounted). Indexed by total_rewards[bi].
-  episode_lengths: Number of timesteps in each episode. If an episode has
-      N actions, N rewards, and N states, then its length is N. Indexed by
-      episode_lengths[bi].
-  batch_size: Number of episodes in this minibatch. An integer.
-  max_time: Maximum episode length in the batch. An integer.
-"""  # pylint: disable=pointless-string-statement
-# Field order is part of the type: positional construction and unpacking of
-# Batch follow the order listed here.
-Batch = namedtuple(
-    'Batch',
-    [
-        'states',
-        'actions',
-        'discounted_adv',
-        'discounted_r',
-        'total_rewards',
-        'episode_lengths',
-        'batch_size',
-        'max_time',
-    ])
-
-
-def process_rollouts(rollouts, gamma, lambda_=1.0):
-  """Convert a batch of rollouts into tensors ready to be fed into a model.
-
-  Lists from each episode are stacked into 2D tensors and padded with 0s up to
-  the maximum timestep in the batch.
-
-  Args:
-    rollouts: A non-empty list of Rollout instances.
-    gamma: The discount factor. A number between 0 and 1 (inclusive). See gamma
-        argument in discounted_advantage_and_rewards.
-    lambda_: See lambda_ argument in discounted_advantage_and_rewards.
-
-  Returns:
-    Batch instance. states, actions, discounted_adv, and discounted_r are
-    numpy arrays with shape (batch_size, max_episode_length). episode_lengths
-    is a list of ints. total_rewards is a list of floats (total reward in each
-    episode). batch_size and max_time are ints.
-
-  Raises:
-    ValueError: If `rollouts` is empty.
-    ValueError: If any of the rollouts are not terminal.
-  """
-  batch_size = len(rollouts)
-  if batch_size == 0:
-    # Fail with a clear message instead of the generic error max() would
-    # raise on an empty list of episode lengths below.
-    raise ValueError('Cannot process an empty list of rollouts.')
-  for ro in rollouts:
-    if not ro.terminated:
-      raise ValueError('Can only process terminal rollouts.')
-
-  episode_lengths = [len(ro.states) for ro in rollouts]
-  max_time = max(episode_lengths)
-
-  states = utils.stack_pad([ro.states for ro in rollouts], 0, max_time)
-  actions = utils.stack_pad([ro.actions for ro in rollouts], 0, max_time)
-
-  # One (returns, advantages) pair per episode, in batch order.
-  returns_and_advantages = [
-      discounted_advantage_and_rewards(ro.rewards, ro.values, gamma, lambda_)
-      for ro in rollouts]
-  discounted_rewards = utils.stack_pad(
-      [disc_r for disc_r, _ in returns_and_advantages], 0, max_time)
-  discounted_adv = utils.stack_pad(
-      [disc_adv for _, disc_adv in returns_and_advantages], 0, max_time)
-
-  # Undiscounted sum of rewards per episode.
-  total_rewards = [sum(ro.rewards) for ro in rollouts]
-
-  return Batch(states=states,
-               actions=actions,
-               discounted_adv=discounted_adv,
-               discounted_r=discounted_rewards,
-               total_rewards=total_rewards,
-               episode_lengths=episode_lengths,
-               batch_size=batch_size,
-               max_time=max_time)
--- a/research/brain_coder/common/rollout_test.py
+++ b/research/brain_coder/common/rollout_test.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Tests for common.rollout."""
-
-import numpy as np
-import tensorflow as tf
-
-from common import rollout as rollout_lib  # brain coder
-
-
-class RolloutTest(tf.test.TestCase):
-  """Tests for discount, discounted_advantage_and_rewards, process_rollouts."""
-
-  def MakeRollout(self, states, actions, rewards, values=None, terminated=True):
-    """Builds a Rollout filled through a single `add_many` call."""
-    rollout = rollout_lib.Rollout()
-    rollout.add_many(
-        states=states, actions=actions, rewards=rewards, values=values,
-        terminated=terminated)
-    return rollout
-
-  def testDiscount(self):
-    # Expected sums for rewards [0, 1, 0, 0, 1] with gamma = 0.5: the last
-    # reward contributes 1 / 2**n at n steps before the end, and the reward at
-    # index 1 adds to the first two entries only.
-    discounted = np.array([1.0 / 2 ** n for n in range(4, -1, -1)])
-    discounted[:2] += [1.0 / 2 ** n for n in range(1, -1, -1)]
-
-    # The same result is expected for list input and for numpy array input.
-    self.assertTrue(np.array_equal(
-        rollout_lib.discount([0.0, 1.0, 0.0, 0.0, 1.0], 0.50),
-        discounted))
-    self.assertTrue(np.array_equal(
-        rollout_lib.discount(np.array([0.0, 1.0, 0.0, 0.0, 1.0]), 0.50),
-        discounted))
-
-  def testDiscountedAdvantageAndRewards(self):
-    # lambda=1, No bootstrapping.
-    # `values` has one entry per reward, so the advantage is return - value.
-    values = [0.1, 0.5, 0.5, 0.25]
-    (empirical_values,
-     generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
-         [0.0, 0.0, 0.0, 1.0],
-         values,
-         gamma=0.75,
-         lambda_=1.0)
-    expected_discounted_r = (
-        np.array([1.0 * 0.75 ** n for n in range(3, -1, -1)]))
-    expected_adv = expected_discounted_r - values
-    self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
-    self.assertTrue(np.allclose(generalized_advantage, expected_adv))
-
-    # lambda=1, With bootstrapping.
-    # The extra trailing value (0.75) is discounted into every return.
-    values = [0.1, 0.5, 0.5, 0.25, 0.75]
-    (empirical_values,
-     generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
-         [0.0, 0.0, 0.0, 1.0],
-         values,
-         gamma=0.75,
-         lambda_=1.0)
-    expected_discounted_r = (
-        np.array([0.75 * 0.75 ** n for n in range(4, 0, -1)])
-        + np.array([1.0 * 0.75 ** n for n in range(3, -1, -1)]))
-    expected_adv = expected_discounted_r - values[:-1]
-    self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
-    self.assertTrue(np.allclose(generalized_advantage, expected_adv))
-
-    # lambda=0.5, With bootstrapping.
-    values = [0.1, 0.5, 0.5, 0.25, 0.75]
-    rewards = [0.0, 0.0, 0.0, 1.0]
-    l = 0.5  # lambda
-    g = 0.75  # gamma
-    (empirical_values,
-     generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
-         rewards,
-         values,
-         gamma=g,
-         lambda_=l)
-    expected_discounted_r = (
-        np.array([0.75 * g ** n for n in range(4, 0, -1)])
-        + np.array([1.0 * g ** n for n in range(3, -1, -1)]))
-    # Reference advantages via the backward recursion
-    # adv[t] = delta_t + g * l * adv[t + 1], starting from adv[4] = 0.
-    expected_adv = [0.0] * len(values)
-    for t in range(3, -1, -1):
-      delta_t = rewards[t] + g * values[t + 1] - values[t]
-      expected_adv[t] = delta_t + g * l * expected_adv[t + 1]
-    # Drop the extra slot that only seeded the recursion.
-    expected_adv = expected_adv[:-1]
-    self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
-    self.assertTrue(np.allclose(generalized_advantage, expected_adv))
-
-  def testProcessRollouts(self):
-    # Two terminal episodes of different lengths (3 and 1), so the shorter one
-    # must be zero-padded up to max_time = 3.
-    g = 0.95
-    rollouts = [
-        self.MakeRollout(
-            states=[3, 6, 9],
-            actions=[1, 2, 3],
-            rewards=[1.0, -1.0, 0.5],
-            values=[0.5, 0.5, 0.1]),
-        self.MakeRollout(
-            states=[10],
-            actions=[5],
-            rewards=[1.0],
-            values=[0.5])]
-    batch = rollout_lib.process_rollouts(rollouts, gamma=g)
-
-    self.assertEqual(2, batch.batch_size)
-    self.assertEqual(3, batch.max_time)
-    self.assertEqual([3, 1], batch.episode_lengths)
-    self.assertEqual([0.5, 1.0], batch.total_rewards)
-    self.assertEqual(
-        [[3, 6, 9], [10, 0, 0]],
-        batch.states.tolist())
-    self.assertEqual(
-        [[1, 2, 3], [5, 0, 0]],
-        batch.actions.tolist())
-
-    # Hand-computed returns; padded positions are expected to be 0.
-    rew1, rew2 = rollouts[0].rewards, rollouts[1].rewards
-    expected_discounted_rewards = [
-        [rew1[0] + g * rew1[1] + g * g * rew1[2],
-         rew1[1] + g * rew1[2],
-         rew1[2]],
-        [rew2[0], 0.0, 0.0]]
-    # process_rollouts is called with its default lambda_ (1.0), so the
-    # advantage is return minus value.
-    expected_advantages = [
-        [dr - v
-         for dr, v
-         in zip(expected_discounted_rewards[0], rollouts[0].values)],
-        [expected_discounted_rewards[1][0] - rollouts[1].values[0], 0.0, 0.0]]
-    self.assertTrue(
-        np.allclose(expected_discounted_rewards, batch.discounted_r))
-    self.assertTrue(
-        np.allclose(expected_advantages, batch.discounted_adv))
-
-
-# Call tf.test.main() only when this file is executed as a script, not when it
-# is imported.
-if __name__ == '__main__':
-  tf.test.main()
--- a/research/brain_coder/common/schedules.py
+++ b/research/brain_coder/common/schedules.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Schedule functions for controlling hparams over time."""
-
-from abc import ABCMeta
-from abc import abstractmethod
-import math
-
-from common import config_lib  # brain coder
-
-
-# The ABCMeta metaclass is applied through a base class created by calling
-# ABCMeta directly. A class-level `__metaclass__` attribute is ignored on
-# Python 3, which would leave the abstract methods unenforced there; this
-# spelling works on both Python 2 and Python 3.
-class Schedule(ABCMeta('_ScheduleBase', (object,), {})):
-  """Schedule is a function which sets a hyperparameter's value over time.
-
-  For example, a schedule can be used to decay an hparams, or oscillate it over
-  time.
-
-  This object is constructed with an instance of config_lib.Config (will be
-  specific to each class implementation). For example if this is a decay
-  schedule, the config may specify the rate of decay and decay start time. Then
-  the object instance is called like a function, mapping global step (an integer
-  counting how many calls to the train op have been made) to the hparam value.
-
-  Properties of a schedule function f(t):
-  0) Domain of t is the non-negative integers (t may be 0).
-  1) Range of f is the reals.
-  2) Schedule functions can assume that they will be called in time order. This
-     allows schedules to be stateful.
-  3) Schedule functions should be deterministic. Two schedule instances with the
-     same config must always give the same value for each t, and regardless of
-     what t's it was previously called on. Users may call f(t) on arbitrary
-     (positive) time jumps. Essentially, multiple schedule instances used in
-     replica training will behave the same.
-  4) Duplicate successive calls on the same time are allowed.
-
-  Subclasses must override both `__init__` and `__call__`; a subclass that
-  leaves either one abstract cannot be instantiated.
-  """
-
-  @abstractmethod
-  def __init__(self, config):
-    """Construct this schedule with a config specific to each class impl.
-
-    Args:
-      config: An instance of config_lib.Config.
-    """
-
-  @abstractmethod
-  def __call__(self, global_step):
-    """Map `global_step` to a value.
-
-    `global_step` is an integer counting how many calls to the train op have
-    been made across all replicas (hence why it is global). Implementations
-    may assume calls to be made in time order, i.e. `global_step` now >=
-    previous `global_step` values.
-
-    Args:
-      global_step: Non-negative integer.
-
-    Returns:
-      Hparam value at this step. A number.
-    """
-
-
-class ConstSchedule(Schedule):
-  """Schedule that yields the same value at every step.
-
-  config:
-    const: The value returned for every global step.
-
-  f(t) = const.
-  """
-
-  def __init__(self, config):
-    """Stores `config.const` as the value this schedule always returns."""
-    super(ConstSchedule, self).__init__(config)
-    self.const = config.const
-
-  def __call__(self, global_step):
-    """Returns the constant; `global_step` does not affect the result."""
-    del global_step  # Unused.
-    return self.const
-
-
-class LinearDecaySchedule(Schedule):
-  """Straight-line transition from an initial value to a final value.
-
-  config:
-    initial: Value held up to and including start_time.
-    final: Value held after end_time.
-    start_time: Step at which the decay begins.
-    end_time: Step at which the decay is complete.
-
-  Between start_time and end_time f(t) is linear with slope
-  (final - initial) / (end_time - start_time). f(t) = initial when
-  t <= start_time and f(t) = final when t >= end_time.
-
-  If start_time == end_time, this becomes a step function.
-  """
-
-  def __init__(self, config):
-    """Reads the decay endpoints from `config` and precomputes the slope.
-
-    Raises:
-      ValueError: If config.end_time is smaller than config.start_time.
-    """
-    super(LinearDecaySchedule, self).__init__(config)
-    self.initial = config.initial
-    self.final = config.final
-    self.start_time = config.start_time
-    self.end_time = config.end_time
-
-    if self.end_time < self.start_time:
-      raise ValueError('start_time must be before end_time.')
-
-    # Linear interpolation.
-    self._time_diff = float(self.end_time - self.start_time)
-    self._diff = float(self.final - self.initial)
-    if self._time_diff > 0:
-      self._slope = self._diff / self._time_diff
-    else:
-      # Zero-width decay window: __call__ never reaches the interpolation
-      # branch in this case, so the slope is only a placeholder.
-      self._slope = float('inf')
-
-  def __call__(self, global_step):
-    """Returns the interpolated value at `global_step`."""
-    if global_step <= self.start_time:
-      return self.initial
-    if global_step > self.end_time:
-      return self.final
-    steps_into_decay = global_step - self.start_time
-    return self.initial + steps_into_decay * self._slope
-
-
-class ExponentialDecaySchedule(Schedule):
-  """Exponential transition from an initial value to a final value.
-
-  See https://en.wikipedia.org/wiki/Exponential_decay.
-
-  Useful for decaying over orders of magnitude, for example a learning rate
-  going from 1e-2 to 1e-6: the exponent changes linearly over time.
-
-  config:
-    initial: Value held up to start_time. Must be positive.
-    final: Value held after end_time. Must be positive.
-    start_time: Step at which the decay begins.
-    end_time: Step at which the decay is complete.
-
-  Between start_time and end_time f(t) is an exponential whose rate and
-  amplitude are chosen so that f(start_time) = initial and
-  f(end_time) = final. f(t) is constant for t < start_time or t > end_time.
-
-  If start_time == end_time, this becomes a step function.
-  """
-
-  def __init__(self, config):
-    """Builds the log-space linear schedule that backs this schedule.
-
-    Raises:
-      ValueError: If config.initial or config.final is not positive.
-    """
-    super(ExponentialDecaySchedule, self).__init__(config)
-    self.initial = config.initial
-    self.final = config.final
-    self.start_time = config.start_time
-    self.end_time = config.end_time
-
-    if self.initial <= 0 or self.final <= 0:
-      raise ValueError('initial and final must be positive numbers.')
-
-    # Interpolating log(value) linearly and exponentiating the result gives
-    # an exponential curve between the two endpoints.
-    log_space_config = config_lib.Config(
-        initial=math.log(self.initial),
-        final=math.log(self.final),
-        start_time=self.start_time,
-        end_time=self.end_time)
-    self._linear_fn = LinearDecaySchedule(log_space_config)
-
-  def __call__(self, global_step):
-    """Returns exp of the log-space linear schedule at `global_step`."""
-    log_value = self._linear_fn(global_step)
-    return math.exp(log_value)
-
-
-class SmootherstepDecaySchedule(Schedule):
-  """Smootherstep transition from an initial value to a final value.
-
-  A sigmoid-like curve that changes more gently near both ends than the
-  linear and exponential decays, hence the name.
-  See https://en.wikipedia.org/wiki/Smoothstep.
-
-  config:
-    initial: Value held up to and including start_time.
-    final: Value held after end_time.
-    start_time: Step at which the decay begins.
-    end_time: Step at which the decay is complete.
-
-  f(t) is fully defined here:
-  https://en.wikipedia.org/wiki/Smoothstep#Variations.
-
-  f(t) is smooth, as in its first-derivative exists everywhere.
-  """
-
-  def __init__(self, config):
-    """Reads the decay endpoints from `config`.
-
-    Raises:
-      ValueError: If config.end_time is smaller than config.start_time.
-    """
-    super(SmootherstepDecaySchedule, self).__init__(config)
-    self.initial = config.initial
-    self.final = config.final
-    self.start_time = config.start_time
-    self.end_time = config.end_time
-
-    if self.end_time < self.start_time:
-      raise ValueError('start_time must be before end_time.')
-
-    self._time_diff = float(self.end_time - self.start_time)
-    self._diff = float(self.final - self.initial)
-
-  def __call__(self, global_step):
-    """Returns the smootherstep-interpolated value at `global_step`."""
-    if global_step <= self.start_time:
-      return self.initial
-    if global_step > self.end_time:
-      return self.final
-
-    # Fraction of the decay window that has elapsed, in (0, 1].
-    x = (global_step - self.start_time) / self._time_diff
-
-    # Smootherstep polynomial 6x^5 - 15x^4 + 10x^3 in nested form.
-    blend = x * x * x * (x * (x * 6 - 15) + 10)
-    return self.initial + blend * self._diff
-
-
-class HardOscillatorSchedule(Schedule):
-  """Oscillator alternating between high and low with linear ramps.
-
-  config:
-    high: Max value of the oscillator. Value at constant plateaus.
-    low: Min value of the oscillator. Value at constant valleys.
-    start_time: Global step when oscillation starts. Constant before this.
-    period: Width of one oscillation, i.e. number of steps over which the
-        oscillation takes place.
-    transition_fraction: Fraction of the period spent moving between high and
-        low. Half of it is spent falling and half rising; the rest of the
-        period is split evenly between staying at low and staying at high.
-        1.0 means the whole period is spent rising and falling. 0.0 means the
-        function jumps instantaneously between high and low.
-
-  f(t) = high when t < start_time.
-  f(t) is periodic when t >= start_time, with f(t + period) = f(t).
-  Each period starts at high and descends linearly to low, stays at low until
-  the middle of the period, then rises linearly back to high and stays there
-  until the period repeats.
-
-  Note: when transition_fraction is 0, f starts the period low and ends high.
-  """
-
-  def __init__(self, config):
-    """Reads the oscillator parameters from `config` and precomputes the slope.
-
-    Raises:
-      ValueError: If config.transition_fraction is outside [0, 1].
-      ValueError: If config.period is not positive.
-    """
-    super(HardOscillatorSchedule, self).__init__(config)
-    self.high = config.high
-    self.low = config.low
-    self.start_time = config.start_time
-    self.period = float(config.period)
-    self.transition_fraction = config.transition_fraction
-    self.half_transition_fraction = config.transition_fraction / 2.0
-
-    if self.transition_fraction < 0 or self.transition_fraction > 1.0:
-      raise ValueError('transition_fraction must be between 0 and 1.0')
-    if self.period <= 0:
-      raise ValueError('period must be positive')
-
-    # Slope is expressed per unit of period position (a fraction of the
-    # period), not per global step.
-    if self.half_transition_fraction > 0:
-      amplitude = float(self.high - self.low)
-      self._slope = amplitude / self.half_transition_fraction
-    else:
-      self._slope = float('inf')
-
-  def __call__(self, global_step):
-    """Returns the oscillator value at `global_step`."""
-    if global_step < self.start_time:
-      return self.high
-
-    # Position inside the current period, in [0, 1).
-    phase = ((global_step - self.start_time) / self.period) % 1.0
-
-    # The first half of the period descends, the second half ascends.
-    ascending = phase >= 0.5
-    if ascending:
-      phase -= 0.5
-
-    if phase >= self.half_transition_fraction:
-      # The ramp is over: hold the value the ramp was heading towards.
-      return self.high if ascending else self.low
-    if ascending:
-      return self.low + phase * self._slope
-    return self.high - phase * self._slope
-
-
-# Maps the `fn` name given in a schedule config to the Schedule class that
-# make_schedule instantiates for it.
-_NAME_TO_CONFIG = {
-    'const': ConstSchedule,
-    'linear_decay': LinearDecaySchedule,
-    'exp_decay': ExponentialDecaySchedule,
-    'smooth_decay': SmootherstepDecaySchedule,
-    'hard_osc': HardOscillatorSchedule,
-}
-
-
-def make_schedule(config):
-  """Creates the Schedule implementation named by `config.fn`.
-
-  The class is looked up in `_NAME_TO_CONFIG` and constructed with `config`
-  itself, so `config` must also carry the options that class reads.
-
-  Args:
-    config: Config with a `fn` option that specifies which Schedule
-        implementation to use. `config` is passed into the constructor.
-
-  Returns:
-    A Schedule impl instance.
-
-  Raises:
-    KeyError: If `config.fn` is not a key of `_NAME_TO_CONFIG`.
-  """
-  return _NAME_TO_CONFIG[config.fn](config)
--- a/research/brain_coder/common/schedules_test.py
+++ b/research/brain_coder/common/schedules_test.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Tests for common.schedules."""
-
-from math import exp
-from math import sqrt
-import numpy as np
-from six.moves import xrange
-import tensorflow as tf
-
-from common import config_lib  # brain coder
-from common import schedules  # brain coder
-
-
-class SchedulesTest(tf.test.TestCase):
-  """Tests each schedule type against tables of (step, expected value)."""
-
-  def ScheduleTestHelper(self, config, schedule_subtype, io_values):
-    """Run common checks for schedules.
-
-    Args:
-      config: Config object which is passed into schedules.make_schedule.
-      schedule_subtype: The expected schedule type to be instantiated.
-      io_values: List of (input, output) pairs. Must be in ascending input
-          order. No duplicate inputs.
-    """
-
-    # Check that make_schedule makes the correct type.
-    f = schedules.make_schedule(config)
-    self.assertTrue(isinstance(f, schedule_subtype))
-
-    # Check that multiple instances returned from make_schedule behave the same.
-    fns = [schedules.make_schedule(config) for _ in xrange(3)]
-
-    # Check that all the inputs map to the right outputs.
-    for i, o in io_values:
-      for f in fns:
-        f_out = f(i)
-        self.assertTrue(
-            np.isclose(o, f_out),
-            'Wrong value at input %d. Expected %s, got %s' % (i, o, f_out))
-
-    # Check that a subset of the io_values are still correct.
-    # A fresh instance is called only at indices 0, 1, 4, 9, ... (plus the last
-    # entry), so it sees jumps in time rather than every step.
-    f = schedules.make_schedule(config)
-    subseq = [io_values[i**2] for i in xrange(int(sqrt(len(io_values))))]
-    if subseq[-1] != io_values[-1]:
-      subseq.append(io_values[-1])
-    for i, o in subseq:
-      f_out = f(i)
-      self.assertTrue(
-          np.isclose(o, f_out),
-          'Wrong value at input %d. Expected %s, got %s' % (i, o, f_out))
-
-    # Check duplicate calls.
-    f = schedules.make_schedule(config)
-    for i, o in io_values:
-      for _ in xrange(3):
-        f_out = f(i)
-        self.assertTrue(
-            np.isclose(o, f_out),
-            'Duplicate calls at input %d are not equal. Expected %s, got %s'
-            % (i, o, f_out))
-
-  def testConstSchedule(self):
-    # The value must be 5 at every step.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='const', const=5),
-        schedules.ConstSchedule,
-        [(0, 5), (1, 5), (10, 5), (20, 5), (100, 5), (1000000, 5)])
-
-  def testLinearDecaySchedule(self):
-    # Decay from 2 to 0 between steps 10 and 20, i.e. -0.2 per step.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='linear_decay', initial=2, final=0, start_time=10,
-                          end_time=20),
-        schedules.LinearDecaySchedule,
-        [(0, 2), (1, 2), (10, 2), (11, 1.8), (15, 1), (19, 0.2), (20, 0),
-         (100000, 0)])
-
-    # Test step function.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='linear_decay', initial=2, final=0, start_time=10,
-                          end_time=10),
-        schedules.LinearDecaySchedule,
-        [(0, 2), (1, 2), (10, 2), (11, 0), (15, 0)])
-
-  def testExponentialDecaySchedule(self):
-    # The exponent goes linearly from -1 to -6 between steps 10 and 20,
-    # i.e. it drops by 1/2 per step.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='exp_decay', initial=exp(-1), final=exp(-6),
-                          start_time=10, end_time=20),
-        schedules.ExponentialDecaySchedule,
-        [(0, exp(-1)), (1, exp(-1)), (10, exp(-1)), (11, exp(-1/2. - 1)),
-         (15, exp(-5/2. - 1)), (19, exp(-9/2. - 1)), (20, exp(-6)),
-         (100000, exp(-6))])
-
-    # Test step function.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='exp_decay', initial=exp(-1), final=exp(-6),
-                          start_time=10, end_time=10),
-        schedules.ExponentialDecaySchedule,
-        [(0, exp(-1)), (1, exp(-1)), (10, exp(-1)), (11, exp(-6)),
-         (15, exp(-6))])
-
-  def testSmootherstepDecaySchedule(self):
-    # Decay from 2 to 0 between steps 10 and 20; the midpoint (step 15) is
-    # expected to be exactly halfway.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='smooth_decay', initial=2, final=0, start_time=10,
-                          end_time=20),
-        schedules.SmootherstepDecaySchedule,
-        [(0, 2), (1, 2), (10, 2), (11, 1.98288), (15, 1), (19, 0.01712),
-         (20, 0), (100000, 0)])
-
-    # Test step function.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='smooth_decay', initial=2, final=0, start_time=10,
-                          end_time=10),
-        schedules.SmootherstepDecaySchedule,
-        [(0, 2), (1, 2), (10, 2), (11, 0), (15, 0)])
-
-  def testHardOscillatorSchedule(self):
-    # Period of 10 steps starting at step 100, with half of each period spent
-    # in transitions (2.5 steps falling, 2.5 steps rising).
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='hard_osc', high=2, low=0, start_time=100,
-                          period=10, transition_fraction=0.5),
-        schedules.HardOscillatorSchedule,
-        [(0, 2), (1, 2), (10, 2), (100, 2), (101, 1.2), (102, 0.4), (103, 0),
-         (104, 0), (105, 0), (106, 0.8), (107, 1.6), (108, 2), (109, 2),
-         (110, 2), (111, 1.2), (112, 0.4), (115, 0), (116, 0.8), (119, 2),
-         (120, 2), (100001, 1.2), (100002, 0.4), (100005, 0), (100006, 0.8),
-         (100010, 2)])
-
-    # Test instantaneous step.
-    self.ScheduleTestHelper(
-        config_lib.Config(fn='hard_osc', high=2, low=0, start_time=100,
-                          period=10, transition_fraction=0),
-        schedules.HardOscillatorSchedule,
-        [(0, 2), (1, 2), (10, 2), (99, 2), (100, 0), (104, 0), (105, 2),
-         (106, 2), (109, 2), (110, 0)])
-
-
-# Call tf.test.main() only when this file is executed as a script, not when it
-# is imported.
-if __name__ == '__main__':
-  tf.test.main()
--- a/research/brain_coder/common/utils.py
+++ b/research/brain_coder/common/utils.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Configuration class."""
-
-import bisect
-from collections import deque
-import cPickle
-import heapq
-import random
-
-from absl import logging
-import numpy as np
-import six
-from six.moves import xrange
-import tensorflow as tf
-
-
-def tuple_to_record(tuple_, record_type):
-  """Builds a `record_type` instance from positional values.
-
-  Values are matched to `record_type.__slots__` in order and passed to the
-  constructor as keyword arguments. If the two differ in length, the extra
-  items of the longer one are ignored.
-
-  Args:
-    tuple_: Iterable of values, ordered like `record_type.__slots__`.
-    record_type: Class that defines `__slots__` and accepts its slot names as
-        keyword arguments.
-
-  Returns:
-    A new `record_type` instance.
-  """
-  values_by_slot = dict(zip(record_type.__slots__, tuple_))
-  return record_type(**values_by_slot)
-
-
-def make_record(type_name, attributes, defaults=None):
-  """Factory for mutable record classes.
-
-  A record acts just like a collections.namedtuple except slots are writable.
-  One exception is that record classes are not equivalent to tuples or other
-  record classes of the same length.
-
-  Note, each call to `make_record` produces a unique type. Two calls will make
-  different types even if `type_name` is the same each time.
-
-  Args:
-    type_name: Name of the record type to create.
-    attributes: List of names of each record attribute. The order of the list
-        is preserved.
-    defaults: (optional) default values for attributes. A dict mapping attribute
-        names to values.
-
-  Returns:
-    A new record type.
-
-  Raises:
-    ValueError: If,
-        `defaults` is not a dict,
-        `attributes` contains duplicate names,
-        `defaults` keys are not contained in `attributes`.
-  """
-  if defaults is None:
-    defaults = {}
-  if not isinstance(defaults, dict):
-    raise ValueError('defaults must be a dict.')
-  attr_set = set(attributes)
-  if len(attr_set) < len(attributes):
-    raise ValueError('No duplicate attributes allowed.')
-  if not set(defaults.keys()).issubset(attr_set):
-    raise ValueError('Default attributes must be given in the attributes list.')
-
-  class RecordClass(object):
-    """A record type.
-
-    Acts like mutable tuple with named slots.
-    """
-    __slots__ = list(attributes)
-    # Copied so later changes to the caller's dict do not affect this type.
-    _defaults = dict(defaults)
-
-    def __init__(self, *args, **kwargs):
-      """Sets slots from defaults, then positional args, then keyword args.
-
-      Later sources override earlier ones.
-
-      Raises:
-        ValueError: If more positional args than slots are given, or if a
-            slot is left without a value.
-      """
-      if len(args) > len(self.__slots__):
-        raise ValueError('Too many arguments. %s has length %d.'
-                         % (type(self).__name__, len(self.__slots__)))
-      for attr, val in self._defaults.items():
-        setattr(self, attr, val)
-      for attr, arg in zip(self.__slots__, args):
-        setattr(self, attr, arg)
-      for attr, val in kwargs.items():
-        setattr(self, attr, val)
-      for attr in self.__slots__:
-        if not hasattr(self, attr):
-          raise ValueError('Required attr "%s" is not set.' % attr)
-
-    def __len__(self):
-      return len(self.__slots__)
-
-    def __iter__(self):
-      for attr in self.__slots__:
-        yield getattr(self, attr)
-
-    def __getitem__(self, index):
-      return getattr(self, self.__slots__[index])
-
-    def __setitem__(self, index, value):
-      return setattr(self, self.__slots__[index], value)
-
-    def __eq__(self, other):
-      # Types must be equal as well as values.
-      return (isinstance(other, type(self))
-              and all(a == b for a, b in zip(self, other)))
-
-    def __ne__(self, other):
-      # Python 2 does not derive != from __eq__; without this, two equal
-      # records would still compare as unequal there.
-      return not self == other
-
-    def __str__(self):
-      return '%s(%s)' % (
-          type(self).__name__,
-          ', '.join(attr + '=' + str(getattr(self, attr))
-                    for attr in self.__slots__))
-
-    def __repr__(self):
-      return str(self)
-
-  RecordClass.__name__ = type_name
-  return RecordClass
-
-
-# Making minibatches.
-def stack_pad(tensors, pad_axes=None, pad_to_lengths=None, dtype=np.float32,
-              pad_value=0):
-  """Stack tensors along 0-th dim and pad them to be the same shape.
-
-  Args:
-    tensors: Any list of iterables (python list, numpy array, etc). Can be 1D
-        or multi-D iterables.
-    pad_axes: An int or list of ints. Axes to pad along.
-    pad_to_lengths: Length in each dimension. If pad_axes was an int, this is an
-        int or None. If pad_axes was a list of ints, this is a list of mixed int
-        and None types with the same length, or None. A None length means the
-        maximum length among the given tensors is used.
-    dtype: Type of output numpy array. Defaults to np.float32.
-    pad_value: Value to use for padding. Defaults to 0.
-
-  Returns:
-    Numpy array containing the tensors stacked along the 0-th dimension and
-        padded along the specified dimensions.
-
-  Raises:
-    ValueError: If the tensors do not have equal shapes along non-padded
-        dimensions.
-  """
-  tensors = [np.asarray(t) for t in tensors]
-  max_lengths = [max(l) for l in zip(*[t.shape for t in tensors])]
-  same_axes = dict(enumerate(max_lengths))
-  if pad_axes is None:
-    pad_axes = []
-  if isinstance(pad_axes, six.integer_types):
-    if pad_to_lengths is not None:
-      max_lengths[pad_axes] = pad_to_lengths
-    del same_axes[pad_axes]
-  else:
-    if pad_to_lengths is None:
-      pad_to_lengths = [None] * len(pad_axes)
-    for i, l in zip(pad_axes, pad_to_lengths):
-      if l is not None:
-        max_lengths[i] = l
-      del same_axes[i]
-  same_axes_items = same_axes.items()
-  dest = np.full([len(tensors)] + max_lengths, pad_value, dtype=dtype)
-  for i, t in enumerate(tensors):
-    for j, l in same_axes_items:
-      if t.shape[j] != l:
-        raise ValueError(
-            'Tensor at index %d does not have size %d along axis %d'
-            % (i, l, j))
-    dest[[i] + [slice(0, d) for d in t.shape]] = t
-  return dest
-
-
-class RandomQueue(deque):
-
-  def __init__(self, capacity):
-    super(RandomQueue, self).__init__([], capacity)
-    self.capacity = capacity
-
-  def random_sample(self, sample_size):
-    idx = np.random.choice(len(self), sample_size)
-    return [self[i] for i in idx]
-
-  def push(self, item):
-    # Append to right. Oldest element will be popped from left.
-    self.append(item)
-
-
-class MPQItemContainer(object):
-  """Class for holding an item with its score.
-
-  Defines a comparison function for use in the heap-queue.
-  """
-
-  def __init__(self, score, item, extra_data):
-    self.item = item
-    self.score = score
-    self.extra_data = extra_data
-
-  def __cmp__(self, other):
-    assert isinstance(other, type(self))
-    return cmp(self.score, other.score)
-
-  def __iter__(self):
-    """Allows unpacking like a tuple."""
-    yield self.score
-    yield self.item
-    yield self.extra_data
-
-  def __repr__(self):
-    """String representation of this item.
-
-    `extra_data` is not included in the representation. We are assuming that
-    `extra_data` is not easily interpreted by a human (if it was, it should be
-    hashable, like a string or tuple).
-
-    Returns:
-      String representation of `self`.
-    """
-    return str((self.score, self.item))
-
-  def __str__(self):
-    return repr(self)
-
-
-class MaxUniquePriorityQueue(object):
-  """A maximum priority queue where duplicates are not added.
-
-  The top items by score remain in the queue. When the capacity is reached,
-  the lowest scored item in the queue will be dropped.
-
-  This implementation differs from a typical priority queue, in that the minimum
-  score is popped, instead of the maximum. Largest scores remain stuck in the
-  queue. This is useful for accumulating the best known items from a population.
-
-  The items used to determine uniqueness must be hashable, but additional
-  non-hashable data may be stored with each item.
-  """
-
-  def __init__(self, capacity):
-    self.capacity = capacity
-    self.heap = []
-    self.unique_items = set()
-
-  def push(self, score, item, extra_data=None):
-    """Push an item onto the queue.
-
-    If the queue is at capacity, the item with the smallest score will be
-    dropped. Note that it is assumed each item has exactly one score. The same
-    item with a different score will still be dropped.
-
-    Args:
-      score: Number used to prioritize items in the queue. Largest scores are
-          kept in the queue.
-      item: A hashable item to be stored. Duplicates of this item will not be
-          added to the queue.
-      extra_data: An extra (possible not hashable) data to store with the item.
-    """
-    if item in self.unique_items:
-      return
-    if len(self.heap) >= self.capacity:
-      _, popped_item, _ = heapq.heappushpop(
-          self.heap, MPQItemContainer(score, item, extra_data))
-      self.unique_items.add(item)
-      self.unique_items.remove(popped_item)
-    else:
-      heapq.heappush(self.heap, MPQItemContainer(score, item, extra_data))
-      self.unique_items.add(item)
-
-  def pop(self):
-    """Pop the item with the lowest score.
-
-    Returns:
-      score: Item's score.
-      item: The item that was popped.
-      extra_data: Any extra data stored with the item.
-    """
-    if not self.heap:
-      return ()
-    score, item, extra_data = heapq.heappop(self.heap)
-    self.unique_items.remove(item)
-    return score, item, extra_data
-
-  def get_max(self):
-    """Peek at the item with the highest score.
-
-    Returns:
-      Same as `pop`.
-    """
-    if not self.heap:
-      return ()
-    score, item, extra_data = heapq.nlargest(1, self.heap)[0]
-    return score, item, extra_data
-
-  def get_min(self):
-    """Peek at the item with the lowest score.
-
-    Returns:
-      Same as `pop`.
-    """
-    if not self.heap:
-      return ()
-    score, item, extra_data = heapq.nsmallest(1, self.heap)[0]
-    return score, item, extra_data
-
-  def random_sample(self, sample_size):
-    """Randomly select items from the queue.
-
-    This does not modify the queue.
-
-    Items are drawn from a uniform distribution, and not weighted by score.
-
-    Args:
-      sample_size: Number of random samples to draw. The same item can be
-          sampled multiple times.
-
-    Returns:
-      List of sampled items (of length `sample_size`). Each element in the list
-      is a tuple: (item, extra_data).
-    """
-    idx = np.random.choice(len(self.heap), sample_size)
-    return [(self.heap[i].item, self.heap[i].extra_data) for i in idx]
-
-  def iter_in_order(self):
-    """Iterate over items in the queue from largest score to smallest.
-
-    Yields:
-      item: Hashable item.
-      extra_data: Extra data stored with the item.
-    """
-    for _, item, extra_data in heapq.nlargest(len(self.heap), self.heap):
-      yield item, extra_data
-
-  def __len__(self):
-    return len(self.heap)
-
-  def __iter__(self):
-    for _, item, _ in self.heap:
-      yield item
-
-  def __repr__(self):
-    return '[' + ', '.join(repr(c) for c in self.heap) + ']'
-
-  def __str__(self):
-    return repr(self)
-
-
-class RouletteWheel(object):
-  """Randomly samples stored objects proportionally to their given weights.
-
-  Stores objects and weights. Acts like a roulette wheel where each object is
-  given a slice of the roulette disk proportional to its weight.
-
-  This can be used as a replay buffer where past experiences are sampled
-  proportionally to their weights. A good choice of "weight" for reinforcement
-  learning is exp(reward / temperature) where temperature -> inf makes the
-  distribution more uniform and temperature -> 0 makes the distribution more
-  peaky.
-
-  To prevent experiences from being overweighted by appearing in the replay
-  buffer multiple times, a "unique mode" is supported where duplicate
-  experiences are ignored. In unique mode, weights can be quickly retrieved from
-  keys.
-  """
-
-  def __init__(self, unique_mode=False, save_file=None):
-    """Construct empty RouletteWheel.
-
-    If `save_file` is not None, and the file already exists on disk, whatever
-    is in the file will be loaded into this instance. This allows jobs using
-    RouletteWheel to resume after preemption.
-
-    Args:
-      unique_mode: If True, puts this RouletteWheel into unique mode, where
-          objects are added with hashable keys, so that duplicates are ignored.
-      save_file: Optional file path to save to. Must be a string containing
-          an absolute path to a file, or None. File will be Python pickle
-          format.
-    """
-    self.unique_mode = unique_mode
-    self.objects = []
-    self.weights = []
-    self.partial_sums = []
-    if self.unique_mode:
-      self.keys_to_weights = {}
-    self.save_file = save_file
-    self.save_to_disk_buffer = []
-
-    if save_file is not None and tf.gfile.Exists(save_file):
-      # Load from disk.
-      with tf.gfile.OpenFast(save_file, 'r') as f:
-        count = 0
-        while 1:
-          try:
-            obj, weight, key = cPickle.load(f)
-          except EOFError:
-            break
-          else:
-            self.add(obj, weight, key)
-            count += 1
-      logging.info('Loaded %d samples from disk.', count)
-      # Clear buffer since these items are already on disk.
-      self.save_to_disk_buffer = []
-
-  def __iter__(self):
-    return iter(zip(self.objects, self.weights))
-
-  def __len__(self):
-    return len(self.objects)
-
-  def is_empty(self):
-    """Returns whether there is anything in the roulette wheel."""
-    return not self.partial_sums
-
-  @property
-  def total_weight(self):
-    """Total cumulative weight across all objects."""
-    if self.partial_sums:
-      return self.partial_sums[-1]
-    return 0.0
-
-  def has_key(self, key):
-    if self.unique_mode:
-      RuntimeError('has_key method can only be called in unique mode.')
-    return key in self.keys_to_weights
-
-  def get_weight(self, key):
-    if self.unique_mode:
-      RuntimeError('get_weight method can only be called in unique mode.')
-    return self.keys_to_weights[key]
-
-  def add(self, obj, weight, key=None):
-    """Add one object and its weight to the roulette wheel.
-
-    Args:
-      obj: Any object to be stored.
-      weight: A non-negative float. The given object will be drawn with
-          probability proportional to this weight when sampling.
-      key: This argument is only used when in unique mode. To allow `obj` to
-          be an unhashable type, like list, a separate hashable key is given.
-          Each `key` should be unique to each `obj`. `key` is used to check if
-          `obj` has been added to the roulette wheel before.
-
-    Returns:
-      True if the object was added, False if it was not added due to it being
-      a duplicate (this only happens in unique mode).
-
-    Raises:
-      ValueError: If `weight` is negative.
-      ValueError: If `key` is not given when in unique mode, or if `key` is
-          given when not in unique mode.
-    """
-    if weight < 0:
-      raise ValueError('Weight must be non-negative')
-    if self.unique_mode:
-      if key is None:
-        raise ValueError(
-            'Hashable key required for objects when unique mode is enabled.')
-      if key in self.keys_to_weights:
-        # Weight updates are not allowed. Ignore the given value of `weight`.
-        return False
-      self.keys_to_weights[key] = weight
-    elif key is not None:
-      raise ValueError(
-          'key argument should not be used when unique mode is disabled.')
-    self.objects.append(obj)
-    self.weights.append(weight)
-    self.partial_sums.append(self.total_weight + weight)
-    if self.save_file is not None:
-      # Record new item in buffer.
-      self.save_to_disk_buffer.append((obj, weight, key))
-    return True
-
-  def add_many(self, objs, weights, keys=None):
-    """Add many object and their weights to the roulette wheel.
-
-    Arguments are the same as the `add` method, except each is a list. Lists
-    must all be the same length.
-
-    Args:
-      objs: List of objects to be stored.
-      weights: List of non-negative floats. See `add` method.
-      keys: List of hashable keys. This argument is only used when in unique
-          mode. See `add` method.
-
-    Returns:
-      Number of objects added. This number will be less than the number of
-      objects provided if we are in unique mode and some keys are already
-      in the roulette wheel.
-
-    Raises:
-      ValueError: If `keys` argument is provided when unique_mode == False, or
-          is not provided when unique_mode == True.
-      ValueError: If any of the lists are not the same length.
-      ValueError: If any of the weights are negative.
-    """
-    if keys is not None and not self.unique_mode:
-      raise ValueError('Not in unique mode. Do not provide keys.')
-    elif keys is None and self.unique_mode:
-      raise ValueError('In unique mode. You must provide hashable keys.')
-    if keys and len(objs) != len(keys):
-      raise ValueError('Number of objects does not equal number of keys.')
-    if len(objs) != len(weights):
-      raise ValueError('Number of objects does not equal number of weights.')
-    return sum([self.add(obj, weights[i], key=keys[i] if keys else None)
-                for i, obj in enumerate(objs)])
-
-  def sample(self):
-    """Spin the roulette wheel.
-
-    Randomly select an object with probability proportional to its weight.
-
-    Returns:
-      object: The selected object.
-      weight: The weight of the selected object.
-
-    Raises:
-      RuntimeError: If the roulette wheel is empty.
-    """
-    if self.is_empty():
-      raise RuntimeError('Trying to sample from empty roulette wheel.')
-    spin = random.random() * self.total_weight
-
-    # Binary search.
-    i = bisect.bisect_right(self.partial_sums, spin)
-    if i == len(self.partial_sums):
-      # This should not happen since random.random() will always be strictly
-      # less than 1.0, and the last partial sum equals self.total_weight().
-      # However it may happen due to rounding error. In that case it is easy to
-      # handle this, just select the last object.
-      i -= 1
-
-    return self.objects[i], self.weights[i]
-
-  def sample_many(self, count):
-    """Spin the roulette wheel `count` times and return the results."""
-    if self.is_empty():
-      raise RuntimeError('Trying to sample from empty roulette wheel.')
-    return [self.sample() for _ in xrange(count)]
-
-  def incremental_save(self, log_info=False):
-    """Write new entries to disk.
-
-    This performs an append operation on the `save_file` given in the
-    constructor. Any entries added since the last call to `incremental_save`
-    will be appended to the file.
-
-    If a new RouletteWheel is constructed with the same `save_file`, all the
-    entries written there will be automatically loaded into the instance.
-    This is useful when a job resumes after preemption.
-
-    Args:
-      log_info: If True, info about this operation will be logged.
-
-    Raises:
-      RuntimeError: If `save_file` given in the constructor is None.
-    """
-    if self.save_file is None:
-      raise RuntimeError('Cannot call incremental_save. `save_file` is None.')
-    if log_info:
-      logging.info('Saving %d new samples to disk.',
-                   len(self.save_to_disk_buffer))
-    with tf.gfile.OpenFast(self.save_file, 'a') as f:
-      for entry in self.save_to_disk_buffer:
-        cPickle.dump(entry, f)
-    # Clear the buffer.
-    self.save_to_disk_buffer = []
--- a/research/brain_coder/common/utils_test.py
+++ b/research/brain_coder/common/utils_test.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Tests for common.utils.
-"""
-
-from collections import Counter
-import random
-import tempfile
-import numpy as np
-import tensorflow as tf
-
-from common import utils  # brain coder
-
-
-class UtilsTest(tf.test.TestCase):
-  """Tests for stack_pad, records, and the queue/sampler classes in utils."""
-
-  def testStackPad(self):
-    """stack_pad pads to explicit lengths along one axis and along two axes."""
-    # 1D.
-    tensors = [[1, 2, 3], [4, 5, 6, 7, 8], [9]]
-    result = utils.stack_pad(tensors, pad_axes=0, pad_to_lengths=6)
-    self.assertTrue(np.array_equal(
-        result,
-        np.asarray([[1, 2, 3, 0, 0, 0],
-                    [4, 5, 6, 7, 8, 0],
-                    [9, 0, 0, 0, 0, 0]], dtype=np.float32)))
-
-    # 3D.
-    tensors = [[[[1, 2, 3], [4, 5, 6]]],
-               [[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
-               [[[0, 1, 2]], [[3, 4, 5]]]]
-    result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
-    self.assertTrue(np.array_equal(
-        result,
-        np.asarray([[[[1, 2, 3], [4, 5, 6]],
-                     [[0, 0, 0], [0, 0, 0]]],
-                    [[[7, 8, 9], [0, 1, 2]],
-                     [[3, 4, 5], [6, 7, 8]]],
-                    [[[0, 1, 2], [0, 0, 0]],
-                     [[3, 4, 5], [0, 0, 0]]]], dtype=np.float32)))
-
-  def testStackPadNoAxes(self):
-    """With no pad_axes, equally shaped tensors are stacked unchanged."""
-    # 2D.
-    tensors = [[[1, 2, 3], [4, 5, 6]],
-               [[7, 8, 9], [1, 2, 3]],
-               [[4, 5, 6], [7, 8, 9]]]
-    result = utils.stack_pad(tensors)
-    self.assertTrue(np.array_equal(
-        result,
-        np.asarray(tensors)))
-
-  def testStackPadNoneLength(self):
-    """A None pad length falls back to the longest tensor along that axis."""
-    # 1D.
-    tensors = [[1, 2, 3], [4, 5, 6, 7, 8], [9]]
-    result = utils.stack_pad(tensors, pad_axes=0, pad_to_lengths=None)
-    self.assertTrue(np.array_equal(
-        result,
-        np.asarray([[1, 2, 3, 0, 0],
-                    [4, 5, 6, 7, 8],
-                    [9, 0, 0, 0, 0]], dtype=np.float32)))
-
-    # 3D.
-    tensors = [[[[1, 2, 3], [4, 5, 6]]],
-               [[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
-               [[[0, 1, 2]], [[3, 4, 5]]]]
-    result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=None)
-    self.assertTrue(np.array_equal(
-        result,
-        np.asarray([[[[1, 2, 3], [4, 5, 6]],
-                     [[0, 0, 0], [0, 0, 0]]],
-                    [[[7, 8, 9], [0, 1, 2]],
-                     [[3, 4, 5], [6, 7, 8]]],
-                    [[[0, 1, 2], [0, 0, 0]],
-                     [[3, 4, 5], [0, 0, 0]]]], dtype=np.float32)))
-
-    # 3D with partial pad_to_lengths.
-    tensors = [[[[1, 2, 3], [4, 5, 6]]],
-               [[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
-               [[[0, 1, 2]], [[3, 4, 5]]]]
-    result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[None, 3])
-    self.assertTrue(np.array_equal(
-        result,
-        np.asarray([[[[1, 2, 3], [4, 5, 6], [0, 0, 0]],
-                     [[0, 0, 0], [0, 0, 0], [0, 0, 0]]],
-                    [[[7, 8, 9], [0, 1, 2], [0, 0, 0]],
-                     [[3, 4, 5], [6, 7, 8], [0, 0, 0]]],
-                    [[[0, 1, 2], [0, 0, 0], [0, 0, 0]],
-                     [[3, 4, 5], [0, 0, 0], [0, 0, 0]]]], dtype=np.float32)))
-
-  def testStackPadValueError(self):
-    """A shape mismatch along a non-padded axis raises ValueError."""
-    # 3D.
-    tensors = [[[[1, 2, 3], [4, 5, 6]]],
-               [[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
-               [[[0, 1, 2]], [[3, 4, 5]]],
-               [[[1, 2, 3, 4]]]]
-
-    # Not all tensors have the same shape along axis 2.
-    with self.assertRaises(ValueError):
-      utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
-
-  def testRecord(self):
-    """Records allow attribute and index access, mutation, and defaults."""
-    my_record = utils.make_record('my_record', ['a', 'b', 'c'], {'b': 55})
-    inst = my_record(a=1, b=2, c=3)
-    self.assertEqual(1, inst.a)
-    self.assertEqual(2, inst.b)
-    self.assertEqual(3, inst.c)
-    self.assertEqual(1, inst[0])
-    self.assertEqual(2, inst[1])
-    self.assertEqual(3, inst[2])
-    self.assertEqual([1, 2, 3], list(iter(inst)))
-    self.assertEqual(3, len(inst))
-
-    inst.b = 999
-    self.assertEqual(999, inst.b)
-    self.assertEqual(999, inst[1])
-
-    inst2 = my_record(1, 999, 3)
-    self.assertTrue(inst == inst2)
-    inst2[1] = 3
-    self.assertFalse(inst == inst2)
-
-    # `b` is omitted here, so it takes the default value 55.
-    inst3 = my_record(a=1, c=3)
-    inst.b = 55
-    self.assertEqual(inst, inst3)
-
-  def testRecordUnique(self):
-    """Instances of different record types are unequal even with equal values."""
-    record1 = utils.make_record('record1', ['a', 'b', 'c'])
-    record2 = utils.make_record('record2', ['a', 'b', 'c'])
-    self.assertNotEqual(record1(1, 2, 3), record2(1, 2, 3))
-    self.assertEqual(record1(1, 2, 3), record1(1, 2, 3))
-
-  def testTupleToRecord(self):
-    """tuple_to_record builds a record instance from a plain tuple."""
-    my_record = utils.make_record('my_record', ['a', 'b', 'c'])
-    inst = utils.tuple_to_record((5, 6, 7), my_record)
-    self.assertEqual(my_record(5, 6, 7), inst)
-
-  def testRecordErrors(self):
-    """Missing required attributes and surplus positional args raise."""
-    my_record = utils.make_record('my_record', ['a', 'b', 'c'], {'b': 10})
-
-    with self.assertRaises(ValueError):
-      my_record(c=5)  # Did not provide required argument 'a'.
-    with self.assertRaises(ValueError):
-      my_record(1, 2, 3, 4)  # Too many arguments.
-
-  def testRandomQueue(self):
-    """RandomQueue drops the oldest element and samples only stored ones."""
-    np.random.seed(567890)
-    queue = utils.RandomQueue(5)
-    queue.push(5)
-    queue.push(6)
-    queue.push(7)
-    queue.push(8)
-    queue.push(9)
-    queue.push(10)
-    # Six pushes into a queue of capacity 5: the first element was evicted.
-    self.assertTrue(5 not in queue)
-    sample = queue.random_sample(1000)
-    self.assertEqual(1000, len(sample))
-    self.assertEqual([6, 7, 8, 9, 10], sorted(np.unique(sample).tolist()))
-
-  def testMaxUniquePriorityQueue(self):
-    """The queue pops the minimum and keeps the top scores at capacity."""
-    queue = utils.MaxUniquePriorityQueue(5)
-    queue.push(1.0, 'string 1')
-    queue.push(-0.5, 'string 2')
-    queue.push(0.5, 'string 3')
-    self.assertEqual((-0.5, 'string 2', None), queue.pop())
-    queue.push(0.1, 'string 4')
-    queue.push(1.5, 'string 5')
-    queue.push(0.0, 'string 6')
-    queue.push(0.2, 'string 7')
-    self.assertEqual((1.5, 'string 5', None), queue.get_max())
-    self.assertEqual((0.1, 'string 4', None), queue.get_min())
-    self.assertEqual(
-        [('string 5', None), ('string 1', None), ('string 3', None),
-         ('string 7', None), ('string 4', None)],
-        list(queue.iter_in_order()))
-
-  def testMaxUniquePriorityQueue_Duplicates(self):
-    """Tied scores pop in the asserted order; repeated items are ignored."""
-    queue = utils.MaxUniquePriorityQueue(5)
-    queue.push(0.0, 'string 1')
-    queue.push(0.0, 'string 2')
-    queue.push(0.0, 'string 3')
-    self.assertEqual((0.0, 'string 1', None), queue.pop())
-    self.assertEqual((0.0, 'string 2', None), queue.pop())
-    self.assertEqual((0.0, 'string 3', None), queue.pop())
-    self.assertEqual(0, len(queue))
-    queue.push(0.1, 'string 4')
-    queue.push(1.5, 'string 5')
-    queue.push(0.3, 'string 6')
-    queue.push(0.2, 'string 7')
-    queue.push(0.0, 'string 8')
-    queue.push(1.5, 'string 5')
-    queue.push(1.5, 'string 5')
-    self.assertEqual((1.5, 'string 5', None), queue.get_max())
-    self.assertEqual((0.0, 'string 8', None), queue.get_min())
-    self.assertEqual(
-        [('string 5', None), ('string 6', None), ('string 7', None),
-         ('string 4', None), ('string 8', None)],
-        list(queue.iter_in_order()))
-
-  def testMaxUniquePriorityQueue_ExtraData(self):
-    """extra_data is returned with its item; a duplicate push keeps the old."""
-    queue = utils.MaxUniquePriorityQueue(5)
-    queue.push(1.0, 'string 1', [1, 2, 3])
-    queue.push(0.5, 'string 2', [4, 5, 6])
-    queue.push(0.5, 'string 3', [7, 8, 9])
-    queue.push(0.5, 'string 2', [10, 11, 12])
-    self.assertEqual((0.5, 'string 2', [4, 5, 6]), queue.pop())
-    self.assertEqual((0.5, 'string 3', [7, 8, 9]), queue.pop())
-    self.assertEqual((1.0, 'string 1', [1, 2, 3]), queue.pop())
-    self.assertEqual(0, len(queue))
-    # Once popped, the same item can be pushed again with new extra data.
-    queue.push(0.5, 'string 2', [10, 11, 12])
-    self.assertEqual((0.5, 'string 2', [10, 11, 12]), queue.pop())
-
-  def testRouletteWheel(self):
-    """add() accumulates total weight and sampling follows the weights."""
-    random.seed(12345678987654321)
-    r = utils.RouletteWheel()
-    self.assertTrue(r.is_empty())
-    with self.assertRaises(RuntimeError):
-      r.sample()  # Cannot sample when empty.
-    self.assertEqual(0, r.total_weight)
-    self.assertEqual(True, r.add('a', 0.1))
-    self.assertFalse(r.is_empty())
-    self.assertEqual(0.1, r.total_weight)
-    self.assertEqual(True, r.add('b', 0.01))
-    self.assertEqual(0.11, r.total_weight)
-    self.assertEqual(True, r.add('c', 0.5))
-    self.assertEqual(True, r.add('d', 0.1))
-    self.assertEqual(True, r.add('e', 0.05))
-    self.assertEqual(True, r.add('f', 0.03))
-    self.assertEqual(True, r.add('g', 0.001))
-    self.assertEqual(0.791, r.total_weight)
-    self.assertFalse(r.is_empty())
-
-    # Check that sampling is correct.
-    obj, weight = r.sample()
-    self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
-    self.assertTrue((obj, weight) in r)
-    for obj, weight in r.sample_many(100):
-      self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
-      self.assertTrue((obj, weight) in r)
-
-    # Check that sampling distribution is correct.
-    n = 1000000
-    c = Counter(r.sample_many(n))
-    for obj, w in r:
-      # Empirical frequency rescaled by the total weight estimates the weight.
-      estimated_w = c[(obj, w)] / float(n) * r.total_weight
-      self.assertTrue(
-          np.isclose(w, estimated_w, atol=1e-3),
-          'Expected %s, got %s, for object %s' % (w, estimated_w, obj))
-
-  def testRouletteWheel_AddMany(self):
-    """add_many() matches repeated add() calls and accepts empty lists."""
-    random.seed(12345678987654321)
-    r = utils.RouletteWheel()
-    self.assertTrue(r.is_empty())
-    with self.assertRaises(RuntimeError):
-      r.sample()  # Cannot sample when empty.
-    self.assertEqual(0, r.total_weight)
-    count = r.add_many(
-        ['a', 'b', 'c', 'd', 'e', 'f', 'g'],
-        [0.1, 0.01, 0.5, 0.1, 0.05, 0.03, 0.001])
-    self.assertEqual(7, count)
-    self.assertFalse(r.is_empty())
-    self.assertEqual(0.791, r.total_weight)
-
-    # Adding no items is allowed.
-    count = r.add_many([], [])
-    self.assertEqual(0, count)
-    self.assertFalse(r.is_empty())
-    self.assertEqual(0.791, r.total_weight)
-
-    # Check that sampling is correct.
-    obj, weight = r.sample()
-    self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
-    self.assertTrue((obj, weight) in r)
-    for obj, weight in r.sample_many(100):
-      self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
-      self.assertTrue((obj, weight) in r)
-
-    # Check that sampling distribution is correct.
-    n = 1000000
-    c = Counter(r.sample_many(n))
-    for obj, w in r:
-      # Empirical frequency rescaled by the total weight estimates the weight.
-      estimated_w = c[(obj, w)] / float(n) * r.total_weight
-      self.assertTrue(
-          np.isclose(w, estimated_w, atol=1e-3),
-          'Expected %s, got %s, for object %s' % (w, estimated_w, obj))
-
-  def testRouletteWheel_AddZeroWeights(self):
-    """Zero-weight objects are accepted and stored."""
-    r = utils.RouletteWheel()
-    self.assertEqual(True, r.add('a', 0))
-    self.assertFalse(r.is_empty())
-    self.assertEqual(4, r.add_many(['b', 'c', 'd', 'e'], [0, 0.1, 0, 0]))
-    self.assertEqual(
-        [('a', 0.0), ('b', 0.0), ('c', 0.1), ('d', 0.0), ('e', 0.0)],
-        list(r))
-
-  def testRouletteWheel_UniqueMode(self):
-    """Unique mode skips duplicate keys; non-unique mode rejects keys."""
-    random.seed(12345678987654321)
-    r = utils.RouletteWheel(unique_mode=True)
-    self.assertEqual(True, r.add([1, 2, 3], 1, 'a'))
-    self.assertEqual(True, r.add([4, 5], 0.5, 'b'))
-    # Key 'a' already exists, so this add is ignored and its weight dropped.
-    self.assertEqual(False, r.add([1, 2, 3], 1.5, 'a'))
-    self.assertEqual(
-        [([1, 2, 3], 1.0), ([4, 5], 0.5)],
-        list(r))
-    self.assertEqual(1.5, r.total_weight)
-    self.assertEqual(
-        2,
-        r.add_many(
-            [[5, 6, 2, 3], [1, 2, 3], [8], [1, 2, 3]],
-            [0.1, 0.2, 0.1, 2.0],
-            ['c', 'a', 'd', 'a']))
-    self.assertEqual(
-        [([1, 2, 3], 1.0), ([4, 5], 0.5), ([5, 6, 2, 3], 0.1), ([8], 0.1)],
-        list(r))
-    self.assertTrue(np.isclose(1.7, r.total_weight))
-    self.assertEqual(0, r.add_many([], [], []))  # Adding no items is allowed.
-    with self.assertRaises(ValueError):
-      # Key not given.
-      r.add([7, 8, 9], 2.0)
-    with self.assertRaises(ValueError):
-      # Keys not given.
-      r.add_many([[7, 8, 9], [10]], [2.0, 2.0])
-    self.assertEqual(True, r.has_key('a'))
-    self.assertEqual(True, r.has_key('b'))
-    self.assertEqual(False, r.has_key('z'))
-    self.assertEqual(1.0, r.get_weight('a'))
-    self.assertEqual(0.5, r.get_weight('b'))
-
-    r = utils.RouletteWheel(unique_mode=False)
-    self.assertEqual(True, r.add([1, 2, 3], 1))
-    self.assertEqual(True, r.add([4, 5], 0.5))
-    self.assertEqual(True, r.add([1, 2, 3], 1.5))
-    self.assertEqual(
-        [([1, 2, 3], 1.0), ([4, 5], 0.5), ([1, 2, 3], 1.5)],
-        list(r))
-    self.assertEqual(3, r.total_weight)
-    self.assertEqual(
-        4,
-        r.add_many(
-            [[5, 6, 2, 3], [1, 2, 3], [8], [1, 2, 3]],
-            [0.1, 0.2, 0.1, 0.2]))
-    self.assertEqual(
-        [([1, 2, 3], 1.0), ([4, 5], 0.5), ([1, 2, 3], 1.5),
-         ([5, 6, 2, 3], 0.1), ([1, 2, 3], 0.2), ([8], 0.1), ([1, 2, 3], 0.2)],
-        list(r))
-    self.assertTrue(np.isclose(3.6, r.total_weight))
-    with self.assertRaises(ValueError):
-      # Key is given.
-      r.add([7, 8, 9], 2.0, 'a')
-    with self.assertRaises(ValueError):
-      # Keys are given.
-      r.add_many([[7, 8, 9], [10]], [2.0, 2.0], ['a', 'b'])
-
-  def testRouletteWheel_IncrementalSave(self):
-    """Saved entries are reloaded by a new wheel using the same save_file."""
-    f = tempfile.NamedTemporaryFile()
-    r = utils.RouletteWheel(unique_mode=True, save_file=f.name)
-    entries = [
-        ([1, 2, 3], 0.1, 'a'),
-        ([4, 5], 0.2, 'b'),
-        ([6], 0.3, 'c'),
-        ([7, 8, 9, 10], 0.25, 'd'),
-        ([-1, -2], 0.15, 'e'),
-        ([-3, -4, -5], 0.5, 'f')]
-
-    self.assertTrue(r.is_empty())
-    # Add two entries per round, save, then reload into a fresh instance.
-    for i in range(0, len(entries), 2):
-      r.add(*entries[i])
-      r.add(*entries[i + 1])
-      r.incremental_save()
-
-      r2 = utils.RouletteWheel(unique_mode=True, save_file=f.name)
-      self.assertEqual(i + 2, len(r2))
-      count = 0
-      for j, (obj, weight) in enumerate(r2):
-        self.assertEqual(entries[j][0], obj)
-        self.assertEqual(entries[j][1], weight)
-        self.assertEqual(weight, r2.get_weight(entries[j][2]))
-        count += 1
-      self.assertEqual(i + 2, count)
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/research/brain_coder/single_task/BUILD
+++ b/research/brain_coder/single_task/BUILD
-licenses(["notice"])
-
-package(default_visibility = [
-    "//learning/brain/research/neural_coder:__subpackages__",
-])
-
-load("@subpar//:subpar.bzl", "par_binary")
-
-par_binary(
-    name = "run",
-    srcs = ["run.py"],
-    deps = [
-        ":defaults",
-        ":ga_train",
-        ":pg_train",
-        # absl dep :app
-        # absl dep /flags
-        # absl dep /logging
-    ],
-)
-
-par_binary(
-    name = "tune",
-    srcs = ["tune.py"],
-    deps = [
-        ":defaults",
-        ":run",
-        # file dep
-        # absl dep :app
-        # absl dep /flags
-        # absl dep /logging
-        # numpy dep
-        # tensorflow dep
-    ],
-)
-
-py_library(
-    name = "ga_train",
-    srcs = ["ga_train.py"],
-    deps = [
-        ":data",
-        ":defaults",
-        ":ga_lib",
-        ":results_lib",
-        # file dep
-        # absl dep /flags
-        # absl dep /logging
-        # numpy dep
-        # tensorflow dep
-        "//common:utils",  # project
-    ],
-)
-
-py_library(
-    name = "ga_lib",
-    srcs = ["ga_lib.py"],
-    deps = [
-        ":misc",
-        # absl dep /flags
-        # absl dep /logging
-        # numpy dep
-        "//common:bf",  # project
-        "//common:utils",  # project
-    ],
-)
-
-py_test(
-    name = "ga_train_test",
-    srcs = ["ga_train_test.py"],
-    deps = [
-        ":defaults",
-        ":run",
-        # absl dep /flags
-        # tensorflow dep
-    ],
-)
-
-py_library(
-    name = "pg_train",
-    srcs = ["pg_train.py"],
-    deps = [
-        ":data",
-        ":defaults",
-        ":pg_agent",
-        ":results_lib",
-        # file dep
-        # absl dep /flags
-        # absl dep /logging
-        # tensorflow dep
-        # tensorflow internal dep  # build_cleaner: keep
-    ],
-)
-
-py_library(
-    name = "pg_agent",
-    srcs = ["pg_agent.py"],
-    deps = [
-        ":misc",
-        # file dep
-        # absl dep /logging
-        # numpy dep
-        # tensorflow dep
-        "//common:rollout",  # project
-        "//common:utils",  # project
-    ],
-)
-
-py_test(
-    name = "pg_agent_test",
-    srcs = ["pg_agent_test.py"],
-    deps = [
-        ":data",
-        ":defaults",
-        ":misc",
-        ":pg_agent",
-        ":pg_train",
-        # absl dep /logging
-        # numpy dep
-        # tensorflow dep
-        "//common:utils",  # project
-    ],
-)
-
-py_library(
-    name = "defaults",
-    srcs = ["defaults.py"],
-    deps = [
-        # absl dep /logging
-        "//common:config_lib",  # project
-    ],
-)
-
-py_library(
-    name = "misc",
-    srcs = ["misc.py"],
-)
-
-py_library(
-    name = "data",
-    srcs = ["data.py"],
-    deps = [
-        ":code_tasks",
-        # absl dep /logging
-    ],
-)
-
-py_library(
-    name = "code_tasks",
-    srcs = ["code_tasks.py"],
-    deps = [
-        ":misc",
-        ":test_tasks",
-        # absl dep /logging
-        # numpy dep
-        "//common:bf",  # project
-        "//common:reward",  # project
-    ],
-)
-
-py_test(
-    name = "code_tasks_test",
-    srcs = ["code_tasks_test.py"],
-    deps = [
-        ":code_tasks",
-        ":defaults",
-        # numpy dep
-        # tensorflow dep
-    ],
-)
-
-py_library(
-    name = "test_tasks",
-    srcs = ["test_tasks.py"],
-    deps = [
-        ":misc",
-        "//common:reward",  # project
-    ],
-)
-
-py_test(
-    name = "test_tasks_test",
-    srcs = ["test_tasks_test.py"],
-    deps = [
-        ":misc",
-        ":test_tasks",
-        # numpy dep
-        # tensorflow dep
-    ],
-)
-
-py_test(
-    name = "pg_train_test",
-    size = "large",
-    srcs = ["pg_train_test.py"],
-    deps = [
-        ":defaults",
-        ":run",
-        # absl dep /logging
-        # tensorflow dep
-    ],
-)
-
-py_library(
-    name = "results_lib",
-    srcs = ["results_lib.py"],
-    deps = [
-        # file dep
-        # tensorflow dep
-    ],
-)
-
-py_test(
-    name = "results_lib_test",
-    srcs = ["results_lib_test.py"],
-    deps = [
-        ":results_lib",
-        # tensorflow dep
-    ],
-)
-
-par_binary(
-    name = "aggregate_experiment_results",
-    srcs = ["aggregate_experiment_results.py"],
-    deps = [
-        ":misc",
-        ":results_lib",
-        # file dep
-        # absl dep :app
-        # absl dep /flags
-        # numpy dep
-        # tensorflow dep
-    ],
-)
-
-par_binary(
-    name = "aggregate_tuning_results",
-    srcs = ["aggregate_tuning_results.py"],
-    deps = [
-        # file dep
-        # absl dep :app
-        # absl dep /flags
-        # tensorflow dep
-    ],
-)
--- a/research/brain_coder/single_task/README.md
+++ b/research/brain_coder/single_task/README.md
-# Experiments for ICLR 2018 paper.
-
-[Neural Program Synthesis with Priority Queue Training](https://arxiv.org/abs/1801.03526).
-
-Runs policy gradient (REINFORCE), priority queue training, genetic algorithm,
-and uniform random search.
-
-Run all examples below out of your top-level repo directory, i.e. where your git
-clone resides.
-
-
-## Just tell me how to run something and see results
-```bash
-# These tasks are the fastest to learn. 'echo' and 'count-down' are very
-# easy. run_eval_tasks.py will do most of the work to run all the jobs.
-# Should take between 10 and 30 minutes.
-
-# How many repetitions each experiment will run. In the paper, we use 25. Less
-# reps means faster experiments, but noisier results.
-REPS=25
-
-# Extra description in the job names for these experiments. Use this description
-# to distinguish between multiple runs of the same experiment.
-DESC="demo"
-
-# The tasks to run.
-TASKS="reverse echo-second-seq"
-
-# The model types and max NPE.
-EXPS=( pg-20M topk-20M ga-20M rand-20M )
-
-# Where training data is saved. This is chosen by launch_training.sh. Custom
-# implementations of launch_training.sh may use different locations.
-MODELS_DIR="/tmp/models"
-
-# Run run_eval_tasks.py for each experiment name in EXPS.
-for exp in "${EXPS[@]}"
-do
-  ./single_task/run_eval_tasks.py \
-      --exp "$exp" --tasks $TASKS --desc "$DESC" --reps $REPS
-done
-
-# During training or after completion, run this to aggregate results into a
-# table. This is also useful for seeing how much progress has been made.
-# Make sure the arguments here match the settings used above.
-# Note: This can take a few minutes because it reads from every experiment
-# directory.
-bazel run single_task:aggregate_experiment_results -- \
-  --models_dir="$MODELS_DIR" \
-  --max_npe="20M" \
-  --task_list="$TASKS" \
-  --model_types="[('pg', '$DESC'), ('topk', '$DESC'), ('ga', '$DESC'),
-                  ('rand', '$DESC')]" \
-  --csv_file="/tmp/results_table.csv"
-```
-
-
-## Reproduce tuning results in paper
-```bash
-bazel build -c opt single_task:tune.par
-
-# PG and TopK Tuning.
-MAX_NPE=5000000
-CONFIG="
-env=c(task_cycle=['reverse-tune','remove-tune']),
-agent=c(
-  algorithm='pg',
-  grad_clip_threshold=50.0,param_init_factor=0.5,entropy_beta=0.05,lr=1e-5,
-  optimizer='rmsprop',ema_baseline_decay=0.99,topk_loss_hparam=0.0,topk=0,
-  replay_temperature=1.0,alpha=0.0,eos_token=False),
-timestep_limit=50,batch_size=64"
-
-./single_task/launch_tuning.sh \
-    --job_name="iclr_pg_gridsearch.reverse-remove" \
-    --config="$CONFIG" \
-    --max_npe="$MAX_NPE" \
-    --num_workers_per_tuner=1 \
-    --num_ps_per_tuner=0 \
-    --num_tuners=1 \
-    --num_repetitions=50 \
-    --hparam_space_type="pg" \
-    --stop_on_success=true
-./single_task/launch_tuning.sh \
-    --job_name="iclr_pg_topk_gridsearch.reverse-remove" \
-    --config="$CONFIG" \
-    --max_npe="$MAX_NPE" \
-    --num_workers_per_tuner=1 \
-    --num_ps_per_tuner=0 \
-    --num_tuners=1 \
-    --num_repetitions=50 \
-    --hparam_space_type="pg-topk" \
-    --fixed_hparams="topk=10" \
-    --stop_on_success=true
-./single_task/launch_tuning.sh \
-    --job_name="iclr_topk_gridsearch.reverse-remove" \
-    --config="$CONFIG" \
-    --max_npe="$MAX_NPE" \
-    --num_workers_per_tuner=1 \
-    --num_ps_per_tuner=0 \
-    --num_tuners=1 \
-    --num_repetitions=50 \
-    --hparam_space_type="topk" \
-    --fixed_hparams="topk=10" \
-    --stop_on_success=true
-
-# GA Tuning.
-CONFIG="
-env=c(task_cycle=['reverse-tune','remove-char-tune']),
-agent=c(algorithm='ga'),
-timestep_limit=50"
-./single_task/launch_tuning.sh \
-    --job_name="iclr_ga_gridsearch.reverse-remove" \
-    --config="$CONFIG" \
-    --max_npe="$MAX_NPE" \
-    --num_workers_per_tuner=25 \
-    --num_ps_per_tuner=0 \
-    --num_tuners=1 \
-    --num_repetitions=50 \
-    --hparam_space_type="ga" \
-    --stop_on_success=true
-
-# Aggregate tuning results. Run after tuning jobs complete.
-bazel run -c opt single_task:aggregate_tuning_results -- \
-    --tuning_dir="$MODELS_DIR/iclr_pg_gridsearch.reverse-remove"
-bazel run -c opt single_task:aggregate_tuning_results -- \
-    --tuning_dir="$MODELS_DIR/iclr_pg_topk_gridsearch.reverse-remove"
-bazel run -c opt single_task:aggregate_tuning_results -- \
-    --tuning_dir="$MODELS_DIR/iclr_topk_gridsearch.reverse-remove"
-bazel run -c opt single_task:aggregate_tuning_results -- \
-    --tuning_dir="$MODELS_DIR/iclr_ga_gridsearch.reverse-remove"
-```
-
-## Reproduce eval results in paper
-```bash
-DESC="v0"  # Description for each experiment. "Version 0" is a good default.
-EXPS=( pg-5M topk-5M ga-5M rand-5M pg-20M topk-20M ga-20M rand-20M )
-for exp in "${EXPS[@]}"
-do
-  ./single_task/run_eval_tasks.py \
-      --exp "$exp" --iclr_tasks --desc "$DESC"
-done
-```
-
-## Run single experiment
-```bash
-EXP="topk-20M"  # Learning algorithm + max-NPE
-TASK="reverse"  # Coding task
-DESC="v0"  # Description for each experiment. "Version 0" is a good default.
-./single_task/run_eval_tasks.py \
-    --exp "$EXP" --task "$TASK" --desc "$DESC"
-```
-
-## Fetch eval results into a table
-```bash
-# These arguments should match the settings you used to run the experiments.
-MODELS_DIR="/tmp/models"
-MAX_NPE="20M"
-DESC="v0"  # Same description used in the experiments.
-# MODEL_TYPES specifies each model type and the description used in their
-# experiments.
-MODEL_TYPES="[('pg', '$DESC'), ('topk', '$DESC'),
-              ('ga', '$DESC'), ('rand', '$DESC')]"
-TASKS=""  # Empty string will default to all ICLR tasks.
-# To specify custom task list, give task names separated by spaces. Example:
-# TASKS="reverse remove-char"
-bazel run single_task:aggregate_experiment_results -- \
-    --models_dir="$MODELS_DIR" \
-    --max_npe="$MAX_NPE" \
-    --task_list="$TASKS" \
-    --model_types="$MODEL_TYPES" \
-    --csv_file="/tmp/results_table.csv"
-```
-
-## Reproduce shortest code examples in paper
-```bash
-# Maximum NPE is higher here. We only do 1 repetition, and the algorithm needs
-# time to simplify its solution.
-MODELS_DIR="/tmp/models"
-NPE="500M"
-DESC="short-code"
-./single_task/run_eval_tasks.py \
-    --exp "simpl-$NPE" --desc "$DESC" --iclr_tasks --reps 1
-
-# Aggregate best code strings. Run after training completes.
-TASKS=""  # Empty string. Will default to all ICLR tasks.
-bazel run single_task:aggregate_experiment_results -- \
-    --models_dir="$MODELS_DIR" \
-    --max_npe="$NPE" \
-    --task_list="$TASKS" \
-    --model_types="[('topk', '$DESC')]" \
-    --data=code
-```
--- a/research/brain_coder/single_task/aggregate_experiment_results.py
+++ b/research/brain_coder/single_task/aggregate_experiment_results.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-r"""This script crawls experiment directories for results and aggregates them.
-
-Usage example:
-
-MODELS_DIR="/tmp/models"
-bazel run single_task:aggregate_experiment_results -- \
-    --models_dir="$MODELS_DIR" \
-    --max_npe="20M" \
-    --task_list="add echo" \
-    --model_types="[('topk', 'v0'), ('ga', 'v0')]" \
-    --csv_file=/tmp/results_table.csv
-"""
-
-import ast
-from collections import namedtuple
-import csv
-import os
-import re
-import StringIO
-import sys
-
-from absl import app
-from absl import flags
-import numpy as np
-import tensorflow as tf
-
-from single_task import misc  # brain coder
-from single_task import results_lib  # brain coder
-
-DEFAULT_MODELS = [('pg', 'v0'), ('topk', 'v0'), ('ga', 'v0'), ('rand', 'v0')]
-DEFAULT_TASKS = [
-    'reverse', 'remove-char', 'count-char', 'add', 'bool-logic', 'print-hello',
-    'echo-twice', 'echo-thrice', 'copy-reverse', 'zero-cascade', 'cascade',
-    'shift-left', 'shift-right', 'riffle', 'unriffle', 'middle-char',
-    'remove-last', 'remove-last-two', 'echo-alternating', 'echo-half', 'length',
-    'echo-second-seq', 'echo-nth-seq', 'substring', 'divide-2', 'dedup']
-
-FLAGS = flags.FLAGS
-flags.DEFINE_string(
-    'models_dir', '',
-    'Absolute path where results folders are found.')
-flags.DEFINE_string(
-    'exp_prefix', 'bf_rl_iclr',
-    'Prefix for all experiment folders.')
-flags.DEFINE_string(
-    'max_npe', '5M',
-    'String representation of max NPE of the experiments.')
-flags.DEFINE_spaceseplist(
-    'task_list', DEFAULT_TASKS,
-    'List of task names separated by spaces. If empty string, defaults to '
-    '`DEFAULT_TASKS`. These are the rows of the results table.')
-flags.DEFINE_string(
-    'model_types', str(DEFAULT_MODELS),
-    'String representation of a python list of 2-tuples, each a model_type + '
-    'job description pair. Descriptions allow you to choose among different '
-    'runs of the same experiment. These are the columns of the results table.')
-flags.DEFINE_string(
-    'csv_file', '/tmp/results_table.csv',
-    'Where to write results table. Format is CSV.')
-flags.DEFINE_enum(
-    'data', 'success_rates', ['success_rates', 'code'],
-    'What type of data to aggregate.')
-
-
-def make_csv_string(table):
-  """Convert 2D list to CSV string."""
-  s = StringIO.StringIO()
-  writer = csv.writer(s)
-  writer.writerows(table)
-  value = s.getvalue()
-  s.close()
-  return value
-
-
-def process_results(metrics):
-  """Extract useful information from given metrics.
-
-  Args:
-    metrics: List of results dicts. These should have been written to disk by
-        training jobs.
-
-  Returns:
-    Dict mapping stats names to values.
-
-  Raises:
-    ValueError: If max_npe or max_global_repetitions values are inconsistant
-        across dicts in the `metrics` list.
-  """
-  count = len(metrics)
-  success_count = 0
-  total_npe = 0  # Counting NPE across all runs.
-  success_npe = 0  # Counting NPE in successful runs only.
-  max_npe = 0
-  max_repetitions = 0
-  for metric_dict in metrics:
-    if not max_npe:
-      max_npe = metric_dict['max_npe']
-    elif max_npe != metric_dict['max_npe']:
-      raise ValueError(
-          'Invalid experiment. Different reps have different max-NPE settings.')
-    if not max_repetitions:
-      max_repetitions = metric_dict['max_global_repetitions']
-    elif max_repetitions != metric_dict['max_global_repetitions']:
-      raise ValueError(
-          'Invalid experiment. Different reps have different num-repetition '
-          'settings.')
-    if metric_dict['found_solution']:
-      success_count += 1
-      success_npe += metric_dict['npe']
-    total_npe += metric_dict['npe']
-  stats = {}
-  stats['max_npe'] = max_npe
-  stats['max_repetitions'] = max_repetitions
-  stats['repetitions'] = count
-  stats['successes'] = success_count  # successful reps
-  stats['failures'] = count - success_count  # failed reps
-  stats['success_npe'] = success_npe
-  stats['total_npe'] = total_npe
-  if success_count:
-    # Only successful runs counted.
-    stats['avg_success_npe'] = stats['success_npe'] / float(success_count)
-  else:
-    stats['avg_success_npe'] = 0.0
-  if count:
-    stats['success_rate'] = success_count / float(count)
-    stats['avg_total_npe'] = stats['total_npe'] / float(count)
-  else:
-    stats['success_rate'] = 0.0
-    stats['avg_total_npe'] = 0.0
-
-  return stats
-
-
-ProcessedResults = namedtuple('ProcessedResults', ['metrics', 'processed'])
-
-
-def get_results_for_experiment(
-    models_dir, task_name, model_type='pg', max_npe='5M', desc='v0',
-    name_prefix='bf_rl_paper', extra_desc=''):
-  """Get and process results for a given experiment.
-
-  An experiment is a set of runs with the same hyperparameters and environment.
-  It is uniquely specified by a (task_name, model_type, max_npe) triple, as
-  well as an optional description.
-
-  We assume that each experiment has a folder with the same name as the job that
-  ran the experiment. The name is computed by
-  "%name_prefix%.%desc%-%max_npe%_%task_name%".
-
-  Args:
-    models_dir: Parent directory containing experiment folders.
-    task_name: String name of task (the coding env). See code_tasks.py or
-        run_eval_tasks.py
-    model_type: Name of the algorithm, such as 'pg', 'topk', 'ga', 'rand'.
-    max_npe: String SI unit representation of the maximum NPE threshold for the
-        experiment. For example, "5M" means 5 million.
-    desc: Description.
-    name_prefix: Prefix of job names. Normally leave this as default.
-    extra_desc: Optional extra description at the end of the job name.
-
-  Returns:
-    ProcessedResults namedtuple instance, containing
-    metrics: Raw dicts read from disk.
-    processed: Stats computed by `process_results`.
-
-  Raises:
-    ValueError: If max_npe in the metrics does not match NPE in the experiment
-        folder name.
-  """
-  folder = name_prefix + '.{0}.{1}-{2}_{3}'.format(desc, model_type, max_npe,
-                                                   task_name)
-  if extra_desc:
-    folder += '.' + extra_desc
-
-  results = results_lib.Results(os.path.join(models_dir, folder))
-  metrics, _ = results.read_all()
-  processed = process_results(metrics)
-  if (not np.isclose(processed['max_npe'], misc.si_to_int(max_npe))
-      and processed['repetitions']):
-    raise ValueError(
-        'Invalid experiment. Max-NPE setting does not match expected max-NPE '
-        'in experiment name.')
-  return ProcessedResults(metrics=metrics, processed=processed)
-
-
-BestCodeResults = namedtuple(
-    'BestCodeResults',
-    ['code', 'reward', 'npe', 'folder', 'finished', 'error'])
-
-
-class BestCodeResultError(object):
-  success = 0
-  no_solution_found = 1
-  experiment_does_not_exist = 2
-
-
-def get_best_code_for_experiment(
-    models_dir, task_name, model_type='pg', max_npe='5M', desc=0,
-    name_prefix='bf_rl_paper', extra_desc=''):
-  """Like `get_results_for_experiment`, but fetches the code solutions."""
-  folder = name_prefix + '.{0}.{1}-{2}_{3}'.format(desc, model_type, max_npe,
-                                                   task_name)
-  if extra_desc:
-    folder += '.' + extra_desc
-
-  log_dir = os.path.join(models_dir, folder, 'logs')
-  search_regex = r'^solutions_([0-9])+\.txt$'
-  try:
-    all_children = tf.gfile.ListDirectory(log_dir)
-  except tf.errors.NotFoundError:
-    return BestCodeResults(
-        code=None, reward=0.0, npe=0, folder=folder, finished=False,
-        error=BestCodeResultError.experiment_does_not_exist)
-  solution_files = [
-      fname for fname in all_children if re.search(search_regex, fname)]
-  max_reward = 0.0
-  npe = 0
-  best_code = None
-  for fname in solution_files:
-    with tf.gfile.FastGFile(os.path.join(log_dir, fname), 'r') as reader:
-      results = [ast.literal_eval(entry) for entry in reader]
-    for res in results:
-      if res['reward'] > max_reward:
-        best_code = res['code']
-        max_reward = res['reward']
-        npe = res['npe']
-  error = (
-      BestCodeResultError.success if best_code
-      else BestCodeResultError.no_solution_found)
-  try:
-    # If there is a status.txt file, check if it contains the status of the job.
-    with tf.gfile.FastGFile(os.path.join(log_dir, 'status.txt'), 'r') as f:
-      # Job is done, so mark this experiment as finished.
-      finished = f.read().lower().strip() == 'done'
-  except tf.errors.NotFoundError:
-    # No status file has been written, so the experiment is not done. No need to
-    # report an error here, because we do not require that experiment jobs write
-    # out a status.txt file until they have finished.
-    finished = False
-  return BestCodeResults(
-      code=best_code, reward=max_reward, npe=npe, folder=folder,
-      finished=finished, error=error)
-
-
-def make_results_table(
-    models=None,
-    tasks=None,
-    max_npe='5M',
-    name_prefix='bf_rl_paper',
-    extra_desc='',
-    models_dir='/tmp'):
-  """Creates a table of results: algorithm + version by tasks.
-
-  Args:
-    models: The table columns. A list of (algorithm, desc) tuples.
-    tasks: The table rows. List of task names.
-    max_npe: String SI unit representation of the maximum NPE threshold for the
-        experiment. For example, "5M" means 5 million. All entries in the table
-        share the same max-NPE.
-    name_prefix: Name prefix used in logging directory for the experiment.
-    extra_desc: Extra description added to name of logging directory for the
-        experiment.
-    models_dir: Parent directory containing all experiment folders.
-
-  Returns:
-    A 2D list holding the table cells.
-  """
-  if models is None:
-    models = DEFAULT_MODELS
-  if tasks is None:
-    tasks = DEFAULT_TASKS
-  model_results = {}
-  for model_type, desc in models:
-    model_results[model_type] = {
-        tname: get_results_for_experiment(
-            models_dir, tname, model_type, max_npe, desc,
-            name_prefix=name_prefix, extra_desc=extra_desc
-        ).processed
-        for tname in tasks}
-
-  def info(stats):
-    return [str(stats['repetitions']),
-            '%.2f' % stats['success_rate'],
-            str(int(stats['avg_total_npe']))]
-
-  rows = [['max NPE: ' + max_npe]
-          + misc.flatten([['{0} ({1})'.format(m, d), '', '']
-                          for m, d in models])]
-  rows.append(
-      [''] + misc.flatten([['reps', 'success rate', 'avg NPE']
-                           for _ in models]))
-  for tname in tasks:
-    rows.append(
-        [tname]
-        + misc.flatten([info(model_results[model][tname])
-                        for model, _ in models]))
-
-  return rows
-
-
-def print_results_table(results_table):
-  """Print human readable results table to stdout."""
-  print('')
-  print('=== Results Table ===')
-  print('Format: # reps [success rate, avg total NPE]')
-
-  def info_str(info_row):
-    # num_runs (success_rate, avg_total_npe)
-    if not info_row[0]:
-      return '0'
-    return '%s [%s, %s]' % (str(info_row[0]).ljust(2), info_row[1], info_row[2])
-
-  nc = len(results_table[0])  # num cols
-  out_table = [
-      [results_table[0][0]] + [results_table[0][i] for i in range(1, nc, 3)]]
-  for row in results_table[2:]:
-    out_table.append([row[0]] + [info_str(row[i:i+3]) for i in range(1, nc, 3)])
-
-  nc = len(out_table[0])  # num cols
-  col_widths = [max(len(row[col]) for row in out_table) for col in range(nc)]
-
-  table_string = ''
-  for row in out_table:
-    table_string += ''.join(
-        [row[c].ljust(col_widths[c] + 2) for c in range(nc)]) + '\n'
-
-  print(table_string)
-
-
-def main(argv):
-  del argv  # Unused.
-
-  name_prefix = FLAGS.exp_prefix
-  print('Experiments prefix: %s' % name_prefix)
-
-  model_types = ast.literal_eval(FLAGS.model_types)
-
-  if FLAGS.data == 'success_rates':
-    results_table = make_results_table(
-        models=model_types, tasks=FLAGS.task_list, max_npe=FLAGS.max_npe,
-        models_dir=FLAGS.models_dir,
-        name_prefix=name_prefix, extra_desc='')
-    with tf.gfile.FastGFile(FLAGS.csv_file, 'w') as f:
-      f.write(make_csv_string(results_table))
-
-    print_results_table(results_table)
-  else:
-    # Best code
-    print('* = experiment is still running')
-    print('')
-    print('=== Best Synthesized Code ===')
-    for model_type, desc in model_types:
-      print('%s (%s)' % (model_type, desc))
-      sys.stdout.flush()
-      for tname in FLAGS.task_list:
-        res = get_best_code_for_experiment(
-            FLAGS.models_dir, tname, model_type, FLAGS.max_npe, desc,
-            name_prefix=name_prefix, extra_desc='')
-        unfinished_mark = '' if res.finished else ' *'
-        tname += unfinished_mark
-        if res.error == BestCodeResultError.success:
-          print('  %s' % tname)
-          print('    %s' % res.code)
-          print('    R=%.6f, NPE=%s' % (res.reward, misc.int_to_si(res.npe)))
-        elif res.error == BestCodeResultError.experiment_does_not_exist:
-          print('  Experiment does not exist. Check arguments.')
-          print('  Experiment folder: %s' % res.folder)
-          break
-        else:
-          print('  %s' % tname)
-          print('    (none)')
-        sys.stdout.flush()
-
-
-if __name__ == '__main__':
-  app.run(main)
--- a/research/brain_coder/single_task/aggregate_tuning_results.py
+++ b/research/brain_coder/single_task/aggregate_tuning_results.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-r"""After running tuning, use this script to aggregate the results.
-
-Usage:
-
-OUT_DIR="<my_tuning_dir>"
-bazel run -c opt single_task:aggregate_tuning_results -- \
-    --alsologtostderr \
-    --tuning_dir="$OUT_DIR"
-"""
-
-import ast
-import os
-
-from absl import app
-from absl import flags
-import tensorflow as tf
-
-
-FLAGS = flags.FLAGS
-flags.DEFINE_string(
-    'tuning_dir', '',
-    'Absolute path where results tuning trial folders are found.')
-
-
-def main(argv):
-  del argv  # Unused.
-
-  try:
-    trial_dirs = tf.gfile.ListDirectory(FLAGS.tuning_dir)
-  except tf.errors.NotFoundError:
-    print('Tuning directory %s does not exist.' % (FLAGS.tuning_dir,))
-    return
-
-  metrics = []
-  for trial_dir in trial_dirs:
-    tuning_results_file = os.path.join(
-        FLAGS.tuning_dir, trial_dir, 'tuning_results.txt')
-    if tf.gfile.Exists(tuning_results_file):
-      with tf.gfile.FastGFile(tuning_results_file, 'r') as reader:
-        for line in reader:
-          metrics.append(ast.literal_eval(line.replace(': nan,', ': 0.0,')))
-
-  if not metrics:
-    print('No trials found.')
-    return
-
-  num_trials = [m['num_trials'] for m in metrics]
-  assert all(n == num_trials[0] for n in num_trials)
-  num_trials = num_trials[0]
-  print('Found %d completed trials out of %d' % (len(metrics), num_trials))
-
-  # Sort by objective descending.
-  sorted_trials = sorted(metrics, key=lambda m: -m['objective'])
-
-  for i, metrics in enumerate(sorted_trials):
-    hparams = metrics['hparams']
-    keys = sorted(hparams.keys())
-    print(
-        str(i).ljust(4) + ': '
-        + '{0:.2f}'.format(metrics['objective']).ljust(10)
-        + '['
-        + ','.join(['{}={}'.format(k, hparams[k]).ljust(24) for k in keys])
-        + ']')
-
-
-if __name__ == '__main__':
-  app.run(main)
--- a/research/brain_coder/single_task/code_tasks.py
+++ b/research/brain_coder/single_task/code_tasks.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Tasks for RL."""
-
-import abc
-import copy
-import itertools
-import random
-
-from absl import logging
-import numpy as np
-from six.moves import xrange
-
-from common import bf  # brain coder
-from common import reward as r  # brain coder
-from single_task import misc  # brain coder
-from single_task import test_tasks  # brain coder
-
-
-MAX_EXECUTION_STEPS = 5000
-
-
-def make_task(task_name, override_kwargs=None, max_code_length=100,
-              require_correct_syntax=False,
-              do_code_simplification=False,
-              correct_bonus=2.0, code_length_bonus=1.0):
-  """Make tasks with setting from paper."""
-  logging.info('Making paper-config task.')
-  n = 16  # Number of test cases.
-  task_mapping = {
-      'print-hello': (
-          PrintTask, dict(base=27, fixed_string=[8, 5, 12, 12, 15])),
-      'print': (PrintIntTask, dict(base=256, fixed_string=[1, 2, 3, 4, 5])),
-      'echo': (EchoTask, dict(base=27, min_length=1, max_length=6)),
-      'remove-char': (
-          RemoveCharTask, dict(base=256, n=n, min_len=1, max_len=6)),
-      'reverse': (
-          ReverseTask, dict(base=256, n=n, min_len=1, max_len=6)),
-      'reverse-tune': (
-          ReverseTaskV2, dict(base=256, reward_type='static-bylen')),
-      'remove-char-tune': (RemoveCharTaskV2, dict(base=27)),
-      'prefix': (CommonPrefixTask, dict(base=27)),
-      'find': (FindSubStrTask, dict(base=27)),
-      'sort3': (SortFixedTaskV2, dict(base=27, n=150, length=3)),
-      'count-char': (CountCharTaskV2, dict(n=n, max_len=6)),
-      'bool-logic': (BooleanLogicTask, dict()),
-      'add': (AddTask, dict(n=9)),
-      'echo-twice': (EchoTwiceTask, dict(n=n)),
-      'echo-thrice': (EchoThriceTask, dict(n=n)),
-      'copy-reverse': (CopyReverseTask, dict(n=n)),
-      'zero-cascade': (EchoZeroCascadeTask, dict(n=n)),
-      'cascade': (EchoCascadeTask, dict(n=n)),
-      'shift-left': (ShiftLeftTask, dict(n=n)),
-      'shift-right': (ShiftRightTask, dict(n=n)),
-      'riffle': (RiffleTask, dict(n=n)),
-      'unriffle': (UnriffleTask, dict(n=n)),
-      'middle-char': (MiddleCharTask, dict(n=n)),
-      'remove-last': (RemoveLastTask, dict(n=n)),
-      'remove-last-two': (RemoveLastTwoTask, dict(n=n)),
-      'echo-alternating': (EchoAlternatingTask, dict(n=n)),
-      'echo-half': (EchoHalfTask, dict(n=n)),
-      'length': (LengthTask, dict(n=n)),
-      'echo-second-seq': (EchoSecondSequenceTask, dict(n=n)),
-      'echo-nth-seq': (EchoNthSequenceTask, dict(n=n)),
-      'substring': (SubstringTask, dict(n=n)),
-      'divide-2': (Divide2Task, dict(n=n)),
-      'dedup': (DedupTask, dict(n=n)),
-      'remove-target-char': (RemoveTargetCharTask, dict(n=n)),
-      'list-index': (ListIndexTask, dict(n=n)),
-      'fib': (FibonacciTask, dict()),
-      'count-down': (BottlesOfBeerTask, dict()),
-      'split': (SplitTask, dict()),
-      'trim-left': (TrimLeftTask, dict()),
-      'circle-route': (
-          JudgeRouteCircleTask, dict(n=100, max_len=32)),
-      'multiply': (MultiplyTask, dict(n=100)),
-      'divmod': (DivModTask, dict(n=100)),
-  }
-
-  if task_name not in task_mapping:
-    # Test tasks.
-    if task_name == 'test-hill-climb':
-      return test_tasks.BasicTaskManager(test_tasks.HillClimbingTask())
-    raise ValueError('Unknown task type "%s"' % task_name)
-  task_cls, kwargs = task_mapping[task_name]
-
-  if override_kwargs:
-    if not isinstance(override_kwargs, dict):
-      raise ValueError(
-          'override_kwargs must be a dict, got: %s', override_kwargs)
-    kwargs.update(override_kwargs)
-
-  task = task_cls(**kwargs)
-
-  reward_fn = r.absolute_distance_reward
-  # reward_fn = r.absolute_mod_distance_reward
-  # reward_fn = r.absolute_log_distance_reward
-  logging.info('Using reward function: %s', reward_fn.__name__)
-
-  # We want reward with and without code simplification to be scaled the same
-  # way. Without code simplification, give the maximum code length bonus
-  # every time.
-  min_code_length = 0.0 if do_code_simplification else max_code_length
-
-  return MultiIOTaskManager(
-      task=task, correct_bonus=correct_bonus,
-      code_length_bonus=code_length_bonus,
-      max_code_length=max_code_length, min_code_length=min_code_length,
-      reward_fn=reward_fn, require_correct_syntax=require_correct_syntax)
-
-
-def concat(lists):
-  if not lists:
-    return []
-  l = lists[0]
-  for k in lists[1:]:
-    l += k
-  return l
-
-
-def concat_join(lists, sep):
-  if not lists:
-    return []
-  l = lists[0]
-  for k in lists[1:]:
-    l += [sep] + k
-  return l
-
-
-def clipped_linear(x, x0, y0, slope, y_range):
-  min_y, max_y = y_range
-  return min(max(slope * (x - x0) + y0, min_y), max_y)
-
-
-class MultiIOTaskManager(object):
-  """Supports tasks which test the code with multiple I/O examples.
-
-  Wraps a `BaseTask` and scores candidate BF code strings against the task's
-  I/O examples. Rewards are normalized by `best_reward`, the total reward a
-  correct program earns when it also receives the full code length bonus.
-  """
-
-  def __init__(self, task, max_code_length=32, min_code_length=0,
-               max_execution_steps=MAX_EXECUTION_STEPS, correct_bonus=1.0,
-               code_length_bonus=1.0, failure_reward=-2.0, reward_fn=None,
-               require_correct_syntax=False):
-    """Initializes the manager and precomputes the best achievable reward.
-
-    Args:
-      task: The `BaseTask` instance supplying I/O examples and the integer base.
-      max_code_length: Code length at which the length bonus reaches 0.
-      min_code_length: Code length at (or below) which the length bonus is full.
-      max_execution_steps: Step limit forwarded to `bf.evaluate`.
-      correct_bonus: Reward added for each test case whose output is exact.
-      code_length_bonus: Maximum extra reward per correct test case for short
-          code.
-      failure_reward: Un-normalized reward used when execution of any test case
-          does not succeed.
-      reward_fn: Callable (pred, target, base) -> reward. Defaults to
-          `r.absolute_distance_reward`.
-      require_correct_syntax: Forwarded to `bf.evaluate`.
-    """
-    assert isinstance(task, BaseTask)
-    self.task = task
-    self.max_code_length = max_code_length
-    self.min_code_length = min_code_length
-    self.max_execution_steps = max_execution_steps
-    self.require_correct_syntax = require_correct_syntax
-    self.correct_bonus = correct_bonus
-    self.code_length_bonus = code_length_bonus
-    self.failure_reward = failure_reward
-    # Fraction of the length bonus lost per extra code character; 0 when the
-    # length range is empty so that no division by zero occurs.
-    self.time_penalty = (
-        1.0 / (max_code_length - min_code_length)
-        if max_code_length > min_code_length else 0.0)
-    if reward_fn is None:
-      self.reward_fn = r.absolute_distance_reward
-    else:
-      self.reward_fn = reward_fn
-    # Tasks may declare their I/O types; otherwise integer is assumed.
-    self.input_type = (
-        task.input_type if hasattr(task, 'input_type') else misc.IOType.integer)
-    self.output_type = (
-        task.output_type if hasattr(task, 'output_type')
-        else misc.IOType.integer)
-    self._compute_best_reward()
-
-  def _compute_best_reward(self):
-    """Sets `best_reward` and `good_reward` from the task's I/O examples.
-
-    `best_reward` sums, over every test case, the reward for a perfect output
-    plus `correct_bonus` plus the full `code_length_bonus`. `good_reward` is
-    75% of that value.
-    """
-    io_seqs = self.task.make_io_set()
-    reward = 0.0
-    for _, output_seq in io_seqs:
-      reward += self.reward_fn(output_seq, output_seq, self.task.base)
-      reward += self.correct_bonus
-      reward += self.code_length_bonus  # Bonus for shortest code.
-    self.best_reward = reward
-    self.good_reward = 0.75 * reward
-    logging.info('Known best reward: %.4f', self.best_reward)
-
-  def _score_batch(self, code_strings):
-    """Scores each code string; returns a list of `misc.RewardInfo`."""
-    return [self._score_code(code) for code in code_strings]
-
-  def _score_code(self, code):
-    """Run test cases on code and compute reward.
-
-    Args:
-      code: A single BF code string.
-
-    Returns:
-      misc.RewardInfo namedtuple instance containing reward and code execution
-          information, including inputs, expected outputs, code outputs, input
-          and output types, and reason for the reward obtained.
-    """
-    # Get list of 2-tuples, each containing an input sequence and an output
-    # sequence.
-    io_seqs = self.task.make_io_set()
-    terminal_reward = 0.0
-    results = []
-    reason = 'correct'
-    for input_seq, output_seq in io_seqs:
-      eval_result = bf.evaluate(
-          code, input_buffer=input_seq, timeout=0.1,
-          max_steps=self.max_execution_steps,
-          base=self.task.base,
-          require_correct_syntax=self.require_correct_syntax)
-      result, success = eval_result.output, eval_result.success
-      if not success:
-        # Execution did not succeed (the cause is in failure_reason). Discard
-        # any reward and outputs collected so far and stop testing.
-        terminal_reward = self.failure_reward
-        results = []
-        reason = eval_result.failure_reason
-        break
-      else:
-        terminal_reward += self.reward_fn(result, output_seq, self.task.base)
-        if result == output_seq:
-          terminal_reward += self.correct_bonus  # Bonus for correct answer.
-
-          # Only add additional reward for shorter code. Subtracting reward
-          # interferes with the main objective. Only optimize for length once
-          # any solution is found.
-          if self.min_code_length == self.max_code_length:
-            terminal_reward += self.code_length_bonus
-          else:
-            terminal_reward += self.code_length_bonus * clipped_linear(
-                x=len(code), x0=self.min_code_length, y0=1.0,
-                slope=-self.time_penalty, y_range=(0.0, 1.0))
-
-          # reason remains 'correct' if it is already
-        elif reason == 'correct':
-          reason = 'wrong'
-      results.append(result)
-
-    # Return list of rewards, one for each char in the code. All are 0 except
-    # for the terminal reward.
-    # Normalization also applies to failure_reward when execution failed.
-    terminal_reward /= self.best_reward
-    return misc.RewardInfo(
-        episode_rewards=[0.0] * (len(code) - 1) + [terminal_reward],
-        input_case=misc.IOTuple(i for i, o in io_seqs),
-        correct_output=misc.IOTuple(o for i, o in io_seqs),
-        code_output=misc.IOTuple(results),
-        input_type=self.input_type,
-        output_type=self.output_type,
-        reason=reason)
-
-  def rl_batch(self, batch_size):
-    """Produces list of reward functions. One for each program in the batch."""
-    # Every entry is the same bound method, `_score_code`.
-    return [self._score_code] * batch_size
-
-
-def conditional_overwrite(current_value, new_value, allowed_overwrite_values):
-  """Returns `new_value` only if `current_value` is allowed to be replaced.
-
-  Args:
-    current_value: The value currently held.
-    new_value: The candidate replacement.
-    allowed_overwrite_values: Container of values that may be overwritten.
-
-  Returns:
-    `new_value` when `current_value` is in `allowed_overwrite_values`,
-    otherwise `current_value` unchanged.
-  """
-  may_overwrite = current_value in allowed_overwrite_values
-  return new_value if may_overwrite else current_value
-
-
-class BaseTask(object):
-  """Abstract base class for coding tasks.
-
-  Every coding task derives from this class and implements `make_io_set`.
-  """
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self, base=256):
-    # All tasks must set the integer base that they expect.
-    self.base = base
-
-  @abc.abstractmethod
-  def make_io_set(self):
-    """Produces the test cases for this task.
-
-    Returns:
-      List of (input_case, output_case) tuples, where input_case and
-      output_case are lists of integers.
-    """
-
-
-# ==============================================================================
-# ICLR tasks.
-# ==============================================================================
-
-
-class PrintTask(BaseTask):
-  """Print string coding task.
-
-  Code needs to output a fixed string (given as a hyperparameter to the
-  task constructor). Program input is ignored.
-  """
-
-  def __init__(self, base, fixed_string=None):
-    """Initializes the task.
-
-    Args:
-      base: Integer base of the task (includes EOS).
-      fixed_string: List of ints the program must print. A falsy value selects
-          the default [1, 2, 3, 0].
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(PrintTask, self).__init__()
-    self.base = base  # base includes EOS
-    self.eos = 0
-    if fixed_string:
-      self.fixed_string = fixed_string
-    else:
-      self.fixed_string = [1, 2, 3, 0]  # ABC<EOS>
-    self.min_length = self.max_length = len(self.fixed_string)
-
-  def make_io_set(self):
-    """Returns a single test case: empty input, a copy of the fixed string."""
-    return [(list(), list(self.fixed_string))]
-
-
-class RemoveCharTaskV2(BaseTask):
-  """Remove character coding task (version 2).
-
-  Code needs to pipe input to output, but with all the 'A' (value 1) chars
-  removed. 'A' appears exactly once in each input.
-
-  Test cases are hard-coded.
-  """
-
-  def __init__(self, base):
-    """Initializes the task.
-
-    Args:
-      base: Integer base of the task. Must be at least 27.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(RemoveCharTaskV2, self).__init__()
-    self.base = base
-    self.eos = 0
-    self.remove_char = 1
-    assert base >= 27
-
-  def make_io_set(self):
-    """Returns the hard-coded (input, output) pairs."""
-    rm = self.remove_char
-    return [
-        ([rm, 0], [0]),
-        ([20, rm, 0], [20, 0]),
-        ([rm, 13, 0], [13, 0]),
-        ([6, rm, 17, 0], [6, 17, 0]),
-        ([rm, 11, 24, 0], [11, 24, 0]),
-        ([2, 16, 21, rm, 0], [2, 16, 21, 0]),
-        ([18, rm, 12, 26, 7, 0], [18, 12, 26, 7, 0]),
-        ([9, 10, 22, rm, 4, 0], [9, 10, 22, 4, 0])]
-
-
-class RemoveCharTask(BaseTask):
-  """Remove character coding task.
-
-  Code needs to pipe input to output, but with all the 'A' (value 1) chars
-  removed. 'A' appears at least once in each input.
-
-  Test cases are dynamically generated, allowing for the number of test cases
-  to be a hyperparameter.
-  """
-
-  def __init__(self, base, n, min_len, max_len):
-    """Initializes the task and generates its fixed test cases.
-
-    Args:
-      base: Integer base of the task. Must be at least 27.
-      n: Number of test cases to generate.
-      min_len: Minimum input length (inclusive). Must be at least 1, because
-          every input contains the character to remove.
-      max_len: Maximum input length (inclusive).
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(RemoveCharTask, self).__init__()
-    self.base = base
-    self.eos = 0
-    self.remove_char = 1
-    assert base >= 27
-    self._io_pairs = self._make_io_examples(n, min_len, max_len)
-
-  def _make_io_examples(self, n, min_len, max_len):
-    """Generate test cases for the task."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = []
-    for _ in xrange(n):
-      length = rand.randrange(min_len, max_len + 1)
-      rm_char_pos = rand.randrange(0, length)
-      input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
-      # Guarantee at least one occurrence of the character to remove.
-      input_seq[rm_char_pos] = self.remove_char
-      # The random filler may also contain `remove_char`, so strip every
-      # occurrence; deleting only the planted position would leave stray
-      # copies in the expected output.
-      output_seq = [c for c in input_seq if c != self.remove_char]
-      output_seq.append(0)
-      io_examples.append((input_seq, output_seq))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class ReverseTaskV2(BaseTask):
-  """Reverse string coding task (version 2).
-
-  Code needs to pipe input to output, but in reverse order.
-
-  Stochastic test case = new test case randomly generated for every run of
-  `make_io_set`, i.e. different test cases every time code is scored.
-
-  Task supports different types of test cases:
-    rand-one: Code is scored on one stochastic test case.
-    rand-many: Code is scored on 5 stochastic test cases.
-    static-bylen: Code is scored on 5 static test cases. There is one test
-        case for string lengths 1 through 5.
-    rand-bylen: Code is scored on 5 stochastic test cases, where there is one
-        test case for string lengths 1 through 5.
-  """
-
-  def __init__(self, base, reward_type):
-    """Initializes the task.
-
-    Args:
-      base: Integer base of the task (includes EOS). Must be at least 27.
-      reward_type: One of 'rand-one', 'rand-many', 'static-bylen',
-          'rand-bylen'.
-
-    Raises:
-      KeyError: If `reward_type` is not one of the supported names.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(ReverseTaskV2, self).__init__()
-    self.base = base  # base includes EOS
-    assert base >= 27
-    self.eos = 0
-    self.io_pair_fn = {
-        # One random example at a time.
-        'rand-one': lambda: self._io_rand(1),
-        # K random examples at a time (any lengths).
-        'rand-many': lambda: self._io_rand(5),
-        # Static examples, one for each length.
-        'static-bylen': self._io_static_by_len,
-        # Random examples, one for each length.
-        'rand-bylen': self._io_rand_by_len}[reward_type]
-
-  def _make_io_examples(self, sequences):
-    """Builds (input, reversed output) pairs, each terminated with 0."""
-    outputs = [list(i) for i in sequences]
-    for o in outputs:
-      o.reverse()
-      o.append(0)
-    inputs = [i + [0] for i in sequences]
-    # Materialize the pairs: callers iterate the I/O set more than once, and
-    # a bare zip() is a one-shot iterator on Python 3.
-    return list(zip(inputs, outputs))
-
-  def _io_rand(self, k):
-    """Returns `k` random test cases with lengths between 1 and 5."""
-    inputs = [(np.random.choice(26, random.randrange(1, 6)) + 1).tolist()
-              for _ in xrange(k)]
-    return self._make_io_examples(inputs)
-
-  def _io_rand_by_len(self, k=5):
-    """Returns one random test case for each length 1 through `k`."""
-    inputs = [(np.random.choice(26, length) + 1).tolist()
-              for length in xrange(1, k + 1)]
-    return self._make_io_examples(inputs)
-
-  def _io_static_by_len(self):
-    """Returns the fixed test cases, one for each length 1 through 5."""
-    return [
-        ([7, 0], [7, 0]),
-        ([6, 2, 0], [2, 6, 0]),
-        ([5, 1, 10, 0], [10, 1, 5, 0]),
-        ([8, 6, 5, 15, 0], [15, 5, 6, 8, 0]),
-        ([10, 12, 5, 2, 7, 0], [7, 2, 5, 12, 10, 0])]
-
-  def make_io_set(self):
-    """Returns test cases produced by the generator chosen at construction."""
-    return self.io_pair_fn()
-
-
-class ReverseTask(BaseTask):
-  """Reverse string coding task.
-
-  Code needs to pipe input to output, but in reverse order.
-
-  Test cases are dynamically generated, allowing for the number of test cases
-  to be a hyperparameter.
-  """
-
-  def __init__(self, base, n, min_len, max_len):
-    """Initializes the task and generates its fixed test cases.
-
-    Args:
-      base: Integer base of the task (includes EOS). Must be at least 27.
-      n: Number of test cases to generate.
-      min_len: Minimum input length (inclusive).
-      max_len: Maximum input length (inclusive).
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(ReverseTask, self).__init__()
-    self.base = base  # base includes EOS
-    assert base >= 27
-    self.eos = 0
-    self._io_pairs = self._make_io_examples(n, min_len, max_len)
-
-  def _make_io_examples(self, n, min_len, max_len):
-    """Generate test cases for the task."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = []
-    for _ in xrange(n):
-      length = rand.randrange(min_len, max_len + 1)
-      input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
-      # Expected output is the reversed input followed by a terminating 0.
-      output_seq = input_seq[::-1] + [0]
-      io_examples.append((input_seq, output_seq))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class CommonPrefixTask(BaseTask):
-  """Common prefix coding task.
-
-  Code needs to output the common prefix between two input lists. Input lists
-  are variable length, where each list ends with a 0. A common prefix is a
-  sequence which both lists start with.
-  """
-
-  def __init__(self, base):
-    """Initializes the task.
-
-    Args:
-      base: Integer base of the task. Must be at least 27.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(CommonPrefixTask, self).__init__()
-    assert base >= 27
-    self.base = base
-    self.eos = 0
-
-  def make_io_set(self):
-    """Returns the hard-coded (input, output) pairs."""
-    return [
-        ([12, 24, 18, 0, 12, 5, 0], [12, 0]),
-        ([1, 2, 3, 0, 1, 2, 17, 14, 0], [1, 2, 0]),
-        ([15, 2, 1, 9, 2, 0, 15, 2, 1, 25, 8, 14, 0], [15, 2, 1, 0]),
-        ([14, 9, 7, 8, 6, 16, 0, 14, 9, 7, 8, 8, 6, 8, 26, 0],
-         [14, 9, 7, 8, 0]),
-        ([12, 4, 16, 22, 1, 17, 0, 12, 4, 16, 22, 1, 8, 10, 0],
-         [12, 4, 16, 22, 1, 0])]
-
-
-class CountCharTask(BaseTask):
-  """Count char coding task.
-
-  Code must output the number of times the value 1 appears in a 0-terminated
-  input string. Test cases are hard-coded.
-  """
-
-  def __init__(self):
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(CountCharTask, self).__init__()
-    self.base = 27
-    self.eos = 0
-    self.char = 1
-    self.input_type = misc.IOType.string
-    self.output_type = misc.IOType.integer
-
-  def make_io_set(self):
-    """Returns the hard-coded (input, [count]) pairs."""
-    return [
-        ([10, 0], [0]),
-        ([1, 0], [1]),
-        ([1, 1, 0], [2]),
-        ([11, 1, 0], [1]),
-        ([1, 24, 0], [1]),
-        ([13, 6, 0], [0]),
-        ([9, 2, 7, 0], [0]),
-        ([1, 24, 11, 0], [1]),
-        ([19, 1, 1, 0], [2]),
-        ([1, 6, 1, 0], [2]),
-        ([22, 16, 17, 9, 0], [0]),
-        ([1, 1, 1, 19, 0], [3]),
-        ([1, 1, 1, 1, 0], [4]),
-        ([9, 4, 19, 11, 5, 0], [0]),
-        ([24, 11, 26, 1, 15, 0], [1]),
-        ([1, 1, 20, 1, 1, 0], [4]),
-        ([1, 1, 1, 1, 1, 0], [5])]
-
-
-class CountCharTaskV2(BaseTask):
-  """Count char coding task (version 2).
-
-  Code must output the number of occurrences of character 'A' (value 1) in an
-  input string.
-
-  Test cases are dynamically generated, allowing for the number of test cases
-  to be a hyperparameter.
-  """
-
-  def __init__(self, n, max_len):
-    """Initializes the task and generates its fixed test cases.
-
-    Args:
-      n: Total number of test cases wanted. Five are hard-coded, so at least
-          five are always produced.
-      max_len: Upper bound (inclusive) used when sampling both the nominal
-          length and the number of 'A' chars of each generated input.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(CountCharTaskV2, self).__init__()
-    self.base = 27
-    self.eos = 0
-    self.char = 1
-    self.other_chars = [c for c in xrange(self.base)
-                        if c not in (self.eos, self.char)]
-    self.input_type = misc.IOType.string
-    self.output_type = misc.IOType.integer
-    self._io_pairs = self._make_io_examples(n, max_len)
-
-  def _make_io_examples(self, n, max_len):
-    """Generate test cases for the task."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = [
-        ([10, 0], [0]),
-        ([1, 0], [1]),
-        ([1, 1, 0], [2]),
-        ([9, 4, 19, 11, 5, 0], [0]),
-        ([24, 11, 26, 1, 15, 0], [1])]
-    for _ in xrange(n - 5):
-      length = rand.randrange(2, max_len + 1)
-      num_chars = rand.randrange(0, max_len + 1)
-      # 0 marks a placeholder slot. When num_chars exceeds length there are no
-      # placeholders and the input consists of num_chars copies of the char.
-      input_seq = [self.char] * num_chars + [0] * (length - num_chars)
-      rand.shuffle(input_seq)
-      # Fill each placeholder, left to right, with a random non-target char.
-      input_seq = [
-          c if c else self.other_chars[rand.randrange(len(self.other_chars))]
-          for c in input_seq]
-      output_seq = [num_chars]
-      io_examples.append((input_seq, output_seq))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class AddTask(BaseTask):
-  """Addition coding task.
-
-  Code needs to read in two integers and output their sum mod the BF base,
-  followed by a terminating 0.
-  """
-
-  def __init__(self, n=16):
-    """Initializes the task and generates its fixed test cases.
-
-    Args:
-      n: Number of test cases wanted. Nine are hard-coded, so at least nine
-          are always produced.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(AddTask, self).__init__()
-    self.base = 256
-    self.input_type = misc.IOType.integer
-    self.output_type = misc.IOType.integer
-    self._io_pairs = self._make_io_examples(n)
-
-  def _make_io_examples(self, n):
-    """Generate test cases for the task."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = [
-        ([4, 0], [4, 0]),
-        ([0, 5], [5, 0]),
-        ([1, 2], [3, 0]),
-        ([67, 21], [88, 0]),
-        ([55, 56], [111, 0]),
-        ([128, 33], [161, 0]),
-        ([221, 251], [216, 0]),
-        ([130, 127], [1, 0]),
-        ([255, 1], [0, 0])]
-    # Top up with random pairs only when more than the fixed cases are wanted.
-    extra_examples = max(n - len(io_examples), 0)
-    for _ in xrange(extra_examples):
-      a = rand.randrange(256)
-      b = rand.randrange(256)
-      input_seq = [a, b]
-      output_seq = [(a + b) % 256, 0]
-      io_examples.append((input_seq, output_seq))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class BooleanLogicTask(BaseTask):
-  """Boolean logic (truth table) coding task.
-
-  Code needs to memorize a boolean truth table. Specifically, it must encode a
-  mapping from triple of bools to a single bool.
-  """
-
-  def __init__(self):
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(BooleanLogicTask, self).__init__()
-    self.base = 2
-    self.input_type = misc.IOType.boolean
-    self.output_type = misc.IOType.boolean
-    # X(~Z) + (~Y)(~Z) + (~X)YZ
-    self._truth_fn = (
-        lambda x, y, z:  # pylint: disable=g-long-lambda
-        (x and not z) or (not y and not z) or (not x and y and z))
-    # One test case for each of the 8 possible (x, y, z) inputs.
-    self._test_cases = [
-        ([x, y, z], [int(self._truth_fn(x, y, z))])
-        for x, y, z in itertools.product(range(2), range(2), range(2))]
-
-  def make_io_set(self):
-    """Returns a deep copy of the full truth table as (input, output) pairs."""
-    return copy.deepcopy(self._test_cases)
-
-
-# ------------------------------------------------------------------------------
-# The following tasks are generated from known BF solutions. This guarantees
-# that each task can be solved within the maximum code length, and maximum
-# execution steps.
-# ------------------------------------------------------------------------------
-
-
-def default_input_fn_factory(min_length=1, max_length=6, base=256):
-  """Builds an input generator producing random int lists.
-
-  Args:
-    min_length: Minimum list length (inclusive).
-    max_length: Maximum list length (inclusive).
-    base: Values are drawn from [0, base).
-
-  Returns:
-    A function taking a `random.Random`-like object and returning a list of
-    ints. The length is drawn first, then each value in order.
-  """
-  def _input_gen(rand):
-    num_values = rand.randrange(min_length, max_length + 1)
-    values = []
-    for _ in xrange(num_values):
-      values.append(rand.randrange(base))
-    return values
-  return _input_gen
-
-
-class KnownCodeBaseTask(BaseTask):
-  """Base class for tasks whose test cases come from a known BF solution.
-
-  Running the reference solution on generated inputs guarantees the task has
-  a solution under the max character length which solves every test case
-  within the max number of execution steps.
-  """
-
-  def __init__(self, code_solution, make_input_fn, n=100, base=256,
-               max_steps=5000, seed=6849275409234):
-    """Stores the configuration and generates all test cases up front.
-
-    Args:
-      code_solution: BF code string of the reference solution (< 100 chars).
-      make_input_fn: Function mapping a `random.Random` to an input list.
-      n: Number of test cases to generate.
-      base: Integer base passed to the BF interpreter.
-      max_steps: Execution step limit for the reference solution.
-      seed: Seed of the RNG used to generate inputs.
-    """
-    super(KnownCodeBaseTask, self).__init__()
-    # Make sure known solution is less than the code length used in experiments.
-    assert len(code_solution) < 100
-    self.seed = seed
-    self.max_steps = max_steps
-    self.base = base
-    self.n = n
-    self.make_input_fn = make_input_fn
-    self.code_solution = code_solution
-    generated = self._test_case_generator(code_solution)
-    self._test_cases = list(generated)
-
-  def _test_case_generator(self, code_solution):
-    """Yields (input, output) pairs by running `code_solution` on each input.
-
-    Raises:
-      RuntimeError: If the solution does not succeed on a generated input.
-    """
-    rng = random.Random(self.seed)
-    for _ in xrange(self.n):
-      case_input = self.make_input_fn(rng)
-      outcome = bf.evaluate(
-          code_solution,
-          input_buffer=case_input,
-          max_steps=self.max_steps,
-          base=self.base,
-          require_correct_syntax=False)
-      if outcome.success:
-        yield case_input, outcome.output
-      else:
-        raise RuntimeError(
-            'Program must succeed. Failed on input: %s' % case_input)
-
-  def make_io_set(self):
-    """Returns a deep copy of the pre-generated test cases."""
-    return copy.deepcopy(self._test_cases)
-
-
-class EchoTwiceTask(KnownCodeBaseTask):
-  """Echo twice."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoTwiceTask, self).__init__(
-        '>,.[>,.]<[<]>[.>].',
-        default_input_fn_factory(),
-        **kwargs)
-
-
-class EchoThriceTask(KnownCodeBaseTask):
-  """Echo three times."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoThriceTask, self).__init__(
-        '>,.[>,.]<[<]>[.>].<[<]>[.>].',
-        default_input_fn_factory(),
-        **kwargs)
-
-
-class CopyReverseTask(KnownCodeBaseTask):
-  """Echo forwards, backwards, and then forwards again."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(CopyReverseTask, self).__init__(
-        '>,.[>,.]<[.<].>[.>].',
-        default_input_fn_factory(),
-        **kwargs)
-
-
-class EchoZeroCascadeTask(KnownCodeBaseTask):
-  """Print k-th char with k zeros in between (1-indexed)."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoZeroCascadeTask, self).__init__(
-        ',[.>[->+>.<<]>+[-<+>]<<,]',
-        default_input_fn_factory(),
-        **kwargs)
-
-
-class EchoCascadeTask(KnownCodeBaseTask):
-  """Print k-th char k times (1-indexed)."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoCascadeTask, self).__init__(
-        ',>>+<<[>>[-<+>]<[->+<<.>]>+<<,].',
-        default_input_fn_factory(base=20),
-        **kwargs)
-
-
-class ShiftLeftTask(KnownCodeBaseTask):
-  """Circular shift input left."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(ShiftLeftTask, self).__init__(
-        ',>,[.,]<.,.',
-        default_input_fn_factory(),
-        **kwargs)
-
-
-class ShiftRightTask(KnownCodeBaseTask):
-  """Circular shift input right."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(ShiftRightTask, self).__init__(
-        '>,[>,]<.[-]<[<]>[.>].',
-        default_input_fn_factory(),
-        **kwargs)
-
-
-class RiffleTask(KnownCodeBaseTask):
-  """Shuffle like a deck of cards.
-
-  For input of length N, output values in the following index order:
-  N-1, 0, N-2, 1, N-3, 2, ...
-  """
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(RiffleTask, self).__init__(
-        '>,[>,]<[.[-]<[<]>.[-]>[>]<]',
-        default_input_fn_factory(base=20, max_length=8),
-        **kwargs)
-
-
-class UnriffleTask(KnownCodeBaseTask):
-  """Inverse of riffle."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(UnriffleTask, self).__init__(
-        '>,[>,[.[-]],]<[.<].',
-        default_input_fn_factory(base=20, max_length=8),
-        **kwargs)
-
-
-class MiddleCharTask(KnownCodeBaseTask):
-  """Print middle char if length is odd, or 0 if even."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(MiddleCharTask, self).__init__(
-        '>,[>,]<<[[>]<[,<[<]>,>[>]][>]<<]>.',
-        default_input_fn_factory(max_length=10),
-        **kwargs)
-
-
-class RemoveLastTask(KnownCodeBaseTask):
-  """Remove last character."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(RemoveLastTask, self).__init__(
-        ',>,[[<.[-]>[-<+>]],].',
-        default_input_fn_factory(base=20),
-        **kwargs)
-
-
-class RemoveLastTwoTask(KnownCodeBaseTask):
-  """Remove last two characters."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(RemoveLastTwoTask, self).__init__(
-        ',>,>,[[<<.[-]>[-<+>]>[-<+>]],].',
-        default_input_fn_factory(base=10),
-        **kwargs)
-
-
-class EchoAlternatingTask(KnownCodeBaseTask):
-  """Print even numbered chars first (0-indexed), then odd numbered chars."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoAlternatingTask, self).__init__(
-        '>,[.,>,]<<[<]>[.>].',
-        default_input_fn_factory(base=20, max_length=8),
-        **kwargs)
-
-
-class EchoHalfTask(KnownCodeBaseTask):
-  """Echo only first half of the input (round down when odd lengthed)."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoHalfTask, self).__init__(
-        '>>+>,[[<]>+[>],]<[<]>-[-[-<<+>]<[>]>]<<[->+<]>[[>]>.,<+[<]>-].',
-        default_input_fn_factory(base=20, max_length=9),
-        **kwargs)
-
-
-class LengthTask(KnownCodeBaseTask):
-  """Print length of the input sequence."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(LengthTask, self).__init__(
-        '>+>,[[<]>+[>],]<[<]>-.',
-        default_input_fn_factory(max_length=14),
-        **kwargs)
-
-
-class EchoSecondSequenceTask(KnownCodeBaseTask):
-  """Echo second sequence. Sequences are separated by 0."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    def echo_second_gen(rand):
-      # Two sequences of 1 to 5 random values, each followed by a 0.
-      l = rand.randrange(1, 6)
-      x = [rand.randrange(256) for _ in xrange(l)]
-      l = rand.randrange(1, 6)
-      y = [rand.randrange(256) for _ in xrange(l)]
-      return x + [0] + y + [0]
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoSecondSequenceTask, self).__init__(
-        ',[,],[.,].',
-        echo_second_gen,
-        **kwargs)
-
-
-class EchoNthSequenceTask(KnownCodeBaseTask):
-  """Echo n-th sequence (1-indexed). Sequences are separated by 0."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    def echo_nth_gen(rand):
-      # k sequences of 0 to 3 random values, each followed by a 0, preceded
-      # by the 1-based index n of the sequence to echo.
-      k = rand.randrange(1, 7)
-      n = rand.randrange(1, k + 1)
-      x = []
-      for _ in xrange(k):
-        l = rand.randrange(0, 4)
-        x += [rand.randrange(256) for _ in xrange(l)] + [0]
-      return [n] + x
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoNthSequenceTask, self).__init__(
-        ',-[->,[,]<],[.,].',
-        echo_nth_gen,
-        **kwargs)
-
-
-class SubstringTask(KnownCodeBaseTask):
-  """Echo substring.
-
-  First two inputs are i and l, where i is the starting index (0-indexed)
-  and l is the length of the substring.
-  """
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    def substring_gen(rand):
-      l = rand.randrange(2, 16)
-      # Two sorted random indices give a start i and a length n = j - i.
-      i, j = sorted([rand.randrange(l), rand.randrange(l)])
-      n = j - i
-      x = [rand.randrange(256) for _ in xrange(l)] + [0]
-      return [i, n] + x
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(SubstringTask, self).__init__(
-        '>,<,>[->,<]>,<<[->>.,<<]',
-        substring_gen,
-        **kwargs)
-
-
-class Divide2Task(KnownCodeBaseTask):
-  """Divide by 2 (integer floor division)."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    def int_input_gen(rand):
-      # A single random value in [0, 256).
-      return [rand.randrange(256)]
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(Divide2Task, self).__init__(
-        ',[-[->>+<]>[<]<]>>.',
-        int_input_gen,
-        **kwargs)
-
-
-class DedupTask(KnownCodeBaseTask):
-  """Deduplicate adjacent duplicate chars."""
-
-  def __init__(self, **kwargs):
-    """Initializes the task; `kwargs` are forwarded to KnownCodeBaseTask."""
-    def dedup_input_gen(rand):
-      # Seed a numpy RNG from `rand` so the draw without replacement stays
-      # reproducible.
-      np_random = np.random.RandomState(rand.randrange(2147483647))
-      num_unique = rand.randrange(1, 5)
-      unique = np_random.choice(6, num_unique, replace=False) + 1
-      # Repeat each distinct value 1 to 4 times, then terminate with 0.
-      return [v for v in unique for _ in xrange(rand.randrange(1, 5))] + [0]
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(DedupTask, self).__init__(
-        '>>,.[[-<+<+>>],[-<->]<[[-<->]<.>]<[->>+<<]>>]',
-        dedup_input_gen,
-        **kwargs)
-
-
-# ==============================================================================
-# Extra tasks.
-# ==============================================================================
-
-
-class PrintIntTask(BaseTask):
-  """Print integer coding task.
-
-  Code needs to output a fixed single value (given as a hyperparameter to the
-  task constructor). Program input is ignored.
-  """
-
-  def __init__(self, base, fixed_string):
-    """Initializes the task.
-
-    Args:
-      base: Integer base of the task.
-      fixed_string: Sequence of ints the program must print.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(PrintIntTask, self).__init__()
-    self.base = base
-    self.eos = 0
-    self.fixed_string = fixed_string
-    self.input_type = misc.IOType.integer
-    self.output_type = misc.IOType.integer
-
-  def make_io_set(self):
-    """Returns a single test case: empty input, a copy of the fixed output."""
-    return [(list(), list(self.fixed_string))]
-
-
-class EchoTask(BaseTask):
-  """Echo string coding task.
-
-  Code needs to pipe input to output (without any modifications).
-  """
-
-  def __init__(self, base, min_length=1, max_length=5):
-    """Initializes the task and generates its 25 fixed test cases.
-
-    Args:
-      base: Integer base of the task (includes EOS).
-      min_length: Minimum input length (inclusive), not counting EOS.
-      max_length: Maximum input length (inclusive), not counting EOS.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(EchoTask, self).__init__()
-    self.base = base  # base includes EOS
-    self.eos = 0
-    self.min_length = min_length
-    self.max_length = max_length
-    self._io_pairs = self._make_io_examples(25)
-
-  def _make_io_examples(self, n):
-    """Generates `n` (input, output) pairs where output equals input."""
-    # Test cases are fixed, but varied.
-    np_random = np.random.RandomState(1234567890)
-    io_pairs = []
-    for _ in xrange(n):
-      length = np_random.randint(self.min_length, self.max_length + 1)
-      input_seq = np_random.randint(1, self.base, length).tolist() + [self.eos]
-      output_seq = list(input_seq)
-      io_pairs.append((input_seq, output_seq))
-    return io_pairs
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class JudgeRouteCircleTask(BaseTask):
-  """Judge route circle coding task.
-
-  Code needs to determine if the given route makes a closed loop.
-  Encoding: U = 1, R = 2, D = 3, L = 4.
-
-  Based on
-  https://leetcode.com/problems/judge-route-circle/description/
-  """
-  base = 256
-  input_type = misc.IOType.integer
-  output_type = misc.IOType.integer
-
-  def __init__(self, n, max_len=12):
-    """Initializes the task and generates its fixed test cases.
-
-    Args:
-      n: Number of random test cases, added to nine hard-coded ones.
-      max_len: Maximum number of moves in a generated route.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(JudgeRouteCircleTask, self).__init__()
-    self.eos = 0
-    self._io_pairs = self._make_io_examples(n, max_len)
-    self.input_type = misc.IOType.integer
-    self.output_type = misc.IOType.integer
-
-  def _solve(self, input_seq):
-    """Returns [1] if the 0-terminated route ends where it began, else [0]."""
-    assert input_seq[-1] == 0
-    pos = [0, 0]  # (x, y)
-    for move in input_seq[:-1]:
-      assert 0 < move <= 4
-      if move & 1 == 0:  # Left or Right.
-        pos[0] += 3 - move  # Add or subtract 1.
-      else:
-        pos[1] += 2 - move  # Add or subtract 1.
-    return [int(not pos[0] and not pos[1])]
-
-  def _make_io_examples(self, n, max_len):
-    """Generate test cases for the task."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = [
-        ([0], [1]),
-        ([4, 2, 0], [1]),
-        ([2, 4, 0], [1]),
-        ([3, 1, 0], [1]),
-        ([1, 3, 0], [1]),
-        ([1, 0], [0]),
-        ([2, 0], [0]),
-        ([3, 0], [0]),
-        ([4, 0], [0])]
-    for _ in xrange(n):
-      is_true = rand.randrange(2)
-      length = rand.randrange(1, max_len + 1)
-      if is_true:
-        # Make a true case.
-        length = (length >> 1) << 1  # Make even.
-        partition = (rand.randrange(length + 1) >> 1) << 1
-        a = partition >> 1
-        b = (length - partition) >> 1
-        # Opposite directions get equal counts, so the route closes.
-        counts = {1: a, 2: b, 3: a, 4: b}
-      else:
-        # Make a false case.
-        partitions = (
-            [0]
-            + sorted([rand.randrange(length + 1) for _ in range(3)])
-            + [length])
-        counts = {move: partitions[move] - partitions[move - 1]
-                  for move in range(1, 5)}
-        if counts[1] == counts[3] and counts[2] == counts[4]:
-          # By chance we sampled a true case. Make it false by exchanging
-          # one count between even and odd pairs.
-          pair_start = 1 + 2 * rand.randrange(2)
-          a, b = ((pair_start, pair_start + 1) if rand.randrange(2)
-                  else (pair_start + 1, pair_start))
-          if counts[a] == length or counts[b] == 0:
-            # If counts are at their extreme values, then swap who gets
-            # incremented and decremented.
-            a, b = b, a
-          counts[a] += 1
-          counts[b] -= 1
-          assert counts[a] <= length and counts[b] >= 0
-      assert sum(counts.values()) == length
-      # `move` (not `n`) is used as the comprehension variable: Python 2 list
-      # comprehensions leak their variable and would clobber the parameter.
-      input_seq = [move for move in xrange(1, 5)
-                   for _ in xrange(counts[move])]
-      rand.shuffle(input_seq)
-      input_seq += [0]
-      output_seq = self._solve(input_seq)
-      assert output_seq[0] == is_true
-      io_examples.append((input_seq, output_seq))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class MultiplyTask(BaseTask):
-  """Multiply coding task.
-
-  Code needs to multiply two ints.
-
-  Solution:
-  http://robl.co/brief-look-at-brainfuck/
-  ,>,><<[->[->+>+<<]>>[-<<+>>]<<<]>>.
-  """
-  base = 512
-  input_type = misc.IOType.integer
-  output_type = misc.IOType.integer
-
-  def __init__(self, n):
-    """Initializes the task and generates its fixed test cases.
-
-    Args:
-      n: Number of test cases to generate.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed. Forward the class-level base as well;
-    # otherwise BaseTask.__init__ sets the instance attribute to its default
-    # (256) and the declared base of 512 is never used.
-    super(MultiplyTask, self).__init__(base=type(self).base)
-    self.eos = 0
-    self._io_pairs = self._make_io_examples(n)
-    self.input_type = misc.IOType.integer
-    self.output_type = misc.IOType.integer
-
-  def _factors(self, n):
-    """Returns the set of divisors of `n` that are at most sqrt(n)."""
-    return set(i for i in range(1, int(n**0.5) + 1) if n % i == 0)
-
-  def _make_io_examples(self, n):
-    """Generate test cases for the task."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = []
-    for _ in xrange(n):
-      # Sample the product first so that it always fits within the base, then
-      # split it into two factors.
-      product = rand.randrange(self.base)
-      if product == 0:
-        a, b = 0, rand.randrange(self.base)
-      else:
-        f = list(self._factors(product))
-        a = f[rand.randrange(len(f))]
-        b = product // a
-      if rand.randrange(2):
-        a, b = b, a
-      io_examples.append(([a, b], [product]))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class DivModTask(BaseTask):
-  """Divmod coding task.
-
-  Code needs to take the quotient and remainder of two ints.
-
-  Solution:
-  http://robl.co/brief-look-at-brainfuck/
-  ,>,><<[>[->+>+<<]>[-<<-[>]>>>[<[-<->]<[>]>>[[-]>>+<]>-<]<<]>>>+<<[-<<+>>]<<<]>
-  >>>>[-<<<<<+>>>>>]<<<<<.>.>
-  """
-  base = 512
-  input_type = misc.IOType.integer
-  output_type = misc.IOType.integer
-
-  def __init__(self, n):
-    """Initializes the task and generates its fixed test cases.
-
-    Args:
-      n: Number of test cases to generate.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed. Forward the class-level base as well;
-    # otherwise BaseTask.__init__ sets the instance attribute to its default
-    # (256) and the declared base of 512 is never used.
-    super(DivModTask, self).__init__(base=type(self).base)
-    self.eos = 0
-    self._io_pairs = self._make_io_examples(n)
-    self.input_type = misc.IOType.integer
-    self.output_type = misc.IOType.integer
-
-  def _make_io_examples(self, n):
-    """Generates `n` pairs ([dividend, divisor], [quotient, remainder])."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = []
-    for _ in xrange(n):
-      dividend = rand.randrange(0, self.base)
-      divisor = rand.randrange(1, self.base)  # Divisor cannot be 0.
-      io_examples.append(
-          ([dividend, divisor], list(divmod(dividend, divisor))))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated (input, output) pairs."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class FibonacciTask(BaseTask):
-  """Fibonacci coding task.
-
-  For input [k], the hard-coded expected output is the k-th and (k+1)-th
-  Fibonacci numbers (starting 0, 1), reduced mod 256.
-  """
-
-  def __init__(self):
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(FibonacciTask, self).__init__()
-    self.base = 256
-    self.input_type = misc.IOType.integer
-    self.output_type = misc.IOType.integer
-
-  def make_io_set(self):
-    """Returns the hard-coded (input, output) pairs."""
-    return [
-        ([0], [0, 1]),
-        ([1], [1, 1]),
-        ([2], [1, 2]),
-        ([3], [2, 3]),
-        ([4], [3, 5]),
-        ([5], [5, 8]),
-        ([6], [8, 13]),
-        ([7], [13, 21]),
-        ([8], [21, 34]),
-        ([9], [34, 55]),
-        ([10], [55, 89]),
-        ([11], [89, 144]),
-        ([12], [144, 233]),
-        ([13], [233, 121])]  # 377 mod 256 == 121.
-
-
-class FindSubStrTask(BaseTask):
-  """Find sub-string coding task.
-
-  Code needs to output a bool: True if the input string contains a hard-coded
-  substring, 'AB' (values [1, 2]).
-  """
-
-  def __init__(self, base):
-    """Initializes the task.
-
-    Args:
-      base: Integer base of the task. Must be at least 27.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(FindSubStrTask, self).__init__()
-    assert base >= 27
-    self.base = base
-    self.eos = 0
-    self.find_str = [1, 2]
-    self.input_type = misc.IOType.string
-    self.output_type = misc.IOType.boolean
-
-  def make_io_set(self):
-    """Returns the hard-coded (input, [found]) pairs."""
-    return [
-        ([1, 1, 23, 0], [0]),
-        ([21, 3, 2, 0], [0]),
-        ([2, 1, 19, 0], [0]),
-        ([2, 24, 15, 3, 0], [0]),
-        ([24, 6, 10, 16, 4, 0], [0]),
-        ([1, 2, 12, 0], [1]),
-        ([7, 1, 2, 0], [1]),
-        ([1, 2, 11, 3, 0], [1]),
-        ([1, 1, 2, 18, 0], [1]),
-        ([7, 25, 1, 2, 0], [1]),
-        ([3, 1, 2, 11, 8, 0], [1]),
-        ([15, 16, 20, 1, 2, 0], [1])]
-
-
-class SortFixedTask(BaseTask):
-  """Sort list coding task.
-
-  Code needs to output a sorted input list. The task consists of lists of the
-  same length L, where L is provided to this task's constructor as a
-  hyperparameter.
-  """
-
-  def __init__(self, base, length=3):
-    """Initializes the task.
-
-    Args:
-      base: Integer base of the task. Must be at least 27.
-      length: Length of every list. Only 3 is currently supported.
-    """
-    # Name the class explicitly: super(type(self), self) recurses forever as
-    # soon as this class is subclassed.
-    super(SortFixedTask, self).__init__()
-    assert base >= 27
-    self.base = base
-    self.eos = 0
-    self.length = length
-    assert length == 3  # More lengths will be supported.
-
-  def make_io_set(self):
-    """Returns the hard-coded (input, sorted output) pairs for length 3."""
-    if self.length == 3:
-      return [
-          ([1, 20, 6], [1, 6, 20]),
-          ([13, 6, 7], [6, 7, 13]),
-          ([24, 2, 23], [2, 23, 24]),
-          ([16, 12, 3], [3, 12, 16]),
-          ([11, 24, 4], [4, 11, 24]),
-          ([10, 1, 19], [1, 10, 19])]
-
-
-class SortFixedTaskV2(BaseTask):
-  """Sort list coding task (version 2).
-
-  Code needs to output a sorted input list. The task consists of lists of the
-  same length L, where L is provided to this task's constructor as a
-  hyperparameter.
-
-  Test cases are dynamically generated, allowing for the number of test cases
-  to be a hyperparameter.
-  """
-
-  def __init__(self, base, n, length=3):
-    """Constructs the task and generates its test cases.
-
-    Args:
-      base: Number of distinct token values. Must be at least 27.
-      n: Number of test cases to generate.
-      length: Length of every generated list.
-    """
-    # Name the class explicitly: `super(type(self), self)` resolves to this very
-    # class again when called on a subclass instance and recurses forever.
-    super(SortFixedTaskV2, self).__init__()
-    assert base >= 27
-    self.base = base
-    self.eos = 0
-    self._io_pairs = self._make_io_examples(n, length)
-    self.input_type = misc.IOType.integer
-    self.output_type = misc.IOType.integer
-
-  def _make_io_examples(self, n, length):
-    """Generates `n` (unsorted list, sorted list) pairs from a fixed seed."""
-    rand = random.Random(6849275409234)  # Test cases are fixed, but varied.
-    io_examples = []
-    for _ in xrange(n):
-      # Values start at 1, so the EOS value 0 never appears in a list.
-      input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
-      output_seq = sorted(input_seq)
-      io_examples.append((input_seq, output_seq))
-    return io_examples
-
-  def make_io_set(self):
-    """Returns a deep copy of the generated test cases."""
-    return copy.deepcopy(self._io_pairs)
-
-
-class RemoveTargetCharTask(KnownCodeBaseTask):
-  """Remove target character from string, where first input is the target.
-
-  Target can appear multiple times.
-  """
-
-  def __init__(self, **kwargs):
-    """Constructs the task from a reference BF program and an input generator.
-
-    Args:
-      **kwargs: Forwarded unchanged to the KnownCodeBaseTask constructor.
-    """
-    def randrange_hole(rand, a, hole, b):
-      # Draws from [a, b) while never returning `hole`: sample from a range
-      # that is one shorter and shift everything at or above the hole up by 1.
-      x = rand.randrange(a, b - 1)
-      if x >= hole:
-        return x + 1
-      return x
-    def remove_target_char_gen(rand):
-      char = rand.randrange(1, 6)
-      l = rand.randrange(1, 8)
-      # Start from a sequence that is guaranteed not to contain the target.
-      input_seq = [randrange_hole(rand, 1, char, 256) for _ in xrange(l)]
-      # `shuffle` needs a mutable sequence; a bare `range` is immutable on
-      # Python 3, so materialize it as a list first.
-      idx = list(range(l))
-      rand.shuffle(idx)
-      # Overwrite between 0 and l - 1 randomly chosen positions with the target.
-      num_targets = rand.randrange(0, l)
-      for pos in idx[:num_targets]:
-        input_seq[pos] = char
-      # Layout: target char, the string itself, then the terminating 0.
-      return [char] + input_seq + [0]
-    # Name the class explicitly: `super(type(self), self)` resolves to this very
-    # class again when called on a subclass instance and recurses forever.
-    super(RemoveTargetCharTask, self).__init__(
-        ',>>>,[<<<[->+>+<<]>>[->->+<<]>[>[-<+>]<.[-]]>[-]<<<[-<+>]>>,].',
-        remove_target_char_gen,
-        **kwargs)
-
-
-class ListIndexTask(KnownCodeBaseTask):
-  """Echo i-th value in the given list."""
-
-  def __init__(self, **kwargs):
-    """Constructs the task from a reference BF program and an input generator.
-
-    Args:
-      **kwargs: Forwarded unchanged to the KnownCodeBaseTask constructor.
-    """
-    def array_index_gen(rand):
-      l = rand.randrange(1, 16)
-      i = rand.randrange(l)
-      # Layout: index, then `l` random values, then a trailing 0.
-      return [i] + [rand.randrange(256) for _ in xrange(l)] + [0]
-    # Name the class explicitly: `super(type(self), self)` resolves to this very
-    # class again when called on a subclass instance and recurses forever.
-    super(ListIndexTask, self).__init__(
-        ',[->,<]>,.',
-        array_index_gen,
-        **kwargs)
-
-
-# ==============================================================================
-# Tasks based on primaryobjects paper.
-# ==============================================================================
-
-
-def string2tokens(string):
-  """Converts a string into the list of its character code points."""
-  return list(map(ord, string))
-
-
-def stringlist2tokens(strings):
-  """Applies `string2tokens` to every string, returning a list of token lists."""
-  token_lists = []
-  for text in strings:
-    token_lists.append(string2tokens(text))
-  return token_lists
-
-
-def string2tokens_b27(string):
-  """Maps letters to 1..26 ('a'/'A' -> 1, ..., 'z'/'Z' -> 26), ignoring case."""
-  # Subtracting this offset from a lowercase code point makes 'a' come out as 1.
-  offset = ord('a') - 1
-  return [ord(ch.lower()) - offset for ch in string]
-
-
-def stringlist2tokens_b27(strings):
-  """Applies `string2tokens_b27` to every string in `strings`."""
-  token_lists = []
-  for text in strings:
-    token_lists.append(string2tokens_b27(text))
-  return token_lists
-
-
-class BottlesOfBeerTask(BaseTask):
-  """Bottles of beer coding task.
-
-  A counting task: the code reads an int N and, per the test cases below, must
-  output N, N - 1, ..., 1 with a 0 written after each number.
-  """
-  base = 256
-  input_type = misc.IOType.integer
-  output_type = misc.IOType.integer
-
-  def make_io_set(self):
-    """Returns the test cases for N = 1 through 6."""
-    io_set = []
-    for start in range(1, 7):
-      countdown = []
-      for value in range(start, 0, -1):
-        # Every counted number is followed by the separator 0.
-        countdown.extend([value, 0])
-      io_set.append(([start], countdown))
-    return io_set
-
-
-class SplitTask(BaseTask):
-  """Split coding task.
-
-  Code needs to pipe input strings to output, but insert a 0 after every 3
-  characters. This is in essence splitting the string into intervals of length
-  3.
-
-  NOTE(review): the test cases built in the constructor use 27 as the inserted
-  separator value rather than 0 -- confirm which one the description intends.
-  """
-  base = 28
-  input_type = misc.IOType.string
-  output_type = misc.IOType.integer
-
-  def _splicer(self, lst, insert, interval=3):
-    """Yields the items of `lst`, with `insert` after every `interval` items.
-
-    No separator is emitted after the final item of the list.
-    """
-    for i, item in enumerate(lst):
-      yield item
-      if (i + 1) % interval == 0 and i < len(lst) - 1:
-        yield insert
-
-  def __init__(self):
-    # Name the class explicitly: `super(type(self), self)` resolves to this very
-    # class again when called on a subclass instance and recurses forever.
-    super(SplitTask, self).__init__()
-    inputs = stringlist2tokens_b27(
-        ['hello', 'orange', 'spaghetti', 'wins', 'one'])
-    targets = [list(self._splicer(i, 27)) for i in inputs]
-    self._test_cases = list(zip(inputs, targets))
-
-  def make_io_set(self):
-    """Returns a deep copy of the precomputed test cases."""
-    return copy.deepcopy(self._test_cases)
-
-
-class TrimLeftTask(BaseTask):
-  """Trim left coding task.
-
-  Code needs to pipe input strings to output, but remove everything before the
-  first quotation char (").
-  """
-  base = 256
-  input_type = misc.IOType.integer
-  output_type = misc.IOType.integer
-
-  def __init__(self):
-    # Name the class explicitly: `super(type(self), self)` resolves to this very
-    # class again when called on a subclass instance and recurses forever.
-    super(TrimLeftTask, self).__init__()
-    inputs = stringlist2tokens(
-        ['a "inside" over', 'xy "test" rights', 'ca6 "foresting" service',
-         'abc"def"yz.', 'A"B"'])
-    targets = stringlist2tokens(
-        ['"inside" over', '"test" rights', '"foresting" service', '"def"yz.',
-         '"B"'])
-    # Pair each tokenized input with its tokenized expected output.
-    self._test_cases = list(zip(inputs, targets))
-
-  def make_io_set(self):
-    """Returns a deep copy of the precomputed test cases."""
-    return copy.deepcopy(self._test_cases)
--- a/research/brain_coder/single_task/code_tasks_test.py
+++ b/research/brain_coder/single_task/code_tasks_test.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Tests for code_tasks."""
-
-import numpy as np
-import tensorflow as tf
-
-from single_task import code_tasks  # brain coder
-from single_task import defaults  # brain coder
-
-
-def pad(string, pad_length, pad_char):
-  """Right-pads `string` with `pad_char` up to `pad_length` characters.
-
-  Strings that are already at least `pad_length` long come back unchanged.
-  """
-  missing = pad_length - len(string)
-  filler = pad_char * missing
-  return string + filler
-
-
-class CodeTasksTest(tf.test.TestCase):
-  """Checks the final episode reward that tasks assign to known BF programs."""
-
-  def assertClose(self, a, b):
-    """Asserts that `a` and `b` agree to within an absolute tolerance of 1e-4."""
-    self.assertTrue(
-        np.isclose(a, b, atol=1e-4),
-        'Expecting approximately equal values. Got: %s, %s' % (a, b))
-
-  def testMultiIOTaskManager(self):
-    """Rewards for the 'print' and 'reverse' paper tasks, both reward modes."""
-    maxlen = 100
-    padchr = '['
-    # 'print' without code simplification: programs are padded to `maxlen`.
-    task = code_tasks.make_paper_task(
-        'print', timestep_limit=maxlen, do_code_simplification=False)
-    reward_fns = task.rl_batch(1)
-    r = reward_fns[0]
-    self.assertClose(
-        r(pad('++++++++.---.+++++++...', maxlen, padchr)).episode_rewards[-1],
-        0.2444)
-    self.assertClose(
-        r(pad('++++++++.---.+++++++..+++.',
-              maxlen, padchr)).episode_rewards[-1],
-        1.0)
-
-    # 'print' with code simplification: unpadded and padded programs are both
-    # scored, and the padded variant of the same program is expected to score
-    # lower (0.75 vs 0.935).
-    task = code_tasks.make_paper_task(
-        'print', timestep_limit=maxlen, do_code_simplification=True)
-    reward_fns = task.rl_batch(1)
-    r = reward_fns[0]
-    self.assertClose(
-        r('++++++++.---.+++++++...').episode_rewards[-1],
-        0.2444)
-    self.assertClose(
-        r('++++++++.---.+++++++..+++.').episode_rewards[-1],
-        0.935)
-    self.assertClose(
-        r(pad('++++++++.---.+++++++..+++.',
-              maxlen, padchr)).episode_rewards[-1],
-        0.75)
-
-    # 'reverse' without code simplification.
-    task = code_tasks.make_paper_task(
-        'reverse', timestep_limit=maxlen, do_code_simplification=False)
-    reward_fns = task.rl_batch(1)
-    r = reward_fns[0]
-    self.assertClose(
-        r(pad('>,>,>,.<.<.<.', maxlen, padchr)).episode_rewards[-1],
-        0.1345)
-    self.assertClose(
-        r(pad(',[>,]+[,<.]', maxlen, padchr)).episode_rewards[-1],
-        1.0)
-
-    # 'reverse' with code simplification.
-    task = code_tasks.make_paper_task(
-        'reverse', timestep_limit=maxlen, do_code_simplification=True)
-    reward_fns = task.rl_batch(1)
-    r = reward_fns[0]
-    self.assertClose(r('>,>,>,.<.<.<.').episode_rewards[-1], 0.1324)
-    self.assertClose(r(',[>,]+[,<.]').episode_rewards[-1], 0.9725)
-    self.assertClose(
-        r(pad(',[>,]+[,<.]', maxlen, padchr)).episode_rewards[-1],
-        0.75)
-
-  def testMakeTask(self):
-    """Rewards for a 'print' task built through `make_task` from a config."""
-    maxlen = 100
-    padchr = '['
-    # NOTE(review): `config_for_iclr` and `fixed_string` are not keys of the
-    # env config shown in defaults.default_config -- confirm this config string
-    # still passes `strict_update`.
-    config = defaults.default_config_with_updates(
-        'env=c(config_for_iclr=False,fixed_string=[8,5,12,12,15])')
-    task = code_tasks.make_task(config.env, 'print', timestep_limit=maxlen)
-    reward_fns = task.rl_batch(1)
-    r = reward_fns[0]
-    self.assertClose(
-        r('++++++++.---.+++++++...').episode_rewards[-1],
-        0.2444)
-    self.assertClose(
-        r('++++++++.---.+++++++..+++.').episode_rewards[-1],
-        0.935)
-    self.assertClose(
-        r(pad('++++++++.---.+++++++..+++.',
-              maxlen, padchr)).episode_rewards[-1],
-        0.75)
-
-  def testKnownCodeBaseTask(self):
-    """A padded reference-style solution for 'shift-left' earns reward 1.0."""
-    maxlen = 100
-    padchr = '['
-    task = code_tasks.make_paper_task(
-        'shift-left', timestep_limit=maxlen, do_code_simplification=False)
-    reward_fns = task.rl_batch(1)
-    r = reward_fns[0]
-    self.assertClose(
-        r(pad(',>,[.,]<.,.', maxlen, padchr)).episode_rewards[-1],
-        1.0)
-
-
-# Run the test cases through TensorFlow's test runner when executed directly.
-if __name__ == '__main__':
-  tf.test.main()
--- a/research/brain_coder/single_task/data.py
+++ b/research/brain_coder/single_task/data.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Manage data for pretraining and RL tasks."""
-
-import ast
-from collections import namedtuple
-
-from absl import logging
-
-from single_task import code_tasks  # brain coder
-
-
-# One minibatch of RL episodes: the per-episode reward functions, the number of
-# episodes, and the task's `good_reward` threshold.
-RLBatch = namedtuple('RLBatch', ['reward_fns', 'batch_size', 'good_reward'])
-
-
-class DataManager(object):
-  """Bridges the environment (the coding task) and the model."""
-
-  def __init__(self, global_config, run_number=None,
-               do_code_simplification=False):
-    """Selects the task for this run and builds its RL task object.
-
-    Args:
-      global_config: A config_lib.Config instance containing all config. See
-          config in defaults.py.
-      run_number: Index of this run within the experiment. Required when the
-          config defines a task cycle, because the task is picked as
-          `task_cycle[run_number % len(task_cycle)]`. May be None when a single
-          task is configured.
-      do_code_simplification: Passed through to `code_tasks.make_task`; if
-          True, a task with code simplification (code golf) reward is
-          constructed instead of a fixed length coding task.
-
-    Raises:
-      ValueError: If global_config.env.task and global_config.env.task_cycle
-          are both set, or both not set. Only one should be given.
-      ValueError: If global_config.env.task_cycle is set but run_number is None.
-    """
-    env_config = global_config.env
-    self.batch_size = global_config.batch_size
-
-    if not env_config.task_cycle:
-      # Single-task mode: `task` must name the task.
-      if not env_config.task:
-        raise ValueError('Either `task` or `task_cycle` must be set.')
-      self.task_name = env_config.task
-    else:
-      # Task-cycle mode: the run number decides which task is used.
-      if env_config.task:
-        raise ValueError('Do not set both `task` and `task_cycle`.')
-      if run_number is None:
-        raise ValueError('Do not use task_cycle for single-run experiment.')
-      index = run_number % len(env_config.task_cycle)
-      self.task_name = env_config.task_cycle[index]
-      logging.info('run_number: %d,  task_cycle index: %d', run_number, index)
-      logging.info('task_cycle: %s', env_config.task_cycle)
-    logging.info('Task for this run: "%s"', self.task_name)
-
-    logging.info('config_for_iclr=True; do_code_simplification=%s',
-                 do_code_simplification)
-    # `task_kwargs` is stored in the config as the text of a Python dict literal.
-    override_kwargs = ast.literal_eval(env_config.task_kwargs)
-    self.rl_task = code_tasks.make_task(
-        task_name=self.task_name,
-        override_kwargs=override_kwargs,
-        max_code_length=global_config.timestep_limit,
-        require_correct_syntax=env_config.correct_syntax,
-        do_code_simplification=do_code_simplification,
-        correct_bonus=env_config.task_manager_config.correct_bonus,
-        code_length_bonus=env_config.task_manager_config.code_length_bonus)
-
-  def sample_rl_batch(self):
-    """Creates one minibatch worth of reward functions for the current task.
-
-    Returns:
-      RLBatch namedtuple instance, which holds functions and information for
-      a minibatch of episodes.
-      * reward_fns: A reward function for each episode. Maps code string to
-          reward.
-      * batch_size: Number of episodes in this minibatch.
-      * good_reward: Estimated threshold of rewards which indicate the algorithm
-          is starting to solve the task. This is a heuristic that tries to
-          reduce the amount of stuff written to disk.
-    """
-    return RLBatch(
-        reward_fns=self.rl_task.rl_batch(self.batch_size),
-        batch_size=self.batch_size,
-        good_reward=self.rl_task.good_reward)
--- a/research/brain_coder/single_task/defaults.py
+++ b/research/brain_coder/single_task/defaults.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Default configuration for agent and environment."""
-
-from absl import logging
-
-from common import config_lib  # brain coder
-
-
-def default_config():
-  """Returns the default Config for the agent, the environment and the run.
-
-  The `agent` entry is a OneOf over three algorithm configs ('pg', 'ga' and
-  'rand') with 'pg' selected by default.
-  """
-  return config_lib.Config(
-      agent=config_lib.OneOf(
-          [config_lib.Config(
-              algorithm='pg',
-              policy_lstm_sizes=[35,35],
-              # Set value_lstm_sizes to None to share weights with policy.
-              value_lstm_sizes=[35,35],
-              obs_embedding_size=10,
-              grad_clip_threshold=10.0,
-              param_init_factor=1.0,
-              lr=5e-5,
-              pi_loss_hparam=1.0,
-              vf_loss_hparam=0.5,
-              entropy_beta=1e-2,
-              regularizer=0.0,
-              softmax_tr=1.0,  # Reciprocal temperature.
-              optimizer='rmsprop',  # 'adam', 'sgd', 'rmsprop'
-              topk=0,  # Top-k unique codes will be stored.
-              topk_loss_hparam=0.0,  # off policy loss multiplier.
-              # Uniformly sample this many episodes from topk buffer per batch.
-              # If topk is 0, this has no effect.
-              topk_batch_size=1,
-              # Exponential moving average baseline for REINFORCE.
-              # If zero, A2C is used.
-              # If non-zero, should be close to 1, like .99, .999, etc.
-              ema_baseline_decay=0.99,
-              # Whether agent can emit EOS token. If true, agent can emit EOS
-              # token which ends the episode early (ends the sequence).
-              # If false, agent must emit tokens until the timestep limit is
-              # reached. e.g. True means variable length code, False means fixed
-              # length code.
-              # WARNING: Making this false slows things down.
-              eos_token=False,
-              replay_temperature=1.0,
-              # Replay probability. 1 = always replay, 0 = always on policy.
-              alpha=0.0,
-              # Whether to normalize importance weights in each minibatch.
-              iw_normalize=True),
-           config_lib.Config(
-              algorithm='ga',
-              crossover_rate=0.99,
-              mutation_rate=0.086),
-           config_lib.Config(
-              algorithm='rand')],
-          algorithm='pg',
-      ),
-      env=config_lib.Config(
-          # Name of the single task to run. Leave empty when `task_cycle` is
-          # used instead; the two must not both be set.
-          task='',  # 'print', 'echo', 'reverse', 'remove', ...
-          task_cycle=[],  # If non-empty, repetitions will cycle through tasks.
-          task_kwargs='{}',  # Python dict literal.
-          task_manager_config=config_lib.Config(
-              # Reward received per test case. These bonuses will be scaled
-              # based on how many test cases there are.
-              correct_bonus=2.0,  # Bonus for code getting correct answer.
-              code_length_bonus=1.0),  # Maximum bonus for short code.
-          correct_syntax=False,
-      ),
-      batch_size=64,
-      timestep_limit=32)
-
-
-def default_config_with_updates(config_string, do_logging=True):
-  """Returns the default config with the overrides in `config_string` applied.
-
-  Args:
-    config_string: Overrides in the text format understood by
-        `config_lib.Config.parse`.
-    do_logging: If True, logs the raw config string before the update and the
-        pretty-printed resulting config afterwards.
-
-  Returns:
-    The updated Config instance.
-  """
-  if do_logging:
-    logging.info('Config string: "%s"', config_string)
-  config = default_config()
-  overrides = config_lib.Config.parse(config_string)
-  config.strict_update(overrides)
-  if do_logging:
-    logging.info('Config:\n%s', config.pretty_str())
-  return config
--- a/research/brain_coder/single_task/ga_lib.py
+++ b/research/brain_coder/single_task/ga_lib.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Genetic algorithm for BF tasks.
-
-Inspired by https://github.com/primaryobjects/AI-Programmer.
-GA function code borrowed from https://github.com/DEAP/deap.
-"""
-
-from collections import namedtuple
-import random
-
-from absl import flags
-from absl import logging
-import numpy as np
-from six.moves import xrange
-
-from common import bf  # brain coder
-from common import utils  # brain coder
-from single_task import misc  # brain coder
-
-FLAGS = flags.FLAGS
-
-# Saving reward of previous programs saves computation if a program appears
-# again.
-USE_REWARD_CACHE = True  # Disable this if GA is using up too much memory.
-# Alphabet that mutations and random individuals draw their characters from.
-GENES = bf.CHARS
-# NOTE(review): MAX_PROGRAM_STEPS and STEP_BONUS are not referenced by the code
-# visible in this module section -- confirm whether they are still used.
-MAX_PROGRAM_STEPS = 500
-STEP_BONUS = True
-
-# Lookup table used to print small-base string tokens: index 0 is '_', 1-26 are
-# 'a'-'z', 27-52 are 'A'-'Z' and 53-62 are '0'-'9'.
-ALPHANUM_CHARS = (
-    ['_'] +
-    [chr(ord('a') + i_) for i_ in range(26)] +
-    [chr(ord('A') + i_) for i_ in range(26)] +
-    [chr(ord('0') + i_) for i_ in range(10)])
-
-# Outcome of evaluating one program: its reward, the inputs it was run on, what
-# it printed versus what was expected, the IO representation types, the token
-# base, and whether the program was judged correct.
-Result = namedtuple(
-    'Result',
-    ['reward', 'inputs', 'code_outputs', 'target_outputs', 'type_in',
-     'type_out', 'base', 'correct'])
-
-
-class IOType(object):
-  """Names of the representations `tokens_list_repr` can render tokens as."""
-  string = 'string'
-  integer = 'integer'
-
-
-class CustomType(object):
-  """Representation type backed by a caller-supplied to-string function."""
-
-  def __init__(self, to_str_fn):
-    """Stores the function used to render objects of this type."""
-    self.to_str_fn = to_str_fn
-
-  def __call__(self, obj):
-    """Renders `obj` by delegating to the stored function."""
-    render = self.to_str_fn
-    return render(obj)
-
-
-def tokens_list_repr(tokens, repr_type, base):
-  """Make human readable representation of program IO.
-
-  Args:
-    tokens: List of int tokens to render.
-    repr_type: A CustomType instance, IOType.string or IOType.integer.
-    base: Token base. For IOType.string it selects between the ALPHANUM_CHARS
-        table (small bases) and raw character codes.
-
-  Returns:
-    The string representation of `tokens`.
-
-  Raises:
-    ValueError: If `repr_type` is not one of the supported types.
-  """
-  if isinstance(repr_type, CustomType):
-    return repr_type(tokens)
-  if repr_type == IOType.string:
-    # Small bases index into the alphanumeric table; anything larger is
-    # treated as a raw character code.
-    if base < len(ALPHANUM_CHARS):
-      chars = [ALPHANUM_CHARS[t] for t in tokens]
-    else:
-      chars = [chr(t) for t in tokens]
-    return ''.join(chars)
-  if repr_type == IOType.integer:
-    return str(tokens)
-  # Interpolate explicitly: passing `repr_type` as a second exception argument
-  # would leave the "%s" placeholder unformatted in the message.
-  raise ValueError('No such representation type "%s"' % (repr_type,))
-
-
-def io_repr(result):
-  """Make human readable representation of test cases.
-
-  Args:
-    result: A Result namedtuple.
-
-  Returns:
-    A 3-tuple of comma-joined strings, in the order
-    (inputs, target outputs, code outputs).
-  """
-  def render(token_lists, io_type):
-    # One rendered entry per test case, joined with commas.
-    return ','.join(
-        [tokens_list_repr(tokens, io_type, result.base)
-         for tokens in token_lists])
-
-  inputs = render(result.inputs, result.type_in)
-  code_outputs = render(result.code_outputs, result.type_out)
-  target_outputs = render(result.target_outputs, result.type_out)
-  return inputs, target_outputs, code_outputs
-
-
-def make_task_eval_fn(task_manager):
-  """Returns a wrapper that converts an RL task into a GA task.
-
-  Args:
-    task_manager: Is a task manager object from code_tasks.py
-
-  Returns:
-    A function that takes as input a single list of a code chars, and outputs
-    a Result namedtuple instance containing the reward and information about
-    code execution.
-  """
-  def as_list(value):
-    # An IOTuple bundles several values; anything else counts as one value.
-    return list(value) if isinstance(value, misc.IOTuple) else [value]
-
-  def ga_io_type(rl_type):
-    # Everything other than the string type is reported as integer.
-    return IOType.string if rl_type == misc.IOType.string else IOType.integer
-
-  # Wrapper function.
-  def evalbf(bf_chars):
-    scored = task_manager._score_code(''.join(bf_chars))
-    return Result(
-        # The GA reward is the total reward over the episode.
-        reward=sum(scored.episode_rewards),
-        correct=scored.reason == 'correct',
-        inputs=as_list(scored.input_case),
-        code_outputs=as_list(scored.code_output),
-        target_outputs=as_list(scored.correct_output),
-        type_in=ga_io_type(scored.input_type),
-        type_out=ga_io_type(scored.output_type),
-        base=task_manager.task.base)
-
-  return evalbf
-
-
-def debug_str(individual, task_eval_fn):
-  """Returns a one-line, ' | '-separated summary of an evaluated individual.
-
-  Fields, in order: code, inputs, target outputs, code outputs, reward and
-  whether the program was correct.
-  """
-  res = task_eval_fn(individual)
-  input_str, target_output_str, code_output_str = io_repr(res)
-  fields = [
-      ''.join(individual),
-      input_str,
-      target_output_str,
-      code_output_str,
-      str(res.reward),
-      str(res.correct)]
-  return ' | '.join(fields)
-
-
-def mutate_single(code_tokens, mutation_rate):
-  """Mutate a single code string.
-
-  Every mutation keeps the length of the code unchanged: insertions drop a
-  char at one end, deletions add a random char at one end.
-
-  Args:
-    code_tokens: A string/list/Individual of BF code chars. As implemented, it
-        must NOT end with the EOS symbol '_' (see Raises).
-    mutation_rate: Float between 0 and 1 which sets the probability of each char
-        being mutated.
-
-  Returns:
-    An Individual instance containing the mutated code string. Inputs of length
-    0 or 1 are returned as-is, without being wrapped in an Individual.
-
-  Raises:
-    ValueError: If the last element of `code_tokens` is '_'.
-        NOTE(review): the error message text states the opposite requirement
-        ("must end with EOS symbol") -- confirm which one is intended.
-  """
-  if len(code_tokens) <= 1:
-    return code_tokens
-  if code_tokens[-1] == '_':
-    # Do this check to ensure that the code strings have not been corrupted.
-    raise ValueError('`code_tokens` must end with EOS symbol.')
-  else:
-    cs = Individual(code_tokens)
-    # No EOS suffix is split off, so nothing is re-appended after mutation.
-    eos = []
-  mutated = False
-  for pos in range(len(cs)):
-    if random.random() < mutation_rate:
-      mutated = True
-      new_char = GENES[random.randrange(len(GENES))]
-      # `x` picks the mutation kind; each kind covers a quarter of [0, 1).
-      x = random.random()
-      if x < 0.25 and pos != 0 and pos != len(cs) - 1:
-        # Insertion mutation.
-        if random.random() < 0.50:
-          # Shift up.
-          # The last char falls off the end to make room.
-          cs = cs[:pos] + [new_char] + cs[pos:-1]
-        else:
-          # Shift down.
-          # The first char falls off the front to make room.
-          cs = cs[1:pos] + [new_char] + cs[pos:]
-      elif x < 0.50:
-        # Deletion mutation.
-        # Also reached when x < 0.25 at the first or last position.
-        if random.random() < 0.50:
-          # Shift down.
-          cs = cs[:pos] + cs[pos + 1:] + [new_char]
-        else:
-          # Shift up.
-          cs = [new_char] + cs[:pos] + cs[pos + 1:]
-      elif x < 0.75:
-        # Shift rotate mutation (position invariant).
-        if random.random() < 0.50:
-          # Shift down.
-          cs = cs[1:] + [cs[0]]
-        else:
-          # Shift up.
-          cs = [cs[-1]] + cs[:-1]
-      else:
-        # Replacement mutation.
-        cs = cs[:pos] + [new_char] + cs[pos + 1:]
-  # Sanity check: no mutation may change the code length.
-  assert len(cs) + len(eos) == len(code_tokens)
-  if mutated:
-    return Individual(cs + eos)
-  else:
-    return Individual(code_tokens)
-
-
-def crossover(parent1, parent2):
-  """Performs single-point crossover mating between two code strings.
-
-  A random cut position is drawn within the longer parent and the chars after
-  that position are swapped between the parents. When the cut lies beyond the
-  end of the shorter parent, the longer parent's tail is moved onto the
-  shorter one instead.
-
-  Args:
-    parent1: First code string.
-    parent2: Second code string.
-
-  Returns:
-    A 2-tuple of children, i.e. the resulting code strings after swapping.
-  """
-  # On equal lengths parent2 plays the role of the longer parent.
-  if len(parent1) > len(parent2):
-    longer, shorter = parent1, parent2
-  else:
-    longer, shorter = parent2, parent1
-  cut = random.randrange(len(longer))
-  if cut < len(shorter):
-    first = longer[:cut] + shorter[cut:]
-    second = shorter[:cut] + longer[cut:]
-  else:
-    first = longer[:cut]
-    second = shorter + longer[cut:]
-  return Individual(first), Individual(second)
-
-
-def _make_even(n):
-  """Return largest even integer less than or equal to `n`."""
-  # Subtracting the lowest bit clears it, which rounds down to an even number.
-  return n - (n & 1)
-
-
-def mutate_and_crossover(population, mutation_rate, crossover_rate):
-  """Take a generational step over a population.
-
-  Transforms population of parents into population of children (of the same
-  size) via crossover mating and then mutation on the resulting children.
-  Parents are paired up in order (0 with 1, 2 with 3, ...). With an odd
-  population size the last parent has no partner and is carried over as-is.
-
-  Args:
-    population: Parent population. A list of Individual objects. May be empty.
-    mutation_rate: Probability of mutation. See `mutate_single`.
-    crossover_rate: Probability that two parents will mate.
-
-  Returns:
-    Child population. A list of Individual objects.
-  """
-  children = [None] * len(population)
-  for i in xrange(0, _make_even(len(population)), 2):
-    p1 = population[i]
-    p2 = population[i + 1]
-    if random.random() < crossover_rate:
-      p1, p2 = crossover(p1, p2)
-    c1 = mutate_single(p1, mutation_rate)
-    c2 = mutate_single(p2, mutation_rate)
-    children[i] = c1
-    children[i + 1] = c2
-  # Check for emptiness first: indexing `children[-1]` on an empty population
-  # would raise IndexError.
-  if children and children[-1] is None:
-    children[-1] = population[-1]
-  return children
-
-
-def ga_loop(population, cxpb, mutpb, ngen, task_eval_fn, halloffame=None,
-            checkpoint_writer=None):
-  """A bare bones genetic algorithm.
-
-  Similar to chapter 7 of Back, Fogel and Michalewicz, "Evolutionary
-  Computation 1 : Basic Algorithms and Operators", 2000.
-
-  Each generation selects parents by roulette selection, applies crossover and
-  mutation, evaluates the offspring, and then appends the hall-of-fame members
-  to the population (elitism). Every 100 generations progress is logged, a
-  checkpoint is written and the best program is checked for correctness.
-
-  Args:
-    population: A list of individuals.
-    cxpb: The probability of mating two individuals.
-    mutpb: The probability of mutating a gene.
-    ngen: The number of generation. Unlimited if zero.
-    task_eval_fn: A python function which maps an Individual to a Result
-        namedtuple.
-    halloffame: a utils.MaxUniquePriorityQueue object that will be
-        used to aggregate the best individuals found during search.
-        NOTE(review): although this defaults to None, the loop calls
-        `len(halloffame)`, `halloffame.iter_in_order()` and
-        `halloffame.get_max()` unconditionally, so None fails at runtime.
-    checkpoint_writer: (optional) an object that can save and load populations.
-        Needs to have `write`, `load`, and `has_checkpoint` methods. Used to
-        periodically save progress. In event of a restart, the population will
-        be loaded from disk.
-
-  Returns:
-    GaResult namedtuple instance. This contains information about the GA run,
-    including the resulting population, best reward (fitness) obtained, and
-    the best code string found.
-  """
-
-  has_checkpoint = False
-  if checkpoint_writer and checkpoint_writer.has_checkpoint():
-    try:
-      gen, population, halloffame = checkpoint_writer.load()
-    except EOFError:  # Data was corrupted. Start over.
-      pass
-    else:
-      has_checkpoint = True
-      logging.info(
-          'Loaded population from checkpoint. Starting at generation %d', gen)
-
-      # Evaluate the individuals with an invalid fitness
-      invalid_ind = [ind for ind in population if not ind.fitness.valid]
-      for ind in invalid_ind:
-        # The trailing comma makes the fitness a 1-tuple.
-        ind.fitness.values = task_eval_fn(ind).reward,
-      # Hall-of-fame members are re-evaluated unconditionally after a restore.
-      for _, ind in halloffame.iter_in_order():
-        ind.fitness.values = task_eval_fn(ind).reward,
-
-  if not has_checkpoint:
-    # Evaluate the individuals with an invalid fitness
-    invalid_ind = [ind for ind in population if not ind.fitness.valid]
-    for ind in invalid_ind:
-      ind.fitness.values = task_eval_fn(ind).reward,
-
-    if halloffame is not None:
-      for ind in population:
-        halloffame.push(ind.fitness.values, tuple(ind), ind)
-
-    logging.info('Initialized new population.')
-
-    gen = 1
-
-  pop_size = len(population)
-  # Maps program text to reward so repeated programs are not re-executed.
-  program_reward_cache = {} if USE_REWARD_CACHE else None
-
-  # Begin the generational process
-  while ngen == 0 or gen <= ngen:
-    # Select the next generation individuals
-    # Leave room for the hall-of-fame members that are appended below.
-    offspring = roulette_selection(population, pop_size - len(halloffame))
-
-    # Vary the pool of individuals
-    # offspring = varAnd(offspring, toolbox, cxpb, mutpb)
-    offspring = mutate_and_crossover(
-        offspring, mutation_rate=mutpb, crossover_rate=cxpb)
-
-    # Evaluate the individuals with an invalid fitness
-    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
-    for ind in invalid_ind:
-      str_repr = ''.join(ind)
-      if program_reward_cache is not None and str_repr in program_reward_cache:
-        ind.fitness.values = (program_reward_cache[str_repr],)
-      else:
-        eval_result = task_eval_fn(ind)
-        ind.fitness.values = (eval_result.reward,)
-        if program_reward_cache is not None:
-          program_reward_cache[str_repr] = eval_result.reward
-
-    # Replace the current population by the offspring
-    population = list(offspring)
-
-    # Update the hall of fame with the generated individuals
-    if halloffame is not None:
-      for ind in population:
-        halloffame.push(ind.fitness.values, tuple(ind), ind)
-
-    # elitism
-    population.extend([ind for _, ind in halloffame.iter_in_order()])
-
-    if gen % 100 == 0:
-      top_code = '\n'.join([debug_str(ind, task_eval_fn)
-                            for ind in topk(population, k=4)])
-      logging.info('gen: %d\nNPE: %d\n%s\n\n', gen, gen * pop_size, top_code)
-
-      # The best program is passed to the eval function as a joined string
-      # here, not as an Individual.
-      best_code = ''.join(halloffame.get_max()[1])
-      res = task_eval_fn(best_code)
-
-      # Write population and hall-of-fame to disk.
-      if checkpoint_writer:
-        checkpoint_writer.write(gen, population, halloffame)
-
-      # Correctness is only checked on these every-100-generation ticks.
-      if res.correct:
-        logging.info('Solution found:\n%s\nreward = %s\n',
-                     best_code, res.reward)
-        break
-
-    gen += 1
-
-  best_code = ''.join(halloffame.get_max()[1])
-  res = task_eval_fn(best_code)
-
-  # `len(population)` here is the final population, which includes the
-  # hall-of-fame members appended by the elitism step.
-  return GaResult(
-      population=population, best_code=best_code, reward=res.reward,
-      solution_found=res.correct, generations=gen,
-      num_programs=gen * len(population),
-      max_generations=ngen, max_num_programs=ngen * len(population))
-
-
-# Summary of one `ga_loop` run: the final population, the best program and its
-# reward, how many generations/programs were used, whether a solution was
-# found, and the configured generation/program limits.
-GaResult = namedtuple(
-    'GaResult',
-    ['population', 'best_code', 'reward', 'generations', 'num_programs',
-     'solution_found', 'max_generations', 'max_num_programs'])
-
-
-def reward_conversion(reward):
-  """Convert real value into positive value.
-
-  Non-positive rewards map to 0.05; positive rewards are shifted up by 0.05.
-  """
-  return 0.05 if reward <= 0 else reward + 0.05
-
-
-def roulette_selection(population, k):
-  """Select `k` individuals with prob proportional to fitness.
-
-  Each of the `k` selections is independent. Fitness values are first passed
-  through `reward_conversion`, so every individual has a strictly positive
-  selection weight. Selected individuals are returned as fresh Individual
-  copies.
-
-  Warning:
-    The roulette selection by definition cannot be used for minimization
-    or when the fitness can be smaller or equal to 0.
-
-  Args:
-    population: A list of Individual objects to select from.
-    k: The number of individuals to select.
-
-  Returns:
-    A list of selected individuals.
-  """
-  fitnesses = np.asarray(
-      [reward_conversion(ind.fitness.values[0])
-       for ind in population])
-  assert np.all(fitnesses > 0)
-
-  sum_fits = fitnesses.sum()
-  chosen = [None] * k
-  for i in xrange(k):
-    # Spin the wheel: pick a point in [0, sum_fits) and walk the cumulative
-    # fitness until it is passed.
-    u = random.random() * sum_fits
-    sum_ = 0
-    for ind, fitness in zip(population, fitnesses):
-      sum_ += fitness
-      if sum_ > u:
-        chosen[i] = Individual(ind)
-        break
-    # Fall back to the last individual only when nothing was selected. Compare
-    # against None rather than using truthiness: an Individual is a list, so a
-    # legitimately selected empty one would otherwise be overwritten.
-    if chosen[i] is None:
-      chosen[i] = Individual(population[-1])
-
-  return chosen
-
-
-def make_population(make_individual_fn, n):
-  """Builds a population by calling `make_individual_fn` `n` times."""
-  population = []
-  for _ in xrange(n):
-    population.append(make_individual_fn())
-  return population
-
-
-def best(population):
-  """Returns the individual with the largest fitness values, or None if empty.
-
-  Among individuals with equal fitness the earliest one is kept.
-  """
-  champion = None
-  for candidate in population:
-    if (champion is not None and
-        not champion.fitness.values < candidate.fitness.values):
-      # The current champion is at least as fit; keep it.
-      continue
-    champion = candidate
-  return champion
-
-
-def topk(population, k):
-  """Returns the individuals kept by a size-`k` MaxUniquePriorityQueue.
-
-  Individuals are pushed with their fitness values as score and their tuple
-  form as the uniqueness key, then read back via `iter_in_order`.
-  """
-  queue = utils.MaxUniquePriorityQueue(k)
-  for member in population:
-    queue.push(member.fitness.values, tuple(member), member)
-  ranked = []
-  for _, member in queue.iter_in_order():
-    ranked.append(member)
-  return ranked
-
-
-class Fitness(object):
-  """Holds the fitness values of an individual; empty means not evaluated."""
-
-  def __init__(self):
-    # An empty tuple marks the fitness as not yet computed.
-    self.values = ()
-
-  @property
-  def valid(self):
-    """True once `values` is non-empty, i.e. a fitness has been assigned."""
-    return True if self.values else False
-
-
-class Individual(list):
-  """A list of code chars that carries its own Fitness record."""
-
-  def __init__(self, *args):
-    list.__init__(self, *args)
-    # Every new Individual (copies included) starts with an unset fitness.
-    self.fitness = Fitness()
-
-
-def random_individual(genome_size):
-  """Returns a zero-argument factory for random Individuals.
-
-  Each call of the factory draws `genome_size` chars from GENES with
-  `np.random.choice`.
-  """
-  def make_individual():
-    genes = np.random.choice(GENES, genome_size)
-    return Individual(genes.tolist())
-  return make_individual
--- a/research/brain_coder/single_task/ga_train.py
+++ b/research/brain_coder/single_task/ga_train.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Genetic algorithm for BF tasks.
-
-Also contains the uniform random search algorithm.
-
-Inspired by https://github.com/primaryobjects/AI-Programmer.
-GA function code borrowed from https://github.com/DEAP/deap.
-"""
-
-import cPickle
-import os
-import sys
-from time import sleep
-
-from absl import flags
-from absl import logging
-import numpy as np
-from six.moves import xrange
-import tensorflow as tf
-
-from common import utils  # brain coder
-from single_task import data  # brain coder
-from single_task import defaults  # brain coder
-from single_task import ga_lib  # brain coder
-from single_task import results_lib  # brain coder
-
-FLAGS = flags.FLAGS
-
-
-def define_tuner_hparam_space(hparam_space_type):
-  """Define tunable hparams for grid search."""
-  if hparam_space_type != 'ga':
-    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
-  return {
-      'population_size': [10, 25, 50, 100, 500],
-      'crossover_rate': [0.2, 0.5, 0.7, 0.9, 0.95],
-      'mutation_rate': [0.01, 0.03, 0.05, 0.1, 0.15]}
-
-
-def write_hparams_to_config(config, hparams, hparam_space_type):
-  """Write hparams given by the tuner into the Config object."""
-  if hparam_space_type != 'ga':
-    raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
-  config.batch_size = hparams.population_size
-  config.agent.crossover_rate = hparams.crossover_rate
-  config.agent.mutation_rate = hparams.mutation_rate
-
-
-class CheckpointWriter(object):
-  """Manages loading and saving GA populations to disk.
-
-  This object is used by the genetic algorithm to save progress periodically
-  so that a recent population can be loaded from disk in the event of a restart.
-  """
-
-  def __init__(self, checkpoint_dir, population_size):
-    self.checkpoint_file = os.path.join(checkpoint_dir, 'checkpoint.pickle')
-    self.population_size = population_size
-
-  def write(self, gen, population, halloffame):
-    """Write GA state to disk.
-
-    Overwrites previous saved state.
-
-    Args:
-      gen: Generation number.
-      population: List of Individual objects.
-      halloffame: Hall-of-fame buffer. Typically a priority queue.
-    """
-    raw = cPickle.dumps((gen, population, halloffame))
-    with tf.gfile.FastGFile(self.checkpoint_file, 'w') as f:
-      f.write(raw)
-
-  def load(self):
-    """Loads GA state from disk.
-
-    Loads whatever is on disk, which will be whatever the most recent call
-    to `write` wrote.
-
-    Returns:
-      gen: Generation number.
-      population: List of Individual objects.
-      halloffame: Hall-of-fame buffer. Typically a priority queue.
-    """
-    with tf.gfile.FastGFile(self.checkpoint_file, 'r') as f:
-      raw = f.read()
-    objs = cPickle.loads(raw)
-    # Validate data.
-    assert isinstance(objs, tuple) and len(objs) == 3, (
-        'Expecting a 3-tuple, but got %s instead.' % (objs,))
-    gen, population, halloffame = objs
-    assert isinstance(gen, int), (
-        'Expecting `gen` to be an integer, got %s' % (gen,))
-    assert (
-        isinstance(population, list)
-        and len(population) == self.population_size
-    ), (
-        'Expecting `population` to be a list with size %d, got %s'
-        % (self.population_size, population))
-    assert halloffame is None or len(halloffame) == 2, (
-        'Expecting hall-of-fame object to have length two, got length %d'
-        % len(halloffame))
-    logging.info('Loaded pop from checkpoint file: "%s".',
-                 self.checkpoint_file)
-    return gen, population, halloffame
-
-  def has_checkpoint(self):
-    """Checks if a checkpoint exists on disk, and if so returns True."""
-    return tf.gfile.Exists(self.checkpoint_file)
-
-
-def run_training(config=None, tuner=None, logdir=None, trial_name=None,  # pylint: disable=unused-argument
-                 is_chief=True):
-  """Do all training runs.
-
-  This is the top level training function for policy gradient based models.
-  Run this from the main function.
-
-  Args:
-    config: config_lib.Config instance containing global config (agent and
-        environment hparams). If None, config will be parsed from FLAGS.config.
-    tuner: (unused) A tuner instance. Leave as None if not tuning.
-    logdir: Parent directory where all data from all runs will be written. If
-        None, FLAGS.logdir will be used.
-    trial_name: (unused) If tuning, set this to a unique string that identifies
-        this trial. If `tuner` is not None, this also must be set.
-    is_chief: True if this worker is the chief.
-
-  Returns:
-    List of results dicts which were written to disk. Each training run gets a
-    results dict. Results dict contains metrics, i.e. (name, value) pairs which
-    give information about the training run.
-
-  Raises:
-    ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
-    ValueError: If results dicts read from disk contain invalid data.
-  """
-  if not config:
-    # If custom config is not given, get it from flags.
-    config = defaults.default_config_with_updates(FLAGS.config)
-  if not logdir:
-    logdir = FLAGS.logdir
-
-  if FLAGS.num_repetitions % FLAGS.num_workers != 0:
-    raise ValueError('Number of workers must divide number of repetitions')
-  num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
-  logging.info('Running %d reps globally.', FLAGS.num_repetitions)
-  logging.info('This worker will run %d local reps.', num_local_reps)
-  if FLAGS.max_npe:
-    max_generations = FLAGS.max_npe // config.batch_size
-    logging.info('Max samples per rep: %d', FLAGS.max_npe)
-    logging.info('Max generations per rep: %d', max_generations)
-  else:
-    max_generations = sys.maxint
-    logging.info('Running unlimited generations.')
-
-  assert FLAGS.num_workers > 0
-  logging.info('Starting experiment. Directory: "%s"', logdir)
-  results = results_lib.Results(logdir, FLAGS.task_id)
-  local_results_list = results.read_this_shard()
-  if local_results_list:
-    if local_results_list[0]['max_npe'] != FLAGS.max_npe:
-      raise ValueError(
-          'Cannot resume training. Max-NPE changed. Was %s, now %s',
-          local_results_list[0]['max_npe'], FLAGS.max_npe)
-    if local_results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
-      raise ValueError(
-          'Cannot resume training. Number of repetitions changed. Was %s, '
-          'now %s',
-          local_results_list[0]['max_global_repetitions'],
-          FLAGS.num_repetitions)
-  start_rep = len(local_results_list)
-
-  for rep in xrange(start_rep, num_local_reps):
-    global_rep = num_local_reps * FLAGS.task_id + rep
-    logging.info(
-        'Starting repetition: Rep = %d. (global rep = %d)',
-        rep, global_rep)
-
-    # Save data for each rep, like checkpoints, goes into separate folders.
-    run_dir = os.path.join(logdir, 'run_%d' % global_rep)
-
-    if not tf.gfile.IsDirectory(run_dir):
-      tf.gfile.MakeDirs(run_dir)
-    checkpoint_writer = CheckpointWriter(run_dir,
-                                         population_size=config.batch_size)
-
-    data_manager = data.DataManager(config, run_number=global_rep)
-    task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)
-
-    if config.agent.algorithm == 'rand':
-      logging.info('Running random search.')
-      assert FLAGS.max_npe
-      result = run_random_search(
-          FLAGS.max_npe, run_dir, task_eval_fn, config.timestep_limit)
-    else:
-      assert config.agent.algorithm == 'ga'
-      logging.info('Running genetic algorithm.')
-      pop = ga_lib.make_population(
-          ga_lib.random_individual(config.timestep_limit),
-          n=config.batch_size)
-      hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame.
-      result = ga_lib.ga_loop(
-          pop,
-          cxpb=config.agent.crossover_rate, mutpb=config.agent.mutation_rate,
-          task_eval_fn=task_eval_fn,
-          ngen=max_generations, halloffame=hof,
-          checkpoint_writer=checkpoint_writer)
-
-    logging.info('Finished rep. Num gens: %d', result.generations)
-
-    results_dict = {
-        'max_npe': FLAGS.max_npe,
-        'batch_size': config.batch_size,
-        'max_batches': FLAGS.max_npe // config.batch_size,
-        'npe': result.num_programs,
-        'max_global_repetitions': FLAGS.num_repetitions,
-        'max_local_repetitions': num_local_reps,
-        'code_solution': result.best_code if result.solution_found else '',
-        'best_reward': result.reward,
-        'num_batches': result.generations,
-        'found_solution': result.solution_found,
-        'task': data_manager.task_name,
-        'global_rep': global_rep}
-    logging.info('results_dict: %s', results_dict)
-    results.append(results_dict)
-
-  if is_chief:
-    logging.info(
-        'Worker is chief. Waiting for all workers to finish so that results '
-        'can be reported to the tuner.')
-
-    global_results_list, shard_stats = results.read_all(
-        num_shards=FLAGS.num_workers)
-    while not all(s.finished for s in shard_stats):
-      logging.info(
-          'Still waiting on these workers: %s',
-          ', '.join(
-              ['%d (%d reps left)'
-               % (i, s.max_local_reps - s.num_local_reps_completed)
-               for i, s in enumerate(shard_stats)
-               if not s.finished]))
-      sleep(60)
-      global_results_list, shard_stats = results.read_all(
-          num_shards=FLAGS.num_workers)
-
-    logging.info(
-        '%d results obtained. Chief worker is exiting the experiment.',
-        len(global_results_list))
-
-    return global_results_list
-
-
-def run_random_search(max_num_programs, checkpoint_dir, task_eval_fn,
-                      timestep_limit):
-  """Run uniform random search routine.
-
-  Randomly samples programs from a uniform distribution until either a valid
-  program is found, or the maximum NPE is reached. Results are written to disk
-  and returned.
-
-  Args:
-    max_num_programs: Maximum NPE (number of programs executed). If no solution
-        is found after this many programs are tried, the run is stopped and
-        considered a failure.
-    checkpoint_dir: Where to save state during the run.
-    task_eval_fn: Function that maps code string to result containing total
-        reward and info about success.
-    timestep_limit: Maximum length of code strings.
-
-  Returns:
-    ga_lib.GaResult namedtuple instance. This contains the best code and highest
-    reward found.
-  """
-  checkpoint_file = os.path.join(checkpoint_dir, 'random_search.txt')
-  num_programs_seen = 0
-  found_solution = False
-  best_code = ''
-  best_reward = 0.0
-  if tf.gfile.Exists(checkpoint_file):
-    try:
-      with tf.gfile.FastGFile(checkpoint_file, 'r') as f:
-        lines = list(f)
-        num_programs_seen = int(lines[0])
-        found_solution = bool(int(lines[1]))
-        if found_solution:
-          best_code = lines[2]
-          best_reward = float(lines[3])
-    except:  # pylint: disable=bare-except
-      pass
-
-  while not found_solution and num_programs_seen < max_num_programs:
-    if num_programs_seen % 1000 == 0:
-      logging.info('num_programs_seen = %d', num_programs_seen)
-      with tf.gfile.FastGFile(checkpoint_file, 'w') as f:
-        f.write(str(num_programs_seen) + '\n')
-        f.write(str(int(found_solution)) + '\n')
-
-    code = np.random.choice(ga_lib.GENES, timestep_limit).tolist()
-    res = task_eval_fn(code)
-    found_solution = res.correct
-    num_programs_seen += 1
-
-    if found_solution:
-      best_code = ''.join(code)
-      best_reward = res.reward
-
-  logging.info('num_programs_seen = %d', num_programs_seen)
-  logging.info('found solution: %s', found_solution)
-  with tf.gfile.FastGFile(checkpoint_file, 'w') as f:
-    f.write(str(num_programs_seen) + '\n')
-    f.write(str(int(found_solution)) + '\n')
-    if found_solution:
-      f.write(best_code + '\n')
-      f.write(str(best_reward) + '\n')
-
-  return ga_lib.GaResult(
-      population=[], best_code=best_code, reward=best_reward,
-      solution_found=found_solution, generations=num_programs_seen,
-      num_programs=num_programs_seen, max_generations=max_num_programs,
-      max_num_programs=max_num_programs)
--- a/research/brain_coder/single_task/ga_train_test.py
+++ b/research/brain_coder/single_task/ga_train_test.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-"""Tests for ga_train.
-
-Tests that ga runs for a few generations without crashing.
-"""
-
-from absl import flags
-import tensorflow as tf
-
-from single_task import defaults  # brain coder
-from single_task import run  # brain coder
-
-FLAGS = flags.FLAGS
-
-
-class GaTest(tf.test.TestCase):
-
-  def RunTrainingSteps(self, config_string, num_steps=10):
-    """Run a few training steps with the given config.
-
-    Just check that nothing crashes.
-
-    Args:
-      config_string: Config encoded in a string. See
-          $REPO_PATH/common/config_lib.py
-      num_steps: Number of training steps to run. Defaults to 10.
-    """
-    config = defaults.default_config_with_updates(config_string)
-    FLAGS.max_npe = num_steps * config.batch_size
-    FLAGS.logdir = tf.test.get_temp_dir()
-    FLAGS.config = config_string
-    run.main(None)
-
-  def testGeneticAlgorithm(self):
-    self.RunTrainingSteps(
-        'env=c(task="reverse"),'
-        'agent=c(algorithm="ga"),'
-        'timestep_limit=40,batch_size=64')
-
-  def testUniformRandomSearch(self):
-    self.RunTrainingSteps(
-        'env=c(task="reverse"),'
-        'agent=c(algorithm="rand"),'
-        'timestep_limit=40,batch_size=64')
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/research/brain_coder/single_task/launch_training.sh
+++ b/research/brain_coder/single_task/launch_training.sh
-#!/bin/bash
-# Launches training jobs.
-# Modify this file to launch workers with your prefered cloud API.
-# The following implementation runs each worker as a subprocess on the local
-# machine.
-
-MODELS_DIR="/tmp/models"
-
-# Get command line options.
-OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_workers:,num_ps:,max_npe:,num_repetitions:,stop_on_success:" -- "$@")
-if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
-
-eval set -- "$OPTS"
-
-JOB_NAME=""           # Name of the process and the logs directory.
-CONFIG=""             # Model and environment hparams.
-# NUM_WORKERS: Number of workers to launch for this training job. If using
-# neural networks, each worker will be 1 replica.
-NUM_WORKERS=1
-# NUM_PS: Number of parameter servers to launch for this training job. Only set
-# this if using neural networks. For 1 worker, no parameter servers are needed.
-# For more than 1 worker, at least 1 parameter server is needed to store the
-# global model.
-NUM_PS=0
-# MAX_NPE: Maximum number of programs executed. Training will quit once this
-# threshold is reached. If 0, the threshold is infinite.
-MAX_NPE=0
-NUM_REPETITIONS=1     # How many times to run this experiment.
-STOP_ON_SUCCESS=true  # Whether to halt training when a solution is found.
-
-# Parse options into variables.
-while true; do
-  case "$1" in
-    --job_name ) JOB_NAME="$2"; shift; shift ;;
-    --config ) CONFIG="$2"; shift; shift ;;
-    --num_workers ) NUM_WORKERS="$2"; shift; shift ;;
-    --num_ps ) NUM_PS="$2"; shift; shift ;;
-    --max_npe ) MAX_NPE="$2"; shift; shift ;;
-    --num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
-    --stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
-    -- ) shift; break ;;
-    * ) break ;;
-  esac
-done
-
-# Launch jobs.
-# TODO: multi-worker RL training
-
-LOGDIR="$MODELS_DIR/$JOB_NAME"
-mkdir -p $LOGDIR
-
-BIN_DIR="bazel-bin/single_task"
-for (( i=0; i<NUM_WORKERS; i++))
-do
-  # Expecting run.par to be built.
-  $BIN_DIR/run.par \
-      --alsologtostderr \
-      --config="$CONFIG" \
-      --logdir="$LOGDIR" \
-      --max_npe="$MAX_NPE" \
-      --num_repetitions="$NUM_REPETITIONS" \
-      --stop_on_success="$STOP_ON_SUCCESS" \
-      --task_id="$i" \
-      --num_workers="$NUM_WORKERS" \
-      --summary_tasks=1 \
-      2> "$LOGDIR/task_$i.log" &  # Run as subprocess
-  echo "Launched task $i. Logs: $LOGDIR/task_$i.log"
-done
-
-
-# Use "pidof run.par" to find jobs.
-# Kill with "pkill run.par"