Commit a00f7e2b authored by Dan Abolafia

Open source release of Brain Coder.

parent 54babf62
@@ -5,6 +5,7 @@
/research/attention_ocr/ @alexgorban
/research/audioset/ @plakal @dpwe
/research/autoencoders/ @snurkabill
/research/brain_coder/ @danabo
/research/cognitive_mapping_and_planning/ @s-gupta
/research/compression/ @nmjohn
/research/delf/ @andrefaraujo
...
@@ -20,6 +20,7 @@ installation](https://www.tensorflow.org/install).
- [audioset](audioset): Models and supporting code for use with
  [AudioSet](http://g.co/audioset).
- [autoencoder](autoencoder): various autoencoders.
- [brain_coder](brain_coder): Program synthesis with reinforcement learning.
- [cognitive_mapping_and_planning](cognitive_mapping_and_planning):
  implementation of a spatial memory based mapping and planning architecture
  for visual navigation.
...
# Brain Coder
*Authors: Daniel Abolafia, Quoc Le, Mohammad Norouzi*

Brain Coder is an experimental environment for program synthesis. We provide code that reproduces the results from our recent paper [Code Synthesis with Priority Queue Training](https://openreview.net/forum?id=r1AoGNlC-). See single_task/README.md for details on how to build and reproduce those experiments.
## Installation
First install dependencies separately:
* [bazel](https://docs.bazel.build/versions/master/install.html)
* [TensorFlow](https://www.tensorflow.org/install/)
* [scipy](https://www.scipy.org/install.html)
* [absl-py](https://github.com/abseil/abseil-py)
Note: even if you already have these dependencies installed, make sure they are
up-to-date to avoid unnecessary debugging.
## Building
Use bazel from the top-level repo directory.
For example:
```bash
bazel build single_task:run
```
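
After a successful build, Bazel places the runnable binary under `bazel-bin`. As a rough, hypothetical sketch (the real experiment flags are documented in single_task/README.md, so treat this invocation as illustrative only):

```bash
# Hypothetical smoke test of the built target; see single_task/README.md
# for the actual experiment flags.
./bazel-bin/single_task/run --help
```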
View README.md files in subdirectories for more details.
git_repository(
name = "subpar",
remote = "https://github.com/google/subpar",
tag = "1.0.0",
)
licenses(["notice"])
package(default_visibility = [
"//:__subpackages__",
])
py_library(
name = "bf",
srcs = ["bf.py"],
)
py_test(
name = "bf_test",
srcs = ["bf_test.py"],
deps = [
":bf",
# tensorflow dep
],
)
py_library(
name = "config_lib",
srcs = ["config_lib.py"],
)
py_test(
name = "config_lib_test",
srcs = ["config_lib_test.py"],
deps = [
":config_lib",
# tensorflow dep
],
)
py_library(
name = "reward",
srcs = ["reward.py"],
)
py_test(
name = "reward_test",
srcs = ["reward_test.py"],
deps = [
":reward",
# numpy dep
# tensorflow dep
],
)
py_library(
name = "rollout",
srcs = ["rollout.py"],
deps = [
":utils",
# numpy dep
# scipy dep
],
)
py_test(
name = "rollout_test",
srcs = ["rollout_test.py"],
deps = [
":rollout",
# numpy dep
# tensorflow dep
],
)
py_library(
name = "schedules",
srcs = ["schedules.py"],
deps = [":config_lib"],
)
py_test(
name = "schedules_test",
srcs = ["schedules_test.py"],
deps = [
":config_lib",
":schedules",
# numpy dep
# tensorflow dep
],
)
py_library(
name = "utils",
srcs = ["utils.py"],
deps = [
# file dep
# absl dep /logging
# numpy dep
# tensorflow dep
],
)
py_test(
name = "utils_test",
srcs = ["utils_test.py"],
deps = [
":utils",
# numpy dep
# tensorflow dep
],
)
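
Assuming this BUILD file defines the `common` package at the root of the Brain Coder workspace (the Python sources below import it as `from common import ...`), the unit tests declared above can be run with Bazel. For example:

```bash
# Illustrative only; target names come from the py_test rules above.
bazel test common:bf_test common:config_lib_test common:reward_test
```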
"""BrainF**k interpreter.

Language info: https://en.wikipedia.org/wiki/Brainfuck

Based on public implementation:
https://github.com/pocmo/Python-Brainfuck/blob/master/brainfuck.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import time
EvalResult = namedtuple(
'EvalResult', ['output', 'success', 'failure_reason', 'steps', 'time',
'memory', 'program_trace'])
ExecutionSnapshot = namedtuple(
'ExecutionSnapshot',
['codeptr', 'codechar', 'memptr', 'memval', 'memory', 'next_input',
'output_buffer'])
class Status(object):
SUCCESS = 'success'
TIMEOUT = 'timeout'
STEP_LIMIT = 'step-limit'
SYNTAX_ERROR = 'syntax-error'
CHARS = INT_TO_CHAR = ['>', '<', '+', '-', '[', ']', '.', ',']
CHAR_TO_INT = dict([(c, i) for i, c in enumerate(INT_TO_CHAR)])
class LookAheadIterator(object):
"""Same API as Python iterator, with additional peek method."""
def __init__(self, iterable):
self._it = iter(iterable)
self._current_element = None
self._done = False
self._preload_next()
def _preload_next(self):
try:
      self._current_element = next(self._it)
except StopIteration:
self._done = True
def next(self):
if self._done:
raise StopIteration
element = self._current_element
self._preload_next()
return element
def peek(self, default_value=None):
if self._done:
if default_value is None:
raise StopIteration
return default_value
return self._current_element
def buildbracemap(code):
"""Build jump map.
Args:
    code: List or string of BF chars.
Returns:
bracemap: dict mapping open and close brace positions in the code to their
destination jumps. Specifically, positions of matching open/close braces
if they exist.
correct_syntax: True if all braces match. False if there are unmatched
braces in the code. Even if there are unmatched braces, a bracemap will
be built, and unmatched braces will map to themselves.
"""
bracestack, bracemap = [], {}
correct_syntax = True
for position, command in enumerate(code):
if command == '[':
bracestack.append(position)
if command == ']':
if not bracestack: # Unmatched closing brace.
bracemap[position] = position # Don't jump to any position.
correct_syntax = False
continue
start = bracestack.pop()
bracemap[start] = position
bracemap[position] = start
if bracestack: # Unmatched opening braces.
for pos in bracestack:
bracemap[pos] = pos # Don't jump to any position.
correct_syntax = False
return bracemap, correct_syntax
def evaluate(code, input_buffer=None, init_memory=None, base=256, timeout=1.0,
max_steps=None, require_correct_syntax=True, output_memory=False,
debug=False):
"""Execute BF code.
Args:
code: String or list of BF characters. Any character not in CHARS will be
ignored.
input_buffer: A list of ints which will be used as the program's input
stream. Each read op "," will read an int from this list. 0's will be
read once the end of the list is reached, or if no input buffer is
given.
init_memory: A list of ints. Memory for first k positions will be
initialized to this list (where k = len(init_memory)). Memory positions
are initialized to 0 by default.
base: Integer base for the memory. When a memory value is incremented to
`base` it will overflow to 0. When a memory value is decremented to -1
it will underflow to `base` - 1.
timeout: Time limit for program execution in seconds. Set to None to
disable.
max_steps: Execution step limit. An execution step is the execution of one
operation (code character), even if that op has been executed before.
Execution exits when this many steps are reached. Set to None to
disable. Disabled by default.
require_correct_syntax: If True, unmatched braces will cause `evaluate` to
return without executing the code. The failure reason will be
`Status.SYNTAX_ERROR`. If False, unmatched braces are ignored
and execution will continue.
output_memory: If True, the state of the memory at the end of execution is
returned.
debug: If True, then a full program trace will be returned.
Returns:
EvalResult namedtuple containing
output: List of ints which were written out by the program with the "."
operation.
success: Boolean. Whether execution completed successfully.
failure_reason: One of the attributes of `Status`. Gives extra info
about why execution was not successful.
steps: Number of execution steps the program ran for.
time: Amount of time in seconds the program ran for.
memory: If `output_memory` is True, a list of memory cells up to the last
one written to. otherwise, None.
"""
input_iter = (
LookAheadIterator(input_buffer) if input_buffer is not None
else LookAheadIterator([]))
# Null memory value. This is the value of an empty memory. Also the value
# returned by the read operation when the input buffer is empty, or the
# end of the buffer is reached.
null_value = 0
code = list(code)
bracemap, correct_syntax = buildbracemap(code) # will modify code list
if require_correct_syntax and not correct_syntax:
return EvalResult([], False, Status.SYNTAX_ERROR, 0, 0.0,
[] if output_memory else None, [] if debug else None)
output_buffer = []
codeptr, cellptr = 0, 0
cells = list(init_memory) if init_memory else [0]
program_trace = [] if debug else None
success = True
reason = Status.SUCCESS
start_time = time.time()
steps = 0
while codeptr < len(code):
command = code[codeptr]
if debug:
# Add step to program trace.
program_trace.append(ExecutionSnapshot(
codeptr=codeptr, codechar=command, memptr=cellptr,
memval=cells[cellptr], memory=list(cells),
next_input=input_iter.peek(null_value),
output_buffer=list(output_buffer)))
if command == '>':
cellptr += 1
if cellptr == len(cells): cells.append(null_value)
if command == '<':
cellptr = 0 if cellptr <= 0 else cellptr - 1
if command == '+':
cells[cellptr] = cells[cellptr] + 1 if cells[cellptr] < (base - 1) else 0
if command == '-':
cells[cellptr] = cells[cellptr] - 1 if cells[cellptr] > 0 else (base - 1)
if command == '[' and cells[cellptr] == 0: codeptr = bracemap[codeptr]
if command == ']' and cells[cellptr] != 0: codeptr = bracemap[codeptr]
if command == '.': output_buffer.append(cells[cellptr])
if command == ',': cells[cellptr] = next(input_iter, null_value)
codeptr += 1
steps += 1
if timeout is not None and time.time() - start_time > timeout:
success = False
reason = Status.TIMEOUT
break
if max_steps is not None and steps >= max_steps:
success = False
reason = Status.STEP_LIMIT
break
if debug:
# Add step to program trace.
command = code[codeptr] if codeptr < len(code) else ''
program_trace.append(ExecutionSnapshot(
codeptr=codeptr, codechar=command, memptr=cellptr,
memval=cells[cellptr], memory=list(cells),
next_input=input_iter.peek(null_value),
output_buffer=list(output_buffer)))
return EvalResult(
output=output_buffer,
success=success,
failure_reason=reason,
steps=steps,
time=time.time() - start_time,
memory=cells if output_memory else None,
program_trace=program_trace)
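
Below is a minimal usage sketch for the interpreter defined above (not part of the original file). The program `'+++.'` increments the first memory cell three times and writes it out; the expected values mirror the tests that follow.

```python
from common import bf  # brain coder

result = bf.evaluate('+++.', input_buffer=[], max_steps=1000)
print(result.output)          # [3]
print(result.success)         # True
print(result.failure_reason)  # 'success', i.e. bf.Status.SUCCESS
```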
"""Tests for common.bf."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from common import bf  # brain coder

class BfTest(tf.test.TestCase):
def assertCorrectOutput(self, target_output, eval_result):
self.assertEqual(target_output, eval_result.output)
self.assertTrue(eval_result.success)
self.assertEqual(bf.Status.SUCCESS, eval_result.failure_reason)
def testBasicOps(self):
self.assertCorrectOutput(
[3, 1, 2],
bf.evaluate('+++.--.+.'))
self.assertCorrectOutput(
[1, 1, 2],
bf.evaluate('+.<.>++.'))
self.assertCorrectOutput(
[0],
bf.evaluate('+,.'))
self.assertCorrectOutput(
[ord(char) for char in 'Hello World!\n'],
bf.evaluate(
'>++++++++[-<+++++++++>]<.>>+>-[+]++>++>+++[>[->+++<<+++>]<<]>-----'
'.>->+++..+++.>-.<<+[>[+>+]>>]<--------------.>>.+++.------.-------'
'-.>+.>+.'))
def testBase(self):
self.assertCorrectOutput(
[1, 4],
bf.evaluate('+.--.', base=5, input_buffer=[]))
def testInputBuffer(self):
self.assertCorrectOutput(
[2, 3, 4],
bf.evaluate('>,[>,]<[.<]', input_buffer=[4, 3, 2]))
def testBadChars(self):
self.assertCorrectOutput(
[2, 3, 4],
bf.evaluate('>,[>,]hello<world[.<]comments',
input_buffer=[4, 3, 2]))
def testUnmatchedBraces(self):
self.assertCorrectOutput(
[3, 6, 1],
bf.evaluate('+++.]]]]>----.[[[[[>+.',
input_buffer=[],
base=10,
require_correct_syntax=False))
eval_result = bf.evaluate(
'+++.]]]]>----.[[[[[>+.',
input_buffer=[],
base=10,
require_correct_syntax=True)
self.assertEqual([], eval_result.output)
self.assertFalse(eval_result.success)
self.assertEqual(bf.Status.SYNTAX_ERROR,
eval_result.failure_reason)
def testTimeout(self):
er = bf.evaluate('+.[].', base=5, input_buffer=[], timeout=0.1)
self.assertEqual(
([1], False, bf.Status.TIMEOUT),
(er.output, er.success, er.failure_reason))
self.assertTrue(0.07 < er.time < 0.21)
er = bf.evaluate('+.[-].', base=5, input_buffer=[], timeout=0.1)
self.assertEqual(
([1, 0], True, bf.Status.SUCCESS),
(er.output, er.success, er.failure_reason))
self.assertTrue(er.time < 0.15)
def testMaxSteps(self):
er = bf.evaluate('+.[].', base=5, input_buffer=[], timeout=None,
max_steps=100)
self.assertEqual(
([1], False, bf.Status.STEP_LIMIT, 100),
(er.output, er.success, er.failure_reason, er.steps))
er = bf.evaluate('+.[-].', base=5, input_buffer=[], timeout=None,
max_steps=100)
self.assertEqual(
([1, 0], True, bf.Status.SUCCESS),
(er.output, er.success, er.failure_reason))
self.assertTrue(er.steps < 100)
def testOutputMemory(self):
er = bf.evaluate('+>++>+++>++++.', base=256, input_buffer=[],
output_memory=True)
self.assertEqual(
([4], True, bf.Status.SUCCESS),
(er.output, er.success, er.failure_reason))
self.assertEqual([1, 2, 3, 4], er.memory)
def testProgramTrace(self):
es = bf.ExecutionSnapshot
er = bf.evaluate(',[.>,].', base=256, input_buffer=[2, 1], debug=True)
self.assertEqual(
[es(codeptr=0, codechar=',', memptr=0, memval=0, memory=[0],
next_input=2, output_buffer=[]),
es(codeptr=1, codechar='[', memptr=0, memval=2, memory=[2],
next_input=1, output_buffer=[]),
es(codeptr=2, codechar='.', memptr=0, memval=2, memory=[2],
next_input=1, output_buffer=[]),
es(codeptr=3, codechar='>', memptr=0, memval=2, memory=[2],
next_input=1, output_buffer=[2]),
es(codeptr=4, codechar=',', memptr=1, memval=0, memory=[2, 0],
next_input=1, output_buffer=[2]),
es(codeptr=5, codechar=']', memptr=1, memval=1, memory=[2, 1],
next_input=0, output_buffer=[2]),
es(codeptr=2, codechar='.', memptr=1, memval=1, memory=[2, 1],
next_input=0, output_buffer=[2]),
es(codeptr=3, codechar='>', memptr=1, memval=1, memory=[2, 1],
next_input=0, output_buffer=[2, 1]),
es(codeptr=4, codechar=',', memptr=2, memval=0, memory=[2, 1, 0],
next_input=0, output_buffer=[2, 1]),
es(codeptr=5, codechar=']', memptr=2, memval=0, memory=[2, 1, 0],
next_input=0, output_buffer=[2, 1]),
es(codeptr=6, codechar='.', memptr=2, memval=0, memory=[2, 1, 0],
next_input=0, output_buffer=[2, 1]),
es(codeptr=7, codechar='', memptr=2, memval=0, memory=[2, 1, 0],
next_input=0, output_buffer=[2, 1, 0])],
er.program_trace)
if __name__ == '__main__':
tf.test.main()
"""Objects for storing configuration and passing config into binaries.

Config class stores settings and hyperparameters for models, data, and anything
else that may be specific to a particular run.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import ast
import itertools
class Config(dict):
"""Stores model configuration, hyperparameters, or dataset parameters."""
def __getattr__(self, attr):
return self[attr]
def __setattr__(self, attr, value):
self[attr] = value
def pretty_str(self, new_lines=True, indent=2, final_indent=0):
prefix = (' ' * indent) if new_lines else ''
final_prefix = (' ' * final_indent) if new_lines else ''
kv = ['%s%s=%s' % (prefix, k,
(repr(v) if not isinstance(v, Config)
else v.pretty_str(new_lines=new_lines,
indent=indent+2,
final_indent=indent)))
for k, v in self.items()]
if new_lines:
return 'Config(\n%s\n%s)' % (',\n'.join(kv), final_prefix)
else:
return 'Config(%s)' % ', '.join(kv)
def _update_iterator(self, *args, **kwargs):
"""Convert mixed input into an iterator over (key, value) tuples.
Follows the dict.update call signature.
Args:
*args: (Optional) Pass a dict or iterable of (key, value) 2-tuples as
an unnamed argument. Only one unnamed argument allowed.
**kwargs: (Optional) Pass (key, value) pairs as named arguments, where the
argument name is the key and the argument value is the value.
Returns:
An iterator over (key, value) tuples given in the input.
Raises:
TypeError: If more than one unnamed argument is given.
"""
if len(args) > 1:
      raise TypeError('Expected at most 1 unnamed argument, got %d'
% len(args))
obj = args[0] if args else dict()
if isinstance(obj, dict):
return itertools.chain(obj.items(), kwargs.items())
# Assume obj is an iterable of 2-tuples.
return itertools.chain(obj, kwargs.items())
def make_default(self, keys=None):
"""Convert OneOf objects into their default configs.
Recursively calls into Config objects.
Args:
keys: Iterable of key names to check. If None, all keys in self will be
used.
"""
if keys is None:
keys = self.keys()
for k in keys:
# Replace OneOf with its default value.
if isinstance(self[k], OneOf):
self[k] = self[k].default()
# Recursively call into all Config objects, even those that came from
# OneOf objects in the previous code line (for nested OneOf objects).
if isinstance(self[k], Config):
self[k].make_default()
def update(self, *args, **kwargs):
"""Same as dict.update except nested Config objects are updated.
Args:
*args: (Optional) Pass a dict or list of (key, value) 2-tuples as unnamed
argument.
**kwargs: (Optional) Pass (key, value) pairs as named arguments, where the
argument name is the key and the argument value is the value.
"""
key_set = set(self.keys())
for k, v in self._update_iterator(*args, **kwargs):
if k in key_set:
key_set.remove(k) # This key is updated so exclude from make_default.
if k in self and isinstance(self[k], Config) and isinstance(v, dict):
self[k].update(v)
elif k in self and isinstance(self[k], OneOf) and isinstance(v, dict):
# Replace OneOf with the chosen config.
self[k] = self[k].update(v)
else:
self[k] = v
self.make_default(key_set)
def strict_update(self, *args, **kwargs):
"""Same as Config.update except keys and types are not allowed to change.
If a given key is not already in this instance, an exception is raised. If a
given value does not have the same type as the existing value for the same
key, an exception is raised. Use this method to catch config mistakes.
Args:
*args: (Optional) Pass a dict or list of (key, value) 2-tuples as unnamed
argument.
**kwargs: (Optional) Pass (key, value) pairs as named arguments, where the
argument name is the key and the argument value is the value.
Raises:
TypeError: If more than one unnamed argument is given.
TypeError: If new value type does not match existing type.
KeyError: If a given key is not already defined in this instance.
"""
key_set = set(self.keys())
for k, v in self._update_iterator(*args, **kwargs):
if k in self:
key_set.remove(k) # This key is updated so exclude from make_default.
if isinstance(self[k], Config):
if not isinstance(v, dict):
raise TypeError('dict required for Config value, got %s' % type(v))
self[k].strict_update(v)
elif isinstance(self[k], OneOf):
if not isinstance(v, dict):
raise TypeError('dict required for OneOf value, got %s' % type(v))
# Replace OneOf with the chosen config.
self[k] = self[k].strict_update(v)
else:
if not isinstance(v, type(self[k])):
raise TypeError('Expecting type %s for key %s, got type %s'
% (type(self[k]), k, type(v)))
self[k] = v
else:
raise KeyError(
'Key %s does not exist. New key creation not allowed in '
'strict_update.' % k)
self.make_default(key_set)
@staticmethod
def from_str(config_str):
"""Inverse of Config.__str__."""
parsed = ast.literal_eval(config_str)
assert isinstance(parsed, dict)
def _make_config(dictionary):
for k, v in dictionary.items():
if isinstance(v, dict):
dictionary[k] = _make_config(v)
return Config(**dictionary)
return _make_config(parsed)
@staticmethod
def parse(key_val_string):
"""Parse hyperparameter string into Config object.
Format is 'key=val,key=val,...'
Values can be any python literal, or another Config object encoded as
'c(key=val,key=val,...)'.
c(...) expressions can be arbitrarily nested.
Example:
'a=1,b=3e-5,c=[1,2,3],d="hello world",e={"a":1,"b":2},f=c(x=1,y=[10,20])'
Args:
key_val_string: The hyperparameter string.
Returns:
Config object parsed from the input string.
"""
if not key_val_string.strip():
return Config()
def _pair_to_kv(pair):
split_index = pair.find('=')
key, val = pair[:split_index].strip(), pair[split_index+1:].strip()
if val.startswith('c(') and val.endswith(')'):
val = Config.parse(val[2:-1])
else:
val = ast.literal_eval(val)
return key, val
return Config(**dict([_pair_to_kv(pair)
for pair in _comma_iterator(key_val_string)]))
class OneOf(object):
"""Stores branching config.
In some cases there may be options which each have their own set of config
params. For example, if specifying config for an environment, each environment
can have custom config options. OneOf is a way to organize branching config.
Usage example:
one_of = OneOf(
[Config(a=1, b=2),
Config(a=2, c='hello'),
Config(a=3, d=10, e=-10)],
a=1)
config = one_of.strict_update(Config(a=3, d=20))
config == {'a': 3, 'd': 20, 'e': -10}
"""
def __init__(self, choices, **kwargs):
"""Constructor.
Usage: OneOf([Config(...), Config(...), ...], attribute=default_value)
Args:
choices: An iterable of Config objects. When update/strict_update is
called on this OneOf, one of these Config will be selected.
**kwargs: Give exactly one config attribute to branch on. The value of
this attribute during update/strict_update will determine which
Config is used.
Raises:
ValueError: If kwargs does not contain exactly one entry. Should give one
named argument which is used as the attribute to condition on.
"""
if len(kwargs) != 1:
raise ValueError(
'Incorrect usage. Must give exactly one named argument. The argument '
'name is the config attribute to condition on, and the argument '
'value is the default choice. Got %d named arguments.' % len(kwargs))
    key, default_value = list(kwargs.items())[0]
self.key = key
self.default_value = default_value
# Make sure each choice is a Config object.
for config in choices:
if not isinstance(config, Config):
raise TypeError('choices must be a list of Config objects. Got %s.'
% type(config))
# Map value for key to the config with that value.
self.value_map = {config[key]: config for config in choices}
self.default_config = self.value_map[self.default_value]
# Make sure there are no duplicate values.
if len(self.value_map) != len(choices):
raise ValueError('Multiple choices given for the same value of %s.' % key)
# Check that the default value is valid.
if self.default_value not in self.value_map:
raise ValueError(
'Default value is not an available choice. Got %s=%s. Choices are %s.'
% (key, self.default_value, self.value_map.keys()))
def default(self):
return self.default_config
def update(self, other):
"""Choose a config and update it.
If `other` is a Config, one of the config choices is selected and updated.
Otherwise `other` is returned.
Args:
other: Will update chosen config with this value by calling `update` on
the config.
Returns:
The chosen config after updating it, or `other` if no config could be
selected.
"""
if not isinstance(other, Config):
return other
if self.key not in other or other[self.key] not in self.value_map:
return other
target = self.value_map[other[self.key]]
target.update(other)
return target
def strict_update(self, config):
"""Choose a config and update it.
`config` must be a Config object. `config` must have the key used to select
among the config choices, and that key must have a value which one of the
config choices has.
Args:
      config: A Config object. The chosen config will be updated by calling
`strict_update`.
Returns:
The chosen config after updating it.
Raises:
TypeError: If `config` is not a Config instance.
ValueError: If `config` does not have the branching key in its key set.
ValueError: If the value of the config's branching key is not one of the
valid choices.
"""
if not isinstance(config, Config):
raise TypeError('Expecting Config instance, got %s.' % type(config))
if self.key not in config:
raise ValueError(
'Branching key %s required but not found in %s' % (self.key, config))
if config[self.key] not in self.value_map:
raise ValueError(
'Value %s for key %s is not a possible choice. Choices are %s.'
% (config[self.key], self.key, self.value_map.keys()))
target = self.value_map[config[self.key]]
target.strict_update(config)
return target
def _next_comma(string, start_index):
"""Finds the position of the next comma not used in a literal collection."""
paren_count = 0
  for i in range(start_index, len(string)):
c = string[i]
if c == '(' or c == '[' or c == '{':
paren_count += 1
elif c == ')' or c == ']' or c == '}':
paren_count -= 1
if paren_count == 0 and c == ',':
return i
return -1
def _comma_iterator(string):
index = 0
while 1:
next_index = _next_comma(string, index)
if next_index == -1:
yield string[index:]
return
yield string[index:next_index]
index = next_index + 1
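
A short sketch of how `Config`, `Config.parse`, and `OneOf` fit together (values here are illustrative; the behavior mirrors the tests that follow). The `OneOf` branch is selected by the `task` key supplied in the override string.

```python
from common import config_lib  # brain coder

config = config_lib.Config(
    model=config_lib.Config(stuff=1),
    data=config_lib.OneOf(
        [config_lib.Config(task=1, a='hello'),
         config_lib.Config(task=2, a='world', b='stuff')],
        task=1))

# Command-line style override; 'task=2' picks the second OneOf choice.
config.update(config_lib.Config.parse('model=c(stuff=2),data=c(task=2,a="hi")'))
assert config.data == {'task': 2, 'a': 'hi', 'b': 'stuff'}
assert config.model.stuff == 2
```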
"""Tests for common.config_lib."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from common import config_lib  # brain coder

class ConfigLibTest(tf.test.TestCase):
def testConfig(self):
config = config_lib.Config(hello='world', foo='bar', num=123, f=56.7)
self.assertEqual('world', config.hello)
self.assertEqual('bar', config['foo'])
config.hello = 'everyone'
config['bar'] = 9000
self.assertEqual('everyone', config['hello'])
self.assertEqual(9000, config.bar)
self.assertEqual(5, len(config))
def testConfigUpdate(self):
config = config_lib.Config(a=1, b=2, c=3)
config.update({'b': 10, 'd': 4})
self.assertEqual({'a': 1, 'b': 10, 'c': 3, 'd': 4}, config)
config = config_lib.Config(a=1, b=2, c=3)
config.update(b=10, d=4)
self.assertEqual({'a': 1, 'b': 10, 'c': 3, 'd': 4}, config)
config = config_lib.Config(a=1, b=2, c=3)
config.update({'e': 5}, b=10, d=4)
self.assertEqual({'a': 1, 'b': 10, 'c': 3, 'd': 4, 'e': 5}, config)
config = config_lib.Config(
a=1,
b=2,
x=config_lib.Config(
l='a',
y=config_lib.Config(m=1, n=2),
z=config_lib.Config(
q=config_lib.Config(a=10, b=20),
r=config_lib.Config(s=1, t=2))))
config.update(x={'y': {'m': 10}, 'z': {'r': {'s': 5}}})
self.assertEqual(
config_lib.Config(
a=1, b=2,
x=config_lib.Config(
l='a',
y=config_lib.Config(m=10, n=2),
z=config_lib.Config(
q=config_lib.Config(a=10, b=20),
r=config_lib.Config(s=5, t=2)))),
config)
config = config_lib.Config(
foo='bar',
num=100,
x=config_lib.Config(a=1, b=2, c=config_lib.Config(h=10, i=20, j=30)),
y=config_lib.Config(qrs=5, tuv=10),
d={'a': 1, 'b': 2},
l=[1, 2, 3])
config.update(
config_lib.Config(
foo='hat',
num=50.5,
x={'a': 5, 'z': -10},
y=config_lib.Config(wxyz=-1)),
d={'a': 10, 'c': 20},
l=[3, 4, 5, 6])
self.assertEqual(
config_lib.Config(
foo='hat',
num=50.5,
x=config_lib.Config(a=5, b=2, z=-10,
c=config_lib.Config(h=10, i=20, j=30)),
y=config_lib.Config(qrs=5, tuv=10, wxyz=-1),
d={'a': 10, 'c': 20},
l=[3, 4, 5, 6]),
config)
self.assertTrue(isinstance(config.x, config_lib.Config))
self.assertTrue(isinstance(config.x.c, config_lib.Config))
self.assertTrue(isinstance(config.y, config_lib.Config))
config = config_lib.Config(
foo='bar',
num=100,
x=config_lib.Config(a=1, b=2, c=config_lib.Config(h=10, i=20, j=30)),
y=config_lib.Config(qrs=5, tuv=10),
d={'a': 1, 'b': 2},
l=[1, 2, 3])
config.update(
config_lib.Config(
foo=1234,
num='hello',
x={'a': 5, 'z': -10, 'c': {'h': -5, 'k': 40}},
y=[1, 2, 3, 4],
d='stuff',
l={'a': 1, 'b': 2}))
self.assertEqual(
config_lib.Config(
foo=1234,
num='hello',
x=config_lib.Config(a=5, b=2, z=-10,
c=config_lib.Config(h=-5, i=20, j=30, k=40)),
y=[1, 2, 3, 4],
d='stuff',
l={'a': 1, 'b': 2}),
config)
self.assertTrue(isinstance(config.x, config_lib.Config))
self.assertTrue(isinstance(config.x.c, config_lib.Config))
self.assertTrue(isinstance(config.y, list))
def testConfigStrictUpdate(self):
config = config_lib.Config(a=1, b=2, c=3)
config.strict_update({'b': 10, 'c': 20})
self.assertEqual({'a': 1, 'b': 10, 'c': 20}, config)
config = config_lib.Config(a=1, b=2, c=3)
config.strict_update(b=10, c=20)
self.assertEqual({'a': 1, 'b': 10, 'c': 20}, config)
config = config_lib.Config(a=1, b=2, c=3, d=4)
config.strict_update({'d': 100}, b=10, a=20)
self.assertEqual({'a': 20, 'b': 10, 'c': 3, 'd': 100}, config)
config = config_lib.Config(
a=1,
b=2,
x=config_lib.Config(
l='a',
y=config_lib.Config(m=1, n=2),
z=config_lib.Config(
q=config_lib.Config(a=10, b=20),
r=config_lib.Config(s=1, t=2))))
config.strict_update(x={'y': {'m': 10}, 'z': {'r': {'s': 5}}})
self.assertEqual(
config_lib.Config(
a=1, b=2,
x=config_lib.Config(
l='a',
y=config_lib.Config(m=10, n=2),
z=config_lib.Config(
q=config_lib.Config(a=10, b=20),
r=config_lib.Config(s=5, t=2)))),
config)
config = config_lib.Config(
foo='bar',
num=100,
x=config_lib.Config(a=1, b=2, c=config_lib.Config(h=10, i=20, j=30)),
y=config_lib.Config(qrs=5, tuv=10),
d={'a': 1, 'b': 2},
l=[1, 2, 3])
config.strict_update(
config_lib.Config(
foo='hat',
num=50,
x={'a': 5, 'c': {'h': 100}},
y=config_lib.Config(tuv=-1)),
d={'a': 10, 'c': 20},
l=[3, 4, 5, 6])
self.assertEqual(
config_lib.Config(
foo='hat',
num=50,
x=config_lib.Config(a=5, b=2,
c=config_lib.Config(h=100, i=20, j=30)),
y=config_lib.Config(qrs=5, tuv=-1),
d={'a': 10, 'c': 20},
l=[3, 4, 5, 6]),
config)
def testConfigStrictUpdateFail(self):
config = config_lib.Config(a=1, b=2, c=3, x=config_lib.Config(a=1, b=2))
with self.assertRaises(KeyError):
config.strict_update({'b': 10, 'c': 20, 'd': 50})
with self.assertRaises(KeyError):
config.strict_update(b=10, d=50)
with self.assertRaises(KeyError):
config.strict_update(x={'c': 3})
with self.assertRaises(TypeError):
config.strict_update(a='string')
with self.assertRaises(TypeError):
config.strict_update(x={'a': 'string'})
with self.assertRaises(TypeError):
config.strict_update(x=[1, 2, 3])
def testConfigFromStr(self):
config = config_lib.Config.from_str("{'c': {'d': 5}, 'b': 2, 'a': 1}")
self.assertEqual(
{'c': {'d': 5}, 'b': 2, 'a': 1}, config)
self.assertTrue(isinstance(config, config_lib.Config))
self.assertTrue(isinstance(config.c, config_lib.Config))
def testConfigParse(self):
config = config_lib.Config.parse(
'hello="world",num=1234.5,lst=[10,20.5,True,"hi",("a","b","c")],'
'dct={9:10,"stuff":"qwerty","subdict":{1:True,2:False}},'
'subconfig=c(a=1,b=[1,2,[3,4]],c=c(f="f",g="g"))')
self.assertEqual(
{'hello': 'world', 'num': 1234.5,
'lst': [10, 20.5, True, 'hi', ('a', 'b', 'c')],
'dct': {9: 10, 'stuff': 'qwerty', 'subdict': {1: True, 2: False}},
'subconfig': {'a': 1, 'b': [1, 2, [3, 4]], 'c': {'f': 'f', 'g': 'g'}}},
config)
self.assertTrue(isinstance(config, config_lib.Config))
self.assertTrue(isinstance(config.subconfig, config_lib.Config))
self.assertTrue(isinstance(config.subconfig.c, config_lib.Config))
self.assertFalse(isinstance(config.dct, config_lib.Config))
self.assertFalse(isinstance(config.dct['subdict'], config_lib.Config))
self.assertTrue(isinstance(config.lst[4], tuple))
def testConfigParseErrors(self):
with self.assertRaises(SyntaxError):
config_lib.Config.parse('a=[1,2,b="hello"')
with self.assertRaises(SyntaxError):
config_lib.Config.parse('a=1,b=c(x="a",y="b"')
with self.assertRaises(SyntaxError):
config_lib.Config.parse('a=1,b=c(x="a")y="b"')
with self.assertRaises(SyntaxError):
config_lib.Config.parse('a=1,b=c(x="a"),y="b",')
def testOneOf(self):
def make_config():
return config_lib.Config(
data=config_lib.OneOf(
[config_lib.Config(task=1, a='hello'),
config_lib.Config(task=2, a='world', b='stuff'),
config_lib.Config(task=3, c=1234)],
task=2),
model=config_lib.Config(stuff=1))
config = make_config()
config.update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=1,a="hi")'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=1, a='hi'),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=2,a="hi")'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=2, a='hi', b='stuff'),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=3)'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=3, c=1234),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.update(config_lib.Config.parse(
'model=c(stuff=2)'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=2, a='world', b='stuff'),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=4,d=9999)'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=4, d=9999),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.update(config_lib.Config.parse(
'model=c(stuff=2),data=5'))
self.assertEqual(
config_lib.Config(
data=5,
model=config_lib.Config(stuff=2)),
config)
def testOneOfStrict(self):
def make_config():
return config_lib.Config(
data=config_lib.OneOf(
[config_lib.Config(task=1, a='hello'),
config_lib.Config(task=2, a='world', b='stuff'),
config_lib.Config(task=3, c=1234)],
task=2),
model=config_lib.Config(stuff=1))
config = make_config()
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=1,a="hi")'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=1, a='hi'),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=2,a="hi")'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=2, a='hi', b='stuff'),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=3)'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=3, c=1234),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2)'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(task=2, a='world', b='stuff'),
model=config_lib.Config(stuff=2)),
config)
def testNestedOneOf(self):
def make_config():
return config_lib.Config(
data=config_lib.OneOf(
[config_lib.Config(task=1, a='hello'),
config_lib.Config(
task=2,
a=config_lib.OneOf(
[config_lib.Config(x=1, y=2),
config_lib.Config(x=-1, y=1000, z=4)],
x=1)),
config_lib.Config(task=3, c=1234)],
task=2),
model=config_lib.Config(stuff=1))
config = make_config()
config.update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=2,a=c(x=-1,z=8))'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(
task=2,
a=config_lib.Config(x=-1, y=1000, z=8)),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=2,a=c(x=-1,z=8))'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(
task=2,
a=config_lib.Config(x=-1, y=1000, z=8)),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.update(config_lib.Config.parse('model=c(stuff=2)'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(
task=2,
a=config_lib.Config(x=1, y=2)),
model=config_lib.Config(stuff=2)),
config)
config = make_config()
config.strict_update(config_lib.Config.parse('model=c(stuff=2)'))
self.assertEqual(
config_lib.Config(
data=config_lib.Config(
task=2,
a=config_lib.Config(x=1, y=2)),
model=config_lib.Config(stuff=2)),
config)
def testOneOfStrictErrors(self):
def make_config():
return config_lib.Config(
data=config_lib.OneOf(
[config_lib.Config(task=1, a='hello'),
config_lib.Config(task=2, a='world', b='stuff'),
config_lib.Config(task=3, c=1234)],
task=2),
model=config_lib.Config(stuff=1))
config = make_config()
with self.assertRaises(TypeError):
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=[1,2,3]'))
config = make_config()
with self.assertRaises(KeyError):
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=3,c=5678,d=9999)'))
config = make_config()
with self.assertRaises(ValueError):
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=c(task=4,d=9999)'))
config = make_config()
with self.assertRaises(TypeError):
config.strict_update(config_lib.Config.parse(
'model=c(stuff=2),data=5'))
if __name__ == '__main__':
tf.test.main()
"""Reward functions, distance functions, and reward managers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from abc import ABCMeta
from abc import abstractmethod
from math import log
# All sequences here are assumed to be lists of ints bounded
# between 0 and `base`-1 (inclusive).
#################################
### Scalar Distance Functions ###
#################################
def abs_diff(a, b, base=0):
"""Absolute value of difference between scalars.
abs_diff is symmetric, i.e. `a` and `b` are interchangeable.
Args:
a: First argument. An int.
    b: Second argument. An int.
base: Dummy argument so that the argument signature matches other scalar
diff functions. abs_diff is the same in all bases.
Returns:
abs(a - b).
"""
del base # Unused.
return abs(a - b)
def mod_abs_diff(a, b, base):
"""Shortest distance between `a` and `b` in the modular integers base `base`.
The smallest distance between a and b is returned.
Example: mod_abs_diff(1, 99, 100) ==> 2. It is not 98.
mod_abs_diff is symmetric, i.e. `a` and `b` are interchangeable.
Args:
a: First argument. An int.
    b: Second argument. An int.
base: The modulo base. A positive int.
Returns:
Shortest distance.
"""
diff = abs(a - b)
if diff >= base:
diff %= base
return min(diff, (-diff) + base)
###############################
### List Distance Functions ###
###############################
def absolute_distance(pred, target, base, scalar_diff_fn=abs_diff):
"""Asymmetric list distance function.
List distance is the sum of element-wise distances, like Hamming distance, but
where `pred` can be longer or shorter than `target`. For each position in both
`pred` and `target`, distance between those elements is computed with
`scalar_diff_fn`. For missing or extra elements in `pred`, the maximum
distance is assigned, which is equal to `base`.
Distance is 0 when `pred` and `target` are identical, and will be a positive
integer when they are not.
Args:
pred: Prediction list. Distance from this list is computed.
target: Target list. Distance to this list is computed.
base: The integer base to use. For example, a list of chars would use base
256.
scalar_diff_fn: Element-wise distance function.
Returns:
List distance between `pred` and `target`.
"""
d = 0
for i, target_t in enumerate(target):
if i >= len(pred):
d += base # A missing slot is worth the max distance.
else:
# Add element-wise distance for this slot.
d += scalar_diff_fn(pred[i], target_t, base)
if len(pred) > len(target):
# Each extra slot is worth the max distance.
d += (len(pred) - len(target)) * base
return d
def log_absolute_distance(pred, target, base):
"""Asymmetric list distance function that uses log distance.
A list distance which computes sum of element-wise distances, similar to
`absolute_distance`. Unlike `absolute_distance`, this scales the resulting
distance to be a float.
  Element-wise distances are log-scale. Distance between two lists changes
  relatively less for elements that are far apart, but changes a lot (goes to 0
  faster) when values get close together.
Args:
pred: List of ints. Computes distance from this list to the target.
target: List of ints. This is the "correct" list which the prediction list
is trying to match.
base: Integer base.
Returns:
Float distance normalized so that when `pred` is at most as long as `target`
the distance is between 0.0 and 1.0. Distance grows unboundedly large
as `pred` grows past `target` in length.
"""
if not target:
length_normalizer = 1.0
if not pred:
# Distance between [] and [] is 0.0 since they are equal.
return 0.0
else:
length_normalizer = float(len(target))
# max_dist is the maximum element-wise distance, before taking log and
# scaling. Since we use `mod_abs_diff`, it would be (base // 2), but we add
# 1 to it so that missing or extra positions get the maximum penalty.
max_dist = base // 2 + 1
# The log-distance will be scaled by a factor.
# Note: +1 is added to the numerator and denominator to avoid log(0). This
# only has a translational effect, i.e. log(dist + 1) / log(max_dist + 1).
factor = log(max_dist + 1)
d = 0.0 # Total distance to be computed.
for i, target_t in enumerate(target):
if i >= len(pred):
# Assign the max element-wise distance for missing positions. This is 1.0
# after scaling.
d += 1.0
else:
# Add the log-dist divided by a scaling factor.
d += log(mod_abs_diff(pred[i], target_t, base) + 1) / factor
if len(pred) > len(target):
# Add the max element-wise distance for each extra position.
# Since max dist after scaling is 1, this is just the difference in list
# lengths.
d += (len(pred) - len(target))
return d / length_normalizer # Normalize again by the target length.
########################
### Reward Functions ###
########################
# Reward functions assign reward based on program output.
# Warning: only use these functions as the terminal rewards in episodes, i.e.
# for the "final" programs.
def absolute_distance_reward(pred, target, base, scalar_diff_fn=abs_diff):
"""Reward function based on absolute_distance function.
Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
so that 0.0 reward is given when `pred` is the empty list (assuming `target`
is not empty). Reward can go negative when `pred` is longer than `target`.
This is an asymmetric reward function, so which list is the prediction and
which is the target matters.
Args:
pred: Prediction sequence. This should be the sequence outputted by the
generated code. List of ints n, where 0 <= n < base.
target: Target sequence. The correct sequence that the generated code needs
to output. List of ints n, where 0 <= n < base.
base: Base of the computation.
scalar_diff_fn: Element-wise distance function.
Returns:
Reward computed based on `pred` and `target`. A float.
"""
unit_dist = float(base * len(target))
if unit_dist == 0:
unit_dist = base
dist = absolute_distance(pred, target, base, scalar_diff_fn=scalar_diff_fn)
return (unit_dist - dist) / unit_dist
def absolute_mod_distance_reward(pred, target, base):
"""Same as `absolute_distance_reward` but `mod_abs_diff` scalar diff is used.
Args:
pred: Prediction sequence. This should be the sequence outputted by the
generated code. List of ints n, where 0 <= n < base.
target: Target sequence. The correct sequence that the generated code needs
to output. List of ints n, where 0 <= n < base.
base: Base of the computation.
Returns:
Reward computed based on `pred` and `target`. A float.
"""
return absolute_distance_reward(pred, target, base, mod_abs_diff)
def absolute_log_distance_reward(pred, target, base):
"""Compute reward using `log_absolute_distance`.
Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
so that 0.0 reward is given when `pred` is the empty list (assuming `target`
is not empty). Reward can go negative when `pred` is longer than `target`.
This is an asymmetric reward function, so which list is the prediction and
which is the target matters.
This reward function has the nice property that much more reward is given
for getting the correct value (at each position) than for there being any
value at all. For example, in base 100, lets say pred = [1] * 1000
and target = [10] * 1000. A lot of reward would be given for being 80%
accurate (worst element-wise distance is 50, distances here are 9) using
`absolute_distance`. `log_absolute_distance` on the other hand will give
greater and greater reward increments the closer each predicted value gets to
  the target. That makes the reward given for accuracy somewhat independent of
the base.
Args:
pred: Prediction sequence. This should be the sequence outputted by the
generated code. List of ints n, where 0 <= n < base.
target: Target sequence. The correct sequence that the generated code needs
to output. List of ints n, where 0 <= n < base.
base: Base of the computation.
Returns:
Reward computed based on `pred` and `target`. A float.
"""
return 1.0 - log_absolute_distance(pred, target, base)
#######################
### Reward Managers ###
#######################
# Reward managers assign reward to many code attempts throughout an episode.
class RewardManager(object):
"""Reward managers administer reward across an episode.
Reward managers are used for "editor" environments. These are environments
where the agent has some way to edit its code over time, and run its code
  many times in the same episode, so that it can make incremental improvements.
Reward managers are instantiated with a target sequence, which is the known
correct program output. The manager is called on the output from a proposed
code, and returns reward. If many proposal outputs are tried, reward may be
some stateful function that takes previous tries into account. This is done,
in part, so that an agent cannot accumulate unbounded reward just by trying
junk programs as often as possible. So reward managers should not give the
same reward twice if the next proposal is not better than the last.
"""
__metaclass__ = ABCMeta
def __init__(self, target, base, distance_fn=absolute_distance):
self._target = list(target)
self._base = base
self._distance_fn = distance_fn
@abstractmethod
def __call__(self, sequence):
"""Call this reward manager like a function to get reward.
Calls to reward manager are stateful, and will take previous sequences
into account. Repeated calls with the same sequence may produce different
rewards.
Args:
sequence: List of integers (each between 0 and base - 1). This is the
proposal sequence. Reward will be computed based on the distance
from this sequence to the target (distance function and target are
given in the constructor), as well as previous sequences tried during
the lifetime of this object.
Returns:
Float value. The reward received from this call.
"""
return 0.0
class DeltaRewardManager(RewardManager):
"""Simple reward manager that assigns reward for the net change in distance.
Given some (possibly asymmetric) list distance function, gives reward for
relative changes in prediction distance to the target.
For example, if on the first call the distance is 3.0, the change in distance
is -3 (from starting distance of 0). That relative change will be scaled to
produce a negative reward for this step. On the next call, the distance is 2.0
which is a +1 change, and that will be scaled to give a positive reward.
If the final call has distance 0 (the target is achieved), that is another
positive change of +2. The total reward across all 3 calls is then 0, which is
  the highest possible episode total.
Reward is scaled so that the maximum element-wise distance is worth 1.0.
Maximum total episode reward attainable is 0.
"""
def __init__(self, target, base, distance_fn=absolute_distance):
super(DeltaRewardManager, self).__init__(target, base, distance_fn)
self._last_diff = 0
def _diff(self, seq):
return self._distance_fn(seq, self._target, self._base)
def _delta_reward(self, seq):
# Reward is relative to previous sequence diff.
# Reward is scaled so that maximum token difference is worth 1.0.
# Reward = (last_diff - this_diff) / self.base.
# Reward is positive if this sequence is closer to the target than the
# previous sequence, and negative if this sequence is further away.
diff = self._diff(seq)
reward = (self._last_diff - diff) / float(self._base)
self._last_diff = diff
return reward
def __call__(self, seq):
return self._delta_reward(seq)
class FloorRewardManager(RewardManager):
"""Assigns positive reward for each step taken closer to the target.
Given some (possibly asymmetric) list distance function, gives reward for
whenever a new episode minimum distance is reached. No reward is given if
the distance regresses to a higher value, so that the sum of rewards
for the episode is positive.
Reward is scaled so that the maximum element-wise distance is worth 1.0.
Maximum total episode reward attainable is len(target).
If the prediction sequence is longer than the target, a reward of -1 is given.
  Subsequent predictions which are also longer get 0 reward. The -1 penalty
will be canceled out with a +1 reward when a prediction is given which is at
most the length of the target.
"""
def __init__(self, target, base, distance_fn=absolute_distance):
super(FloorRewardManager, self).__init__(target, base, distance_fn)
self._last_diff = 0
self._min_diff = self._max_diff()
self._too_long_penality_given = False
def _max_diff(self):
return self._distance_fn([], self._target, self._base)
def _diff(self, seq):
return self._distance_fn(seq, self._target, self._base)
def _delta_reward(self, seq):
# Reward is only given if this sequence is closer to the target than any
# previous sequence.
# Reward is scaled so that maximum token difference is worth 1.0
# Reward = (min_diff - this_diff) / self.base
# Reward is always positive.
diff = self._diff(seq)
if diff < self._min_diff:
reward = (self._min_diff - diff) / float(self._base)
self._min_diff = diff
else:
reward = 0.0
return reward
def __call__(self, seq):
if len(seq) > len(self._target): # Output is too long.
if not self._too_long_penality_given:
self._too_long_penality_given = True
reward = -1.0
else:
reward = 0.0 # Don't give this penalty more than once.
return reward
reward = self._delta_reward(seq)
if self._too_long_penality_given:
reward += 1.0 # Return the subtracted reward.
self._too_long_penality_given = False
return reward
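
A small sketch contrasting the stateless reward functions with the stateful reward managers above (the numbers mirror the tests that follow):

```python
from common import reward  # brain coder

# Stateless reward for a single program output: 1.0 means an exact match.
print(reward.absolute_distance_reward([1, 2, 3], [1, 2, 3], base=5))  # 1.0

# Stateful manager: reward is only given when a new best distance is reached.
manager = reward.FloorRewardManager([1, 2, 3, 4], base=5)
print(manager([1]))           # 1.0  (first improvement over the empty output)
print(manager([1]))           # 0.0  (no new best distance, no reward)
print(manager([1, 2, 3, 4]))  # 3.0  (remaining distance closed, scaled by base)
```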
"""Tests for common.reward."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from math import log

import numpy as np
import tensorflow as tf

from common import reward  # brain coder

class RewardTest(tf.test.TestCase):
def testAbsDiff(self):
self.assertEqual(5, reward.abs_diff(15, 20))
self.assertEqual(5, reward.abs_diff(20, 15))
def testModAbsDiff(self):
self.assertEqual(5, reward.mod_abs_diff(15, 20, 25))
self.assertEqual(5, reward.mod_abs_diff(20, 15, 25))
self.assertEqual(2, reward.mod_abs_diff(1, 24, 25))
self.assertEqual(2, reward.mod_abs_diff(24, 1, 25))
self.assertEqual(0, reward.mod_abs_diff(0, 0, 5))
self.assertEqual(1, reward.mod_abs_diff(0, 1, 5))
self.assertEqual(2, reward.mod_abs_diff(0, 2, 5))
self.assertEqual(2, reward.mod_abs_diff(0, 3, 5))
self.assertEqual(1, reward.mod_abs_diff(0, 4, 5))
self.assertEqual(0, reward.mod_abs_diff(-1, 4, 5))
self.assertEqual(1, reward.mod_abs_diff(-5, 4, 5))
self.assertEqual(1, reward.mod_abs_diff(-7, 4, 5))
self.assertEqual(1, reward.mod_abs_diff(13, 4, 5))
self.assertEqual(1, reward.mod_abs_diff(15, 4, 5))
def testAbsoluteDistance_AbsDiffMethod(self):
self.assertEqual(
4,
reward.absolute_distance([0], [4], 5, scalar_diff_fn=reward.abs_diff))
self.assertEqual(
0,
reward.absolute_distance([4], [4], 5, scalar_diff_fn=reward.abs_diff))
self.assertEqual(
0,
reward.absolute_distance([], [], 5, scalar_diff_fn=reward.abs_diff))
self.assertEqual(
5,
reward.absolute_distance([1], [], 5, scalar_diff_fn=reward.abs_diff))
self.assertEqual(
5,
reward.absolute_distance([], [1], 5, scalar_diff_fn=reward.abs_diff))
self.assertEqual(
0,
reward.absolute_distance([1, 2, 3], [1, 2, 3], 5,
scalar_diff_fn=reward.abs_diff))
self.assertEqual(
1,
reward.absolute_distance([1, 2, 4], [1, 2, 3], 5,
scalar_diff_fn=reward.abs_diff))
self.assertEqual(
1,
reward.absolute_distance([1, 2, 2], [1, 2, 3], 5,
scalar_diff_fn=reward.abs_diff))
self.assertEqual(
5,
reward.absolute_distance([1, 2], [1, 2, 3], 5,
scalar_diff_fn=reward.abs_diff))
self.assertEqual(
5,
reward.absolute_distance([1, 2, 3, 4], [1, 2, 3], 5,
scalar_diff_fn=reward.abs_diff))
self.assertEqual(
6,
reward.absolute_distance([4, 4, 4], [1, 2, 3], 5,
scalar_diff_fn=reward.abs_diff))
def testAbsoluteDistance_ModDiffMethod(self):
self.assertEqual(
1,
reward.absolute_distance([0], [4], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
0,
reward.absolute_distance([4], [4], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
0,
reward.absolute_distance([], [], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
5,
reward.absolute_distance([1], [], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
5,
reward.absolute_distance([], [1], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
0,
reward.absolute_distance([1, 2, 3], [1, 2, 3], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
1,
reward.absolute_distance([1, 2, 4], [1, 2, 3], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
1,
reward.absolute_distance([1, 2, 2], [1, 2, 3], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
5,
reward.absolute_distance([1, 2], [1, 2, 3], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
5,
reward.absolute_distance([1, 2, 3, 4], [1, 2, 3], 5,
scalar_diff_fn=reward.mod_abs_diff))
self.assertEqual(
5,
reward.absolute_distance([4, 4, 4], [1, 2, 3], 5,
scalar_diff_fn=reward.mod_abs_diff))
def testLogAbsoluteDistance(self):
def log_diff(diff, base):
return log(diff + 1) / log(base // 2 + 2)
self.assertEqual(
log_diff(1, 5),
reward.log_absolute_distance([0], [4], 5))
self.assertEqual(
log_diff(2, 5),
reward.log_absolute_distance([1], [4], 5))
self.assertEqual(
log_diff(2, 5),
reward.log_absolute_distance([2], [4], 5))
self.assertEqual(
log_diff(1, 5),
reward.log_absolute_distance([3], [4], 5))
self.assertEqual(
log_diff(3, 5), # max_dist = base // 2 + 1 = 3
reward.log_absolute_distance([], [4], 5))
self.assertEqual(
0 + log_diff(3, 5), # max_dist = base // 2 + 1 = 3
reward.log_absolute_distance([4, 4], [4], 5))
self.assertEqual(
0,
reward.log_absolute_distance([4], [4], 5))
self.assertEqual(
0,
reward.log_absolute_distance([], [], 5))
self.assertEqual(
1,
reward.log_absolute_distance([1], [], 5))
self.assertEqual(
1,
reward.log_absolute_distance([], [1], 5))
self.assertEqual(
0,
reward.log_absolute_distance([1, 2, 3], [1, 2, 3], 5))
self.assertEqual(
log_diff(1, 5) / 3, # divided by target length.
reward.log_absolute_distance([1, 2, 4], [1, 2, 3], 5))
self.assertEqual(
log_diff(1, 5) / 3,
reward.log_absolute_distance([1, 2, 2], [1, 2, 3], 5))
self.assertEqual(
log_diff(3, 5) / 3, # max_dist
reward.log_absolute_distance([1, 2], [1, 2, 3], 5))
self.assertEqual(
log_diff(3, 5) / 3, # max_dist
reward.log_absolute_distance([1, 2, 3, 4], [1, 2, 3], 5))
# Add log differences for each position.
self.assertEqual(
(log_diff(2, 5) + log_diff(2, 5) + log_diff(1, 5)) / 3,
reward.log_absolute_distance([4, 4, 4], [1, 2, 3], 5))
def testAbsoluteDistanceReward(self):
self.assertEqual(
1,
reward.absolute_distance_reward([1, 2, 3], [1, 2, 3], 5))
self.assertEqual(
1 - 1 / (5 * 3.), # 1 - distance / (base * target_len)
reward.absolute_distance_reward([1, 2, 4], [1, 2, 3], 5))
self.assertEqual(
1 - 1 / (5 * 3.),
reward.absolute_distance_reward([1, 2, 2], [1, 2, 3], 5))
self.assertTrue(np.isclose(
1 - 5 / (5 * 3.),
reward.absolute_distance_reward([1, 2], [1, 2, 3], 5)))
self.assertTrue(np.isclose(
1 - 5 / (5 * 3.),
reward.absolute_distance_reward([1, 2, 3, 4], [1, 2, 3], 5)))
# Add log differences for each position.
self.assertEqual(
1 - (3 + 2 + 1) / (5 * 3.),
reward.absolute_distance_reward([4, 4, 4], [1, 2, 3], 5))
self.assertEqual(
1,
reward.absolute_distance_reward([], [], 5))
def testAbsoluteModDistanceReward(self):
self.assertEqual(
1,
reward.absolute_mod_distance_reward([1, 2, 3], [1, 2, 3], 5))
self.assertEqual(
1 - 1 / (5 * 3.), # 1 - distance / (base * target_len)
reward.absolute_mod_distance_reward([1, 2, 4], [1, 2, 3], 5))
self.assertEqual(
1 - 1 / (5 * 3.),
reward.absolute_mod_distance_reward([1, 2, 2], [1, 2, 3], 5))
self.assertTrue(np.isclose(
1 - 5 / (5 * 3.),
reward.absolute_mod_distance_reward([1, 2], [1, 2, 3], 5)))
self.assertTrue(np.isclose(
1 - 5 / (5 * 3.),
reward.absolute_mod_distance_reward([1, 2, 3, 4], [1, 2, 3], 5)))
# Add log differences for each position.
self.assertTrue(np.isclose(
1 - (2 + 2 + 1) / (5 * 3.),
reward.absolute_mod_distance_reward([4, 4, 4], [1, 2, 3], 5)))
self.assertTrue(np.isclose(
1 - (1 + 2 + 2) / (5 * 3.),
reward.absolute_mod_distance_reward([0, 1, 2], [4, 4, 4], 5)))
self.assertEqual(
1,
reward.absolute_mod_distance_reward([], [], 5))
def testAbsoluteLogDistanceReward(self):
def log_diff(diff, base):
return log(diff + 1) / log(base // 2 + 2)
self.assertEqual(
1,
reward.absolute_log_distance_reward([1, 2, 3], [1, 2, 3], 5))
self.assertEqual(
1 - log_diff(1, 5) / 3, # divided by target length.
reward.absolute_log_distance_reward([1, 2, 4], [1, 2, 3], 5))
self.assertEqual(
1 - log_diff(1, 5) / 3,
reward.absolute_log_distance_reward([1, 2, 2], [1, 2, 3], 5))
self.assertEqual(
1 - log_diff(3, 5) / 3, # max_dist
reward.absolute_log_distance_reward([1, 2], [1, 2, 3], 5))
self.assertEqual(
1 - log_diff(3, 5) / 3, # max_dist
reward.absolute_log_distance_reward([1, 2, 3, 4], [1, 2, 3], 5))
# Add log differences for each position.
self.assertEqual(
1 - (log_diff(2, 5) + log_diff(2, 5) + log_diff(1, 5)) / 3,
reward.absolute_log_distance_reward([4, 4, 4], [1, 2, 3], 5))
self.assertEqual(
1 - (log_diff(1, 5) + log_diff(2, 5) + log_diff(2, 5)) / 3,
reward.absolute_log_distance_reward([0, 1, 2], [4, 4, 4], 5))
self.assertEqual(
1,
reward.absolute_log_distance_reward([], [], 5))
def testDeltaRewardManager(self):
reward_manager = reward.DeltaRewardManager(
[1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
self.assertEqual(-3, reward_manager([1]))
self.assertEqual(0, reward_manager([1]))
self.assertEqual(4 / 5., reward_manager([1, 3]))
self.assertEqual(-4 / 5, reward_manager([1]))
self.assertEqual(3, reward_manager([1, 2, 3, 4]))
self.assertEqual(-1, reward_manager([1, 2, 3]))
self.assertEqual(0, reward_manager([1, 2, 3, 4, 3]))
self.assertEqual(-1, reward_manager([1, 2, 3, 4, 3, 2]))
self.assertEqual(2, reward_manager([1, 2, 3, 4]))
self.assertEqual(0, reward_manager([1, 2, 3, 4]))
self.assertEqual(0, reward_manager([1, 2, 3, 4]))
def testFloorRewardManager(self):
reward_manager = reward.FloorRewardManager(
[1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
self.assertEqual(1, reward_manager([1]))
self.assertEqual(0, reward_manager([1]))
self.assertEqual(4 / 5., reward_manager([1, 3]))
self.assertEqual(0, reward_manager([1]))
self.assertEqual(1 / 5., reward_manager([1, 2]))
self.assertEqual(0, reward_manager([0, 1]))
self.assertEqual(0, reward_manager([]))
self.assertEqual(0, reward_manager([1, 2]))
self.assertEqual(2, reward_manager([1, 2, 3, 4]))
self.assertEqual(0, reward_manager([1, 2, 3]))
self.assertEqual(-1, reward_manager([1, 2, 3, 4, 3]))
self.assertEqual(0, reward_manager([1, 2, 3, 4, 3, 2]))
self.assertEqual(1, reward_manager([1, 2, 3, 4]))
self.assertEqual(0, reward_manager([1, 2, 3, 4]))
self.assertEqual(0, reward_manager([1, 2, 3, 4]))
reward_manager = reward.FloorRewardManager(
[1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
self.assertEqual(1, reward_manager([1]))
self.assertEqual(-1, reward_manager([1, 0, 0, 0, 0, 0]))
self.assertEqual(0, reward_manager([1, 2, 3, 4, 0, 0]))
self.assertEqual(0, reward_manager([1, 2, 3, 4, 0]))
self.assertEqual(1, reward_manager([]))
self.assertEqual(0, reward_manager([]))
self.assertEqual(0, reward_manager([1]))
self.assertEqual(1, reward_manager([1, 2]))
self.assertEqual(-1, reward_manager([1, 2, 3, 4, 0, 0]))
self.assertEqual(0, reward_manager([1, 1, 1, 1, 1]))
self.assertEqual(1 + 2, reward_manager([1, 2, 3, 4]))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Utilities related to computing training batches from episode rollouts.
Implementations here are based on code from OpenAI:
https://github.com/openai/universe-starter-agent/blob/master/a3c.py.
"""
from collections import namedtuple
import numpy as np
import scipy.signal
from common import utils # brain coder
class Rollout(object):
"""Holds a rollout for an episode.
A rollout is a record of the states observed in some environment and actions
taken by the agent to arrive at those states. Other information includes
rewards received after each action, values estimated for each state, whether
the rollout concluded the episode, and the total reward received. Everything
should be given in time order.
At each time t, the agent sees state s_t, takes action a_t, and then receives
reward r_t. The agent may optionally estimate a state value V(s_t) for each
state.
For an episode of length T:
states = [s_0, ..., s_(T-1)]
actions = [a_0, ..., a_(T-1)]
rewards = [r_0, ..., r_(T-1)]
values = [V(s_0), ..., V(s_(T-1))]
Note that there is an extra state s_T observed after taking action a_(T-1),
but this is not included in the rollout.
Rollouts have a `terminated` attribute which is True when the rollout is
"finalized", i.e. it holds a full episode. terminated will be False when
time steps are still being added to it.
"""
def __init__(self):
self.states = []
self.actions = []
self.rewards = []
self.values = []
self.total_reward = 0.0
self.terminated = False
def add(self, state, action, reward, value=0.0, terminated=False):
"""Add the next timestep to this rollout.
Args:
state: The state observed at the start of this timestep.
action: The action taken after observing the given state.
reward: The reward received for taking the given action.
value: The value estimated for the given state.
terminated: Whether this timestep ends the episode.
Raises:
ValueError: If self.terminated is already True, meaning that the episode
has already ended.
"""
if self.terminated:
raise ValueError(
'Trying to add timestep to an already terminal rollout.')
self.states += [state]
self.actions += [action]
self.rewards += [reward]
self.values += [value]
self.terminated = terminated
self.total_reward += reward
def add_many(self, states, actions, rewards, values=None, terminated=False):
"""Add many timesteps to this rollout.
Arguments are the same as `add`, but are lists of equal size.
Args:
states: The states observed.
actions: The actions taken.
rewards: The rewards received.
values: The values estimated for the given states.
terminated: Whether this sequence ends the episode.
Raises:
ValueError: If the lengths of all the input lists are not equal.
ValueError: If self.terminated is already True, meaning that the episode
has already ended.
"""
if len(states) != len(actions):
raise ValueError(
'Number of states and actions must be the same. Got %d states and '
'%d actions' % (len(states), len(actions)))
if len(states) != len(rewards):
raise ValueError(
'Number of states and rewards must be the same. Got %d states and '
'%d rewards' % (len(states), len(rewards)))
if values is not None and len(states) != len(values):
raise ValueError(
'Number of states and values must be the same. Got %d states and '
'%d values' % (len(states), len(values)))
if self.terminated:
raise ValueError(
'Trying to add timesteps to an already terminal rollout.')
self.states += states
self.actions += actions
self.rewards += rewards
self.values += values if values is not None else [0.0] * len(states)
self.terminated = terminated
self.total_reward += sum(rewards)
def extend(self, other):
"""Append another rollout to this rollout."""
assert not self.terminated
self.states.extend(other.states)
self.actions.extend(other.actions)
self.rewards.extend(other.rewards)
self.values.extend(other.values)
self.terminated = other.terminated
self.total_reward += other.total_reward
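# A minimal usage sketch of the Rollout class above (added for illustration;
# not part of the original module). It only uses the methods documented above:
#
#   ro = Rollout()
#   ro.add(state=0, action=1, reward=0.0, value=0.5)
#   ro.add(state=1, action=2, reward=1.0, value=0.25, terminated=True)
#   assert ro.terminated
#   assert ro.total_reward == 1.0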
def discount(x, gamma):
"""Returns discounted sums for each value in x, with discount factor gamma.
This can be used to compute the return (discounted sum of rewards) at each
timestep given a sequence of rewards. See the definitions for return and
REINFORCE in section 3 of https://arxiv.org/pdf/1602.01783.pdf.
Let g^k mean gamma ** k.
For list [x_0, ..., x_N], the following list of discounted sums is computed:
[x_0 + g^1 * x_1 + g^2 * x_2 + ... g^N * x_N,
x_1 + g^1 * x_2 + g^2 * x_3 + ... g^(N-1) * x_N,
x_2 + g^1 * x_3 + g^2 * x_4 + ... g^(N-2) * x_N,
...,
x_(N-1) + g^1 * x_N,
x_N]
Args:
x: List of numbers [x_0, ..., x_N].
gamma: Float between 0 and 1 (inclusive). This is the discount factor.
Returns:
List of discounted sums.
"""
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
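# Worked example for `discount` (illustrative, not from the original source):
# with gamma = 0.5 and x = [0.0, 1.0, 0.0, 0.0, 1.0], the discounted sums are
# [0.5625, 1.125, 0.25, 0.5, 1.0], since each entry is
# x_t + 0.5 * x_(t+1) + 0.25 * x_(t+2) + ...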
def discounted_advantage_and_rewards(rewards, values, gamma, lambda_=1.0):
"""Compute advantages and returns (discounted sum of rewards).
For an episode of length T, rewards = [r_0, ..., r_(T-1)].
Each reward r_t is observed after taking action a_t at state s_t. A final
state s_T is observed but no reward is given at this state since no action
a_T is taken (otherwise there would be a new state s_(T+1)).
`rewards` and `values` are for a single episode. Return R_t is the discounted
sum of future rewards starting at time t, where `gamma` is the discount
factor.
R_t = r_t + gamma * r_(t+1) + gamma**2 * r_(t+2) + ...
+ gamma**(T-1-t) * r_(T-1)
Advantage A(a_t, s_t) is approximated by computing A(a_t, s_t) = R_t - V(s_t)
where V(s_t) is an approximation of the value at that state, given in the
`values` list. Returns R_t are needed for all REINFORCE algorithms. Advantage
is used for the advantage actor critic variant of REINFORCE.
See algorithm S3 in https://arxiv.org/pdf/1602.01783.pdf.
Additionally another parameter `lambda_` controls the bias-variance tradeoff.
See "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438.
lambda_ = 1 reduces to regular advantage.
0 <= lambda_ < 1 trades off variance for bias, with lambda_ = 0 being the
most biased.
Bootstrapping is also supported. If an episode does not end in a terminal
state (either because the episode was ended early, or the environment does not
have end states), the true return cannot be computed from the rewards alone.
However, it can be estimated by computing the value (an approximation of
return) of the last state s_T. Thus the `values` list will have an extra item:
values = [V(s_0), ..., V(s_(T-1)), V(s_T)].
Args:
rewards: List of observed rewards [r_0, ..., r_(T-1)].
values: List of estimated values [V(s_0), ..., V(s_(T-1))] with an optional
extra V(s_T) item.
gamma: Discount factor. Number between 0 and 1. 1 means no discount.
If not 1, gamma is typically near 1, like 0.99.
lambda_: Bias-variance tradeoff factor. Between 0 and 1.
Returns:
empirical_values: Returns at each timestep.
generalized_advantage: Advantages at each timestep.
Raises:
ValueError: If shapes of `rewards` and `values` are not rank 1.
ValueError: If len(values) not in (len(rewards), len(rewards) + 1).
"""
rewards = np.asarray(rewards, dtype=np.float32)
values = np.asarray(values, dtype=np.float32)
if rewards.ndim != 1:
raise ValueError('Single episode only. rewards must be rank 1.')
if values.ndim != 1:
raise ValueError('Single episode only. values must be rank 1.')
if len(values) == len(rewards):
# No bootstrapping.
values = np.append(values, 0)
empirical_values = discount(rewards, gamma)
elif len(values) == len(rewards) + 1:
# With bootstrapping.
# Last value is for the terminal state (final state after last action was
# taken).
empirical_values = discount(np.append(rewards, values[-1]), gamma)[:-1]
else:
raise ValueError('values should contain the same number of items or one '
'more item than rewards')
delta = rewards + gamma * values[1:] - values[:-1]
generalized_advantage = discount(delta, gamma * lambda_)
# empirical_values is the discounted sum of rewards into the future.
# generalized_advantage is the target for each policy update.
return empirical_values, generalized_advantage
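# Worked example (illustrative, not from the original source): for
# rewards = [0, 0, 1], values = [0.5, 0.5, 0.5], gamma = 1 and lambda_ = 1
# (no bootstrapping), the returns are [1, 1, 1] and the advantages are the
# returns minus the value estimates, i.e. [0.5, 0.5, 0.5].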
"""Batch holds a minibatch of episodes.
Let bi = batch_index, i.e. the index of each episode in the minibatch.
Let t = time.
Attributes:
states: States for each timestep in each episode. Indexed by states[bi, t].
actions: Actions for each timestep in each episode. Indexed by actions[bi, t].
discounted_adv: Advantages (computed by discounted_advantage_and_rewards)
for each timestep in each episode. Indexed by discounted_adv[bi, t].
discounted_r: Returns (discounted sum of rewards computed by
discounted_advantage_and_rewards) for each timestep in each episode.
Indexed by discounted_r[bi, t].
total_rewards: Total reward for each episode, i.e. sum of rewards across all
timesteps (not discounted). Indexed by total_rewards[bi].
episode_lengths: Number of timesteps in each episode. If an episode has
N actions, N rewards, and N states, then its length is N. Indexed by
episode_lengths[bi].
batch_size: Number of episodes in this minibatch. An integer.
max_time: Maximum episode length in the batch. An integer.
""" # pylint: disable=pointless-string-statement
Batch = namedtuple(
'Batch',
['states', 'actions', 'discounted_adv', 'discounted_r', 'total_rewards',
'episode_lengths', 'batch_size', 'max_time'])
def process_rollouts(rollouts, gamma, lambda_=1.0):
"""Convert a batch of rollouts into tensors ready to be fed into a model.
Lists from each episode are stacked into 2D tensors and padded with 0s up to
the maximum timestep in the batch.
Args:
rollouts: A list of Rollout instances.
gamma: The discount factor. A number between 0 and 1 (inclusive). See gamma
argument in discounted_advantage_and_rewards.
lambda_: See lambda_ argument in discounted_advantage_and_rewards.
Returns:
Batch instance. states, actions, discounted_adv, and discounted_r are
numpy arrays with shape (batch_size, max_episode_length). episode_lengths
is a list of ints. total_rewards is a list of floats (total reward in each
episode). batch_size and max_time are ints.
Raises:
ValueError: If any of the rollouts are not terminal.
"""
for ro in rollouts:
if not ro.terminated:
raise ValueError('Can only process terminal rollouts.')
episode_lengths = [len(ro.states) for ro in rollouts]
batch_size = len(rollouts)
max_time = max(episode_lengths)
states = utils.stack_pad([ro.states for ro in rollouts], 0, max_time)
actions = utils.stack_pad([ro.actions for ro in rollouts], 0, max_time)
discounted_rewards = [None] * batch_size
discounted_adv = [None] * batch_size
for i, ro in enumerate(rollouts):
disc_r, disc_adv = discounted_advantage_and_rewards(
ro.rewards, ro.values, gamma, lambda_)
discounted_rewards[i] = disc_r
discounted_adv[i] = disc_adv
discounted_rewards = utils.stack_pad(discounted_rewards, 0, max_time)
discounted_adv = utils.stack_pad(discounted_adv, 0, max_time)
total_rewards = [sum(ro.rewards) for ro in rollouts]
return Batch(states=states,
actions=actions,
discounted_adv=discounted_adv,
discounted_r=discounted_rewards,
total_rewards=total_rewards,
episode_lengths=episode_lengths,
batch_size=batch_size,
max_time=max_time)
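# Usage sketch for `process_rollouts` (illustrative; assumes `ro1` and `ro2`
# are terminated Rollout instances built as shown above):
#
#   batch = process_rollouts([ro1, ro2], gamma=0.99)
#   # batch.states and batch.actions have shape (batch_size, max_time) and are
#   # padded with zeros past each episode's length.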
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for common.rollout."""
import numpy as np
import tensorflow as tf
from common import rollout as rollout_lib # brain coder
class RolloutTest(tf.test.TestCase):
def MakeRollout(self, states, actions, rewards, values=None, terminated=True):
rollout = rollout_lib.Rollout()
rollout.add_many(
states=states, actions=actions, rewards=rewards, values=values,
terminated=terminated)
return rollout
def testDiscount(self):
discounted = np.array([1.0 / 2 ** n for n in range(4, -1, -1)])
discounted[:2] += [1.0 / 2 ** n for n in range(1, -1, -1)]
self.assertTrue(np.array_equal(
rollout_lib.discount([0.0, 1.0, 0.0, 0.0, 1.0], 0.50),
discounted))
self.assertTrue(np.array_equal(
rollout_lib.discount(np.array([0.0, 1.0, 0.0, 0.0, 1.0]), 0.50),
discounted))
def testDiscountedAdvantageAndRewards(self):
# lambda=1, No bootstrapping.
values = [0.1, 0.5, 0.5, 0.25]
(empirical_values,
generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
[0.0, 0.0, 0.0, 1.0],
values,
gamma=0.75,
lambda_=1.0)
expected_discounted_r = (
np.array([1.0 * 0.75 ** n for n in range(3, -1, -1)]))
expected_adv = expected_discounted_r - values
self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
self.assertTrue(np.allclose(generalized_advantage, expected_adv))
# lambda=1, With bootstrapping.
values = [0.1, 0.5, 0.5, 0.25, 0.75]
(empirical_values,
generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
[0.0, 0.0, 0.0, 1.0],
values,
gamma=0.75,
lambda_=1.0)
expected_discounted_r = (
np.array([0.75 * 0.75 ** n for n in range(4, 0, -1)])
+ np.array([1.0 * 0.75 ** n for n in range(3, -1, -1)]))
expected_adv = expected_discounted_r - values[:-1]
self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
self.assertTrue(np.allclose(generalized_advantage, expected_adv))
# lambda=0.5, With bootstrapping.
values = [0.1, 0.5, 0.5, 0.25, 0.75]
rewards = [0.0, 0.0, 0.0, 1.0]
l = 0.5 # lambda
g = 0.75 # gamma
(empirical_values,
generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
rewards,
values,
gamma=g,
lambda_=l)
expected_discounted_r = (
np.array([0.75 * g ** n for n in range(4, 0, -1)])
+ np.array([1.0 * g ** n for n in range(3, -1, -1)]))
expected_adv = [0.0] * len(values)
for t in range(3, -1, -1):
delta_t = rewards[t] + g * values[t + 1] - values[t]
expected_adv[t] = delta_t + g * l * expected_adv[t + 1]
expected_adv = expected_adv[:-1]
self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
self.assertTrue(np.allclose(generalized_advantage, expected_adv))
def testProcessRollouts(self):
g = 0.95
rollouts = [
self.MakeRollout(
states=[3, 6, 9],
actions=[1, 2, 3],
rewards=[1.0, -1.0, 0.5],
values=[0.5, 0.5, 0.1]),
self.MakeRollout(
states=[10],
actions=[5],
rewards=[1.0],
values=[0.5])]
batch = rollout_lib.process_rollouts(rollouts, gamma=g)
self.assertEqual(2, batch.batch_size)
self.assertEqual(3, batch.max_time)
self.assertEqual([3, 1], batch.episode_lengths)
self.assertEqual([0.5, 1.0], batch.total_rewards)
self.assertEqual(
[[3, 6, 9], [10, 0, 0]],
batch.states.tolist())
self.assertEqual(
[[1, 2, 3], [5, 0, 0]],
batch.actions.tolist())
rew1, rew2 = rollouts[0].rewards, rollouts[1].rewards
expected_discounted_rewards = [
[rew1[0] + g * rew1[1] + g * g * rew1[2],
rew1[1] + g * rew1[2],
rew1[2]],
[rew2[0], 0.0, 0.0]]
expected_advantages = [
[dr - v
for dr, v
in zip(expected_discounted_rewards[0], rollouts[0].values)],
[expected_discounted_rewards[1][0] - rollouts[1].values[0], 0.0, 0.0]]
self.assertTrue(
np.allclose(expected_discounted_rewards, batch.discounted_r))
self.assertTrue(
np.allclose(expected_advantages, batch.discounted_adv))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Schedule functions for controlling hparams over time."""
from abc import ABCMeta
from abc import abstractmethod
import math
from common import config_lib # brain coder
class Schedule(object):
"""Schedule is a function which sets a hyperparameter's value over time.
For example, a schedule can be used to decay an hparam, or oscillate it over
time.
This object is constructed with an instance of config_lib.Config (will be
specific to each class implementation). For example if this is a decay
schedule, the config may specify the rate of decay and decay start time. Then
the object instance is called like a function, mapping global step (an integer
counting how many calls to the train op have been made) to the hparam value.
Properties of a schedule function f(t):
0) Domain of t is the non-negative integers (t may be 0).
1) Range of f is the reals.
2) Schedule functions can assume that they will be called in time order. This
allows schedules to be stateful.
3) Schedule functions should be deterministic. Two schedule instances with the
same config must always give the same value for each t, regardless of
which t's they were previously called on. Users may call f(t) with arbitrary
(positive) time jumps. Essentially, multiple schedule instances used in
replica training will behave the same.
4) Duplicate successive calls at the same time step are allowed.
"""
__metaclass__ = ABCMeta
@abstractmethod
def __init__(self, config):
"""Construct this schedule with a config specific to each class impl.
Args:
config: An instance of config_lib.Config.
"""
pass
@abstractmethod
def __call__(self, global_step):
"""Map `global_step` to a value.
`global_step` is an integer counting how many calls to the train op have
been made across all replicas (hence why it is global). Implementations
may assume calls to be made in time order, i.e. `global_step` now >=
previous `global_step` values.
Args:
global_step: Non-negative integer.
Returns:
Hparam value at this step. A number.
"""
pass
class ConstSchedule(Schedule):
"""Constant function.
config:
const: Constant value at every step.
f(t) = const.
"""
def __init__(self, config):
super(ConstSchedule, self).__init__(config)
self.const = config.const
def __call__(self, global_step):
return self.const
class LinearDecaySchedule(Schedule):
"""Linear decay function.
config:
initial: Decay starts from this value.
final: Decay ends at this value.
start_time: Step when decay starts. Constant before it.
end_time: When decay ends. Constant after it.
f(t) is a linear function when start_time <= t <= end_time, with slope of
(final - initial) / (end_time - start_time). f(t) = initial
when t <= start_time. f(t) = final when t >= end_time.
If start_time == end_time, this becomes a step function.
"""
def __init__(self, config):
super(LinearDecaySchedule, self).__init__(config)
self.initial = config.initial
self.final = config.final
self.start_time = config.start_time
self.end_time = config.end_time
if self.end_time < self.start_time:
raise ValueError('start_time must be before end_time.')
# Linear interpolation.
self._time_diff = float(self.end_time - self.start_time)
self._diff = float(self.final - self.initial)
self._slope = (
self._diff / self._time_diff if self._time_diff > 0 else float('inf'))
def __call__(self, global_step):
if global_step <= self.start_time:
return self.initial
if global_step > self.end_time:
return self.final
return self.initial + (global_step - self.start_time) * self._slope
class ExponentialDecaySchedule(Schedule):
"""Exponential decay function.
See https://en.wikipedia.org/wiki/Exponential_decay.
Use this decay function to decay over orders of magnitude. For example, to
decay learning rate from 1e-2 to 1e-6. Exponential decay will decay the
exponent linearly.
config:
initial: Decay starts from this value.
final: Decay ends at this value.
start_time: Step when decay starts. Constant before it.
end_time: When decay ends. Constant after it.
f(t) is an exponential decay function when start_time <= t <= end_time. The
decay rate and amplitude are chosen so that f(t) = initial when
t = start_time, and f(t) = final when t = end_time. f(t) is constant for
t < start_time or t > end_time. initial and final must be positive values.
If start_time == end_time, this becomes a step function.
"""
def __init__(self, config):
super(ExponentialDecaySchedule, self).__init__(config)
self.initial = config.initial
self.final = config.final
self.start_time = config.start_time
self.end_time = config.end_time
if self.initial <= 0 or self.final <= 0:
raise ValueError('initial and final must be positive numbers.')
# Linear interpolation in log space.
self._linear_fn = LinearDecaySchedule(
config_lib.Config(
initial=math.log(self.initial),
final=math.log(self.final),
start_time=self.start_time,
end_time=self.end_time))
def __call__(self, global_step):
return math.exp(self._linear_fn(global_step))
class SmootherstepDecaySchedule(Schedule):
"""Smootherstep decay function.
A sigmoid-like transition from initial to final values; a smoother
transition than the linear and exponential decays, hence the name.
See https://en.wikipedia.org/wiki/Smoothstep.
config:
initial: Decay starts from this value.
final: Decay ends at this value.
start_time: Step when decay starts. Constant before it.
end_time: When decay ends. Constant after it.
f(t) is fully defined here:
https://en.wikipedia.org/wiki/Smoothstep#Variations.
f(t) is smooth, i.e. its first derivative exists everywhere.
"""
def __init__(self, config):
super(SmootherstepDecaySchedule, self).__init__(config)
self.initial = config.initial
self.final = config.final
self.start_time = config.start_time
self.end_time = config.end_time
if self.end_time < self.start_time:
raise ValueError('start_time must be before end_time.')
self._time_diff = float(self.end_time - self.start_time)
self._diff = float(self.final - self.initial)
def __call__(self, global_step):
if global_step <= self.start_time:
return self.initial
if global_step > self.end_time:
return self.final
x = (global_step - self.start_time) / self._time_diff
# Smootherstep
return self.initial + x * x * x * (x * (x * 6 - 15) + 10) * self._diff
class HardOscillatorSchedule(Schedule):
"""Hard oscillator function.
config:
high: Max value of the oscillator. Value at constant plateaus.
low: Min value of the oscillator. Value at constant valleys.
start_time: Global step when oscillation starts. Constant before this.
period: Width of one oscillation, i.e. number of steps over which the
oscillation takes place.
transition_fraction: Fraction of the period spent transitioning between high
and low values. 50% of this time is spent rising, and 50% of this time
is spent falling. 50% of the remaining time is spent constant at the
high value, and 50% of the remaining time is spent constant at the low
value. transition_fraction = 1.0 means the entire period is spent
rising and falling. transition_fraction = 0.0 means no time is spent
rising and falling, i.e. the function jumps instantaneously between
high and low.
f(t) = high when t < start_time.
f(t) is periodic when t >= start_time, with f(t + period) = f(t).
f(t) is linear with positive slope when rising, and negative slope when
falling. At the start of the period t0, f(t0) = high and begins to descend.
At the middle of the period f is low and is constant until the ascension
begins. f then rises from low to high and is constant again until the period
repeats.
Note: when transition_fraction is 0, f starts the period low and ends high.
"""
def __init__(self, config):
super(HardOscillatorSchedule, self).__init__(config)
self.high = config.high
self.low = config.low
self.start_time = config.start_time
self.period = float(config.period)
self.transition_fraction = config.transition_fraction
self.half_transition_fraction = config.transition_fraction / 2.0
if self.transition_fraction < 0 or self.transition_fraction > 1.0:
raise ValueError('transition_fraction must be between 0 and 1.0')
if self.period <= 0:
raise ValueError('period must be positive')
self._slope = (
float(self.high - self.low) / self.half_transition_fraction
if self.half_transition_fraction > 0 else float('inf'))
def __call__(self, global_step):
if global_step < self.start_time:
return self.high
period_pos = ((global_step - self.start_time) / self.period) % 1.0
if period_pos >= 0.5:
# ascending
period_pos -= 0.5
if period_pos < self.half_transition_fraction:
return self.low + period_pos * self._slope
else:
return self.high
else:
# descending
if period_pos < self.half_transition_fraction:
return self.high - period_pos * self._slope
else:
return self.low
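# Worked example (illustrative, mirroring the values checked in
# schedules_test.py): with high=2, low=0, start_time=100, period=10 and
# transition_fraction=0.5, f descends 2 -> 1.2 -> 0.4 over steps 100-102,
# reaches 0 at step 103 and stays there through step 105, rises 0.8 -> 1.6
# over steps 106-107, and is back at 2 from step 108 until the next period
# begins at step 110.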
_NAME_TO_CONFIG = {
'const': ConstSchedule,
'linear_decay': LinearDecaySchedule,
'exp_decay': ExponentialDecaySchedule,
'smooth_decay': SmootherstepDecaySchedule,
'hard_osc': HardOscillatorSchedule,
}
def make_schedule(config):
"""Schedule factory.
Given `config` containing a `fn` property, a Schedule implementation is
instantiated with `config`. See `_NAME_TO_CONFIG` for `fn` options.
Args:
config: Config with a `fn` option that specifies which Schedule
implementation to use. `config` is passed into the constructor.
Returns:
A Schedule impl instance.
"""
schedule_class = _NAME_TO_CONFIG[config.fn]
return schedule_class(config)
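# Usage sketch (illustrative, not part of the original module): building a
# linear decay schedule from a Config and evaluating it at a few global steps.
#
#   cfg = config_lib.Config(fn='linear_decay', initial=1.0, final=0.0,
#                           start_time=0, end_time=100)
#   f = make_schedule(cfg)
#   # f(0) == 1.0, f(50) == 0.5, f(200) == 0.0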
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for common.schedules."""
from math import exp
from math import sqrt
import numpy as np
import tensorflow as tf
from common import config_lib # brain coder
from common import schedules # brain coder
class SchedulesTest(tf.test.TestCase):
def ScheduleTestHelper(self, config, schedule_subtype, io_values):
"""Run common checks for schedules.
Args:
config: Config object which is passed into schedules.make_schedule.
schedule_subtype: The expected schedule type to be instantiated.
io_values: List of (input, output) pairs. Must be in ascending input
order. No duplicate inputs.
"""
# Check that make_schedule makes the correct type.
f = schedules.make_schedule(config)
self.assertTrue(isinstance(f, schedule_subtype))
# Check that multiple instances returned from make_schedule behave the same.
fns = [schedules.make_schedule(config) for _ in xrange(3)]
# Check that all the inputs map to the right outputs.
for i, o in io_values:
for f in fns:
f_out = f(i)
self.assertTrue(
np.isclose(o, f_out),
'Wrong value at input %d. Expected %s, got %s' % (i, o, f_out))
# Check that a subset of the io_values are still correct.
f = schedules.make_schedule(config)
subseq = [io_values[i**2] for i in xrange(int(sqrt(len(io_values))))]
if subseq[-1] != io_values[-1]:
subseq.append(io_values[-1])
for i, o in subseq:
f_out = f(i)
self.assertTrue(
np.isclose(o, f_out),
'Wrong value at input %d. Expected %s, got %s' % (i, o, f_out))
# Check duplicate calls.
f = schedules.make_schedule(config)
for i, o in io_values:
for _ in xrange(3):
f_out = f(i)
self.assertTrue(
np.isclose(o, f_out),
'Duplicate calls at input %d are not equal. Expected %s, got %s'
% (i, o, f_out))
def testConstSchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='const', const=5),
schedules.ConstSchedule,
[(0, 5), (1, 5), (10, 5), (20, 5), (100, 5), (1000000, 5)])
def testLinearDecaySchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='linear_decay', initial=2, final=0, start_time=10,
end_time=20),
schedules.LinearDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 1.8), (15, 1), (19, 0.2), (20, 0),
(100000, 0)])
# Test step function.
self.ScheduleTestHelper(
config_lib.Config(fn='linear_decay', initial=2, final=0, start_time=10,
end_time=10),
schedules.LinearDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 0), (15, 0)])
def testExponentialDecaySchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='exp_decay', initial=exp(-1), final=exp(-6),
start_time=10, end_time=20),
schedules.ExponentialDecaySchedule,
[(0, exp(-1)), (1, exp(-1)), (10, exp(-1)), (11, exp(-1/2. - 1)),
(15, exp(-5/2. - 1)), (19, exp(-9/2. - 1)), (20, exp(-6)),
(100000, exp(-6))])
# Test step function.
self.ScheduleTestHelper(
config_lib.Config(fn='exp_decay', initial=exp(-1), final=exp(-6),
start_time=10, end_time=10),
schedules.ExponentialDecaySchedule,
[(0, exp(-1)), (1, exp(-1)), (10, exp(-1)), (11, exp(-6)),
(15, exp(-6))])
def testSmootherstepDecaySchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='smooth_decay', initial=2, final=0, start_time=10,
end_time=20),
schedules.SmootherstepDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 1.98288), (15, 1), (19, 0.01712),
(20, 0), (100000, 0)])
# Test step function.
self.ScheduleTestHelper(
config_lib.Config(fn='smooth_decay', initial=2, final=0, start_time=10,
end_time=10),
schedules.SmootherstepDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 0), (15, 0)])
def testHardOscillatorSchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='hard_osc', high=2, low=0, start_time=100,
period=10, transition_fraction=0.5),
schedules.HardOscillatorSchedule,
[(0, 2), (1, 2), (10, 2), (100, 2), (101, 1.2), (102, 0.4), (103, 0),
(104, 0), (105, 0), (106, 0.8), (107, 1.6), (108, 2), (109, 2),
(110, 2), (111, 1.2), (112, 0.4), (115, 0), (116, 0.8), (119, 2),
(120, 2), (100001, 1.2), (100002, 0.4), (100005, 0), (100006, 0.8),
(100010, 2)])
# Test instantaneous step.
self.ScheduleTestHelper(
config_lib.Config(fn='hard_osc', high=2, low=0, start_time=100,
period=10, transition_fraction=0),
schedules.HardOscillatorSchedule,
[(0, 2), (1, 2), (10, 2), (99, 2), (100, 0), (104, 0), (105, 2),
(106, 2), (109, 2), (110, 0)])
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Configuration class."""
import bisect
from collections import deque
import cPickle
import heapq
import random
from absl import logging
import numpy as np
import tensorflow as tf
def tuple_to_record(tuple_, record_type):
return record_type(**dict(zip(record_type.__slots__, tuple_)))
def make_record(type_name, attributes, defaults=None):
"""Factory for mutable record classes.
A record acts just like a collections.namedtuple except slots are writable.
One exception is that record classes are not equivalent to tuples or other
record classes of the same length.
Note, each call to `make_record` produces a unique type. Two calls will make
different types even if `type_name` is the same each time.
Args:
type_name: Name of the record type to create.
attributes: List of names of each record attribute. The order of the list
is preserved.
defaults: (optional) default values for attributes. A dict mapping attribute
names to values.
Returns:
A new record type.
Raises:
ValueError: If,
`defaults` is not a dict,
`attributes` contains duplicate names,
`defaults` keys are not contained in `attributes`.
"""
if defaults is None:
defaults = {}
if not isinstance(defaults, dict):
raise ValueError('defaults must be a dict.')
attr_set = set(attributes)
if len(attr_set) < len(attributes):
raise ValueError('No duplicate attributes allowed.')
if not set(defaults.keys()).issubset(attr_set):
raise ValueError('Default attributes must be given in the attributes list.')
class RecordClass(object):
"""A record type.
Acts like mutable tuple with named slots.
"""
__slots__ = list(attributes)
_defaults = dict(defaults)
def __init__(self, *args, **kwargs):
if len(args) > len(self.__slots__):
raise ValueError('Too many arguments. %s has length %d.'
% (type(self).__name__, len(self.__slots__)))
for attr, val in self._defaults.items():
setattr(self, attr, val)
for i, arg in enumerate(args):
setattr(self, self.__slots__[i], arg)
for attr, val in kwargs.items():
setattr(self, attr, val)
for attr in self.__slots__:
if not hasattr(self, attr):
raise ValueError('Required attr "%s" is not set.' % attr)
def __len__(self):
return len(self.__slots__)
def __iter__(self):
for attr in self.__slots__:
yield getattr(self, attr)
def __getitem__(self, index):
return getattr(self, self.__slots__[index])
def __setitem__(self, index, value):
return setattr(self, self.__slots__[index], value)
def __eq__(self, other):
# Types must be equal as well as values.
return (isinstance(other, type(self))
and all(a == b for a, b in zip(self, other)))
def __str__(self):
return '%s(%s)' % (
type(self).__name__,
', '.join(attr + '=' + str(getattr(self, attr))
for attr in self.__slots__))
def __repr__(self):
return str(self)
RecordClass.__name__ = type_name
return RecordClass
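# Example usage of `make_record` (illustrative, not part of the original
# module): a mutable record type with a default value.
#
#   Point = make_record('Point', ['x', 'y'], {'y': 0})
#   p = Point(x=3)
#   # p.x == 3, p.y == 0, and slots are writable: p.y = 5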
# Making minibatches.
def stack_pad(tensors, pad_axes=None, pad_to_lengths=None, dtype=np.float32,
pad_value=0):
"""Stack tensors along 0-th dim and pad them to be the same shape.
Args:
tensors: Any list of iterables (python list, numpy array, etc). Can be 1D
or multi-D iterables.
pad_axes: An int or list of ints. Axes to pad along.
pad_to_lengths: Length in each dimension. If pad_axes was an int, this is an
int or None. If pad_axes was a list of ints, this is a list of mixed int
and None types with the same length, or None. A None length means the
maximum length among the given tensors is used.
dtype: Type of output numpy array. Defaults to np.float32.
pad_value: Value to use for padding. Defaults to 0.
Returns:
Numpy array containing the tensors stacked along the 0-th dimension and
padded along the specified dimensions.
Raises:
ValueError: If the tensors do not have equal shapes along non-padded
dimensions.
"""
tensors = [np.asarray(t) for t in tensors]
max_lengths = [max(l) for l in zip(*[t.shape for t in tensors])]
same_axes = dict(enumerate(max_lengths))
if pad_axes is None:
pad_axes = []
if isinstance(pad_axes, (int, long)):
if pad_to_lengths is not None:
max_lengths[pad_axes] = pad_to_lengths
del same_axes[pad_axes]
else:
if pad_to_lengths is None:
pad_to_lengths = [None] * len(pad_axes)
for i, l in zip(pad_axes, pad_to_lengths):
if l is not None:
max_lengths[i] = l
del same_axes[i]
same_axes_items = same_axes.items()
dest = np.full([len(tensors)] + max_lengths, pad_value, dtype=dtype)
for i, t in enumerate(tensors):
for j, l in same_axes_items:
if t.shape[j] != l:
raise ValueError(
'Tensor at index %d does not have size %d along axis %d'
% (i, l, j))
dest[[i] + [slice(0, d) for d in t.shape]] = t
return dest
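# Example (illustrative): padding variable-length 1D sequences along axis 0.
#
#   stack_pad([[1, 2, 3], [4]], pad_axes=0)
#   # yields a float32 array equal to [[1, 2, 3], [4, 0, 0]].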
class RandomQueue(deque):
def __init__(self, capacity):
super(RandomQueue, self).__init__([], capacity)
self.capacity = capacity
def random_sample(self, sample_size):
idx = np.random.choice(len(self), sample_size)
return [self[i] for i in idx]
def push(self, item):
# Append to right. Oldest element will be popped from left.
self.append(item)
class MPQItemContainer(object):
"""Class for holding an item with its score.
Defines a comparison function for use in the heap-queue.
"""
def __init__(self, score, item, extra_data):
self.item = item
self.score = score
self.extra_data = extra_data
def __cmp__(self, other):
assert isinstance(other, type(self))
return cmp(self.score, other.score)
def __iter__(self):
"""Allows unpacking like a tuple."""
yield self.score
yield self.item
yield self.extra_data
def __repr__(self):
"""String representation of this item.
`extra_data` is not included in the representation. We are assuming that
`extra_data` is not easily interpreted by a human (if it was, it should be
hashable, like a string or tuple).
Returns:
String representation of `self`.
"""
return str((self.score, self.item))
def __str__(self):
return repr(self)
class MaxUniquePriorityQueue(object):
"""A maximum priority queue where duplicates are not added.
The top items by score remain in the queue. When the capacity is reached,
the lowest scored item in the queue will be dropped.
This implementation differs from a typical priority queue, in that the minimum
score is popped, instead of the maximum. Largest scores remain stuck in the
queue. This is useful for accumulating the best known items from a population.
The items used to determine uniqueness must be hashable, but additional
non-hashable data may be stored with each item.
"""
def __init__(self, capacity):
self.capacity = capacity
self.heap = []
self.unique_items = set()
def push(self, score, item, extra_data=None):
"""Push an item onto the queue.
If the queue is at capacity, the item with the smallest score will be
dropped. Note that it is assumed each item has exactly one score: pushing the
same item again with a different score is still ignored.
Args:
score: Number used to prioritize items in the queue. Largest scores are
kept in the queue.
item: A hashable item to be stored. Duplicates of this item will not be
added to the queue.
extra_data: Extra (possibly not hashable) data to store with the item.
"""
if item in self.unique_items:
return
if len(self.heap) >= self.capacity:
_, popped_item, _ = heapq.heappushpop(
self.heap, MPQItemContainer(score, item, extra_data))
self.unique_items.add(item)
self.unique_items.remove(popped_item)
else:
heapq.heappush(self.heap, MPQItemContainer(score, item, extra_data))
self.unique_items.add(item)
def pop(self):
"""Pop the item with the lowest score.
Returns:
score: Item's score.
item: The item that was popped.
extra_data: Any extra data stored with the item.
"""
if not self.heap:
return ()
score, item, extra_data = heapq.heappop(self.heap)
self.unique_items.remove(item)
return score, item, extra_data
def get_max(self):
"""Peek at the item with the highest score.
Returns:
Same as `pop`.
"""
if not self.heap:
return ()
score, item, extra_data = heapq.nlargest(1, self.heap)[0]
return score, item, extra_data
def get_min(self):
"""Peek at the item with the lowest score.
Returns:
Same as `pop`.
"""
if not self.heap:
return ()
score, item, extra_data = heapq.nsmallest(1, self.heap)[0]
return score, item, extra_data
def random_sample(self, sample_size):
"""Randomly select items from the queue.
This does not modify the queue.
Items are drawn from a uniform distribution, and not weighted by score.
Args:
sample_size: Number of random samples to draw. The same item can be
sampled multiple times.
Returns:
List of sampled items (of length `sample_size`). Each element in the list
is a tuple: (item, extra_data).
"""
idx = np.random.choice(len(self.heap), sample_size)
return [(self.heap[i].item, self.heap[i].extra_data) for i in idx]
def iter_in_order(self):
"""Iterate over items in the queue from largest score to smallest.
Yields:
item: Hashable item.
extra_data: Extra data stored with the item.
"""
for _, item, extra_data in heapq.nlargest(len(self.heap), self.heap):
yield item, extra_data
def __len__(self):
return len(self.heap)
def __iter__(self):
for _, item, _ in self.heap:
yield item
def __repr__(self):
return '[' + ', '.join(repr(c) for c in self.heap) + ']'
def __str__(self):
return repr(self)
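# Example usage of MaxUniquePriorityQueue (illustrative, not part of the
# original module):
#
#   q = MaxUniquePriorityQueue(capacity=2)
#   q.push(1.0, 'a')
#   q.push(2.0, 'b')
#   q.push(0.5, 'c')  # At capacity; 'c' has the lowest score and is discarded.
#   # q.get_max() == (2.0, 'b', None); q.pop() == (1.0, 'a', None)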
class RouletteWheel(object):
"""Randomly samples stored objects proportionally to their given weights.
Stores objects and weights. Acts like a roulette wheel where each object is
given a slice of the roulette disk proportional to its weight.
This can be used as a replay buffer where past experiences are sampled
proportionally to their weights. A good choice of "weight" for reinforcement
learning is exp(reward / temperature) where temperature -> inf makes the
distribution more uniform and temperature -> 0 makes the distribution more
peaky.
To prevent experiences from being overweighted by appearing in the replay
buffer multiple times, a "unique mode" is supported where duplicate
experiences are ignored. In unique mode, weights can be quickly retrieved from
keys.
"""
def __init__(self, unique_mode=False, save_file=None):
"""Construct empty RouletteWheel.
If `save_file` is not None, and the file already exists on disk, whatever
is in the file will be loaded into this instance. This allows jobs using
RouletteWheel to resume after preemption.
Args:
unique_mode: If True, puts this RouletteWheel into unique mode, where
objects are added with hashable keys, so that duplicates are ignored.
save_file: Optional file path to save to. Must be a string containing
an absolute path to a file, or None. File will be Python pickle
format.
"""
self.unique_mode = unique_mode
self.objects = []
self.weights = []
self.partial_sums = []
if self.unique_mode:
self.keys_to_weights = {}
self.save_file = save_file
self.save_to_disk_buffer = []
if save_file is not None and tf.gfile.Exists(save_file):
# Load from disk.
with tf.gfile.OpenFast(save_file, 'r') as f:
count = 0
while 1:
try:
obj, weight, key = cPickle.load(f)
except EOFError:
break
else:
self.add(obj, weight, key)
count += 1
logging.info('Loaded %d samples from disk.', count)
# Clear buffer since these items are already on disk.
self.save_to_disk_buffer = []
def __iter__(self):
return iter(zip(self.objects, self.weights))
def __len__(self):
return len(self.objects)
def is_empty(self):
"""Returns whether there is anything in the roulette wheel."""
return not self.partial_sums
@property
def total_weight(self):
"""Total cumulative weight across all objects."""
if self.partial_sums:
return self.partial_sums[-1]
return 0.0
def has_key(self, key):
if not self.unique_mode:
  raise RuntimeError('has_key method can only be called in unique mode.')
return key in self.keys_to_weights
def get_weight(self, key):
if not self.unique_mode:
  raise RuntimeError('get_weight method can only be called in unique mode.')
return self.keys_to_weights[key]
def add(self, obj, weight, key=None):
"""Add one object and its weight to the roulette wheel.
Args:
obj: Any object to be stored.
weight: A non-negative float. The given object will be drawn with
probability proportional to this weight when sampling.
key: This argument is only used when in unique mode. To allow `obj` to
be an unhashable type, like list, a separate hashable key is given.
Each `key` should be unique to each `obj`. `key` is used to check if
`obj` has been added to the roulette wheel before.
Returns:
True if the object was added, False if it was not added due to it being
a duplicate (this only happens in unique mode).
Raises:
ValueError: If `weight` is negative.
ValueError: If `key` is not given when in unique mode, or if `key` is
given when not in unique mode.
"""
if weight < 0:
raise ValueError('Weight must be non-negative')
if self.unique_mode:
if key is None:
raise ValueError(
'Hashable key required for objects when unique mode is enabled.')
if key in self.keys_to_weights:
# Weight updates are not allowed. Ignore the given value of `weight`.
return False
self.keys_to_weights[key] = weight
elif key is not None:
raise ValueError(
'key argument should not be used when unique mode is disabled.')
self.objects.append(obj)
self.weights.append(weight)
self.partial_sums.append(self.total_weight + weight)
if self.save_file is not None:
# Record new item in buffer.
self.save_to_disk_buffer.append((obj, weight, key))
return True
def add_many(self, objs, weights, keys=None):
"""Add many object and their weights to the roulette wheel.
Arguments are the same as the `add` method, except each is a list. Lists
must all be the same length.
Args:
objs: List of objects to be stored.
weights: List of non-negative floats. See `add` method.
keys: List of hashable keys. This argument is only used when in unique
mode. See `add` method.
Returns:
Number of objects added. This number will be less than the number of
objects provided if we are in unique mode and some keys are already
in the roulette wheel.
Raises:
ValueError: If `keys` argument is provided when unique_mode == False, or
is not provided when unique_mode == True.
ValueError: If any of the lists are not the same length.
ValueError: If any of the weights are negative.
"""
if keys is not None and not self.unique_mode:
raise ValueError('Not in unique mode. Do not provide keys.')
elif keys is None and self.unique_mode:
raise ValueError('In unique mode. You must provide hashable keys.')
if keys and len(objs) != len(keys):
raise ValueError('Number of objects does not equal number of keys.')
if len(objs) != len(weights):
raise ValueError('Number of objects does not equal number of weights.')
return sum([self.add(obj, weights[i], key=keys[i] if keys else None)
for i, obj in enumerate(objs)])
def sample(self):
"""Spin the roulette wheel.
Randomly select an object with probability proportional to its weight.
Returns:
object: The selected object.
weight: The weight of the selected object.
Raises:
RuntimeError: If the roulette wheel is empty.
"""
if self.is_empty():
raise RuntimeError('Trying to sample from empty roulette wheel.')
spin = random.random() * self.total_weight
# Binary search.
i = bisect.bisect_right(self.partial_sums, spin)
if i == len(self.partial_sums):
# This should not happen since random.random() will always be strictly
# less than 1.0, and the last partial sum equals self.total_weight.
# However it may happen due to rounding error, in which case we simply
# select the last object.
i -= 1
return self.objects[i], self.weights[i]
def sample_many(self, count):
"""Spin the roulette wheel `count` times and return the results."""
if self.is_empty():
raise RuntimeError('Trying to sample from empty roulette wheel.')
return [self.sample() for _ in xrange(count)]
def incremental_save(self, log_info=False):
"""Write new entries to disk.
This performs an append operation on the `save_file` given in the
constructor. Any entries added since the last call to `incremental_save`
will be appended to the file.
If a new RouletteWheel is constructed with the same `save_file`, all the
entries written there will be automatically loaded into the instance.
This is useful when a job resumes after preemption.
Args:
log_info: If True, info about this operation will be logged.
Raises:
RuntimeError: If `save_file` given in the constructor is None.
"""
if self.save_file is None:
raise RuntimeError('Cannot call incremental_save. `save_file` is None.')
if log_info:
logging.info('Saving %d new samples to disk.',
len(self.save_to_disk_buffer))
with tf.gfile.OpenFast(self.save_file, 'a') as f:
for entry in self.save_to_disk_buffer:
cPickle.dump(entry, f)
# Clear the buffer.
self.save_to_disk_buffer = []
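# Usage sketch for RouletteWheel (illustrative, not part of the original
# module). In unique mode, weights might be exp(reward / temperature) computed
# by the caller; literal weights are used here:
#
#   wheel = RouletteWheel(unique_mode=True)
#   wheel.add([1, 2, 3], 1.5, key='episode-0')
#   wheel.add([4, 5], 0.5, key='episode-1')
#   obj, weight = wheel.sample()  # Drawn proportionally to weight.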
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for common.utils.
"""
from collections import Counter
import random
import tempfile
import numpy as np
import tensorflow as tf
from common import utils # brain coder
class UtilsTest(tf.test.TestCase):
def testStackPad(self):
# 1D.
tensors = [[1, 2, 3], [4, 5, 6, 7, 8], [9]]
result = utils.stack_pad(tensors, pad_axes=0, pad_to_lengths=6)
self.assertTrue(np.array_equal(
result,
np.asarray([[1, 2, 3, 0, 0, 0],
[4, 5, 6, 7, 8, 0],
[9, 0, 0, 0, 0, 0]], dtype=np.float32)))
# 3D.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]]]
result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
self.assertTrue(np.array_equal(
result,
np.asarray([[[[1, 2, 3], [4, 5, 6]],
[[0, 0, 0], [0, 0, 0]]],
[[[7, 8, 9], [0, 1, 2]],
[[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2], [0, 0, 0]],
[[3, 4, 5], [0, 0, 0]]]], dtype=np.float32)))
def testStackPadNoAxes(self):
# 2D.
tensors = [[[1, 2, 3], [4, 5, 6]],
[[7, 8, 9], [1, 2, 3]],
[[4, 5, 6], [7, 8, 9]]]
result = utils.stack_pad(tensors)
self.assertTrue(np.array_equal(
result,
np.asarray(tensors)))
def testStackPadNoneLength(self):
# 1D.
tensors = [[1, 2, 3], [4, 5, 6, 7, 8], [9]]
result = utils.stack_pad(tensors, pad_axes=0, pad_to_lengths=None)
self.assertTrue(np.array_equal(
result,
np.asarray([[1, 2, 3, 0, 0],
[4, 5, 6, 7, 8],
[9, 0, 0, 0, 0]], dtype=np.float32)))
# 3D.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]]]
result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=None)
self.assertTrue(np.array_equal(
result,
np.asarray([[[[1, 2, 3], [4, 5, 6]],
[[0, 0, 0], [0, 0, 0]]],
[[[7, 8, 9], [0, 1, 2]],
[[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2], [0, 0, 0]],
[[3, 4, 5], [0, 0, 0]]]], dtype=np.float32)))
# 3D with partial pad_to_lengths.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]]]
result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[None, 3])
self.assertTrue(np.array_equal(
result,
np.asarray([[[[1, 2, 3], [4, 5, 6], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0]]],
[[[7, 8, 9], [0, 1, 2], [0, 0, 0]],
[[3, 4, 5], [6, 7, 8], [0, 0, 0]]],
[[[0, 1, 2], [0, 0, 0], [0, 0, 0]],
[[3, 4, 5], [0, 0, 0], [0, 0, 0]]]], dtype=np.float32)))
def testStackPadValueError(self):
# 3D.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]],
[[[1, 2, 3, 4]]]]
# Not all tensors have the same shape along axis 2.
with self.assertRaises(ValueError):
utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
def testRecord(self):
my_record = utils.make_record('my_record', ['a', 'b', 'c'], {'b': 55})
inst = my_record(a=1, b=2, c=3)
self.assertEqual(1, inst.a)
self.assertEqual(2, inst.b)
self.assertEqual(3, inst.c)
self.assertEqual(1, inst[0])
self.assertEqual(2, inst[1])
self.assertEqual(3, inst[2])
self.assertEqual([1, 2, 3], list(iter(inst)))
self.assertEqual(3, len(inst))
inst.b = 999
self.assertEqual(999, inst.b)
self.assertEqual(999, inst[1])
inst2 = my_record(1, 999, 3)
self.assertTrue(inst == inst2)
inst2[1] = 3
self.assertFalse(inst == inst2)
inst3 = my_record(a=1, c=3)
inst.b = 55
self.assertEqual(inst, inst3)
def testRecordUnique(self):
record1 = utils.make_record('record1', ['a', 'b', 'c'])
record2 = utils.make_record('record2', ['a', 'b', 'c'])
self.assertNotEqual(record1(1, 2, 3), record2(1, 2, 3))
self.assertEqual(record1(1, 2, 3), record1(1, 2, 3))
def testTupleToRecord(self):
my_record = utils.make_record('my_record', ['a', 'b', 'c'])
inst = utils.tuple_to_record((5, 6, 7), my_record)
self.assertEqual(my_record(5, 6, 7), inst)
def testRecordErrors(self):
my_record = utils.make_record('my_record', ['a', 'b', 'c'], {'b': 10})
with self.assertRaises(ValueError):
my_record(c=5) # Did not provide required argument 'a'.
with self.assertRaises(ValueError):
my_record(1, 2, 3, 4) # Too many arguments.
def testRandomQueue(self):
np.random.seed(567890)
queue = utils.RandomQueue(5)
queue.push(5)
queue.push(6)
queue.push(7)
queue.push(8)
queue.push(9)
queue.push(10)
self.assertTrue(5 not in queue)
sample = queue.random_sample(1000)
self.assertEqual(1000, len(sample))
self.assertEqual([6, 7, 8, 9, 10], sorted(np.unique(sample).tolist()))
def testMaxUniquePriorityQueue(self):
queue = utils.MaxUniquePriorityQueue(5)
queue.push(1.0, 'string 1')
queue.push(-0.5, 'string 2')
queue.push(0.5, 'string 3')
self.assertEqual((-0.5, 'string 2', None), queue.pop())
queue.push(0.1, 'string 4')
queue.push(1.5, 'string 5')
queue.push(0.0, 'string 6')
queue.push(0.2, 'string 7')
self.assertEqual((1.5, 'string 5', None), queue.get_max())
self.assertEqual((0.1, 'string 4', None), queue.get_min())
self.assertEqual(
[('string 5', None), ('string 1', None), ('string 3', None),
('string 7', None), ('string 4', None)],
list(queue.iter_in_order()))
def testMaxUniquePriorityQueue_Duplicates(self):
queue = utils.MaxUniquePriorityQueue(5)
queue.push(0.0, 'string 1')
queue.push(0.0, 'string 2')
queue.push(0.0, 'string 3')
self.assertEqual((0.0, 'string 1', None), queue.pop())
self.assertEqual((0.0, 'string 2', None), queue.pop())
self.assertEqual((0.0, 'string 3', None), queue.pop())
self.assertEqual(0, len(queue))
queue.push(0.1, 'string 4')
queue.push(1.5, 'string 5')
queue.push(0.3, 'string 6')
queue.push(0.2, 'string 7')
queue.push(0.0, 'string 8')
queue.push(1.5, 'string 5')
queue.push(1.5, 'string 5')
self.assertEqual((1.5, 'string 5', None), queue.get_max())
self.assertEqual((0.0, 'string 8', None), queue.get_min())
self.assertEqual(
[('string 5', None), ('string 6', None), ('string 7', None),
('string 4', None), ('string 8', None)],
list(queue.iter_in_order()))
def testMaxUniquePriorityQueue_ExtraData(self):
queue = utils.MaxUniquePriorityQueue(5)
queue.push(1.0, 'string 1', [1, 2, 3])
queue.push(0.5, 'string 2', [4, 5, 6])
queue.push(0.5, 'string 3', [7, 8, 9])
queue.push(0.5, 'string 2', [10, 11, 12])
self.assertEqual((0.5, 'string 2', [4, 5, 6]), queue.pop())
self.assertEqual((0.5, 'string 3', [7, 8, 9]), queue.pop())
self.assertEqual((1.0, 'string 1', [1, 2, 3]), queue.pop())
self.assertEqual(0, len(queue))
queue.push(0.5, 'string 2', [10, 11, 12])
self.assertEqual((0.5, 'string 2', [10, 11, 12]), queue.pop())
def testRouletteWheel(self):
random.seed(12345678987654321)
r = utils.RouletteWheel()
self.assertTrue(r.is_empty())
with self.assertRaises(RuntimeError):
r.sample() # Cannot sample when empty.
self.assertEqual(0, r.total_weight)
self.assertEqual(True, r.add('a', 0.1))
self.assertFalse(r.is_empty())
self.assertEqual(0.1, r.total_weight)
self.assertEqual(True, r.add('b', 0.01))
self.assertEqual(0.11, r.total_weight)
self.assertEqual(True, r.add('c', 0.5))
self.assertEqual(True, r.add('d', 0.1))
self.assertEqual(True, r.add('e', 0.05))
self.assertEqual(True, r.add('f', 0.03))
self.assertEqual(True, r.add('g', 0.001))
self.assertEqual(0.791, r.total_weight)
self.assertFalse(r.is_empty())
# Check that sampling is correct.
obj, weight = r.sample()
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
for obj, weight in r.sample_many(100):
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
# Check that sampling distribution is correct.
n = 1000000
c = Counter(r.sample_many(n))
for obj, w in r:
estimated_w = c[(obj, w)] / float(n) * r.total_weight
self.assertTrue(
np.isclose(w, estimated_w, atol=1e-3),
'Expected %s, got %s, for object %s' % (w, estimated_w, obj))
def testRouletteWheel_AddMany(self):
random.seed(12345678987654321)
r = utils.RouletteWheel()
self.assertTrue(r.is_empty())
with self.assertRaises(RuntimeError):
r.sample() # Cannot sample when empty.
self.assertEqual(0, r.total_weight)
count = r.add_many(
['a', 'b', 'c', 'd', 'e', 'f', 'g'],
[0.1, 0.01, 0.5, 0.1, 0.05, 0.03, 0.001])
self.assertEqual(7, count)
self.assertFalse(r.is_empty())
self.assertEqual(0.791, r.total_weight)
# Adding no items is allowed.
count = r.add_many([], [])
self.assertEqual(0, count)
self.assertFalse(r.is_empty())
self.assertEqual(0.791, r.total_weight)
# Check that sampling is correct.
obj, weight = r.sample()
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
for obj, weight in r.sample_many(100):
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
# Check that sampling distribution is correct.
n = 1000000
c = Counter(r.sample_many(n))
for obj, w in r:
estimated_w = c[(obj, w)] / float(n) * r.total_weight
self.assertTrue(
np.isclose(w, estimated_w, atol=1e-3),
'Expected %s, got %s, for object %s' % (w, estimated_w, obj))
def testRouletteWheel_AddZeroWeights(self):
r = utils.RouletteWheel()
self.assertEqual(True, r.add('a', 0))
self.assertFalse(r.is_empty())
self.assertEqual(4, r.add_many(['b', 'c', 'd', 'e'], [0, 0.1, 0, 0]))
self.assertEqual(
[('a', 0.0), ('b', 0.0), ('c', 0.1), ('d', 0.0), ('e', 0.0)],
list(r))
def testRouletteWheel_UniqueMode(self):
random.seed(12345678987654321)
r = utils.RouletteWheel(unique_mode=True)
self.assertEqual(True, r.add([1, 2, 3], 1, 'a'))
self.assertEqual(True, r.add([4, 5], 0.5, 'b'))
self.assertEqual(False, r.add([1, 2, 3], 1.5, 'a'))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5)],
list(r))
self.assertEqual(1.5, r.total_weight)
self.assertEqual(
2,
r.add_many(
[[5, 6, 2, 3], [1, 2, 3], [8], [1, 2, 3]],
[0.1, 0.2, 0.1, 2.0],
['c', 'a', 'd', 'a']))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5), ([5, 6, 2, 3], 0.1), ([8], 0.1)],
list(r))
self.assertTrue(np.isclose(1.7, r.total_weight))
self.assertEqual(0, r.add_many([], [], [])) # Adding no items is allowed.
with self.assertRaises(ValueError):
# Key not given.
r.add([7, 8, 9], 2.0)
with self.assertRaises(ValueError):
# Keys not given.
r.add_many([[7, 8, 9], [10]], [2.0, 2.0])
self.assertEqual(True, r.has_key('a'))
self.assertEqual(True, r.has_key('b'))
self.assertEqual(False, r.has_key('z'))
self.assertEqual(1.0, r.get_weight('a'))
self.assertEqual(0.5, r.get_weight('b'))
r = utils.RouletteWheel(unique_mode=False)
self.assertEqual(True, r.add([1, 2, 3], 1))
self.assertEqual(True, r.add([4, 5], 0.5))
self.assertEqual(True, r.add([1, 2, 3], 1.5))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5), ([1, 2, 3], 1.5)],
list(r))
self.assertEqual(3, r.total_weight)
self.assertEqual(
4,
r.add_many(
[[5, 6, 2, 3], [1, 2, 3], [8], [1, 2, 3]],
[0.1, 0.2, 0.1, 0.2]))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5), ([1, 2, 3], 1.5),
([5, 6, 2, 3], 0.1), ([1, 2, 3], 0.2), ([8], 0.1), ([1, 2, 3], 0.2)],
list(r))
self.assertTrue(np.isclose(3.6, r.total_weight))
with self.assertRaises(ValueError):
# Key is given.
r.add([7, 8, 9], 2.0, 'a')
with self.assertRaises(ValueError):
# Keys are given.
r.add_many([[7, 8, 9], [10]], [2.0, 2.0], ['a', 'b'])
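  # incremental_save appends entries added since the last save to save_file;
  # constructing a new RouletteWheel with the same save_file restores them, as
  # checked below by rebuilding r2 after each pair of additions.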
def testRouletteWheel_IncrementalSave(self):
f = tempfile.NamedTemporaryFile()
r = utils.RouletteWheel(unique_mode=True, save_file=f.name)
entries = [
([1, 2, 3], 0.1, 'a'),
([4, 5], 0.2, 'b'),
([6], 0.3, 'c'),
([7, 8, 9, 10], 0.25, 'd'),
([-1, -2], 0.15, 'e'),
([-3, -4, -5], 0.5, 'f')]
self.assertTrue(r.is_empty())
for i in range(0, len(entries), 2):
r.add(*entries[i])
r.add(*entries[i + 1])
r.incremental_save()
r2 = utils.RouletteWheel(unique_mode=True, save_file=f.name)
self.assertEqual(i + 2, len(r2))
count = 0
for j, (obj, weight) in enumerate(r2):
self.assertEqual(entries[j][0], obj)
self.assertEqual(entries[j][1], weight)
self.assertEqual(weight, r2.get_weight(entries[j][2]))
count += 1
self.assertEqual(i + 2, count)
if __name__ == '__main__':
tf.test.main()
licenses(["notice"])
package(default_visibility = [
"//learning/brain/research/neural_coder:__subpackages__",
])
load("@subpar//:subpar.bzl", "par_binary")
par_binary(
name = "run",
srcs = ["run.py"],
deps = [
":defaults",
":ga_train",
":pg_train",
# absl dep :app
# absl dep /flags
# absl dep /logging
],
)
par_binary(
name = "tune",
srcs = ["tune.py"],
deps = [
":defaults",
":run",
# file dep
# absl dep :app
# absl dep /flags
# absl dep /logging
# numpy dep
# tensorflow dep
],
)
py_library(
name = "ga_train",
srcs = ["ga_train.py"],
deps = [
":data",
":defaults",
":ga_lib",
":results_lib",
# file dep
# absl dep /flags
# absl dep /logging
# numpy dep
# tensorflow dep
"//common:utils", # project
],
)
py_library(
name = "ga_lib",
srcs = ["ga_lib.py"],
deps = [
":misc",
# absl dep /flags
# absl dep /logging
# numpy dep
"//common:bf", # project
"//common:utils", # project
],
)
py_test(
name = "ga_train_test",
srcs = ["ga_train_test.py"],
deps = [
":defaults",
":run",
# absl dep /flags
# tensorflow dep
],
)
py_library(
name = "pg_train",
srcs = ["pg_train.py"],
deps = [
":data",
":defaults",
":pg_agent",
":results_lib",
# file dep
# absl dep /flags
# absl dep /logging
# tensorflow dep
# tensorflow internal dep # build_cleaner: keep
],
)
py_library(
name = "pg_agent",
srcs = ["pg_agent.py"],
deps = [
":misc",
# file dep
# absl dep /logging
# numpy dep
# tensorflow dep
"//common:rollout", # project
"//common:utils", # project
],
)
py_test(
name = "pg_agent_test",
srcs = ["pg_agent_test.py"],
deps = [
":data",
":defaults",
":misc",
":pg_agent",
":pg_train",
# absl dep /logging
# numpy dep
# tensorflow dep
"//common:utils", # project
],
)
py_library(
name = "defaults",
srcs = ["defaults.py"],
deps = [
# absl dep /logging
"//common:config_lib", # project
],
)
py_library(
name = "misc",
srcs = ["misc.py"],
)
py_library(
name = "data",
srcs = ["data.py"],
deps = [
":code_tasks",
# absl dep /logging
],
)
py_library(
name = "code_tasks",
srcs = ["code_tasks.py"],
deps = [
":misc",
":test_tasks",
# absl dep /logging
# numpy dep
"//common:bf", # project
"//common:reward", # project
],
)
py_test(
name = "code_tasks_test",
srcs = ["code_tasks_test.py"],
deps = [
":code_tasks",
":defaults",
# numpy dep
# tensorflow dep
],
)
py_library(
name = "test_tasks",
srcs = ["test_tasks.py"],
deps = [
":misc",
"//common:reward", # project
],
)
py_test(
name = "test_tasks_test",
srcs = ["test_tasks_test.py"],
deps = [
":misc",
":test_tasks",
# numpy dep
# tensorflow dep
],
)
py_test(
name = "pg_train_test",
size = "large",
srcs = ["pg_train_test.py"],
deps = [
":defaults",
":run",
# absl dep /logging
# tensorflow dep
],
)
py_library(
name = "results_lib",
srcs = ["results_lib.py"],
deps = [
# file dep
# tensorflow dep
],
)
py_test(
name = "results_lib_test",
srcs = ["results_lib_test.py"],
deps = [
":results_lib",
# tensorflow dep
],
)
par_binary(
name = "aggregate_experiment_results",
srcs = ["aggregate_experiment_results.py"],
deps = [
":misc",
":results_lib",
# file dep
# absl dep :app
# absl dep /flags
# numpy dep
# tensorflow dep
],
)
par_binary(
name = "aggregate_tuning_results",
srcs = ["aggregate_tuning_results.py"],
deps = [
# file dep
# absl dep :app
# absl dep /flags
# tensorflow dep
],
)
# Experiments for ICLR 2018 paper
[Code Synthesis with Priority Queue Training](https://openreview.net/forum?id=r1AoGNlC-).
Runs policy gradient (REINFORCE), priority queue training, a genetic algorithm,
and uniform random search.
Run all examples below out of your top-level repo directory, i.e. where your git
clone resides.
## Just tell me how to run something and see results
```bash
# These tasks are the fastest to learn. 'echo' and 'count-down' are very
# easy. run_eval_tasks.py will do most of the work to run all the jobs.
# Should take between 10 and 30 minutes.
# How many repetitions each experiment will run. In the paper, we use 25. Fewer
# reps mean faster experiments, but noisier results.
REPS=25
# Extra description in the job names for these experiments. Use this description
# to distinguish between multiple runs of the same experiment.
DESC="demo"
# The tasks to run.
TASKS="reverse echo-second-seq"
# The model types and max NPE.
EXPS=( pg-20M topk-20M ga-20M rand-20M )
# Where training data is saved. This is chosen by launch_training.sh. Custom
# implementations of launch_training.sh may use different locations.
MODELS_DIR="/tmp/models"
# Run run_eval_tasks.py for each experiment name in EXPS.
for exp in "${EXPS[@]}"
do
./single_task/run_eval_tasks.py \
--exp "$exp" --tasks $TASKS --desc "$DESC" --reps $REPS
done
# During training or after completion, run this to aggregate results into a
# table. This is also useful for seeing how much progress has been made.
# Make sure the arguments here match the settings used above.
# Note: This can take a few minutes because it reads from every experiment
# directory.
bazel run single_task:aggregate_experiment_results -- \
--models_dir="$MODELS_DIR" \
--max_npe="20M" \
--task_list="$TASKS" \
--model_types="[('pg', '$DESC'), ('topk', '$DESC'), ('ga', '$DESC'),
('rand', '$DESC')]" \
--csv_file="/tmp/results_table.csv"
```
## Reproduce tuning results in paper
```bash
bazel build -c opt single_task:tune.par
# PG and TopK Tuning.
MAX_NPE=5000000
CONFIG="
env=c(task_cycle=['reverse-tune','remove-tune']),
agent=c(
algorithm='pg',
grad_clip_threshold=50.0,param_init_factor=0.5,entropy_beta=0.05,lr=1e-5,
optimizer='rmsprop',ema_baseline_decay=0.99,topk_loss_hparam=0.0,topk=0,
replay_temperature=1.0,alpha=0.0,eos_token=False),
timestep_limit=50,batch_size=64"
./single_task/launch_tuning.sh \
--job_name="iclr_pg_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=1 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="pg" \
--stop_on_success=true
./single_task/launch_tuning.sh \
--job_name="iclr_pg_topk_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=1 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="pg-topk" \
--fixed_hparams="topk=10" \
--stop_on_success=true
./single_task/launch_tuning.sh \
--job_name="iclr_topk_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=1 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="topk" \
--fixed_hparams="topk=10" \
--stop_on_success=true
# GA Tuning.
CONFIG="
env=c(task_cycle=['reverse-tune','remove-char-tune']),
agent=c(algorithm='ga'),
timestep_limit=50"
./single_task/launch_tuning.sh \
--job_name="iclr_ga_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=25 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="ga" \
--stop_on_success=true
# Aggregate tuning results. Run after tuning jobs complete. Set MODELS_DIR to
# wherever your launch_tuning.sh implementation writes results.
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_pg_gridsearch.reverse-remove"
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_pg_topk_gridsearch.reverse-remove"
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_topk_gridsearch.reverse-remove"
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_ga_gridsearch.reverse-remove"
```
## Reproduce eval results in paper
```bash
DESC="v0" # Description for each experiment. "Version 0" is a good default.
EXPS=( pg-5M topk-5M ga-5M rand-5M pg-20M topk-20M ga-20M rand-20M )
for exp in "${EXPS[@]}"
do
./single_task/run_eval_tasks.py \
--exp "$exp" --iclr_tasks --desc "$DESC"
done
```
## Run single experiment
```bash
EXP="topk-20M" # Learning algorithm + max-NPE
TASK="reverse" # Coding task
DESC="v0" # Description for each experiment. "Version 0" is a good default.
./single_task/run_eval_tasks.py \
--exp "$EXP" --task "$TASK" --desc "$DESC"
```
## Fetch eval results into a table
```bash
# These arguments should match the settings you used to run the experiments.
MODELS_DIR="/tmp/models"
MAX_NPE="20M"
DESC="v0" # Same description used in the experiments.
# MODEL_TYPES specifies each model type and the description used in their
# experiments.
MODEL_TYPES="[('pg', '$DESC'), ('topk', '$DESC'),
('ga', '$DESC'), ('rand', '$DESC')]"
TASKS="" # Empty string will default to all ICLR tasks.
# To specify custom task list, give task names separated by spaces. Example:
# TASKS="reverse remove-char"
bazel run single_task:aggregate_experiment_results -- \
--models_dir="$MODELS_DIR" \
--max_npe="$MAX_NPE" \
--task_list="$TASKS" \
--model_types="$MODEL_TYPES" \
--csv_file="/tmp/results_table.csv"
```
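To post-process the table programmatically rather than in a spreadsheet, the CSV
written above can be read back with the standard library. This is a minimal
sketch, assuming the `/tmp/results_table.csv` path used above; the row layout
follows how `aggregate_experiment_results.py` builds the table:
```python
import csv

with open('/tmp/results_table.csv') as f:
    rows = list(csv.reader(f))

# rows[0]: max-NPE plus one header per model type; rows[1]: per-model column
# names (reps, success rate, avg NPE); rows[2:]: one row per task.
print(rows[0])
for row in rows[2:]:
    print(row[0], row[1:])
```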
## Reproduce shortest code examples in paper
```bash
# Maximum NPE is higher here. We only do 1 repetition, and the algorithm needs
# time to simplify its solution.
MODELS_DIR="/tmp/models"
NPE="500M"
DESC="short-code"
./single_task/run_eval_tasks.py \
--exp "simpl-$NPE" --desc "$DESC" --iclr_tasks --reps 1
# Aggregate best code strings. Run after training completes.
TASKS="" # Empty string. Will default to all ICLR tasks.
bazel run single_task:aggregate_experiment_results -- \
--models_dir="$MODELS_DIR" \
--max_npe="$NPE" \
--task_list="$TASKS" \
--model_types="[('topk', '$DESC')]" \
--data=code
```
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""This script crawls experiment directories for results and aggregates them.
Usage example:
MODELS_DIR="/tmp/models"
bazel run single_task:aggregate_experiment_results -- \
--models_dir="$MODELS_DIR" \
--max_npe="20M" \
--task_list="add echo" \
--model_types="[('topk', 'v0'), ('ga', 'v0')]" \
--csv_file=/tmp/results_table.csv
"""
import ast
from collections import namedtuple
import csv
import os
import re
import StringIO
import sys
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf
from single_task import misc # brain coder
from single_task import results_lib # brain coder
DEFAULT_MODELS = [('pg', 'v0'), ('topk', 'v0'), ('ga', 'v0'), ('rand', 'v0')]
DEFAULT_TASKS = [
'reverse', 'remove-char', 'count-char', 'add', 'bool-logic', 'print-hello',
'echo-twice', 'echo-thrice', 'copy-reverse', 'zero-cascade', 'cascade',
'shift-left', 'shift-right', 'riffle', 'unriffle', 'middle-char',
'remove-last', 'remove-last-two', 'echo-alternating', 'echo-half', 'length',
'echo-second-seq', 'echo-nth-seq', 'substring', 'divide-2', 'dedup']
FLAGS = flags.FLAGS
flags.DEFINE_string(
'models_dir', '',
'Absolute path where results folders are found.')
flags.DEFINE_string(
'exp_prefix', 'bf_rl_iclr',
'Prefix for all experiment folders.')
flags.DEFINE_string(
'max_npe', '5M',
'String representation of max NPE of the experiments.')
flags.DEFINE_spaceseplist(
'task_list', DEFAULT_TASKS,
'List of task names separated by spaces. If empty string, defaults to '
'`DEFAULT_TASKS`. These are the rows of the results table.')
flags.DEFINE_string(
'model_types', str(DEFAULT_MODELS),
'String representation of a python list of 2-tuples, each a model_type + '
'job description pair. Descriptions allow you to choose among different '
'runs of the same experiment. These are the columns of the results table.')
flags.DEFINE_string(
'csv_file', '/tmp/results_table.csv',
'Where to write results table. Format is CSV.')
flags.DEFINE_enum(
'data', 'success_rates', ['success_rates', 'code'],
'What type of data to aggregate.')
def make_csv_string(table):
"""Convert 2D list to CSV string."""
s = StringIO.StringIO()
writer = csv.writer(s)
writer.writerows(table)
value = s.getvalue()
s.close()
return value
def process_results(metrics):
"""Extract useful information from given metrics.
Args:
metrics: List of results dicts. These should have been written to disk by
training jobs.
Returns:
Dict mapping stats names to values.
Raises:
    ValueError: If max_npe or max_global_repetitions values are inconsistent
across dicts in the `metrics` list.
"""
count = len(metrics)
success_count = 0
total_npe = 0 # Counting NPE across all runs.
success_npe = 0 # Counting NPE in successful runs only.
max_npe = 0
max_repetitions = 0
for metric_dict in metrics:
if not max_npe:
max_npe = metric_dict['max_npe']
elif max_npe != metric_dict['max_npe']:
raise ValueError(
'Invalid experiment. Different reps have different max-NPE settings.')
if not max_repetitions:
max_repetitions = metric_dict['max_global_repetitions']
elif max_repetitions != metric_dict['max_global_repetitions']:
raise ValueError(
'Invalid experiment. Different reps have different num-repetition '
'settings.')
if metric_dict['found_solution']:
success_count += 1
success_npe += metric_dict['npe']
total_npe += metric_dict['npe']
stats = {}
stats['max_npe'] = max_npe
stats['max_repetitions'] = max_repetitions
stats['repetitions'] = count
stats['successes'] = success_count # successful reps
stats['failures'] = count - success_count # failed reps
stats['success_npe'] = success_npe
stats['total_npe'] = total_npe
if success_count:
# Only successful runs counted.
stats['avg_success_npe'] = stats['success_npe'] / float(success_count)
else:
stats['avg_success_npe'] = 0.0
if count:
stats['success_rate'] = success_count / float(count)
stats['avg_total_npe'] = stats['total_npe'] / float(count)
else:
stats['success_rate'] = 0.0
stats['avg_total_npe'] = 0.0
return stats
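# Illustrative example (hypothetical metric dicts, not real training output):
#   process_results([
#       {'max_npe': 20000000, 'max_global_repetitions': 25,
#        'found_solution': True, 'npe': 1200000},
#       {'max_npe': 20000000, 'max_global_repetitions': 25,
#        'found_solution': False, 'npe': 20000000}])
#   # -> success_rate 0.5, avg_success_npe 1200000.0, avg_total_npe 10600000.0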
ProcessedResults = namedtuple('ProcessedResults', ['metrics', 'processed'])
def get_results_for_experiment(
models_dir, task_name, model_type='pg', max_npe='5M', desc='v0',
name_prefix='bf_rl_paper', extra_desc=''):
"""Get and process results for a given experiment.
An experiment is a set of runs with the same hyperparameters and environment.
It is uniquely specified by a (task_name, model_type, max_npe) triple, as
well as an optional description.
We assume that each experiment has a folder with the same name as the job that
ran the experiment. The name is computed by
"%name_prefix%.%desc%-%max_npe%_%task_name%".
Args:
models_dir: Parent directory containing experiment folders.
task_name: String name of task (the coding env). See code_tasks.py or
run_eval_tasks.py
model_type: Name of the algorithm, such as 'pg', 'topk', 'ga', 'rand'.
max_npe: String SI unit representation of the maximum NPE threshold for the
experiment. For example, "5M" means 5 million.
desc: Description.
name_prefix: Prefix of job names. Normally leave this as default.
extra_desc: Optional extra description at the end of the job name.
Returns:
ProcessedResults namedtuple instance, containing
metrics: Raw dicts read from disk.
processed: Stats computed by `process_results`.
Raises:
ValueError: If max_npe in the metrics does not match NPE in the experiment
folder name.
"""
folder = name_prefix + '.{0}.{1}-{2}_{3}'.format(desc, model_type, max_npe,
task_name)
if extra_desc:
folder += '.' + extra_desc
results = results_lib.Results(os.path.join(models_dir, folder))
metrics, _ = results.read_all()
processed = process_results(metrics)
if (not np.isclose(processed['max_npe'], misc.si_to_int(max_npe))
and processed['repetitions']):
raise ValueError(
'Invalid experiment. Max-NPE setting does not match expected max-NPE '
'in experiment name.')
return ProcessedResults(metrics=metrics, processed=processed)
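# For example (hypothetical arguments), models_dir='/tmp/models',
# task_name='reverse', model_type='topk', max_npe='20M', desc='v0' with the
# default name_prefix reads results from
# /tmp/models/bf_rl_paper.v0.topk-20M_reverse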
BestCodeResults = namedtuple(
'BestCodeResults',
['code', 'reward', 'npe', 'folder', 'finished', 'error'])
class BestCodeResultError(object):
success = 0
no_solution_found = 1
experiment_does_not_exist = 2
def get_best_code_for_experiment(
    models_dir, task_name, model_type='pg', max_npe='5M', desc='v0',
name_prefix='bf_rl_paper', extra_desc=''):
"""Like `get_results_for_experiment`, but fetches the code solutions."""
folder = name_prefix + '.{0}.{1}-{2}_{3}'.format(desc, model_type, max_npe,
task_name)
if extra_desc:
folder += '.' + extra_desc
log_dir = os.path.join(models_dir, folder, 'logs')
search_regex = r'^solutions_([0-9])+\.txt$'
try:
all_children = tf.gfile.ListDirectory(log_dir)
except tf.errors.NotFoundError:
return BestCodeResults(
code=None, reward=0.0, npe=0, folder=folder, finished=False,
error=BestCodeResultError.experiment_does_not_exist)
solution_files = [
fname for fname in all_children if re.search(search_regex, fname)]
max_reward = 0.0
npe = 0
best_code = None
for fname in solution_files:
with tf.gfile.FastGFile(os.path.join(log_dir, fname), 'r') as reader:
results = [ast.literal_eval(entry) for entry in reader]
for res in results:
if res['reward'] > max_reward:
best_code = res['code']
max_reward = res['reward']
npe = res['npe']
error = (
BestCodeResultError.success if best_code
else BestCodeResultError.no_solution_found)
try:
# If there is a status.txt file, check if it contains the status of the job.
with tf.gfile.FastGFile(os.path.join(log_dir, 'status.txt'), 'r') as f:
# Job is done, so mark this experiment as finished.
finished = f.read().lower().strip() == 'done'
except tf.errors.NotFoundError:
# No status file has been written, so the experiment is not done. No need to
# report an error here, because we do not require that experiment jobs write
# out a status.txt file until they have finished.
finished = False
return BestCodeResults(
code=best_code, reward=max_reward, npe=npe, folder=folder,
finished=finished, error=error)
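# Note: the returned reward and npe describe the single best-rewarded program
# found across all solutions_*.txt files in the experiment's logs directory.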
def make_results_table(
models=None,
tasks=None,
max_npe='5M',
name_prefix='bf_rl_paper',
extra_desc='',
models_dir='/tmp'):
"""Creates a table of results: algorithm + version by tasks.
Args:
models: The table columns. A list of (algorithm, desc) tuples.
tasks: The table rows. List of task names.
max_npe: String SI unit representation of the maximum NPE threshold for the
experiment. For example, "5M" means 5 million. All entries in the table
share the same max-NPE.
name_prefix: Name prefix used in logging directory for the experiment.
extra_desc: Extra description added to name of logging directory for the
experiment.
models_dir: Parent directory containing all experiment folders.
Returns:
A 2D list holding the table cells.
"""
if models is None:
models = DEFAULT_MODELS
if tasks is None:
tasks = DEFAULT_TASKS
model_results = {}
for model_type, desc in models:
model_results[model_type] = {
tname: get_results_for_experiment(
models_dir, tname, model_type, max_npe, desc,
name_prefix=name_prefix, extra_desc=extra_desc
).processed
for tname in tasks}
def info(stats):
return [str(stats['repetitions']),
'%.2f' % stats['success_rate'],
str(int(stats['avg_total_npe']))]
rows = [['max NPE: ' + max_npe]
+ misc.flatten([['{0} ({1})'.format(m, d), '', '']
for m, d in models])]
rows.append(
[''] + misc.flatten([['reps', 'success rate', 'avg NPE']
for _ in models]))
for tname in tasks:
rows.append(
[tname]
+ misc.flatten([info(model_results[model][tname])
for model, _ in models]))
return rows
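# The returned table is laid out roughly as (illustrative values only):
#   ['max NPE: 20M', 'pg (v0)', '', '', 'topk (v0)', '', '', ...]
#   ['', 'reps', 'success rate', 'avg NPE', 'reps', 'success rate', 'avg NPE', ...]
#   ['reverse', '25', '0.84', '1234567', ...]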
def print_results_table(results_table):
"""Print human readable results table to stdout."""
print('')
print('=== Results Table ===')
print('Format: # reps [success rate, avg total NPE]')
def info_str(info_row):
# num_runs (success_rate, avg_total_npe)
if not info_row[0]:
return '0'
return '%s [%s, %s]' % (str(info_row[0]).ljust(2), info_row[1], info_row[2])
nc = len(results_table[0]) # num cols
out_table = [
[results_table[0][0]] + [results_table[0][i] for i in range(1, nc, 3)]]
for row in results_table[2:]:
out_table.append([row[0]] + [info_str(row[i:i+3]) for i in range(1, nc, 3)])
nc = len(out_table[0]) # num cols
col_widths = [max(len(row[col]) for row in out_table) for col in range(nc)]
table_string = ''
for row in out_table:
table_string += ''.join(
[row[c].ljust(col_widths[c] + 2) for c in range(nc)]) + '\n'
print(table_string)
def main(argv):
del argv # Unused.
name_prefix = FLAGS.exp_prefix
print('Experiments prefix: %s' % name_prefix)
model_types = ast.literal_eval(FLAGS.model_types)
if FLAGS.data == 'success_rates':
results_table = make_results_table(
models=model_types, tasks=FLAGS.task_list, max_npe=FLAGS.max_npe,
models_dir=FLAGS.models_dir,
name_prefix=name_prefix, extra_desc='')
with tf.gfile.FastGFile(FLAGS.csv_file, 'w') as f:
f.write(make_csv_string(results_table))
print_results_table(results_table)
else:
# Best code
print('* = experiment is still running')
print('')
print('=== Best Synthesized Code ===')
for model_type, desc in model_types:
print('%s (%s)' % (model_type, desc))
sys.stdout.flush()
for tname in FLAGS.task_list:
res = get_best_code_for_experiment(
FLAGS.models_dir, tname, model_type, FLAGS.max_npe, desc,
name_prefix=name_prefix, extra_desc='')
unfinished_mark = '' if res.finished else ' *'
tname += unfinished_mark
if res.error == BestCodeResultError.success:
print(' %s' % tname)
print(' %s' % res.code)
print(' R=%.6f, NPE=%s' % (res.reward, misc.int_to_si(res.npe)))
elif res.error == BestCodeResultError.experiment_does_not_exist:
print(' Experiment does not exist. Check arguments.')
print(' Experiment folder: %s' % res.folder)
break
else:
print(' %s' % tname)
print(' (none)')
sys.stdout.flush()
if __name__ == '__main__':
app.run(main)