Merge branch 'master' into patch-6

78ddf6eb · cclauss · GitHub · 50cb0365 · 1f34fcaf · 78ddf6eb
Unverified Commit 78ddf6eb authored Jan 26, 2018 by cclauss Committed by GitHub Jan 26, 2018
20 changed files
--- a/research/attention_ocr/python/testdata/fsns_train_22.png
+++ b/research/attention_ocr/python/testdata/fsns_train_22.png
--- a/research/attention_ocr/python/testdata/fsns_train_23.png
+++ b/research/attention_ocr/python/testdata/fsns_train_23.png
--- a/research/attention_ocr/python/testdata/fsns_train_24.png
+++ b/research/attention_ocr/python/testdata/fsns_train_24.png
--- a/research/attention_ocr/python/testdata/fsns_train_25.png
+++ b/research/attention_ocr/python/testdata/fsns_train_25.png
--- a/research/attention_ocr/python/testdata/fsns_train_26.png
+++ b/research/attention_ocr/python/testdata/fsns_train_26.png
--- a/research/attention_ocr/python/testdata/fsns_train_27.png
+++ b/research/attention_ocr/python/testdata/fsns_train_27.png
--- a/research/attention_ocr/python/testdata/fsns_train_28.png
+++ b/research/attention_ocr/python/testdata/fsns_train_28.png
--- a/research/attention_ocr/python/testdata/fsns_train_29.png
+++ b/research/attention_ocr/python/testdata/fsns_train_29.png
--- a/research/attention_ocr/python/testdata/fsns_train_30.png
+++ b/research/attention_ocr/python/testdata/fsns_train_30.png
--- a/research/attention_ocr/python/testdata/fsns_train_31.png
+++ b/research/attention_ocr/python/testdata/fsns_train_31.png
--- a/research/brain_coder/README.md
+++ b/research/brain_coder/README.md
+# Brain Coder
+*Authors: Daniel Abolafia, Mohammad Norouzi, Quoc Le*
+Brain coder is a code synthesis experimental environment. We provide code that reproduces the results from our recent paper [Neural Program Synthesis with Priority Queue Training](https://arxiv.org/abs/1801.03526). See single_task/README.md for details on how to build and reproduce those experiments.
+## Installation
+First install dependencies separately:
+* [bazel](https://docs.bazel.build/versions/master/install.html)
+* [TensorFlow](https://www.tensorflow.org/install/)
+* [scipy](https://www.scipy.org/install.html)
+* [absl-py](https://github.com/abseil/abseil-py)
+Note: even if you already have these dependencies installed, make sure they are
+up-to-date to avoid unnecessary debugging.
+## Building
+Use bazel from the top-level repo directory.
+For example:
+```bash
+bazel build single_task:run
+```
+View README.md files in subdirectories for more details.
--- a/research/brain_coder/WORKSPACE
+++ b/research/brain_coder/WORKSPACE
+git_repository(
+    name = "subpar",
+    remote = "https://github.com/google/subpar",
+    tag = "1.0.0",
+)
--- a/research/brain_coder/common/BUILD
+++ b/research/brain_coder/common/BUILD
+licenses(["notice"])
+package(default_visibility = [
+    "//:__subpackages__",
+])
+py_library(
+    name = "bf",
+    srcs = ["bf.py"],
+)
+py_test(
+    name = "bf_test",
+    srcs = ["bf_test.py"],
+    deps = [
+        ":bf",
+        # tensorflow dep
+    ],
+)
+py_library(
+    name = "config_lib",
+    srcs = ["config_lib.py"],
+)
+py_test(
+    name = "config_lib_test",
+    srcs = ["config_lib_test.py"],
+    deps = [
+        ":config_lib",
+        # tensorflow dep
+    ],
+)
+py_library(
+    name = "reward",
+    srcs = ["reward.py"],
+)
+py_test(
+    name = "reward_test",
+    srcs = ["reward_test.py"],
+    deps = [
+        ":reward",
+        # numpy dep
+        # tensorflow dep
+    ],
+)
+py_library(
+    name = "rollout",
+    srcs = ["rollout.py"],
+    deps = [
+        ":utils",
+        # numpy dep
+        # scipy dep
+    ],
+)
+py_test(
+    name = "rollout_test",
+    srcs = ["rollout_test.py"],
+    deps = [
+        ":rollout",
+        # numpy dep
+        # tensorflow dep
+    ],
+)
+py_library(
+    name = "schedules",
+    srcs = ["schedules.py"],
+    deps = [":config_lib"],
+)
+py_test(
+    name = "schedules_test",
+    srcs = ["schedules_test.py"],
+    deps = [
+        ":config_lib",
+        ":schedules",
+        # numpy dep
+        # tensorflow dep
+    ],
+)
+py_library(
+    name = "utils",
+    srcs = ["utils.py"],
+    deps = [
+        # file dep
+        # absl dep /logging
+        # numpy dep
+        # tensorflow dep
+    ],
+)
+py_test(
+    name = "utils_test",
+    srcs = ["utils_test.py"],
+    deps = [
+        ":utils",
+        # numpy dep
+        # tensorflow dep
+    ],
+)
--- a/research/brain_coder/common/bf.py
+++ b/research/brain_coder/common/bf.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""BrainF**k interpreter.
+Language info: https://en.wikipedia.org/wiki/Brainfuck
+Based on public implementation:
+https://github.com/pocmo/Python-Brainfuck/blob/master/brainfuck.py
+"""
+from collections import namedtuple
+import time
+# Result of a call to `evaluate`. `program_trace` is a list of
+# ExecutionSnapshot when `evaluate` runs with debug=True, otherwise None;
+# `memory` is only populated when output_memory=True.
+EvalResult = namedtuple(
+    'EvalResult', ['output', 'success', 'failure_reason', 'steps', 'time',
+                   'memory', 'program_trace'])
+# Interpreter state captured by `evaluate` in debug mode: code pointer and the
+# op under it, memory pointer and the value under it, a copy of the memory,
+# the next value the input stream would produce, and a copy of the output.
+ExecutionSnapshot = namedtuple(
+    'ExecutionSnapshot',
+    ['codeptr', 'codechar', 'memptr', 'memval', 'memory', 'next_input',
+     'output_buffer'])
+class Status(object):
+  """String constants reported in `EvalResult.failure_reason` by `evaluate`."""
+  SUCCESS = 'success'  # Execution was not cut short.
+  TIMEOUT = 'timeout'  # The wall-clock `timeout` was exceeded.
+  STEP_LIMIT = 'step-limit'  # `max_steps` execution steps were reached.
+  SYNTAX_ERROR = 'syntax-error'  # Unmatched braces and syntax was required.
+# BF op vocabulary: the list index of each op character is its integer id.
+INT_TO_CHAR = ['>', '<', '+', '-', '[', ']', '.', ',']
+CHARS = INT_TO_CHAR
+# Reverse lookup from op character to its integer id.
+CHAR_TO_INT = {char: index for index, char in enumerate(INT_TO_CHAR)}
+class LookAheadIterator(object):
+  """Iterator wrapper with an additional `peek` method.
+  Implements both the Python 2 (`next`) and Python 3 (`__next__`) iterator
+  protocols, so the builtin `next(it, default)` and `for` loops work on either
+  version.
+  """
+  def __init__(self, iterable):
+    """Wraps `iterable` and preloads its first element (if any)."""
+    self._it = iter(iterable)
+    self._current_element = None
+    self._done = False
+    self._preload_next()
+  def _preload_next(self):
+    """Pulls the next element from the wrapped iterator into the buffer."""
+    try:
+      # The builtin next() works on Python 2 and 3; `.next()` is Python 2 only.
+      self._current_element = next(self._it)
+    except StopIteration:
+      self._done = True
+  def __iter__(self):
+    """Returns self, as required by the iterator protocol."""
+    return self
+  def next(self):
+    """Returns the buffered element and advances.
+    Raises:
+      StopIteration: If the wrapped iterator is exhausted.
+    """
+    if self._done:
+      raise StopIteration
+    element = self._current_element
+    self._preload_next()
+    return element
+  # Python 3 spelling of the iterator protocol method.
+  __next__ = next
+  def peek(self, default_value=None):
+    """Returns the element `next` would return, without advancing.
+    Args:
+      default_value: Returned when the iterator is exhausted. If None (the
+          default), exhaustion raises StopIteration instead.
+    Returns:
+      The upcoming element, or `default_value` if there is none.
+    Raises:
+      StopIteration: If the iterator is exhausted and `default_value` is None.
+    """
+    if self._done:
+      if default_value is None:
+        raise StopIteration
+      return default_value
+    return self._current_element
+def buildbracemap(code):
+  """Compute the jump table for the braces in a BF program.
+  Args:
+    code: List or string of BF chars.
+  Returns:
+    bracemap: dict mapping the position of every brace in the code to its jump
+        destination: the position of the matching brace when there is one, or
+        the brace's own position when it is unmatched.
+    correct_syntax: True if all braces match. False if the code contains any
+        unmatched brace (a bracemap is still returned in that case).
+  """
+  jumps = {}
+  open_positions = []  # Stack of '[' positions still waiting for their ']'.
+  balanced = True
+  for index, char in enumerate(code):
+    if char == '[':
+      open_positions.append(index)
+    elif char == ']':
+      if open_positions:
+        partner = open_positions.pop()
+        jumps[partner] = index
+        jumps[index] = partner
+      else:
+        # Unmatched closing brace: map it to itself so no jump happens.
+        jumps[index] = index
+        balanced = False
+  # Anything left on the stack is an unmatched opening brace.
+  for index in open_positions:
+    jumps[index] = index
+    balanced = False
+  return jumps, balanced
+def evaluate(code, input_buffer=None, init_memory=None, base=256, timeout=1.0,
+             max_steps=None, require_correct_syntax=True, output_memory=False,
+             debug=False):
+  """Execute BF code.
+  Args:
+    code: String or list of BF characters. Any character not in CHARS will be
+        ignored (it still counts as one execution step).
+    input_buffer: A list of ints which will be used as the program's input
+        stream. Each read op "," will read an int from this list. 0's will be
+        read once the end of the list is reached, or if no input buffer is
+        given.
+    init_memory: A list of ints. Memory for first k positions will be
+        initialized to this list (where k = len(init_memory)). Memory positions
+        are initialized to 0 by default.
+    base: Integer base for the memory. When a memory value is incremented to
+        `base` it will overflow to 0. When a memory value is decremented to -1
+        it will underflow to `base` - 1.
+    timeout: Time limit for program execution in seconds. Set to None to
+        disable.
+    max_steps: Execution step limit. An execution step is the execution of one
+        operation (code character), even if that op has been executed before.
+        Execution exits when this many steps are reached. Set to None to
+        disable. Disabled by default.
+    require_correct_syntax: If True, unmatched braces will cause `evaluate` to
+        return without executing the code. The failure reason will be
+        `Status.SYNTAX_ERROR`. If False, unmatched braces are ignored
+        and execution will continue.
+    output_memory: If True, the state of the memory at the end of execution is
+        returned.
+    debug: If True, then a full program trace will be returned.
+  Returns:
+    EvalResult namedtuple containing
+      output: List of ints which were written out by the program with the "."
+          operation.
+      success: Boolean. Whether execution completed successfully.
+      failure_reason: One of the attributes of `Status`. Gives extra info
+          about why execution was not successful.
+      steps: Number of execution steps the program ran for.
+      time: Amount of time in seconds the program ran for.
+      memory: If `output_memory` is True, the list of memory cells allocated
+          during execution (memory grows as the pointer moves right).
+          Otherwise, None.
+      program_trace: If `debug` is True, a list of ExecutionSnapshot
+          namedtuples: one taken before each executed op, plus a final one
+          taken after execution stops. Otherwise, None.
+  """
+  input_iter = (
+      LookAheadIterator(input_buffer) if input_buffer is not None
+      else LookAheadIterator([]))
+  # Null memory value. This is the value of an empty memory. Also the value
+  # returned by the read operation when the input buffer is empty, or the
+  # end of the buffer is reached.
+  null_value = 0
+  code = list(code)
+  bracemap, correct_syntax = buildbracemap(code)  # code is left unmodified
+  if require_correct_syntax and not correct_syntax:
+    # Bail out before executing anything: zero steps, zero elapsed time.
+    return EvalResult([], False, Status.SYNTAX_ERROR, 0, 0.0,
+                      [] if output_memory else None, [] if debug else None)
+  output_buffer = []
+  codeptr, cellptr = 0, 0
+  cells = list(init_memory) if init_memory else [0]
+  program_trace = [] if debug else None
+  success = True
+  reason = Status.SUCCESS
+  start_time = time.time()
+  steps = 0
+  while codeptr < len(code):
+    command = code[codeptr]
+    if debug:
+      # Add step to program trace.
+      program_trace.append(ExecutionSnapshot(
+          codeptr=codeptr, codechar=command, memptr=cellptr,
+          memval=cells[cellptr], memory=list(cells),
+          next_input=input_iter.peek(null_value),
+          output_buffer=list(output_buffer)))
+    if command == '>':
+      cellptr += 1
+      # Memory grows on demand as the pointer moves past the last cell.
+      if cellptr == len(cells): cells.append(null_value)
+    if command == '<':
+      # The pointer is clamped at cell 0 instead of going negative.
+      cellptr = 0 if cellptr <= 0 else cellptr - 1
+    if command == '+':
+      cells[cellptr] = cells[cellptr] + 1 if cells[cellptr] < (base - 1) else 0
+    if command == '-':
+      cells[cellptr] = cells[cellptr] - 1 if cells[cellptr] > 0 else (base - 1)
+    # Brace jumps land on the partner brace; the `codeptr += 1` below then
+    # steps past it.
+    if command == '[' and cells[cellptr] == 0: codeptr = bracemap[codeptr]
+    if command == ']' and cells[cellptr] != 0: codeptr = bracemap[codeptr]
+    if command == '.': output_buffer.append(cells[cellptr])
+    if command == ',': cells[cellptr] = next(input_iter, null_value)
+    codeptr += 1
+    steps += 1
+    # Limits are checked after each executed step.
+    if timeout is not None and time.time() - start_time > timeout:
+      success = False
+      reason = Status.TIMEOUT
+      break
+    if max_steps is not None and steps >= max_steps:
+      success = False
+      reason = Status.STEP_LIMIT
+      break
+  if debug:
+    # Add step to program trace.
+    # This final snapshot records the state after execution stopped; codechar
+    # is '' when the code pointer has run off the end of the program.
+    command = code[codeptr] if codeptr < len(code) else ''
+    program_trace.append(ExecutionSnapshot(
+        codeptr=codeptr, codechar=command, memptr=cellptr,
+        memval=cells[cellptr], memory=list(cells),
+        next_input=input_iter.peek(null_value),
+        output_buffer=list(output_buffer)))
+  return EvalResult(
+      output=output_buffer,
+      success=success,
+      failure_reason=reason,
+      steps=steps,
+      time=time.time() - start_time,
+      memory=cells if output_memory else None,
+      program_trace=program_trace)
--- a/research/brain_coder/common/bf_test.py
+++ b/research/brain_coder/common/bf_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for common.bf."""
+import tensorflow as tf
+from common import bf  # brain coder
+class BfTest(tf.test.TestCase):
+  """Tests for the BF interpreter in common.bf."""
+  def assertCorrectOutput(self, target_output, eval_result):
+    """Asserts a successful run whose output equals `target_output`."""
+    self.assertEqual(target_output, eval_result.output)
+    self.assertTrue(eval_result.success)
+    self.assertEqual(bf.Status.SUCCESS, eval_result.failure_reason)
+  def testBasicOps(self):
+    """Checks arithmetic, pointer movement, reads, and a Hello World program."""
+    self.assertCorrectOutput(
+        [3, 1, 2],
+        bf.evaluate('+++.--.+.'))
+    # '<' at cell 0 leaves the pointer at cell 0.
+    self.assertCorrectOutput(
+        [1, 1, 2],
+        bf.evaluate('+.<.>++.'))
+    # Reading with no input buffer yields 0.
+    self.assertCorrectOutput(
+        [0],
+        bf.evaluate('+,.'))
+    self.assertCorrectOutput(
+        [ord(char) for char in 'Hello World!\n'],
+        bf.evaluate(
+            '>++++++++[-<+++++++++>]<.>>+>-[+]++>++>+++[>[->+++<<+++>]<<]>-----'
+            '.>->+++..+++.>-.<<+[>[+>+]>>]<--------------.>>.+++.------.-------'
+            '-.>+.>+.'))
+  def testBase(self):
+    """Checks that decrementing below 0 wraps around to `base` - 1."""
+    self.assertCorrectOutput(
+        [1, 4],
+        bf.evaluate('+.--.', base=5, input_buffer=[]))
+  def testInputBuffer(self):
+    """Checks that ',' consumes the input buffer in order."""
+    self.assertCorrectOutput(
+        [2, 3, 4],
+        bf.evaluate('>,[>,]<[.<]', input_buffer=[4, 3, 2]))
+  def testBadChars(self):
+    """Checks that non-BF characters in the code are ignored."""
+    self.assertCorrectOutput(
+        [2, 3, 4],
+        bf.evaluate('>,[>,]hello<world[.<]comments',
+                    input_buffer=[4, 3, 2]))
+  def testUnmatchedBraces(self):
+    """Checks both settings of `require_correct_syntax` on unbalanced code."""
+    self.assertCorrectOutput(
+        [3, 6, 1],
+        bf.evaluate('+++.]]]]>----.[[[[[>+.',
+                    input_buffer=[],
+                    base=10,
+                    require_correct_syntax=False))
+    eval_result = bf.evaluate(
+        '+++.]]]]>----.[[[[[>+.',
+        input_buffer=[],
+        base=10,
+        require_correct_syntax=True)
+    self.assertEqual([], eval_result.output)
+    self.assertFalse(eval_result.success)
+    self.assertEqual(bf.Status.SYNTAX_ERROR,
+                     eval_result.failure_reason)
+  def testTimeout(self):
+    """Checks that an infinite loop is cut off by `timeout`."""
+    er = bf.evaluate('+.[].', base=5, input_buffer=[], timeout=0.1)
+    self.assertEqual(
+        ([1], False, bf.Status.TIMEOUT),
+        (er.output, er.success, er.failure_reason))
+    # NOTE(review): wall-clock bounds; may be sensitive to machine load.
+    self.assertTrue(0.07 < er.time < 0.21)
+    er = bf.evaluate('+.[-].', base=5, input_buffer=[], timeout=0.1)
+    self.assertEqual(
+        ([1, 0], True, bf.Status.SUCCESS),
+        (er.output, er.success, er.failure_reason))
+    self.assertTrue(er.time < 0.15)
+  def testMaxSteps(self):
+    """Checks that an infinite loop is cut off by `max_steps`."""
+    er = bf.evaluate('+.[].', base=5, input_buffer=[], timeout=None,
+                     max_steps=100)
+    self.assertEqual(
+        ([1], False, bf.Status.STEP_LIMIT, 100),
+        (er.output, er.success, er.failure_reason, er.steps))
+    er = bf.evaluate('+.[-].', base=5, input_buffer=[], timeout=None,
+                     max_steps=100)
+    self.assertEqual(
+        ([1, 0], True, bf.Status.SUCCESS),
+        (er.output, er.success, er.failure_reason))
+    self.assertTrue(er.steps < 100)
+  def testOutputMemory(self):
+    """Checks that final memory is returned when `output_memory` is True."""
+    er = bf.evaluate('+>++>+++>++++.', base=256, input_buffer=[],
+                     output_memory=True)
+    self.assertEqual(
+        ([4], True, bf.Status.SUCCESS),
+        (er.output, er.success, er.failure_reason))
+    self.assertEqual([1, 2, 3, 4], er.memory)
+  def testProgramTrace(self):
+    """Checks the exact snapshot sequence produced with debug=True."""
+    es = bf.ExecutionSnapshot
+    er = bf.evaluate(',[.>,].', base=256, input_buffer=[2, 1], debug=True)
+    # One snapshot before each executed op, then a final one (codechar='')
+    # after the program has ended.
+    self.assertEqual(
+        [es(codeptr=0, codechar=',', memptr=0, memval=0, memory=[0],
+            next_input=2, output_buffer=[]),
+         es(codeptr=1, codechar='[', memptr=0, memval=2, memory=[2],
+            next_input=1, output_buffer=[]),
+         es(codeptr=2, codechar='.', memptr=0, memval=2, memory=[2],
+            next_input=1, output_buffer=[]),
+         es(codeptr=3, codechar='>', memptr=0, memval=2, memory=[2],
+            next_input=1, output_buffer=[2]),
+         es(codeptr=4, codechar=',', memptr=1, memval=0, memory=[2, 0],
+            next_input=1, output_buffer=[2]),
+         es(codeptr=5, codechar=']', memptr=1, memval=1, memory=[2, 1],
+            next_input=0, output_buffer=[2]),
+         es(codeptr=2, codechar='.', memptr=1, memval=1, memory=[2, 1],
+            next_input=0, output_buffer=[2]),
+         es(codeptr=3, codechar='>', memptr=1, memval=1, memory=[2, 1],
+            next_input=0, output_buffer=[2, 1]),
+         es(codeptr=4, codechar=',', memptr=2, memval=0, memory=[2, 1, 0],
+            next_input=0, output_buffer=[2, 1]),
+         es(codeptr=5, codechar=']', memptr=2, memval=0, memory=[2, 1, 0],
+            next_input=0, output_buffer=[2, 1]),
+         es(codeptr=6, codechar='.', memptr=2, memval=0, memory=[2, 1, 0],
+            next_input=0, output_buffer=[2, 1]),
+         es(codeptr=7, codechar='', memptr=2, memval=0, memory=[2, 1, 0],
+            next_input=0, output_buffer=[2, 1, 0])],
+        er.program_trace)
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/common/config_lib.py
+++ b/research/brain_coder/common/config_lib.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Objects for storing configuration and passing config into binaries.
+Config class stores settings and hyperparameters for models, data, and anything
+else that may be specific to a particular run.
+"""
+import ast
+import itertools
+from six.moves import xrange
+class Config(dict):
+  """Stores model configuration, hyperparameters, or dataset parameters.
+  A dict subclass whose items can also be read and written as attributes, so
+  `config.foo` is equivalent to `config['foo']`. Reading a missing attribute
+  raises KeyError (not AttributeError) because the lookup goes straight to the
+  dict.
+  """
+  def __getattr__(self, attr):
+    """Returns the item stored under key `attr`."""
+    return self[attr]
+  def __setattr__(self, attr, value):
+    """Stores `value` under key `attr`."""
+    self[attr] = value
+  def pretty_str(self, new_lines=True, indent=2, final_indent=0):
+    """Formats this config as a 'Config(key=value, ...)' string.
+    Nested Config values are formatted recursively; all other values use repr.
+    Args:
+      new_lines: If True, put each key on its own line. If False, everything
+          goes on one line and the indent arguments are ignored.
+      indent: Number of spaces placed before each key.
+      final_indent: Number of spaces placed before the closing parenthesis.
+    Returns:
+      The formatted string.
+    """
+    prefix = (' ' * indent) if new_lines else ''
+    final_prefix = (' ' * final_indent) if new_lines else ''
+    kv = ['%s%s=%s' % (prefix, k,
+                       (repr(v) if not isinstance(v, Config)
+                        else v.pretty_str(new_lines=new_lines,
+                                          indent=indent+2,
+                                          final_indent=indent)))
+          for k, v in self.items()]
+    if new_lines:
+      return 'Config(\n%s\n%s)' % (',\n'.join(kv), final_prefix)
+    else:
+      return 'Config(%s)' % ', '.join(kv)
+  def _update_iterator(self, *args, **kwargs):
+    """Convert mixed input into an iterator over (key, value) tuples.
+    Follows the dict.update call signature.
+    Args:
+      *args: (Optional) Pass a dict or iterable of (key, value) 2-tuples as
+          an unnamed argument. Only one unnamed argument allowed.
+      **kwargs: (Optional) Pass (key, value) pairs as named arguments, where the
+          argument name is the key and the argument value is the value.
+    Returns:
+      An iterator over (key, value) tuples given in the input. Pairs from the
+      unnamed argument come first, followed by the named arguments.
+    Raises:
+      TypeError: If more than one unnamed argument is given.
+    """
+    if len(args) > 1:
+      raise TypeError('Expected at most 1 unnamed arguments, got %d'
+                      % len(args))
+    obj = args[0] if args else dict()
+    if isinstance(obj, dict):
+      return itertools.chain(obj.items(), kwargs.items())
+    # Assume obj is an iterable of 2-tuples.
+    return itertools.chain(obj, kwargs.items())
+  def make_default(self, keys=None):
+    """Convert OneOf objects into their default configs.
+    Recursively calls into Config objects.
+    Args:
+      keys: Iterable of key names to check. If None, all keys in self will be
+          used.
+    """
+    if keys is None:
+      keys = self.keys()
+    for k in keys:
+      # Replace OneOf with its default value.
+      if isinstance(self[k], OneOf):
+        self[k] = self[k].default()
+      # Recursively call into all Config objects, even those that came from
+      # OneOf objects in the previous code line (for nested OneOf objects).
+      if isinstance(self[k], Config):
+        self[k].make_default()
+  def update(self, *args, **kwargs):
+    """Same as dict.update except nested Config objects are updated.
+    Keys that are not mentioned in the input and still hold OneOf objects are
+    replaced by their default configs.
+    Args:
+      *args: (Optional) Pass a dict or list of (key, value) 2-tuples as unnamed
+          argument.
+      **kwargs: (Optional) Pass (key, value) pairs as named arguments, where the
+          argument name is the key and the argument value is the value.
+    """
+    key_set = set(self.keys())
+    for k, v in self._update_iterator(*args, **kwargs):
+      if k in key_set:
+        key_set.remove(k)  # This key is updated so exclude from make_default.
+      if k in self and isinstance(self[k], Config) and isinstance(v, dict):
+        self[k].update(v)
+      elif k in self and isinstance(self[k], OneOf) and isinstance(v, dict):
+        # Replace OneOf with the chosen config.
+        self[k] = self[k].update(v)
+      else:
+        self[k] = v
+    self.make_default(key_set)
+  def strict_update(self, *args, **kwargs):
+    """Same as Config.update except keys and types are not allowed to change.
+    If a given key is not already in this instance, an exception is raised. If a
+    given value does not have the same type as the existing value for the same
+    key, an exception is raised. Use this method to catch config mistakes.
+    Args:
+      *args: (Optional) Pass a dict or list of (key, value) 2-tuples as unnamed
+          argument.
+      **kwargs: (Optional) Pass (key, value) pairs as named arguments, where the
+          argument name is the key and the argument value is the value.
+    Raises:
+      TypeError: If more than one unnamed argument is given.
+      TypeError: If new value type does not match existing type.
+      KeyError: If a given key is not already defined in this instance.
+    """
+    key_set = set(self.keys())
+    for k, v in self._update_iterator(*args, **kwargs):
+      if k in self:
+        # This key is updated so exclude from make_default. Use discard rather
+        # than remove: the same key may be given more than once (e.g. in both
+        # the unnamed argument and the named arguments), and the later value
+        # should simply win instead of raising a spurious KeyError.
+        key_set.discard(k)
+        if isinstance(self[k], Config):
+          if not isinstance(v, dict):
+            raise TypeError('dict required for Config value, got %s' % type(v))
+          self[k].strict_update(v)
+        elif isinstance(self[k], OneOf):
+          if not isinstance(v, dict):
+            raise TypeError('dict required for OneOf value, got %s' % type(v))
+          # Replace OneOf with the chosen config.
+          self[k] = self[k].strict_update(v)
+        else:
+          if not isinstance(v, type(self[k])):
+            raise TypeError('Expecting type %s for key %s, got type %s'
+                            % (type(self[k]), k, type(v)))
+          self[k] = v
+      else:
+        raise KeyError(
+            'Key %s does not exist. New key creation not allowed in '
+            'strict_update.' % k)
+    self.make_default(key_set)
+  @staticmethod
+  def from_str(config_str):
+    """Inverse of Config.__str__.
+    Args:
+      config_str: String holding a python dict literal. Nested dicts become
+          nested Config objects.
+    Returns:
+      Config object built from the parsed dict.
+    """
+    parsed = ast.literal_eval(config_str)
+    assert isinstance(parsed, dict)
+    def _make_config(dictionary):
+      for k, v in dictionary.items():
+        if isinstance(v, dict):
+          dictionary[k] = _make_config(v)
+      return Config(**dictionary)
+    return _make_config(parsed)
+  @staticmethod
+  def parse(key_val_string):
+    """Parse hyperparameter string into Config object.
+    Format is 'key=val,key=val,...'
+    Values can be any python literal, or another Config object encoded as
+    'c(key=val,key=val,...)'.
+    c(...) expressions can be arbitrarily nested.
+    Example:
+    'a=1,b=3e-5,c=[1,2,3],d="hello world",e={"a":1,"b":2},f=c(x=1,y=[10,20])'
+    Args:
+      key_val_string: The hyperparameter string. An empty or whitespace-only
+          string gives an empty Config.
+    Returns:
+      Config object parsed from the input string.
+    """
+    if not key_val_string.strip():
+      return Config()
+    def _pair_to_kv(pair):
+      # Split on the first '=' only; the value itself may contain '='.
+      split_index = pair.find('=')
+      key, val = pair[:split_index].strip(), pair[split_index+1:].strip()
+      if val.startswith('c(') and val.endswith(')'):
+        val = Config.parse(val[2:-1])
+      else:
+        val = ast.literal_eval(val)
+      return key, val
+    return Config(**dict([_pair_to_kv(pair)
+                          for pair in _comma_iterator(key_val_string)]))
+class OneOf(object):
+  """Stores branching config.
+  In some cases there may be options which each have their own set of config
+  params. For example, if specifying config for an environment, each environment
+  can have custom config options. OneOf is a way to organize branching config.
+  Usage example:
+  one_of = OneOf(
+      [Config(a=1, b=2),
+       Config(a=2, c='hello'),
+       Config(a=3, d=10, e=-10)],
+      a=1)
+  config = one_of.strict_update(Config(a=3, d=20))
+  config == {'a': 3, 'd': 20, 'e': -10}
+  """
+  def __init__(self, choices, **kwargs):
+    """Constructor.
+    Usage: OneOf([Config(...), Config(...), ...], attribute=default_value)
+    Args:
+      choices: An iterable of Config objects. When update/strict_update is
+          called on this OneOf, one of these Config will be selected.
+      **kwargs: Give exactly one config attribute to branch on. The value of
+          this attribute during update/strict_update will determine which
+          Config is used.
+    Raises:
+      ValueError: If kwargs does not contain exactly one entry. Should give one
+          named argument which is used as the attribute to condition on.
+      TypeError: If any element of `choices` is not a Config.
+      ValueError: If two choices share the same value for the branching key.
+      ValueError: If the default value does not match any of the choices.
+    """
+    if len(kwargs) != 1:
+      raise ValueError(
+          'Incorrect usage. Must give exactly one named argument. The argument '
+          'name is the config attribute to condition on, and the argument '
+          'value is the default choice. Got %d named arguments.' % len(kwargs))
+    # dict views cannot be indexed on Python 3, so pull the single item out
+    # through an iterator instead of `kwargs.items()[0]`.
+    key, default_value = next(iter(kwargs.items()))
+    self.key = key
+    self.default_value = default_value
+    # Materialize once: choices is traversed twice and its length is needed,
+    # which a one-shot iterable would not support.
+    choices = list(choices)
+    # Make sure each choice is a Config object.
+    for config in choices:
+      if not isinstance(config, Config):
+        raise TypeError('choices must be a list of Config objects. Got %s.'
+                        % type(config))
+    # Map value for key to the config with that value.
+    self.value_map = {config[key]: config for config in choices}
+    # Make sure there are no duplicate values.
+    if len(self.value_map) != len(choices):
+      raise ValueError('Multiple choices given for the same value of %s.' % key)
+    # Check that the default value is valid. This must happen before the
+    # lookup below, otherwise a bad default surfaces as a bare KeyError.
+    if self.default_value not in self.value_map:
+      raise ValueError(
+          'Default value is not an available choice. Got %s=%s. Choices are %s.'
+          % (key, self.default_value, list(self.value_map.keys())))
+    self.default_config = self.value_map[self.default_value]
+  def default(self):
+    """Returns the Config selected by the default value."""
+    return self.default_config
+  def update(self, other):
+    """Choose a config and update it.
+    If `other` is a Config, one of the config choices is selected and updated.
+    Otherwise `other` is returned.
+    Args:
+      other: Will update chosen config with this value by calling `update` on
+          the config.
+    Returns:
+      The chosen config after updating it, or `other` if no config could be
+      selected.
+    """
+    if not isinstance(other, Config):
+      return other
+    if self.key not in other or other[self.key] not in self.value_map:
+      return other
+    target = self.value_map[other[self.key]]
+    target.update(other)
+    return target
+  def strict_update(self, config):
+    """Choose a config and update it.
+    `config` must be a Config object. `config` must have the key used to select
+    among the config choices, and that key must have a value which one of the
+    config choices has.
+    Args:
+      config: A Config object. The chosen config will be updated by calling
+           `strict_update`.
+    Returns:
+      The chosen config after updating it.
+    Raises:
+      TypeError: If `config` is not a Config instance.
+      ValueError: If `config` does not have the branching key in its key set.
+      ValueError: If the value of the config's branching key is not one of the
+          valid choices.
+    """
+    if not isinstance(config, Config):
+      raise TypeError('Expecting Config instance, got %s.' % type(config))
+    if self.key not in config:
+      raise ValueError(
+          'Branching key %s required but not found in %s' % (self.key, config))
+    if config[self.key] not in self.value_map:
+      raise ValueError(
+          'Value %s for key %s is not a possible choice. Choices are %s.'
+          % (config[self.key], self.key, list(self.value_map.keys())))
+    target = self.value_map[config[self.key]]
+    target.strict_update(config)
+    return target
+def _next_comma(string, start_index):
+  """Locates the next comma that sits outside any bracketed group.
+  Scans from `start_index`, tracking nesting of (), [] and {} only.
+  Args:
+    string: The string to scan.
+    start_index: Position at which scanning starts.
+  Returns:
+    Index of the first comma found at nesting depth 0, or -1 if there is none.
+  """
+  depth = 0
+  position = start_index
+  end = len(string)
+  while position < end:
+    char = string[position]
+    if char in ('(', '[', '{'):
+      depth += 1
+    elif char in (')', ']', '}'):
+      depth -= 1
+    elif char == ',' and depth == 0:
+      return position
+    position += 1
+  return -1
+def _comma_iterator(string):
+  """Yields the pieces of `string` that lie between top-level commas.
+  Commas are located with `_next_comma`, so commas nested inside brackets do
+  not split a piece. Always yields at least one piece (possibly empty).
+  """
+  start = 0
+  comma = _next_comma(string, start)
+  while comma != -1:
+    yield string[start:comma]
+    start = comma + 1
+    comma = _next_comma(string, start)
+  # Remainder after the last top-level comma (the whole string if none).
+  yield string[start:]
--- a/research/brain_coder/common/config_lib_test.py
+++ b/research/brain_coder/common/config_lib_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for common.config_lib."""
+import tensorflow as tf
+from common import config_lib  # brain coder
class ConfigLibTest(tf.test.TestCase):
  """Unit tests for `config_lib.Config` and `config_lib.OneOf`."""

  def _make_one_of_config(self):
    """Builds the fixture shared by the non-nested OneOf tests.

    Returns:
      A fresh Config with a `data` entry built from a OneOf over three
      candidate Configs (constructed with `task=2`) and a `model` sub-config.
    """
    return config_lib.Config(
        data=config_lib.OneOf(
            [config_lib.Config(task=1, a='hello'),
             config_lib.Config(task=2, a='world', b='stuff'),
             config_lib.Config(task=3, c=1234)],
            task=2),
        model=config_lib.Config(stuff=1))

  def testConfig(self):
    """Checks attribute access, item access, assignment and length."""
    config = config_lib.Config(hello='world', foo='bar', num=123, f=56.7)
    self.assertEqual('world', config.hello)
    self.assertEqual('bar', config['foo'])
    config.hello = 'everyone'
    config['bar'] = 9000
    self.assertEqual('everyone', config['hello'])
    self.assertEqual(9000, config.bar)
    self.assertEqual(5, len(config))

  def testConfigUpdate(self):
    """Checks `update` with dicts, kwargs, nested values and type changes."""
    # Positional dict argument.
    config = config_lib.Config(a=1, b=2, c=3)
    config.update({'b': 10, 'd': 4})
    self.assertEqual({'a': 1, 'b': 10, 'c': 3, 'd': 4}, config)

    # Keyword arguments.
    config = config_lib.Config(a=1, b=2, c=3)
    config.update(b=10, d=4)
    self.assertEqual({'a': 1, 'b': 10, 'c': 3, 'd': 4}, config)

    # Positional dict combined with keyword arguments.
    config = config_lib.Config(a=1, b=2, c=3)
    config.update({'e': 5}, b=10, d=4)
    self.assertEqual({'a': 1, 'b': 10, 'c': 3, 'd': 4, 'e': 5}, config)

    # Nested dicts only touch the keys they name.
    config = config_lib.Config(
        a=1,
        b=2,
        x=config_lib.Config(
            l='a',
            y=config_lib.Config(m=1, n=2),
            z=config_lib.Config(
                q=config_lib.Config(a=10, b=20),
                r=config_lib.Config(s=1, t=2))))
    config.update(x={'y': {'m': 10}, 'z': {'r': {'s': 5}}})
    self.assertEqual(
        config_lib.Config(
            a=1, b=2,
            x=config_lib.Config(
                l='a',
                y=config_lib.Config(m=10, n=2),
                z=config_lib.Config(
                    q=config_lib.Config(a=10, b=20),
                    r=config_lib.Config(s=5, t=2)))),
        config)

    # Sub-configs gain new keys; the plain dict and list are expected to end
    # up equal to the values passed in.
    config = config_lib.Config(
        foo='bar',
        num=100,
        x=config_lib.Config(a=1, b=2, c=config_lib.Config(h=10, i=20, j=30)),
        y=config_lib.Config(qrs=5, tuv=10),
        d={'a': 1, 'b': 2},
        l=[1, 2, 3])
    config.update(
        config_lib.Config(
            foo='hat',
            num=50.5,
            x={'a': 5, 'z': -10},
            y=config_lib.Config(wxyz=-1)),
        d={'a': 10, 'c': 20},
        l=[3, 4, 5, 6])
    self.assertEqual(
        config_lib.Config(
            foo='hat',
            num=50.5,
            x=config_lib.Config(a=5, b=2, z=-10,
                                c=config_lib.Config(h=10, i=20, j=30)),
            y=config_lib.Config(qrs=5, tuv=10, wxyz=-1),
            d={'a': 10, 'c': 20},
            l=[3, 4, 5, 6]),
        config)
    self.assertIsInstance(config.x, config_lib.Config)
    self.assertIsInstance(config.x.c, config_lib.Config)
    self.assertIsInstance(config.y, config_lib.Config)

    # Non-strict update is allowed to change the type of existing values.
    config = config_lib.Config(
        foo='bar',
        num=100,
        x=config_lib.Config(a=1, b=2, c=config_lib.Config(h=10, i=20, j=30)),
        y=config_lib.Config(qrs=5, tuv=10),
        d={'a': 1, 'b': 2},
        l=[1, 2, 3])
    config.update(
        config_lib.Config(
            foo=1234,
            num='hello',
            x={'a': 5, 'z': -10, 'c': {'h': -5, 'k': 40}},
            y=[1, 2, 3, 4],
            d='stuff',
            l={'a': 1, 'b': 2}))
    self.assertEqual(
        config_lib.Config(
            foo=1234,
            num='hello',
            x=config_lib.Config(a=5, b=2, z=-10,
                                c=config_lib.Config(h=-5, i=20, j=30, k=40)),
            y=[1, 2, 3, 4],
            d='stuff',
            l={'a': 1, 'b': 2}),
        config)
    self.assertIsInstance(config.x, config_lib.Config)
    self.assertIsInstance(config.x.c, config_lib.Config)
    self.assertIsInstance(config.y, list)

  def testConfigStrictUpdate(self):
    """Checks `strict_update` calls that only touch existing keys."""
    config = config_lib.Config(a=1, b=2, c=3)
    config.strict_update({'b': 10, 'c': 20})
    self.assertEqual({'a': 1, 'b': 10, 'c': 20}, config)

    config = config_lib.Config(a=1, b=2, c=3)
    config.strict_update(b=10, c=20)
    self.assertEqual({'a': 1, 'b': 10, 'c': 20}, config)

    config = config_lib.Config(a=1, b=2, c=3, d=4)
    config.strict_update({'d': 100}, b=10, a=20)
    self.assertEqual({'a': 20, 'b': 10, 'c': 3, 'd': 100}, config)

    config = config_lib.Config(
        a=1,
        b=2,
        x=config_lib.Config(
            l='a',
            y=config_lib.Config(m=1, n=2),
            z=config_lib.Config(
                q=config_lib.Config(a=10, b=20),
                r=config_lib.Config(s=1, t=2))))
    config.strict_update(x={'y': {'m': 10}, 'z': {'r': {'s': 5}}})
    self.assertEqual(
        config_lib.Config(
            a=1, b=2,
            x=config_lib.Config(
                l='a',
                y=config_lib.Config(m=10, n=2),
                z=config_lib.Config(
                    q=config_lib.Config(a=10, b=20),
                    r=config_lib.Config(s=5, t=2)))),
        config)

    config = config_lib.Config(
        foo='bar',
        num=100,
        x=config_lib.Config(a=1, b=2, c=config_lib.Config(h=10, i=20, j=30)),
        y=config_lib.Config(qrs=5, tuv=10),
        d={'a': 1, 'b': 2},
        l=[1, 2, 3])
    config.strict_update(
        config_lib.Config(
            foo='hat',
            num=50,
            x={'a': 5, 'c': {'h': 100}},
            y=config_lib.Config(tuv=-1)),
        d={'a': 10, 'c': 20},
        l=[3, 4, 5, 6])
    self.assertEqual(
        config_lib.Config(
            foo='hat',
            num=50,
            x=config_lib.Config(a=5, b=2,
                                c=config_lib.Config(h=100, i=20, j=30)),
            y=config_lib.Config(qrs=5, tuv=-1),
            d={'a': 10, 'c': 20},
            l=[3, 4, 5, 6]),
        config)

  def testConfigStrictUpdateFail(self):
    """Checks the errors `strict_update` raises for bad keys and types."""
    config = config_lib.Config(a=1, b=2, c=3, x=config_lib.Config(a=1, b=2))
    # Keys that are not already present.
    with self.assertRaises(KeyError):
      config.strict_update({'b': 10, 'c': 20, 'd': 50})
    with self.assertRaises(KeyError):
      config.strict_update(b=10, d=50)
    with self.assertRaises(KeyError):
      config.strict_update(x={'c': 3})
    # Values whose type differs from the existing value.
    with self.assertRaises(TypeError):
      config.strict_update(a='string')
    with self.assertRaises(TypeError):
      config.strict_update(x={'a': 'string'})
    with self.assertRaises(TypeError):
      config.strict_update(x=[1, 2, 3])

  def testConfigFromStr(self):
    """Checks `Config.from_str` on a dict literal with a nested dict."""
    config = config_lib.Config.from_str("{'c': {'d': 5}, 'b': 2, 'a': 1}")
    self.assertEqual(
        {'c': {'d': 5}, 'b': 2, 'a': 1}, config)
    self.assertIsInstance(config, config_lib.Config)
    self.assertIsInstance(config.c, config_lib.Config)

  def testConfigParse(self):
    """Checks `Config.parse` values and the container types it produces."""
    config = config_lib.Config.parse(
        'hello="world",num=1234.5,lst=[10,20.5,True,"hi",("a","b","c")],'
        'dct={9:10,"stuff":"qwerty","subdict":{1:True,2:False}},'
        'subconfig=c(a=1,b=[1,2,[3,4]],c=c(f="f",g="g"))')
    self.assertEqual(
        {'hello': 'world', 'num': 1234.5,
         'lst': [10, 20.5, True, 'hi', ('a', 'b', 'c')],
         'dct': {9: 10, 'stuff': 'qwerty', 'subdict': {1: True, 2: False}},
         'subconfig': {'a': 1, 'b': [1, 2, [3, 4]], 'c': {'f': 'f', 'g': 'g'}}},
        config)
    # `c(...)` groups are expected to be Config objects; `{...}` literals are
    # expected not to be.
    self.assertIsInstance(config, config_lib.Config)
    self.assertIsInstance(config.subconfig, config_lib.Config)
    self.assertIsInstance(config.subconfig.c, config_lib.Config)
    self.assertNotIsInstance(config.dct, config_lib.Config)
    self.assertNotIsInstance(config.dct['subdict'], config_lib.Config)
    self.assertIsInstance(config.lst[4], tuple)

  def testConfigParseErrors(self):
    """Checks that malformed config strings raise SyntaxError."""
    with self.assertRaises(SyntaxError):
      config_lib.Config.parse('a=[1,2,b="hello"')
    with self.assertRaises(SyntaxError):
      config_lib.Config.parse('a=1,b=c(x="a",y="b"')
    with self.assertRaises(SyntaxError):
      config_lib.Config.parse('a=1,b=c(x="a")y="b"')
    with self.assertRaises(SyntaxError):
      config_lib.Config.parse('a=1,b=c(x="a"),y="b",')

  def testOneOf(self):
    """Checks the result of non-strict `update` on a OneOf entry."""
    config = self._make_one_of_config()
    config.update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=1,a="hi")'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=1, a='hi'),
            model=config_lib.Config(stuff=2)),
        config)

    config = self._make_one_of_config()
    config.update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=2,a="hi")'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=2, a='hi', b='stuff'),
            model=config_lib.Config(stuff=2)),
        config)

    config = self._make_one_of_config()
    config.update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=3)'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=3, c=1234),
            model=config_lib.Config(stuff=2)),
        config)

    # No `data` in the update.
    config = self._make_one_of_config()
    config.update(config_lib.Config.parse(
        'model=c(stuff=2)'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=2, a='world', b='stuff'),
            model=config_lib.Config(stuff=2)),
        config)

    # A `task` value that matches none of the candidates.
    config = self._make_one_of_config()
    config.update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=4,d=9999)'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=4, d=9999),
            model=config_lib.Config(stuff=2)),
        config)

    # A non-Config value for `data`.
    config = self._make_one_of_config()
    config.update(config_lib.Config.parse(
        'model=c(stuff=2),data=5'))
    self.assertEqual(
        config_lib.Config(
            data=5,
            model=config_lib.Config(stuff=2)),
        config)

  def testOneOfStrict(self):
    """Checks the result of `strict_update` on a OneOf entry."""
    config = self._make_one_of_config()
    config.strict_update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=1,a="hi")'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=1, a='hi'),
            model=config_lib.Config(stuff=2)),
        config)

    config = self._make_one_of_config()
    config.strict_update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=2,a="hi")'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=2, a='hi', b='stuff'),
            model=config_lib.Config(stuff=2)),
        config)

    config = self._make_one_of_config()
    config.strict_update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=3)'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=3, c=1234),
            model=config_lib.Config(stuff=2)),
        config)

    config = self._make_one_of_config()
    config.strict_update(config_lib.Config.parse(
        'model=c(stuff=2)'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(task=2, a='world', b='stuff'),
            model=config_lib.Config(stuff=2)),
        config)

  def testNestedOneOf(self):
    """Checks `update` and `strict_update` with a OneOf inside a OneOf."""
    def make_config():
      return config_lib.Config(
          data=config_lib.OneOf(
              [config_lib.Config(task=1, a='hello'),
               config_lib.Config(
                   task=2,
                   a=config_lib.OneOf(
                       [config_lib.Config(x=1, y=2),
                        config_lib.Config(x=-1, y=1000, z=4)],
                       x=1)),
               config_lib.Config(task=3, c=1234)],
              task=2),
          model=config_lib.Config(stuff=1))

    config = make_config()
    config.update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=2,a=c(x=-1,z=8))'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(
                task=2,
                a=config_lib.Config(x=-1, y=1000, z=8)),
            model=config_lib.Config(stuff=2)),
        config)

    config = make_config()
    config.strict_update(config_lib.Config.parse(
        'model=c(stuff=2),data=c(task=2,a=c(x=-1,z=8))'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(
                task=2,
                a=config_lib.Config(x=-1, y=1000, z=8)),
            model=config_lib.Config(stuff=2)),
        config)

    config = make_config()
    config.update(config_lib.Config.parse('model=c(stuff=2)'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(
                task=2,
                a=config_lib.Config(x=1, y=2)),
            model=config_lib.Config(stuff=2)),
        config)

    config = make_config()
    config.strict_update(config_lib.Config.parse('model=c(stuff=2)'))
    self.assertEqual(
        config_lib.Config(
            data=config_lib.Config(
                task=2,
                a=config_lib.Config(x=1, y=2)),
            model=config_lib.Config(stuff=2)),
        config)

  def testOneOfStrictErrors(self):
    """Checks the errors `strict_update` raises for bad OneOf updates."""
    # A list value for `data`.
    config = self._make_one_of_config()
    with self.assertRaises(TypeError):
      config.strict_update(config_lib.Config.parse(
          'model=c(stuff=2),data=[1,2,3]'))

    # A key (`d`) that the task=3 candidate does not have.
    config = self._make_one_of_config()
    with self.assertRaises(KeyError):
      config.strict_update(config_lib.Config.parse(
          'model=c(stuff=2),data=c(task=3,c=5678,d=9999)'))

    # A `task` value that matches none of the candidates.
    config = self._make_one_of_config()
    with self.assertRaises(ValueError):
      config.strict_update(config_lib.Config.parse(
          'model=c(stuff=2),data=c(task=4,d=9999)'))

    # An int value for `data`.
    config = self._make_one_of_config()
    with self.assertRaises(TypeError):
      config.strict_update(config_lib.Config.parse(
          'model=c(stuff=2),data=5'))
# When this file is executed as a script, hand control to tf.test.main().
if __name__ == '__main__':
  tf.test.main()
--- a/research/brain_coder/common/reward.py
+++ b/research/brain_coder/common/reward.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Reward functions, distance functions, and reward managers."""
+from abc import ABCMeta
+from abc import abstractmethod
+from math import log
+# All sequences here are assumed to be lists of ints bounded
+# between 0 and `base`-1 (inclusive).
+#################################
+### Scalar Distance Functions ###
+#################################
def abs_diff(a, b, base=0):
  """Returns the magnitude of the gap between two scalars.

  The result does not depend on argument order, so `a` and `b` may be swapped.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: Ignored. Accepted only so this function can be called with the same
        arguments as the other scalar diff functions.

  Returns:
    abs(a - b).
  """
  del base  # Unused.
  gap = a - b
  return abs(gap)
def mod_abs_diff(a, b, base):
  """Returns the circular distance between `a` and `b` modulo `base`.

  The distance is measured in whichever direction around the ring of integers
  modulo `base` is shorter. Example: mod_abs_diff(1, 99, 100) ==> 2, not 98.

  The result does not depend on argument order, so `a` and `b` may be swapped.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: The modulo base. A positive int.

  Returns:
    Shortest distance.
  """
  gap = abs(a - b)
  # Only gaps of at least `base` need to be wrapped back into range.
  wrapped = gap % base if gap >= base else gap
  # Going the other way around the ring takes `base - wrapped` steps.
  return min(wrapped, base - wrapped)
+###############################
+### List Distance Functions ###
+###############################
def absolute_distance(pred, target, base, scalar_diff_fn=abs_diff):
  """Computes an asymmetric, Hamming-like distance between two lists.

  Positions present in both lists contribute `scalar_diff_fn` of the two
  elements. Every position present in only one of the lists (missing from or
  extra in `pred`) contributes `base`, which is treated as the largest possible
  per-position distance.

  The result is 0 when `pred` and `target` are identical, and a positive
  integer otherwise.

  Args:
    pred: Prediction list. Distance from this list is computed.
    target: Target list. Distance to this list is computed.
    base: The integer base to use. For example, a list of chars would use base
        256.
    scalar_diff_fn: Element-wise distance function, called as
        scalar_diff_fn(pred_element, target_element, base).

  Returns:
    List distance between `pred` and `target`.
  """
  total = 0
  # zip stops at the shorter list, so this covers the shared positions only.
  for pred_t, target_t in zip(pred, target):
    total += scalar_diff_fn(pred_t, target_t, base)
  # Each unmatched position, on either side, costs the max distance.
  unmatched = abs(len(pred) - len(target))
  return total + unmatched * base
def log_absolute_distance(pred, target, base):
  """Computes an asymmetric list distance on a logarithmic scale.

  Like `absolute_distance`, this sums per-position distances, but each
  position's `mod_abs_diff` is passed through a log and rescaled so that a
  missing or extra position is worth exactly 1.0. The sum is then divided by
  the target length, giving a float.

  Because of the log, the distance changes little between values that are far
  apart and drops quickly towards 0 as values get close together.

  Args:
    pred: List of ints. Computes distance from this list to the target.
    target: List of ints. This is the "correct" list which the prediction list
        is trying to match.
    base: Integer base.

  Returns:
    Float distance normalized so that when `pred` is at most as long as `target`
    the distance is between 0.0 and 1.0. Distance grows unboundedly large
    as `pred` grows past `target` in length.
  """
  if not target and not pred:
    # Two empty lists are equal.
    return 0.0
  # An empty target is normalized as if it had length 1 to avoid dividing by 0.
  normalizer = float(len(target)) if target else 1.0

  # `mod_abs_diff` is at most base // 2. One more than that is reserved as the
  # penalty for a missing or extra position.
  penalty_dist = base // 2 + 1
  # Distances are shifted by 1 before the log to avoid log(0), so a position
  # contributes log(dist + 1) / log(penalty_dist + 1).
  log_scale = log(penalty_dist + 1)

  num_pred = len(pred)
  total = 0.0
  for pos, target_t in enumerate(target):
    if pos < num_pred:
      total += log(mod_abs_diff(pred[pos], target_t, base) + 1) / log_scale
    else:
      # Missing position: the penalty distance, which is 1.0 after scaling.
      total += 1.0

  surplus = num_pred - len(target)
  if surplus > 0:
    # Each extra position also costs 1.0 after scaling.
    total += surplus
  return total / normalizer
+########################
+### Reward Functions ###
+########################
+# Reward functions assign reward based on program output.
+# Warning: only use these functions as the terminal rewards in episodes, i.e.
+# for the "final" programs.
def absolute_distance_reward(pred, target, base, scalar_diff_fn=abs_diff):
  """Turns `absolute_distance` into a reward that peaks at 1.0.

  Equal lists receive the maximum reward of 1.0. The scale is chosen so that
  an empty `pred` receives 0.0 (when `target` is not empty), and the reward can
  become negative when `pred` is longer than `target`.

  The reward is asymmetric: swapping `pred` and `target` changes the result.

  Args:
    pred: Prediction sequence. This should be the sequence outputted by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.
    scalar_diff_fn: Element-wise distance function.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  # Distance of the empty prediction from `target`. Falls back to `base` when
  # that would be zero so the division below stays defined.
  scale = float(base * len(target)) or base
  distance = absolute_distance(
      pred, target, base, scalar_diff_fn=scalar_diff_fn)
  return (scale - distance) / scale
def absolute_mod_distance_reward(pred, target, base):
  """Computes `absolute_distance_reward` with `mod_abs_diff` per element.

  Args:
    pred: Prediction sequence. This should be the sequence outputted by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  return absolute_distance_reward(
      pred, target, base, scalar_diff_fn=mod_abs_diff)
def absolute_log_distance_reward(pred, target, base):
  """Turns `log_absolute_distance` into a reward that peaks at 1.0.

  Equal lists receive the maximum reward of 1.0. An empty `pred` receives 0.0
  (when `target` is not empty), and the reward can become negative when `pred`
  is longer than `target`.

  The reward is asymmetric: swapping `pred` and `target` changes the result.

  Compared to the reward built on `absolute_distance`, much more reward is
  given for getting the right value at a position than for merely having some
  value there. For example, in base 100 with pred = [1] * 1000 and
  target = [10] * 1000, `absolute_distance` already gives a lot of reward
  (every element-wise distance is 9 while the worst case is 50). The log
  distance instead gives larger and larger reward increments the closer each
  predicted value gets to the target, which makes the reward for accuracy
  somewhat independent of the base.

  Args:
    pred: Prediction sequence. This should be the sequence outputted by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  distance = log_absolute_distance(pred, target, base)
  return 1.0 - distance
+#######################
+### Reward Managers ###
+#######################
+# Reward managers assign reward to many code attempts throughout an episode.
# Deriving from a base class created by calling ABCMeta applies the metaclass
# on both Python 2 and Python 3. A `__metaclass__` class attribute is only
# honored by Python 2; on Python 3 it would leave @abstractmethod unenforced.
class RewardManager(ABCMeta('_RewardManagerBase', (object,), {})):
  """Reward managers administer reward across an episode.

  Reward managers are used for "editor" environments. These are environments
  where the agent has some way to edit its code over time, and run its code
  many times in the same episode, so that it can make incremental improvements.

  Reward managers are instantiated with a target sequence, which is the known
  correct program output. The manager is called on the output from a proposed
  code, and returns reward. If many proposal outputs are tried, reward may be
  some stateful function that takes previous tries into account. This is done,
  in part, so that an agent cannot accumulate unbounded reward just by trying
  junk programs as often as possible. So reward managers should not give the
  same reward twice if the next proposal is not better than the last.

  This class is abstract: subclasses must implement `__call__`.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    """Stores the target and the distance settings.

    Args:
      target: The known correct output sequence. Copied into a list.
      base: Integer base passed to `distance_fn`.
      distance_fn: List distance function, called by the subclasses in this
          file as distance_fn(sequence, target, base).
    """
    self._target = list(target)
    self._base = base
    self._distance_fn = distance_fn

  @abstractmethod
  def __call__(self, sequence):
    """Call this reward manager like a function to get reward.

    Calls to reward manager are stateful, and will take previous sequences
    into account. Repeated calls with the same sequence may produce different
    rewards.

    Args:
      sequence: List of integers (each between 0 and base - 1). This is the
          proposal sequence. Reward will be computed based on the distance
          from this sequence to the target (distance function and target are
          given in the constructor), as well as previous sequences tried during
          the lifetime of this object.

    Returns:
      Float value. The reward received from this call.
    """
    return 0.0
class DeltaRewardManager(RewardManager):
  """Reward manager that pays out the change in distance between calls.

  Each call measures the distance from the proposed sequence to the target
  with the configured (possibly asymmetric) list distance function, and
  rewards the improvement over the previous call's distance. The remembered
  distance starts at 0.

  For example, if the first call has distance 3.0, that is a change of -3 from
  the starting distance of 0, which gives a negative reward. If the next call
  has distance 2.0, that is a change of +1 and gives a positive reward. If the
  final call has distance 0 (the target is reached), that is a further change
  of +2. The rewards over the 3 calls then sum to 0, which is the highest
  possible episode total.

  Changes in distance are divided by `base`, so a change equal to the maximum
  element-wise distance is worth 1.0.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    """Initializes the manager with a remembered distance of 0."""
    super(DeltaRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0

  def _diff(self, seq):
    """Returns the distance from `seq` to the target."""
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    """Returns the scaled improvement over the last call and remembers `seq`.

    The reward is (last_diff - this_diff) / base. It is positive when `seq` is
    closer to the target than the previous sequence, and negative when it is
    further away.
    """
    current = self._diff(seq)
    improvement = self._last_diff - current
    reward = improvement / float(self._base)
    self._last_diff = current
    return reward

  def __call__(self, seq):
    """Returns the reward for proposing `seq`."""
    return self._delta_reward(seq)
class FloorRewardManager(RewardManager):
  """Reward manager that only pays out for new best distances.

  Each call measures the distance from the proposed sequence to the target
  with the configured (possibly asymmetric) list distance function. Reward is
  given only when that distance is lower than every distance seen so far in
  the episode; a regression to a higher distance gets no reward. The running
  minimum starts at the distance of the empty sequence.

  Improvements are divided by `base`, so an improvement equal to the maximum
  element-wise distance is worth 1.0. With the default distance function the
  maximum total episode reward is len(target).

  If the proposed sequence is longer than the target, a reward of -1 is given.
  Further over-long proposals get 0 reward. The -1 penalty is cancelled by a
  +1 bonus on the next proposal that is at most as long as the target.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    """Initializes the running minimum to the empty sequence's distance."""
    super(FloorRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0
    self._min_diff = self._max_diff()
    self._too_long_penality_given = False

  def _max_diff(self):
    """Returns the distance from the empty sequence to the target."""
    return self._distance_fn([], self._target, self._base)

  def _diff(self, seq):
    """Returns the distance from `seq` to the target."""
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    """Returns the scaled improvement of `seq` over the running minimum.

    The reward is (min_diff - this_diff) / base when `seq` sets a new minimum,
    and 0.0 otherwise, so it is never negative.
    """
    current = self._diff(seq)
    if not current < self._min_diff:
      return 0.0
    reward = (self._min_diff - current) / float(self._base)
    self._min_diff = current
    return reward

  def __call__(self, seq):
    """Returns the reward for proposing `seq`."""
    if len(seq) <= len(self._target):
      reward = self._delta_reward(seq)
      if self._too_long_penality_given:
        reward += 1.0  # Return the subtracted reward.
        self._too_long_penality_given = False
      return reward

    # Output is too long.
    if self._too_long_penality_given:
      return 0.0  # Don't give this penalty more than once.
    self._too_long_penality_given = True
    return -1.0
--- a/research/brain_coder/common/reward_test.py
+++ b/research/brain_coder/common/reward_test.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Tests for common.reward."""
+from math import log
+import numpy as np
+import tensorflow as tf
+from common import reward  # brain coder
+class RewardTest(tf.test.TestCase):
+  def testAbsDiff(self):
+    self.assertEqual(5, reward.abs_diff(15, 20))
+    self.assertEqual(5, reward.abs_diff(20, 15))
+  def testModAbsDiff(self):
+    self.assertEqual(5, reward.mod_abs_diff(15, 20, 25))
+    self.assertEqual(5, reward.mod_abs_diff(20, 15, 25))
+    self.assertEqual(2, reward.mod_abs_diff(1, 24, 25))
+    self.assertEqual(2, reward.mod_abs_diff(24, 1, 25))
+    self.assertEqual(0, reward.mod_abs_diff(0, 0, 5))
+    self.assertEqual(1, reward.mod_abs_diff(0, 1, 5))
+    self.assertEqual(2, reward.mod_abs_diff(0, 2, 5))
+    self.assertEqual(2, reward.mod_abs_diff(0, 3, 5))
+    self.assertEqual(1, reward.mod_abs_diff(0, 4, 5))
+    self.assertEqual(0, reward.mod_abs_diff(-1, 4, 5))
+    self.assertEqual(1, reward.mod_abs_diff(-5, 4, 5))
+    self.assertEqual(1, reward.mod_abs_diff(-7, 4, 5))
+    self.assertEqual(1, reward.mod_abs_diff(13, 4, 5))
+    self.assertEqual(1, reward.mod_abs_diff(15, 4, 5))
+  def testAbsoluteDistance_AbsDiffMethod(self):
+    self.assertEqual(
+        4,
+        reward.absolute_distance([0], [4], 5, scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        0,
+        reward.absolute_distance([4], [4], 5, scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        0,
+        reward.absolute_distance([], [], 5, scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([1], [], 5, scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([], [1], 5, scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        0,
+        reward.absolute_distance([1, 2, 3], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        1,
+        reward.absolute_distance([1, 2, 4], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        1,
+        reward.absolute_distance([1, 2, 2], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([1, 2], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([1, 2, 3, 4], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.abs_diff))
+    self.assertEqual(
+        6,
+        reward.absolute_distance([4, 4, 4], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.abs_diff))
+  def testAbsoluteDistance_ModDiffMethod(self):
+    self.assertEqual(
+        1,
+        reward.absolute_distance([0], [4], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        0,
+        reward.absolute_distance([4], [4], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        0,
+        reward.absolute_distance([], [], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([1], [], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([], [1], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        0,
+        reward.absolute_distance([1, 2, 3], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        1,
+        reward.absolute_distance([1, 2, 4], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        1,
+        reward.absolute_distance([1, 2, 2], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([1, 2], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([1, 2, 3, 4], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+    self.assertEqual(
+        5,
+        reward.absolute_distance([4, 4, 4], [1, 2, 3], 5,
+                                 scalar_diff_fn=reward.mod_abs_diff))
+  def testLogAbsoluteDistance(self):
+    def log_diff(diff, base):
+      return log(diff + 1) / log(base // 2 + 2)
+    self.assertEqual(
+        log_diff(1, 5),
+        reward.log_absolute_distance([0], [4], 5))
+    self.assertEqual(
+        log_diff(2, 5),
+        reward.log_absolute_distance([1], [4], 5))
+    self.assertEqual(
+        log_diff(2, 5),
+        reward.log_absolute_distance([2], [4], 5))
+    self.assertEqual(
+        log_diff(1, 5),
+        reward.log_absolute_distance([3], [4], 5))
+    self.assertEqual(
+        log_diff(3, 5),  # max_dist = base // 2 + 1 = 3
+        reward.log_absolute_distance([], [4], 5))
+    self.assertEqual(
+        0 + log_diff(3, 5),  # max_dist = base // 2 + 1 = 3
+        reward.log_absolute_distance([4, 4], [4], 5))
+    self.assertEqual(
+        0,
+        reward.log_absolute_distance([4], [4], 5))
+    self.assertEqual(
+        0,
+        reward.log_absolute_distance([], [], 5))
+    self.assertEqual(
+        1,
+        reward.log_absolute_distance([1], [], 5))
+    self.assertEqual(
+        1,
+        reward.log_absolute_distance([], [1], 5))
+    self.assertEqual(
+        0,
+        reward.log_absolute_distance([1, 2, 3], [1, 2, 3], 5))
+    self.assertEqual(
+        log_diff(1, 5) / 3,  # divided by target length.
+        reward.log_absolute_distance([1, 2, 4], [1, 2, 3], 5))
+    self.assertEqual(
+        log_diff(1, 5) / 3,
+        reward.log_absolute_distance([1, 2, 2], [1, 2, 3], 5))
+    self.assertEqual(
+        log_diff(3, 5) / 3,  # max_dist
+        reward.log_absolute_distance([1, 2], [1, 2, 3], 5))
+    self.assertEqual(
+        log_diff(3, 5) / 3,  # max_dist
+        reward.log_absolute_distance([1, 2, 3, 4], [1, 2, 3], 5))
+    # Add log differences for each position.
+    self.assertEqual(
+        (log_diff(2, 5) + log_diff(2, 5) + log_diff(1, 5)) / 3,
+        reward.log_absolute_distance([4, 4, 4], [1, 2, 3], 5))
+  def testAbsoluteDistanceReward(self):
+    self.assertEqual(
+        1,
+        reward.absolute_distance_reward([1, 2, 3], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - 1 / (5 * 3.),  # 1 - distance / (base * target_len)
+        reward.absolute_distance_reward([1, 2, 4], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - 1 / (5 * 3.),
+        reward.absolute_distance_reward([1, 2, 2], [1, 2, 3], 5))
+    self.assertTrue(np.isclose(
+        1 - 5 / (5 * 3.),
+        reward.absolute_distance_reward([1, 2], [1, 2, 3], 5)))
+    self.assertTrue(np.isclose(
+        1 - 5 / (5 * 3.),
+        reward.absolute_distance_reward([1, 2, 3, 4], [1, 2, 3], 5)))
+    # Add log differences for each position.
+    self.assertEqual(
+        1 - (3 + 2 + 1) / (5 * 3.),
+        reward.absolute_distance_reward([4, 4, 4], [1, 2, 3], 5))
+    self.assertEqual(
+        1,
+        reward.absolute_distance_reward([], [], 5))
+  def testAbsoluteModDistanceReward(self):
+    self.assertEqual(
+        1,
+        reward.absolute_mod_distance_reward([1, 2, 3], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - 1 / (5 * 3.),  # 1 - distance / (base * target_len)
+        reward.absolute_mod_distance_reward([1, 2, 4], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - 1 / (5 * 3.),
+        reward.absolute_mod_distance_reward([1, 2, 2], [1, 2, 3], 5))
+    self.assertTrue(np.isclose(
+        1 - 5 / (5 * 3.),
+        reward.absolute_mod_distance_reward([1, 2], [1, 2, 3], 5)))
+    self.assertTrue(np.isclose(
+        1 - 5 / (5 * 3.),
+        reward.absolute_mod_distance_reward([1, 2, 3, 4], [1, 2, 3], 5)))
+    # Add log differences for each position.
+    self.assertTrue(np.isclose(
+        1 - (2 + 2 + 1) / (5 * 3.),
+        reward.absolute_mod_distance_reward([4, 4, 4], [1, 2, 3], 5)))
+    self.assertTrue(np.isclose(
+        1 - (1 + 2 + 2) / (5 * 3.),
+        reward.absolute_mod_distance_reward([0, 1, 2], [4, 4, 4], 5)))
+    self.assertEqual(
+        1,
+        reward.absolute_mod_distance_reward([], [], 5))
+  def testAbsoluteLogDistanceReward(self):
+    def log_diff(diff, base):
+      return log(diff + 1) / log(base // 2 + 2)
+    self.assertEqual(
+        1,
+        reward.absolute_log_distance_reward([1, 2, 3], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - log_diff(1, 5) / 3,  # divided by target length.
+        reward.absolute_log_distance_reward([1, 2, 4], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - log_diff(1, 5) / 3,
+        reward.absolute_log_distance_reward([1, 2, 2], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - log_diff(3, 5) / 3,  # max_dist
+        reward.absolute_log_distance_reward([1, 2], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - log_diff(3, 5) / 3,  # max_dist
+        reward.absolute_log_distance_reward([1, 2, 3, 4], [1, 2, 3], 5))
+    # Add log differences for each position.
+    self.assertEqual(
+        1 - (log_diff(2, 5) + log_diff(2, 5) + log_diff(1, 5)) / 3,
+        reward.absolute_log_distance_reward([4, 4, 4], [1, 2, 3], 5))
+    self.assertEqual(
+        1 - (log_diff(1, 5) + log_diff(2, 5) + log_diff(2, 5)) / 3,
+        reward.absolute_log_distance_reward([0, 1, 2], [4, 4, 4], 5))
+    self.assertEqual(
+        1,
+        reward.absolute_log_distance_reward([], [], 5))
+  def testDeltaRewardManager(self):
+    reward_manager = reward.DeltaRewardManager(
+        [1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
+    self.assertEqual(-3, reward_manager([1]))
+    self.assertEqual(0, reward_manager([1]))
+    self.assertEqual(4 / 5., reward_manager([1, 3]))
+    self.assertEqual(-4 / 5, reward_manager([1]))
+    self.assertEqual(3, reward_manager([1, 2, 3, 4]))
+    self.assertEqual(-1, reward_manager([1, 2, 3]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4, 3]))
+    self.assertEqual(-1, reward_manager([1, 2, 3, 4, 3, 2]))
+    self.assertEqual(2, reward_manager([1, 2, 3, 4]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
+  def testFloorRewardMananger(self):
+    reward_manager = reward.FloorRewardManager(
+        [1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
+    self.assertEqual(1, reward_manager([1]))
+    self.assertEqual(0, reward_manager([1]))
+    self.assertEqual(4 / 5., reward_manager([1, 3]))
+    self.assertEqual(0, reward_manager([1]))
+    self.assertEqual(1 / 5., reward_manager([1, 2]))
+    self.assertEqual(0, reward_manager([0, 1]))
+    self.assertEqual(0, reward_manager([]))
+    self.assertEqual(0, reward_manager([1, 2]))
+    self.assertEqual(2, reward_manager([1, 2, 3, 4]))
+    self.assertEqual(0, reward_manager([1, 2, 3]))
+    self.assertEqual(-1, reward_manager([1, 2, 3, 4, 3]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4, 3, 2]))
+    self.assertEqual(1, reward_manager([1, 2, 3, 4]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4]))
+    reward_manager = reward.FloorRewardManager(
+        [1, 2, 3, 4], base=5, distance_fn=reward.absolute_distance)
+    self.assertEqual(1, reward_manager([1]))
+    self.assertEqual(-1, reward_manager([1, 0, 0, 0, 0, 0]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4, 0, 0]))
+    self.assertEqual(0, reward_manager([1, 2, 3, 4, 0]))
+    self.assertEqual(1, reward_manager([]))
+    self.assertEqual(0, reward_manager([]))
+    self.assertEqual(0, reward_manager([1]))
+    self.assertEqual(1, reward_manager([1, 2]))
+    self.assertEqual(-1, reward_manager([1, 2, 3, 4, 0, 0]))
+    self.assertEqual(0, reward_manager([1, 1, 1, 1, 1]))
+    self.assertEqual(1 + 2, reward_manager([1, 2, 3, 4]))
+# When executed as a script, hand control to the TensorFlow test runner.
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/brain_coder/common/rollout.py
+++ b/research/brain_coder/common/rollout.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""Utilities related to computing training batches from episode rollouts.
+Implementations here are based on code from Open AI:
+https://github.com/openai/universe-starter-agent/blob/master/a3c.py.
+"""
+from collections import namedtuple
+import numpy as np
+import scipy.signal
+from common import utils  # brain coder
+class Rollout(object):
+  """Holds a rollout for an episode.
+  A rollout is a record of the states observed in some environment and actions
+  taken by the agent to arrive at those states. Other information includes
+  rewards received after each action, values estimated for each state, whether
+  the rollout concluded the episide, and total reward received. Everything
+  should be given in time order.
+  At each time t, the agent sees state s_t, takes action a_t, and then receives
+  reward r_t. The agent may optionally estimate a state value V(s_t) for each
+  state.
+  For an episode of length T:
+  states = [s_0, ..., s_(T-1)]
+  actions = [a_0, ..., a_(T-1)]
+  rewards = [r_0, ..., r_(T-1)]
+  values = [V(s_0), ..., V(s_(T-1))]
+  Note that there is an extra state s_T observed after taking action a_(T-1),
+  but this is not included in the rollout.
+  Rollouts have an `terminated` attribute which is True when the rollout is
+  "finalized", i.e. it holds a full episode. terminated will be False when
+  time steps are still being added to it.
+  """
+  def __init__(self):
+    self.states = []
+    self.actions = []
+    self.rewards = []
+    self.values = []
+    self.total_reward = 0.0
+    self.terminated = False
+  def add(self, state, action, reward, value=0.0, terminated=False):
+    """Add the next timestep to this rollout.
+    Args:
+      state: The state observed at the start of this timestep.
+      action: The action taken after observing the given state.
+      reward: The reward received for taking the given action.
+      value: The value estimated for the given state.
+      terminated: Whether this timestep ends the episode.
+    Raises:
+      ValueError: If this.terminated is already True, meaning that the episode
+          has already ended.
+    """
+    if self.terminated:
+      raise ValueError(
+          'Trying to add timestep to an already terminal rollout.')
+    self.states += [state]
+    self.actions += [action]
+    self.rewards += [reward]
+    self.values += [value]
+    self.terminated = terminated
+    self.total_reward += reward
+  def add_many(self, states, actions, rewards, values=None, terminated=False):
+    """Add many timesteps to this rollout.
+    Arguments are the same as `add`, but are lists of equal size.
+    Args:
+      states: The states observed.
+      actions: The actions taken.
+      rewards: The rewards received.
+      values: The values estimated for the given states.
+      terminated: Whether this sequence ends the episode.
+    Raises:
+      ValueError: If the lengths of all the input lists are not equal.
+      ValueError: If this.terminated is already True, meaning that the episode
+          has already ended.
+    """
+    if len(states) != len(actions):
+      raise ValueError(
+          'Number of states and actions must be the same. Got %d states and '
+          '%d actions' % (len(states), len(actions)))
+    if len(states) != len(rewards):
+      raise ValueError(
+          'Number of states and rewards must be the same. Got %d states and '
+          '%d rewards' % (len(states), len(rewards)))
+    if values is not None and len(states) != len(values):
+      raise ValueError(
+          'Number of states and values must be the same. Got %d states and '
+          '%d values' % (len(states), len(values)))
+    if self.terminated:
+      raise ValueError(
+          'Trying to add timesteps to an already terminal rollout.')
+    self.states += states
+    self.actions += actions
+    self.rewards += rewards
+    self.values += values if values is not None else [0.0] * len(states)
+    self.terminated = terminated
+    self.total_reward += sum(rewards)
+  def extend(self, other):
+    """Append another rollout to this rollout."""
+    assert not self.terminated
+    self.states.extend(other.states)
+    self.actions.extend(other.actions)
+    self.rewards.extend(other.rewards)
+    self.values.extend(other.values)
+    self.terminated = other.terminated
+    self.total_reward += other.total_reward
+def discount(x, gamma):
+  """Returns discounted sums for each value in x, with discount factor gamma.
+  This can be used to compute the return (discounted sum of rewards) at each
+  timestep given a sequence of rewards. See the definitions for return and
+  REINFORCE in section 3 of https://arxiv.org/pdf/1602.01783.pdf.
+  Let g^k mean gamma ** k.
+  For list [x_0, ..., x_N], the following list of discounted sums is computed:
+  [x_0 + g^1 * x_1 + g^2 * x_2 + ... g^N * x_N,
+   x_1 + g^1 * x_2 + g^2 * x_3 + ... g^(N-1) * x_N,
+   x_2 + g^1 * x_3 + g^2 * x_4 + ... g^(N-2) * x_N,
+   ...,
+   x_(N-1) + g^1 * x_N,
+   x_N]
+  Args:
+    x: List of numbers [x_0, ..., x_N].
+    gamma: Float between 0 and 1 (inclusive). This is the discount factor.
+  Returns:
+    List of discounted sums.
+  """
+  return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
+def discounted_advantage_and_rewards(rewards, values, gamma, lambda_=1.0):
+  """Compute advantages and returns (discounted sum of rewards).
+  For an episode of length T, rewards = [r_0, ..., r_(T-1)].
+  Each reward r_t is observed after taking action a_t at state s_t. A final
+  state s_T is observed but no reward is given at this state since no action
+  a_T is taken (otherwise there would be a new state s_(T+1)).
+  `rewards` and `values` are for a single episode. Return R_t is the discounted
+  sum of future rewards starting at time t, where `gamma` is the discount
+  factor.
+  R_t = r_t + gamma * r_(t+1) + gamma**2 * r_(t+2) + ...
+        + gamma**(T-1-t) * r_(T-1)
+  Advantage A(a_t, s_t) is approximated by computing A(a_t, s_t) = R_t - V(s_t)
+  where V(s_t) is an approximation of the value at that state, given in the
+  `values` list. Returns R_t are needed for all REINFORCE algorithms. Advantage
+  is used for the advantage actor critic variant of REINFORCE.
+  See algorithm S3 in https://arxiv.org/pdf/1602.01783.pdf.
+  Additionally another parameter `lambda_` controls the bias-variance tradeoff.
+  See "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438.
+  lambda_ = 1 reduces to regular advantage.
+  0 <= lambda_ < 1 trades off variance for bias, with lambda_ = 0 being the
+  most biased.
+  Bootstrapping is also supported. If an episode does not end in a terminal
+  state (either because the episode was ended early, or the environment does not
+  have end states), the true return cannot be computed from the rewards alone.
+  However, it can be estimated by computing the value (an approximation of
+  return) of the last state s_T. Thus the `values` list will have an extra item:
+  values = [V(s_0), ..., V(s_(T-1)), V(s_T)].
+  Args:
+    rewards: List of observed rewards [r_0, ..., r_(T-1)].
+    values: List of estimated values [V(s_0), ..., V(s_(T-1))] with an optional
+        extra V(s_T) item.
+    gamma: Discount factor. Number between 0 and 1. 1 means no discount.
+        If not 1, gamma is typically near 1, like 0.99.
+    lambda_: Bias-variance tradeoff factor. Between 0 and 1.
+  Returns:
+    empirical_values: Returns at each timestep.
+    generalized_advantage: Avantages at each timestep.
+  Raises:
+    ValueError: If shapes of `rewards` and `values` are not rank 1.
+    ValueError: If len(values) not in (len(rewards), len(rewards) + 1).
+  """
+  rewards = np.asarray(rewards, dtype=np.float32)
+  values = np.asarray(values, dtype=np.float32)
+  if rewards.ndim != 1:
+    raise ValueError('Single episode only. rewards must be rank 1.')
+  if values.ndim != 1:
+    raise ValueError('Single episode only. values must be rank 1.')
+  if len(values) == len(rewards):
+    # No bootstrapping.
+    values = np.append(values, 0)
+    empirical_values = discount(rewards, gamma)
+  elif len(values) == len(rewards) + 1:
+    # With bootstrapping.
+    # Last value is for the terminal state (final state after last action was
+    # taken).
+    empirical_values = discount(np.append(rewards, values[-1]), gamma)[:-1]
+  else:
+    raise ValueError('values should contain the same number of items or one '
+                     'more item than rewards')
+  delta = rewards + gamma * values[1:] - values[:-1]
+  generalized_advantage = discount(delta, gamma * lambda_)
+  # empirical_values is the discounted sum of rewards into the future.
+  # generalized_advantage is the target for each policy update.
+  return empirical_values, generalized_advantage
+"""Batch holds a minibatch of episodes.
+Let bi = batch_index, i.e. the index of each episode in the minibatch.
+Let t = time.
+Attributes:
+  states: States for each timestep in each episode. Indexed by states[bi, t].
+  actions: Actions for each timestep in each episode. Indexed by actions[bi, t].
+  discounted_adv: Advantages (computed by discounted_advantage_and_rewards)
+      for each timestep in each episode. Indexed by discounted_adv[bi, t].
+  discounted_r: Returns (discounted sum of rewards computed by
+      discounted_advantage_and_rewards) for each timestep in each episode.
+      Indexed by discounted_r[bi, t].
+  total_rewards: Total reward for each episode, i.e. sum of rewards across all
+      timesteps (not discounted). Indexed by total_rewards[bi].
+  episode_lengths: Number of timesteps in each episode. If an episode has
+      N actions, N rewards, and N states, then its length is N. Indexed by
+      episode_lengths[bi].
+  batch_size: Number of episodes in this minibatch. An integer.
+  max_time: Maximum episode length in the batch. An integer.
+"""  # pylint: disable=pointless-string-statement
+Batch = namedtuple(
+    'Batch',
+    ['states', 'actions', 'discounted_adv', 'discounted_r', 'total_rewards',
+     'episode_lengths', 'batch_size', 'max_time'])
+def process_rollouts(rollouts, gamma, lambda_=1.0):
+  """Convert a batch of rollouts into tensors ready to be fed into a model.
+  Lists from each episode are stacked into 2D tensors and padded with 0s up to
+  the maximum timestep in the batch.
+  Args:
+    rollouts: A list of Rollout instances.
+    gamma: The discount factor. A number between 0 and 1 (inclusive). See gamma
+        argument in discounted_advantage_and_rewards.
+    lambda_: See lambda_ argument in discounted_advantage_and_rewards.
+  Returns:
+    Batch instance. states, actions, discounted_adv, and discounted_r are
+    numpy arrays with shape (batch_size, max_episode_length). episode_lengths
+    is a list of ints. total_rewards is a list of floats (total reward in each
+    episode). batch_size and max_time are ints.
+  Raises:
+    ValueError: If any of the rollouts are not terminal.
+  """
+  for ro in rollouts:
+    if not ro.terminated:
+      raise ValueError('Can only process terminal rollouts.')
+  episode_lengths = [len(ro.states) for ro in rollouts]
+  batch_size = len(rollouts)
+  max_time = max(episode_lengths)
+  states = utils.stack_pad([ro.states for ro in rollouts], 0, max_time)
+  actions = utils.stack_pad([ro.actions for ro in rollouts], 0, max_time)
+  discounted_rewards = [None] * batch_size
+  discounted_adv = [None] * batch_size
+  for i, ro in enumerate(rollouts):
+    disc_r, disc_adv = discounted_advantage_and_rewards(
+        ro.rewards, ro.values, gamma, lambda_)
+    discounted_rewards[i] = disc_r
+    discounted_adv[i] = disc_adv
+  discounted_rewards = utils.stack_pad(discounted_rewards, 0, max_time)
+  discounted_adv = utils.stack_pad(discounted_adv, 0, max_time)
+  total_rewards = [sum(ro.rewards) for ro in rollouts]
+  return Batch(states=states,
+               actions=actions,
+               discounted_adv=discounted_adv,
+               discounted_r=discounted_rewards,
+               total_rewards=total_rewards,
+               episode_lengths=episode_lengths,
+               batch_size=batch_size,
+               max_time=max_time)