"""Tests for common.rollout."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from common import rollout as rollout_lib # brain coder
class RolloutTest(tf.test.TestCase):
def MakeRollout(self, states, actions, rewards, values=None, terminated=True):
rollout = rollout_lib.Rollout()
rollout.add_many(
states=states, actions=actions, rewards=rewards, values=values,
terminated=terminated)
return rollout
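
  # rollout_lib.discount computes tail-discounted returns:
  #   out[i] = sum_{j >= i} gamma**(j - i) * r[j].
  # With r = [0, 1, 0, 0, 1] and gamma = 0.5, the reward at t=4 contributes
  # 0.5**(4 - i) to every out[i], and the reward at t=1 adds 0.5**(1 - i) for
  # i <= 1, which is what `discounted` in testDiscount encodes.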
def testDiscount(self):
discounted = np.array([1.0 / 2 ** n for n in range(4, -1, -1)])
discounted[:2] += [1.0 / 2 ** n for n in range(1, -1, -1)]
self.assertTrue(np.array_equal(
rollout_lib.discount([0.0, 1.0, 0.0, 0.0, 1.0], 0.50),
discounted))
self.assertTrue(np.array_equal(
rollout_lib.discount(np.array([0.0, 1.0, 0.0, 0.0, 1.0]), 0.50),
discounted))
def testDiscountedAdvantageAndRewards(self):
# lambda=1, No bootstrapping.
values = [0.1, 0.5, 0.5, 0.25]
(empirical_values,
generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
[0.0, 0.0, 0.0, 1.0],
values,
gamma=0.75,
lambda_=1.0)
expected_discounted_r = (
np.array([1.0 * 0.75 ** n for n in range(3, -1, -1)]))
expected_adv = expected_discounted_r - values
self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
self.assertTrue(np.allclose(generalized_advantage, expected_adv))
# lambda=1, With bootstrapping.
values = [0.1, 0.5, 0.5, 0.25, 0.75]
(empirical_values,
generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
[0.0, 0.0, 0.0, 1.0],
values,
gamma=0.75,
lambda_=1.0)
expected_discounted_r = (
np.array([0.75 * 0.75 ** n for n in range(4, 0, -1)])
+ np.array([1.0 * 0.75 ** n for n in range(3, -1, -1)]))
expected_adv = expected_discounted_r - values[:-1]
self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
self.assertTrue(np.allclose(generalized_advantage, expected_adv))
# lambda=0.5, With bootstrapping.
values = [0.1, 0.5, 0.5, 0.25, 0.75]
rewards = [0.0, 0.0, 0.0, 1.0]
l = 0.5 # lambda
g = 0.75 # gamma
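    # The expected advantage follows the generalized advantage estimation
    # (GAE) recursion of Schulman et al. (2015):
    #   delta_t = r_t + gamma * V_{t+1} - V_t
    #   A_t = delta_t + (gamma * lambda) * A_{t+1},
    # computed backwards in time by the loop over `expected_adv` below.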
(empirical_values,
generalized_advantage) = rollout_lib.discounted_advantage_and_rewards(
rewards,
values,
gamma=g,
lambda_=l)
expected_discounted_r = (
np.array([0.75 * g ** n for n in range(4, 0, -1)])
+ np.array([1.0 * g ** n for n in range(3, -1, -1)]))
expected_adv = [0.0] * len(values)
for t in range(3, -1, -1):
delta_t = rewards[t] + g * values[t + 1] - values[t]
expected_adv[t] = delta_t + g * l * expected_adv[t + 1]
expected_adv = expected_adv[:-1]
self.assertTrue(np.array_equal(empirical_values, expected_discounted_r))
self.assertTrue(np.allclose(generalized_advantage, expected_adv))
def testProcessRollouts(self):
g = 0.95
rollouts = [
self.MakeRollout(
states=[3, 6, 9],
actions=[1, 2, 3],
rewards=[1.0, -1.0, 0.5],
values=[0.5, 0.5, 0.1]),
self.MakeRollout(
states=[10],
actions=[5],
rewards=[1.0],
values=[0.5])]
batch = rollout_lib.process_rollouts(rollouts, gamma=g)
self.assertEqual(2, batch.batch_size)
self.assertEqual(3, batch.max_time)
self.assertEqual([3, 1], batch.episode_lengths)
self.assertEqual([0.5, 1.0], batch.total_rewards)
self.assertEqual(
[[3, 6, 9], [10, 0, 0]],
batch.states.tolist())
self.assertEqual(
[[1, 2, 3], [5, 0, 0]],
batch.actions.tolist())
rew1, rew2 = rollouts[0].rewards, rollouts[1].rewards
expected_discounted_rewards = [
[rew1[0] + g * rew1[1] + g * g * rew1[2],
rew1[1] + g * rew1[2],
rew1[2]],
[rew2[0], 0.0, 0.0]]
expected_advantages = [
[dr - v
for dr, v
in zip(expected_discounted_rewards[0], rollouts[0].values)],
[expected_discounted_rewards[1][0] - rollouts[1].values[0], 0.0, 0.0]]
self.assertTrue(
np.allclose(expected_discounted_rewards, batch.discounted_r))
self.assertTrue(
np.allclose(expected_advantages, batch.discounted_adv))
if __name__ == '__main__':
tf.test.main()
"""Schedule functions for controlling hparams over time."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from abc import ABCMeta
from abc import abstractmethod
import math

import six

from common import config_lib  # brain coder
@six.add_metaclass(ABCMeta)
class Schedule(object):
"""Schedule is a function which sets a hyperparameter's value over time.
  For example, a schedule can be used to decay an hparam, or oscillate it over
  time.
This object is constructed with an instance of config_lib.Config (will be
specific to each class implementation). For example if this is a decay
schedule, the config may specify the rate of decay and decay start time. Then
the object instance is called like a function, mapping global step (an integer
counting how many calls to the train op have been made) to the hparam value.
Properties of a schedule function f(t):
0) Domain of t is the non-negative integers (t may be 0).
1) Range of f is the reals.
2) Schedule functions can assume that they will be called in time order. This
allows schedules to be stateful.
  3) Schedule functions should be deterministic. Two schedule instances with
     the same config must always give the same value for each t, regardless of
     which t values they were previously called on. Users may call f(t) with
     arbitrary (non-negative) jumps in t. Essentially, multiple schedule
     instances used in replica training will behave the same.
  4) Successive duplicate calls with the same t are allowed.
"""
@abstractmethod
def __init__(self, config):
"""Construct this schedule with a config specific to each class impl.
Args:
config: An instance of config_lib.Config.
"""
pass
@abstractmethod
def __call__(self, global_step):
"""Map `global_step` to a value.
`global_step` is an integer counting how many calls to the train op have
been made across all replicas (hence why it is global). Implementations
may assume calls to be made in time order, i.e. `global_step` now >=
previous `global_step` values.
Args:
global_step: Non-negative integer.
Returns:
Hparam value at this step. A number.
"""
pass
class ConstSchedule(Schedule):
"""Constant function.
config:
const: Constant value at every step.
f(t) = const.
"""
def __init__(self, config):
super(ConstSchedule, self).__init__(config)
self.const = config.const
def __call__(self, global_step):
return self.const
class LinearDecaySchedule(Schedule):
"""Linear decay function.
config:
initial: Decay starts from this value.
final: Decay ends at this value.
start_time: Step when decay starts. Constant before it.
end_time: When decay ends. Constant after it.
f(t) is a linear function when start_time <= t <= end_time, with slope of
(final - initial) / (end_time - start_time). f(t) = initial
when t <= start_time. f(t) = final when t >= end_time.
If start_time == end_time, this becomes a step function.
"""
def __init__(self, config):
super(LinearDecaySchedule, self).__init__(config)
self.initial = config.initial
self.final = config.final
self.start_time = config.start_time
self.end_time = config.end_time
    if self.end_time < self.start_time:
      raise ValueError('end_time must not come before start_time.')
# Linear interpolation.
self._time_diff = float(self.end_time - self.start_time)
self._diff = float(self.final - self.initial)
self._slope = (
self._diff / self._time_diff if self._time_diff > 0 else float('inf'))
def __call__(self, global_step):
if global_step <= self.start_time:
return self.initial
if global_step > self.end_time:
return self.final
return self.initial + (global_step - self.start_time) * self._slope
class ExponentialDecaySchedule(Schedule):
"""Exponential decay function.
See https://en.wikipedia.org/wiki/Exponential_decay.
  Use this decay function to decay over orders of magnitude, e.g. to decay a
  learning rate from 1e-2 to 1e-6. Exponential decay decays the exponent
  linearly.
config:
initial: Decay starts from this value.
final: Decay ends at this value.
start_time: Step when decay starts. Constant before it.
end_time: When decay ends. Constant after it.
f(t) is an exponential decay function when start_time <= t <= end_time. The
decay rate and amplitude are chosen so that f(t) = initial when
t = start_time, and f(t) = final when t = end_time. f(t) is constant for
t < start_time or t > end_time. initial and final must be positive values.
If start_time == end_time, this becomes a step function.
"""
def __init__(self, config):
super(ExponentialDecaySchedule, self).__init__(config)
self.initial = config.initial
self.final = config.final
self.start_time = config.start_time
self.end_time = config.end_time
if self.initial <= 0 or self.final <= 0:
raise ValueError('initial and final must be positive numbers.')
# Linear interpolation in log space.
self._linear_fn = LinearDecaySchedule(
config_lib.Config(
initial=math.log(self.initial),
final=math.log(self.final),
start_time=self.start_time,
end_time=self.end_time))
def __call__(self, global_step):
return math.exp(self._linear_fn(global_step))
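
# Example (a sketch with hypothetical numbers): decay from 1e-2 to 1e-6
# between steps 0 and 100. Linear interpolation in log space means the
# midpoint value is the geometric mean of the endpoints:
#   f = ExponentialDecaySchedule(config_lib.Config(
#       initial=1e-2, final=1e-6, start_time=0, end_time=100))
#   f(50)  # == sqrt(1e-2 * 1e-6) == 1e-4, up to float rounding.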
class SmootherstepDecaySchedule(Schedule):
"""Smootherstep decay function.
  A sigmoid-like transition from the initial to the final value; a smoother
  transition than the linear and exponential decays, hence the name.
See https://en.wikipedia.org/wiki/Smoothstep.
config:
initial: Decay starts from this value.
final: Decay ends at this value.
start_time: Step when decay starts. Constant before it.
end_time: When decay ends. Constant after it.
f(t) is fully defined here:
https://en.wikipedia.org/wiki/Smoothstep#Variations.
  f(t) is smooth, i.e. its first derivative exists everywhere.
"""
def __init__(self, config):
super(SmootherstepDecaySchedule, self).__init__(config)
self.initial = config.initial
self.final = config.final
self.start_time = config.start_time
self.end_time = config.end_time
    if self.end_time < self.start_time:
      raise ValueError('end_time must not come before start_time.')
self._time_diff = float(self.end_time - self.start_time)
self._diff = float(self.final - self.initial)
def __call__(self, global_step):
if global_step <= self.start_time:
return self.initial
if global_step > self.end_time:
return self.final
x = (global_step - self.start_time) / self._time_diff
# Smootherstep
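    # x * x * x * (x * (x * 6 - 15) + 10) is Horner's form of the
    # smootherstep polynomial 6x^5 - 15x^4 + 10x^3, which rises from 0 to 1
    # on [0, 1] with zero first and second derivatives at both endpoints.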
return self.initial + x * x * x * (x * (x * 6 - 15) + 10) * self._diff
class HardOscillatorSchedule(Schedule):
"""Hard oscillator function.
config:
high: Max value of the oscillator. Value at constant plateaus.
low: Min value of the oscillator. Value at constant valleys.
start_time: Global step when oscillation starts. Constant before this.
period: Width of one oscillation, i.e. number of steps over which the
oscillation takes place.
transition_fraction: Fraction of the period spent transitioning between high
and low values. 50% of this time is spent rising, and 50% of this time
is spent falling. 50% of the remaining time is spent constant at the
high value, and 50% of the remaining time is spent constant at the low
value. transition_fraction = 1.0 means the entire period is spent
rising and falling. transition_fraction = 0.0 means no time is spent
rising and falling, i.e. the function jumps instantaneously between
high and low.
f(t) = high when t < start_time.
f(t) is periodic when t >= start_time, with f(t + period) = f(t).
f(t) is linear with positive slope when rising, and negative slope when
falling. At the start of the period t0, f(t0) = high and begins to descend.
At the middle of the period f is low and is constant until the ascension
begins. f then rises from low to high and is constant again until the period
repeats.
Note: when transition_fraction is 0, f starts the period low and ends high.
"""
def __init__(self, config):
super(HardOscillatorSchedule, self).__init__(config)
self.high = config.high
self.low = config.low
self.start_time = config.start_time
self.period = float(config.period)
self.transition_fraction = config.transition_fraction
self.half_transition_fraction = config.transition_fraction / 2.0
if self.transition_fraction < 0 or self.transition_fraction > 1.0:
raise ValueError('transition_fraction must be between 0 and 1.0')
if self.period <= 0:
raise ValueError('period must be positive')
self._slope = (
float(self.high - self.low) / self.half_transition_fraction
if self.half_transition_fraction > 0 else float('inf'))
def __call__(self, global_step):
if global_step < self.start_time:
return self.high
period_pos = ((global_step - self.start_time) / self.period) % 1.0
if period_pos >= 0.5:
# ascending
period_pos -= 0.5
if period_pos < self.half_transition_fraction:
return self.low + period_pos * self._slope
else:
return self.high
else:
# descending
if period_pos < self.half_transition_fraction:
return self.high - period_pos * self._slope
else:
return self.low
_NAME_TO_CONFIG = {
'const': ConstSchedule,
'linear_decay': LinearDecaySchedule,
'exp_decay': ExponentialDecaySchedule,
'smooth_decay': SmootherstepDecaySchedule,
'hard_osc': HardOscillatorSchedule,
}
def make_schedule(config):
"""Schedule factory.
Given `config` containing a `fn` property, a Schedule implementation is
instantiated with `config`. See `_NAME_TO_CONFIG` for `fn` options.
Args:
config: Config with a `fn` option that specifies which Schedule
implementation to use. `config` is passed into the constructor.
Returns:
A Schedule impl instance.
"""
schedule_class = _NAME_TO_CONFIG[config.fn]
return schedule_class(config)
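
# Example usage (a sketch with hypothetical values):
#   config = config_lib.Config(fn='linear_decay', initial=1.0, final=0.1,
#                              start_time=0, end_time=1000)
#   lr = make_schedule(config)
#   lr(0)     # == 1.0
#   lr(500)   # == 0.55
#   lr(2000)  # == 0.1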
"""Tests for common.schedules."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from math import exp
from math import sqrt
import numpy as np
from six.moves import xrange
import tensorflow as tf
from common import config_lib # brain coder
from common import schedules # brain coder
class SchedulesTest(tf.test.TestCase):
def ScheduleTestHelper(self, config, schedule_subtype, io_values):
"""Run common checks for schedules.
Args:
config: Config object which is passed into schedules.make_schedule.
schedule_subtype: The expected schedule type to be instantiated.
io_values: List of (input, output) pairs. Must be in ascending input
order. No duplicate inputs.
"""
# Check that make_schedule makes the correct type.
f = schedules.make_schedule(config)
self.assertTrue(isinstance(f, schedule_subtype))
# Check that multiple instances returned from make_schedule behave the same.
fns = [schedules.make_schedule(config) for _ in xrange(3)]
# Check that all the inputs map to the right outputs.
for i, o in io_values:
for f in fns:
f_out = f(i)
self.assertTrue(
np.isclose(o, f_out),
'Wrong value at input %d. Expected %s, got %s' % (i, o, f_out))
# Check that a subset of the io_values are still correct.
f = schedules.make_schedule(config)
subseq = [io_values[i**2] for i in xrange(int(sqrt(len(io_values))))]
if subseq[-1] != io_values[-1]:
subseq.append(io_values[-1])
for i, o in subseq:
f_out = f(i)
self.assertTrue(
np.isclose(o, f_out),
'Wrong value at input %d. Expected %s, got %s' % (i, o, f_out))
# Check duplicate calls.
f = schedules.make_schedule(config)
for i, o in io_values:
for _ in xrange(3):
f_out = f(i)
self.assertTrue(
np.isclose(o, f_out),
'Duplicate calls at input %d are not equal. Expected %s, got %s'
% (i, o, f_out))
def testConstSchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='const', const=5),
schedules.ConstSchedule,
[(0, 5), (1, 5), (10, 5), (20, 5), (100, 5), (1000000, 5)])
def testLinearDecaySchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='linear_decay', initial=2, final=0, start_time=10,
end_time=20),
schedules.LinearDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 1.8), (15, 1), (19, 0.2), (20, 0),
(100000, 0)])
# Test step function.
self.ScheduleTestHelper(
config_lib.Config(fn='linear_decay', initial=2, final=0, start_time=10,
end_time=10),
schedules.LinearDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 0), (15, 0)])
def testExponentialDecaySchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='exp_decay', initial=exp(-1), final=exp(-6),
start_time=10, end_time=20),
schedules.ExponentialDecaySchedule,
[(0, exp(-1)), (1, exp(-1)), (10, exp(-1)), (11, exp(-1/2. - 1)),
(15, exp(-5/2. - 1)), (19, exp(-9/2. - 1)), (20, exp(-6)),
(100000, exp(-6))])
# Test step function.
self.ScheduleTestHelper(
config_lib.Config(fn='exp_decay', initial=exp(-1), final=exp(-6),
start_time=10, end_time=10),
schedules.ExponentialDecaySchedule,
[(0, exp(-1)), (1, exp(-1)), (10, exp(-1)), (11, exp(-6)),
(15, exp(-6))])
def testSmootherstepDecaySchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='smooth_decay', initial=2, final=0, start_time=10,
end_time=20),
schedules.SmootherstepDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 1.98288), (15, 1), (19, 0.01712),
(20, 0), (100000, 0)])
# Test step function.
self.ScheduleTestHelper(
config_lib.Config(fn='smooth_decay', initial=2, final=0, start_time=10,
end_time=10),
schedules.SmootherstepDecaySchedule,
[(0, 2), (1, 2), (10, 2), (11, 0), (15, 0)])
def testHardOscillatorSchedule(self):
self.ScheduleTestHelper(
config_lib.Config(fn='hard_osc', high=2, low=0, start_time=100,
period=10, transition_fraction=0.5),
schedules.HardOscillatorSchedule,
[(0, 2), (1, 2), (10, 2), (100, 2), (101, 1.2), (102, 0.4), (103, 0),
(104, 0), (105, 0), (106, 0.8), (107, 1.6), (108, 2), (109, 2),
(110, 2), (111, 1.2), (112, 0.4), (115, 0), (116, 0.8), (119, 2),
(120, 2), (100001, 1.2), (100002, 0.4), (100005, 0), (100006, 0.8),
(100010, 2)])
# Test instantaneous step.
self.ScheduleTestHelper(
config_lib.Config(fn='hard_osc', high=2, low=0, start_time=100,
period=10, transition_fraction=0),
schedules.HardOscillatorSchedule,
[(0, 2), (1, 2), (10, 2), (99, 2), (100, 0), (104, 0), (105, 2),
(106, 2), (109, 2), (110, 0)])
if __name__ == '__main__':
tf.test.main()
"""Utility classes and functions: mutable records, padded batch stacking,
queues, and a weighted-sampling replay buffer."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import bisect
from collections import deque
from six.moves import cPickle
import heapq
import random
from absl import logging
import numpy as np
import six
from six.moves import xrange
import tensorflow as tf
def tuple_to_record(tuple_, record_type):
  """Make an instance of `record_type` from the values in `tuple_`."""
  return record_type(**dict(zip(record_type.__slots__, tuple_)))
def make_record(type_name, attributes, defaults=None):
"""Factory for mutable record classes.
A record acts just like a collections.namedtuple except slots are writable.
One exception is that record classes are not equivalent to tuples or other
record classes of the same length.
Note, each call to `make_record` produces a unique type. Two calls will make
different types even if `type_name` is the same each time.
Args:
type_name: Name of the record type to create.
attributes: List of names of each record attribute. The order of the list
is preserved.
defaults: (optional) default values for attributes. A dict mapping attribute
names to values.
Returns:
A new record type.
Raises:
ValueError: If,
`defaults` is not a dict,
`attributes` contains duplicate names,
`defaults` keys are not contained in `attributes`.
"""
if defaults is None:
defaults = {}
if not isinstance(defaults, dict):
raise ValueError('defaults must be a dict.')
attr_set = set(attributes)
if len(attr_set) < len(attributes):
raise ValueError('No duplicate attributes allowed.')
if not set(defaults.keys()).issubset(attr_set):
raise ValueError('Default attributes must be given in the attributes list.')
class RecordClass(object):
"""A record type.
Acts like mutable tuple with named slots.
"""
__slots__ = list(attributes)
_defaults = dict(defaults)
def __init__(self, *args, **kwargs):
if len(args) > len(self.__slots__):
raise ValueError('Too many arguments. %s has length %d.'
% (type(self).__name__, len(self.__slots__)))
for attr, val in self._defaults.items():
setattr(self, attr, val)
for i, arg in enumerate(args):
setattr(self, self.__slots__[i], arg)
for attr, val in kwargs.items():
setattr(self, attr, val)
for attr in self.__slots__:
if not hasattr(self, attr):
raise ValueError('Required attr "%s" is not set.' % attr)
def __len__(self):
return len(self.__slots__)
def __iter__(self):
for attr in self.__slots__:
yield getattr(self, attr)
def __getitem__(self, index):
return getattr(self, self.__slots__[index])
def __setitem__(self, index, value):
return setattr(self, self.__slots__[index], value)
def __eq__(self, other):
# Types must be equal as well as values.
return (isinstance(other, type(self))
and all(a == b for a, b in zip(self, other)))
def __str__(self):
return '%s(%s)' % (
type(self).__name__,
', '.join(attr + '=' + str(getattr(self, attr))
for attr in self.__slots__))
def __repr__(self):
return str(self)
RecordClass.__name__ = type_name
return RecordClass
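
# Example usage (a sketch):
#   Point = make_record('Point', ['x', 'y'], defaults={'y': 0})
#   p = Point(1)      # p.x == 1, p.y == 0 (the default)
#   p.y = 5           # slots are writable, unlike collections.namedtuple
#   x, y = p          # records unpack like tuples
#   p == Point(1, 5)  # True; equality also requires the same record type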
# Making minibatches.
def stack_pad(tensors, pad_axes=None, pad_to_lengths=None, dtype=np.float32,
pad_value=0):
"""Stack tensors along 0-th dim and pad them to be the same shape.
Args:
tensors: Any list of iterables (python list, numpy array, etc). Can be 1D
or multi-D iterables.
pad_axes: An int or list of ints. Axes to pad along.
pad_to_lengths: Length in each dimension. If pad_axes was an int, this is an
int or None. If pad_axes was a list of ints, this is a list of mixed int
and None types with the same length, or None. A None length means the
maximum length among the given tensors is used.
dtype: Type of output numpy array. Defaults to np.float32.
pad_value: Value to use for padding. Defaults to 0.
Returns:
Numpy array containing the tensors stacked along the 0-th dimension and
padded along the specified dimensions.
Raises:
ValueError: If the tensors do not have equal shapes along non-padded
dimensions.
"""
tensors = [np.asarray(t) for t in tensors]
max_lengths = [max(l) for l in zip(*[t.shape for t in tensors])]
same_axes = dict(enumerate(max_lengths))
if pad_axes is None:
pad_axes = []
  if isinstance(pad_axes, six.integer_types):
    if pad_to_lengths is not None:
      max_lengths[pad_axes] = pad_to_lengths
    del same_axes[pad_axes]
  else:
    if pad_to_lengths is None:
      pad_to_lengths = [None] * len(pad_axes)
    for i, l in zip(pad_axes, pad_to_lengths):
      if l is not None:
        max_lengths[i] = l
      del same_axes[i]
same_axes_items = same_axes.items()
dest = np.full([len(tensors)] + max_lengths, pad_value, dtype=dtype)
for i, t in enumerate(tensors):
for j, l in same_axes_items:
if t.shape[j] != l:
raise ValueError(
'Tensor at index %d does not have size %d along axis %d'
% (i, l, j))
    dest[tuple([i] + [slice(0, d) for d in t.shape])] = t
return dest
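
# Example (a sketch): stack two 1-D sequences, padding axis 0 to the longest
# input length:
#   stack_pad([[1, 2, 3], [4]], pad_axes=0)
#   # -> array([[1., 2., 3.],
#   #           [4., 0., 0.]], dtype=float32)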
class RandomQueue(deque):
  """Bounded FIFO buffer supporting uniform random sampling, with replacement."""
def __init__(self, capacity):
super(RandomQueue, self).__init__([], capacity)
self.capacity = capacity
def random_sample(self, sample_size):
idx = np.random.choice(len(self), sample_size)
return [self[i] for i in idx]
def push(self, item):
# Append to right. Oldest element will be popped from left.
self.append(item)
class MPQItemContainer(object):
"""Class for holding an item with its score.
Defines a comparison function for use in the heap-queue.
"""
def __init__(self, score, item, extra_data):
self.item = item
self.score = score
self.extra_data = extra_data
  def __lt__(self, other):
    assert isinstance(other, type(self))
    return self.score < other.score
def __iter__(self):
"""Allows unpacking like a tuple."""
yield self.score
yield self.item
yield self.extra_data
def __repr__(self):
"""String representation of this item.
`extra_data` is not included in the representation. We are assuming that
`extra_data` is not easily interpreted by a human (if it was, it should be
hashable, like a string or tuple).
Returns:
String representation of `self`.
"""
return str((self.score, self.item))
def __str__(self):
return repr(self)
class MaxUniquePriorityQueue(object):
"""A maximum priority queue where duplicates are not added.
The top items by score remain in the queue. When the capacity is reached,
the lowest scored item in the queue will be dropped.
  This implementation differs from a typical priority queue in that the
  minimum score is popped instead of the maximum; the largest scores remain in
  the queue. This is useful for accumulating the best known items from a
  population.
The items used to determine uniqueness must be hashable, but additional
non-hashable data may be stored with each item.
"""
def __init__(self, capacity):
self.capacity = capacity
self.heap = []
self.unique_items = set()
def push(self, score, item, extra_data=None):
"""Push an item onto the queue.
    If the queue is at capacity, the item with the smallest score will be
    dropped. Note that it is assumed each item has exactly one score; pushing
    a duplicate item is ignored, even if it arrives with a different score.
Args:
score: Number used to prioritize items in the queue. Largest scores are
kept in the queue.
item: A hashable item to be stored. Duplicates of this item will not be
added to the queue.
      extra_data: Extra (possibly non-hashable) data to store with the item.
"""
if item in self.unique_items:
return
if len(self.heap) >= self.capacity:
_, popped_item, _ = heapq.heappushpop(
self.heap, MPQItemContainer(score, item, extra_data))
self.unique_items.add(item)
self.unique_items.remove(popped_item)
else:
heapq.heappush(self.heap, MPQItemContainer(score, item, extra_data))
self.unique_items.add(item)
def pop(self):
"""Pop the item with the lowest score.
Returns:
score: Item's score.
item: The item that was popped.
extra_data: Any extra data stored with the item.
"""
if not self.heap:
return ()
score, item, extra_data = heapq.heappop(self.heap)
self.unique_items.remove(item)
return score, item, extra_data
def get_max(self):
"""Peek at the item with the highest score.
Returns:
Same as `pop`.
"""
if not self.heap:
return ()
score, item, extra_data = heapq.nlargest(1, self.heap)[0]
return score, item, extra_data
def get_min(self):
"""Peek at the item with the lowest score.
Returns:
Same as `pop`.
"""
if not self.heap:
return ()
score, item, extra_data = heapq.nsmallest(1, self.heap)[0]
return score, item, extra_data
def random_sample(self, sample_size):
"""Randomly select items from the queue.
This does not modify the queue.
Items are drawn from a uniform distribution, and not weighted by score.
Args:
sample_size: Number of random samples to draw. The same item can be
sampled multiple times.
Returns:
List of sampled items (of length `sample_size`). Each element in the list
is a tuple: (item, extra_data).
"""
idx = np.random.choice(len(self.heap), sample_size)
return [(self.heap[i].item, self.heap[i].extra_data) for i in idx]
def iter_in_order(self):
"""Iterate over items in the queue from largest score to smallest.
Yields:
item: Hashable item.
extra_data: Extra data stored with the item.
"""
for _, item, extra_data in heapq.nlargest(len(self.heap), self.heap):
yield item, extra_data
def __len__(self):
return len(self.heap)
def __iter__(self):
for _, item, _ in self.heap:
yield item
def __repr__(self):
return '[' + ', '.join(repr(c) for c in self.heap) + ']'
def __str__(self):
return repr(self)
class RouletteWheel(object):
"""Randomly samples stored objects proportionally to their given weights.
Stores objects and weights. Acts like a roulette wheel where each object is
given a slice of the roulette disk proportional to its weight.
This can be used as a replay buffer where past experiences are sampled
proportionally to their weights. A good choice of "weight" for reinforcement
learning is exp(reward / temperature) where temperature -> inf makes the
distribution more uniform and temperature -> 0 makes the distribution more
peaky.
To prevent experiences from being overweighted by appearing in the replay
buffer multiple times, a "unique mode" is supported where duplicate
experiences are ignored. In unique mode, weights can be quickly retrieved from
keys.
"""
def __init__(self, unique_mode=False, save_file=None):
"""Construct empty RouletteWheel.
If `save_file` is not None, and the file already exists on disk, whatever
is in the file will be loaded into this instance. This allows jobs using
RouletteWheel to resume after preemption.
Args:
unique_mode: If True, puts this RouletteWheel into unique mode, where
objects are added with hashable keys, so that duplicates are ignored.
save_file: Optional file path to save to. Must be a string containing
an absolute path to a file, or None. File will be Python pickle
format.
"""
self.unique_mode = unique_mode
self.objects = []
self.weights = []
self.partial_sums = []
if self.unique_mode:
self.keys_to_weights = {}
self.save_file = save_file
self.save_to_disk_buffer = []
if save_file is not None and tf.gfile.Exists(save_file):
# Load from disk.
      with tf.gfile.GFile(save_file, 'rb') as f:
count = 0
while 1:
try:
obj, weight, key = cPickle.load(f)
except EOFError:
break
else:
self.add(obj, weight, key)
count += 1
logging.info('Loaded %d samples from disk.', count)
# Clear buffer since these items are already on disk.
self.save_to_disk_buffer = []
def __iter__(self):
return iter(zip(self.objects, self.weights))
def __len__(self):
return len(self.objects)
def is_empty(self):
"""Returns whether there is anything in the roulette wheel."""
return not self.partial_sums
@property
def total_weight(self):
"""Total cumulative weight across all objects."""
if self.partial_sums:
return self.partial_sums[-1]
return 0.0
  def has_key(self, key):
    if not self.unique_mode:
      raise RuntimeError('has_key method can only be called in unique mode.')
    return key in self.keys_to_weights

  def get_weight(self, key):
    if not self.unique_mode:
      raise RuntimeError('get_weight method can only be called in unique mode.')
    return self.keys_to_weights[key]
def add(self, obj, weight, key=None):
"""Add one object and its weight to the roulette wheel.
Args:
obj: Any object to be stored.
weight: A non-negative float. The given object will be drawn with
probability proportional to this weight when sampling.
key: This argument is only used when in unique mode. To allow `obj` to
be an unhashable type, like list, a separate hashable key is given.
Each `key` should be unique to each `obj`. `key` is used to check if
`obj` has been added to the roulette wheel before.
Returns:
True if the object was added, False if it was not added due to it being
a duplicate (this only happens in unique mode).
Raises:
ValueError: If `weight` is negative.
ValueError: If `key` is not given when in unique mode, or if `key` is
given when not in unique mode.
"""
if weight < 0:
raise ValueError('Weight must be non-negative')
if self.unique_mode:
if key is None:
raise ValueError(
'Hashable key required for objects when unique mode is enabled.')
if key in self.keys_to_weights:
# Weight updates are not allowed. Ignore the given value of `weight`.
return False
self.keys_to_weights[key] = weight
elif key is not None:
raise ValueError(
'key argument should not be used when unique mode is disabled.')
self.objects.append(obj)
self.weights.append(weight)
self.partial_sums.append(self.total_weight + weight)
if self.save_file is not None:
# Record new item in buffer.
self.save_to_disk_buffer.append((obj, weight, key))
return True
def add_many(self, objs, weights, keys=None):
"""Add many object and their weights to the roulette wheel.
Arguments are the same as the `add` method, except each is a list. Lists
must all be the same length.
Args:
objs: List of objects to be stored.
weights: List of non-negative floats. See `add` method.
keys: List of hashable keys. This argument is only used when in unique
mode. See `add` method.
Returns:
Number of objects added. This number will be less than the number of
objects provided if we are in unique mode and some keys are already
in the roulette wheel.
Raises:
ValueError: If `keys` argument is provided when unique_mode == False, or
is not provided when unique_mode == True.
ValueError: If any of the lists are not the same length.
ValueError: If any of the weights are negative.
"""
if keys is not None and not self.unique_mode:
raise ValueError('Not in unique mode. Do not provide keys.')
elif keys is None and self.unique_mode:
raise ValueError('In unique mode. You must provide hashable keys.')
if keys and len(objs) != len(keys):
raise ValueError('Number of objects does not equal number of keys.')
if len(objs) != len(weights):
raise ValueError('Number of objects does not equal number of weights.')
return sum([self.add(obj, weights[i], key=keys[i] if keys else None)
for i, obj in enumerate(objs)])
def sample(self):
"""Spin the roulette wheel.
Randomly select an object with probability proportional to its weight.
Returns:
object: The selected object.
weight: The weight of the selected object.
Raises:
RuntimeError: If the roulette wheel is empty.
"""
if self.is_empty():
raise RuntimeError('Trying to sample from empty roulette wheel.')
spin = random.random() * self.total_weight
# Binary search.
i = bisect.bisect_right(self.partial_sums, spin)
if i == len(self.partial_sums):
      # This should not happen since random.random() is always strictly less
      # than 1.0, and the last partial sum equals self.total_weight. However,
      # it may happen due to rounding error; in that case, just select the
      # last object.
i -= 1
return self.objects[i], self.weights[i]
def sample_many(self, count):
"""Spin the roulette wheel `count` times and return the results."""
if self.is_empty():
raise RuntimeError('Trying to sample from empty roulette wheel.')
return [self.sample() for _ in xrange(count)]
def incremental_save(self, log_info=False):
"""Write new entries to disk.
This performs an append operation on the `save_file` given in the
constructor. Any entries added since the last call to `incremental_save`
will be appended to the file.
If a new RouletteWheel is constructed with the same `save_file`, all the
entries written there will be automatically loaded into the instance.
This is useful when a job resumes after preemption.
Args:
log_info: If True, info about this operation will be logged.
Raises:
RuntimeError: If `save_file` given in the constructor is None.
"""
if self.save_file is None:
raise RuntimeError('Cannot call incremental_save. `save_file` is None.')
if log_info:
logging.info('Saving %d new samples to disk.',
len(self.save_to_disk_buffer))
    with tf.gfile.GFile(self.save_file, 'ab') as f:
for entry in self.save_to_disk_buffer:
cPickle.dump(entry, f)
# Clear the buffer.
self.save_to_disk_buffer = []
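
# Example (a sketch): a replay buffer where experiences are sampled with
# probability proportional to exp(reward / temperature), as described in the
# RouletteWheel docstring. The episode, reward, temperature, and key below
# are hypothetical.
#   import math
#   wheel = RouletteWheel(unique_mode=True)
#   wheel.add([1, 2, 3], math.exp(0.5 / 0.1), key='episode-1')
#   obj, weight = wheel.sample()  # obj == [1, 2, 3]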
"""Tests for common.utils."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import Counter
import random
import tempfile
import numpy as np
import tensorflow as tf
from common import utils # brain coder
class UtilsTest(tf.test.TestCase):
def testStackPad(self):
# 1D.
tensors = [[1, 2, 3], [4, 5, 6, 7, 8], [9]]
result = utils.stack_pad(tensors, pad_axes=0, pad_to_lengths=6)
self.assertTrue(np.array_equal(
result,
np.asarray([[1, 2, 3, 0, 0, 0],
[4, 5, 6, 7, 8, 0],
[9, 0, 0, 0, 0, 0]], dtype=np.float32)))
# 3D.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]]]
result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
self.assertTrue(np.array_equal(
result,
np.asarray([[[[1, 2, 3], [4, 5, 6]],
[[0, 0, 0], [0, 0, 0]]],
[[[7, 8, 9], [0, 1, 2]],
[[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2], [0, 0, 0]],
[[3, 4, 5], [0, 0, 0]]]], dtype=np.float32)))
def testStackPadNoAxes(self):
# 2D.
tensors = [[[1, 2, 3], [4, 5, 6]],
[[7, 8, 9], [1, 2, 3]],
[[4, 5, 6], [7, 8, 9]]]
result = utils.stack_pad(tensors)
self.assertTrue(np.array_equal(
result,
np.asarray(tensors)))
def testStackPadNoneLength(self):
# 1D.
tensors = [[1, 2, 3], [4, 5, 6, 7, 8], [9]]
result = utils.stack_pad(tensors, pad_axes=0, pad_to_lengths=None)
self.assertTrue(np.array_equal(
result,
np.asarray([[1, 2, 3, 0, 0],
[4, 5, 6, 7, 8],
[9, 0, 0, 0, 0]], dtype=np.float32)))
# 3D.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]]]
result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=None)
self.assertTrue(np.array_equal(
result,
np.asarray([[[[1, 2, 3], [4, 5, 6]],
[[0, 0, 0], [0, 0, 0]]],
[[[7, 8, 9], [0, 1, 2]],
[[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2], [0, 0, 0]],
[[3, 4, 5], [0, 0, 0]]]], dtype=np.float32)))
# 3D with partial pad_to_lengths.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]]]
result = utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[None, 3])
self.assertTrue(np.array_equal(
result,
np.asarray([[[[1, 2, 3], [4, 5, 6], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0]]],
[[[7, 8, 9], [0, 1, 2], [0, 0, 0]],
[[3, 4, 5], [6, 7, 8], [0, 0, 0]]],
[[[0, 1, 2], [0, 0, 0], [0, 0, 0]],
[[3, 4, 5], [0, 0, 0], [0, 0, 0]]]], dtype=np.float32)))
def testStackPadValueError(self):
# 3D.
tensors = [[[[1, 2, 3], [4, 5, 6]]],
[[[7, 8, 9], [0, 1, 2]], [[3, 4, 5], [6, 7, 8]]],
[[[0, 1, 2]], [[3, 4, 5]]],
[[[1, 2, 3, 4]]]]
# Not all tensors have the same shape along axis 2.
with self.assertRaises(ValueError):
utils.stack_pad(tensors, pad_axes=[0, 1], pad_to_lengths=[2, 2])
def testRecord(self):
my_record = utils.make_record('my_record', ['a', 'b', 'c'], {'b': 55})
inst = my_record(a=1, b=2, c=3)
self.assertEqual(1, inst.a)
self.assertEqual(2, inst.b)
self.assertEqual(3, inst.c)
self.assertEqual(1, inst[0])
self.assertEqual(2, inst[1])
self.assertEqual(3, inst[2])
self.assertEqual([1, 2, 3], list(iter(inst)))
self.assertEqual(3, len(inst))
inst.b = 999
self.assertEqual(999, inst.b)
self.assertEqual(999, inst[1])
inst2 = my_record(1, 999, 3)
self.assertTrue(inst == inst2)
inst2[1] = 3
self.assertFalse(inst == inst2)
inst3 = my_record(a=1, c=3)
inst.b = 55
self.assertEqual(inst, inst3)
def testRecordUnique(self):
record1 = utils.make_record('record1', ['a', 'b', 'c'])
record2 = utils.make_record('record2', ['a', 'b', 'c'])
self.assertNotEqual(record1(1, 2, 3), record2(1, 2, 3))
self.assertEqual(record1(1, 2, 3), record1(1, 2, 3))
def testTupleToRecord(self):
my_record = utils.make_record('my_record', ['a', 'b', 'c'])
inst = utils.tuple_to_record((5, 6, 7), my_record)
self.assertEqual(my_record(5, 6, 7), inst)
def testRecordErrors(self):
my_record = utils.make_record('my_record', ['a', 'b', 'c'], {'b': 10})
with self.assertRaises(ValueError):
my_record(c=5) # Did not provide required argument 'a'.
with self.assertRaises(ValueError):
my_record(1, 2, 3, 4) # Too many arguments.
def testRandomQueue(self):
np.random.seed(567890)
queue = utils.RandomQueue(5)
queue.push(5)
queue.push(6)
queue.push(7)
queue.push(8)
queue.push(9)
queue.push(10)
self.assertTrue(5 not in queue)
sample = queue.random_sample(1000)
self.assertEqual(1000, len(sample))
self.assertEqual([6, 7, 8, 9, 10], sorted(np.unique(sample).tolist()))
def testMaxUniquePriorityQueue(self):
queue = utils.MaxUniquePriorityQueue(5)
queue.push(1.0, 'string 1')
queue.push(-0.5, 'string 2')
queue.push(0.5, 'string 3')
self.assertEqual((-0.5, 'string 2', None), queue.pop())
queue.push(0.1, 'string 4')
queue.push(1.5, 'string 5')
queue.push(0.0, 'string 6')
queue.push(0.2, 'string 7')
self.assertEqual((1.5, 'string 5', None), queue.get_max())
self.assertEqual((0.1, 'string 4', None), queue.get_min())
self.assertEqual(
[('string 5', None), ('string 1', None), ('string 3', None),
('string 7', None), ('string 4', None)],
list(queue.iter_in_order()))
def testMaxUniquePriorityQueue_Duplicates(self):
queue = utils.MaxUniquePriorityQueue(5)
queue.push(0.0, 'string 1')
queue.push(0.0, 'string 2')
queue.push(0.0, 'string 3')
self.assertEqual((0.0, 'string 1', None), queue.pop())
self.assertEqual((0.0, 'string 2', None), queue.pop())
self.assertEqual((0.0, 'string 3', None), queue.pop())
self.assertEqual(0, len(queue))
queue.push(0.1, 'string 4')
queue.push(1.5, 'string 5')
queue.push(0.3, 'string 6')
queue.push(0.2, 'string 7')
queue.push(0.0, 'string 8')
queue.push(1.5, 'string 5')
queue.push(1.5, 'string 5')
self.assertEqual((1.5, 'string 5', None), queue.get_max())
self.assertEqual((0.0, 'string 8', None), queue.get_min())
self.assertEqual(
[('string 5', None), ('string 6', None), ('string 7', None),
('string 4', None), ('string 8', None)],
list(queue.iter_in_order()))
def testMaxUniquePriorityQueue_ExtraData(self):
queue = utils.MaxUniquePriorityQueue(5)
queue.push(1.0, 'string 1', [1, 2, 3])
queue.push(0.5, 'string 2', [4, 5, 6])
queue.push(0.5, 'string 3', [7, 8, 9])
queue.push(0.5, 'string 2', [10, 11, 12])
self.assertEqual((0.5, 'string 2', [4, 5, 6]), queue.pop())
self.assertEqual((0.5, 'string 3', [7, 8, 9]), queue.pop())
self.assertEqual((1.0, 'string 1', [1, 2, 3]), queue.pop())
self.assertEqual(0, len(queue))
queue.push(0.5, 'string 2', [10, 11, 12])
self.assertEqual((0.5, 'string 2', [10, 11, 12]), queue.pop())
def testRouletteWheel(self):
random.seed(12345678987654321)
r = utils.RouletteWheel()
self.assertTrue(r.is_empty())
with self.assertRaises(RuntimeError):
r.sample() # Cannot sample when empty.
self.assertEqual(0, r.total_weight)
self.assertEqual(True, r.add('a', 0.1))
self.assertFalse(r.is_empty())
self.assertEqual(0.1, r.total_weight)
self.assertEqual(True, r.add('b', 0.01))
self.assertEqual(0.11, r.total_weight)
self.assertEqual(True, r.add('c', 0.5))
self.assertEqual(True, r.add('d', 0.1))
self.assertEqual(True, r.add('e', 0.05))
self.assertEqual(True, r.add('f', 0.03))
self.assertEqual(True, r.add('g', 0.001))
self.assertEqual(0.791, r.total_weight)
self.assertFalse(r.is_empty())
# Check that sampling is correct.
obj, weight = r.sample()
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
for obj, weight in r.sample_many(100):
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
# Check that sampling distribution is correct.
n = 1000000
c = Counter(r.sample_many(n))
for obj, w in r:
estimated_w = c[(obj, w)] / float(n) * r.total_weight
self.assertTrue(
np.isclose(w, estimated_w, atol=1e-3),
'Expected %s, got %s, for object %s' % (w, estimated_w, obj))
def testRouletteWheel_AddMany(self):
random.seed(12345678987654321)
r = utils.RouletteWheel()
self.assertTrue(r.is_empty())
with self.assertRaises(RuntimeError):
r.sample() # Cannot sample when empty.
self.assertEqual(0, r.total_weight)
count = r.add_many(
['a', 'b', 'c', 'd', 'e', 'f', 'g'],
[0.1, 0.01, 0.5, 0.1, 0.05, 0.03, 0.001])
self.assertEqual(7, count)
self.assertFalse(r.is_empty())
self.assertEqual(0.791, r.total_weight)
# Adding no items is allowed.
count = r.add_many([], [])
self.assertEqual(0, count)
self.assertFalse(r.is_empty())
self.assertEqual(0.791, r.total_weight)
# Check that sampling is correct.
obj, weight = r.sample()
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
for obj, weight in r.sample_many(100):
self.assertTrue(isinstance(weight, float), 'Type: %s' % type(weight))
self.assertTrue((obj, weight) in r)
# Check that sampling distribution is correct.
n = 1000000
c = Counter(r.sample_many(n))
for obj, w in r:
estimated_w = c[(obj, w)] / float(n) * r.total_weight
self.assertTrue(
np.isclose(w, estimated_w, atol=1e-3),
'Expected %s, got %s, for object %s' % (w, estimated_w, obj))
def testRouletteWheel_AddZeroWeights(self):
r = utils.RouletteWheel()
self.assertEqual(True, r.add('a', 0))
self.assertFalse(r.is_empty())
self.assertEqual(4, r.add_many(['b', 'c', 'd', 'e'], [0, 0.1, 0, 0]))
self.assertEqual(
[('a', 0.0), ('b', 0.0), ('c', 0.1), ('d', 0.0), ('e', 0.0)],
list(r))
def testRouletteWheel_UniqueMode(self):
random.seed(12345678987654321)
r = utils.RouletteWheel(unique_mode=True)
self.assertEqual(True, r.add([1, 2, 3], 1, 'a'))
self.assertEqual(True, r.add([4, 5], 0.5, 'b'))
self.assertEqual(False, r.add([1, 2, 3], 1.5, 'a'))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5)],
list(r))
self.assertEqual(1.5, r.total_weight)
self.assertEqual(
2,
r.add_many(
[[5, 6, 2, 3], [1, 2, 3], [8], [1, 2, 3]],
[0.1, 0.2, 0.1, 2.0],
['c', 'a', 'd', 'a']))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5), ([5, 6, 2, 3], 0.1), ([8], 0.1)],
list(r))
self.assertTrue(np.isclose(1.7, r.total_weight))
self.assertEqual(0, r.add_many([], [], [])) # Adding no items is allowed.
with self.assertRaises(ValueError):
# Key not given.
r.add([7, 8, 9], 2.0)
with self.assertRaises(ValueError):
# Keys not given.
r.add_many([[7, 8, 9], [10]], [2.0, 2.0])
self.assertEqual(True, r.has_key('a'))
self.assertEqual(True, r.has_key('b'))
self.assertEqual(False, r.has_key('z'))
self.assertEqual(1.0, r.get_weight('a'))
self.assertEqual(0.5, r.get_weight('b'))
r = utils.RouletteWheel(unique_mode=False)
self.assertEqual(True, r.add([1, 2, 3], 1))
self.assertEqual(True, r.add([4, 5], 0.5))
self.assertEqual(True, r.add([1, 2, 3], 1.5))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5), ([1, 2, 3], 1.5)],
list(r))
self.assertEqual(3, r.total_weight)
self.assertEqual(
4,
r.add_many(
[[5, 6, 2, 3], [1, 2, 3], [8], [1, 2, 3]],
[0.1, 0.2, 0.1, 0.2]))
self.assertEqual(
[([1, 2, 3], 1.0), ([4, 5], 0.5), ([1, 2, 3], 1.5),
([5, 6, 2, 3], 0.1), ([1, 2, 3], 0.2), ([8], 0.1), ([1, 2, 3], 0.2)],
list(r))
self.assertTrue(np.isclose(3.6, r.total_weight))
with self.assertRaises(ValueError):
# Key is given.
r.add([7, 8, 9], 2.0, 'a')
with self.assertRaises(ValueError):
# Keys are given.
r.add_many([[7, 8, 9], [10]], [2.0, 2.0], ['a', 'b'])
def testRouletteWheel_IncrementalSave(self):
f = tempfile.NamedTemporaryFile()
r = utils.RouletteWheel(unique_mode=True, save_file=f.name)
entries = [
([1, 2, 3], 0.1, 'a'),
([4, 5], 0.2, 'b'),
([6], 0.3, 'c'),
([7, 8, 9, 10], 0.25, 'd'),
([-1, -2], 0.15, 'e'),
([-3, -4, -5], 0.5, 'f')]
self.assertTrue(r.is_empty())
for i in range(0, len(entries), 2):
r.add(*entries[i])
r.add(*entries[i + 1])
r.incremental_save()
r2 = utils.RouletteWheel(unique_mode=True, save_file=f.name)
self.assertEqual(i + 2, len(r2))
count = 0
for j, (obj, weight) in enumerate(r2):
self.assertEqual(entries[j][0], obj)
self.assertEqual(entries[j][1], weight)
self.assertEqual(weight, r2.get_weight(entries[j][2]))
count += 1
self.assertEqual(i + 2, count)
if __name__ == '__main__':
tf.test.main()
licenses(["notice"])
package(default_visibility = [
"//learning/brain/research/neural_coder:__subpackages__",
])
load("@subpar//:subpar.bzl", "par_binary")
par_binary(
name = "run",
srcs = ["run.py"],
deps = [
":defaults",
":ga_train",
":pg_train",
# absl dep :app
# absl dep /flags
# absl dep /logging
],
)
par_binary(
name = "tune",
srcs = ["tune.py"],
deps = [
":defaults",
":run",
# file dep
# absl dep :app
# absl dep /flags
# absl dep /logging
# numpy dep
# tensorflow dep
],
)
py_library(
name = "ga_train",
srcs = ["ga_train.py"],
deps = [
":data",
":defaults",
":ga_lib",
":results_lib",
# file dep
# absl dep /flags
# absl dep /logging
# numpy dep
# tensorflow dep
"//common:utils", # project
],
)
py_library(
name = "ga_lib",
srcs = ["ga_lib.py"],
deps = [
":misc",
# absl dep /flags
# absl dep /logging
# numpy dep
"//common:bf", # project
"//common:utils", # project
],
)
py_test(
name = "ga_train_test",
srcs = ["ga_train_test.py"],
deps = [
":defaults",
":run",
# absl dep /flags
# tensorflow dep
],
)
py_library(
name = "pg_train",
srcs = ["pg_train.py"],
deps = [
":data",
":defaults",
":pg_agent",
":results_lib",
# file dep
# absl dep /flags
# absl dep /logging
# tensorflow dep
# tensorflow internal dep # build_cleaner: keep
],
)
py_library(
name = "pg_agent",
srcs = ["pg_agent.py"],
deps = [
":misc",
# file dep
# absl dep /logging
# numpy dep
# tensorflow dep
"//common:rollout", # project
"//common:utils", # project
],
)
py_test(
name = "pg_agent_test",
srcs = ["pg_agent_test.py"],
deps = [
":data",
":defaults",
":misc",
":pg_agent",
":pg_train",
# absl dep /logging
# numpy dep
# tensorflow dep
"//common:utils", # project
],
)
py_library(
name = "defaults",
srcs = ["defaults.py"],
deps = [
# absl dep /logging
"//common:config_lib", # project
],
)
py_library(
name = "misc",
srcs = ["misc.py"],
)
py_library(
name = "data",
srcs = ["data.py"],
deps = [
":code_tasks",
# absl dep /logging
],
)
py_library(
name = "code_tasks",
srcs = ["code_tasks.py"],
deps = [
":misc",
":test_tasks",
# absl dep /logging
# numpy dep
"//common:bf", # project
"//common:reward", # project
],
)
py_test(
name = "code_tasks_test",
srcs = ["code_tasks_test.py"],
deps = [
":code_tasks",
":defaults",
# numpy dep
# tensorflow dep
],
)
py_library(
name = "test_tasks",
srcs = ["test_tasks.py"],
deps = [
":misc",
"//common:reward", # project
],
)
py_test(
name = "test_tasks_test",
srcs = ["test_tasks_test.py"],
deps = [
":misc",
":test_tasks",
# numpy dep
# tensorflow dep
],
)
py_test(
name = "pg_train_test",
size = "large",
srcs = ["pg_train_test.py"],
deps = [
":defaults",
":run",
# absl dep /logging
# tensorflow dep
],
)
py_library(
name = "results_lib",
srcs = ["results_lib.py"],
deps = [
# file dep
# tensorflow dep
],
)
py_test(
name = "results_lib_test",
srcs = ["results_lib_test.py"],
deps = [
":results_lib",
# tensorflow dep
],
)
par_binary(
name = "aggregate_experiment_results",
srcs = ["aggregate_experiment_results.py"],
deps = [
":misc",
":results_lib",
# file dep
# absl dep :app
# absl dep /flags
# numpy dep
# tensorflow dep
],
)
par_binary(
name = "aggregate_tuning_results",
srcs = ["aggregate_tuning_results.py"],
deps = [
# file dep
# absl dep :app
# absl dep /flags
# tensorflow dep
],
)
# Experiments for ICLR 2018 paper.
[Neural Program Synthesis with Priority Queue Training](https://arxiv.org/abs/1801.03526).
Runs policy gradient (REINFORCE), priority queue training, genetic algorithm,
and uniform random search.
Run all examples below out of your top-level repo directory, i.e. where your git
clone resides.
## Just tell me how to run something and see results
```bash
# These tasks are the fastest to learn. 'echo' and 'count-down' are very
# easy. run_eval_tasks.py will do most of the work to run all the jobs.
# Should take between 10 and 30 minutes.
# How many repetitions each experiment will run. In the paper, we use 25.
# Fewer reps means faster experiments, but noisier results.
REPS=25
# Extra description in the job names for these experiments. Use this description
# to distinguish between multiple runs of the same experiment.
DESC="demo"
# The tasks to run.
TASKS="reverse echo-second-seq"
# The model types and max NPE.
EXPS=( pg-20M topk-20M ga-20M rand-20M )
# Where training data is saved. This is chosen by launch_training.sh. Custom
# implementations of launch_training.sh may use different locations.
MODELS_DIR="/tmp/models"
# Run run_eval_tasks.py for each experiment name in EXPS.
for exp in "${EXPS[@]}"
do
./single_task/run_eval_tasks.py \
--exp "$exp" --tasks $TASKS --desc "$DESC" --reps $REPS
done
# During training or after completion, run this to aggregate results into a
# table. This is also useful for seeing how much progress has been made.
# Make sure the arguments here match the settings used above.
# Note: This can take a few minutes because it reads from every experiment
# directory.
bazel run single_task:aggregate_experiment_results -- \
--models_dir="$MODELS_DIR" \
--max_npe="20M" \
--task_list="$TASKS" \
--model_types="[('pg', '$DESC'), ('topk', '$DESC'), ('ga', '$DESC'),
('rand', '$DESC')]" \
--csv_file="/tmp/results_table.csv"
```
## Reproduce tuning results in paper
```bash
bazel build -c opt single_task:tune.par
# PG and TopK Tuning.
MAX_NPE=5000000
CONFIG="
env=c(task_cycle=['reverse-tune','remove-tune']),
agent=c(
algorithm='pg',
grad_clip_threshold=50.0,param_init_factor=0.5,entropy_beta=0.05,lr=1e-5,
optimizer='rmsprop',ema_baseline_decay=0.99,topk_loss_hparam=0.0,topk=0,
replay_temperature=1.0,alpha=0.0,eos_token=False),
timestep_limit=50,batch_size=64"
./single_task/launch_tuning.sh \
--job_name="iclr_pg_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=1 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="pg" \
--stop_on_success=true
./single_task/launch_tuning.sh \
--job_name="iclr_pg_topk_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=1 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="pg-topk" \
--fixed_hparams="topk=10" \
--stop_on_success=true
./single_task/launch_tuning.sh \
--job_name="iclr_topk_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=1 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="topk" \
--fixed_hparams="topk=10" \
--stop_on_success=true
# GA Tuning.
CONFIG="
env=c(task_cycle=['reverse-tune','remove-char-tune']),
agent=c(algorithm='ga'),
timestep_limit=50"
./single_task/launch_tuning.sh \
--job_name="iclr_ga_gridsearch.reverse-remove" \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_workers_per_tuner=25 \
--num_ps_per_tuner=0 \
--num_tuners=1 \
--num_repetitions=50 \
--hparam_space_type="ga" \
--stop_on_success=true
# Aggregate tuning results. Run after tuning jobs complete.
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_pg_gridsearch.reverse-remove"
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_pg_topk_gridsearch.reverse-remove"
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_topk_gridsearch.reverse-remove"
bazel run -c opt single_task:aggregate_tuning_results -- \
--tuning_dir="$MODELS_DIR/iclr_ga_gridsearch.reverse-remove"
```
## Reproduce eval results in paper
```bash
DESC="v0" # Description for each experiment. "Version 0" is a good default.
EXPS=( pg-5M topk-5M ga-5M rand-5M pg-20M topk-20M ga-20M rand-20M )
for exp in "${EXPS[@]}"
do
./single_task/run_eval_tasks.py \
--exp "$exp" --iclr_tasks --desc "$DESC"
done
```
## Run single experiment
```bash
EXP="topk-20M" # Learning algorithm + max-NPE
TASK="reverse" # Coding task
DESC="v0" # Description for each experiment. "Version 0" is a good default.
./single_task/run_eval_tasks.py \
--exp "$EXP" --task "$TASK" --desc "$DESC"
```
## Fetch eval results into a table
```bash
# These arguments should match the settings you used to run the experiments.
MODELS_DIR="/tmp/models"
MAX_NPE="20M"
DESC="v0" # Same description used in the experiments.
# MODEL_TYPES specifies each model type and the description used in their
# experiments.
MODEL_TYPES="[('pg', '$DESC'), ('topk', '$DESC'),
('ga', '$DESC'), ('rand', '$DESC')]"
TASKS="" # Empty string will default to all ICLR tasks.
# To specify custom task list, give task names separated by spaces. Example:
# TASKS="reverse remove-char"
bazel run single_task:aggregate_experiment_results -- \
--models_dir="$MODELS_DIR" \
--max_npe="$MAX_NPE" \
--task_list="$TASKS" \
--model_types="$MODEL_TYPES" \
--csv_file="/tmp/results_table.csv"
```
## Reproduce shortest code examples in paper
```bash
# Maximum NPE is higher here. We only do 1 repetition, and the algorithm needs
# time to simplify its solution.
MODELS_DIR="/tmp/models"
NPE="500M"
DESC="short-code"
./single_task/run_eval_tasks.py \
--exp "simpl-$NPE" --desc "$DESC" --iclr_tasks --reps 1
# Aggregate best code strings. Run after training completes.
TASKS="" # Empty string. Will default to all ICLR tasks.
bazel run single_task:aggregate_experiment_results -- \
--models_dir="$MODELS_DIR" \
--max_npe="$NPE" \
--task_list="$TASKS" \
--model_types="[('topk', '$DESC')]" \
--data=code
```
r"""This script crawls experiment directories for results and aggregates them.

Usage example:

MODELS_DIR="/tmp/models"
bazel run single_task:aggregate_experiment_results -- \
  --models_dir="$MODELS_DIR" \
  --max_npe="20M" \
  --task_list="add echo" \
  --model_types="[('topk', 'v0'), ('ga', 'v0')]" \
  --csv_file=/tmp/results_table.csv
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ast
from collections import namedtuple
import csv
import os
import re
from six.moves import StringIO
import sys
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf
from single_task import misc # brain coder
from single_task import results_lib # brain coder
DEFAULT_MODELS = [('pg', 'v0'), ('topk', 'v0'), ('ga', 'v0'), ('rand', 'v0')]
DEFAULT_TASKS = [
'reverse', 'remove-char', 'count-char', 'add', 'bool-logic', 'print-hello',
'echo-twice', 'echo-thrice', 'copy-reverse', 'zero-cascade', 'cascade',
'shift-left', 'shift-right', 'riffle', 'unriffle', 'middle-char',
'remove-last', 'remove-last-two', 'echo-alternating', 'echo-half', 'length',
'echo-second-seq', 'echo-nth-seq', 'substring', 'divide-2', 'dedup']
FLAGS = flags.FLAGS
flags.DEFINE_string(
'models_dir', '',
'Absolute path where results folders are found.')
flags.DEFINE_string(
'exp_prefix', 'bf_rl_iclr',
'Prefix for all experiment folders.')
flags.DEFINE_string(
'max_npe', '5M',
'String representation of max NPE of the experiments.')
flags.DEFINE_spaceseplist(
'task_list', DEFAULT_TASKS,
'List of task names separated by spaces. If empty string, defaults to '
'`DEFAULT_TASKS`. These are the rows of the results table.')
flags.DEFINE_string(
'model_types', str(DEFAULT_MODELS),
'String representation of a python list of 2-tuples, each a model_type + '
'job description pair. Descriptions allow you to choose among different '
'runs of the same experiment. These are the columns of the results table.')
flags.DEFINE_string(
'csv_file', '/tmp/results_table.csv',
'Where to write results table. Format is CSV.')
flags.DEFINE_enum(
'data', 'success_rates', ['success_rates', 'code'],
'What type of data to aggregate.')
def make_csv_string(table):
"""Convert 2D list to CSV string."""
s = StringIO.StringIO()
writer = csv.writer(s)
writer.writerows(table)
value = s.getvalue()
s.close()
return value
def process_results(metrics):
"""Extract useful information from given metrics.
Args:
metrics: List of results dicts. These should have been written to disk by
training jobs.
Returns:
Dict mapping stats names to values.
Raises:
    ValueError: If max_npe or max_global_repetitions values are inconsistent
across dicts in the `metrics` list.
"""
count = len(metrics)
success_count = 0
total_npe = 0 # Counting NPE across all runs.
success_npe = 0 # Counting NPE in successful runs only.
max_npe = 0
max_repetitions = 0
for metric_dict in metrics:
if not max_npe:
max_npe = metric_dict['max_npe']
elif max_npe != metric_dict['max_npe']:
raise ValueError(
'Invalid experiment. Different reps have different max-NPE settings.')
if not max_repetitions:
max_repetitions = metric_dict['max_global_repetitions']
elif max_repetitions != metric_dict['max_global_repetitions']:
raise ValueError(
'Invalid experiment. Different reps have different num-repetition '
'settings.')
if metric_dict['found_solution']:
success_count += 1
success_npe += metric_dict['npe']
total_npe += metric_dict['npe']
stats = {}
stats['max_npe'] = max_npe
stats['max_repetitions'] = max_repetitions
stats['repetitions'] = count
stats['successes'] = success_count # successful reps
stats['failures'] = count - success_count # failed reps
stats['success_npe'] = success_npe
stats['total_npe'] = total_npe
if success_count:
# Only successful runs counted.
stats['avg_success_npe'] = stats['success_npe'] / float(success_count)
else:
stats['avg_success_npe'] = 0.0
if count:
stats['success_rate'] = success_count / float(count)
stats['avg_total_npe'] = stats['total_npe'] / float(count)
else:
stats['success_rate'] = 0.0
stats['avg_total_npe'] = 0.0
return stats
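# Illustrative example (hypothetical numbers): two repetitions of one
# experiment, where only the first rep found a solution.
#   metrics = [
#       dict(max_npe=5000000, max_global_repetitions=2,
#            found_solution=True, npe=20000),
#       dict(max_npe=5000000, max_global_repetitions=2,
#            found_solution=False, npe=5000000)]
#   stats = process_results(metrics)
#   # stats['success_rate'] == 0.5, stats['avg_success_npe'] == 20000.0,
#   # stats['avg_total_npe'] == (20000 + 5000000) / 2.0 == 2510000.0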
ProcessedResults = namedtuple('ProcessedResults', ['metrics', 'processed'])
def get_results_for_experiment(
models_dir, task_name, model_type='pg', max_npe='5M', desc='v0',
name_prefix='bf_rl_paper', extra_desc=''):
"""Get and process results for a given experiment.
An experiment is a set of runs with the same hyperparameters and environment.
It is uniquely specified by a (task_name, model_type, max_npe) triple, as
well as an optional description.
  We assume that each experiment has a folder with the same name as the job that
  ran the experiment. The name is computed by
  "%name_prefix%.%desc%.%model_type%-%max_npe%_%task_name%".
Args:
models_dir: Parent directory containing experiment folders.
task_name: String name of task (the coding env). See code_tasks.py or
      run_eval_tasks.py.
model_type: Name of the algorithm, such as 'pg', 'topk', 'ga', 'rand'.
max_npe: String SI unit representation of the maximum NPE threshold for the
experiment. For example, "5M" means 5 million.
desc: Description.
name_prefix: Prefix of job names. Normally leave this as default.
extra_desc: Optional extra description at the end of the job name.
Returns:
ProcessedResults namedtuple instance, containing
metrics: Raw dicts read from disk.
processed: Stats computed by `process_results`.
Raises:
ValueError: If max_npe in the metrics does not match NPE in the experiment
folder name.
"""
folder = name_prefix + '.{0}.{1}-{2}_{3}'.format(desc, model_type, max_npe,
task_name)
if extra_desc:
folder += '.' + extra_desc
results = results_lib.Results(os.path.join(models_dir, folder))
metrics, _ = results.read_all()
processed = process_results(metrics)
if (not np.isclose(processed['max_npe'], misc.si_to_int(max_npe))
and processed['repetitions']):
raise ValueError(
'Invalid experiment. Max-NPE setting does not match expected max-NPE '
'in experiment name.')
return ProcessedResults(metrics=metrics, processed=processed)
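# For example, with the defaults above, the experiment
# (task_name='reverse', model_type='topk', max_npe='20M', desc='v0')
# is read from the folder "bf_rl_paper.v0.topk-20M_reverse".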
BestCodeResults = namedtuple(
'BestCodeResults',
['code', 'reward', 'npe', 'folder', 'finished', 'error'])
class BestCodeResultError(object):
success = 0
no_solution_found = 1
experiment_does_not_exist = 2
def get_best_code_for_experiment(
    models_dir, task_name, model_type='pg', max_npe='5M', desc='v0',
name_prefix='bf_rl_paper', extra_desc=''):
"""Like `get_results_for_experiment`, but fetches the code solutions."""
folder = name_prefix + '.{0}.{1}-{2}_{3}'.format(desc, model_type, max_npe,
task_name)
if extra_desc:
folder += '.' + extra_desc
log_dir = os.path.join(models_dir, folder, 'logs')
  search_regex = r'^solutions_[0-9]+\.txt$'
try:
all_children = tf.gfile.ListDirectory(log_dir)
except tf.errors.NotFoundError:
return BestCodeResults(
code=None, reward=0.0, npe=0, folder=folder, finished=False,
error=BestCodeResultError.experiment_does_not_exist)
solution_files = [
fname for fname in all_children if re.search(search_regex, fname)]
max_reward = 0.0
npe = 0
best_code = None
for fname in solution_files:
with tf.gfile.FastGFile(os.path.join(log_dir, fname), 'r') as reader:
results = [ast.literal_eval(entry) for entry in reader]
for res in results:
if res['reward'] > max_reward:
best_code = res['code']
max_reward = res['reward']
npe = res['npe']
error = (
BestCodeResultError.success if best_code
else BestCodeResultError.no_solution_found)
try:
# If there is a status.txt file, check if it contains the status of the job.
with tf.gfile.FastGFile(os.path.join(log_dir, 'status.txt'), 'r') as f:
# Job is done, so mark this experiment as finished.
finished = f.read().lower().strip() == 'done'
except tf.errors.NotFoundError:
# No status file has been written, so the experiment is not done. No need to
# report an error here, because we do not require that experiment jobs write
# out a status.txt file until they have finished.
finished = False
return BestCodeResults(
code=best_code, reward=max_reward, npe=npe, folder=folder,
finished=finished, error=error)
def make_results_table(
models=None,
tasks=None,
max_npe='5M',
name_prefix='bf_rl_paper',
extra_desc='',
models_dir='/tmp'):
"""Creates a table of results: algorithm + version by tasks.
Args:
models: The table columns. A list of (algorithm, desc) tuples.
tasks: The table rows. List of task names.
max_npe: String SI unit representation of the maximum NPE threshold for the
experiment. For example, "5M" means 5 million. All entries in the table
share the same max-NPE.
name_prefix: Name prefix used in logging directory for the experiment.
extra_desc: Extra description added to name of logging directory for the
experiment.
models_dir: Parent directory containing all experiment folders.
Returns:
A 2D list holding the table cells.
"""
if models is None:
models = DEFAULT_MODELS
if tasks is None:
tasks = DEFAULT_TASKS
model_results = {}
for model_type, desc in models:
model_results[model_type] = {
tname: get_results_for_experiment(
models_dir, tname, model_type, max_npe, desc,
name_prefix=name_prefix, extra_desc=extra_desc
).processed
for tname in tasks}
def info(stats):
return [str(stats['repetitions']),
'%.2f' % stats['success_rate'],
str(int(stats['avg_total_npe']))]
rows = [['max NPE: ' + max_npe]
+ misc.flatten([['{0} ({1})'.format(m, d), '', '']
for m, d in models])]
rows.append(
[''] + misc.flatten([['reps', 'success rate', 'avg NPE']
for _ in models]))
for tname in tasks:
rows.append(
[tname]
+ misc.flatten([info(model_results[model][tname])
for model, _ in models]))
return rows
def print_results_table(results_table):
"""Print human readable results table to stdout."""
print('')
print('=== Results Table ===')
print('Format: # reps [success rate, avg total NPE]')
def info_str(info_row):
# num_runs (success_rate, avg_total_npe)
if not info_row[0]:
return '0'
return '%s [%s, %s]' % (str(info_row[0]).ljust(2), info_row[1], info_row[2])
nc = len(results_table[0]) # num cols
out_table = [
[results_table[0][0]] + [results_table[0][i] for i in range(1, nc, 3)]]
for row in results_table[2:]:
out_table.append([row[0]] + [info_str(row[i:i+3]) for i in range(1, nc, 3)])
nc = len(out_table[0]) # num cols
col_widths = [max(len(row[col]) for row in out_table) for col in range(nc)]
table_string = ''
for row in out_table:
table_string += ''.join(
[row[c].ljust(col_widths[c] + 2) for c in range(nc)]) + '\n'
print(table_string)
def main(argv):
del argv # Unused.
name_prefix = FLAGS.exp_prefix
print('Experiments prefix: %s' % name_prefix)
model_types = ast.literal_eval(FLAGS.model_types)
if FLAGS.data == 'success_rates':
results_table = make_results_table(
models=model_types, tasks=FLAGS.task_list, max_npe=FLAGS.max_npe,
models_dir=FLAGS.models_dir,
name_prefix=name_prefix, extra_desc='')
with tf.gfile.FastGFile(FLAGS.csv_file, 'w') as f:
f.write(make_csv_string(results_table))
print_results_table(results_table)
else:
# Best code
print('* = experiment is still running')
print('')
print('=== Best Synthesized Code ===')
for model_type, desc in model_types:
print('%s (%s)' % (model_type, desc))
sys.stdout.flush()
for tname in FLAGS.task_list:
res = get_best_code_for_experiment(
FLAGS.models_dir, tname, model_type, FLAGS.max_npe, desc,
name_prefix=name_prefix, extra_desc='')
unfinished_mark = '' if res.finished else ' *'
tname += unfinished_mark
if res.error == BestCodeResultError.success:
print(' %s' % tname)
print(' %s' % res.code)
print(' R=%.6f, NPE=%s' % (res.reward, misc.int_to_si(res.npe)))
elif res.error == BestCodeResultError.experiment_does_not_exist:
print(' Experiment does not exist. Check arguments.')
print(' Experiment folder: %s' % res.folder)
break
else:
print(' %s' % tname)
print(' (none)')
sys.stdout.flush()
if __name__ == '__main__':
app.run(main)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""After running tuning, use this script to aggregate the results.
Usage:
OUT_DIR="<my_tuning_dir>"
bazel run -c opt single_task:aggregate_tuning_results -- \
--alsologtostderr \
--tuning_dir="$OUT_DIR"
"""
import ast
import os
from absl import app
from absl import flags
import tensorflow as tf
FLAGS = flags.FLAGS
flags.DEFINE_string(
'tuning_dir', '',
'Absolute path where results tuning trial folders are found.')
def main(argv):
del argv # Unused.
try:
trial_dirs = tf.gfile.ListDirectory(FLAGS.tuning_dir)
except tf.errors.NotFoundError:
print('Tuning directory %s does not exist.' % (FLAGS.tuning_dir,))
return
metrics = []
for trial_dir in trial_dirs:
tuning_results_file = os.path.join(
FLAGS.tuning_dir, trial_dir, 'tuning_results.txt')
if tf.gfile.Exists(tuning_results_file):
with tf.gfile.FastGFile(tuning_results_file, 'r') as reader:
for line in reader:
          # ast.literal_eval cannot parse `nan`; map it to 0.0 first.
          metrics.append(ast.literal_eval(line.replace(': nan,', ': 0.0,')))
if not metrics:
print('No trials found.')
return
num_trials = [m['num_trials'] for m in metrics]
assert all(n == num_trials[0] for n in num_trials)
num_trials = num_trials[0]
print('Found %d completed trials out of %d' % (len(metrics), num_trials))
# Sort by objective descending.
sorted_trials = sorted(metrics, key=lambda m: -m['objective'])
for i, metrics in enumerate(sorted_trials):
hparams = metrics['hparams']
keys = sorted(hparams.keys())
print(
str(i).ljust(4) + ': '
+ '{0:.2f}'.format(metrics['objective']).ljust(10)
+ '['
+ ','.join(['{}={}'.format(k, hparams[k]).ljust(24) for k in keys])
+ ']')
if __name__ == '__main__':
app.run(main)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tasks for RL."""
import abc
import copy
import itertools
import random
from absl import logging
import numpy as np
from six.moves import xrange
from common import bf # brain coder
from common import reward as r # brain coder
from single_task import misc # brain coder
from single_task import test_tasks # brain coder
MAX_EXECUTION_STEPS = 5000
def make_task(task_name, override_kwargs=None, max_code_length=100,
require_correct_syntax=False,
do_code_simplification=False,
correct_bonus=2.0, code_length_bonus=1.0):
"""Make tasks with setting from paper."""
logging.info('Making paper-config task.')
n = 16 # Number of test cases.
task_mapping = {
'print-hello': (
PrintTask, dict(base=27, fixed_string=[8, 5, 12, 12, 15])),
'print': (PrintIntTask, dict(base=256, fixed_string=[1, 2, 3, 4, 5])),
'echo': (EchoTask, dict(base=27, min_length=1, max_length=6)),
'remove-char': (
RemoveCharTask, dict(base=256, n=n, min_len=1, max_len=6)),
'reverse': (
ReverseTask, dict(base=256, n=n, min_len=1, max_len=6)),
'reverse-tune': (
ReverseTaskV2, dict(base=256, reward_type='static-bylen')),
'remove-char-tune': (RemoveCharTaskV2, dict(base=27)),
'prefix': (CommonPrefixTask, dict(base=27)),
'find': (FindSubStrTask, dict(base=27)),
'sort3': (SortFixedTaskV2, dict(base=27, n=150, length=3)),
'count-char': (CountCharTaskV2, dict(n=n, max_len=6)),
'bool-logic': (BooleanLogicTask, dict()),
'add': (AddTask, dict(n=9)),
'echo-twice': (EchoTwiceTask, dict(n=n)),
'echo-thrice': (EchoThriceTask, dict(n=n)),
'copy-reverse': (CopyReverseTask, dict(n=n)),
'zero-cascade': (EchoZeroCascadeTask, dict(n=n)),
'cascade': (EchoCascadeTask, dict(n=n)),
'shift-left': (ShiftLeftTask, dict(n=n)),
'shift-right': (ShiftRightTask, dict(n=n)),
'riffle': (RiffleTask, dict(n=n)),
'unriffle': (UnriffleTask, dict(n=n)),
'middle-char': (MiddleCharTask, dict(n=n)),
'remove-last': (RemoveLastTask, dict(n=n)),
'remove-last-two': (RemoveLastTwoTask, dict(n=n)),
'echo-alternating': (EchoAlternatingTask, dict(n=n)),
'echo-half': (EchoHalfTask, dict(n=n)),
'length': (LengthTask, dict(n=n)),
'echo-second-seq': (EchoSecondSequenceTask, dict(n=n)),
'echo-nth-seq': (EchoNthSequenceTask, dict(n=n)),
'substring': (SubstringTask, dict(n=n)),
'divide-2': (Divide2Task, dict(n=n)),
'dedup': (DedupTask, dict(n=n)),
'remove-target-char': (RemoveTargetCharTask, dict(n=n)),
'list-index': (ListIndexTask, dict(n=n)),
'fib': (FibonacciTask, dict()),
'count-down': (BottlesOfBeerTask, dict()),
'split': (SplitTask, dict()),
'trim-left': (TrimLeftTask, dict()),
'circle-route': (
JudgeRouteCircleTask, dict(n=100, max_len=32)),
'multiply': (MultiplyTask, dict(n=100)),
'divmod': (DivModTask, dict(n=100)),
}
if task_name not in task_mapping:
# Test tasks.
if task_name == 'test-hill-climb':
return test_tasks.BasicTaskManager(test_tasks.HillClimbingTask())
raise ValueError('Unknown task type "%s"' % task_name)
task_cls, kwargs = task_mapping[task_name]
if override_kwargs:
if not isinstance(override_kwargs, dict):
raise ValueError(
          'override_kwargs must be a dict, got: %s' % override_kwargs)
kwargs.update(override_kwargs)
task = task_cls(**kwargs)
reward_fn = r.absolute_distance_reward
# reward_fn = r.absolute_mod_distance_reward
# reward_fn = r.absolute_log_distance_reward
logging.info('Using reward function: %s', reward_fn.__name__)
# We want reward with and without code simplification to be scaled the same
# way. Without code simplification, give the maximum code length bonus
# every time.
min_code_length = 0.0 if do_code_simplification else max_code_length
return MultiIOTaskManager(
task=task, correct_bonus=correct_bonus,
code_length_bonus=code_length_bonus,
max_code_length=max_code_length, min_code_length=min_code_length,
reward_fn=reward_fn, require_correct_syntax=require_correct_syntax)
def concat(lists):
  if not lists:
    return []
  l = list(lists[0])  # Copy so the caller's first list is not mutated.
  for k in lists[1:]:
    l += k
  return l
def concat_join(lists, sep):
  if not lists:
    return []
  l = list(lists[0])  # Copy so the caller's first list is not mutated.
  for k in lists[1:]:
    l += [sep] + k
  return l
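# Examples:
#   concat([[1], [2, 3]])            ==> [1, 2, 3]
#   concat_join([[1], [2], [3]], 0)  ==> [1, 0, 2, 0, 3]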
def clipped_linear(x, x0, y0, slope, y_range):
min_y, max_y = y_range
return min(max(slope * (x - x0) + y0, min_y), max_y)
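# Worked example, matching the code-length bonus in
# MultiIOTaskManager._score_code when max_code_length=100, min_code_length=0
# (so time_penalty=0.01): clipped_linear(x=40, x0=0, y0=1.0, slope=-0.01,
# y_range=(0.0, 1.0)) = min(max(-0.01 * 40 + 1.0, 0.0), 1.0) = 0.6.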
class MultiIOTaskManager(object):
"""Supports tasks which test the code with multiple I/O examples."""
def __init__(self, task, max_code_length=32, min_code_length=0,
max_execution_steps=MAX_EXECUTION_STEPS, correct_bonus=1.0,
code_length_bonus=1.0, failure_reward=-2.0, reward_fn=None,
require_correct_syntax=False):
assert isinstance(task, BaseTask)
self.task = task
self.max_code_length = max_code_length
self.min_code_length = min_code_length
self.max_execution_steps = max_execution_steps
self.require_correct_syntax = require_correct_syntax
self.correct_bonus = correct_bonus
self.code_length_bonus = code_length_bonus
self.failure_reward = failure_reward
self.time_penalty = (
1.0 / (max_code_length - min_code_length)
if max_code_length > min_code_length else 0.0)
if reward_fn is None:
self.reward_fn = r.absolute_distance_reward
else:
self.reward_fn = reward_fn
self.input_type = (
task.input_type if hasattr(task, 'input_type') else misc.IOType.integer)
self.output_type = (
task.output_type if hasattr(task, 'output_type')
else misc.IOType.integer)
self._compute_best_reward()
def _compute_best_reward(self):
io_seqs = self.task.make_io_set()
reward = 0.0
for _, output_seq in io_seqs:
reward += self.reward_fn(output_seq, output_seq, self.task.base)
reward += self.correct_bonus
reward += self.code_length_bonus # Bonus for shortest code.
self.best_reward = reward
self.good_reward = 0.75 * reward
logging.info('Known best reward: %.4f', self.best_reward)
def _score_batch(self, code_strings):
return [self._score_code(code) for code in code_strings]
def _score_code(self, code):
"""Run test cases on code and compute reward.
Args:
code: A single BF code string.
Returns:
misc.RewardInfo namedtuple instance containing reward and code execution
information, including inputs, expected outputs, code outputs, input
and output types, and reason for the reward obtained.
"""
# Get list of 2-tuples, each containing an input sequence and an output
# sequence.
io_seqs = self.task.make_io_set()
terminal_reward = 0.0
results = []
reason = 'correct'
for input_seq, output_seq in io_seqs:
eval_result = bf.evaluate(
code, input_buffer=input_seq, timeout=0.1,
max_steps=self.max_execution_steps,
base=self.task.base,
require_correct_syntax=self.require_correct_syntax)
result, success = eval_result.output, eval_result.success
if not success:
# Code execution timed out.
terminal_reward = self.failure_reward
results = []
reason = eval_result.failure_reason
break
else:
terminal_reward += self.reward_fn(result, output_seq, self.task.base)
if result == output_seq:
terminal_reward += self.correct_bonus # Bonus for correct answer.
# Only add additional reward for shorter code. Subtracting reward
# interferes with the main objective. Only optimize for length once
# any solution is found.
if self.min_code_length == self.max_code_length:
terminal_reward += self.code_length_bonus
else:
terminal_reward += self.code_length_bonus * clipped_linear(
x=len(code), x0=self.min_code_length, y0=1.0,
slope=-self.time_penalty, y_range=(0.0, 1.0))
# reason remains 'correct' if it is already
elif reason == 'correct':
reason = 'wrong'
results.append(result)
# Return list of rewards, one for each char in the code. All are 0 except
# for the terminal reward.
terminal_reward /= self.best_reward
return misc.RewardInfo(
episode_rewards=[0.0] * (len(code) - 1) + [terminal_reward],
input_case=misc.IOTuple(i for i, o in io_seqs),
correct_output=misc.IOTuple(o for i, o in io_seqs),
code_output=misc.IOTuple(results),
input_type=self.input_type,
output_type=self.output_type,
reason=reason)
def rl_batch(self, batch_size):
"""Produces list of reward functions. One for each program in the batch."""
return [self._score_code] * batch_size
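# Usage sketch (see code_tasks_test.py for tested calls): score a known BF
# solution for the 'reverse' task; ',[>,]+[,<.]' solves reverse.
#   task_manager = make_task('reverse')
#   reward_fn = task_manager.rl_batch(batch_size=1)[0]
#   terminal_reward = reward_fn(',[>,]+[,<.]').episode_rewards[-1]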
def conditional_overwrite(current_value, new_value, allowed_overwrite_values):
if current_value in allowed_overwrite_values:
return new_value
return current_value
class BaseTask(object):
"""A coding task.
All coding tasks should inherit this class.
"""
__metaclass__ = abc.ABCMeta
def __init__(self, base=256):
    self.base = base  # All tasks must set the integer base that they expect.
@abc.abstractmethod
def make_io_set(self):
"""Generate a set of test cases for the task.
Returns:
List of tuples, where each tuple is (input_case, output_case).
input_case and output_case are lists of integers.
"""
pass
# ==============================================================================
# ICLR tasks.
# ==============================================================================
class PrintTask(BaseTask):
"""Print string coding task.
Code needs to output a fixed string (given as a hyperparameter to the
task constructor). Program input is ignored.
"""
def __init__(self, base, fixed_string=None):
super(type(self), self).__init__()
self.base = base # base includes EOS
self.eos = 0
if fixed_string:
self.fixed_string = fixed_string
else:
self.fixed_string = [1, 2, 3, 0] # ABC<EOS>
self.min_length = self.max_length = len(self.fixed_string)
def make_io_set(self):
return [(list(), list(self.fixed_string))]
class RemoveCharTaskV2(BaseTask):
"""Remove character coding task (version 2).
Code needs to pipe input to output, but with all the 'A' (value 1) chars
removed. 'A' appears exactly once in each input.
Test cases are hard-coded.
"""
def __init__(self, base):
super(type(self), self).__init__()
self.base = base
self.eos = 0
self.remove_char = 1
assert base >= 27
def make_io_set(self):
rm = self.remove_char
return [
([rm, 0], [0]),
([20, rm, 0], [20, 0]),
([rm, 13, 0], [13, 0]),
([6, rm, 17, 0], [6, 17, 0]),
([rm, 11, 24, 0], [11, 24, 0]),
([2, 16, 21, rm, 0], [2, 16, 21, 0]),
([18, rm, 12, 26, 7, 0], [18, 12, 26, 7, 0]),
([9, 10, 22, rm, 4, 0], [9, 10, 22, 4, 0])]
class RemoveCharTask(BaseTask):
"""Remove character coding task.
Code needs to pipe input to output, but with all the 'A' (value 1) chars
removed. 'A' appears at least once in each input.
Test cases are dynamically generated, allowing for the number of test cases
to be a hyperparameter.
"""
def __init__(self, base, n, min_len, max_len):
super(type(self), self).__init__()
self.base = base
self.eos = 0
self.remove_char = 1
assert base >= 27
self._io_pairs = self._make_io_examples(n, min_len, max_len)
def _make_io_examples(self, n, min_len, max_len):
"""Generate test cases for the task."""
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = []
for _ in xrange(n):
length = rand.randrange(min_len, max_len + 1)
rm_char_pos = rand.randrange(0, length)
input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
input_seq[rm_char_pos] = self.remove_char
output_seq = list(input_seq)
del output_seq[rm_char_pos]
output_seq.append(0)
io_examples.append((input_seq, output_seq))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class ReverseTaskV2(BaseTask):
"""Reverse string coding task (version 2).
Code needs to pipe input to output, but in reverse order.
Stochastic test case = new test case randomly generated for every run of
`make_io_set`, i.e. different test cases every time code is scored.
Task supports different types of test cases:
rand-one: Code is scored on one stochastic test case.
rand-many: Code is scored on 5 stochastic test cases.
static-bylen: Code is scored on 5 static test cases. There is one test
case for string lengths 1 through 5.
rand-bylen: Code is scored on 5 stochastic test cases, where there is one
test case for string lengths 1 through 5.
"""
def __init__(self, base, reward_type):
super(type(self), self).__init__()
self.base = base # base includes EOS
assert base >= 27
self.eos = 0
self.io_pair_fn = {
# One random example at a time.
'rand-one': lambda: self._io_rand(1),
        # K random examples at a time (any lengths).
'rand-many': lambda: self._io_rand(5),
# Static examples, one for each length.
'static-bylen': self._io_static_by_len,
# Random examples, one for each length.
'rand-bylen': self._io_rand_by_len}[reward_type]
def _make_io_examples(self, sequences):
outputs = [list(i) for i in sequences]
for o in outputs:
o.reverse()
o.append(0)
inputs = [i + [0] for i in sequences]
return zip(inputs, outputs)
def _io_rand(self, k):
inputs = [(np.random.choice(26, random.randrange(1, 6)) + 1).tolist()
for _ in xrange(k)]
return self._make_io_examples(inputs)
def _io_rand_by_len(self, k=5):
inputs = [(np.random.choice(26, length) + 1).tolist()
for length in xrange(1, k + 1)]
return self._make_io_examples(inputs)
def _io_static_by_len(self):
return [
([7, 0], [7, 0]),
([6, 2, 0], [2, 6, 0]),
([5, 1, 10, 0], [10, 1, 5, 0]),
([8, 6, 5, 15, 0], [15, 5, 6, 8, 0]),
([10, 12, 5, 2, 7, 0], [7, 2, 5, 12, 10, 0])]
def make_io_set(self):
return self.io_pair_fn()
class ReverseTask(BaseTask):
"""Reverse string coding task.
Code needs to pipe input to output, but in reverse order.
Test cases are dynamically generated, allowing for the number of test cases
to be a hyperparameter.
"""
def __init__(self, base, n, min_len, max_len):
super(type(self), self).__init__()
self.base = base # base includes EOS
assert base >= 27
self.eos = 0
self._io_pairs = self._make_io_examples(n, min_len, max_len)
def _make_io_examples(self, n, min_len, max_len):
"""Generate test cases for the task."""
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = []
for _ in xrange(n):
length = rand.randrange(min_len, max_len + 1)
input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
output_seq = list(input_seq)
output_seq.reverse()
output_seq.append(0)
io_examples.append((input_seq, output_seq))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class CommonPrefixTask(BaseTask):
"""Common prefix coding task.
Code needs to output the common prefix between two input lists. Input lists
are variable length, where each list ends with a 0. A common prefix is a
sequence which both lists start with.
"""
def __init__(self, base):
super(type(self), self).__init__()
assert base >= 27
self.base = base
self.eos = 0
def make_io_set(self):
return [
([12, 24, 18, 0, 12, 5, 0], [12, 0]),
([1, 2, 3, 0, 1, 2, 17, 14, 0], [1, 2, 0]),
([15, 2, 1, 9, 2, 0, 15, 2, 1, 25, 8, 14, 0], [15, 2, 1, 0]),
([14, 9, 7, 8, 6, 16, 0, 14, 9, 7, 8, 8, 6, 8, 26, 0],
[14, 9, 7, 8, 0]),
([12, 4, 16, 22, 1, 17, 0, 12, 4, 16, 22, 1, 8, 10, 0],
[12, 4, 16, 22, 1, 0])]
class CountCharTask(BaseTask):
  """Count char coding task.
  Code must output the number of occurrences of character 'A' (value 1) in an
  input string. Test cases are hard-coded.
  """
  def __init__(self):
super(type(self), self).__init__()
self.base = 27
self.eos = 0
self.char = 1
self.input_type = misc.IOType.string
self.output_type = misc.IOType.integer
def make_io_set(self):
return [
([10, 0], [0]),
([1, 0], [1]),
([1, 1, 0], [2]),
([11, 1, 0], [1]),
([1, 24, 0], [1]),
([13, 6, 0], [0]),
([9, 2, 7, 0], [0]),
([1, 24, 11, 0], [1]),
([19, 1, 1, 0], [2]),
([1, 6, 1, 0], [2]),
([22, 16, 17, 9, 0], [0]),
([1, 1, 1, 19, 0], [3]),
([1, 1, 1, 1, 0], [4]),
([9, 4, 19, 11, 5, 0], [0]),
([24, 11, 26, 1, 15, 0], [1]),
([1, 1, 20, 1, 1, 0], [4]),
([1, 1, 1, 1, 1, 0], [5])]
class CountCharTaskV2(BaseTask):
"""Count char coding task (version 2).
  Code must output the number of occurrences of character 'A' (value 1) in an
input string.
Test cases are dynamically generated, allowing for the number of test cases
to be a hyperparameter.
"""
def __init__(self, n, max_len):
super(type(self), self).__init__()
self.base = 27
self.eos = 0
self.char = 1
self.other_chars = [c for c in xrange(self.base)
if c not in (self.eos, self.char)]
self.input_type = misc.IOType.string
self.output_type = misc.IOType.integer
self._io_pairs = self._make_io_examples(n, max_len)
def _make_io_examples(self, n, max_len):
"""Generate test cases for the task."""
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = []
io_examples.append(([10, 0], [0]))
io_examples.append(([1, 0], [1]))
io_examples.append(([1, 1, 0], [2]))
io_examples.append(([9, 4, 19, 11, 5, 0], [0]))
io_examples.append(([24, 11, 26, 1, 15, 0], [1]))
for _ in xrange(n - 5):
length = rand.randrange(2, max_len + 1)
num_chars = rand.randrange(0, max_len + 1)
input_seq = [self.char] * num_chars + [0] * (length - num_chars)
rand.shuffle(input_seq)
for i in xrange(len(input_seq)):
if not input_seq[i]:
input_seq[i] = self.other_chars[rand.randrange(len(self.other_chars))]
output_seq = [num_chars]
io_examples.append((input_seq, output_seq))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class AddTask(BaseTask):
"""Addition coding task.
Code needs to read in two integers and output their sum mod the BF base,
followed by a terminating 0.
"""
def __init__(self, n=16):
super(type(self), self).__init__()
self.base = 256
self.input_type = misc.IOType.integer
self.output_type = misc.IOType.integer
self._io_pairs = self._make_io_examples(n)
def _make_io_examples(self, n):
"""Generate test cases for the task."""
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = [
([4, 0], [4, 0]),
([0, 5], [5, 0]),
([1, 2], [3, 0]),
([67, 21], [88, 0]),
([55, 56], [111, 0]),
([128, 33], [161, 0]),
([221, 251], [216, 0]),
([130, 127], [1, 0]),
([255, 1], [0, 0])]
extra_examples = max(n - len(io_examples), 0)
for _ in xrange(extra_examples):
a = rand.randrange(256)
b = rand.randrange(256)
input_seq = [a, b]
output_seq = [(a + b) % 256, 0]
io_examples.append((input_seq, output_seq))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class BooleanLogicTask(BaseTask):
"""Boolean logic (truth table) coding task.
Code needs to memorize a boolean truth table. Specifically, it must encode a
mapping from triple of bools to a single bool.
"""
def __init__(self):
super(type(self), self).__init__()
self.base = 2
self.input_type = misc.IOType.boolean
self.output_type = misc.IOType.boolean
# X(~Z) + (~Y)(~Z) + (~X)YZ
self._truth_fn = (
lambda x, y, z: # pylint: disable=g-long-lambda
(x and not z) or (not y and not z) or (not x and y and z))
self._test_cases = [
([x, y, z], [int(self._truth_fn(x, y, z))])
for x, y, z in itertools.product(range(2), range(2), range(2))]
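    # For example, (x, y, z) = (0, 1, 1) satisfies the clause (~X)YZ, so its
    # target output is [1]; (1, 1, 1) satisfies no clause, so it maps to [0].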
def make_io_set(self):
return copy.deepcopy(self._test_cases)
# ------------------------------------------------------------------------------
# The following tasks are generated from known BF solutions. This guarantees
# that each task can be solved within the maximum code length, and maximum
# execution steps.
# ------------------------------------------------------------------------------
def default_input_fn_factory(min_length=1, max_length=6, base=256):
def _input_gen(rand):
l = rand.randrange(min_length, max_length + 1)
return [rand.randrange(base) for _ in xrange(l)]
return _input_gen
class KnownCodeBaseTask(BaseTask):
"""These tasks generate their test cases from a known BF solution.
This ensures that each task has a solution which is under the max character
length, and that it solves the test cases under the max number of execution
steps.
"""
def __init__(self, code_solution, make_input_fn, n=100, base=256,
max_steps=5000, seed=6849275409234):
super(KnownCodeBaseTask, self).__init__()
# Make sure known solution is less than the code length used in experiments.
assert len(code_solution) < 100
self.code_solution = code_solution
self.make_input_fn = make_input_fn
self.n = n
self.base = base
self.max_steps = max_steps
self.seed = seed
self._test_cases = list(self._test_case_generator(code_solution))
def _test_case_generator(self, code_solution):
rand = random.Random(self.seed)
for _ in xrange(self.n):
input_case = self.make_input_fn(rand)
result = bf.evaluate(
code_solution, input_buffer=input_case, max_steps=self.max_steps,
base=self.base, require_correct_syntax=False)
if not result.success:
raise RuntimeError(
'Program must succeed. Failed on input: %s' % input_case)
yield input_case, result.output
def make_io_set(self):
return copy.deepcopy(self._test_cases)
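# Sketch of defining a new task this way (hypothetical class; ',[.,]' is a
# standard BF echo loop):
#   class EchoKnownTask(KnownCodeBaseTask):
#     def __init__(self, **kwargs):
#       super(EchoKnownTask, self).__init__(
#           ',[.,]', default_input_fn_factory(), **kwargs)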
class EchoTwiceTask(KnownCodeBaseTask):
"""Echo twice."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,.[>,.]<[<]>[.>].',
default_input_fn_factory(),
**kwargs)
class EchoThriceTask(KnownCodeBaseTask):
"""Echo three times."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,.[>,.]<[<]>[.>].<[<]>[.>].',
default_input_fn_factory(),
**kwargs)
class CopyReverseTask(KnownCodeBaseTask):
"""Echo forwards, backwards, and then forwards again."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,.[>,.]<[.<].>[.>].',
default_input_fn_factory(),
**kwargs)
class EchoZeroCascadeTask(KnownCodeBaseTask):
"""Print k-th char with k zeros inbetween (1-indexed)."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
',[.>[->+>.<<]>+[-<+>]<<,]',
default_input_fn_factory(),
**kwargs)
class EchoCascadeTask(KnownCodeBaseTask):
"""Print k-th char k times (1-indexed)."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
',>>+<<[>>[-<+>]<[->+<<.>]>+<<,].',
default_input_fn_factory(base=20),
**kwargs)
class ShiftLeftTask(KnownCodeBaseTask):
"""Circulate shift input left."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
',>,[.,]<.,.',
default_input_fn_factory(),
**kwargs)
class ShiftRightTask(KnownCodeBaseTask):
"""Circular shift input right."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,[>,]<.[-]<[<]>[.>].',
default_input_fn_factory(),
**kwargs)
class RiffleTask(KnownCodeBaseTask):
"""Shuffle like a deck of cards.
  For input of length N, output values in the following index order:
  N-1, 0, N-2, 1, N-3, 2, ...
  For example, input [a, b, c, d] produces the output order [d, a, c, b].
"""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,[>,]<[.[-]<[<]>.[-]>[>]<]',
default_input_fn_factory(base=20, max_length=8),
**kwargs)
class UnriffleTask(KnownCodeBaseTask):
"""Inverse of riffle."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,[>,[.[-]],]<[.<].',
default_input_fn_factory(base=20, max_length=8),
**kwargs)
class MiddleCharTask(KnownCodeBaseTask):
"""Print middle char if length is odd, or 0 if even."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,[>,]<<[[>]<[,<[<]>,>[>]][>]<<]>.',
default_input_fn_factory(max_length=10),
**kwargs)
class RemoveLastTask(KnownCodeBaseTask):
"""Remove last character."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
',>,[[<.[-]>[-<+>]],].',
default_input_fn_factory(base=20),
**kwargs)
class RemoveLastTwoTask(KnownCodeBaseTask):
"""Remove last two characters."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
',>,>,[[<<.[-]>[-<+>]>[-<+>]],].',
default_input_fn_factory(base=10),
**kwargs)
class EchoAlternatingTask(KnownCodeBaseTask):
  """Print even-numbered chars first (0-indexed), then odd-numbered chars."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>,[.,>,]<<[<]>[.>].',
default_input_fn_factory(base=20, max_length=8),
**kwargs)
class EchoHalfTask(KnownCodeBaseTask):
"""Echo only first half of the input (round down when odd lengthed)."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>>+>,[[<]>+[>],]<[<]>-[-[-<<+>]<[>]>]<<[->+<]>[[>]>.,<+[<]>-].',
default_input_fn_factory(base=20, max_length=9),
**kwargs)
class LengthTask(KnownCodeBaseTask):
"""Print length of the input sequence."""
def __init__(self, **kwargs):
super(type(self), self).__init__(
'>+>,[[<]>+[>],]<[<]>-.',
default_input_fn_factory(max_length=14),
**kwargs)
class EchoSecondSequenceTask(KnownCodeBaseTask):
"""Echo second sequence. Sequences are separated by 0."""
def __init__(self, **kwargs):
def echo_second_gen(rand):
l = rand.randrange(1, 6)
x = [rand.randrange(256) for _ in xrange(l)]
l = rand.randrange(1, 6)
y = [rand.randrange(256) for _ in xrange(l)]
return x + [0] + y + [0]
super(type(self), self).__init__(
',[,],[.,].',
echo_second_gen,
**kwargs)
class EchoNthSequenceTask(KnownCodeBaseTask):
"""Echo n-th sequence (1-indexed). Sequences are separated by 0."""
def __init__(self, **kwargs):
def echo_nth_gen(rand):
k = rand.randrange(1, 7)
n = rand.randrange(1, k + 1)
x = []
for _ in xrange(k):
l = rand.randrange(0, 4)
x += [rand.randrange(256) for _ in xrange(l)] + [0]
return [n] + x
super(type(self), self).__init__(
',-[->,[,]<],[.,].',
echo_nth_gen,
**kwargs)
class SubstringTask(KnownCodeBaseTask):
"""Echo substring.
First two inputs are i and l, where i is the starting index (0-indexed)
and l is the length of the substring.
"""
def __init__(self, **kwargs):
def substring_gen(rand):
l = rand.randrange(2, 16)
i, j = sorted([rand.randrange(l), rand.randrange(l)])
n = j - i
x = [rand.randrange(256) for _ in xrange(l)] + [0]
return [i, n] + x
super(type(self), self).__init__(
'>,<,>[->,<]>,<<[->>.,<<]',
substring_gen,
**kwargs)
class Divide2Task(KnownCodeBaseTask):
"""Divide by 2 (integer floor division)."""
def __init__(self, **kwargs):
def int_input_gen(rand):
return [rand.randrange(256)]
super(type(self), self).__init__(
',[-[->>+<]>[<]<]>>.',
int_input_gen,
**kwargs)
class DedupTask(KnownCodeBaseTask):
"""Deduplicate adjacent duplicate chars."""
def __init__(self, **kwargs):
def dedup_input_gen(rand):
np_random = np.random.RandomState(rand.randrange(2147483647))
num_unique = rand.randrange(1, 5)
unique = np_random.choice(6, num_unique, replace=False) + 1
return [v for v in unique for _ in xrange(rand.randrange(1, 5))] + [0]
super(type(self), self).__init__(
'>>,.[[-<+<+>>],[-<->]<[[-<->]<.>]<[->>+<<]>>]',
dedup_input_gen,
**kwargs)
# ==============================================================================
# Extra tasks.
# ==============================================================================
class PrintIntTask(BaseTask):
"""Print integer coding task.
Code needs to output a fixed single value (given as a hyperparameter to the
task constructor). Program input is ignored.
"""
def __init__(self, base, fixed_string):
super(type(self), self).__init__()
self.base = base
self.eos = 0
self.fixed_string = fixed_string
self.input_type = misc.IOType.integer
self.output_type = misc.IOType.integer
def make_io_set(self):
return [(list(), list(self.fixed_string))]
class EchoTask(BaseTask):
"""Echo string coding task.
  Code needs to pipe input to output (without any modifications).
"""
def __init__(self, base, min_length=1, max_length=5):
super(type(self), self).__init__()
self.base = base # base includes EOS
self.eos = 0
self.min_length = min_length
self.max_length = max_length
self._io_pairs = self._make_io_examples(25)
def _make_io_examples(self, n):
# Test cases are fixed, but varied.
np_random = np.random.RandomState(1234567890)
io_pairs = []
for _ in xrange(n):
length = np_random.randint(self.min_length, self.max_length + 1)
input_seq = np_random.randint(1, self.base, length).tolist() + [self.eos]
output_seq = list(input_seq)
io_pairs.append((input_seq, output_seq))
return io_pairs
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class JudgeRouteCircleTask(BaseTask):
"""Judge route circle coding task.
Code needs to determine if the given route makes a closed loop.
Encoding: U = 1, R = 2, D = 3, L = 4.
Based on
https://leetcode.com/problems/judge-route-circle/description/
"""
base = 256
input_type = misc.IOType.integer
output_type = misc.IOType.integer
def __init__(self, n, max_len=12):
super(type(self), self).__init__()
self.eos = 0
self._io_pairs = self._make_io_examples(n, max_len)
self.input_type = misc.IOType.integer
self.output_type = misc.IOType.integer
def _solve(self, input_seq):
assert input_seq[-1] == 0
pos = [0, 0] # (x, y)
for move in input_seq[:-1]:
assert 0 < move <= 4
if move & 1 == 0: # Left or Right.
pos[0] += 3 - move # Add or subtract 1.
else:
pos[1] += 2 - move # Add or subtract 1.
return [int(not pos[0] and not pos[1])]
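  # Example: input [2, 4, 0] is R then L. Both moves are even, so they update
  # x: pos[0] += 3 - 2 = +1, then pos[0] += 3 - 4 = -1. The route returns to
  # the origin, so _solve returns [1].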
def _make_io_examples(self, n, max_len):
"""Generate test cases for the task."""
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = []
io_examples.append(([0], [1]))
io_examples.append(([4, 2, 0], [1]))
io_examples.append(([2, 4, 0], [1]))
io_examples.append(([3, 1, 0], [1]))
io_examples.append(([1, 3, 0], [1]))
io_examples.append(([1, 0], [0]))
io_examples.append(([2, 0], [0]))
io_examples.append(([3, 0], [0]))
io_examples.append(([4, 0], [0]))
for _ in xrange(n):
is_true = rand.randrange(2)
length = rand.randrange(1, max_len + 1)
if is_true:
# Make a true case.
length = (length >> 1) << 1 # Make even.
partition = (rand.randrange(length + 1) >> 1) << 1
a = partition >> 1
b = (length - partition) >> 1
counts = {1: a, 2: b, 3: a, 4: b}
else:
# Make a false case.
partitions = (
[0]
+ sorted([rand.randrange(length + 1) for _ in range(3)])
+ [length])
counts = {n: partitions[n] - partitions[n - 1] for n in range(1, 5)}
if counts[1] == counts[3] and counts[2] == counts[4]:
# By chance we sampled a true case. Make it false by exchanging
# one count between even and odd pairs.
base = 1 + 2 * rand.randrange(2)
a, b = (base, base + 1) if rand.randrange(2) else (base + 1, base)
if counts[a] == length or counts[b] == 0:
# If counts are at their extreme values, then swap who gets
# incremented and decremented.
a, b = b, a
counts[a] += 1
counts[b] -= 1
assert counts[a] <= length and counts[b] >= 0
assert sum(counts.values()) == length
input_seq = [n for n in xrange(1, 5) for _ in xrange(counts[n])]
rand.shuffle(input_seq)
input_seq += [0]
output_seq = self._solve(input_seq)
assert output_seq[0] == is_true
io_examples.append((input_seq, output_seq))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class MultiplyTask(BaseTask):
"""Multiply coding task.
  Code needs to multiply two ints.
Solution:
http://robl.co/brief-look-at-brainfuck/
,>,><<[->[->+>+<<]>>[-<<+>>]<<<]>>.
"""
base = 512
input_type = misc.IOType.integer
output_type = misc.IOType.integer
def __init__(self, n):
super(type(self), self).__init__()
self.eos = 0
self._io_pairs = self._make_io_examples(n)
self.input_type = misc.IOType.integer
self.output_type = misc.IOType.integer
def _factors(self, n):
return set(i for i in range(1, int(n**0.5) + 1) if n % i == 0)
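  # Note: only factors up to sqrt(n) are returned, e.g. _factors(12) is
  # {1, 2, 3}; the cofactor b = n // a below covers the larger factors.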
def _make_io_examples(self, n):
"""Generate test cases for the task."""
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = []
for _ in xrange(n):
n = rand.randrange(self.base)
if n == 0:
a, b = 0, rand.randrange(self.base)
else:
f = list(self._factors(n))
a = f[rand.randrange(len(f))]
b = n // a
if rand.randrange(2):
a, b = b, a
io_examples.append(([a, b], [n]))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class DivModTask(BaseTask):
"""Divmod coding task.
  Code needs to output the quotient and remainder of two ints.
Solution:
http://robl.co/brief-look-at-brainfuck/
,>,><<[>[->+>+<<]>[-<<-[>]>>>[<[-<->]<[>]>>[[-]>>+<]>-<]<<]>>>+<<[-<<+>>]<<<]>
>>>>[-<<<<<+>>>>>]<<<<<.>.>
"""
base = 512
input_type = misc.IOType.integer
output_type = misc.IOType.integer
def __init__(self, n):
super(type(self), self).__init__()
self.eos = 0
self._io_pairs = self._make_io_examples(n)
self.input_type = misc.IOType.integer
self.output_type = misc.IOType.integer
def _make_io_examples(self, n):
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = []
for _ in xrange(n):
n = rand.randrange(0, self.base)
k = rand.randrange(1, self.base) # Divisor cannot be 0.
io_examples.append(([n, k], list(divmod(n, k))))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class FibonacciTask(BaseTask):
  """Fibonacci coding task.
  Code needs to read an int n and output the n-th and (n+1)-th Fibonacci
  numbers, mod the base (256).
  """
  def __init__(self):
super(type(self), self).__init__()
self.base = 256
self.input_type = misc.IOType.integer
self.output_type = misc.IOType.integer
def make_io_set(self):
return [
([0], [0, 1]),
([1], [1, 1]),
([2], [1, 2]),
([3], [2, 3]),
([4], [3, 5]),
([5], [5, 8]),
([6], [8, 13]),
([7], [13, 21]),
([8], [21, 34]),
([9], [34, 55]),
([10], [55, 89]),
([11], [89, 144]),
([12], [144, 233]),
        ([13], [233, 121])]  # fib(14) = 377 wraps to 121 in base 256.
class FindSubStrTask(BaseTask):
"""Find sub-string coding task.
Code needs to output a bool: True if the input string contains a hard-coded
substring, 'AB' (values [1, 2]).
"""
def __init__(self, base):
super(type(self), self).__init__()
assert base >= 27
self.base = base
self.eos = 0
self.find_str = [1, 2]
self.input_type = misc.IOType.string
self.output_type = misc.IOType.boolean
def make_io_set(self):
return [
([1, 1, 23, 0], [0]),
([21, 3, 2, 0], [0]),
([2, 1, 19, 0], [0]),
([2, 24, 15, 3, 0], [0]),
([24, 6, 10, 16, 4, 0], [0]),
([1, 2, 12, 0], [1]),
([7, 1, 2, 0], [1]),
([1, 2, 11, 3, 0], [1]),
([1, 1, 2, 18, 0], [1]),
([7, 25, 1, 2, 0], [1]),
([3, 1, 2, 11, 8, 0], [1]),
([15, 16, 20, 1, 2, 0], [1])]
class SortFixedTask(BaseTask):
"""Sort list coding task.
Code needs to output a sorted input list. The task consists of lists of the
same length L, where L is provided to this task's constructor as a
hyperparameter.
"""
def __init__(self, base, length=3):
super(type(self), self).__init__()
assert base >= 27
self.base = base
self.eos = 0
self.length = length
assert length == 3 # More lengths will be supported.
def make_io_set(self):
if self.length == 3:
return [
([1, 20, 6], [1, 6, 20]),
([13, 6, 7], [6, 7, 13]),
([24, 2, 23], [2, 23, 24]),
([16, 12, 3], [3, 12, 16]),
([11, 24, 4], [4, 11, 24]),
([10, 1, 19], [1, 10, 19])]
class SortFixedTaskV2(BaseTask):
"""Sort list coding task (version 2).
Code needs to output a sorted input list. The task consists of lists of the
same length L, where L is provided to this task's constructor as a
hyperparameter.
Test cases are dynamically generated, allowing for the number of test cases
to be a hyperparameter.
"""
def __init__(self, base, n, length=3):
super(type(self), self).__init__()
assert base >= 27
self.base = base
self.eos = 0
self._io_pairs = self._make_io_examples(n, length)
self.input_type = misc.IOType.integer
self.output_type = misc.IOType.integer
def _make_io_examples(self, n, length):
rand = random.Random(6849275409234) # Test cases are fixed, but varied.
io_examples = []
for _ in xrange(n):
input_seq = [rand.randrange(1, self.base) for _ in xrange(length)]
output_seq = sorted(input_seq)
io_examples.append((input_seq, output_seq))
return io_examples
def make_io_set(self):
return copy.deepcopy(self._io_pairs)
class RemoveTargetCharTask(KnownCodeBaseTask):
"""Remove target character from string, where first input is the target.
Target can appear multiple times.
"""
def __init__(self, **kwargs):
def randrange_hole(rand, a, hole, b):
x = rand.randrange(a, b - 1)
if x >= hole:
return x + 1
return x
def remove_target_char_gen(rand):
char = rand.randrange(1, 6)
l = rand.randrange(1, 8)
input_seq = [randrange_hole(rand, 1, char, 256) for _ in xrange(l)]
idx = range(l)
rand.shuffle(idx)
num_targets = rand.randrange(0, l)
for pos in idx[:num_targets]:
input_seq[pos] = char
return [char] + input_seq + [0]
super(type(self), self).__init__(
',>>>,[<<<[->+>+<<]>>[->->+<<]>[>[-<+>]<.[-]]>[-]<<<[-<+>]>>,].',
remove_target_char_gen,
**kwargs)
class ListIndexTask(KnownCodeBaseTask):
"""Echo i-th value in the given list."""
def __init__(self, **kwargs):
def array_index_gen(rand):
l = rand.randrange(1, 16)
i = rand.randrange(l)
return [i] + [rand.randrange(256) for _ in xrange(l)] + [0]
super(type(self), self).__init__(
',[->,<]>,.',
array_index_gen,
**kwargs)
# ==============================================================================
# Tasks based on primaryobjects paper.
# ==============================================================================
def string2tokens(string):
return [ord(c) for c in string]
def stringlist2tokens(strings):
return [string2tokens(string) for string in strings]
def string2tokens_b27(string):
return [ord(c.lower()) - ord('a') + 1 for c in string]
def stringlist2tokens_b27(strings):
return [string2tokens_b27(string) for string in strings]
class BottlesOfBeerTask(BaseTask):
"""Bottles of beer coding task.
  This is a counting task. Code needs to read in an int N and then output
  every int from N down to 1, each followed by a 0.
"""
base = 256
input_type = misc.IOType.integer
output_type = misc.IOType.integer
def make_io_set(self):
return [
([1], [1, 0]),
([2], [2, 0, 1, 0]),
([3], [3, 0, 2, 0, 1, 0]),
([4], [4, 0, 3, 0, 2, 0, 1, 0]),
([5], [5, 0, 4, 0, 3, 0, 2, 0, 1, 0]),
([6], [6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0])]
class SplitTask(BaseTask):
"""Split coding task.
  Code needs to pipe input strings to output, but insert a separator (value
  27) after every 3 characters. This is in essence splitting the string into
  intervals of length 3.
"""
base = 28
input_type = misc.IOType.string
output_type = misc.IOType.integer
def _splicer(self, lst, insert, interval=3):
for i, item in enumerate(lst):
yield item
if (i + 1) % interval == 0 and i < len(lst) - 1:
yield insert
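  # Example: 'spaghetti' tokenizes to [19, 16, 1, 7, 8, 5, 20, 20, 9], and
  # list(self._splicer(..., 27)) yields
  # [19, 16, 1, 27, 7, 8, 5, 27, 20, 20, 9].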
def __init__(self):
super(type(self), self).__init__()
inputs = stringlist2tokens_b27(
['hello', 'orange', 'spaghetti', 'wins', 'one'])
targets = [list(self._splicer(i, 27)) for i in inputs]
self._test_cases = list(zip(inputs, targets))
def make_io_set(self):
return copy.deepcopy(self._test_cases)
class TrimLeftTask(BaseTask):
"""Trim left coding task.
Code needs to pipe input strings to output, but remove everything before the
first quotation char (").
"""
base = 256
input_type = misc.IOType.integer
output_type = misc.IOType.integer
def __init__(self):
super(type(self), self).__init__()
inputs = stringlist2tokens(
['a "inside" over', 'xy "test" rights', 'ca6 "foresting" service',
'abc"def"yz.', 'A"B"'])
targets = stringlist2tokens(
['"inside" over', '"test" rights', '"foresting" service', '"def"yz.',
'"B"'])
self._test_cases = list(zip(inputs, targets))
def make_io_set(self):
return copy.deepcopy(self._test_cases)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for code_tasks."""
import numpy as np
import tensorflow as tf
from single_task import code_tasks # brain coder
from single_task import defaults # brain coder
def pad(string, pad_length, pad_char):
return string + pad_char * (pad_length - len(string))
class CodeTasksTest(tf.test.TestCase):
def assertClose(self, a, b):
self.assertTrue(
np.isclose(a, b, atol=1e-4),
'Expecting approximately equal values. Got: %s, %s' % (a, b))
def testMultiIOTaskManager(self):
maxlen = 100
padchr = '['
task = code_tasks.make_paper_task(
'print', timestep_limit=maxlen, do_code_simplification=False)
reward_fns = task.rl_batch(1)
r = reward_fns[0]
self.assertClose(
r(pad('++++++++.---.+++++++...', maxlen, padchr)).episode_rewards[-1],
0.2444)
self.assertClose(
r(pad('++++++++.---.+++++++..+++.',
maxlen, padchr)).episode_rewards[-1],
1.0)
task = code_tasks.make_paper_task(
'print', timestep_limit=maxlen, do_code_simplification=True)
reward_fns = task.rl_batch(1)
r = reward_fns[0]
self.assertClose(
r('++++++++.---.+++++++...').episode_rewards[-1],
0.2444)
self.assertClose(
r('++++++++.---.+++++++..+++.').episode_rewards[-1],
0.935)
self.assertClose(
r(pad('++++++++.---.+++++++..+++.',
maxlen, padchr)).episode_rewards[-1],
0.75)
task = code_tasks.make_paper_task(
'reverse', timestep_limit=maxlen, do_code_simplification=False)
reward_fns = task.rl_batch(1)
r = reward_fns[0]
self.assertClose(
r(pad('>,>,>,.<.<.<.', maxlen, padchr)).episode_rewards[-1],
0.1345)
self.assertClose(
r(pad(',[>,]+[,<.]', maxlen, padchr)).episode_rewards[-1],
1.0)
task = code_tasks.make_paper_task(
'reverse', timestep_limit=maxlen, do_code_simplification=True)
reward_fns = task.rl_batch(1)
r = reward_fns[0]
self.assertClose(r('>,>,>,.<.<.<.').episode_rewards[-1], 0.1324)
self.assertClose(r(',[>,]+[,<.]').episode_rewards[-1], 0.9725)
self.assertClose(
r(pad(',[>,]+[,<.]', maxlen, padchr)).episode_rewards[-1],
0.75)
def testMakeTask(self):
maxlen = 100
padchr = '['
config = defaults.default_config_with_updates(
'env=c(config_for_iclr=False,fixed_string=[8,5,12,12,15])')
task = code_tasks.make_task(config.env, 'print', timestep_limit=maxlen)
reward_fns = task.rl_batch(1)
r = reward_fns[0]
self.assertClose(
r('++++++++.---.+++++++...').episode_rewards[-1],
0.2444)
self.assertClose(
r('++++++++.---.+++++++..+++.').episode_rewards[-1],
0.935)
self.assertClose(
r(pad('++++++++.---.+++++++..+++.',
maxlen, padchr)).episode_rewards[-1],
0.75)
def testKnownCodeBaseTask(self):
maxlen = 100
padchr = '['
task = code_tasks.make_paper_task(
'shift-left', timestep_limit=maxlen, do_code_simplification=False)
reward_fns = task.rl_batch(1)
r = reward_fns[0]
self.assertClose(
r(pad(',>,[.,]<.,.', maxlen, padchr)).episode_rewards[-1],
1.0)
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Manage data for pretraining and RL tasks."""
import ast
from collections import namedtuple
from absl import logging
from single_task import code_tasks # brain coder
RLBatch = namedtuple('RLBatch', ['reward_fns', 'batch_size', 'good_reward'])
class DataManager(object):
"""Interface between environment and model."""
def __init__(self, global_config, run_number=None,
do_code_simplification=False):
"""Constructs a DataManager.
Args:
global_config: A config_lib.Config instance containing all config. See
config in defaults.py.
run_number: Which run this is (of the same experiment). This should be set
when a task cycle is defined in the config. A task cycle is a list of
tasks to cycle through repeatedly, and the selected task is a function
of the run number, i.e. 0-th run, 1-st run, 2-nd run, etc...
This can be None if only a single task is set in the config.
do_code_simplification: When global_config.env.config_for_iclr is True,
use this option to create code simplification (code golf) tasks, vs
fixed length coding tasks. If True, a task with code simplification
reward will be constructed.
Raises:
ValueError: If global_config.env.task and global_config.env.task_cycle
are both set, or both not set. Only one should be given.
ValueError: If global_config.env.task_cycle is set but run_number is None.
"""
env_config = global_config.env
self.batch_size = global_config.batch_size
if env_config.task_cycle:
if env_config.task:
raise ValueError('Do not set both `task` and `task_cycle`.')
if run_number is None:
raise ValueError('Do not use task_cycle for single-run experiment.')
index = run_number % len(env_config.task_cycle)
self.task_name = env_config.task_cycle[index]
logging.info('run_number: %d, task_cycle index: %d', run_number, index)
logging.info('task_cycle: %s', env_config.task_cycle)
elif env_config.task:
self.task_name = env_config.task
else:
raise ValueError('Either `task` or `task_cycle` must be set.')
logging.info('Task for this run: "%s"', self.task_name)
logging.info('config_for_iclr=True; do_code_simplification=%s',
do_code_simplification)
self.rl_task = code_tasks.make_task(
task_name=self.task_name,
override_kwargs=ast.literal_eval(env_config.task_kwargs),
max_code_length=global_config.timestep_limit,
require_correct_syntax=env_config.correct_syntax,
do_code_simplification=do_code_simplification,
correct_bonus=env_config.task_manager_config.correct_bonus,
code_length_bonus=env_config.task_manager_config.code_length_bonus)
def sample_rl_batch(self):
"""Create reward functions from the current task.
Returns:
RLBatch namedtuple instance, which holds functions and information for
a minibatch of episodes.
* reward_fns: A reward function for each episode. Maps code string to
reward.
* batch_size: Number of episodes in this minibatch.
      * good_reward: Estimated reward threshold which indicates the algorithm
          is starting to solve the task. This is a heuristic that tries to
          reduce the amount of data written to disk.
"""
reward_fns = self.rl_task.rl_batch(self.batch_size)
return RLBatch(
reward_fns=reward_fns,
batch_size=self.batch_size,
good_reward=self.rl_task.good_reward)
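# Usage sketch (assumes a config with env.task set, as in defaults.py):
#   from single_task import defaults
#   config = defaults.default_config_with_updates('env=c(task="reverse")')
#   dm = DataManager(config, run_number=None)
#   batch = dm.sample_rl_batch()
#   reward_info = batch.reward_fns[0](',[>,]+[,<.]')  # known reverse solution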
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Default configuration for agent and environment."""
from absl import logging
from common import config_lib # brain coder
def default_config():
return config_lib.Config(
agent=config_lib.OneOf(
[config_lib.Config(
algorithm='pg',
policy_lstm_sizes=[35,35],
# Set value_lstm_sizes to None to share weights with policy.
value_lstm_sizes=[35,35],
obs_embedding_size=10,
grad_clip_threshold=10.0,
param_init_factor=1.0,
lr=5e-5,
pi_loss_hparam=1.0,
vf_loss_hparam=0.5,
entropy_beta=1e-2,
regularizer=0.0,
softmax_tr=1.0, # Reciprocal temperature.
optimizer='rmsprop', # 'adam', 'sgd', 'rmsprop'
topk=0, # Top-k unique codes will be stored.
topk_loss_hparam=0.0, # off policy loss multiplier.
# Uniformly sample this many episodes from topk buffer per batch.
# If topk is 0, this has no effect.
topk_batch_size=1,
# Exponential moving average baseline for REINFORCE.
# If zero, A2C is used.
# If non-zero, should be close to 1, like .99, .999, etc.
ema_baseline_decay=0.99,
# Whether agent can emit EOS token. If true, agent can emit EOS
# token which ends the episode early (ends the sequence).
# If false, agent must emit tokens until the timestep limit is
# reached. e.g. True means variable length code, False means fixed
# length code.
# WARNING: Making this false slows things down.
eos_token=False,
replay_temperature=1.0,
# Replay probability. 1 = always replay, 0 = always on policy.
alpha=0.0,
# Whether to normalize importance weights in each minibatch.
iw_normalize=True),
config_lib.Config(
algorithm='ga',
crossover_rate=0.99,
mutation_rate=0.086),
config_lib.Config(
algorithm='rand')],
algorithm='pg',
),
env=config_lib.Config(
# If True, task-specific settings are not needed.
task='', # 'print', 'echo', 'reverse', 'remove', ...
          task_cycle=[],  # If non-empty, repetitions will cycle through tasks.
task_kwargs='{}', # Python dict literal.
task_manager_config=config_lib.Config(
              # Reward received per test case. These bonuses will be scaled
              # based on how many test cases there are.
correct_bonus=2.0, # Bonus for code getting correct answer.
code_length_bonus=1.0), # Maximum bonus for short code.
correct_syntax=False,
),
batch_size=64,
timestep_limit=32)
def default_config_with_updates(config_string, do_logging=True):
if do_logging:
logging.info('Config string: "%s"', config_string)
config = default_config()
config.strict_update(config_lib.Config.parse(config_string))
if do_logging:
logging.info('Config:\n%s', config.pretty_str())
return config
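# A hedged example of overriding defaults with a config string (the c(...)
# syntax matches the test configs later in this document; the full grammar
# lives in common/config_lib.py):
#
#   config = default_config_with_updates(
#       'env=c(task="reverse"),agent=c(algorithm="ga"),batch_size=32')
#   # config.env.task == 'reverse'; config.batch_size == 32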
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Genetic algorithm for BF tasks.
Inspired by https://github.com/primaryobjects/AI-Programmer.
GA function code borrowed from https://github.com/DEAP/deap.
"""
from collections import namedtuple
import random
from absl import flags
from absl import logging
import numpy as np
from six.moves import xrange
from common import bf # brain coder
from common import utils # brain coder
from single_task import misc # brain coder
FLAGS = flags.FLAGS
# Caching the rewards of previously seen programs saves computation if a
# program appears again.
USE_REWARD_CACHE = True # Disable this if GA is using up too much memory.
GENES = bf.CHARS
MAX_PROGRAM_STEPS = 500
STEP_BONUS = True
ALPHANUM_CHARS = (
['_'] +
[chr(ord('a') + i_) for i_ in range(26)] +
[chr(ord('A') + i_) for i_ in range(26)] +
[chr(ord('0') + i_) for i_ in range(10)])
Result = namedtuple(
'Result',
['reward', 'inputs', 'code_outputs', 'target_outputs', 'type_in',
'type_out', 'base', 'correct'])
class IOType(object):
string = 'string'
integer = 'integer'
class CustomType(object):
def __init__(self, to_str_fn):
self.to_str_fn = to_str_fn
def __call__(self, obj):
return self.to_str_fn(obj)
def tokens_list_repr(tokens, repr_type, base):
"""Make human readable representation of program IO."""
if isinstance(repr_type, CustomType):
return repr_type(tokens)
elif repr_type == IOType.string:
chars = (
[ALPHANUM_CHARS[t] for t in tokens] if base < len(ALPHANUM_CHARS)
else [chr(t) for t in tokens])
return ''.join(chars)
elif repr_type == IOType.integer:
return str(tokens)
  raise ValueError('No such representation type "%s"' % repr_type)
def io_repr(result):
"""Make human readable representation of test cases."""
inputs = ','.join(
tokens_list_repr(tokens, result.type_in, result.base)
for tokens in result.inputs)
code_outputs = ','.join(
tokens_list_repr(tokens, result.type_out, result.base)
for tokens in result.code_outputs)
target_outputs = ','.join(
tokens_list_repr(tokens, result.type_out, result.base)
for tokens in result.target_outputs)
return inputs, target_outputs, code_outputs
def make_task_eval_fn(task_manager):
"""Returns a wrapper that converts an RL task into a GA task.
Args:
    task_manager: A task manager object from code_tasks.py.
Returns:
    A function that takes as input a single list of code chars, and outputs
a Result namedtuple instance containing the reward and information about
code execution.
"""
def to_data_list(single_or_tuple):
if isinstance(single_or_tuple, misc.IOTuple):
return list(single_or_tuple)
return [single_or_tuple]
def to_ga_type(rl_type):
if rl_type == misc.IOType.string:
return IOType.string
return IOType.integer
# Wrapper function.
def evalbf(bf_chars):
result = task_manager._score_code(''.join(bf_chars))
reward = sum(result.episode_rewards)
correct = result.reason == 'correct'
return Result(
reward=reward,
inputs=to_data_list(result.input_case),
code_outputs=to_data_list(result.code_output),
target_outputs=to_data_list(result.correct_output),
type_in=to_ga_type(result.input_type),
type_out=to_ga_type(result.output_type),
correct=correct,
base=task_manager.task.base)
return evalbf
def debug_str(individual, task_eval_fn):
res = task_eval_fn(individual)
input_str, target_output_str, code_output_str = io_repr(res)
return (
''.join(individual) +
' | ' + input_str +
' | ' + target_output_str +
' | ' + code_output_str +
' | ' + str(res.reward) +
' | ' + str(res.correct))
def mutate_single(code_tokens, mutation_rate):
"""Mutate a single code string.
Args:
code_tokens: A string/list/Individual of BF code chars. Must end with EOS
symbol '_'.
mutation_rate: Float between 0 and 1 which sets the probability of each char
being mutated.
Returns:
An Individual instance containing the mutated code string.
Raises:
ValueError: If `code_tokens` does not end with EOS symbol.
"""
if len(code_tokens) <= 1:
return code_tokens
  if code_tokens[-1] == '_':
    cs = list(code_tokens[:-1])
    eos = ['_']
  else:
    # Do this check to ensure that the code strings have not been corrupted.
    raise ValueError('`code_tokens` must end with EOS symbol.')
mutated = False
for pos in range(len(cs)):
if random.random() < mutation_rate:
mutated = True
new_char = GENES[random.randrange(len(GENES))]
x = random.random()
if x < 0.25 and pos != 0 and pos != len(cs) - 1:
# Insertion mutation.
if random.random() < 0.50:
# Shift up.
cs = cs[:pos] + [new_char] + cs[pos:-1]
else:
# Shift down.
cs = cs[1:pos] + [new_char] + cs[pos:]
elif x < 0.50:
# Deletion mutation.
if random.random() < 0.50:
# Shift down.
cs = cs[:pos] + cs[pos + 1:] + [new_char]
else:
# Shift up.
cs = [new_char] + cs[:pos] + cs[pos + 1:]
elif x < 0.75:
# Shift rotate mutation (position invariant).
if random.random() < 0.50:
# Shift down.
cs = cs[1:] + [cs[0]]
else:
# Shift up.
cs = [cs[-1]] + cs[:-1]
else:
# Replacement mutation.
cs = cs[:pos] + [new_char] + cs[pos + 1:]
assert len(cs) + len(eos) == len(code_tokens)
if mutated:
return Individual(cs + eos)
else:
return Individual(code_tokens)
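# Illustrative sketch of a single mutation call (not called anywhere; the
# trailing '_' EOS symbol is required by `mutate_single`):
def _mutate_single_example():
  random.seed(0)  # Seeded only to make this sketch reproducible.
  parent = Individual(list('++[->+<].') + ['_'])
  child = mutate_single(parent, mutation_rate=0.1)
  # `child` has the same length as `parent` and still ends with '_'.
  return child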
def crossover(parent1, parent2):
"""Performs crossover mating between two code strings.
Crossover mating is where a random position is selected, and the chars
after that point are swapped. The resulting new code strings are returned.
Args:
parent1: First code string.
parent2: Second code string.
Returns:
A 2-tuple of children, i.e. the resulting code strings after swapping.
"""
max_parent, min_parent = (
(parent1, parent2) if len(parent1) > len(parent2)
else (parent2, parent1))
pos = random.randrange(len(max_parent))
if pos >= len(min_parent):
child1 = max_parent[:pos]
child2 = min_parent + max_parent[pos:]
else:
child1 = max_parent[:pos] + min_parent[pos:]
child2 = min_parent[:pos] + max_parent[pos:]
return Individual(child1), Individual(child2)
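# A tiny worked example of `crossover`: with parent1 = list('abcdef') and
# parent2 = list('xyz'), a draw of pos = 4 satisfies pos >= len(min_parent),
# so child1 = list('abcd') and child2 = list('xyzef'). Together the children
# contain exactly the parents' chars, split at the crossover point.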
def _make_even(n):
"""Return largest even integer less than or equal to `n`."""
return (n >> 1) << 1
def mutate_and_crossover(population, mutation_rate, crossover_rate):
"""Take a generational step over a population.
Transforms population of parents into population of children (of the same
size) via crossover mating and then mutation on the resulting children.
Args:
population: Parent population. A list of Individual objects.
mutation_rate: Probability of mutation. See `mutate_single`.
crossover_rate: Probability that two parents will mate.
Returns:
Child population. A list of Individual objects.
"""
children = [None] * len(population)
for i in xrange(0, _make_even(len(population)), 2):
p1 = population[i]
p2 = population[i + 1]
if random.random() < crossover_rate:
p1, p2 = crossover(p1, p2)
c1 = mutate_single(p1, mutation_rate)
c2 = mutate_single(p2, mutation_rate)
children[i] = c1
children[i + 1] = c2
if children[-1] is None:
children[-1] = population[-1]
return children
def ga_loop(population, cxpb, mutpb, ngen, task_eval_fn, halloffame=None,
checkpoint_writer=None):
"""A bare bones genetic algorithm.
Similar to chapter 7 of Back, Fogel and Michalewicz, "Evolutionary
Computation 1 : Basic Algorithms and Operators", 2000.
Args:
population: A list of individuals.
cxpb: The probability of mating two individuals.
mutpb: The probability of mutating a gene.
    ngen: The number of generations. Unlimited if zero.
task_eval_fn: A python function which maps an Individual to a Result
namedtuple.
halloffame: (optional) a utils.MaxUniquePriorityQueue object that will be
used to aggregate the best individuals found during search.
checkpoint_writer: (optional) an object that can save and load populations.
Needs to have `write`, `load`, and `has_checkpoint` methods. Used to
periodically save progress. In event of a restart, the population will
be loaded from disk.
Returns:
GaResult namedtuple instance. This contains information about the GA run,
including the resulting population, best reward (fitness) obtained, and
the best code string found.
"""
has_checkpoint = False
if checkpoint_writer and checkpoint_writer.has_checkpoint():
try:
gen, population, halloffame = checkpoint_writer.load()
except EOFError: # Data was corrupted. Start over.
pass
else:
has_checkpoint = True
logging.info(
'Loaded population from checkpoint. Starting at generation %d', gen)
# Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in population if not ind.fitness.valid]
for ind in invalid_ind:
        ind.fitness.values = (task_eval_fn(ind).reward,)
for _, ind in halloffame.iter_in_order():
        ind.fitness.values = (task_eval_fn(ind).reward,)
if not has_checkpoint:
# Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in population if not ind.fitness.valid]
for ind in invalid_ind:
      ind.fitness.values = (task_eval_fn(ind).reward,)
if halloffame is not None:
for ind in population:
halloffame.push(ind.fitness.values, tuple(ind), ind)
logging.info('Initialized new population.')
gen = 1
pop_size = len(population)
program_reward_cache = {} if USE_REWARD_CACHE else None
# Begin the generational process
while ngen == 0 or gen <= ngen:
# Select the next generation individuals
offspring = roulette_selection(population, pop_size - len(halloffame))
# Vary the pool of individuals
# offspring = varAnd(offspring, toolbox, cxpb, mutpb)
offspring = mutate_and_crossover(
offspring, mutation_rate=mutpb, crossover_rate=cxpb)
# Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
for ind in invalid_ind:
str_repr = ''.join(ind)
if program_reward_cache is not None and str_repr in program_reward_cache:
ind.fitness.values = (program_reward_cache[str_repr],)
else:
eval_result = task_eval_fn(ind)
ind.fitness.values = (eval_result.reward,)
if program_reward_cache is not None:
program_reward_cache[str_repr] = eval_result.reward
# Replace the current population by the offspring
population = list(offspring)
# Update the hall of fame with the generated individuals
if halloffame is not None:
for ind in population:
halloffame.push(ind.fitness.values, tuple(ind), ind)
# elitism
population.extend([ind for _, ind in halloffame.iter_in_order()])
if gen % 100 == 0:
top_code = '\n'.join([debug_str(ind, task_eval_fn)
for ind in topk(population, k=4)])
logging.info('gen: %d\nNPE: %d\n%s\n\n', gen, gen * pop_size, top_code)
best_code = ''.join(halloffame.get_max()[1])
res = task_eval_fn(best_code)
# Write population and hall-of-fame to disk.
if checkpoint_writer:
checkpoint_writer.write(gen, population, halloffame)
if res.correct:
logging.info('Solution found:\n%s\nreward = %s\n',
best_code, res.reward)
break
gen += 1
best_code = ''.join(halloffame.get_max()[1])
res = task_eval_fn(best_code)
return GaResult(
population=population, best_code=best_code, reward=res.reward,
solution_found=res.correct, generations=gen,
num_programs=gen * len(population),
max_generations=ngen, max_num_programs=ngen * len(population))
GaResult = namedtuple(
'GaResult',
['population', 'best_code', 'reward', 'generations', 'num_programs',
'solution_found', 'max_generations', 'max_num_programs'])
def reward_conversion(reward):
"""Convert real value into positive value."""
if reward <= 0:
return 0.05
return reward + 0.05
def roulette_selection(population, k):
"""Select `k` individuals with prob proportional to fitness.
Each of the `k` selections is independent.
Warning:
The roulette selection by definition cannot be used for minimization
or when the fitness can be smaller or equal to 0.
Args:
population: A list of Individual objects to select from.
k: The number of individuals to select.
Returns:
A list of selected individuals.
"""
fitnesses = np.asarray(
[reward_conversion(ind.fitness.values[0])
for ind in population])
assert np.all(fitnesses > 0)
sum_fits = fitnesses.sum()
chosen = [None] * k
for i in xrange(k):
u = random.random() * sum_fits
sum_ = 0
for ind, fitness in zip(population, fitnesses):
sum_ += fitness
if sum_ > u:
chosen[i] = Individual(ind)
break
    if chosen[i] is None:
chosen[i] = Individual(population[-1])
return chosen
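# Worked example of the selection distribution: raw fitnesses [0.0, 1.0, 3.0]
# become [0.05, 1.05, 3.05] after `reward_conversion`, so each independent
# draw selects the three individuals with probabilities of roughly
# [0.012, 0.253, 0.735] (each converted fitness divided by the sum 4.15).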
def make_population(make_individual_fn, n):
return [make_individual_fn() for _ in xrange(n)]
def best(population):
best_ind = None
for ind in population:
if best_ind is None or best_ind.fitness.values < ind.fitness.values:
best_ind = ind
return best_ind
def topk(population, k):
q = utils.MaxUniquePriorityQueue(k)
for ind in population:
q.push(ind.fitness.values, tuple(ind), ind)
return [ind for _, ind in q.iter_in_order()]
class Fitness(object):
def __init__(self):
self.values = ()
@property
def valid(self):
"""Assess if a fitness is valid or not."""
return bool(self.values)
class Individual(list):
def __init__(self, *args):
super(Individual, self).__init__(*args)
self.fitness = Fitness()
def random_individual(genome_size):
return lambda: Individual(np.random.choice(GENES, genome_size).tolist())
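# End-to-end sketch of the GA loop in this module (illustrative; a real
# `task_eval_fn` comes from `make_task_eval_fn`, and `ngen` is kept tiny here):
def _ga_loop_example(task_eval_fn):
  pop = make_population(random_individual(genome_size=20), n=10)
  hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame, as in ga_train.py.
  return ga_loop(pop, cxpb=0.95, mutpb=0.05, ngen=5,
                 task_eval_fn=task_eval_fn, halloffame=hof)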
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Genetic algorithm for BF tasks.
Also contains the uniform random search algorithm.
Inspired by https://github.com/primaryobjects/AI-Programmer.
GA function code borrowed from https://github.com/DEAP/deap.
"""
import cPickle
import os
import sys
from time import sleep
from absl import flags
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from common import utils # brain coder
from single_task import data # brain coder
from single_task import defaults # brain coder
from single_task import ga_lib # brain coder
from single_task import results_lib # brain coder
FLAGS = flags.FLAGS
def define_tuner_hparam_space(hparam_space_type):
"""Define tunable hparams for grid search."""
if hparam_space_type != 'ga':
raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
return {
'population_size': [10, 25, 50, 100, 500],
'crossover_rate': [0.2, 0.5, 0.7, 0.9, 0.95],
'mutation_rate': [0.01, 0.03, 0.05, 0.1, 0.15]}
def write_hparams_to_config(config, hparams, hparam_space_type):
"""Write hparams given by the tuner into the Config object."""
if hparam_space_type != 'ga':
raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
config.batch_size = hparams.population_size
config.agent.crossover_rate = hparams.crossover_rate
config.agent.mutation_rate = hparams.mutation_rate
class CheckpointWriter(object):
"""Manages loading and saving GA populations to disk.
This object is used by the genetic algorithm to save progress periodically
so that a recent population can be loaded from disk in the event of a restart.
"""
def __init__(self, checkpoint_dir, population_size):
self.checkpoint_file = os.path.join(checkpoint_dir, 'checkpoint.pickle')
self.population_size = population_size
def write(self, gen, population, halloffame):
"""Write GA state to disk.
Overwrites previous saved state.
Args:
gen: Generation number.
population: List of Individual objects.
halloffame: Hall-of-fame buffer. Typically a priority queue.
"""
raw = cPickle.dumps((gen, population, halloffame))
with tf.gfile.FastGFile(self.checkpoint_file, 'w') as f:
f.write(raw)
def load(self):
"""Loads GA state from disk.
Loads whatever is on disk, which will be whatever the most recent call
to `write` wrote.
Returns:
gen: Generation number.
population: List of Individual objects.
halloffame: Hall-of-fame buffer. Typically a priority queue.
"""
with tf.gfile.FastGFile(self.checkpoint_file, 'r') as f:
raw = f.read()
objs = cPickle.loads(raw)
# Validate data.
assert isinstance(objs, tuple) and len(objs) == 3, (
'Expecting a 3-tuple, but got %s instead.' % (objs,))
gen, population, halloffame = objs
assert isinstance(gen, int), (
'Expecting `gen` to be an integer, got %s' % (gen,))
assert (
isinstance(population, list)
and len(population) == self.population_size
), (
'Expecting `population` to be a list with size %d, got %s'
% (self.population_size, population))
assert halloffame is None or len(halloffame) == 2, (
'Expecting hall-of-fame object to have length two, got length %d'
% len(halloffame))
logging.info('Loaded pop from checkpoint file: "%s".',
self.checkpoint_file)
return gen, population, halloffame
def has_checkpoint(self):
"""Checks if a checkpoint exists on disk, and if so returns True."""
return tf.gfile.Exists(self.checkpoint_file)
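# Round-trip sketch (illustrative; assumes `checkpoint_dir` already exists and
# that `halloffame` matches the length-2 hall of fame checked in `load`):
def _checkpoint_round_trip_example(checkpoint_dir, population, halloffame):
  writer = CheckpointWriter(checkpoint_dir, population_size=len(population))
  writer.write(gen=7, population=population, halloffame=halloffame)
  assert writer.has_checkpoint()
  return writer.load()  # Returns (gen, population, halloffame).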
def run_training(config=None, tuner=None, logdir=None, trial_name=None, # pylint: disable=unused-argument
is_chief=True):
"""Do all training runs.
  This is the top level training function for the genetic algorithm and
  uniform random search models.
Run this from the main function.
Args:
config: config_lib.Config instance containing global config (agent and
environment hparams). If None, config will be parsed from FLAGS.config.
tuner: (unused) A tuner instance. Leave as None if not tuning.
logdir: Parent directory where all data from all runs will be written. If
None, FLAGS.logdir will be used.
trial_name: (unused) If tuning, set this to a unique string that identifies
this trial. If `tuner` is not None, this also must be set.
is_chief: True if this worker is the chief.
Returns:
List of results dicts which were written to disk. Each training run gets a
results dict. Results dict contains metrics, i.e. (name, value) pairs which
give information about the training run.
Raises:
ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
ValueError: If results dicts read from disk contain invalid data.
"""
if not config:
# If custom config is not given, get it from flags.
config = defaults.default_config_with_updates(FLAGS.config)
if not logdir:
logdir = FLAGS.logdir
if FLAGS.num_repetitions % FLAGS.num_workers != 0:
raise ValueError('Number of workers must divide number of repetitions')
num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
logging.info('Running %d reps globally.', FLAGS.num_repetitions)
logging.info('This worker will run %d local reps.', num_local_reps)
if FLAGS.max_npe:
max_generations = FLAGS.max_npe // config.batch_size
logging.info('Max samples per rep: %d', FLAGS.max_npe)
logging.info('Max generations per rep: %d', max_generations)
else:
max_generations = sys.maxint
logging.info('Running unlimited generations.')
assert FLAGS.num_workers > 0
logging.info('Starting experiment. Directory: "%s"', logdir)
results = results_lib.Results(logdir, FLAGS.task_id)
local_results_list = results.read_this_shard()
if local_results_list:
    if local_results_list[0]['max_npe'] != FLAGS.max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s'
          % (local_results_list[0]['max_npe'], FLAGS.max_npe))
    if local_results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. '
          'Was %s, now %s'
          % (local_results_list[0]['max_global_repetitions'],
             FLAGS.num_repetitions))
start_rep = len(local_results_list)
for rep in xrange(start_rep, num_local_reps):
global_rep = num_local_reps * FLAGS.task_id + rep
logging.info(
'Starting repetition: Rep = %d. (global rep = %d)',
rep, global_rep)
    # Saved data for each rep, like checkpoints, goes into a separate folder.
run_dir = os.path.join(logdir, 'run_%d' % global_rep)
if not tf.gfile.IsDirectory(run_dir):
tf.gfile.MakeDirs(run_dir)
checkpoint_writer = CheckpointWriter(run_dir,
population_size=config.batch_size)
data_manager = data.DataManager(config, run_number=global_rep)
task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)
if config.agent.algorithm == 'rand':
logging.info('Running random search.')
assert FLAGS.max_npe
result = run_random_search(
FLAGS.max_npe, run_dir, task_eval_fn, config.timestep_limit)
else:
assert config.agent.algorithm == 'ga'
logging.info('Running genetic algorithm.')
pop = ga_lib.make_population(
ga_lib.random_individual(config.timestep_limit),
n=config.batch_size)
hof = utils.MaxUniquePriorityQueue(2) # Hall of fame.
result = ga_lib.ga_loop(
pop,
cxpb=config.agent.crossover_rate, mutpb=config.agent.mutation_rate,
task_eval_fn=task_eval_fn,
ngen=max_generations, halloffame=hof,
checkpoint_writer=checkpoint_writer)
logging.info('Finished rep. Num gens: %d', result.generations)
results_dict = {
'max_npe': FLAGS.max_npe,
'batch_size': config.batch_size,
'max_batches': FLAGS.max_npe // config.batch_size,
'npe': result.num_programs,
'max_global_repetitions': FLAGS.num_repetitions,
'max_local_repetitions': num_local_reps,
'code_solution': result.best_code if result.solution_found else '',
'best_reward': result.reward,
'num_batches': result.generations,
'found_solution': result.solution_found,
'task': data_manager.task_name,
'global_rep': global_rep}
logging.info('results_dict: %s', results_dict)
results.append(results_dict)
if is_chief:
logging.info(
'Worker is chief. Waiting for all workers to finish so that results '
'can be reported to the tuner.')
global_results_list, shard_stats = results.read_all(
num_shards=FLAGS.num_workers)
while not all(s.finished for s in shard_stats):
logging.info(
'Still waiting on these workers: %s',
', '.join(
['%d (%d reps left)'
% (i, s.max_local_reps - s.num_local_reps_completed)
for i, s in enumerate(shard_stats)
if not s.finished]))
sleep(60)
global_results_list, shard_stats = results.read_all(
num_shards=FLAGS.num_workers)
logging.info(
'%d results obtained. Chief worker is exiting the experiment.',
len(global_results_list))
return global_results_list
def run_random_search(max_num_programs, checkpoint_dir, task_eval_fn,
timestep_limit):
"""Run uniform random search routine.
Randomly samples programs from a uniform distribution until either a valid
program is found, or the maximum NPE is reached. Results are written to disk
and returned.
Args:
max_num_programs: Maximum NPE (number of programs executed). If no solution
is found after this many programs are tried, the run is stopped and
considered a failure.
checkpoint_dir: Where to save state during the run.
task_eval_fn: Function that maps code string to result containing total
reward and info about success.
timestep_limit: Maximum length of code strings.
Returns:
ga_lib.GaResult namedtuple instance. This contains the best code and highest
reward found.
"""
checkpoint_file = os.path.join(checkpoint_dir, 'random_search.txt')
num_programs_seen = 0
found_solution = False
best_code = ''
best_reward = 0.0
if tf.gfile.Exists(checkpoint_file):
try:
with tf.gfile.FastGFile(checkpoint_file, 'r') as f:
lines = list(f)
num_programs_seen = int(lines[0])
found_solution = bool(int(lines[1]))
if found_solution:
best_code = lines[2]
best_reward = float(lines[3])
except: # pylint: disable=bare-except
pass
while not found_solution and num_programs_seen < max_num_programs:
if num_programs_seen % 1000 == 0:
logging.info('num_programs_seen = %d', num_programs_seen)
with tf.gfile.FastGFile(checkpoint_file, 'w') as f:
f.write(str(num_programs_seen) + '\n')
f.write(str(int(found_solution)) + '\n')
code = np.random.choice(ga_lib.GENES, timestep_limit).tolist()
res = task_eval_fn(code)
found_solution = res.correct
num_programs_seen += 1
if found_solution:
best_code = ''.join(code)
best_reward = res.reward
logging.info('num_programs_seen = %d', num_programs_seen)
logging.info('found solution: %s', found_solution)
with tf.gfile.FastGFile(checkpoint_file, 'w') as f:
f.write(str(num_programs_seen) + '\n')
f.write(str(int(found_solution)) + '\n')
if found_solution:
f.write(best_code + '\n')
f.write(str(best_reward) + '\n')
return ga_lib.GaResult(
population=[], best_code=best_code, reward=best_reward,
solution_found=found_solution, generations=num_programs_seen,
num_programs=num_programs_seen, max_generations=max_num_programs,
max_num_programs=max_num_programs)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for ga_train.
Tests that ga runs for a few generations without crashing.
"""
from absl import flags
import tensorflow as tf
from single_task import defaults # brain coder
from single_task import run # brain coder
FLAGS = flags.FLAGS
class GaTest(tf.test.TestCase):
def RunTrainingSteps(self, config_string, num_steps=10):
"""Run a few training steps with the given config.
Just check that nothing crashes.
Args:
config_string: Config encoded in a string. See
$REPO_PATH/common/config_lib.py
num_steps: Number of training steps to run. Defaults to 10.
"""
config = defaults.default_config_with_updates(config_string)
FLAGS.max_npe = num_steps * config.batch_size
FLAGS.logdir = tf.test.get_temp_dir()
FLAGS.config = config_string
run.main(None)
def testGeneticAlgorithm(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="ga"),'
'timestep_limit=40,batch_size=64')
def testUniformRandomSearch(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="rand"),'
'timestep_limit=40,batch_size=64')
if __name__ == '__main__':
tf.test.main()
#!/bin/bash
# Launches training jobs.
# Modify this file to launch workers with your preferred cloud API.
# The following implementation runs each worker as a subprocess on the local
# machine.
MODELS_DIR="/tmp/models"
# Get command line options.
OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_workers:,num_ps:,max_npe:,num_repetitions:,stop_on_success:" -- "$@")
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
eval set -- "$OPTS"
JOB_NAME="" # Name of the process and the logs directory.
CONFIG="" # Model and environment hparams.
# NUM_WORKERS: Number of workers to launch for this training job. If using
# neural networks, each worker will be 1 replica.
NUM_WORKERS=1
# NUM_PS: Number of parameter servers to launch for this training job. Only set
# this if using neural networks. For 1 worker, no parameter servers are needed.
# For more than 1 worker, at least 1 parameter server is needed to store the
# global model.
NUM_PS=0
# MAX_NPE: Maximum number of programs executed. Training will quit once this
# threshold is reached. If 0, the threshold is infinite.
MAX_NPE=0
NUM_REPETITIONS=1 # How many times to run this experiment.
STOP_ON_SUCCESS=true # Whether to halt training when a solution is found.
# Parse options into variables.
while true; do
case "$1" in
--job_name ) JOB_NAME="$2"; shift; shift ;;
--config ) CONFIG="$2"; shift; shift ;;
--num_workers ) NUM_WORKERS="$2"; shift; shift ;;
--num_ps ) NUM_PS="$2"; shift; shift ;;
--max_npe ) MAX_NPE="$2"; shift; shift ;;
--num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
--stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
-- ) shift; break ;;
* ) break ;;
esac
done
# Launch jobs.
# TODO: multi-worker RL training
LOGDIR="$MODELS_DIR/$JOB_NAME"
mkdir -p "$LOGDIR"
BIN_DIR="bazel-bin/single_task"
for (( i=0; i<NUM_WORKERS; i++))
do
# Expecting run.par to be built.
$BIN_DIR/run.par \
--alsologtostderr \
--config="$CONFIG" \
--logdir="$LOGDIR" \
--max_npe="$MAX_NPE" \
--num_repetitions="$NUM_REPETITIONS" \
--stop_on_success="$STOP_ON_SUCCESS" \
--task_id="$i" \
--num_workers="$NUM_WORKERS" \
--summary_tasks=1 \
2> "$LOGDIR/task_$i.log" & # Run as subprocess
echo "Launched task $i. Logs: $LOGDIR/task_$i.log"
done
# Use "pidof run.par" to find jobs.
# Kill with "pkill run.par"
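# Example invocation (illustrative; assumes run.par has been built and this
# script is run from the repository root):
#   single_task/run.sh --job_name=ga_reverse \
#     --config='env=c(task="reverse"),agent=c(algorithm="ga")' \
#     --num_workers=1 --max_npe=100000 --num_repetitions=1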
#!/bin/bash
# Launches tuning jobs.
# Modify this file to launch workers with your preferred cloud API.
# The following implementation runs each worker as a subprocess on the local
# machine.
MODELS_DIR="/tmp/models"
# Get command line options.
OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_tuners:,num_workers_per_tuner:,num_ps_per_tuner:,max_npe:,num_repetitions:,stop_on_success:,fixed_hparams:,hparam_space_type:" -- "$@")
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
eval set -- "$OPTS"
JOB_NAME="" # Name of the process and the logs directory.
CONFIG="" # Model and environment hparams.
# NUM_TUNERS: Number of tuning jobs to launch. Each tuning job can train a
# hparam combination. So more tuners means more hparams tried in parallel.
NUM_TUNERS=1
# NUM_WORKERS_PER_TUNER: Number of workers to launch for each tuning job. If
# using neural networks, each worker will be 1 replica.
NUM_WORKERS_PER_TUNER=1
# NUM_PS_PER_TUNER: Number of parameter servers to launch for this tuning job.
# Only set this if using neural networks. For 1 worker per tuner, no parameter
# servers are needed. For more than 1 worker per tuner, at least 1 parameter
# server per tuner is needed to store the global model for each tuner.
NUM_PS_PER_TUNER=0
# MAX_NPE: Maximum number of programs executed. Training will quit once this
# threshold is reached. If 0, the threshold is infinite.
MAX_NPE=0
NUM_REPETITIONS=25 # How many times to run this experiment.
STOP_ON_SUCCESS=true # Whether to halt training when a solution is found.
# FIXED_HPARAMS: Hold hparams fixed in the grid search. This reduces the search
# space.
FIXED_HPARAMS=""
# HPARAM_SPACE_TYPE: Specifies the hparam search space. See
# `define_tuner_hparam_space` functions defined in pg_train.py and ga_train.py.
HPARAM_SPACE_TYPE="pg"
# Parse options into variables.
while true; do
case "$1" in
--job_name ) JOB_NAME="$2"; shift; shift ;;
--config ) CONFIG="$2"; shift; shift ;;
--num_tuners ) NUM_TUNERS="$2"; shift; shift ;;
--num_workers_per_tuner ) NUM_WORKERS_PER_TUNER="$2"; shift; shift ;;
--num_ps_per_tuner ) NUM_PS_PER_TUNER="$2"; shift; shift ;;
--max_npe ) MAX_NPE="$2"; shift; shift ;;
--num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
--stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
--fixed_hparams ) FIXED_HPARAMS="$2"; shift; shift ;;
--hparam_space_type ) HPARAM_SPACE_TYPE="$2"; shift; shift ;;
-- ) shift; break ;;
* ) break ;;
esac
done
# Launch jobs.
# TODO: multi-worker RL training
LOGDIR="$MODELS_DIR/$JOB_NAME"
mkdir -p "$LOGDIR"
BIN_DIR="bazel-bin/single_task"
for ((tuner=0;tuner<NUM_TUNERS;tuner+=1)); do
for ((i=0;i<NUM_WORKERS_PER_TUNER;i++)); do
# Expecting tune.par to be built.
echo "$LOGDIR"
$BIN_DIR/tune.par \
--alsologtostderr \
--config="$CONFIG" \
--logdir="$LOGDIR" \
--max_npe="$MAX_NPE" \
--num_repetitions="$NUM_REPETITIONS" \
--stop_on_success="$STOP_ON_SUCCESS" \
--summary_tasks=1 \
--hparam_space="$HPARAM_SPACE_TYPE" \
--fixed_hparams="$FIXED_HPARAMS" \
--tuner_id=$tuner \
--num_tuners=$NUM_TUNERS \
2> "$LOGDIR/tuner_$tuner.task_$i.log" & # Run as subprocess
echo "Launched tuner $tuner, task $i. Logs: $LOGDIR/tuner_$tuner.task_$i.log"
done
done
# Use "pidof tune.par" to find jobs.
# Kill with "pkill tune.par"
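# Example invocation (illustrative; assumes tune.par has been built and this
# script is run from the repository root):
#   single_task/tune.sh --job_name=ga_tune_reverse \
#     --config='env=c(task="reverse"),agent=c(algorithm="ga")' \
#     --num_tuners=2 --num_workers_per_tuner=1 \
#     --hparam_space_type=ga --num_repetitions=25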
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Utilities specific to this project."""
from collections import namedtuple
from six import string_types
#####################
# BF-lang utilities #
#####################
BF_EOS_INT = 0 # Also used as SOS (start of sequence).
BF_EOS_CHAR = TEXT_EOS_CHAR = '_'
BF_LANG_INTS = range(1, 9)
BF_INT_TO_CHAR = [BF_EOS_CHAR, '>', '<', '+', '-', '[', ']', '.', ',']
BF_CHAR_TO_INT = dict([(c, i) for i, c in enumerate(BF_INT_TO_CHAR)])
RewardInfo = namedtuple('RewardInfo', ['episode_rewards', 'input_case',
'correct_output',
'code_output', 'reason', 'input_type',
'output_type'])
class IOType(object):
string = 'string'
integer = 'integer'
boolean = 'boolean'
class IOTuple(tuple):
pass
def flatten(lst):
return [item for row in lst for item in row]
def bf_num_tokens():
# BF tokens plus EOS.
return len(BF_INT_TO_CHAR)
def bf_char2int(bf_char):
"""Convert BF code char to int token."""
return BF_CHAR_TO_INT[bf_char]
def bf_int2char(bf_int):
"""Convert BF int token to code char."""
return BF_INT_TO_CHAR[bf_int]
def bf_tokens_to_string(bf_tokens, truncate=True):
"""Convert token list to code string. Will truncate at EOS token.
Args:
bf_tokens: Python list of ints representing the code string.
truncate: If true, the output string will end at the first EOS token.
If false, the entire token list is converted to string.
Returns:
String representation of the tokens.
Raises:
ValueError: If bf_tokens is not a python list.
"""
if not isinstance(bf_tokens, list):
raise ValueError('Only python list supported here.')
if truncate:
try:
eos_index = bf_tokens.index(BF_EOS_INT)
except ValueError:
eos_index = len(bf_tokens)
else:
eos_index = len(bf_tokens)
return ''.join([BF_INT_TO_CHAR[t] for t in bf_tokens[:eos_index]])
def bf_string_to_tokens(bf_string):
"""Convert string to token list. Will strip and append EOS token."""
tokens = [BF_CHAR_TO_INT[char] for char in bf_string.strip()]
tokens.append(BF_EOS_INT)
return tokens
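# Worked round trip using the token tables above ('+' -> 3, '-' -> 4,
# '.' -> 7, EOS -> 0):
#   bf_string_to_tokens('+-.') == [3, 4, 7, 0]  # EOS appended.
#   bf_tokens_to_string([3, 4, 7, 0]) == '+-.'  # Truncated at EOS.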
def tokens_to_text(tokens):
"""Convert token list to human readable text."""
return ''.join(
[TEXT_EOS_CHAR if t == 0 else chr(t - 1 + ord('A')) for t in tokens])
###################################
# Number representation utilities #
###################################
# https://en.wikipedia.org/wiki/Metric_prefix
si_magnitudes = {
'k': 1e3,
'm': 1e6,
'g': 1e9}
def si_to_int(s):
"""Convert string ending with SI magnitude to int.
Examples: 5K ==> 5000, 12M ==> 12000000.
Args:
s: String in the form 'xx..xP' where x is a digit and P is an SI prefix.
Returns:
Integer equivalent to the string.
"""
if isinstance(s, string_types) and s[-1].lower() in si_magnitudes.keys():
return int(int(s[:-1]) * si_magnitudes[s[-1].lower()])
return int(s)
def int_to_si(n):
"""Convert integer to string with SI magnitude.
`n` will be truncated.
  Examples: 5432 ==> 5K, 12345678 ==> 12M
Args:
n: Integer to represent as a string.
Returns:
String representation of `n` containing SI magnitude.
"""
m = abs(n)
sign = -1 if n < 0 else 1
if m < 1e3:
return str(n)
if m < 1e6:
return '{0}K'.format(sign*int(m / 1e3))
if m < 1e9:
return '{0}M'.format(sign*int(m / 1e6))
if m < 1e12:
return '{0}G'.format(sign*int(m / 1e9))
  return str(n)  # Keep the sign for very large magnitudes.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Language model agent.
Agent outputs code in a sequence just like a language model. Can be trained
as a language model or using RL, or a combination of the two.
"""
from collections import namedtuple
from math import exp
from math import log
import time
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from common import rollout as rollout_lib # brain coder
from common import utils # brain coder
from single_task import misc # brain coder
# Experiments in the ICLR 2018 paper used reduce_sum instead of reduce_mean for
# some losses. We make all losses batch-size independent, and multiply the
# changed losses by 64, which was the fixed batch size when the experiments
# were run. The loss hyperparameters still match what is reported in the paper.
MAGIC_LOSS_MULTIPLIER = 64
def rshift_time(tensor_2d, fill=misc.BF_EOS_INT):
"""Right shifts a 2D tensor along the time dimension (axis-1)."""
dim_0 = tf.shape(tensor_2d)[0]
fill_tensor = tf.fill([dim_0, 1], fill)
return tf.concat([fill_tensor, tensor_2d[:, :-1]], axis=1)
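# For example (with the default fill=misc.BF_EOS_INT, which is 0):
#   [[3, 1, 4],        [[0, 3, 1],
#    [1, 5, 9]]  --->   [0, 1, 5]]
# The last token of each row is dropped, so the shape is unchanged.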
def join(a, b):
# Concat a and b along 0-th dim.
if a is None or len(a) == 0: # pylint: disable=g-explicit-length-test
return b
if b is None or len(b) == 0: # pylint: disable=g-explicit-length-test
return a
return np.concatenate((a, b))
def make_optimizer(kind, lr):
if kind == 'sgd':
return tf.train.GradientDescentOptimizer(lr)
elif kind == 'adam':
return tf.train.AdamOptimizer(lr)
elif kind == 'rmsprop':
return tf.train.RMSPropOptimizer(learning_rate=lr, decay=0.99)
else:
raise ValueError('Optimizer type "%s" not recognized.' % kind)
class LinearWrapper(tf.contrib.rnn.RNNCell):
"""RNNCell wrapper that adds a linear layer to the output."""
def __init__(self, cell, output_size, dtype=tf.float32, suppress_index=None):
self.cell = cell
self._output_size = output_size
self._dtype = dtype
self._suppress_index = suppress_index
self.smallest_float = -2.4e38
def __call__(self, inputs, state, scope=None):
with tf.variable_scope(type(self).__name__):
outputs, state = self.cell(inputs, state, scope=scope)
logits = tf.matmul(
outputs,
tf.get_variable('w_output',
[self.cell.output_size, self.output_size],
dtype=self._dtype))
if self._suppress_index is not None:
# Replace the target index with -inf, so that it never gets selected.
batch_size = tf.shape(logits)[0]
logits = tf.concat(
[logits[:, :self._suppress_index],
tf.fill([batch_size, 1], self.smallest_float),
logits[:, self._suppress_index + 1:]],
axis=1)
return logits, state
@property
def output_size(self):
return self._output_size
@property
def state_size(self):
return self.cell.state_size
def zero_state(self, batch_size, dtype):
return self.cell.zero_state(batch_size, dtype)
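# Sketch of how this wrapper is used below: a 2-layer LSTM stack whose outputs
# are projected to one logit per BF token, with the EOS index suppressed when
# the agent may not end episodes early (mirrors the policy_cell constructed in
# LMAgent.__init__):
#   cell = LinearWrapper(
#       tf.contrib.rnn.MultiRNNCell(
#           [tf.contrib.rnn.BasicLSTMCell(35) for _ in range(2)]),
#       output_size=misc.bf_num_tokens(),
#       suppress_index=misc.BF_EOS_INT)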
UpdateStepResult = namedtuple(
'UpdateStepResult',
['global_step', 'global_npe', 'summaries_list', 'gradients_dict'])
class AttrDict(dict):
"""Dict with attributes as keys.
https://stackoverflow.com/a/14620633
"""
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
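# For example, AttrDict(logits=logits, value=value).logits is equivalent to
# indexing with ['logits']; LMAgent below uses this for its sampled and given
# batches.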
class LMAgent(object):
"""Language model agent."""
action_space = misc.bf_num_tokens()
observation_space = misc.bf_num_tokens()
def __init__(self, global_config, task_id=0,
logging_file=None,
experience_replay_file=None,
global_best_reward_fn=None,
found_solution_op=None,
assign_code_solution_fn=None,
program_count=None,
do_iw_summaries=False,
stop_on_success=True,
dtype=tf.float32,
verbose_level=0,
is_local=True):
self.config = config = global_config.agent
self.logging_file = logging_file
self.experience_replay_file = experience_replay_file
self.task_id = task_id
self.verbose_level = verbose_level
self.global_best_reward_fn = global_best_reward_fn
self.found_solution_op = found_solution_op
self.assign_code_solution_fn = assign_code_solution_fn
self.parent_scope_name = tf.get_variable_scope().name
self.dtype = dtype
self.allow_eos_token = config.eos_token
self.stop_on_success = stop_on_success
self.pi_loss_hparam = config.pi_loss_hparam
self.vf_loss_hparam = config.vf_loss_hparam
self.is_local = is_local
self.top_reward = 0.0
self.embeddings_trainable = True
self.no_op = tf.no_op()
self.learning_rate = tf.constant(
config.lr, dtype=dtype, name='learning_rate')
self.initializer = tf.contrib.layers.variance_scaling_initializer(
factor=config.param_init_factor,
mode='FAN_AVG',
uniform=True,
dtype=dtype) # TF's default initializer.
tf.get_variable_scope().set_initializer(self.initializer)
self.a2c = config.ema_baseline_decay == 0
if not self.a2c:
logging.info('Using exponential moving average REINFORCE baselines.')
self.ema_baseline_decay = config.ema_baseline_decay
self.ema_by_len = [0.0] * global_config.timestep_limit
else:
logging.info('Using advantage (a2c) with learned value function.')
self.ema_baseline_decay = 0.0
self.ema_by_len = None
# Top-k
if config.topk and config.topk_loss_hparam:
self.topk_loss_hparam = config.topk_loss_hparam
self.topk_batch_size = config.topk_batch_size
if self.topk_batch_size <= 0:
        raise ValueError(
            'topk_batch_size must be a positive integer. Got %s'
            % self.topk_batch_size)
self.top_episodes = utils.MaxUniquePriorityQueue(config.topk)
      logging.info('Made max-priority-queue with capacity %d',
                   self.top_episodes.capacity)
else:
self.top_episodes = None
self.topk_loss_hparam = 0.0
      logging.info('No max-priority-queue')
# Experience replay.
self.replay_temperature = config.replay_temperature
self.num_replay_per_batch = int(global_config.batch_size * config.alpha)
self.num_on_policy_per_batch = (
global_config.batch_size - self.num_replay_per_batch)
self.replay_alpha = (
self.num_replay_per_batch / float(global_config.batch_size))
logging.info('num_replay_per_batch: %d', self.num_replay_per_batch)
logging.info('num_on_policy_per_batch: %d', self.num_on_policy_per_batch)
logging.info('replay_alpha: %s', self.replay_alpha)
if self.num_replay_per_batch > 0:
# Train with off-policy episodes from replay buffer.
start_time = time.time()
self.experience_replay = utils.RouletteWheel(
unique_mode=True, save_file=experience_replay_file)
logging.info('Took %s sec to load replay buffer from disk.',
int(time.time() - start_time))
logging.info('Replay buffer file location: "%s"',
self.experience_replay.save_file)
else:
# Only train on-policy.
self.experience_replay = None
if program_count is not None:
self.program_count = program_count
self.program_count_add_ph = tf.placeholder(
tf.int64, [], 'program_count_add_ph')
self.program_count_add_op = self.program_count.assign_add(
self.program_count_add_ph)
################################
# RL policy and value networks #
################################
batch_size = global_config.batch_size
logging.info('batch_size: %d', batch_size)
self.policy_cell = LinearWrapper(
tf.contrib.rnn.MultiRNNCell(
[tf.contrib.rnn.BasicLSTMCell(cell_size)
for cell_size in config.policy_lstm_sizes]),
self.action_space,
dtype=dtype,
suppress_index=None if self.allow_eos_token else misc.BF_EOS_INT)
self.value_cell = LinearWrapper(
tf.contrib.rnn.MultiRNNCell(
[tf.contrib.rnn.BasicLSTMCell(cell_size)
for cell_size in config.value_lstm_sizes]),
1,
dtype=dtype)
obs_embedding_scope = 'obs_embed'
with tf.variable_scope(
obs_embedding_scope,
initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)):
obs_embeddings = tf.get_variable(
'embeddings',
[self.observation_space, config.obs_embedding_size],
dtype=dtype, trainable=self.embeddings_trainable)
self.obs_embeddings = obs_embeddings
initial_state = tf.fill([batch_size], misc.BF_EOS_INT)
def loop_fn(loop_time, cell_output, cell_state, loop_state):
"""Function called by tf.nn.raw_rnn to instantiate body of the while_loop.
See https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn for more
information.
When time is 0, and cell_output, cell_state, loop_state are all None,
`loop_fn` will create the initial input, internal cell state, and loop
state. When time > 0, `loop_fn` will operate on previous cell output,
state, and loop state.
Args:
loop_time: A scalar tensor holding the current timestep (zero based
counting).
cell_output: Output of the raw_rnn cell at the current timestep.
cell_state: Cell internal state at the current timestep.
loop_state: Additional loop state. These tensors were returned by the
previous call to `loop_fn`.
Returns:
elements_finished: Bool tensor of shape [batch_size] which marks each
sequence in the batch as being finished or not finished.
next_input: A tensor containing input to be fed into the cell at the
next timestep.
next_cell_state: Cell internal state to be fed into the cell at the
next timestep.
emit_output: Tensor to be added to the TensorArray returned by raw_rnn
as output from the while_loop.
next_loop_state: Additional loop state. These tensors will be fed back
into the next call to `loop_fn` as `loop_state`.
"""
if cell_output is None: # 0th time step.
next_cell_state = self.policy_cell.zero_state(batch_size, dtype)
elements_finished = tf.zeros([batch_size], tf.bool)
output_lengths = tf.ones([batch_size], dtype=tf.int32)
next_input = tf.gather(obs_embeddings, initial_state)
emit_output = None
next_loop_state = (
tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True),
output_lengths,
elements_finished
)
else:
scaled_logits = cell_output * config.softmax_tr # Scale temperature.
prev_chosen, prev_output_lengths, prev_elements_finished = loop_state
next_cell_state = cell_state
chosen_outputs = tf.to_int32(tf.where(
tf.logical_not(prev_elements_finished),
tf.multinomial(logits=scaled_logits, num_samples=1)[:, 0],
tf.zeros([batch_size], dtype=tf.int64)))
elements_finished = tf.logical_or(
tf.equal(chosen_outputs, misc.BF_EOS_INT),
loop_time >= global_config.timestep_limit)
output_lengths = tf.where(
elements_finished,
prev_output_lengths,
# length includes EOS token. empty seq has len 1.
tf.tile(tf.expand_dims(loop_time + 1, 0), [batch_size])
)
next_input = tf.gather(obs_embeddings, chosen_outputs)
emit_output = scaled_logits
next_loop_state = (prev_chosen.write(loop_time - 1, chosen_outputs),
output_lengths,
tf.logical_or(prev_elements_finished,
elements_finished))
return (elements_finished, next_input, next_cell_state, emit_output,
next_loop_state)
with tf.variable_scope('policy'):
(decoder_outputs_ta,
_, # decoder_state
(sampled_output_ta, output_lengths, _)) = tf.nn.raw_rnn(
cell=self.policy_cell,
loop_fn=loop_fn)
policy_logits = tf.transpose(decoder_outputs_ta.stack(), (1, 0, 2),
name='policy_logits')
sampled_tokens = tf.transpose(sampled_output_ta.stack(), (1, 0),
name='sampled_tokens')
# Add SOS to beginning of the sequence.
rshift_sampled_tokens = rshift_time(sampled_tokens, fill=misc.BF_EOS_INT)
# Initial state is 0, 2nd state is first token.
# Note: If value of last state is computed, this will be used as bootstrap.
if self.a2c:
with tf.variable_scope('value'):
value_output, _ = tf.nn.dynamic_rnn(
self.value_cell,
tf.gather(obs_embeddings, rshift_sampled_tokens),
sequence_length=output_lengths,
dtype=dtype)
value = tf.squeeze(value_output, axis=[2])
else:
value = tf.zeros([], dtype=dtype)
    # `self.sampled_batch` holds tensors used for sampling actions from the
    # agent, and `self.given_batch` below holds tensors for doing gradient
    # updates on the agent.
self.sampled_batch = AttrDict(
logits=policy_logits,
value=value,
tokens=sampled_tokens,
episode_lengths=output_lengths,
probs=tf.nn.softmax(policy_logits),
log_probs=tf.nn.log_softmax(policy_logits))
# adjusted_lengths can be less than the full length of each episode.
# Use this to train on only part of an episode (starting from t=0).
self.adjusted_lengths = tf.placeholder(
tf.int32, [None], name='adjusted_lengths')
self.policy_multipliers = tf.placeholder(
dtype,
[None, None],
name='policy_multipliers')
# Empirical value, i.e. discounted sum of observed future rewards from each
# time step in the episode.
self.empirical_values = tf.placeholder(
dtype,
[None, None],
name='empirical_values')
# Off-policy training. Just add supervised loss to the RL loss.
self.off_policy_targets = tf.placeholder(
tf.int32,
[None, None],
name='off_policy_targets')
self.off_policy_target_lengths = tf.placeholder(
tf.int32, [None], name='off_policy_target_lengths')
self.actions = tf.placeholder(tf.int32, [None, None], name='actions')
# Add SOS to beginning of the sequence.
inputs = rshift_time(self.actions, fill=misc.BF_EOS_INT)
with tf.variable_scope('policy', reuse=True):
logits, _ = tf.nn.dynamic_rnn(
self.policy_cell, tf.gather(obs_embeddings, inputs),
sequence_length=self.adjusted_lengths,
dtype=dtype)
if self.a2c:
with tf.variable_scope('value', reuse=True):
value_output, _ = tf.nn.dynamic_rnn(
self.value_cell,
tf.gather(obs_embeddings, inputs),
sequence_length=self.adjusted_lengths,
dtype=dtype)
value2 = tf.squeeze(value_output, axis=[2])
else:
value2 = tf.zeros([], dtype=dtype)
self.given_batch = AttrDict(
logits=logits,
value=value2,
tokens=sampled_tokens,
episode_lengths=self.adjusted_lengths,
probs=tf.nn.softmax(logits),
log_probs=tf.nn.log_softmax(logits))
# Episode masks.
max_episode_length = tf.shape(self.actions)[1]
# range_row shape: [1, max_episode_length]
range_row = tf.expand_dims(tf.range(max_episode_length), 0)
episode_masks = tf.cast(
tf.less(range_row, tf.expand_dims(self.given_batch.episode_lengths, 1)),
dtype=dtype)
episode_masks_3d = tf.expand_dims(episode_masks, 2)
# Length adjusted episodes.
self.a_probs = a_probs = self.given_batch.probs * episode_masks_3d
self.a_log_probs = a_log_probs = (
self.given_batch.log_probs * episode_masks_3d)
self.a_value = a_value = self.given_batch.value * episode_masks
self.a_policy_multipliers = a_policy_multipliers = (
self.policy_multipliers * episode_masks)
if self.a2c:
self.a_empirical_values = a_empirical_values = (
self.empirical_values * episode_masks)
# pi_loss is scalar
acs_onehot = tf.one_hot(self.actions, self.action_space, dtype=dtype)
self.acs_onehot = acs_onehot
chosen_masked_log_probs = acs_onehot * a_log_probs
pi_target = tf.expand_dims(a_policy_multipliers, -1)
pi_loss_per_step = chosen_masked_log_probs * pi_target # Maximize.
self.pi_loss = pi_loss = (
-tf.reduce_mean(tf.reduce_sum(pi_loss_per_step, axis=[1, 2]), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Minimize.
assert len(self.pi_loss.shape) == 0 # pylint: disable=g-explicit-length-test
# shape: [batch_size, time]
self.chosen_log_probs = tf.reduce_sum(chosen_masked_log_probs, axis=2)
self.chosen_probs = tf.reduce_sum(acs_onehot * a_probs, axis=2)
# loss of value function
if self.a2c:
vf_loss_per_step = tf.square(a_value - a_empirical_values)
self.vf_loss = vf_loss = (
tf.reduce_mean(tf.reduce_sum(vf_loss_per_step, axis=1), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Minimize.
assert len(self.vf_loss.shape) == 0 # pylint: disable=g-explicit-length-test
else:
self.vf_loss = vf_loss = 0.0
# Maximize entropy regularizer
self.entropy = entropy = (
-tf.reduce_mean(
tf.reduce_sum(a_probs * a_log_probs, axis=[1, 2]), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Maximize
self.negentropy = -entropy # Minimize negentropy.
assert len(self.negentropy.shape) == 0 # pylint: disable=g-explicit-length-test
# off-policy loss
self.offp_switch = tf.placeholder(dtype, [], name='offp_switch')
if self.top_episodes is not None:
# Add SOS to beginning of the sequence.
offp_inputs = tf.gather(obs_embeddings,
rshift_time(self.off_policy_targets,
fill=misc.BF_EOS_INT))
with tf.variable_scope('policy', reuse=True):
offp_logits, _ = tf.nn.dynamic_rnn(
self.policy_cell, offp_inputs, self.off_policy_target_lengths,
dtype=dtype) # shape: [batch_size, time, action_space]
topk_loss_per_step = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=self.off_policy_targets,
logits=offp_logits,
name='topk_loss_per_logit')
# Take mean over batch dimension so that the loss multiplier strength is
# independent of batch size. Sum over time dimension.
topk_loss = tf.reduce_mean(
tf.reduce_sum(topk_loss_per_step, axis=1), axis=0)
assert len(topk_loss.shape) == 0 # pylint: disable=g-explicit-length-test
self.topk_loss = topk_loss * self.offp_switch
logging.info('Including off policy loss.')
else:
self.topk_loss = topk_loss = 0.0
self.entropy_hparam = tf.constant(
config.entropy_beta, dtype=dtype, name='entropy_beta')
self.pi_loss_term = pi_loss * self.pi_loss_hparam
self.vf_loss_term = vf_loss * self.vf_loss_hparam
self.entropy_loss_term = self.negentropy * self.entropy_hparam
self.topk_loss_term = self.topk_loss_hparam * topk_loss
self.loss = (
self.pi_loss_term
+ self.vf_loss_term
+ self.entropy_loss_term
+ self.topk_loss_term)
params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
self.trainable_variables = params
self.sync_variables = self.trainable_variables
non_embedding_params = [p for p in params
if obs_embedding_scope not in p.name]
self.non_embedding_params = non_embedding_params
self.params = params
if config.regularizer:
logging.info('Adding L2 regularizer with scale %.2f.',
config.regularizer)
self.regularizer = config.regularizer * sum(
tf.nn.l2_loss(w) for w in non_embedding_params)
self.loss += self.regularizer
else:
logging.info('Skipping regularizer.')
self.regularizer = 0.0
# Only build gradients graph for local model.
if self.is_local:
unclipped_grads = tf.gradients(self.loss, params)
self.dense_unclipped_grads = [
tf.convert_to_tensor(g) for g in unclipped_grads]
self.grads, self.global_grad_norm = tf.clip_by_global_norm(
unclipped_grads, config.grad_clip_threshold)
self.gradients_dict = dict(zip(params, self.grads))
self.optimizer = make_optimizer(config.optimizer, self.learning_rate)
self.all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
tf.get_variable_scope().name)
self.do_iw_summaries = do_iw_summaries
if self.do_iw_summaries:
b = None
self.log_iw_replay_ph = tf.placeholder(tf.float32, [b],
'log_iw_replay_ph')
self.log_iw_policy_ph = tf.placeholder(tf.float32, [b],
'log_iw_policy_ph')
self.log_prob_replay_ph = tf.placeholder(tf.float32, [b],
'log_prob_replay_ph')
self.log_prob_policy_ph = tf.placeholder(tf.float32, [b],
'log_prob_policy_ph')
self.log_norm_replay_weights_ph = tf.placeholder(
tf.float32, [b], 'log_norm_replay_weights_ph')
self.iw_summary_op = tf.summary.merge([
tf.summary.histogram('is/log_iw_replay', self.log_iw_replay_ph),
tf.summary.histogram('is/log_iw_policy', self.log_iw_policy_ph),
tf.summary.histogram('is/log_prob_replay', self.log_prob_replay_ph),
tf.summary.histogram('is/log_prob_policy', self.log_prob_policy_ph),
tf.summary.histogram(
'is/log_norm_replay_weights', self.log_norm_replay_weights_ph),
])
def make_summary_ops(self):
"""Construct summary ops for the model."""
    # size = number of timesteps across entire batch. Numbers normalized by
    # size will not be affected by the amount of padding at the ends of
    # sequences in the batch.
size = tf.cast(
tf.reduce_sum(self.given_batch.episode_lengths), dtype=self.dtype)
offp_size = tf.cast(tf.reduce_sum(self.off_policy_target_lengths),
dtype=self.dtype)
scope_prefix = self.parent_scope_name
def _remove_prefix(prefix, name):
assert name.startswith(prefix)
return name[len(prefix):]
# RL summaries.
self.rl_summary_op = tf.summary.merge(
[tf.summary.scalar('model/policy_loss', self.pi_loss / size),
tf.summary.scalar('model/value_loss', self.vf_loss / size),
tf.summary.scalar('model/topk_loss', self.topk_loss / offp_size),
tf.summary.scalar('model/entropy', self.entropy / size),
tf.summary.scalar('model/loss', self.loss / size),
tf.summary.scalar('model/grad_norm',
tf.global_norm(self.grads)),
tf.summary.scalar('model/unclipped_grad_norm', self.global_grad_norm),
tf.summary.scalar('model/non_embedding_var_norm',
tf.global_norm(self.non_embedding_params)),
tf.summary.scalar('hparams/entropy_beta', self.entropy_hparam),
tf.summary.scalar('hparams/topk_loss_hparam', self.topk_loss_hparam),
tf.summary.scalar('hparams/learning_rate', self.learning_rate),
tf.summary.scalar('model/trainable_var_norm',
tf.global_norm(self.trainable_variables)),
tf.summary.scalar('loss/loss', self.loss),
tf.summary.scalar('loss/entropy', self.entropy_loss_term),
tf.summary.scalar('loss/vf', self.vf_loss_term),
tf.summary.scalar('loss/policy', self.pi_loss_term),
tf.summary.scalar('loss/offp', self.topk_loss_term)] +
[tf.summary.scalar(
'param_norms/' + _remove_prefix(scope_prefix + '/', p.name),
tf.norm(p))
for p in self.params] +
[tf.summary.scalar(
'grad_norms/' + _remove_prefix(scope_prefix + '/', p.name),
tf.norm(g))
for p, g in zip(self.params, self.grads)] +
[tf.summary.scalar(
'unclipped_grad_norms/' + _remove_prefix(scope_prefix + '/',
p.name),
tf.norm(g))
for p, g in zip(self.params, self.dense_unclipped_grads)])
self.text_summary_placeholder = tf.placeholder(tf.string, shape=[])
self.rl_text_summary_op = tf.summary.text('rl',
self.text_summary_placeholder)
def _rl_text_summary(self, session, step, npe, tot_r, num_steps,
input_case, code_output, code, reason):
"""Logs summary about a single episode and creates a text_summary for TB.
Args:
session: tf.Session instance.
step: Global training step.
npe: Number of programs executed so far.
tot_r: Total reward.
num_steps: Number of timesteps in the episode (i.e. code length).
input_case: Inputs for test cases.
code_output: Outputs produced by running the code on the inputs.
code: String representation of the code.
reason: Reason for the reward assigned by the task.
Returns:
Serialized text summary data for tensorboard.
"""
if not input_case:
input_case = ' '
if not code_output:
code_output = ' '
if not code:
code = ' '
text = (
'Tot R: **%.2f**; Len: **%d**; Reason: **%s**\n\n'
'Input: **`%s`**; Output: **`%s`**\n\nCode: **`%s`**'
% (tot_r, num_steps, reason, input_case, code_output, code))
text_summary = session.run(self.rl_text_summary_op,
{self.text_summary_placeholder: text})
logging.info(
'Step %d.\t NPE: %d\t Reason: %s.\t Tot R: %.2f.\t Length: %d. '
'\tInput: %s \tOutput: %s \tProgram: %s',
step, npe, reason, tot_r, num_steps, input_case,
code_output, code)
return text_summary
def _rl_reward_summary(self, total_rewards):
"""Create summary ops that report on episode rewards.
Creates summaries for average, median, max, and min rewards in the batch.
Args:
total_rewards: Tensor of shape [batch_size] containing the total reward
from each episode in the batch.
Returns:
tf.Summary op.
"""
tr = np.asarray(total_rewards)
reward_summary = tf.Summary(value=[
tf.Summary.Value(
tag='reward/avg',
simple_value=np.mean(tr)),
tf.Summary.Value(
tag='reward/med',
simple_value=np.median(tr)),
tf.Summary.Value(
tag='reward/max',
simple_value=np.max(tr)),
tf.Summary.Value(
tag='reward/min',
simple_value=np.min(tr))])
return reward_summary
def _iw_summary(self, session, replay_iw, replay_log_probs,
norm_replay_weights, on_policy_iw,
on_policy_log_probs):
"""Compute summaries for importance weights at a given batch.
Args:
session: tf.Session instance.
replay_iw: Importance weights for episodes from replay buffer.
replay_log_probs: Total log probabilities of the replay episodes under the
current policy.
norm_replay_weights: Normalized replay weights, i.e. values in `replay_iw`
divided by the total weight in the entire replay buffer. Note, this is
also the probability of selecting each episode from the replay buffer
(in a roulette wheel replay buffer).
on_policy_iw: Importance weights for episodes sampled from the current
policy.
on_policy_log_probs: Total log probabilities of the on-policy episodes
under the current policy.
Returns:
Serialized TF summaries. Use a summary writer to write these summaries to
disk.
"""
return session.run(
self.iw_summary_op,
{self.log_iw_replay_ph: np.log(replay_iw),
self.log_iw_policy_ph: np.log(on_policy_iw),
self.log_norm_replay_weights_ph: np.log(norm_replay_weights),
self.log_prob_replay_ph: replay_log_probs,
self.log_prob_policy_ph: on_policy_log_probs})
def _compute_iw(self, policy_log_probs, replay_weights):
"""Compute importance weights for a batch of episodes.
Arguments are iterables of length batch_size.
Args:
policy_log_probs: Log probability of each episode under the current
policy.
replay_weights: Weight of each episode in the replay buffer. 0 for
episodes not sampled from the replay buffer (i.e. sampled from the
policy).
Returns:
Numpy array of shape [batch_size] containing the importance weight for
each episode in the batch.
"""
log_total_replay_weight = log(self.experience_replay.total_weight)
# importance weight
# = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
# = 1 / ((1-a) + a*q/p)
a = float(self.replay_alpha)
a_com = 1.0 - a  # complement of a
importance_weights = np.asarray(
[1.0 / (a_com
+ a * exp((log(replay_weight) - log_total_replay_weight)
- log_p))
if replay_weight > 0 else 1.0 / a_com
for log_p, replay_weight
in zip(policy_log_probs, replay_weights)])
return importance_weights
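# Example (illustrative numbers): two episodes, the first drawn from the
# replay buffer and the second fresh from the policy, with
# replay_alpha = 0.5 and total replay weight 10.0:
#   iw = self._compute_iw(policy_log_probs=[log(0.1), log(0.3)],
#                         replay_weights=[2.0, 0.0])
# For the first episode q/p = (2.0 / 10.0) / 0.1 = 2.0, giving
# iw[0] = 1 / (0.5 + 0.5 * 2.0) = 2/3; the second has zero replay weight,
# so iw[1] = 1 / (1 - 0.5) = 2.0.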
def update_step(self, session, rl_batch, train_op, global_step_op,
return_gradients=False):
"""Perform gradient update on the model.
Args:
session: tf.Session instance.
rl_batch: RLBatch instance from data.py. Use DataManager to create a
RLBatch for each call to update_step. RLBatch contains a batch of
tasks.
train_op: A TF op which will perform the gradient update. LMAgent does not
own its training op, so that trainers can do distributed training
and construct a specialized training op.
global_step_op: A TF op which will return the current global step when
run (should not increment it).
return_gradients: If True, the gradients will be saved and returned from
this method call. This is useful for testing.
Returns:
Results from the update step in an UpdateStepResult namedtuple, including
global step, global NPE, serialized summaries, and optionally gradients.
"""
assert self.is_local
# Do update for REINFORCE or REINFORCE + replay buffer.
if self.experience_replay is None:
# Train with on-policy REINFORCE.
# Sample new programs from the policy.
num_programs_from_policy = rl_batch.batch_size
(batch_actions,
batch_values,
episode_lengths) = session.run(
[self.sampled_batch.tokens, self.sampled_batch.value,
self.sampled_batch.episode_lengths])
if episode_lengths.size == 0:
# This should not happen.
logging.warn(
'Shapes:\n'
'batch_actions.shape: %s\n'
'batch_values.shape: %s\n'
'episode_lengths.shape: %s\n',
batch_actions.shape, batch_values.shape, episode_lengths.shape)
# Compute rewards.
code_scores = compute_rewards(
rl_batch, batch_actions, episode_lengths)
code_strings = code_scores.code_strings
batch_tot_r = code_scores.total_rewards
test_cases = code_scores.test_cases
code_outputs = code_scores.code_outputs
reasons = code_scores.reasons
# Process on-policy samples.
batch_targets, batch_returns = process_episodes(
code_scores.batch_rewards, episode_lengths, a2c=self.a2c,
baselines=self.ema_by_len,
batch_values=batch_values)
batch_policy_multipliers = batch_targets
batch_emp_values = batch_returns if self.a2c else [[]]
adjusted_lengths = episode_lengths
if self.top_episodes:
assert len(self.top_episodes) > 0 # pylint: disable=g-explicit-length-test
off_policy_targets = [
item for item, _
in self.top_episodes.random_sample(self.topk_batch_size)]
off_policy_target_lengths = [len(t) for t in off_policy_targets]
off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
dtype=np.int32)
offp_switch = 1
else:
off_policy_targets = [[0]]
off_policy_target_lengths = [1]
offp_switch = 0
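# No top episodes are stored yet, so feed placeholder targets;
# offp_switch == 0 should disable the top-k loss term, leaving the
# placeholders with no effect on the gradient.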
fetches = {
'global_step': global_step_op,
'program_count': self.program_count,
'summaries': self.rl_summary_op,
'train_op': train_op,
'gradients': self.gradients_dict if return_gradients else self.no_op}
fetched = session.run(
fetches,
{self.actions: batch_actions,
self.empirical_values: batch_emp_values,
self.policy_multipliers: batch_policy_multipliers,
self.adjusted_lengths: adjusted_lengths,
self.off_policy_targets: off_policy_targets,
self.off_policy_target_lengths: off_policy_target_lengths,
self.offp_switch: offp_switch})
combined_adjusted_lengths = adjusted_lengths
combined_returns = batch_returns
else:
# Train with REINFORCE + off-policy replay buffer by using importance
# sampling.
# Sample new programs from the policy.
# Note: batch size is constant. A full batch will be sampled, but not all
# programs will be executed and added to the replay buffer. Those which
# are not executed will be discarded and not counted.
batch_actions, batch_values, episode_lengths, log_probs = session.run(
[self.sampled_batch.tokens, self.sampled_batch.value,
self.sampled_batch.episode_lengths, self.sampled_batch.log_probs])
if episode_lengths.size == 0:
# This should not happen.
logging.warn(
'Shapes:\n'
'batch_actions.shape: %s\n'
'batch_values.shape: %s\n'
'episode_lengths.shape: %s\n',
batch_actions.shape, batch_values.shape, episode_lengths.shape)
# Sample from the experience replay buffer.
empty_replay_buffer = (
self.experience_replay.is_empty()
if self.experience_replay is not None else True)
num_programs_from_replay_buff = (
self.num_replay_per_batch if not empty_replay_buffer else 0)
num_programs_from_policy = (
rl_batch.batch_size - num_programs_from_replay_buff)
if (not empty_replay_buffer) and num_programs_from_replay_buff:
result = self.experience_replay.sample_many(
num_programs_from_replay_buff)
experience_samples, replay_weights = zip(*result)
(replay_actions,
replay_rewards,
_, # log probs
replay_adjusted_lengths) = zip(*experience_samples)
replay_batch_actions = utils.stack_pad(replay_actions, pad_axes=0,
dtype=np.int32)
# Compute log probs for replay samples under the current policy.
all_replay_log_probs, = session.run(
[self.given_batch.log_probs],
{self.actions: replay_batch_actions,
self.adjusted_lengths: replay_adjusted_lengths})
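# np.choose on the transposed [time, vocab] slice below picks, for each
# timestep t, the log prob of the action actually taken, i.e.
# all_replay_log_probs[i, t, action_t]; summing over t gives the episode's
# total log probability under the current policy.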
replay_log_probs = [
np.choose(replay_actions[i], all_replay_log_probs[i, :l].T).sum()
for i, l in enumerate(replay_adjusted_lengths)]
else:
# Replay buffer is empty. Do not sample from it.
replay_actions = None
replay_policy_multipliers = None
replay_adjusted_lengths = None
replay_log_probs = None
replay_weights = None
replay_returns = None
on_policy_weights = [0] * num_programs_from_replay_buff
assert not self.a2c # TODO(danabo): Support A2C with importance sampling.
# Compute rewards.
code_scores = compute_rewards(
rl_batch, batch_actions, episode_lengths,
batch_size=num_programs_from_policy)
code_strings = code_scores.code_strings
batch_tot_r = code_scores.total_rewards
test_cases = code_scores.test_cases
code_outputs = code_scores.code_outputs
reasons = code_scores.reasons
# Process on-policy samples.
p = num_programs_from_policy
batch_targets, batch_returns = process_episodes(
code_scores.batch_rewards, episode_lengths[:p], a2c=False,
baselines=self.ema_by_len)
batch_policy_multipliers = batch_targets
batch_emp_values = [[]]
on_policy_returns = batch_returns
# Process off-policy samples.
if (not empty_replay_buffer) and num_programs_from_replay_buff:
offp_batch_rewards = [
[0.0] * (l - 1) + [r]
for l, r in zip(replay_adjusted_lengths, replay_rewards)]
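# E.g. l = 4, r = 1.0 yields [0.0, 0.0, 0.0, 1.0]; only the total reward is
# stored in the replay buffer, so it is credited to the final timestep.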
assert len(offp_batch_rewards) == num_programs_from_replay_buff
assert len(replay_adjusted_lengths) == num_programs_from_replay_buff
replay_batch_targets, replay_returns = process_episodes(
offp_batch_rewards, replay_adjusted_lengths, a2c=False,
baselines=self.ema_by_len)
# Convert 2D array back into ragged 2D list.
replay_policy_multipliers = [
replay_batch_targets[i, :l]
for i, l
in enumerate(
replay_adjusted_lengths[:num_programs_from_replay_buff])]
adjusted_lengths = episode_lengths[:num_programs_from_policy]
if self.top_episodes:
assert len(self.top_episodes) > 0 # pylint: disable=g-explicit-length-test
off_policy_targets = [
item for item, _
in self.top_episodes.random_sample(self.topk_batch_size)]
off_policy_target_lengths = [len(t) for t in off_policy_targets]
off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
dtype=np.int32)
offp_switch = 1
else:
off_policy_targets = [[0]]
off_policy_target_lengths = [1]
offp_switch = 0
# On-policy episodes.
if num_programs_from_policy:
separate_actions = [
batch_actions[i, :l]
for i, l in enumerate(adjusted_lengths)]
chosen_log_probs = [
np.choose(separate_actions[i], log_probs[i, :l].T)
for i, l in enumerate(adjusted_lengths)]
new_experiences = [
(separate_actions[i],
batch_tot_r[i],
chosen_log_probs[i].sum(), l)
for i, l in enumerate(adjusted_lengths)]
on_policy_policy_multipliers = [
batch_policy_multipliers[i, :l]
for i, l in enumerate(adjusted_lengths)]
(on_policy_actions,
_, # rewards
on_policy_log_probs,
on_policy_adjusted_lengths) = zip(*new_experiences)
else:
new_experiences = []
on_policy_policy_multipliers = []
on_policy_actions = []
on_policy_log_probs = []
on_policy_adjusted_lengths = []
if (not empty_replay_buffer) and num_programs_from_replay_buff:
# Look up each on-policy episode in the replay buffer and record its
# weight if it is already stored there.
on_policy_weights = [0] * num_programs_from_policy
for i, cs in enumerate(code_strings):
if self.experience_replay.has_key(cs):
on_policy_weights[i] = self.experience_replay.get_weight(cs)
# Join the randomly sampled off-policy (replay) and on-policy episodes.
combined_actions = join(replay_actions, on_policy_actions)
combined_policy_multipliers = join(
replay_policy_multipliers, on_policy_policy_multipliers)
combined_adjusted_lengths = join(
replay_adjusted_lengths, on_policy_adjusted_lengths)
combined_returns = join(replay_returns, on_policy_returns)
combined_actions = utils.stack_pad(combined_actions, pad_axes=0)
combined_policy_multipliers = utils.stack_pad(combined_policy_multipliers,
pad_axes=0)
# P: total log probability of each episode under the current policy.
combined_on_policy_log_probs = join(replay_log_probs, on_policy_log_probs)
# Q: weight of each episode under the replay buffer distribution. The
# weight is zero for on-policy episodes not already in the buffer.
combined_q_weights = join(replay_weights, on_policy_weights)
# Importance adjustment. Naive formulation:
# E_{x~p}[f(x)] ~= 1/N sum_{x~p}(f(x)) ~= 1/N sum_{x~q}(f(x) * p(x)/q(x)).
# p(x) is the policy, and q(x) is the off-policy distribution, i.e. replay
# buffer distribution. Importance weight w(x) = p(x) / q(x).
# Instead of sampling from the replay buffer only, we sample from a
# mixture distribution of the policy and replay buffer.
# We are sampling from the mixture a*q(x) + (1-a)*p(x), where 0 <= a <= 1.
# Thus the importance weight w(x) = p(x) / (a*q(x) + (1-a)*p(x))
# = 1 / ((1-a) + a*q(x)/p(x)) where q(x) is 0 for x sampled from the
# policy.
# Note: a = self.replay_alpha
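# Worked example (made-up numbers): with a = 0.5, q(x) = 0.2, and
# p(x) = 0.1, w(x) = 1 / (0.5 + 0.5 * 0.2 / 0.1) = 2/3, down-weighting an
# episode that the replay buffer oversamples relative to the policy.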
if empty_replay_buffer:
# The replay buffer is empty. Zero the multipliers so no gradient update
# happens this step; the buffer will be populated for the next step.
combined_policy_multipliers *= 0
elif not num_programs_from_replay_buff:
combined_policy_multipliers = np.ones([len(combined_actions), 1],
dtype=np.float32)
else:
# If a < 1, compute importance weights:
# importance weight
# = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
# = 1 / ((1-a) + a*q/p)
importance_weights = self._compute_iw(combined_on_policy_log_probs,
combined_q_weights)
if self.config.iw_normalize:
importance_weights *= (
float(rl_batch.batch_size) / importance_weights.sum())
combined_policy_multipliers *= importance_weights.reshape(-1, 1)
# Train on replay batch, top-k MLE.
assert self.program_count is not None
fetches = {
'global_step': global_step_op,
'program_count': self.program_count,
'summaries': self.rl_summary_op,
'train_op': train_op,
'gradients': self.gradients_dict if return_gradients else self.no_op}
fetched = session.run(
fetches,
{self.actions: combined_actions,
self.empirical_values: [[]],  # Unused; A2C is not supported with replay.
self.policy_multipliers: combined_policy_multipliers,
self.adjusted_lengths: combined_adjusted_lengths,
self.off_policy_targets: off_policy_targets,
self.off_policy_target_lengths: off_policy_target_lengths,
self.offp_switch: offp_switch})
# Add to experience replay buffer.
self.experience_replay.add_many(
objs=new_experiences,
weights=[exp(r / self.replay_temperature) for r in batch_tot_r],
keys=code_strings)
# Update program count.
session.run(
[self.program_count_add_op],
{self.program_count_add_ph: num_programs_from_policy})
# Update EMA baselines on the mini-batch we just trained on.
if not self.a2c:
for i in xrange(rl_batch.batch_size):
episode_length = combined_adjusted_lengths[i]
empirical_returns = combined_returns[i, :episode_length]
for j in xrange(episode_length):
# Update ema_baselines in place.
self.ema_by_len[j] = (
self.ema_baseline_decay * self.ema_by_len[j]
+ (1 - self.ema_baseline_decay) * empirical_returns[j])
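# E.g. with ema_baseline_decay = 0.99, a baseline of 0.5, and an observed
# return of 1.0, the updated baseline is 0.99 * 0.5 + 0.01 * 1.0 = 0.505.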
global_step = fetched['global_step']
global_npe = fetched['program_count']
core_summaries = fetched['summaries']
summaries_list = [core_summaries]
if num_programs_from_policy:
s_i = 0
text_summary = self._rl_text_summary(
session,
global_step,
global_npe,
batch_tot_r[s_i],
episode_lengths[s_i], test_cases[s_i],
code_outputs[s_i], code_strings[s_i], reasons[s_i])
reward_summary = self._rl_reward_summary(batch_tot_r)
is_best = False
if self.global_best_reward_fn:
# Save best reward.
best_reward = np.max(batch_tot_r)
is_best = self.global_best_reward_fn(session, best_reward)
if self.found_solution_op is not None and 'correct' in reasons:
session.run(self.found_solution_op)
# Save program to disk for record keeping.
if self.stop_on_success:
solutions = [
{'code': code_strings[i], 'reward': batch_tot_r[i],
'npe': global_npe}
for i in xrange(len(reasons)) if reasons[i] == 'correct']
elif is_best:
solutions = [
{'code': code_strings[np.argmax(batch_tot_r)],
'reward': np.max(batch_tot_r),
'npe': global_npe}]
else:
solutions = []
if solutions:
if self.assign_code_solution_fn:
self.assign_code_solution_fn(session, solutions[0]['code'])
with tf.gfile.FastGFile(self.logging_file, 'a') as writer:
for solution_dict in solutions:
writer.write(str(solution_dict) + '\n')
max_i = np.argmax(batch_tot_r)
max_tot_r = batch_tot_r[max_i]
if max_tot_r >= self.top_reward:
self.top_reward = max_tot_r
logging.info('Top code: r=%.2f, \t%s', max_tot_r, code_strings[max_i])
if self.top_episodes is not None:
self.top_episodes.push(
max_tot_r, tuple(batch_actions[max_i, :episode_lengths[max_i]]))
summaries_list += [text_summary, reward_summary]
if self.do_iw_summaries and not empty_replay_buffer:
# Probability of each replay sample under the replay buffer's sampling
# distribution.
norm_replay_weights = [
w / self.experience_replay.total_weight
for w in replay_weights]
replay_iw = self._compute_iw(replay_log_probs, replay_weights)
on_policy_iw = self._compute_iw(on_policy_log_probs, on_policy_weights)
summaries_list.append(
self._iw_summary(
session, replay_iw, replay_log_probs, norm_replay_weights,
on_policy_iw, on_policy_log_probs))
return UpdateStepResult(
global_step=global_step,
global_npe=global_npe,
summaries_list=summaries_list,
gradients_dict=fetched['gradients'])
def io_to_text(io_case, io_type):
if isinstance(io_case, misc.IOTuple):
# If there are many strings, join them with ','.
return ','.join([io_to_text(e, io_type) for e in io_case])
if io_type == misc.IOType.string:
# There is one string. Return it.
return misc.tokens_to_text(io_case)
if (io_type == misc.IOType.integer
or io_type == misc.IOType.boolean):
if len(io_case) == 1:
return str(io_case[0])
return str(io_case)
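# Examples (following the branches above; exact misc.IOTuple construction
# may differ):
#   io_to_text([3], misc.IOType.integer)    -> '3'
#   io_to_text([1, 2], misc.IOType.integer) -> '[1, 2]'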
CodeScoreInfo = namedtuple(
'CodeScoreInfo',
['code_strings', 'batch_rewards', 'total_rewards', 'test_cases',
'code_outputs', 'reasons'])
def compute_rewards(rl_batch, batch_actions, episode_lengths, batch_size=None):
"""Compute rewards for each episode in the batch.
Args:
rl_batch: A data.RLBatch instance. This holds information about the task
each episode is solving, and a reward function for each episode.
batch_actions: Contains batch of episodes. Each sequence of actions will be
converted into a BF program and then scored. A numpy array of shape
[batch_size, max_sequence_length].
episode_lengths: The sequence length of each episode in the batch. Iterable
of length batch_size.
batch_size: (optional) number of programs to score. Use this to limit the
number of programs executed from this batch. For example, when doing
importance sampling some of the on-policy episodes will be discarded
and they should not be executed. `batch_size` can be less than or equal
to the size of the input batch.
Returns:
CodeScoreInfo namedtuple instance. This holds not just the computed rewards,
but additional information computed during code execution which can be used
for debugging and monitoring. This includes: BF code strings, test cases
the code was executed on, code outputs from those test cases, and reasons
for success or failure.
"""
code_strings = [
''.join([misc.bf_int2char(a) for a in action_sequence[:l]])
for action_sequence, l in zip(batch_actions, episode_lengths)]
if batch_size is None:
batch_size = len(code_strings)
else:
assert batch_size <= len(code_strings)
code_strings = code_strings[:batch_size]
if isinstance(rl_batch.reward_fns, (list, tuple)):
# reward_fns is a list of functions, same length as code_strings.
assert len(rl_batch.reward_fns) >= batch_size
r_fn_results = [
rl_batch.reward_fns[i](code_strings[i]) for i in xrange(batch_size)]
else:
# reward_fns is allowed to be one function which processes a batch of code
# strings. This is useful for efficiency and batch level computation.
r_fn_results = rl_batch.reward_fns(code_strings)
# Each reward function is expected to return a list of per-timestep rewards
# whose length equals the length of the code string (including the EOS char).
batch_rewards = [r.episode_rewards for r in r_fn_results]
total_rewards = [sum(b) for b in batch_rewards]
test_cases = [io_to_text(r.input_case, r.input_type) for r in r_fn_results]
code_outputs = [io_to_text(r.code_output, r.output_type)
for r in r_fn_results]
reasons = [r.reason for r in r_fn_results]
return CodeScoreInfo(
code_strings=code_strings,
batch_rewards=batch_rewards,
total_rewards=total_rewards,
test_cases=test_cases,
code_outputs=code_outputs,
reasons=reasons)
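# Sketch of a typical call (mirroring update_step above):
#   code_scores = compute_rewards(rl_batch, batch_actions, episode_lengths)
#   logging.info('Mean total reward: %f',
#                np.mean(code_scores.total_rewards))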
def process_episodes(
batch_rewards, episode_lengths, a2c=False, baselines=None,
batch_values=None):
"""Compute REINFORCE targets.
REINFORCE here takes the form:
grad_t = grad[log(pi(a_t|c_t))*target_t]
where c_t is context: i.e. RNN state or environment state (or both).
Two types of targets are supported:
1) Advantage actor critic (a2c).
2) Vanilla REINFORCE with baseline.
Args:
batch_rewards: Rewards received in each episode in the batch. A numpy array
of shape [batch_size, max_sequence_length]. Note, these are per-timestep
rewards, not total reward.
episode_lengths: Length of each episode. An iterable of length batch_size.
a2c: A bool. Whether to compute a2c targets (True) or vanilla targets
(False).
baselines: If a2c is False, provide baselines for each timestep. This is a
list (or indexable container) of length max_time. Note: baselines are
shared across all episodes, which is why there is no batch dimension.
It is up to the caller to update baselines accordingly.
batch_values: If a2c is True, provide values computed by a value estimator.
A numpy array of shape [batch_size, max_sequence_length].
Returns:
batch_targets: REINFORCE targets for each episode and timestep. A numpy
array of shape [batch_size, max_sequence_length].
batch_returns: Returns computed for each episode and timestep. This is for
reference, and is not used in the REINFORCE gradient update (but was
used to compute the targets). A numpy array of shape
[batch_size, max_sequence_length].
"""
num_programs = len(batch_rewards)
assert num_programs <= len(episode_lengths)
batch_returns = [None] * num_programs
batch_targets = [None] * num_programs
for i in xrange(num_programs):
episode_length = episode_lengths[i]
assert len(batch_rewards[i]) == episode_length
# Compute target for each timestep.
# If we are computing A2C:
# target_t = advantage_t = R_t - V(c_t)
# where V(c_t) is a learned value function (provided as `values`).
# Otherwise:
# target_t = R_t - baselines[t]
# where `baselines` are provided.
# In practice we use a more generalized formulation of advantage. See docs
# for `discounted_advantage_and_rewards`.
if a2c:
# Compute advantage.
assert batch_values is not None
episode_values = batch_values[i, :episode_length]
episode_rewards = batch_rewards[i]
emp_val, gen_adv = rollout_lib.discounted_advantage_and_rewards(
episode_rewards, episode_values, gamma=1.0, lambda_=1.0)
batch_returns[i] = emp_val
batch_targets[i] = gen_adv
else:
# Compute return for each timestep. See section 3 of
# https://arxiv.org/pdf/1602.01783.pdf
assert baselines is not None
empirical_returns = rollout_lib.discount(batch_rewards[i], gamma=1.0)
targets = [None] * episode_length
for j in xrange(episode_length):
targets[j] = empirical_returns[j] - baselines[j]
batch_returns[i] = empirical_returns
batch_targets[i] = targets
batch_returns = utils.stack_pad(batch_returns, 0)
if num_programs:
batch_targets = utils.stack_pad(batch_targets, 0)
else:
batch_targets = np.array([], dtype=np.float32)
return (batch_targets, batch_returns)
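# Example (illustrative values; vanilla REINFORCE with a shared baseline):
#   targets, returns = process_episodes(
#       batch_rewards=[[0.0, 0.0, 1.0], [0.5]],
#       episode_lengths=[3, 1],
#       a2c=False,
#       baselines=[0.1, 0.1, 0.1])
# returns[0] is [1.0, 1.0, 1.0] (reward-to-go with gamma=1) and targets[0]
# is [0.9, 0.9, 0.9] after subtracting the baseline at each timestep.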