Commit f5fc733a authored by Byzantine

Removing research/community models

parent 09bc9f54
#!/bin/bash
# Launches tuning jobs.
# Modify this file to launch workers with your preferred cloud API.
# The following implementation runs each worker as a subprocess on the local
# machine.
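#
# Illustrative example (not part of the original script): assuming this file is
# saved as launch_tuning.sh and tune.par has been built, a small local tuning
# study could be launched with hypothetical flag values like:
#   ./launch_tuning.sh \
#     --job_name=pg_reverse_tune \
#     --config='env=c(task="reverse"),agent=c(algorithm="pg")' \
#     --num_tuners=2 \
#     --num_workers_per_tuner=1 \
#     --hparam_space_type="pg"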
MODELS_DIR="/tmp/models"
# Get command line options.
OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_tuners:,num_workers_per_tuner:,num_ps_per_tuner:,max_npe:,num_repetitions:,stop_on_success:,fixed_hparams:,hparam_space_type:" -- "$@")
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
eval set -- "$OPTS"
JOB_NAME="" # Name of the process and the logs directory.
CONFIG="" # Model and environment hparams.
# NUM_TUNERS: Number of tuning jobs to launch. Each tuning job trains one
# hparam combination at a time, so more tuners means more hparam combinations
# tried in parallel.
NUM_TUNERS=1
# NUM_WORKERS_PER_TUNER: Number of workers to launch for each tuning job. If
# using neural networks, each worker will be 1 replica.
NUM_WORKERS_PER_TUNER=1
# NUM_PS_PER_TUNER: Number of parameter servers to launch for this tuning job.
# Only set this if using neural networks. For 1 worker per tuner, no parameter
# servers are needed. For more than 1 worker per tuner, at least 1 parameter
# server per tuner is needed to store the global model for each tuner.
NUM_PS_PER_TUNER=0
# MAX_NPE: Maximum number of programs executed. Training will quit once this
# threshold is reached. If 0, the threshold is infinite.
MAX_NPE=0
NUM_REPETITIONS=25 # How many times to run this experiment.
STOP_ON_SUCCESS=true # Whether to halt training when a solution is found.
# FIXED_HPARAMS: Hold hparams fixed in the grid search. This reduces the search
# space.
FIXED_HPARAMS=""
# HPARAM_SPACE_TYPE: Specifies the hparam search space. See
# `define_tuner_hparam_space` functions defined in pg_train.py and ga_train.py.
HPARAM_SPACE_TYPE="pg"
# Parse options into variables.
while true; do
case "$1" in
--job_name ) JOB_NAME="$2"; shift; shift ;;
--config ) CONFIG="$2"; shift; shift ;;
--num_tuners ) NUM_TUNERS="$2"; shift; shift ;;
--num_workers_per_tuner ) NUM_WORKERS_PER_TUNER="$2"; shift; shift ;;
--num_ps_per_tuner ) NUM_PS_PER_TUNER="$2"; shift; shift ;;
--max_npe ) MAX_NPE="$2"; shift; shift ;;
--num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
--stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
--fixed_hparams ) FIXED_HPARAMS="$2"; shift; shift ;;
--hparam_space_type ) HPARAM_SPACE_TYPE="$2"; shift; shift ;;
-- ) shift; break ;;
* ) break ;;
esac
done
# Launch jobs.
# TODO: multi-worker RL training
LOGDIR="$MODELS_DIR/$JOB_NAME"
mkdir -p "$LOGDIR"
BIN_DIR="bazel-bin/single_task"
for ((tuner=0;tuner<NUM_TUNERS;tuner+=1)); do
for ((i=0;i<NUM_WORKERS_PER_TUNER;i++)); do
# Expecting tune.par to be built.
echo "$LOGDIR"
$BIN_DIR/tune.par \
--alsologtostderr \
--config="$CONFIG" \
--logdir="$LOGDIR" \
--max_npe="$MAX_NPE" \
--num_repetitions="$NUM_REPETITIONS" \
--stop_on_success="$STOP_ON_SUCCESS" \
--summary_tasks=1 \
--hparam_space="$HPARAM_SPACE_TYPE" \
--fixed_hparams="$FIXED_HPARAMS" \
--tuner_id=$tuner \
--num_tuners=$NUM_TUNERS \
2> "$LOGDIR/tuner_$tuner.task_$i.log" & # Run as subprocess
echo "Launched tuner $tuner, task $i. Logs: $LOGDIR/tuner_$tuner.task_$i.log"
done
done
# Use "pidof tune.par" to find jobs.
# Kill with "pkill tune.par"
"""Utilities specific to this project."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
from six import string_types
#####################
# BF-lang utilities #
#####################
BF_EOS_INT = 0 # Also used as SOS (start of sequence).
BF_EOS_CHAR = TEXT_EOS_CHAR = '_'
BF_LANG_INTS = range(1, 9)
BF_INT_TO_CHAR = [BF_EOS_CHAR, '>', '<', '+', '-', '[', ']', '.', ',']
BF_CHAR_TO_INT = dict([(c, i) for i, c in enumerate(BF_INT_TO_CHAR)])
RewardInfo = namedtuple('RewardInfo', ['episode_rewards', 'input_case',
'correct_output',
'code_output', 'reason', 'input_type',
'output_type'])
class IOType(object):
string = 'string'
integer = 'integer'
boolean = 'boolean'
class IOTuple(tuple):
pass
def flatten(lst):
return [item for row in lst for item in row]
def bf_num_tokens():
# BF tokens plus EOS.
return len(BF_INT_TO_CHAR)
def bf_char2int(bf_char):
"""Convert BF code char to int token."""
return BF_CHAR_TO_INT[bf_char]
def bf_int2char(bf_int):
"""Convert BF int token to code char."""
return BF_INT_TO_CHAR[bf_int]
def bf_tokens_to_string(bf_tokens, truncate=True):
"""Convert token list to code string. Will truncate at EOS token.
Args:
bf_tokens: Python list of ints representing the code string.
truncate: If true, the output string will end at the first EOS token.
If false, the entire token list is converted to string.
Returns:
String representation of the tokens.
Raises:
ValueError: If bf_tokens is not a python list.
"""
if not isinstance(bf_tokens, list):
raise ValueError('Only python list supported here.')
if truncate:
try:
eos_index = bf_tokens.index(BF_EOS_INT)
except ValueError:
eos_index = len(bf_tokens)
else:
eos_index = len(bf_tokens)
return ''.join([BF_INT_TO_CHAR[t] for t in bf_tokens[:eos_index]])
def bf_string_to_tokens(bf_string):
"""Convert string to token list. Will strip and append EOS token."""
tokens = [BF_CHAR_TO_INT[char] for char in bf_string.strip()]
tokens.append(BF_EOS_INT)
return tokens
def tokens_to_text(tokens):
"""Convert token list to human readable text."""
return ''.join(
[TEXT_EOS_CHAR if t == 0 else chr(t - 1 + ord('A')) for t in tokens])
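# Illustrative round trips (a minimal sketch, not part of the original module;
# values follow directly from the BF_INT_TO_CHAR mapping above):
#   >>> bf_string_to_tokens('+-.')
#   [3, 4, 7, 0]
#   >>> bf_tokens_to_string([3, 4, 7, 0])
#   '+-.'
#   >>> tokens_to_text([1, 2, 0])
#   'AB_'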
###################################
# Number representation utilities #
###################################
# https://en.wikipedia.org/wiki/Metric_prefix
si_magnitudes = {
'k': 1e3,
'm': 1e6,
'g': 1e9}
def si_to_int(s):
"""Convert string ending with SI magnitude to int.
Examples: 5K ==> 5000, 12M ==> 12000000.
Args:
s: String in the form 'xx..xP' where x is a digit and P is an SI prefix.
Returns:
Integer equivalent to the string.
"""
if isinstance(s, string_types) and s[-1].lower() in si_magnitudes.keys():
return int(int(s[:-1]) * si_magnitudes[s[-1].lower()])
return int(s)
def int_to_si(n):
"""Convert integer to string with SI magnitude.
`n` will be truncated.
Examples: 5432 ==> 5K, 12345678 ==> 12M
Args:
n: Integer to represent as a string.
Returns:
String representation of `n` containing SI magnitude.
"""
m = abs(n)
sign = -1 if n < 0 else 1
if m < 1e3:
return str(n)
if m < 1e6:
return '{0}K'.format(sign*int(m / 1e3))
if m < 1e9:
return '{0}M'.format(sign*int(m / 1e6))
if m < 1e12:
return '{0}G'.format(sign*int(m / 1e9))
return str(n)
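# Illustrative conversions (a minimal sketch, not part of the original module):
#   >>> si_to_int('5K')
#   5000
#   >>> si_to_int('12m')
#   12000000
#   >>> int_to_si(5432)
#   '5K'
#   >>> int_to_si(-12345678)
#   '-12M'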
"""Language model agent.
Agent outputs code in a sequence just like a language model. Can be trained
as a language model or using RL, or a combination of the two.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
from math import exp
from math import log
import time
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from common import rollout as rollout_lib # brain coder
from common import utils # brain coder
from single_task import misc # brain coder
# Experiments in the ICLR 2018 paper used reduce_sum instead of reduce_mean for
# some losses. We make all losses batch_size independent, and multiply the
# changed losses by 64, which was the fixed batch_size when the experiments
# were run. The loss hyperparameters still match what is reported in the paper.
MAGIC_LOSS_MULTIPLIER = 64
def rshift_time(tensor_2d, fill=misc.BF_EOS_INT):
"""Right shifts a 2D tensor along the time dimension (axis-1)."""
dim_0 = tf.shape(tensor_2d)[0]
fill_tensor = tf.fill([dim_0, 1], fill)
return tf.concat([fill_tensor, tensor_2d[:, :-1]], axis=1)
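# Illustrative behavior (assumed example, would need to be evaluated in a
# session): with fill=0, an input [[1, 2, 3], [4, 5, 6]] becomes
# [[0, 1, 2], [0, 4, 5]]; the last element of each row is dropped and `fill`
# is prepended along the time axis.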
def join(a, b):
# Concat a and b along 0-th dim.
if a is None or len(a) == 0: # pylint: disable=g-explicit-length-test
return b
if b is None or len(b) == 0: # pylint: disable=g-explicit-length-test
return a
return np.concatenate((a, b))
def make_optimizer(kind, lr):
if kind == 'sgd':
return tf.train.GradientDescentOptimizer(lr)
elif kind == 'adam':
return tf.train.AdamOptimizer(lr)
elif kind == 'rmsprop':
return tf.train.RMSPropOptimizer(learning_rate=lr, decay=0.99)
else:
raise ValueError('Optimizer type "%s" not recognized.' % kind)
class LinearWrapper(tf.contrib.rnn.RNNCell):
"""RNNCell wrapper that adds a linear layer to the output."""
def __init__(self, cell, output_size, dtype=tf.float32, suppress_index=None):
self.cell = cell
self._output_size = output_size
self._dtype = dtype
self._suppress_index = suppress_index
self.smallest_float = -2.4e38
def __call__(self, inputs, state, scope=None):
with tf.variable_scope(type(self).__name__):
outputs, state = self.cell(inputs, state, scope=scope)
logits = tf.matmul(
outputs,
tf.get_variable('w_output',
[self.cell.output_size, self.output_size],
dtype=self._dtype))
if self._suppress_index is not None:
# Replace the target index with -inf, so that it never gets selected.
batch_size = tf.shape(logits)[0]
logits = tf.concat(
[logits[:, :self._suppress_index],
tf.fill([batch_size, 1], self.smallest_float),
logits[:, self._suppress_index + 1:]],
axis=1)
return logits, state
@property
def output_size(self):
return self._output_size
@property
def state_size(self):
return self.cell.state_size
def zero_state(self, batch_size, dtype):
return self.cell.zero_state(batch_size, dtype)
UpdateStepResult = namedtuple(
'UpdateStepResult',
['global_step', 'global_npe', 'summaries_list', 'gradients_dict'])
class AttrDict(dict):
"""Dict with attributes as keys.
https://stackoverflow.com/a/14620633
"""
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
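# Illustrative usage of AttrDict (a minimal sketch, not part of the original):
#   >>> d = AttrDict(x=1)
#   >>> d.x, d['x']
#   (1, 1)
#   >>> d.y = 2
#   >>> d['y']
#   2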
class LMAgent(object):
"""Language model agent."""
action_space = misc.bf_num_tokens()
observation_space = misc.bf_num_tokens()
def __init__(self, global_config, task_id=0,
logging_file=None,
experience_replay_file=None,
global_best_reward_fn=None,
found_solution_op=None,
assign_code_solution_fn=None,
program_count=None,
do_iw_summaries=False,
stop_on_success=True,
dtype=tf.float32,
verbose_level=0,
is_local=True):
self.config = config = global_config.agent
self.logging_file = logging_file
self.experience_replay_file = experience_replay_file
self.task_id = task_id
self.verbose_level = verbose_level
self.global_best_reward_fn = global_best_reward_fn
self.found_solution_op = found_solution_op
self.assign_code_solution_fn = assign_code_solution_fn
self.parent_scope_name = tf.get_variable_scope().name
self.dtype = dtype
self.allow_eos_token = config.eos_token
self.stop_on_success = stop_on_success
self.pi_loss_hparam = config.pi_loss_hparam
self.vf_loss_hparam = config.vf_loss_hparam
self.is_local = is_local
self.top_reward = 0.0
self.embeddings_trainable = True
self.no_op = tf.no_op()
self.learning_rate = tf.constant(
config.lr, dtype=dtype, name='learning_rate')
self.initializer = tf.contrib.layers.variance_scaling_initializer(
factor=config.param_init_factor,
mode='FAN_AVG',
uniform=True,
dtype=dtype) # TF's default initializer.
tf.get_variable_scope().set_initializer(self.initializer)
self.a2c = config.ema_baseline_decay == 0
if not self.a2c:
logging.info('Using exponential moving average REINFORCE baselines.')
self.ema_baseline_decay = config.ema_baseline_decay
self.ema_by_len = [0.0] * global_config.timestep_limit
else:
logging.info('Using advantage (a2c) with learned value function.')
self.ema_baseline_decay = 0.0
self.ema_by_len = None
# Top-k
if config.topk and config.topk_loss_hparam:
self.topk_loss_hparam = config.topk_loss_hparam
self.topk_batch_size = config.topk_batch_size
if self.topk_batch_size <= 0:
raise ValueError('topk_batch_size must be a positive integer. Got %s'
% self.topk_batch_size)
self.top_episodes = utils.MaxUniquePriorityQueue(config.topk)
logging.info('Made max-priority-queue with capacity %d',
self.top_episodes.capacity)
else:
self.top_episodes = None
self.topk_loss_hparam = 0.0
logging.info('No max-priority-queue')
# Experience replay.
self.replay_temperature = config.replay_temperature
self.num_replay_per_batch = int(global_config.batch_size * config.alpha)
self.num_on_policy_per_batch = (
global_config.batch_size - self.num_replay_per_batch)
self.replay_alpha = (
self.num_replay_per_batch / float(global_config.batch_size))
logging.info('num_replay_per_batch: %d', self.num_replay_per_batch)
logging.info('num_on_policy_per_batch: %d', self.num_on_policy_per_batch)
logging.info('replay_alpha: %s', self.replay_alpha)
if self.num_replay_per_batch > 0:
# Train with off-policy episodes from replay buffer.
start_time = time.time()
self.experience_replay = utils.RouletteWheel(
unique_mode=True, save_file=experience_replay_file)
logging.info('Took %s sec to load replay buffer from disk.',
int(time.time() - start_time))
logging.info('Replay buffer file location: "%s"',
self.experience_replay.save_file)
else:
# Only train on-policy.
self.experience_replay = None
if program_count is not None:
self.program_count = program_count
self.program_count_add_ph = tf.placeholder(
tf.int64, [], 'program_count_add_ph')
self.program_count_add_op = self.program_count.assign_add(
self.program_count_add_ph)
################################
# RL policy and value networks #
################################
batch_size = global_config.batch_size
logging.info('batch_size: %d', batch_size)
self.policy_cell = LinearWrapper(
tf.contrib.rnn.MultiRNNCell(
[tf.contrib.rnn.BasicLSTMCell(cell_size)
for cell_size in config.policy_lstm_sizes]),
self.action_space,
dtype=dtype,
suppress_index=None if self.allow_eos_token else misc.BF_EOS_INT)
self.value_cell = LinearWrapper(
tf.contrib.rnn.MultiRNNCell(
[tf.contrib.rnn.BasicLSTMCell(cell_size)
for cell_size in config.value_lstm_sizes]),
1,
dtype=dtype)
obs_embedding_scope = 'obs_embed'
with tf.variable_scope(
obs_embedding_scope,
initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)):
obs_embeddings = tf.get_variable(
'embeddings',
[self.observation_space, config.obs_embedding_size],
dtype=dtype, trainable=self.embeddings_trainable)
self.obs_embeddings = obs_embeddings
initial_state = tf.fill([batch_size], misc.BF_EOS_INT)
def loop_fn(loop_time, cell_output, cell_state, loop_state):
"""Function called by tf.nn.raw_rnn to instantiate body of the while_loop.
See https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn for more
information.
When time is 0, and cell_output, cell_state, loop_state are all None,
`loop_fn` will create the initial input, internal cell state, and loop
state. When time > 0, `loop_fn` will operate on previous cell output,
state, and loop state.
Args:
loop_time: A scalar tensor holding the current timestep (zero based
counting).
cell_output: Output of the raw_rnn cell at the current timestep.
cell_state: Cell internal state at the current timestep.
loop_state: Additional loop state. These tensors were returned by the
previous call to `loop_fn`.
Returns:
elements_finished: Bool tensor of shape [batch_size] which marks each
sequence in the batch as being finished or not finished.
next_input: A tensor containing input to be fed into the cell at the
next timestep.
next_cell_state: Cell internal state to be fed into the cell at the
next timestep.
emit_output: Tensor to be added to the TensorArray returned by raw_rnn
as output from the while_loop.
next_loop_state: Additional loop state. These tensors will be fed back
into the next call to `loop_fn` as `loop_state`.
"""
if cell_output is None: # 0th time step.
next_cell_state = self.policy_cell.zero_state(batch_size, dtype)
elements_finished = tf.zeros([batch_size], tf.bool)
output_lengths = tf.ones([batch_size], dtype=tf.int32)
next_input = tf.gather(obs_embeddings, initial_state)
emit_output = None
next_loop_state = (
tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True),
output_lengths,
elements_finished
)
else:
scaled_logits = cell_output * config.softmax_tr # Scale temperature.
prev_chosen, prev_output_lengths, prev_elements_finished = loop_state
next_cell_state = cell_state
chosen_outputs = tf.to_int32(tf.where(
tf.logical_not(prev_elements_finished),
tf.multinomial(logits=scaled_logits, num_samples=1)[:, 0],
tf.zeros([batch_size], dtype=tf.int64)))
elements_finished = tf.logical_or(
tf.equal(chosen_outputs, misc.BF_EOS_INT),
loop_time >= global_config.timestep_limit)
output_lengths = tf.where(
elements_finished,
prev_output_lengths,
# length includes EOS token. empty seq has len 1.
tf.tile(tf.expand_dims(loop_time + 1, 0), [batch_size])
)
next_input = tf.gather(obs_embeddings, chosen_outputs)
emit_output = scaled_logits
next_loop_state = (prev_chosen.write(loop_time - 1, chosen_outputs),
output_lengths,
tf.logical_or(prev_elements_finished,
elements_finished))
return (elements_finished, next_input, next_cell_state, emit_output,
next_loop_state)
with tf.variable_scope('policy'):
(decoder_outputs_ta,
_, # decoder_state
(sampled_output_ta, output_lengths, _)) = tf.nn.raw_rnn(
cell=self.policy_cell,
loop_fn=loop_fn)
policy_logits = tf.transpose(decoder_outputs_ta.stack(), (1, 0, 2),
name='policy_logits')
sampled_tokens = tf.transpose(sampled_output_ta.stack(), (1, 0),
name='sampled_tokens')
# Add SOS to beginning of the sequence.
rshift_sampled_tokens = rshift_time(sampled_tokens, fill=misc.BF_EOS_INT)
# Initial state is 0, 2nd state is first token.
# Note: If value of last state is computed, this will be used as bootstrap.
if self.a2c:
with tf.variable_scope('value'):
value_output, _ = tf.nn.dynamic_rnn(
self.value_cell,
tf.gather(obs_embeddings, rshift_sampled_tokens),
sequence_length=output_lengths,
dtype=dtype)
value = tf.squeeze(value_output, axis=[2])
else:
value = tf.zeros([], dtype=dtype)
# `sampled_batch` holds the tensors used for sampling actions from the agent;
# `given_batch` (below) holds the tensors used for doing gradient updates on
# the agent.
self.sampled_batch = AttrDict(
logits=policy_logits,
value=value,
tokens=sampled_tokens,
episode_lengths=output_lengths,
probs=tf.nn.softmax(policy_logits),
log_probs=tf.nn.log_softmax(policy_logits))
# adjusted_lengths can be less than the full length of each episode.
# Use this to train on only part of an episode (starting from t=0).
self.adjusted_lengths = tf.placeholder(
tf.int32, [None], name='adjusted_lengths')
self.policy_multipliers = tf.placeholder(
dtype,
[None, None],
name='policy_multipliers')
# Empirical value, i.e. discounted sum of observed future rewards from each
# time step in the episode.
self.empirical_values = tf.placeholder(
dtype,
[None, None],
name='empirical_values')
# Off-policy training. Just add supervised loss to the RL loss.
self.off_policy_targets = tf.placeholder(
tf.int32,
[None, None],
name='off_policy_targets')
self.off_policy_target_lengths = tf.placeholder(
tf.int32, [None], name='off_policy_target_lengths')
self.actions = tf.placeholder(tf.int32, [None, None], name='actions')
# Add SOS to beginning of the sequence.
inputs = rshift_time(self.actions, fill=misc.BF_EOS_INT)
with tf.variable_scope('policy', reuse=True):
logits, _ = tf.nn.dynamic_rnn(
self.policy_cell, tf.gather(obs_embeddings, inputs),
sequence_length=self.adjusted_lengths,
dtype=dtype)
if self.a2c:
with tf.variable_scope('value', reuse=True):
value_output, _ = tf.nn.dynamic_rnn(
self.value_cell,
tf.gather(obs_embeddings, inputs),
sequence_length=self.adjusted_lengths,
dtype=dtype)
value2 = tf.squeeze(value_output, axis=[2])
else:
value2 = tf.zeros([], dtype=dtype)
self.given_batch = AttrDict(
logits=logits,
value=value2,
tokens=sampled_tokens,
episode_lengths=self.adjusted_lengths,
probs=tf.nn.softmax(logits),
log_probs=tf.nn.log_softmax(logits))
# Episode masks.
max_episode_length = tf.shape(self.actions)[1]
# range_row shape: [1, max_episode_length]
range_row = tf.expand_dims(tf.range(max_episode_length), 0)
episode_masks = tf.cast(
tf.less(range_row, tf.expand_dims(self.given_batch.episode_lengths, 1)),
dtype=dtype)
episode_masks_3d = tf.expand_dims(episode_masks, 2)
# Length adjusted episodes.
self.a_probs = a_probs = self.given_batch.probs * episode_masks_3d
self.a_log_probs = a_log_probs = (
self.given_batch.log_probs * episode_masks_3d)
self.a_value = a_value = self.given_batch.value * episode_masks
self.a_policy_multipliers = a_policy_multipliers = (
self.policy_multipliers * episode_masks)
if self.a2c:
self.a_empirical_values = a_empirical_values = (
self.empirical_values * episode_masks)
# pi_loss is scalar
acs_onehot = tf.one_hot(self.actions, self.action_space, dtype=dtype)
self.acs_onehot = acs_onehot
chosen_masked_log_probs = acs_onehot * a_log_probs
pi_target = tf.expand_dims(a_policy_multipliers, -1)
pi_loss_per_step = chosen_masked_log_probs * pi_target # Maximize.
self.pi_loss = pi_loss = (
-tf.reduce_mean(tf.reduce_sum(pi_loss_per_step, axis=[1, 2]), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Minimize.
assert len(self.pi_loss.shape) == 0 # pylint: disable=g-explicit-length-test
# shape: [batch_size, time]
self.chosen_log_probs = tf.reduce_sum(chosen_masked_log_probs, axis=2)
self.chosen_probs = tf.reduce_sum(acs_onehot * a_probs, axis=2)
# loss of value function
if self.a2c:
vf_loss_per_step = tf.square(a_value - a_empirical_values)
self.vf_loss = vf_loss = (
tf.reduce_mean(tf.reduce_sum(vf_loss_per_step, axis=1), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Minimize.
assert len(self.vf_loss.shape) == 0 # pylint: disable=g-explicit-length-test
else:
self.vf_loss = vf_loss = 0.0
# Maximize entropy regularizer
self.entropy = entropy = (
-tf.reduce_mean(
tf.reduce_sum(a_probs * a_log_probs, axis=[1, 2]), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Maximize
self.negentropy = -entropy # Minimize negentropy.
assert len(self.negentropy.shape) == 0 # pylint: disable=g-explicit-length-test
# off-policy loss
self.offp_switch = tf.placeholder(dtype, [], name='offp_switch')
if self.top_episodes is not None:
# Add SOS to beginning of the sequence.
offp_inputs = tf.gather(obs_embeddings,
rshift_time(self.off_policy_targets,
fill=misc.BF_EOS_INT))
with tf.variable_scope('policy', reuse=True):
offp_logits, _ = tf.nn.dynamic_rnn(
self.policy_cell, offp_inputs, self.off_policy_target_lengths,
dtype=dtype) # shape: [batch_size, time, action_space]
topk_loss_per_step = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=self.off_policy_targets,
logits=offp_logits,
name='topk_loss_per_logit')
# Take mean over batch dimension so that the loss multiplier strength is
# independent of batch size. Sum over time dimension.
topk_loss = tf.reduce_mean(
tf.reduce_sum(topk_loss_per_step, axis=1), axis=0)
assert len(topk_loss.shape) == 0 # pylint: disable=g-explicit-length-test
self.topk_loss = topk_loss * self.offp_switch
logging.info('Including off policy loss.')
else:
self.topk_loss = topk_loss = 0.0
self.entropy_hparam = tf.constant(
config.entropy_beta, dtype=dtype, name='entropy_beta')
self.pi_loss_term = pi_loss * self.pi_loss_hparam
self.vf_loss_term = vf_loss * self.vf_loss_hparam
self.entropy_loss_term = self.negentropy * self.entropy_hparam
self.topk_loss_term = self.topk_loss_hparam * topk_loss
self.loss = (
self.pi_loss_term
+ self.vf_loss_term
+ self.entropy_loss_term
+ self.topk_loss_term)
params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
self.trainable_variables = params
self.sync_variables = self.trainable_variables
non_embedding_params = [p for p in params
if obs_embedding_scope not in p.name]
self.non_embedding_params = non_embedding_params
self.params = params
if config.regularizer:
logging.info('Adding L2 regularizer with scale %.2f.',
config.regularizer)
self.regularizer = config.regularizer * sum(
tf.nn.l2_loss(w) for w in non_embedding_params)
self.loss += self.regularizer
else:
logging.info('Skipping regularizer.')
self.regularizer = 0.0
# Only build gradients graph for local model.
if self.is_local:
unclipped_grads = tf.gradients(self.loss, params)
self.dense_unclipped_grads = [
tf.convert_to_tensor(g) for g in unclipped_grads]
self.grads, self.global_grad_norm = tf.clip_by_global_norm(
unclipped_grads, config.grad_clip_threshold)
self.gradients_dict = dict(zip(params, self.grads))
self.optimizer = make_optimizer(config.optimizer, self.learning_rate)
self.all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
tf.get_variable_scope().name)
self.do_iw_summaries = do_iw_summaries
if self.do_iw_summaries:
b = None
self.log_iw_replay_ph = tf.placeholder(tf.float32, [b],
'log_iw_replay_ph')
self.log_iw_policy_ph = tf.placeholder(tf.float32, [b],
'log_iw_policy_ph')
self.log_prob_replay_ph = tf.placeholder(tf.float32, [b],
'log_prob_replay_ph')
self.log_prob_policy_ph = tf.placeholder(tf.float32, [b],
'log_prob_policy_ph')
self.log_norm_replay_weights_ph = tf.placeholder(
tf.float32, [b], 'log_norm_replay_weights_ph')
self.iw_summary_op = tf.summary.merge([
tf.summary.histogram('is/log_iw_replay', self.log_iw_replay_ph),
tf.summary.histogram('is/log_iw_policy', self.log_iw_policy_ph),
tf.summary.histogram('is/log_prob_replay', self.log_prob_replay_ph),
tf.summary.histogram('is/log_prob_policy', self.log_prob_policy_ph),
tf.summary.histogram(
'is/log_norm_replay_weights', self.log_norm_replay_weights_ph),
])
def make_summary_ops(self):
"""Construct summary ops for the model."""
# size = number of timesteps across entire batch. Number normalized by size
# will not be affected by the amount of padding at the ends of sequences
# in the batch.
size = tf.cast(
tf.reduce_sum(self.given_batch.episode_lengths), dtype=self.dtype)
offp_size = tf.cast(tf.reduce_sum(self.off_policy_target_lengths),
dtype=self.dtype)
scope_prefix = self.parent_scope_name
def _remove_prefix(prefix, name):
assert name.startswith(prefix)
return name[len(prefix):]
# RL summaries.
self.rl_summary_op = tf.summary.merge(
[tf.summary.scalar('model/policy_loss', self.pi_loss / size),
tf.summary.scalar('model/value_loss', self.vf_loss / size),
tf.summary.scalar('model/topk_loss', self.topk_loss / offp_size),
tf.summary.scalar('model/entropy', self.entropy / size),
tf.summary.scalar('model/loss', self.loss / size),
tf.summary.scalar('model/grad_norm',
tf.global_norm(self.grads)),
tf.summary.scalar('model/unclipped_grad_norm', self.global_grad_norm),
tf.summary.scalar('model/non_embedding_var_norm',
tf.global_norm(self.non_embedding_params)),
tf.summary.scalar('hparams/entropy_beta', self.entropy_hparam),
tf.summary.scalar('hparams/topk_loss_hparam', self.topk_loss_hparam),
tf.summary.scalar('hparams/learning_rate', self.learning_rate),
tf.summary.scalar('model/trainable_var_norm',
tf.global_norm(self.trainable_variables)),
tf.summary.scalar('loss/loss', self.loss),
tf.summary.scalar('loss/entropy', self.entropy_loss_term),
tf.summary.scalar('loss/vf', self.vf_loss_term),
tf.summary.scalar('loss/policy', self.pi_loss_term),
tf.summary.scalar('loss/offp', self.topk_loss_term)] +
[tf.summary.scalar(
'param_norms/' + _remove_prefix(scope_prefix + '/', p.name),
tf.norm(p))
for p in self.params] +
[tf.summary.scalar(
'grad_norms/' + _remove_prefix(scope_prefix + '/', p.name),
tf.norm(g))
for p, g in zip(self.params, self.grads)] +
[tf.summary.scalar(
'unclipped_grad_norms/' + _remove_prefix(scope_prefix + '/',
p.name),
tf.norm(g))
for p, g in zip(self.params, self.dense_unclipped_grads)])
self.text_summary_placeholder = tf.placeholder(tf.string, shape=[])
self.rl_text_summary_op = tf.summary.text('rl',
self.text_summary_placeholder)
def _rl_text_summary(self, session, step, npe, tot_r, num_steps,
input_case, code_output, code, reason):
"""Logs summary about a single episode and creates a text_summary for TB.
Args:
session: tf.Session instance.
step: Global training step.
npe: Number of programs executed so far.
tot_r: Total reward.
num_steps: Number of timesteps in the episode (i.e. code length).
input_case: Inputs for test cases.
code_output: Outputs produced by running the code on the inputs.
code: String representation of the code.
reason: Reason for the reward assigned by the task.
Returns:
Serialized text summary data for tensorboard.
"""
if not input_case:
input_case = ' '
if not code_output:
code_output = ' '
if not code:
code = ' '
text = (
'Tot R: **%.2f**; Len: **%d**; Reason: **%s**\n\n'
'Input: **`%s`**; Output: **`%s`**\n\nCode: **`%s`**'
% (tot_r, num_steps, reason, input_case, code_output, code))
text_summary = session.run(self.rl_text_summary_op,
{self.text_summary_placeholder: text})
logging.info(
'Step %d.\t NPE: %d\t Reason: %s.\t Tot R: %.2f.\t Length: %d. '
'\tInput: %s \tOutput: %s \tProgram: %s',
step, npe, reason, tot_r, num_steps, input_case,
code_output, code)
return text_summary
def _rl_reward_summary(self, total_rewards):
"""Create summary ops that report on episode rewards.
Creates summaries for average, median, max, and min rewards in the batch.
Args:
total_rewards: Tensor of shape [batch_size] containing the total reward
from each episode in the batch.
Returns:
tf.Summary op.
"""
tr = np.asarray(total_rewards)
reward_summary = tf.Summary(value=[
tf.Summary.Value(
tag='reward/avg',
simple_value=np.mean(tr)),
tf.Summary.Value(
tag='reward/med',
simple_value=np.median(tr)),
tf.Summary.Value(
tag='reward/max',
simple_value=np.max(tr)),
tf.Summary.Value(
tag='reward/min',
simple_value=np.min(tr))])
return reward_summary
def _iw_summary(self, session, replay_iw, replay_log_probs,
norm_replay_weights, on_policy_iw,
on_policy_log_probs):
"""Compute summaries for importance weights at a given batch.
Args:
session: tf.Session instance.
replay_iw: Importance weights for episodes from replay buffer.
replay_log_probs: Total log probabilities of the replay episodes under the
current policy.
norm_replay_weights: Normalized replay weights, i.e. values in `replay_iw`
divided by the total weight in the entire replay buffer. Note, this is
also the probability of selecting each episode from the replay buffer
(in a roulette wheel replay buffer).
on_policy_iw: Importance weights for episodes sampled from the current
policy.
on_policy_log_probs: Total log probabilities of the on-policy episodes
under the current policy.
Returns:
Serialized TF summaries. Use a summary writer to write these summaries to
disk.
"""
return session.run(
self.iw_summary_op,
{self.log_iw_replay_ph: np.log(replay_iw),
self.log_iw_policy_ph: np.log(on_policy_iw),
self.log_norm_replay_weights_ph: np.log(norm_replay_weights),
self.log_prob_replay_ph: replay_log_probs,
self.log_prob_policy_ph: on_policy_log_probs})
def _compute_iw(self, policy_log_probs, replay_weights):
"""Compute importance weights for a batch of episodes.
Arguments are iterables of length batch_size.
Args:
policy_log_probs: Log probability of each episode under the current
policy.
replay_weights: Weight of each episode in the replay buffer. 0 for
episodes not sampled from the replay buffer (i.e. sampled from the
policy).
Returns:
Numpy array of shape [batch_size] containing the importance weight for
each episode in the batch.
"""
log_total_replay_weight = log(self.experience_replay.total_weight)
# importance weight
# = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
# = 1 / ((1-a) + a*q/p)
a = float(self.replay_alpha)
a_com = 1.0 - a  # complement of a
importance_weights = np.asarray(
[1.0 / (a_com
+ a * exp((log(replay_weight) - log_total_replay_weight)
- log_p))
if replay_weight > 0 else 1.0 / a_com
for log_p, replay_weight
in zip(policy_log_probs, replay_weights)])
return importance_weights
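# Worked example for _compute_iw (illustrative numbers, not from the original):
# with replay_alpha a = 0.5, an episode whose replay weight is 10% of the total
# buffer weight (q = 0.1) and whose probability under the current policy is
# p = 0.2 gets importance weight 1 / (0.5 + 0.5 * (0.1 / 0.2)) = 4/3, while an
# on-policy episode (replay_weight = 0) gets the constant weight
# 1 / (1 - a) = 2.0.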
def update_step(self, session, rl_batch, train_op, global_step_op,
return_gradients=False):
"""Perform gradient update on the model.
Args:
session: tf.Session instance.
rl_batch: RLBatch instance from data.py. Use DataManager to create a
RLBatch for each call to update_step. RLBatch contains a batch of
tasks.
train_op: A TF op which will perform the gradient update. LMAgent does not
own its training op, so that trainers can do distributed training
and construct a specialized training op.
global_step_op: A TF op which will return the current global step when
run (should not increment it).
return_gradients: If True, the gradients will be saved and returned from
this method call. This is useful for testing.
Returns:
Results from the update step in a UpdateStepResult namedtuple, including
global step, global NPE, serialized summaries, and optionally gradients.
"""
assert self.is_local
# Do update for REINFORCE or REINFORCE + replay buffer.
if self.experience_replay is None:
# Train with on-policy REINFORCE.
# Sample new programs from the policy.
num_programs_from_policy = rl_batch.batch_size
(batch_actions,
batch_values,
episode_lengths) = session.run(
[self.sampled_batch.tokens, self.sampled_batch.value,
self.sampled_batch.episode_lengths])
if episode_lengths.size == 0:
# This should not happen.
logging.warn(
'Shapes:\n'
'batch_actions.shape: %s\n'
'batch_values.shape: %s\n'
'episode_lengths.shape: %s\n',
batch_actions.shape, batch_values.shape, episode_lengths.shape)
# Compute rewards.
code_scores = compute_rewards(
rl_batch, batch_actions, episode_lengths)
code_strings = code_scores.code_strings
batch_tot_r = code_scores.total_rewards
test_cases = code_scores.test_cases
code_outputs = code_scores.code_outputs
reasons = code_scores.reasons
# Process on-policy samples.
batch_targets, batch_returns = process_episodes(
code_scores.batch_rewards, episode_lengths, a2c=self.a2c,
baselines=self.ema_by_len,
batch_values=batch_values)
batch_policy_multipliers = batch_targets
batch_emp_values = batch_returns if self.a2c else [[]]
adjusted_lengths = episode_lengths
if self.top_episodes:
assert len(self.top_episodes) > 0 # pylint: disable=g-explicit-length-test
off_policy_targets = [
item for item, _
in self.top_episodes.random_sample(self.topk_batch_size)]
off_policy_target_lengths = [len(t) for t in off_policy_targets]
off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
dtype=np.int32)
offp_switch = 1
else:
off_policy_targets = [[0]]
off_policy_target_lengths = [1]
offp_switch = 0
fetches = {
'global_step': global_step_op,
'program_count': self.program_count,
'summaries': self.rl_summary_op,
'train_op': train_op,
'gradients': self.gradients_dict if return_gradients else self.no_op}
fetched = session.run(
fetches,
{self.actions: batch_actions,
self.empirical_values: batch_emp_values,
self.policy_multipliers: batch_policy_multipliers,
self.adjusted_lengths: adjusted_lengths,
self.off_policy_targets: off_policy_targets,
self.off_policy_target_lengths: off_policy_target_lengths,
self.offp_switch: offp_switch})
combined_adjusted_lengths = adjusted_lengths
combined_returns = batch_returns
else:
# Train with REINFORCE + off-policy replay buffer by using importance
# sampling.
# Sample new programs from the policy.
# Note: batch size is constant. A full batch will be sampled, but not all
# programs will be executed and added to the replay buffer. Those which
# are not executed will be discarded and not counted.
batch_actions, batch_values, episode_lengths, log_probs = session.run(
[self.sampled_batch.tokens, self.sampled_batch.value,
self.sampled_batch.episode_lengths, self.sampled_batch.log_probs])
if episode_lengths.size == 0:
# This should not happen.
logging.warn(
'Shapes:\n'
'batch_actions.shape: %s\n'
'batch_values.shape: %s\n'
'episode_lengths.shape: %s\n',
batch_actions.shape, batch_values.shape, episode_lengths.shape)
# Sample from the experience replay buffer.
empty_replay_buffer = (
self.experience_replay.is_empty()
if self.experience_replay is not None else True)
num_programs_from_replay_buff = (
self.num_replay_per_batch if not empty_replay_buffer else 0)
num_programs_from_policy = (
rl_batch.batch_size - num_programs_from_replay_buff)
if (not empty_replay_buffer) and num_programs_from_replay_buff:
result = self.experience_replay.sample_many(
num_programs_from_replay_buff)
experience_samples, replay_weights = zip(*result)
(replay_actions,
replay_rewards,
_, # log probs
replay_adjusted_lengths) = zip(*experience_samples)
replay_batch_actions = utils.stack_pad(replay_actions, pad_axes=0,
dtype=np.int32)
# compute log probs for replay samples under current policy
all_replay_log_probs, = session.run(
[self.given_batch.log_probs],
{self.actions: replay_batch_actions,
self.adjusted_lengths: replay_adjusted_lengths})
replay_log_probs = [
np.choose(replay_actions[i], all_replay_log_probs[i, :l].T).sum()
for i, l in enumerate(replay_adjusted_lengths)]
else:
# Replay buffer is empty. Do not sample from it.
replay_actions = None
replay_policy_multipliers = None
replay_adjusted_lengths = None
replay_log_probs = None
replay_weights = None
replay_returns = None
on_policy_weights = [0] * num_programs_from_replay_buff
assert not self.a2c # TODO(danabo): Support A2C with importance sampling.
# Compute rewards.
code_scores = compute_rewards(
rl_batch, batch_actions, episode_lengths,
batch_size=num_programs_from_policy)
code_strings = code_scores.code_strings
batch_tot_r = code_scores.total_rewards
test_cases = code_scores.test_cases
code_outputs = code_scores.code_outputs
reasons = code_scores.reasons
# Process on-policy samples.
p = num_programs_from_policy
batch_targets, batch_returns = process_episodes(
code_scores.batch_rewards, episode_lengths[:p], a2c=False,
baselines=self.ema_by_len)
batch_policy_multipliers = batch_targets
batch_emp_values = [[]]
on_policy_returns = batch_returns
# Process off-policy samples.
if (not empty_replay_buffer) and num_programs_from_replay_buff:
offp_batch_rewards = [
[0.0] * (l - 1) + [r]
for l, r in zip(replay_adjusted_lengths, replay_rewards)]
assert len(offp_batch_rewards) == num_programs_from_replay_buff
assert len(replay_adjusted_lengths) == num_programs_from_replay_buff
replay_batch_targets, replay_returns = process_episodes(
offp_batch_rewards, replay_adjusted_lengths, a2c=False,
baselines=self.ema_by_len)
# Convert 2D array back into ragged 2D list.
replay_policy_multipliers = [
replay_batch_targets[i, :l]
for i, l
in enumerate(
replay_adjusted_lengths[:num_programs_from_replay_buff])]
adjusted_lengths = episode_lengths[:num_programs_from_policy]
if self.top_episodes:
assert len(self.top_episodes) > 0 # pylint: disable=g-explicit-length-test
off_policy_targets = [
item for item, _
in self.top_episodes.random_sample(self.topk_batch_size)]
off_policy_target_lengths = [len(t) for t in off_policy_targets]
off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
dtype=np.int32)
offp_switch = 1
else:
off_policy_targets = [[0]]
off_policy_target_lengths = [1]
offp_switch = 0
# On-policy episodes.
if num_programs_from_policy:
separate_actions = [
batch_actions[i, :l]
for i, l in enumerate(adjusted_lengths)]
chosen_log_probs = [
np.choose(separate_actions[i], log_probs[i, :l].T)
for i, l in enumerate(adjusted_lengths)]
new_experiences = [
(separate_actions[i],
batch_tot_r[i],
chosen_log_probs[i].sum(), l)
for i, l in enumerate(adjusted_lengths)]
on_policy_policy_multipliers = [
batch_policy_multipliers[i, :l]
for i, l in enumerate(adjusted_lengths)]
(on_policy_actions,
_, # rewards
on_policy_log_probs,
on_policy_adjusted_lengths) = zip(*new_experiences)
else:
new_experiences = []
on_policy_policy_multipliers = []
on_policy_actions = []
on_policy_log_probs = []
on_policy_adjusted_lengths = []
if (not empty_replay_buffer) and num_programs_from_replay_buff:
# Look for new experiences in replay buffer. Assign weight if an episode
# is in the buffer.
on_policy_weights = [0] * num_programs_from_policy
for i, cs in enumerate(code_strings):
if self.experience_replay.has_key(cs):
on_policy_weights[i] = self.experience_replay.get_weight(cs)
# Randomly select on-policy or off policy episodes to train on.
combined_actions = join(replay_actions, on_policy_actions)
combined_policy_multipliers = join(
replay_policy_multipliers, on_policy_policy_multipliers)
combined_adjusted_lengths = join(
replay_adjusted_lengths, on_policy_adjusted_lengths)
combined_returns = join(replay_returns, on_policy_returns)
combined_actions = utils.stack_pad(combined_actions, pad_axes=0)
combined_policy_multipliers = utils.stack_pad(combined_policy_multipliers,
pad_axes=0)
# P: total log probability of each episode under the current policy.
combined_on_policy_log_probs = join(replay_log_probs, on_policy_log_probs)
# Q: weight of each episode under the replay buffer (off-policy) distribution.
# Assume weight is zero for all sequences sampled from the policy.
combined_q_weights = join(replay_weights, on_policy_weights)
# Importance adjustment. Naive formulation:
# E_{x~p}[f(x)] ~= 1/N sum_{x~p}(f(x)) ~= 1/N sum_{x~q}(f(x) * p(x)/q(x)).
# p(x) is the policy, and q(x) is the off-policy distribution, i.e. replay
# buffer distribution. Importance weight w(x) = p(x) / q(x).
# Instead of sampling from the replay buffer only, we sample from a
# mixture distribution of the policy and replay buffer.
# We are sampling from the mixture a*q(x) + (1-a)*p(x), where 0 <= a <= 1.
# Thus the importance weight w(x) = p(x) / (a*q(x) + (1-a)*p(x))
# = 1 / ((1-a) + a*q(x)/p(x)) where q(x) is 0 for x sampled from the
# policy.
# Note: a = self.replay_alpha
if empty_replay_buffer:
# The replay buffer is empty.
# Do no gradient update this step. The replay buffer will have stuff in
# it next time.
combined_policy_multipliers *= 0
elif not num_programs_from_replay_buff:
combined_policy_multipliers = np.ones([len(combined_actions), 1],
dtype=np.float32)
else:
# If a < 1 compute importance weights
# importance weight
# = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
# = 1 / ((1-a) + a*q/p)
importance_weights = self._compute_iw(combined_on_policy_log_probs,
combined_q_weights)
if self.config.iw_normalize:
importance_weights *= (
float(rl_batch.batch_size) / importance_weights.sum())
combined_policy_multipliers *= importance_weights.reshape(-1, 1)
# Train on replay batch, top-k MLE.
assert self.program_count is not None
fetches = {
'global_step': global_step_op,
'program_count': self.program_count,
'summaries': self.rl_summary_op,
'train_op': train_op,
'gradients': self.gradients_dict if return_gradients else self.no_op}
fetched = session.run(
fetches,
{self.actions: combined_actions,
self.empirical_values: [[]], # replay_emp_values,
self.policy_multipliers: combined_policy_multipliers,
self.adjusted_lengths: combined_adjusted_lengths,
self.off_policy_targets: off_policy_targets,
self.off_policy_target_lengths: off_policy_target_lengths,
self.offp_switch: offp_switch})
# Add to experience replay buffer.
self.experience_replay.add_many(
objs=new_experiences,
weights=[exp(r / self.replay_temperature) for r in batch_tot_r],
keys=code_strings)
# Update program count.
session.run(
[self.program_count_add_op],
{self.program_count_add_ph: num_programs_from_policy})
# Update EMA baselines on the mini-batch we just trained on.
if not self.a2c:
for i in xrange(rl_batch.batch_size):
episode_length = combined_adjusted_lengths[i]
empirical_returns = combined_returns[i, :episode_length]
for j in xrange(episode_length):
# Update ema_baselines in place.
self.ema_by_len[j] = (
self.ema_baseline_decay * self.ema_by_len[j]
+ (1 - self.ema_baseline_decay) * empirical_returns[j])
global_step = fetched['global_step']
global_npe = fetched['program_count']
core_summaries = fetched['summaries']
summaries_list = [core_summaries]
if num_programs_from_policy:
s_i = 0
text_summary = self._rl_text_summary(
session,
global_step,
global_npe,
batch_tot_r[s_i],
episode_lengths[s_i], test_cases[s_i],
code_outputs[s_i], code_strings[s_i], reasons[s_i])
reward_summary = self._rl_reward_summary(batch_tot_r)
is_best = False
if self.global_best_reward_fn:
# Save best reward.
best_reward = np.max(batch_tot_r)
is_best = self.global_best_reward_fn(session, best_reward)
if self.found_solution_op is not None and 'correct' in reasons:
session.run(self.found_solution_op)
# Save program to disk for record keeping.
if self.stop_on_success:
solutions = [
{'code': code_strings[i], 'reward': batch_tot_r[i],
'npe': global_npe}
for i in xrange(len(reasons)) if reasons[i] == 'correct']
elif is_best:
solutions = [
{'code': code_strings[np.argmax(batch_tot_r)],
'reward': np.max(batch_tot_r),
'npe': global_npe}]
else:
solutions = []
if solutions:
if self.assign_code_solution_fn:
self.assign_code_solution_fn(session, solutions[0]['code'])
with tf.gfile.FastGFile(self.logging_file, 'a') as writer:
for solution_dict in solutions:
writer.write(str(solution_dict) + '\n')
max_i = np.argmax(batch_tot_r)
max_tot_r = batch_tot_r[max_i]
if max_tot_r >= self.top_reward:
self.top_reward = max_tot_r
logging.info('Top code: r=%.2f, \t%s', max_tot_r, code_strings[max_i])
if self.top_episodes is not None:
self.top_episodes.push(
max_tot_r, tuple(batch_actions[max_i, :episode_lengths[max_i]]))
summaries_list += [text_summary, reward_summary]
if self.do_iw_summaries and not empty_replay_buffer:
# prob of replay samples under replay buffer sampling.
norm_replay_weights = [
w / self.experience_replay.total_weight
for w in replay_weights]
replay_iw = self._compute_iw(replay_log_probs, replay_weights)
on_policy_iw = self._compute_iw(on_policy_log_probs, on_policy_weights)
summaries_list.append(
self._iw_summary(
session, replay_iw, replay_log_probs, norm_replay_weights,
on_policy_iw, on_policy_log_probs))
return UpdateStepResult(
global_step=global_step,
global_npe=global_npe,
summaries_list=summaries_list,
gradients_dict=fetched['gradients'])
def io_to_text(io_case, io_type):
if isinstance(io_case, misc.IOTuple):
# If there are many strings, join them with ','.
return ','.join([io_to_text(e, io_type) for e in io_case])
if io_type == misc.IOType.string:
# There is one string. Return it.
return misc.tokens_to_text(io_case)
if (io_type == misc.IOType.integer
or io_type == misc.IOType.boolean):
if len(io_case) == 1:
return str(io_case[0])
return str(io_case)
CodeScoreInfo = namedtuple(
'CodeScoreInfo',
['code_strings', 'batch_rewards', 'total_rewards', 'test_cases',
'code_outputs', 'reasons'])
def compute_rewards(rl_batch, batch_actions, episode_lengths, batch_size=None):
"""Compute rewards for each episode in the batch.
Args:
rl_batch: A data.RLBatch instance. This holds information about the task
each episode is solving, and a reward function for each episode.
batch_actions: Contains batch of episodes. Each sequence of actions will be
converted into a BF program and then scored. A numpy array of shape
[batch_size, max_sequence_length].
episode_lengths: The sequence length of each episode in the batch. Iterable
of length batch_size.
batch_size: (optional) number of programs to score. Use this to limit the
number of programs executed from this batch. For example, when doing
importance sampling some of the on-policy episodes will be discarded
and they should not be executed. `batch_size` can be less than or equal
to the size of the input batch.
Returns:
CodeScoreInfo namedtuple instance. This holds not just the computed rewards,
but additional information computed during code execution which can be used
for debugging and monitoring. This includes: BF code strings, test cases
the code was executed on, code outputs from those test cases, and reasons
for success or failure.
"""
code_strings = [
''.join([misc.bf_int2char(a) for a in action_sequence[:l]])
for action_sequence, l in zip(batch_actions, episode_lengths)]
if batch_size is None:
batch_size = len(code_strings)
else:
assert batch_size <= len(code_strings)
code_strings = code_strings[:batch_size]
if isinstance(rl_batch.reward_fns, (list, tuple)):
# reward_fns is a list of functions, same length as code_strings.
assert len(rl_batch.reward_fns) >= batch_size
r_fn_results = [
rl_batch.reward_fns[i](code_strings[i]) for i in xrange(batch_size)]
else:
# reward_fns is allowed to be one function which processes a batch of code
# strings. This is useful for efficiency and batch level computation.
r_fn_results = rl_batch.reward_fns(code_strings)
# Expecting that r_fn returns a list of rewards. Length of list equals
# length of the code string (including EOS char).
batch_rewards = [r.episode_rewards for r in r_fn_results]
total_rewards = [sum(b) for b in batch_rewards]
test_cases = [io_to_text(r.input_case, r.input_type) for r in r_fn_results]
code_outputs = [io_to_text(r.code_output, r.output_type)
for r in r_fn_results]
reasons = [r.reason for r in r_fn_results]
return CodeScoreInfo(
code_strings=code_strings,
batch_rewards=batch_rewards,
total_rewards=total_rewards,
test_cases=test_cases,
code_outputs=code_outputs,
reasons=reasons)
def process_episodes(
batch_rewards, episode_lengths, a2c=False, baselines=None,
batch_values=None):
"""Compute REINFORCE targets.
REINFORCE here takes the form:
grad_t = grad[log(pi(a_t|c_t))*target_t]
where c_t is context: i.e. RNN state or environment state (or both).
Two types of targets are supported:
1) Advantage actor critic (a2c).
2) Vanilla REINFORCE with baseline.
Args:
batch_rewards: Rewards received in each episode in the batch. A numpy array
of shape [batch_size, max_sequence_length]. Note, these are per-timestep
rewards, not total reward.
episode_lengths: Length of each episode. An iterable of length batch_size.
a2c: A bool. Whether to compute a2c targets (True) or vanilla targets
(False).
baselines: If a2c is False, provide baselines for each timestep. This is a
list (or indexable container) of length max_time. Note: baselines are
shared across all episodes, which is why there is no batch dimension.
It is up to the caller to update baselines accordingly.
batch_values: If a2c is True, provide values computed by a value estimator.
A numpy array of shape [batch_size, max_sequence_length].
Returns:
batch_targets: REINFORCE targets for each episode and timestep. A numpy
array of shape [batch_size, max_sequence_length].
batch_returns: Returns computed for each episode and timestep. This is for
reference, and is not used in the REINFORCE gradient update (but was
used to compute the targets). A numpy array of shape
[batch_size, max_sequence_length].
"""
num_programs = len(batch_rewards)
assert num_programs <= len(episode_lengths)
batch_returns = [None] * num_programs
batch_targets = [None] * num_programs
for i in xrange(num_programs):
episode_length = episode_lengths[i]
assert len(batch_rewards[i]) == episode_length
# Compute target for each timestep.
# If we are computing A2C:
# target_t = advantage_t = R_t - V(c_t)
# where V(c_t) is a learned value function (provided as `values`).
# Otherwise:
# target_t = R_t - baselines[t]
# where `baselines` are provided.
# In practice we use a more generalized formulation of advantage. See docs
# for `discounted_advantage_and_rewards`.
if a2c:
# Compute advantage.
assert batch_values is not None
episode_values = batch_values[i, :episode_length]
episode_rewards = batch_rewards[i]
emp_val, gen_adv = rollout_lib.discounted_advantage_and_rewards(
episode_rewards, episode_values, gamma=1.0, lambda_=1.0)
batch_returns[i] = emp_val
batch_targets[i] = gen_adv
else:
# Compute return for each timestep. See section 3 of
# https://arxiv.org/pdf/1602.01783.pdf
assert baselines is not None
empirical_returns = rollout_lib.discount(batch_rewards[i], gamma=1.0)
targets = [None] * episode_length
for j in xrange(episode_length):
targets[j] = empirical_returns[j] - baselines[j]
batch_returns[i] = empirical_returns
batch_targets[i] = targets
batch_returns = utils.stack_pad(batch_returns, 0)
if num_programs:
batch_targets = utils.stack_pad(batch_targets, 0)
else:
batch_targets = np.array([], dtype=np.float32)
return (batch_targets, batch_returns)
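# Worked example for process_episodes (illustrative, vanilla REINFORCE branch):
# with gamma = 1, batch_rewards = [[1.0, 0.0, 2.0]], episode_lengths = [3], and
# baselines = [1.0, 1.0, 1.0], the empirical returns (sums of current and
# future rewards) are [3.0, 2.0, 2.0], and the targets are
# returns - baselines = [2.0, 1.0, 1.0].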
"""Tests for pg_agent."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import Counter
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from common import utils # brain coder
from single_task import data # brain coder
from single_task import defaults # brain coder
from single_task import misc # brain coder
from single_task import pg_agent as agent_lib # brain coder
from single_task import pg_train # brain coder
# Symmetric mean absolute percentage error (SMAPE).
# https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error
def smape(a, b):
return 2.0 * abs(a - b) / float(a + b)
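# Illustrative value (not part of the original test):
# smape(90, 110) = 2 * 20 / 200 = 0.2, i.e. a 20% symmetric relative difference.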
def onehot(dim, num_dims):
value = np.zeros(num_dims, dtype=np.float32)
value[dim] = 1
return value
def random_sequence(max_length, num_tokens, eos=0):
length = np.random.randint(1, max_length - 1)
return np.append(np.random.randint(1, num_tokens, length), eos)
def repeat_and_pad(v, rep, total_len):
return [v] * rep + [0.0] * (total_len - rep)
class AgentTest(tf.test.TestCase):
def testProcessEpisodes(self):
batch_size = 3
def reward_fn(code_string):
return misc.RewardInfo(
episode_rewards=[float(ord(c)) for c in code_string],
input_case=[],
correct_output=[],
code_output=[],
input_type=misc.IOType.integer,
output_type=misc.IOType.integer,
reason='none')
rl_batch = data.RLBatch(
reward_fns=[reward_fn for _ in range(batch_size)],
batch_size=batch_size,
good_reward=10.0)
batch_actions = np.asarray([
[4, 5, 3, 6, 8, 1, 0, 0],
[1, 2, 3, 4, 0, 0, 0, 0],
[8, 7, 6, 5, 4, 3, 2, 1]], dtype=np.int32)
batch_values = np.asarray([
[0, 1, 2, 1, 0, 1, 1, 0],
[0, 2, 1, 2, 1, 0, 0, 0],
[0, 1, 1, 0, 0, 0, 1, 1]], dtype=np.float32)
episode_lengths = np.asarray([7, 5, 8], dtype=np.int32)
scores = agent_lib.compute_rewards(
rl_batch, batch_actions, episode_lengths)
batch_targets, batch_returns = agent_lib.process_episodes(
scores.batch_rewards, episode_lengths, a2c=True,
batch_values=batch_values)
self.assertEqual(
[[473.0, 428.0, 337.0, 294.0, 201.0, 157.0, 95.0, 0.0],
[305.0, 243.0, 183.0, 140.0, 95.0, 0.0, 0.0, 0.0],
[484.0, 440.0, 394.0, 301.0, 210.0, 165.0, 122.0, 62.0]],
batch_returns.tolist())
self.assertEqual(
[[473.0, 427.0, 335.0, 293.0, 201.0, 156.0, 94.0, 0.0],
[305.0, 241.0, 182.0, 138.0, 94.0, 0.0, 0.0, 0.0],
[484.0, 439.0, 393.0, 301.0, 210.0, 165.0, 121.0, 61.0]],
batch_targets.tolist())
def testVarUpdates(self):
"""Tests that variables get updated as expected.
For the RL update, check that gradients are non-zero and that the global
model gets updated.
"""
config = defaults.default_config_with_updates(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",eos_token=True,optimizer="sgd",lr=1.0)')
lr = config.agent.lr
tf.reset_default_graph()
trainer = pg_train.AsyncTrainer(
config, task_id=0, ps_tasks=0, num_workers=1)
global_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
with tf.Session() as sess:
sess.run(global_init_op) # Initialize global copy.
trainer.initialize(sess)
model = trainer.model
global_vars = sess.run(trainer.global_model.trainable_variables)
local_vars = sess.run(model.trainable_variables)
# Make sure names match.
g_prefix = 'global/'
l_prefix = 'local/'
for g, l in zip(trainer.global_model.trainable_variables,
model.trainable_variables):
self.assertEqual(g.name[len(g_prefix):], l.name[len(l_prefix):])
# Assert that shapes and values are the same between global and local
# models.
for g, l in zip(global_vars, local_vars):
self.assertEqual(g.shape, l.shape)
self.assertTrue(np.array_equal(g, l))
# Make all gradients dense tensors.
for param, grad in model.gradients_dict.items():
if isinstance(grad, tf.IndexedSlices):
# Converts to dense tensor.
model.gradients_dict[param] = tf.multiply(grad, 1.0)
# Perform update.
results = model.update_step(
sess, trainer.data_manager.sample_rl_batch(), trainer.train_op,
trainer.global_step, return_gradients=True)
grads_dict = results.gradients_dict
for grad in grads_dict.values():
self.assertIsNotNone(grad)
self.assertTrue(np.count_nonzero(grad) > 0)
global_update = sess.run(trainer.global_model.trainable_variables)
for tf_var, var_before, var_after in zip(
model.trainable_variables, local_vars, global_update):
# Check that the params were updated.
self.assertTrue(np.allclose(
var_after,
var_before - grads_dict[tf_var] * lr))
# Test that global to local sync works.
sess.run(trainer.sync_op)
global_vars = sess.run(trainer.global_model.trainable_variables)
local_vars = sess.run(model.trainable_variables)
for l, g in zip(local_vars, global_vars):
self.assertTrue(np.allclose(l, g))
def testMonteCarloGradients(self):
"""Test Monte Carlo estimate of REINFORCE gradient.
Test that the Monte Carlo estimate of the REINFORCE gradient is
approximately equal to the true gradient. We compute the true gradient for a
toy environment with a very small action space.
Similar to section 5 of https://arxiv.org/pdf/1505.00521.pdf.
"""
# Test may have different outcome on different machines due to different
# rounding behavior of float arithmetic.
tf.reset_default_graph()
tf.set_random_seed(12345678987654321)
np.random.seed(1294024302)
max_length = 2
num_tokens = misc.bf_num_tokens()
eos = misc.BF_EOS_INT
assert eos == 0
def sequence_iterator(max_length):
"""Iterates through all sequences up to the given length."""
yield [eos]
for a in xrange(1, num_tokens):
if max_length > 1:
for sub_seq in sequence_iterator(max_length - 1):
yield [a] + sub_seq
else:
yield [a]
actions = list(sequence_iterator(max_length))
# This batch contains all possible episodes up to max_length.
actions_batch = utils.stack_pad(actions, 0)
lengths_batch = [len(s) for s in actions]
reward_map = {tuple(a): np.random.randint(-1, 7) for a in actions_batch}
# reward_map = {tuple(a): np.random.normal(3, 1)
# for a in actions_batch} # normal distribution
# reward_map = {tuple(a): 1.0
# for a in actions_batch} # expected reward is 1
n = 100000 # MC sample size.
config = defaults.default_config_with_updates(
'env=c(task="print"),'
'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
'entropy_beta=0.0,topk_loss_hparam=0.0,regularizer=0.0,'
'policy_lstm_sizes=[10],eos_token=True),'
'batch_size='+str(n)+',timestep_limit='+str(max_length))
dtype = tf.float64
trainer = pg_train.AsyncTrainer(
config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
model = trainer.model
actions_ph = model.actions
lengths_ph = model.adjusted_lengths
multipliers_ph = model.policy_multipliers
global_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
with tf.Session() as sess, sess.graph.as_default():
sess.run(global_init_op) # Initialize global copy.
trainer.initialize(sess)
# Compute exact gradients.
# exact_grads = sum(P(a) * grad(log P(a)) * R(a) for a in actions_batch)
true_loss_unnormalized = 0.0
exact_grads = [np.zeros(v.shape) for v in model.trainable_variables]
episode_probs_map = {}
grads_map = {}
for a_idx in xrange(len(actions_batch)):
a = actions_batch[a_idx]
grads_result, probs_result, loss = sess.run(
[model.dense_unclipped_grads, model.chosen_probs, model.loss],
{actions_ph: [a],
lengths_ph: [lengths_batch[a_idx]],
multipliers_ph: [
repeat_and_pad(reward_map[tuple(a)],
lengths_batch[a_idx],
max_length)]})
# Take product over time axis.
episode_probs_result = np.prod(probs_result[0, :lengths_batch[a_idx]])
for i in range(0, len(exact_grads)):
exact_grads[i] += grads_result[i] * episode_probs_result
episode_probs_map[tuple(a)] = episode_probs_result
reward_map[tuple(a)] = reward_map[tuple(a)]
grads_map[tuple(a)] = grads_result
true_loss_unnormalized += loss
      # Normalize the loss. Since each episode is fed into the model one at a
      # time, normalization needs to be done manually.
true_loss = true_loss_unnormalized / float(len(actions_batch))
# Compute Monte Carlo gradients.
      # E_a~P[grad(log P(a)) R(a)] is approximately equal to
      # sum(grad(log P(a)) R(a) for a in actions_sampled_from_P) / n
      # where len(actions_sampled_from_P) == n.
      #
      # In other words, sample from the policy and compute the gradients of the
      # log probs weighted by the returns. This exercises the code in agent.py.
sampled_actions, sampled_lengths = sess.run(
[model.sampled_tokens, model.episode_lengths])
pi_multipliers = [
repeat_and_pad(reward_map[tuple(a)], l, max_length)
for a, l in zip(sampled_actions, sampled_lengths)]
mc_grads_unnormalized, sampled_probs, mc_loss_unnormalized = sess.run(
[model.dense_unclipped_grads, model.chosen_probs, model.loss],
{actions_ph: sampled_actions,
multipliers_ph: pi_multipliers,
lengths_ph: sampled_lengths})
# Loss is already normalized across the minibatch, so no normalization
# is needed.
mc_grads = mc_grads_unnormalized
mc_loss = mc_loss_unnormalized
# Make sure true loss and MC loss are similar.
loss_error = smape(true_loss, mc_loss)
self.assertTrue(loss_error < 0.15, msg='actual: %s' % loss_error)
# Check that probs computed for episodes sampled from the model are the same
# as the recorded true probs.
for i in range(100):
acs = tuple(sampled_actions[i].tolist())
sampled_prob = np.prod(sampled_probs[i, :sampled_lengths[i]])
self.assertTrue(np.isclose(episode_probs_map[acs], sampled_prob))
# Make sure MC estimates of true probs are close.
counter = Counter(tuple(e) for e in sampled_actions)
for acs, count in counter.iteritems():
mc_prob = count / float(len(sampled_actions))
true_prob = episode_probs_map[acs]
error = smape(mc_prob, true_prob)
self.assertTrue(
error < 0.15,
msg='actual: %s; count: %s; mc_prob: %s; true_prob: %s'
% (error, count, mc_prob, true_prob))
# Manually recompute MC gradients and make sure they match MC gradients
# computed in TF.
mc_grads_recompute = [np.zeros(v.shape) for v in model.trainable_variables]
      for i in range(n):
        acs = tuple(sampled_actions[i].tolist())
        for j in range(0, len(mc_grads_recompute)):
          mc_grads_recompute[j] += grads_map[acs][j]
for i in range(0, len(mc_grads_recompute)):
self.assertTrue(np.allclose(mc_grads[i], mc_grads_recompute[i] / n))
# Check angle between gradients as fraction of pi.
for index in range(len(mc_grads)):
v1 = mc_grads[index].reshape(-1)
v2 = exact_grads[index].reshape(-1)
# angle = arccos(v1 . v2 / (|v1|*|v2|))
angle_rad = np.arccos(
np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
logging.info('angle / pi: %s', angle_rad / np.pi)
angle_frac = angle_rad / np.pi
self.assertTrue(angle_frac < 0.02, msg='actual: %s' % angle_frac)
# Check norms.
for index in range(len(mc_grads)):
v1_norm = np.linalg.norm(mc_grads[index].reshape(-1))
v2_norm = np.linalg.norm(exact_grads[index].reshape(-1))
error = smape(v1_norm, v2_norm)
self.assertTrue(error < 0.02, msg='actual: %s' % error)
# Check expected rewards.
# E_a~P[R(a)] approx eq sum(P(a) * R(a) for a in actions)
mc_expected_reward = np.mean(
[reward_map[tuple(a)] for a in sampled_actions])
exact_expected_reward = np.sum(
[episode_probs_map[k] * reward_map[k] for k in reward_map])
error = smape(mc_expected_reward, exact_expected_reward)
      self.assertTrue(error < 0.005, msg='actual: %s' % error)
def testNumericalGradChecking(self):
# Similar to
# http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization.
epsilon = 1e-4
eos = misc.BF_EOS_INT
self.assertEqual(0, eos)
config = defaults.default_config_with_updates(
'env=c(task="print"),'
'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
'entropy_beta=0.0,topk_loss_hparam=0.0,policy_lstm_sizes=[10],'
'eos_token=True),'
'batch_size=64')
dtype = tf.float64
tf.reset_default_graph()
tf.set_random_seed(12345678987654321)
np.random.seed(1294024302)
trainer = pg_train.AsyncTrainer(
config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
model = trainer.model
actions_ph = model.actions
lengths_ph = model.adjusted_lengths
multipliers_ph = model.policy_multipliers
loss = model.pi_loss
global_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
assign_add_placeholders = [None] * len(model.trainable_variables)
assign_add_ops = [None] * len(model.trainable_variables)
param_shapes = [None] * len(model.trainable_variables)
for i, param in enumerate(model.trainable_variables):
param_shapes[i] = param.get_shape().as_list()
assign_add_placeholders[i] = tf.placeholder(dtype,
np.prod(param_shapes[i]))
assign_add_ops[i] = param.assign_add(
tf.reshape(assign_add_placeholders[i], param_shapes[i]))
with tf.Session() as sess:
sess.run(global_init_op) # Initialize global copy.
trainer.initialize(sess)
actions_raw = [random_sequence(10, 9) for _ in xrange(16)]
actions_batch = utils.stack_pad(actions_raw, 0)
lengths_batch = [len(l) for l in actions_raw]
feed = {actions_ph: actions_batch,
multipliers_ph: np.ones_like(actions_batch),
lengths_ph: lengths_batch}
estimated_grads = [None] * len(model.trainable_variables)
for i, param in enumerate(model.trainable_variables):
param_size = np.prod(param_shapes[i])
estimated_grads[i] = np.zeros(param_size, dtype=np.float64)
for index in xrange(param_size):
e = onehot(index, param_size) * epsilon
sess.run(assign_add_ops[i],
{assign_add_placeholders[i]: e})
j_plus = sess.run(loss, feed)
sess.run(assign_add_ops[i],
{assign_add_placeholders[i]: -2 * e})
j_minus = sess.run(loss, feed)
sess.run(assign_add_ops[i],
{assign_add_placeholders[i]: e})
estimated_grads[i][index] = (j_plus - j_minus) / (2 * epsilon)
estimated_grads[i] = estimated_grads[i].reshape(param_shapes[i])
analytic_grads = sess.run(model.dense_unclipped_grads, feed)
for g1, g2 in zip(estimated_grads[1:], analytic_grads[1:]):
        logging.info('mean abs diff (g1 - g2): %s', np.abs(g1 - g2).mean())
self.assertTrue(np.allclose(g1, g2))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""Train RL agent on coding tasks."""
import contextlib
import cPickle
import cProfile
import marshal
import os
import time
from absl import flags
from absl import logging
import tensorflow as tf
# internal session lib import
from single_task import data # brain coder
from single_task import defaults # brain coder
from single_task import pg_agent as agent_lib # brain coder
from single_task import results_lib # brain coder
FLAGS = flags.FLAGS
flags.DEFINE_string(
'master', '',
'URL of the TensorFlow master to use.')
flags.DEFINE_integer(
'ps_tasks', 0,
    'Number of parameter server tasks. Set to 0 only for '
    'single-worker training.')
flags.DEFINE_integer(
'summary_interval', 10,
'How often to write summaries.')
flags.DEFINE_integer(
'summary_tasks', 16,
'If greater than 0 only tasks 0 through summary_tasks - 1 '
'will write summaries. If 0, all tasks will write '
'summaries.')
flags.DEFINE_bool(
'stop_on_success', True,
'If True, training will stop as soon as a solution is found. '
'If False, training will continue indefinitely until another '
'stopping condition is reached.')
flags.DEFINE_bool(
'do_profiling', False,
'If True, cProfile profiler will run and results will be '
'written to logdir. WARNING: Results will not be written if '
    'the code crashes. Make sure it exits successfully.')
flags.DEFINE_integer('model_v', 0, 'Model verbosity level.')
flags.DEFINE_bool(
'delayed_graph_cleanup', True,
'If true, container for n-th run will not be reset until the (n+1)-th run '
'is complete. This greatly reduces the chance that a worker is still '
'using the n-th container when it is cleared.')
def define_tuner_hparam_space(hparam_space_type):
"""Define tunable hparams for grid search."""
if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
# Discrete hparam space is stored as a dict from hparam name to discrete
# values.
hparam_space = {}
if hparam_space_type in ('pg', 'pg-topk', 'is'):
# Add a floating point parameter named learning rate.
hparam_space['lr'] = [1e-5, 1e-4, 1e-3]
hparam_space['entropy_beta'] = [0.005, 0.01, 0.05, 0.10]
else: # 'topk'
# Add a floating point parameter named learning rate.
hparam_space['lr'] = [1e-5, 1e-4, 1e-3]
hparam_space['entropy_beta'] = [0.0, 0.005, 0.01, 0.05, 0.10]
if hparam_space_type in ('topk', 'pg-topk'):
# topk tuning will be enabled.
hparam_space['topk'] = [10]
hparam_space['topk_loss_hparam'] = [1.0, 10.0, 50.0, 200.0]
elif hparam_space_type == 'is':
# importance sampling tuning will be enabled.
hparam_space['replay_temperature'] = [0.25, 0.5, 1.0, 2.0]
hparam_space['alpha'] = [0.5, 0.75, 63/64.]
return hparam_space
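# Illustrative only (not called by the training or tuning code): a minimal
# sketch of how the discrete space returned by `define_tuner_hparam_space`
# could be expanded into a full grid of hparam assignments. The helper name
# `enumerate_hparam_grid` is hypothetical.
def enumerate_hparam_grid(hparam_space):
  """Yields one dict per point in the cartesian product of all hparam values."""
  import itertools  # Local import to keep this sketch self-contained.
  names = sorted(hparam_space)
  for values in itertools.product(*[hparam_space[name] for name in names]):
    yield dict(zip(names, values))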
def write_hparams_to_config(config, hparams, hparam_space_type):
"""Write hparams given by the tuner into the Config object."""
if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
config.agent.lr = hparams.lr
config.agent.entropy_beta = hparams.entropy_beta
if hparam_space_type in ('topk', 'pg-topk'):
# topk tuning will be enabled.
config.agent.topk = hparams.topk
config.agent.topk_loss_hparam = hparams.topk_loss_hparam
elif hparam_space_type == 'is':
# importance sampling tuning will be enabled.
config.agent.replay_temperature = hparams.replay_temperature
config.agent.alpha = hparams.alpha
def make_initialized_variable(value, name, shape=None, dtype=tf.float32):
"""Create a tf.Variable with a constant initializer.
Args:
value: Constant value to initialize the variable with. This is the value
that the variable starts with.
name: Name of the variable in the TF graph.
shape: Shape of the variable. If None, variable will be a scalar.
dtype: Data type of the variable. Should be a TF dtype. Defaults to
tf.float32.
Returns:
tf.Variable instance.
"""
if shape is None:
shape = []
return tf.get_variable(
name=name, shape=shape, initializer=tf.constant_initializer(value),
dtype=dtype, trainable=False)
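# Example (a sketch that mirrors how make_initialized_variable is used in
# AsyncTrainer.__init__ below; the variable names are hypothetical and this
# function is never called):
def _example_counters():
  example_step = make_initialized_variable(0, 'example_step', dtype=tf.int64)
  example_best_reward = make_initialized_variable(
      -10.0, 'example_best_reward', dtype=tf.float64)
  return example_step, example_best_reward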
class AsyncTrainer(object):
"""Manages graph creation and training.
This async trainer creates a global model on the parameter server, and a local
model (for this worker). Gradient updates are sent to the global model, and
the updated weights are synced to the local copy.
"""
def __init__(self, config, task_id, ps_tasks, num_workers, is_chief=True,
summary_writer=None,
dtype=tf.float32,
summary_interval=1,
run_number=0,
logging_dir='/tmp', model_v=0):
self.config = config
self.data_manager = data.DataManager(
config, run_number=run_number,
do_code_simplification=not FLAGS.stop_on_success)
self.task_id = task_id
self.ps_tasks = ps_tasks
self.is_chief = is_chief
if ps_tasks == 0:
assert task_id == 0, 'No parameter servers specified. Expecting 1 task.'
assert num_workers == 1, (
'No parameter servers specified. Expecting 1 task.')
worker_device = '/job:localhost/replica:%d/task:0/cpu:0' % task_id
# worker_device = '/cpu:0'
# ps_device = '/cpu:0'
else:
assert num_workers > 0, 'There must be at least 1 training worker.'
worker_device = '/job:worker/replica:%d/task:0/cpu:0' % task_id
# ps_device = '/job:ps/replica:0/task:0/cpu:0'
logging.info('worker_device: %s', worker_device)
logging_file = os.path.join(
logging_dir, 'solutions_%d.txt' % task_id)
experience_replay_file = os.path.join(
logging_dir, 'replay_buffer_%d.pickle' % task_id)
self.topk_file = os.path.join(
logging_dir, 'topk_buffer_%d.pickle' % task_id)
tf.get_variable_scope().set_use_resource(True)
# global model
with tf.device(tf.train.replica_device_setter(ps_tasks,
ps_device='/job:ps/replica:0',
worker_device=worker_device)):
with tf.variable_scope('global'):
global_model = agent_lib.LMAgent(config, dtype=dtype, is_local=False)
global_params_dict = {p.name: p
for p in global_model.sync_variables}
self.global_model = global_model
self.global_step = make_initialized_variable(
0, 'global_step', dtype=tf.int64)
self.global_best_reward = make_initialized_variable(
-10.0, 'global_best_reward', dtype=tf.float64)
self.is_best_model = make_initialized_variable(
False, 'is_best_model', dtype=tf.bool)
self.reset_is_best_model = self.is_best_model.assign(False)
self.global_best_reward_placeholder = tf.placeholder(
tf.float64, [], name='global_best_reward_placeholder')
self.assign_global_best_reward_op = tf.group(
self.global_best_reward.assign(
self.global_best_reward_placeholder),
self.is_best_model.assign(True))
def assign_global_best_reward_fn(session, reward):
reward = round(reward, 10)
best_reward = round(session.run(self.global_best_reward), 10)
is_best = reward > best_reward
if is_best:
session.run(self.assign_global_best_reward_op,
{self.global_best_reward_placeholder: reward})
return is_best
self.assign_global_best_reward_fn = assign_global_best_reward_fn
# Any worker will set to true when it finds a solution.
self.found_solution_flag = make_initialized_variable(
False, 'found_solution_flag', dtype=tf.bool)
self.found_solution_op = self.found_solution_flag.assign(True)
self.run_number = make_initialized_variable(
run_number, 'run_number', dtype=tf.int32)
# Store a solution when found.
self.code_solution_variable = tf.get_variable(
'code_solution', [], tf.string,
initializer=tf.constant_initializer(''))
self.code_solution_ph = tf.placeholder(
tf.string, [], name='code_solution_ph')
self.code_solution_assign_op = self.code_solution_variable.assign(
self.code_solution_ph)
def assign_code_solution_fn(session, code_solution_string):
session.run(self.code_solution_assign_op,
{self.code_solution_ph: code_solution_string})
self.assign_code_solution_fn = assign_code_solution_fn
# Count all programs sampled from policy. This does not include
# programs sampled from replay buffer.
# This equals NPE (number of programs executed). Only programs sampled
# from the policy need to be executed.
self.program_count = make_initialized_variable(
0, 'program_count', dtype=tf.int64)
# local model
with tf.device(worker_device):
with tf.variable_scope('local'):
self.model = model = agent_lib.LMAgent(
config,
task_id=task_id,
logging_file=logging_file,
experience_replay_file=experience_replay_file,
dtype=dtype,
global_best_reward_fn=self.assign_global_best_reward_fn,
found_solution_op=self.found_solution_op,
assign_code_solution_fn=self.assign_code_solution_fn,
program_count=self.program_count,
stop_on_success=FLAGS.stop_on_success,
verbose_level=model_v)
local_params = model.trainable_variables
local_params_dict = {p.name: p for p in local_params}
# Pull global params to local model.
def _global_to_local_scope(name):
assert name.startswith('global/')
return 'local' + name[6:]
sync_dict = {
local_params_dict[_global_to_local_scope(p_name)]: p
for p_name, p in global_params_dict.items()}
self.sync_op = tf.group(*[v_local.assign(v_global)
for v_local, v_global
in sync_dict.items()])
# Pair local gradients with global params.
grad_var_dict = {
gradient: sync_dict[local_var]
for local_var, gradient in model.gradients_dict.items()}
# local model
model.make_summary_ops() # Don't put summaries under 'local' scope.
with tf.variable_scope('local'):
self.train_op = model.optimizer.apply_gradients(
grad_var_dict.items(), global_step=self.global_step)
self.local_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
tf.get_variable_scope().name))
self.local_step = 0
self.last_summary_time = time.time()
self.summary_interval = summary_interval
self.summary_writer = summary_writer
self.cached_global_step = -1
self.cached_global_npe = -1
logging.info('summary_interval: %d', self.summary_interval)
# Load top-k buffer.
if self.model.top_episodes is not None and tf.gfile.Exists(self.topk_file):
try:
with tf.gfile.FastGFile(self.topk_file, 'r') as f:
self.model.top_episodes = cPickle.loads(f.read())
logging.info(
'Loaded top-k buffer from disk with %d items. Location: "%s"',
len(self.model.top_episodes), self.topk_file)
except (cPickle.UnpicklingError, EOFError) as e:
logging.warn(
'Failed to load existing top-k buffer from disk. Removing bad file.'
'\nLocation: "%s"\nException: %s', self.topk_file, str(e))
tf.gfile.Remove(self.topk_file)
def initialize(self, session):
"""Run initialization ops."""
session.run(self.local_init_op)
session.run(self.sync_op)
self.cached_global_step, self.cached_global_npe = session.run(
[self.global_step, self.program_count])
def update_global_model(self, session):
"""Run an update step.
1) Asynchronously copy global weights to local model.
2) Call into local model's update_step method, which does the following:
a) Sample batch of programs from policy.
b) Compute rewards.
c) Compute gradients and update the global model asynchronously.
3) Write tensorboard summaries to disk.
Args:
session: tf.Session instance.
"""
session.run(self.sync_op) # Copy weights from global to local.
with session.as_default():
result = self.model.update_step(
session, self.data_manager.sample_rl_batch(), self.train_op,
self.global_step)
global_step = result.global_step
global_npe = result.global_npe
summaries = result.summaries_list
self.cached_global_step = global_step
self.cached_global_npe = global_npe
self.local_step += 1
if self.summary_writer and self.local_step % self.summary_interval == 0:
if not isinstance(summaries, (tuple, list)):
summaries = [summaries]
summaries.append(self._local_step_summary())
if self.is_chief:
(global_best_reward,
found_solution_flag,
program_count) = session.run(
[self.global_best_reward,
self.found_solution_flag,
self.program_count])
summaries.append(
tf.Summary(
value=[tf.Summary.Value(
tag='model/best_reward',
simple_value=global_best_reward)]))
summaries.append(
tf.Summary(
value=[tf.Summary.Value(
tag='model/solution_found',
simple_value=int(found_solution_flag))]))
summaries.append(
tf.Summary(
value=[tf.Summary.Value(
tag='model/program_count',
simple_value=program_count)]))
for s in summaries:
self.summary_writer.add_summary(s, global_step)
self.last_summary_time = time.time()
def _local_step_summary(self):
"""Compute number of local steps per time increment."""
dt = time.time() - self.last_summary_time
steps_per_time = self.summary_interval / float(dt)
return tf.Summary(value=[
tf.Summary.Value(
tag='local_step/per_sec',
simple_value=steps_per_time),
tf.Summary.Value(
tag='local_step/step',
simple_value=self.local_step)])
def maybe_save_best_model(self, session, saver, checkpoint_file):
"""Check if this model got the highest reward and save to disk if so."""
if self.is_chief and session.run(self.is_best_model):
logging.info('Saving best model to "%s"', checkpoint_file)
saver.save(session, checkpoint_file)
session.run(self.reset_is_best_model)
def save_replay_buffer(self):
"""Save replay buffer to disk.
Call this periodically so that training can recover if jobs go down.
"""
if self.model.experience_replay is not None:
logging.info('Saving experience replay buffer to "%s".',
self.model.experience_replay.save_file)
self.model.experience_replay.incremental_save(True)
def delete_replay_buffer(self):
"""Delete replay buffer from disk.
Call this at the end of training to clean up. Replay buffer can get very
large.
"""
if self.model.experience_replay is not None:
logging.info('Deleting experience replay buffer at "%s".',
self.model.experience_replay.save_file)
tf.gfile.Remove(self.model.experience_replay.save_file)
def save_topk_buffer(self):
"""Save top-k buffer to disk.
Call this periodically so that training can recover if jobs go down.
"""
if self.model.top_episodes is not None:
logging.info('Saving top-k buffer to "%s".', self.topk_file)
# Overwrite previous data each time.
with tf.gfile.FastGFile(self.topk_file, 'w') as f:
f.write(cPickle.dumps(self.model.top_episodes))
@contextlib.contextmanager
def managed_session(sv, master='', config=None,
start_standard_services=True,
close_summary_writer=True,
max_wait_secs=7200):
# Same as Supervisor.managed_session, but with configurable timeout.
try:
sess = sv.prepare_or_wait_for_session(
master=master, config=config,
start_standard_services=start_standard_services,
max_wait_secs=max_wait_secs)
yield sess
except tf.errors.DeadlineExceededError:
raise
except Exception as e: # pylint: disable=broad-except
sv.request_stop(e)
finally:
try:
# Request all the threads to stop and wait for them to do so. Any
# exception raised by the threads is raised again from stop().
# Passing stop_grace_period_secs is for blocked enqueue/dequeue
# threads which are not checking for `should_stop()`. They
# will be stopped when we close the session further down.
sv.stop(close_summary_writer=close_summary_writer)
finally:
# Close the session to finish up all pending calls. We do not care
# about exceptions raised when closing. This takes care of
# blocked enqueue/dequeue calls.
try:
sess.close()
except Exception: # pylint: disable=broad-except
# Silently ignore exceptions raised by close().
pass
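# Illustrative only (mirrors how `train` uses managed_session below; this
# sketch is never called and `train_step_fn` is a hypothetical callback):
def _example_managed_session_loop(sv, train_step_fn, master=''):
  # Use a 60 second connection timeout instead of the 7200 second default.
  with managed_session(sv, master, max_wait_secs=60) as session:
    while not sv.should_stop():
      train_step_fn(session)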
def train(config, is_chief, tuner=None, run_dir=None, run_number=0,
results_writer=None):
"""Run training loop.
Args:
config: config_lib.Config instance containing global config (agent and env).
is_chief: True if this worker is chief. Chief worker manages writing some
data to disk and initialization of the global model.
tuner: A tuner instance. If not tuning, leave as None.
run_dir: Directory where all data for this run will be written. If None,
run_dir = FLAGS.logdir. Set this argument when doing multiple runs.
run_number: Which run is this.
    results_writer: Manages writing training results to disk. Results are a
dict of metric names and values.
Returns:
The trainer object used to run training updates.
"""
logging.info('Will run asynchronous training.')
if run_dir is None:
run_dir = FLAGS.logdir
train_dir = os.path.join(run_dir, 'train')
best_model_checkpoint = os.path.join(train_dir, 'best.ckpt')
events_dir = '%s/events_%d' % (run_dir, FLAGS.task_id)
logging.info('Events directory: %s', events_dir)
logging_dir = os.path.join(run_dir, 'logs')
if not tf.gfile.Exists(logging_dir):
tf.gfile.MakeDirs(logging_dir)
status_file = os.path.join(logging_dir, 'status.txt')
if FLAGS.summary_tasks and FLAGS.task_id < FLAGS.summary_tasks:
summary_writer = tf.summary.FileWriter(events_dir)
else:
summary_writer = None
# Only profile task 0.
if FLAGS.do_profiling:
logging.info('Profiling enabled')
profiler = cProfile.Profile()
profiler.enable()
else:
profiler = None
trainer = AsyncTrainer(
config, FLAGS.task_id, FLAGS.ps_tasks, FLAGS.num_workers,
is_chief=is_chief,
summary_interval=FLAGS.summary_interval,
summary_writer=summary_writer,
logging_dir=logging_dir,
run_number=run_number,
model_v=FLAGS.model_v)
variables_to_save = [v for v in tf.global_variables()
if v.name.startswith('global')]
global_init_op = tf.variables_initializer(variables_to_save)
saver = tf.train.Saver(variables_to_save)
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
logging.info('Trainable vars:')
for v in var_list:
logging.info(' %s, %s, %s', v.name, v.device, v.get_shape())
logging.info('All vars:')
for v in tf.global_variables():
logging.info(' %s, %s, %s', v.name, v.device, v.get_shape())
def init_fn(unused_sess):
logging.info('No checkpoint found. Initialized global params.')
sv = tf.train.Supervisor(is_chief=is_chief,
logdir=train_dir,
saver=saver,
summary_op=None,
init_op=global_init_op,
init_fn=init_fn,
summary_writer=summary_writer,
ready_op=tf.report_uninitialized_variables(
variables_to_save),
ready_for_local_init_op=None,
global_step=trainer.global_step,
save_model_secs=30,
save_summaries_secs=30)
# Add a thread that periodically checks if this Trial should stop
# based on an early stopping policy.
if tuner:
sv.Loop(60, tuner.check_for_stop, (sv.coord,))
last_replay_save_time = time.time()
global_step = -1
logging.info(
'Starting session. '
      'If this hangs, we\'re most likely waiting to connect '
'to the parameter server. One common cause is that the parameter '
'server DNS name isn\'t resolving yet, or is misspecified.')
should_retry = True
supervisor_deadline_exceeded = False
while should_retry:
try:
with managed_session(
sv, FLAGS.master, max_wait_secs=60) as session, session.as_default():
should_retry = False
do_training = True
try:
trainer.initialize(session)
if session.run(trainer.run_number) != run_number:
            # If we loaded an existing model from disk and the saved run number
            # is different, throw an exception.
raise RuntimeError(
'Expecting to be on run %d, but is actually on run %d. '
'run_dir: "%s"'
% (run_number, session.run(trainer.run_number), run_dir))
global_step = trainer.cached_global_step
logging.info('Starting training at step=%d', global_step)
while do_training:
trainer.update_global_model(session)
if is_chief:
trainer.maybe_save_best_model(
session, saver, best_model_checkpoint)
global_step = trainer.cached_global_step
global_npe = trainer.cached_global_npe
if time.time() - last_replay_save_time >= 30:
trainer.save_replay_buffer()
trainer.save_topk_buffer()
last_replay_save_time = time.time()
# Stopping conditions.
if tuner and tuner.should_trial_stop():
logging.info('Tuner requested early stopping. Finishing.')
do_training = False
if is_chief and FLAGS.stop_on_success:
found_solution = session.run(trainer.found_solution_flag)
if found_solution:
do_training = False
logging.info('Solution found. Finishing.')
if FLAGS.max_npe and global_npe >= FLAGS.max_npe:
# Max NPE (number of programs executed) reached.
logging.info('Max NPE reached. Finishing.')
do_training = False
if sv.should_stop():
logging.info('Supervisor issued stop. Finishing.')
do_training = False
except tf.errors.NotFoundError:
# Catch "Error while reading resource variable".
# The chief worker likely destroyed the container, so do not retry.
logging.info('Caught NotFoundError. Quitting.')
do_training = False
should_retry = False
break
except tf.errors.InternalError as e:
# Catch "Invalid variable reference."
if str(e).startswith('Invalid variable reference.'):
# The chief worker likely destroyed the container, so do not
# retry.
logging.info(
'Caught "InternalError: Invalid variable reference.". '
'Quitting.')
do_training = False
should_retry = False
break
else:
# Pass exception through.
raise
# Exited training loop. Write results to disk.
if is_chief and results_writer:
assert not should_retry
with tf.gfile.FastGFile(status_file, 'w') as f:
f.write('done')
(program_count,
found_solution,
code_solution,
best_reward,
global_step) = session.run(
[trainer.program_count,
trainer.found_solution_flag,
trainer.code_solution_variable,
trainer.global_best_reward,
trainer.global_step])
results_dict = {
'max_npe': FLAGS.max_npe,
'batch_size': config.batch_size,
'max_batches': FLAGS.max_npe // config.batch_size,
'npe': program_count,
'max_global_repetitions': FLAGS.num_repetitions,
'max_local_repetitions': FLAGS.num_repetitions,
'code_solution': code_solution,
'best_reward': best_reward,
'num_batches': global_step,
'found_solution': found_solution,
'task': trainer.data_manager.task_name,
'global_rep': run_number}
logging.info('results_dict: %s', results_dict)
results_writer.append(results_dict)
except tf.errors.AbortedError:
# Catch "Graph handle is not found" error due to preempted jobs.
      logging.info('Caught AbortedError. Retrying.')
should_retry = True
except tf.errors.DeadlineExceededError:
supervisor_deadline_exceeded = True
should_retry = False
if is_chief:
logging.info('This is chief worker. Stopping all workers.')
sv.stop()
if supervisor_deadline_exceeded:
logging.info('Supervisor timed out. Quitting.')
else:
logging.info('Reached %s steps. Worker stopped.', global_step)
# Dump profiling.
"""
How to use profiling data.
Download the profiler dump to your local machine, say to PROF_FILE_PATH.
In a separate script, run something like the following:
import pstats
p = pstats.Stats(PROF_FILE_PATH)
p.strip_dirs().sort_stats('cumtime').print_stats()
This will sort by 'cumtime', which "is the cumulative time spent in this and
all subfunctions (from invocation till exit)."
https://docs.python.org/2/library/profile.html#instant-user-s-manual
""" # pylint: disable=pointless-string-statement
if profiler:
prof_file = os.path.join(run_dir, 'task_%d.prof' % FLAGS.task_id)
logging.info('Done profiling.\nDumping to "%s".', prof_file)
profiler.create_stats()
with tf.gfile.Open(prof_file, 'w') as f:
f.write(marshal.dumps(profiler.stats))
return trainer
def run_training(config=None, tuner=None, logdir=None, trial_name=None,
is_chief=True):
"""Do all training runs.
This is the top level training function for policy gradient based models.
Run this from the main function.
Args:
config: config_lib.Config instance containing global config (agent and
environment hparams). If None, config will be parsed from FLAGS.config.
tuner: A tuner instance. Leave as None if not tuning.
logdir: Parent directory where all data from all runs will be written. If
None, FLAGS.logdir will be used.
trial_name: If tuning, set this to a unique string that identifies this
trial. If `tuner` is not None, this also must be set.
is_chief: True if this worker is the chief.
Returns:
List of results dicts which were written to disk. Each training run gets a
results dict. Results dict contains metrics, i.e. (name, value) pairs which
give information about the training run.
Raises:
ValueError: If results dicts read from disk contain invalid data.
"""
if not config:
# If custom config is not given, get it from flags.
config = defaults.default_config_with_updates(FLAGS.config)
if not logdir:
logdir = FLAGS.logdir
if not tf.gfile.Exists(logdir):
tf.gfile.MakeDirs(logdir)
assert FLAGS.num_repetitions > 0
results = results_lib.Results(logdir)
results_list, _ = results.read_all()
logging.info('Starting experiment. Directory: "%s"', logdir)
if results_list:
if results_list[0]['max_npe'] != FLAGS.max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s'
          % (results_list[0]['max_npe'], FLAGS.max_npe))
if results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. Was %s, '
          'now %s'
          % (results_list[0]['max_global_repetitions'], FLAGS.num_repetitions))
while len(results_list) < FLAGS.num_repetitions:
run_number = len(results_list)
rep_container_name = trial_name if trial_name else 'container'
if FLAGS.num_repetitions > 1:
rep_dir = os.path.join(logdir, 'run_%d' % run_number)
rep_container_name = rep_container_name + '_run_' + str(run_number)
else:
rep_dir = logdir
logging.info(
'Starting repetition %d (%d out of %d)', run_number, run_number + 1,
FLAGS.num_repetitions)
# Train will write result to disk.
with tf.container(rep_container_name):
trainer = train(config, is_chief, tuner, rep_dir, run_number, results)
logging.info('Done training.')
if is_chief:
# Destroy current container immediately (clears current graph).
logging.info('Clearing shared variables.')
tf.Session.reset(FLAGS.master, containers=[rep_container_name])
logging.info('Shared variables cleared.')
# Delete replay buffer on disk.
assert trainer
trainer.delete_replay_buffer()
else:
# Give chief worker time to clean up.
sleep_sec = 30.0
logging.info('Sleeping for %s sec.', sleep_sec)
time.sleep(sleep_sec)
tf.reset_default_graph()
logging.info('Default graph reset.')
# Expecting that train wrote new result to disk before returning.
results_list, _ = results.read_all()
return results_list
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for pg_train.
These tests exercise code paths available through configuration options.
Training will be run for just a few steps with the goal being to check that
nothing crashes.
"""
from absl import flags
import tensorflow as tf
from single_task import defaults # brain coder
from single_task import run # brain coder
FLAGS = flags.FLAGS
class TrainTest(tf.test.TestCase):
def RunTrainingSteps(self, config_string, num_steps=10):
"""Run a few training steps with the given config.
Just check that nothing crashes.
Args:
config_string: Config encoded in a string. See
$REPO_PATH/common/config_lib.py
num_steps: Number of training steps to run. Defaults to 10.
"""
config = defaults.default_config_with_updates(config_string)
FLAGS.master = ''
FLAGS.max_npe = num_steps * config.batch_size
FLAGS.summary_interval = 1
FLAGS.logdir = tf.test.get_temp_dir()
FLAGS.config = config_string
tf.reset_default_graph()
run.main(None)
def testVanillaPolicyGradient(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg"),'
'timestep_limit=90,batch_size=64')
def testVanillaPolicyGradient_VariableLengthSequences(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",eos_token=False),'
'timestep_limit=90,batch_size=64')
def testVanillaActorCritic(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",ema_baseline_decay=0.0),'
'timestep_limit=90,batch_size=64')
def testPolicyGradientWithTopK(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10),'
'timestep_limit=90,batch_size=64')
def testVanillaActorCriticWithTopK(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",ema_baseline_decay=0.0,topk_loss_hparam=1.0,'
'topk=10),'
'timestep_limit=90,batch_size=64')
def testPolicyGradientWithTopK_VariableLengthSequences(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10,eos_token=False),'
'timestep_limit=90,batch_size=64')
def testPolicyGradientWithImportanceSampling(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",alpha=0.5),'
'timestep_limit=90,batch_size=64')
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Results object manages distributed reading and writing of results to disk."""
import ast
from collections import namedtuple
import os
import re
from six.moves import xrange
import tensorflow as tf
ShardStats = namedtuple(
'ShardStats',
['num_local_reps_completed', 'max_local_reps', 'finished'])
def ge_non_zero(a, b):
return a >= b and b > 0
def get_shard_id(file_name):
assert file_name[-4:].lower() == '.txt'
return int(file_name[file_name.rfind('_') + 1: -4])
class Results(object):
"""Manages reading and writing training results to disk asynchronously.
Each worker writes to its own file, so that there are no race conditions when
writing happens. However any worker may read any file, as is the case for
`read_all`. Writes are expected to be atomic so that workers will never
read incomplete data, and this is likely to be the case on Unix systems.
Reading out of date data is fine, as workers calling `read_all` will wait
until data from every worker has been written before proceeding.
"""
file_template = 'experiment_results_{0}.txt'
  search_regex = r'^experiment_results_([0-9]+)\.txt$'
def __init__(self, log_dir, shard_id=0):
"""Construct `Results` instance.
Args:
log_dir: Where to write results files.
shard_id: Unique id for this file (i.e. shard). Each worker that will
be writing results should use a different shard id. If there are
N shards, each shard should be numbered 0 through N-1.
"""
# Use different files for workers so that they can write to disk async.
assert 0 <= shard_id
self.file_name = self.file_template.format(shard_id)
self.log_dir = log_dir
self.results_file = os.path.join(self.log_dir, self.file_name)
def append(self, metrics):
"""Append results to results list on disk."""
with tf.gfile.FastGFile(self.results_file, 'a') as writer:
writer.write(str(metrics) + '\n')
def read_this_shard(self):
"""Read only from this shard."""
return self._read_shard(self.results_file)
def _read_shard(self, results_file):
"""Read only from the given shard file."""
try:
with tf.gfile.FastGFile(results_file, 'r') as reader:
results = [ast.literal_eval(entry) for entry in reader]
except tf.errors.NotFoundError:
# No results written to disk yet. Return empty list.
return []
return results
def _get_max_local_reps(self, shard_results):
"""Get maximum number of repetitions the given shard needs to complete.
    The worker responsible for each shard needs to complete a certain number of
    runs before it finishes. This method returns that number so that we can
    determine which shards are not done yet.
    We assume that each worker includes a 'max_local_repetitions' value in its
    results, which should be the total number of repetitions it needs to run.
Args:
shard_results: Dict mapping metric names to values. This should be read
from a shard on disk.
Returns:
Maximum number of repetitions the given shard needs to complete.
"""
mlrs = [r['max_local_repetitions'] for r in shard_results]
if not mlrs:
return 0
for n in mlrs[1:]:
assert n == mlrs[0], 'Some reps have different max rep.'
return mlrs[0]
def read_all(self, num_shards=None):
"""Read results across all shards, i.e. get global results list.
Args:
num_shards: (optional) specifies total number of shards. If the caller
wants information about which shards are incomplete, provide this
argument (so that shards which have yet to be created are still
counted as incomplete shards). Otherwise, no information about
incomplete shards will be returned.
Returns:
aggregate: Global list of results (across all shards).
shard_stats: List of ShardStats instances, one for each shard. Or None if
`num_shards` is None.
"""
try:
all_children = tf.gfile.ListDirectory(self.log_dir)
except tf.errors.NotFoundError:
if num_shards is None:
return [], None
return [], [[] for _ in xrange(num_shards)]
shard_ids = {
get_shard_id(fname): fname
for fname in all_children if re.search(self.search_regex, fname)}
if num_shards is None:
aggregate = []
shard_stats = None
for results_file in shard_ids.values():
aggregate.extend(self._read_shard(
os.path.join(self.log_dir, results_file)))
else:
results_per_shard = [None] * num_shards
for shard_id in xrange(num_shards):
if shard_id in shard_ids:
results_file = shard_ids[shard_id]
results_per_shard[shard_id] = self._read_shard(
os.path.join(self.log_dir, results_file))
else:
results_per_shard[shard_id] = []
# Compute shard stats.
shard_stats = []
for shard_results in results_per_shard:
max_local_reps = self._get_max_local_reps(shard_results)
shard_stats.append(ShardStats(
num_local_reps_completed=len(shard_results),
max_local_reps=max_local_reps,
finished=ge_non_zero(len(shard_results), max_local_reps)))
# Compute aggregate.
aggregate = [
r for shard_results in results_per_shard for r in shard_results]
return aggregate, shard_stats
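# Minimal usage sketch (hypothetical helper, not used elsewhere in this
# package): decide whether every one of `num_shards` workers has finished all
# of its repetitions, using the ShardStats returned by `read_all`.
def all_shards_finished(results, num_shards):
  """Returns True if every shard reports `finished` (see ShardStats above)."""
  _, shard_stats = results.read_all(num_shards=num_shards)
  # If the log directory does not exist yet, read_all returns empty lists in
  # place of ShardStats; treat those shards as unfinished.
  return all(getattr(stats, 'finished', False) for stats in shard_stats)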
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for results_lib."""
import contextlib
import os
import shutil
import tempfile
from six.moves import xrange
import tensorflow as tf
from single_task import results_lib # brain coder
@contextlib.contextmanager
def temporary_directory(suffix='', prefix='tmp', base_path=None):
"""A context manager to create a temporary directory and clean up on exit.
The parameters are the same ones expected by tempfile.mkdtemp.
The directory will be securely and atomically created.
Everything under it will be removed when exiting the context.
Args:
suffix: optional suffix.
    prefix: optional prefix.
base_path: the base path under which to create the temporary directory.
Yields:
The absolute path of the new temporary directory.
"""
temp_dir_path = tempfile.mkdtemp(suffix, prefix, base_path)
try:
yield temp_dir_path
finally:
try:
shutil.rmtree(temp_dir_path)
except OSError as e:
if e.message == 'Cannot call rmtree on a symbolic link':
# Interesting synthetic exception made up by shutil.rmtree.
# Means we received a symlink from mkdtemp.
# Also means must clean up the symlink instead.
os.unlink(temp_dir_path)
else:
raise
def freeze(dictionary):
"""Convert dict to hashable frozenset."""
return frozenset(dictionary.iteritems())
class ResultsLibTest(tf.test.TestCase):
def testResults(self):
with temporary_directory() as logdir:
results_obj = results_lib.Results(logdir)
self.assertEqual(results_obj.read_this_shard(), [])
results_obj.append(
{'foo': 1.5, 'bar': 2.5, 'baz': 0})
results_obj.append(
{'foo': 5.5, 'bar': -1, 'baz': 2})
self.assertEqual(
results_obj.read_this_shard(),
[{'foo': 1.5, 'bar': 2.5, 'baz': 0},
{'foo': 5.5, 'bar': -1, 'baz': 2}])
def testShardedResults(self):
with temporary_directory() as logdir:
n = 4 # Number of shards.
results_objs = [
results_lib.Results(logdir, shard_id=i) for i in xrange(n)]
for i, robj in enumerate(results_objs):
robj.append({'foo': i, 'bar': 1 + i * 2})
results_list, _ = results_objs[0].read_all()
# Check results. Order does not matter here.
self.assertEqual(
set(freeze(r) for r in results_list),
set(freeze({'foo': i, 'bar': 1 + i * 2}) for i in xrange(n)))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""Run training.
Choose training algorithm and task(s) and follow these examples.
Run synchronous policy gradient training locally:
CONFIG="agent=c(algorithm='pg'),env=c(task='reverse')"
OUT_DIR="/tmp/bf_pg_local"
rm -rf $OUT_DIR
bazel run -c opt single_task:run -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe=0 \
--logdir="$OUT_DIR" \
--summary_interval=1 \
--model_v=0
learning/brain/tensorboard/tensorboard.sh --port 12345 --logdir "$OUT_DIR"
Run genetic algorithm locally:
CONFIG="agent=c(algorithm='ga'),env=c(task='reverse')"
OUT_DIR="/tmp/bf_ga_local"
rm -rf $OUT_DIR
bazel run -c opt single_task:run -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe=0 \
--logdir="$OUT_DIR"
Run uniform random search locally:
CONFIG="agent=c(algorithm='rand'),env=c(task='reverse')"
OUT_DIR="/tmp/bf_rand_local"
rm -rf $OUT_DIR
bazel run -c opt single_task:run -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe=0 \
--logdir="$OUT_DIR"
"""
from absl import app
from absl import flags
from absl import logging
from single_task import defaults # brain coder
from single_task import ga_train # brain coder
from single_task import pg_train # brain coder
FLAGS = flags.FLAGS
flags.DEFINE_string('config', '', 'Configuration.')
flags.DEFINE_string(
'logdir', None, 'Absolute path where to write results.')
flags.DEFINE_integer('task_id', 0, 'ID for this worker.')
flags.DEFINE_integer('num_workers', 1, 'How many workers there are.')
flags.DEFINE_integer(
'max_npe', 0,
'NPE = number of programs executed. Maximum number of programs to execute '
'in each run. Training will complete when this threshold is reached. Set '
'to 0 for unlimited training.')
flags.DEFINE_integer(
'num_repetitions', 1,
'Number of times the same experiment will be run (globally across all '
'workers). Each run is independent.')
flags.DEFINE_string(
'log_level', 'INFO',
'The threshold for what messages will be logged. One of DEBUG, INFO, WARN, '
'ERROR, or FATAL.')
# To register an algorithm:
# 1) Add dependency in the BUILD file to this build rule.
# 2) Import the algorithm's module at the top of this file.
# 3) Add a new entry in the following dict. The key is the algorithm name
# (used to select the algorithm in the config). The value is the module
# defining the expected functions for training and tuning. See the docstring
# for `get_namespace` for further details.
ALGORITHM_REGISTRATION = {
'pg': pg_train,
'ga': ga_train,
'rand': ga_train,
}
def get_namespace(config_string):
"""Get namespace for the selected algorithm.
Users who want to add additional algorithm types should modify this function.
The algorithm's namespace should contain the following functions:
run_training: Run the main training loop.
define_tuner_hparam_space: Return the hparam tuning space for the algo.
write_hparams_to_config: Helper for tuning. Write hparams chosen for tuning
to the Config object.
Look at pg_train.py and ga_train.py for function signatures and
implementations.
Args:
config_string: String representation of a Config object. This will get
parsed into a Config in order to determine what algorithm to use.
Returns:
algorithm_namespace: The module corresponding to the algorithm given in the
config.
config: The Config object resulting from parsing `config_string`.
Raises:
ValueError: If config.agent.algorithm is not one of the registered
algorithms.
"""
config = defaults.default_config_with_updates(config_string)
if config.agent.algorithm not in ALGORITHM_REGISTRATION:
raise ValueError('Unknown algorithm type "%s"' % (config.agent.algorithm,))
else:
return ALGORITHM_REGISTRATION[config.agent.algorithm], config
def main(argv):
del argv # Unused.
logging.set_verbosity(FLAGS.log_level)
flags.mark_flag_as_required('logdir')
if FLAGS.num_workers <= 0:
raise ValueError('num_workers flag must be greater than 0.')
if FLAGS.task_id < 0:
raise ValueError('task_id flag must be greater than or equal to 0.')
if FLAGS.task_id >= FLAGS.num_workers:
raise ValueError(
'task_id flag must be strictly less than num_workers flag.')
ns, _ = get_namespace(FLAGS.config)
ns.run_training(is_chief=FLAGS.task_id == 0)
if __name__ == '__main__':
app.run(main)
#!/usr/bin/env python
from __future__ import print_function
r"""This script can launch any eval experiments from the paper.
This is a script. Run with python, not bazel.
Usage:
./single_task/run_eval_tasks.py \
--exp EXP --desc DESC [--tuning_tasks] [--iclr_tasks] [--task TASK] \
[--tasks TASK1 TASK2 ...]
where EXP is one of the keys in `experiments`,
and DESC is a string description of the set of experiments (such as "v0")
Set only one of these flags:
--tuning_tasks flag only runs tuning tasks.
--iclr_tasks flag only runs the tasks included in the paper.
--regression_tests flag runs tasks which function as regression tests.
--task flag manually selects a single task to run.
--tasks flag takes a custom list of tasks.
Other flags:
  --reps N specifies N repetitions per experiment. Default is 25.
--training_replicas R specifies that R workers will be launched to train one
task (for neural network algorithms). These workers will update a global
model stored on a parameter server. Defaults to 1. If R > 1, a parameter
server will also be launched.
Run everything:
exps=( pg-20M pg-topk-20M topk-20M ga-20M rand-20M )
BIN_DIR="single_task"
for exp in "${exps[@]}"
do
./$BIN_DIR/run_eval_tasks.py \
--exp "$exp" --iclr_tasks
done
"""
import argparse
from collections import namedtuple
import subprocess
S = namedtuple('S', ['length'])
default_length = 100
iclr_tasks = [
'reverse', 'remove-char', 'count-char', 'add', 'bool-logic', 'print-hello',
'echo-twice', 'echo-thrice', 'copy-reverse', 'zero-cascade', 'cascade',
'shift-left', 'shift-right', 'riffle', 'unriffle', 'middle-char',
'remove-last', 'remove-last-two', 'echo-alternating', 'echo-half', 'length',
'echo-second-seq', 'echo-nth-seq', 'substring', 'divide-2', 'dedup']
regression_test_tasks = ['reverse', 'test-hill-climb']
E = namedtuple(
'E',
['name', 'method_type', 'config', 'simplify', 'batch_size', 'max_npe'])
def make_experiment_settings(name, **kwargs):
# Unpack experiment info from name.
def split_last(string, char):
i = string.rindex(char)
return string[:i], string[i+1:]
def si_to_int(si_string):
return int(
si_string.upper().replace('K', '0'*3).replace('M', '0'*6)
.replace('G', '0'*9))
method_type, max_npe = split_last(name, '-')
assert method_type
assert max_npe
return E(
name=name, method_type=method_type, max_npe=si_to_int(max_npe), **kwargs)
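# For example, make_experiment_settings('pg-topk-20M', config='',
# simplify=False, batch_size=64) yields method_type='pg-topk' and
# max_npe=20000000: the name suffix is parsed with SI-style units (K, M, G).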
experiments_set = {
make_experiment_settings(
'pg-20M',
config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.0,topk=0,'
'pi_loss_hparam=1.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'pg-topk-20M',
config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=50.0,topk=10,'
'pi_loss_hparam=1.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'topk-20M',
config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
'pi_loss_hparam=0.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'topk-0ent-20M',
config='entropy_beta=0.000,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
'pi_loss_hparam=0.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'ga-20M',
config='crossover_rate=0.95,mutation_rate=0.15',
simplify=False,
batch_size=100), # Population size.
make_experiment_settings(
'rand-20M',
config='',
simplify=False,
batch_size=1),
make_experiment_settings(
'simpl-500M',
config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.5,topk=10,'
'pi_loss_hparam=1.0,alpha=0.0',
simplify=True,
batch_size=64),
}
experiments = {e.name: e for e in experiments_set}
# pylint: disable=redefined-outer-name
def parse_args(extra_args=()):
"""Parse arguments and extract task and experiment info."""
parser = argparse.ArgumentParser(description='Run all eval tasks.')
parser.add_argument('--exp', required=True)
parser.add_argument('--tuning_tasks', action='store_true')
parser.add_argument('--iclr_tasks', action='store_true')
parser.add_argument('--regression_tests', action='store_true')
parser.add_argument('--desc', default='v0')
parser.add_argument('--reps', default=25)
parser.add_argument('--task')
parser.add_argument('--tasks', nargs='+')
for arg_string, default in extra_args:
parser.add_argument(arg_string, default=default)
args = parser.parse_args()
print('Running experiment: %s' % (args.exp,))
if args.desc:
print('Extra description: "%s"' % (args.desc,))
if args.exp not in experiments:
raise ValueError('Experiment name is not valid')
experiment_name = args.exp
experiment_settings = experiments[experiment_name]
assert experiment_settings.name == experiment_name
if args.tasks:
print('Launching tasks from args: %s' % (args.tasks,))
tasks = {t: S(length=default_length) for t in args.tasks}
elif args.task:
print('Launching single task "%s"' % args.task)
tasks = {args.task: S(length=default_length)}
elif args.tuning_tasks:
print('Only running tuning tasks')
tasks = {name: S(length=default_length)
for name in ['reverse-tune', 'remove-char-tune']}
elif args.iclr_tasks:
print('Running eval tasks from ICLR paper.')
tasks = {name: S(length=default_length) for name in iclr_tasks}
elif args.regression_tests:
tasks = {name: S(length=default_length) for name in regression_test_tasks}
print('Tasks: %s' % tasks.keys())
print('reps = %d' % (int(args.reps),))
return args, tasks, experiment_settings
def run(command_string):
subprocess.call(command_string, shell=True)
if __name__ == '__main__':
LAUNCH_TRAINING_COMMAND = 'single_task/launch_training.sh'
COMPILE_COMMAND = 'bazel build -c opt single_task:run.par'
args, tasks, experiment_settings = parse_args(
extra_args=(('--training_replicas', 1),))
if experiment_settings.method_type in (
'pg', 'pg-topk', 'topk', 'topk-0ent', 'simpl'):
# Runs PG and TopK.
def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
batch_size, do_simplify, custom_config_str):
"""Constructs terminal command for launching NN based algorithms.
The arguments to this function will be used to create config for the
experiment.
Args:
job_name: Name of the job to launch. Should uniquely identify this
experiment run.
task: Name of the coding task to solve.
max_npe: Maximum number of programs executed. An integer.
num_reps: Number of times to run the experiment. An integer.
code_length: Maximum allowed length of synthesized code.
batch_size: Minibatch size for gradient descent.
do_simplify: Whether to run the experiment in code simplification mode.
A bool.
custom_config_str: Additional config for the model config string.
Returns:
The terminal command that launches the specified experiment.
"""
config = """
env=c(task='{0}',correct_syntax=False),
agent=c(
algorithm='pg',
policy_lstm_sizes=[35,35],value_lstm_sizes=[35,35],
grad_clip_threshold=50.0,param_init_factor=0.5,regularizer=0.0,
softmax_tr=1.0,optimizer='rmsprop',ema_baseline_decay=0.99,
eos_token={3},{4}),
timestep_limit={1},batch_size={2}
""".replace(' ', '').replace('\n', '').format(
task, code_length, batch_size, do_simplify, custom_config_str)
num_ps = 0 if args.training_replicas == 1 else 1
return (
r'{0} --job_name={1} --config="{2}" --max_npe={3} '
'--num_repetitions={4} --num_workers={5} --num_ps={6} '
'--stop_on_success={7}'
.format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
args.training_replicas, num_ps, str(not do_simplify).lower()))
else:
# Runs GA and Rand.
assert experiment_settings.method_type in ('ga', 'rand')
def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
batch_size, do_simplify, custom_config_str):
"""Constructs terminal command for launching GA or uniform random search.
The arguments to this function will be used to create config for the
experiment.
Args:
job_name: Name of the job to launch. Should uniquely identify this
experiment run.
task: Name of the coding task to solve.
max_npe: Maximum number of programs executed. An integer.
num_reps: Number of times to run the experiment. An integer.
code_length: Maximum allowed length of synthesized code.
        batch_size: Number of programs per batch (the population size for GA).
do_simplify: Whether to run the experiment in code simplification mode.
A bool.
custom_config_str: Additional config for the model config string.
Returns:
The terminal command that launches the specified experiment.
"""
assert not do_simplify
if custom_config_str:
custom_config_str = ',' + custom_config_str
config = """
env=c(task='{0}',correct_syntax=False),
agent=c(
algorithm='{4}'
{3}),
timestep_limit={1},batch_size={2}
""".replace(' ', '').replace('\n', '').format(
task, code_length, batch_size, custom_config_str,
experiment_settings.method_type)
num_workers = num_reps # Do each rep in parallel.
return (
r'{0} --job_name={1} --config="{2}" --max_npe={3} '
'--num_repetitions={4} --num_workers={5} --num_ps={6} '
'--stop_on_success={7}'
.format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
num_workers, 0, str(not do_simplify).lower()))
print('Compiling...')
run(COMPILE_COMMAND)
print('Launching %d coding tasks...' % len(tasks))
for task, task_settings in tasks.iteritems():
name = 'bf_rl_iclr'
desc = '{0}.{1}_{2}'.format(args.desc, experiment_settings.name, task)
job_name = '{}.{}'.format(name, desc)
print('Job name: %s' % job_name)
reps = int(args.reps) if not experiment_settings.simplify else 1
run_cmd = make_run_cmd(
job_name, task, experiment_settings.max_npe, reps,
task_settings.length, experiment_settings.batch_size,
experiment_settings.simplify,
experiment_settings.config)
print('Running command:\n' + run_cmd)
run(run_cmd)
print('Done.')
# pylint: enable=redefined-outer-name
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tasks that test correctness of algorithms."""
from six.moves import xrange
from common import reward as reward_lib # brain coder
from single_task import misc # brain coder
class BasicTaskManager(object):
"""Wraps a generic reward function."""
def __init__(self, reward_fn):
self.reward_fn = reward_fn
self.good_reward = 1.0
def _score_string(self, string):
actions = misc.bf_string_to_tokens(string)
reward, correct = self.reward_fn(actions)
return misc.RewardInfo(
episode_rewards=[0.0] * (len(string) - 1) + [reward],
input_case=None,
correct_output=None,
code_output=actions,
input_type=None,
output_type=misc.IOType.integer,
reason='correct' if correct else 'wrong')
def rl_batch(self, batch_size):
reward_fns = [self._score_string] * batch_size
return reward_fns
class Trie(object):
"""Trie for sequences."""
EOS = ()
def __init__(self):
self.trie = {}
def insert(self, sequence):
d = self.trie
for e in sequence:
if e not in d:
d[e] = {}
d = d[e]
d[self.EOS] = True # Terminate sequence.
def prefix_match(self, sequence):
"""Return prefix of `sequence` which exists in the trie."""
d = self.trie
index = 0
for i, e in enumerate(sequence + [self.EOS]):
index = i
if e in d:
d = d[e]
if e == self.EOS:
return sequence, True
else:
break
return sequence[:index], False
def next_choices(self, sequence):
d = self.trie
for e in sequence:
if e in d:
d = d[e]
else:
raise ValueError('Sequence not a prefix: %s' % (sequence,))
return d.keys()
class HillClimbingTask(object):
"""Simple task that tests reward hill climbing ability.
  There is a set of paths (sequences of tokens) that are rewarded. The total
  reward for a path is proportional to its length, so the longest path is the
  target; shorter paths can be dead ends.
"""
def __init__(self):
    # Paths are sequences of sub-sequences. Here we form unique sub-sequences
    # out of 3 arbitrary ints. We use sub-sequences instead of single entities
    # to make the task harder by making the episodes last longer, i.e. there is
    # more for the agent to remember.
a = (1, 2, 3)
b = (4, 5, 6)
c = (7, 8, 7)
d = (6, 5, 4)
e = (3, 2, 1)
f = (8, 5, 1)
g = (6, 4, 2)
h = (1, 8, 3)
self.paths = Trie()
self.paths.insert([a, b, h])
self.paths.insert([a, b, c, d, e, f, g, h])
self.paths.insert([a, b, c, d, e, b, a])
self.paths.insert([a, b, g, h])
self.paths.insert([a, e, f, g])
self.correct_sequence = misc.flatten([a, b, c, d, e, f, g, h])
def distance_fn(a, b):
len_diff = abs(len(a) - len(b))
return sum(reward_lib.mod_abs_diff(ai - 1, bi - 1, 8)
for ai, bi in zip(a, b)) + len_diff * 4 # 8 / 2 = 4
self.distance_fn = distance_fn
def __call__(self, actions):
# Compute reward for action sequence.
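    # Actions are chunked into sub-sequences of 3 tokens and matched against
    # the stored paths. The reward is the number of matched sub-sequences,
    # plus a fraction in [0, 1) for progress towards the closest valid next
    # sub-sequence (or minus up to 1 when a completed path is overshot).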
actions = [a for a in actions if a > 0]
sequence = [tuple(actions[i: i + 3]) for i in xrange(0, len(actions), 3)]
prefix, complete = self.paths.prefix_match(sequence)
if complete:
return float(len(prefix)), actions == self.correct_sequence
if len(prefix) == len(sequence):
return float(len(prefix)), False
next_pred = sequence[len(prefix)]
choices = self.paths.next_choices(prefix)
if choices == [()]:
return (len(prefix) - len(next_pred) / 3.0), False
min_dist = min(self.distance_fn(c, next_pred) for c in choices)
    # +1 reward for each correct element in the sequence, plus a fraction
    # towards the closest next element.
# Maximum distance possible is num_actions * base / 2 = 3 * 8 / 2 = 12
return (len(prefix) + (1 - min_dist / 12.0)), False
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for test_tasks."""
import numpy as np
import tensorflow as tf
from single_task import misc # brain coder
from single_task import test_tasks # brain coder
def get_reward(reward_fn, candidate):
return sum(reward_fn(misc.bf_tokens_to_string(candidate)).episode_rewards)
class TestTasksTest(tf.test.TestCase):
def testHillClimbingTask(self):
task = test_tasks.BasicTaskManager(test_tasks.HillClimbingTask())
reward_fns = task.rl_batch(1)
reward_fn = reward_fns[0]
self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 0]), 8 / 12.))
self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 2, 0]), 11 / 12.))
self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 3, 0]), 1.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 2, 0]), 1. + 8 / 12.))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 0]), 2.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 0]), 3.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 0]), 3.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 1, 0]),
3.0 - 4 / 12.))
self.assertTrue(
np.isclose(
get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 1, 1, 1, 1, 0]),
2.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 3, 0]),
3.0 + 1 / 12.))
self.assertTrue(
np.isclose(
get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1,
8, 5, 1, 6, 4, 2, 1, 8, 3, 0]),
8.0))
self.assertTrue(
np.isclose(
get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1,
8, 5, 1, 6, 4, 2, 1, 8, 3, 1, 1, 0]),
8.0 - 8 / 12.))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3,
2, 1, 8, 5, 1, 6, 4, 2, 1, 8, 3, 1, 1,
1, 1, 1, 1, 1, 0]),
7.0))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""Run grid search.
Look at launch_tuning.sh for details on how to tune at scale.
Usage example:
Tune with one worker on the local machine.
CONFIG="agent=c(algorithm='pg'),"
CONFIG+="env=c(task_cycle=['reverse-tune', 'remove-tune'])"
HPARAM_SPACE_TYPE="pg"
OUT_DIR="/tmp/bf_pg_tune"
MAX_NPE=5000000
NUM_REPETITIONS=50
rm -rf $OUT_DIR
mkdir $OUT_DIR
bazel run -c opt single_task:tune -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_repetitions="$NUM_REPETITIONS" \
--logdir="$OUT_DIR" \
--summary_interval=1 \
--model_v=0 \
--hparam_space="$HPARAM_SPACE_TYPE" \
--tuner_id=0 \
--num_tuners=1 \
2>&1 >"$OUT_DIR/tuner_0.log"
learning/brain/tensorboard/tensorboard.sh --port 12345 --logdir "$OUT_DIR"
"""
import ast
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from single_task import defaults # brain coder
from single_task import run as run_lib # brain coder
FLAGS = flags.FLAGS
flags.DEFINE_integer(
'tuner_id', 0,
'The unique ID for this tuning worker.')
flags.DEFINE_integer(
'num_tuners', 1,
    'How many tuners there are in total.')
flags.DEFINE_string(
'hparam_space', 'default',
'String name which denotes the hparam space to tune over. This is '
'algorithm dependent.')
flags.DEFINE_string(
'fixed_hparams', '',
'HParams string. Used to fix hparams during tuning.')
flags.DEFINE_float(
'success_rate_objective_weight', 1.0,
'How much to weight success rate vs num programs seen. By default, only '
'success rate is optimized (this is the setting used in the paper).')
def parse_hparams_string(hparams_str):
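  # Example: 'lr=0.1,entropy_beta=0.05' -> {'lr': 0.1, 'entropy_beta': 0.05}
  # (hypothetical names). Values are parsed with ast.literal_eval, so string
  # values need quotes.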
hparams = {}
for term in hparams_str.split(','):
if not term:
continue
name, value = term.split('=')
hparams[name.strip()] = ast.literal_eval(value)
return hparams
def int_to_multibase(n, bases):
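  # Mixed-radix decomposition of n, least-significant digit first; digit i
  # ranges over [0, bases[i]). E.g. int_to_multibase(7, [2, 3, 2]) == [1, 0, 1].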
digits = [0] * len(bases)
for i, b in enumerate(bases):
n, d = divmod(n, b)
digits[i] = d
return digits
def hparams_for_index(index, tuning_space):
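  # Every point in the hparam grid gets a unique flat index; the mixed-radix
  # digits select one value per hparam (keys are sorted for determinism).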
keys = sorted(tuning_space.keys())
indices = int_to_multibase(index, [len(tuning_space[k]) for k in keys])
return tf.contrib.training.HParams(
**{k: tuning_space[k][i] for k, i in zip(keys, indices)})
def run_tuner_loop(ns):
"""Run tuning loop for this worker."""
is_chief = FLAGS.task_id == 0
tuning_space = ns.define_tuner_hparam_space(
hparam_space_type=FLAGS.hparam_space)
fixed_hparams = parse_hparams_string(FLAGS.fixed_hparams)
for name, value in fixed_hparams.iteritems():
tuning_space[name] = [value]
tuning_space_size = np.prod([len(values) for values in tuning_space.values()])
  num_local_trials, remainder = divmod(tuning_space_size, FLAGS.num_tuners)
  # Compute this tuner's starting trial from the base count before adding the
  # remainder trial, so that trial ranges across tuners neither overlap nor
  # skip any index.
  starting_trial_id = (
      num_local_trials * FLAGS.tuner_id + min(remainder, FLAGS.tuner_id))
  if FLAGS.tuner_id < remainder:
    num_local_trials += 1
logging.info('tuning_space_size: %d', tuning_space_size)
logging.info('num_local_trials: %d', num_local_trials)
logging.info('starting_trial_id: %d', starting_trial_id)
for local_trial_index in xrange(num_local_trials):
trial_config = defaults.default_config_with_updates(FLAGS.config)
global_trial_index = local_trial_index + starting_trial_id
trial_name = 'trial_' + str(global_trial_index)
trial_dir = os.path.join(FLAGS.logdir, trial_name)
hparams = hparams_for_index(global_trial_index, tuning_space)
ns.write_hparams_to_config(
trial_config, hparams, hparam_space_type=FLAGS.hparam_space)
results_list = ns.run_training(
config=trial_config, tuner=None, logdir=trial_dir, is_chief=is_chief,
trial_name=trial_name)
if not is_chief:
# Only chief worker needs to write tuning results to disk.
continue
objective, metrics = compute_tuning_objective(
results_list, hparams, trial_name, num_trials=tuning_space_size)
logging.info('metrics:\n%s', metrics)
logging.info('objective: %s', objective)
logging.info('programs_seen_fraction: %s',
metrics['programs_seen_fraction'])
logging.info('success_rate: %s', metrics['success_rate'])
logging.info('success_rate_objective_weight: %s',
FLAGS.success_rate_objective_weight)
tuning_results_file = os.path.join(trial_dir, 'tuning_results.txt')
with tf.gfile.FastGFile(tuning_results_file, 'a') as writer:
writer.write(str(metrics) + '\n')
logging.info('Trial %s complete.', trial_name)
def compute_tuning_objective(results_list, hparams, trial_name, num_trials):
"""Compute tuning objective and metrics given results and trial information.
Args:
results_list: List of results dicts read from disk. These are written by
workers.
hparams: tf.contrib.training.HParams instance containing the hparams used
in this trial (only the hparams which are being tuned).
trial_name: Name of this trial. Used to create a trial directory.
num_trials: Total number of trials that need to be run. This is saved in the
metrics dict for future reference.
Returns:
objective: The objective computed for this trial. Choose the hparams for the
trial with the largest objective value.
metrics: Information about this trial. A dict.
"""
found_solution = [r['found_solution'] for r in results_list]
successful_program_counts = [
r['npe'] for r in results_list if r['found_solution']]
success_rate = sum(found_solution) / float(len(results_list))
max_programs = FLAGS.max_npe # Per run.
all_program_counts = [
r['npe'] if r['found_solution'] else max_programs
for r in results_list]
programs_seen_fraction = (
float(sum(all_program_counts))
/ (max_programs * len(all_program_counts)))
# min/max/avg stats are over successful runs.
metrics = {
'num_runs': len(results_list),
'num_succeeded': sum(found_solution),
'success_rate': success_rate,
'programs_seen_fraction': programs_seen_fraction,
'avg_programs': np.mean(successful_program_counts),
'max_possible_programs_per_run': max_programs,
'global_step': sum([r['num_batches'] for r in results_list]),
'hparams': hparams.values(),
'trial_name': trial_name,
'num_trials': num_trials}
  # Report stats per task.
tasks = [r['task'] for r in results_list]
for task in set(tasks):
task_list = [r for r in results_list if r['task'] == task]
found_solution = [r['found_solution'] for r in task_list]
successful_rewards = [
r['best_reward'] for r in task_list
if r['found_solution']]
successful_num_batches = [
r['num_batches']
for r in task_list if r['found_solution']]
successful_program_counts = [
r['npe'] for r in task_list if r['found_solution']]
metrics_append = {
task + '__num_runs': len(task_list),
task + '__num_succeeded': sum(found_solution),
task + '__success_rate': (
sum(found_solution) / float(len(task_list)))}
metrics.update(metrics_append)
if any(found_solution):
metrics_append = {
task + '__min_reward': min(successful_rewards),
task + '__max_reward': max(successful_rewards),
task + '__avg_reward': np.median(successful_rewards),
task + '__min_programs': min(successful_program_counts),
task + '__max_programs': max(successful_program_counts),
task + '__avg_programs': np.mean(successful_program_counts),
task + '__min_batches': min(successful_num_batches),
task + '__max_batches': max(successful_num_batches),
task + '__avg_batches': np.mean(successful_num_batches)}
metrics.update(metrics_append)
# Objective will be maximized.
# Maximize success rate, minimize num programs seen.
# Max objective is always 1.
weight = FLAGS.success_rate_objective_weight
objective = (
weight * success_rate
+ (1 - weight) * (1 - programs_seen_fraction))
metrics['objective'] = objective
return objective, metrics
def main(argv):
del argv
logging.set_verbosity(FLAGS.log_level)
if not FLAGS.logdir:
raise ValueError('logdir flag must be provided.')
if FLAGS.num_workers <= 0:
raise ValueError('num_workers flag must be greater than 0.')
if FLAGS.task_id < 0:
raise ValueError('task_id flag must be greater than or equal to 0.')
if FLAGS.task_id >= FLAGS.num_workers:
raise ValueError(
'task_id flag must be strictly less than num_workers flag.')
if FLAGS.num_tuners <= 0:
raise ValueError('num_tuners flag must be greater than 0.')
if FLAGS.tuner_id < 0:
raise ValueError('tuner_id flag must be greater than or equal to 0.')
if FLAGS.tuner_id >= FLAGS.num_tuners:
raise ValueError(
'tuner_id flag must be strictly less than num_tuners flag.')
ns, _ = run_lib.get_namespace(FLAGS.config)
run_tuner_loop(ns)
if __name__ == '__main__':
app.run(main)
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Cognitive Mapping and Planning for Visual Navigation
**Saurabh Gupta, James Davidson, Sergey Levine, Rahul Sukthankar, Jitendra Malik**
**Computer Vision and Pattern Recognition (CVPR) 2017.**
**[ArXiv](https://arxiv.org/abs/1702.03920),
[Project Website](https://sites.google.com/corp/view/cognitive-mapping-and-planning/)**
### Citing
If you find this code base and models useful in your research, please consider
citing the following paper:
```
@inproceedings{gupta2017cognitive,
title={Cognitive Mapping and Planning for Visual Navigation},
author={Gupta, Saurabh and Davidson, James and Levine, Sergey and
Sukthankar, Rahul and Malik, Jitendra},
booktitle={CVPR},
year={2017}
}
```
### Contents
1. [Requirements: software](#requirements-software)
2. [Requirements: data](#requirements-data)
3. [Test Pre-trained Models](#test-pre-trained-models)
4. [Train your Own Models](#train-your-own-models)
### Requirements: software
1. Python Virtual Env Setup: All code is implemented in Python but depends on a
   small number of Python packages and a couple of C libraries. We recommend
   using a virtual environment to install these Python packages and the Python
   bindings for these C libraries.
```Shell
VENV_DIR=venv
pip install virtualenv
virtualenv $VENV_DIR
source $VENV_DIR/bin/activate
# You may need to upgrade pip for installing opencv-python.
pip install --upgrade pip
# Install simple dependencies.
pip install -r requirements.txt
# Patch bugs in dependencies.
sh patches/apply_patches.sh
```
2. Install [Tensorflow](https://www.tensorflow.org/) inside this virtual
environment. You will need to use one of the latest nightly builds
(see instructions [here](https://github.com/tensorflow/tensorflow#installation)).
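   For reference, a minimal sketch of this step (the package and version pinned
   below are assumptions, not part of the original instructions, which call for
   a nightly build; this code base targets TF 1.x only):
   ```Shell
   # Run inside the virtual environment created above.
   pip install tensorflow-gpu==1.15.0   # or tensorflow==1.15.0 for CPU-only
   ```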
3. Swiftshader: We use
   [Swiftshader](https://github.com/google/swiftshader.git), a CPU-based
   renderer, to render the meshes. It is possible to use other renderers; to do
   so, replace `SwiftshaderRenderer` in `render/swiftshader_renderer.py` with
   bindings to your renderer.
```Shell
mkdir -p deps
git clone --recursive https://github.com/google/swiftshader.git deps/swiftshader-src
cd deps/swiftshader-src && git checkout 91da6b00584afd7dcaed66da88e2b617429b3950
git submodule update
mkdir build && cd build && cmake .. && make -j 16 libEGL libGLESv2
cd ../../../
cp deps/swiftshader-src/build/libEGL* libEGL.so.1
cp deps/swiftshader-src/build/libGLESv2* libGLESv2.so.2
```
4. PyAssimp: We use [PyAssimp](https://github.com/assimp/assimp.git) to load
   meshes. It is possible to use other libraries to load meshes; to do so,
   replace `Shape` in `render/swiftshader_renderer.py` with bindings to your
   library for loading meshes.
```Shell
mkdir -p deps
git clone https://github.com/assimp/assimp.git deps/assimp-src
cd deps/assimp-src
git checkout 2afeddd5cb63d14bc77b53740b38a54a97d94ee8
cmake CMakeLists.txt -G 'Unix Makefiles' && make -j 16
cd port/PyAssimp && python setup.py install
cd ../../../..
cp deps/assimp-src/lib/libassimp* .
```
5. graph-tool: We use the [graph-tool](https://git.skewed.de/count0/graph-tool)
   library for graph processing.
```Shell
mkdir -p deps
# If the following git clone command fails, you can also download the source
# from https://downloads.skewed.de/graph-tool/graph-tool-2.2.44.tar.bz2
git clone https://git.skewed.de/count0/graph-tool deps/graph-tool-src
cd deps/graph-tool-src && git checkout 178add3a571feb6666f4f119027705d95d2951ab
bash autogen.sh
./configure --disable-cairo --disable-sparsehash --prefix=$HOME/.local
make -j 16
make install
cd ../../
```
### Requirements: data
1. Download the Stanford 3D Indoor Spaces Dataset (S3DIS Dataset) and ImageNet
   Pre-trained models for initializing different models. Follow the
   instructions in `data/README.md`.
### Test Pre-trained Models
1. Download pre-trained models. See `output/README.md`.
2. Test models using `scripts/script_test_pretrained_models.sh`.
### Train Your Own Models
All models were trained asynchronously with 16 workers, each worker using data
from a single floor. The default hyper-parameters correspond to this setting.
See [distributed training with
Tensorflow](https://www.tensorflow.org/deploy/distributed) for setting up
distributed training. Training with a single worker is possible with the current
code base but will require some minor changes to allow each worker to load all
training environments.
### Contact
For questions or issues open an issue on the tensorflow/models [issues
tracker](https://github.com/tensorflow/models/issues). Please assign issues to
@s-gupta.
### Credits
This code was written by Saurabh Gupta (@s-gupta).
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os, sys
import numpy as np
from tensorflow.python.platform import app
from tensorflow.python.platform import flags
import logging
import src.utils as utils
import cfgs.config_common as cc
import tensorflow as tf
rgb_resnet_v2_50_path = 'data/init_models/resnet_v2_50/model.ckpt-5136169'
d_resnet_v2_50_path = 'data/init_models/distill_rgb_to_d_resnet_v2_50/model.ckpt-120002'
def get_default_args():
summary_args = utils.Foo(display_interval=1, test_iters=26,
arop_full_summary_iters=14)
control_args = utils.Foo(train=False, test=False,
force_batchnorm_is_training_at_test=False,
reset_rng_seed=False, only_eval_when_done=False,
test_mode=None)
return summary_args, control_args
def get_default_cmp_args():
batch_norm_param = {'center': True, 'scale': True,
'activation_fn':tf.nn.relu}
mapper_arch_args = utils.Foo(
dim_reduce_neurons=64,
fc_neurons=[1024, 1024],
fc_out_size=8,
fc_out_neurons=64,
encoder='resnet_v2_50',
deconv_neurons=[64, 32, 16, 8, 4, 2],
deconv_strides=[2, 2, 2, 2, 2, 2],
deconv_layers_per_block=2,
deconv_kernel_size=4,
fc_dropout=0.5,
combine_type='wt_avg_logits',
batch_norm_param=batch_norm_param)
readout_maps_arch_args = utils.Foo(
num_neurons=[],
strides=[],
kernel_size=None,
layers_per_block=None)
arch_args = utils.Foo(
vin_val_neurons=8, vin_action_neurons=8, vin_ks=3, vin_share_wts=False,
pred_neurons=[64, 64], pred_batch_norm_param=batch_norm_param,
conv_on_value_map=0, fr_neurons=16, fr_ver='v2', fr_inside_neurons=64,
fr_stride=1, crop_remove_each=30, value_crop_size=4,
action_sample_type='sample', action_sample_combine_type='one_or_other',
sample_gt_prob_type='inverse_sigmoid_decay', dagger_sample_bn_false=True,
vin_num_iters=36, isd_k=750., use_agent_loc=False, multi_scale=True,
readout_maps=False, rom_arch=readout_maps_arch_args)
return arch_args, mapper_arch_args
def get_arch_vars(arch_str):
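  # arch_str packs up to three '_'-separated fields, e.g. (hypothetical)
  # 'lmap_Msc_fr2': var1 picks the mapper type ('lmap' learned map vs 'pmap'
  # projected map), var2 an architecture-specific variant (planner scale for
  # 'lmap', the non-linearity for 'pmap'), and var3 the downsampling factor
  # used by the projected-map baseline.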
if arch_str == '': vals = []
else: vals = arch_str.split('_')
ks = ['var1', 'var2', 'var3']
ks = ks[:len(vals)]
# Exp Ver.
if len(vals) == 0: ks.append('var1'); vals.append('v0')
# custom arch.
if len(vals) == 1: ks.append('var2'); vals.append('')
  # map scale for projection baseline.
if len(vals) == 2: ks.append('var3'); vals.append('fr2')
assert(len(vals) == 3)
vars = utils.Foo()
for k, v in zip(ks, vals):
setattr(vars, k, v)
logging.error('arch_vars: %s', vars)
return vars
def process_arch_str(args, arch_str):
# This function modifies args.
args.arch, args.mapper_arch = get_default_cmp_args()
arch_vars = get_arch_vars(arch_str)
args.navtask.task_params.outputs.ego_maps = True
args.navtask.task_params.outputs.ego_goal_imgs = True
args.navtask.task_params.outputs.egomotion = True
args.navtask.task_params.toy_problem = False
if arch_vars.var1 == 'lmap':
args = process_arch_learned_map(args, arch_vars)
elif arch_vars.var1 == 'pmap':
args = process_arch_projected_map(args, arch_vars)
else:
logging.fatal('arch_vars.var1 should be lmap or pmap, but is %s', arch_vars.var1)
assert(False)
return args
def process_arch_learned_map(args, arch_vars):
# Multiscale vision based system.
args.navtask.task_params.input_type = 'vision'
args.navtask.task_params.outputs.images = True
if args.navtask.camera_param.modalities[0] == 'rgb':
args.solver.pretrained_path = rgb_resnet_v2_50_path
elif args.navtask.camera_param.modalities[0] == 'depth':
args.solver.pretrained_path = d_resnet_v2_50_path
if arch_vars.var2 == 'Ssc':
sc = 1./args.navtask.task_params.step_size
args.arch.vin_num_iters = 40
args.navtask.task_params.map_scales = [sc]
max_dist = args.navtask.task_params.max_dist * \
args.navtask.task_params.num_goals
args.navtask.task_params.map_crop_sizes = [2*max_dist]
args.arch.fr_stride = 1
args.arch.vin_action_neurons = 8
args.arch.vin_val_neurons = 3
args.arch.fr_inside_neurons = 32
args.mapper_arch.pad_map_with_zeros_each = [24]
args.mapper_arch.deconv_neurons = [64, 32, 16]
args.mapper_arch.deconv_strides = [1, 2, 1]
elif (arch_vars.var2 == 'Msc' or arch_vars.var2 == 'MscROMms' or
arch_vars.var2 == 'MscROMss' or arch_vars.var2 == 'MscNoVin'):
# Code for multi-scale planner.
args.arch.vin_num_iters = 8
args.arch.crop_remove_each = 4
args.arch.value_crop_size = 8
sc = 1./args.navtask.task_params.step_size
max_dist = args.navtask.task_params.max_dist * \
args.navtask.task_params.num_goals
n_scales = np.log2(float(max_dist) / float(args.arch.vin_num_iters))
n_scales = int(np.ceil(n_scales)+1)
args.navtask.task_params.map_scales = \
list(sc*(0.5**(np.arange(n_scales))[::-1]))
args.navtask.task_params.map_crop_sizes = [16 for x in range(n_scales)]
args.arch.fr_stride = 1
args.arch.vin_action_neurons = 8
args.arch.vin_val_neurons = 3
args.arch.fr_inside_neurons = 32
args.mapper_arch.pad_map_with_zeros_each = [0 for _ in range(n_scales)]
args.mapper_arch.deconv_neurons = [64*n_scales, 32*n_scales, 16*n_scales]
args.mapper_arch.deconv_strides = [1, 2, 1]
if arch_vars.var2 == 'MscNoVin':
# No planning version.
args.arch.fr_stride = [1, 2, 1, 2]
args.arch.vin_action_neurons = None
args.arch.vin_val_neurons = 16
args.arch.fr_inside_neurons = 32
args.arch.crop_remove_each = 0
args.arch.value_crop_size = 4
args.arch.vin_num_iters = 0
elif arch_vars.var2 == 'MscROMms' or arch_vars.var2 == 'MscROMss':
# Code with read outs, MscROMms flattens and reads out,
# MscROMss does not flatten and produces output at multiple scales.
args.navtask.task_params.outputs.readout_maps = True
args.navtask.task_params.map_resize_method = 'antialiasing'
args.arch.readout_maps = True
if arch_vars.var2 == 'MscROMms':
args.arch.rom_arch.num_neurons = [64, 1]
args.arch.rom_arch.kernel_size = 4
args.arch.rom_arch.strides = [2,2]
args.arch.rom_arch.layers_per_block = 2
args.navtask.task_params.readout_maps_crop_sizes = [64]
args.navtask.task_params.readout_maps_scales = [sc]
elif arch_vars.var2 == 'MscROMss':
args.arch.rom_arch.num_neurons = \
[64, len(args.navtask.task_params.map_scales)]
args.arch.rom_arch.kernel_size = 4
args.arch.rom_arch.strides = [1,1]
args.arch.rom_arch.layers_per_block = 1
args.navtask.task_params.readout_maps_crop_sizes = \
args.navtask.task_params.map_crop_sizes
args.navtask.task_params.readout_maps_scales = \
args.navtask.task_params.map_scales
else:
logging.fatal('arch_vars.var2 not one of Msc, MscROMms, MscROMss, MscNoVin.')
assert(False)
map_channels = args.mapper_arch.deconv_neurons[-1] / \
(2*len(args.navtask.task_params.map_scales))
args.navtask.task_params.map_channels = map_channels
return args
def process_arch_projected_map(args, arch_vars):
# Single scale vision based system which does not use a mapper but instead
# uses an analytically estimated map.
ds = int(arch_vars.var3[2])
args.navtask.task_params.input_type = 'analytical_counts'
args.navtask.task_params.outputs.analytical_counts = True
assert(args.navtask.task_params.modalities[0] == 'depth')
args.navtask.camera_param.img_channels = None
analytical_counts = utils.Foo(map_sizes=[512/ds],
xy_resolution=[5.*ds],
z_bins=[[-10, 10, 150, 200]],
non_linearity=[arch_vars.var2])
args.navtask.task_params.analytical_counts = analytical_counts
sc = 1./ds
args.arch.vin_num_iters = 36
args.navtask.task_params.map_scales = [sc]
args.navtask.task_params.map_crop_sizes = [512/ds]
args.arch.fr_stride = [1,2]
args.arch.vin_action_neurons = 8
args.arch.vin_val_neurons = 3
args.arch.fr_inside_neurons = 32
map_channels = len(analytical_counts.z_bins[0]) + 1
args.navtask.task_params.map_channels = map_channels
args.solver.freeze_conv = False
return args
def get_args_for_config(config_name):
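  # config_name has the form '<arch>.<solver>.<navtask>+<mode>_<imset>', e.g.
  # (hypothetical) 'lmap_Msc_fr2.clip2_dlw20.sbpd_rgb+train_train'.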
args = utils.Foo()
args.summary, args.control = get_default_args()
exp_name, mode_str = config_name.split('+')
arch_str, solver_str, navtask_str = exp_name.split('.')
logging.error('config_name: %s', config_name)
logging.error('arch_str: %s', arch_str)
logging.error('navtask_str: %s', navtask_str)
logging.error('solver_str: %s', solver_str)
logging.error('mode_str: %s', mode_str)
args.solver = cc.process_solver_str(solver_str)
args.navtask = cc.process_navtask_str(navtask_str)
args = process_arch_str(args, arch_str)
args.arch.isd_k = args.solver.isd_k
# Train, test, etc.
mode, imset = mode_str.split('_')
args = cc.adjust_args_for_mode(args, mode)
args.navtask.building_names = args.navtask.dataset.get_split(imset)
args.control.test_name = '{:s}_on_{:s}'.format(mode, imset)
# Log the arguments
logging.error('%s', args)
return args
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import numpy as np
import logging
import src.utils as utils
import datasets.nav_env_config as nec
from datasets import factory
def adjust_args_for_mode(args, mode):
if mode == 'train':
args.control.train = True
elif mode == 'val1':
# Same settings as for training, to make sure nothing wonky is happening
# there.
args.control.test = True
args.control.test_mode = 'val'
args.navtask.task_params.batch_size = 32
elif mode == 'val2':
# No data augmentation, not sampling but taking the argmax action, not
# sampling from the ground truth at all.
args.control.test = True
args.arch.action_sample_type = 'argmax'
args.arch.sample_gt_prob_type = 'zero'
args.navtask.task_params.data_augment = \
utils.Foo(lr_flip=0, delta_angle=0, delta_xy=0, relight=False,
relight_fast=False, structured=False)
args.control.test_mode = 'val'
args.navtask.task_params.batch_size = 32
elif mode == 'bench':
# Actually testing the agent in settings that are kept same between
# different runs.
args.navtask.task_params.batch_size = 16
args.control.test = True
args.arch.action_sample_type = 'argmax'
args.arch.sample_gt_prob_type = 'zero'
args.navtask.task_params.data_augment = \
utils.Foo(lr_flip=0, delta_angle=0, delta_xy=0, relight=False,
relight_fast=False, structured=False)
args.summary.test_iters = 250
args.control.only_eval_when_done = True
args.control.reset_rng_seed = True
args.control.test_mode = 'test'
else:
logging.fatal('Unknown mode: %s.', mode)
assert(False)
return args
def get_solver_vars(solver_str):
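  # solver_str packs up to eight '_'-separated fields (see `ks` below); any
  # omitted suffix falls back to the defaults appended here. E.g.
  # (hypothetical) 'clip2_dlw20_long2_adam2_rlw1_isdk415_aeps1en8_lr1en3'
  # decodes to clip_gradient_norm=2, data_loss_wt=20, 120k steps with decay
  # every 80k steps, Adam, reg_loss_wt=1, isd_k=415, adam_eps=1e-8 and
  # initial_learning_rate=1e-3 ('x' stands for '.' and 'n' for '-').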
if solver_str == '': vals = [];
else: vals = solver_str.split('_')
  ks = ['clip', 'dlw', 'long', 'typ', 'rlw', 'isdk', 'adam_eps', 'init_lr']
ks = ks[:len(vals)]
# Gradient clipping or not.
if len(vals) == 0: ks.append('clip'); vals.append('noclip');
# data loss weight.
if len(vals) == 1: ks.append('dlw'); vals.append('dlw20')
# how long to train for.
if len(vals) == 2: ks.append('long'); vals.append('nolong')
# Adam
if len(vals) == 3: ks.append('typ'); vals.append('adam2')
# reg loss wt
if len(vals) == 4: ks.append('rlw'); vals.append('rlw1')
# isd_k
if len(vals) == 5: ks.append('isdk'); vals.append('isdk415') # 415, inflexion at 2.5k.
# adam eps
if len(vals) == 6: ks.append('adam_eps'); vals.append('aeps1en8')
# init lr
if len(vals) == 7: ks.append('init_lr'); vals.append('lr1en3')
assert(len(vals) == 8)
vars = utils.Foo()
for k, v in zip(ks, vals):
setattr(vars, k, v)
logging.error('solver_vars: %s', vars)
return vars
def process_solver_str(solver_str):
solver = utils.Foo(
seed=0, learning_rate_decay=None, clip_gradient_norm=None, max_steps=None,
initial_learning_rate=None, momentum=None, steps_per_decay=None,
logdir=None, sync=False, adjust_lr_sync=True, wt_decay=0.0001,
data_loss_wt=None, reg_loss_wt=None, freeze_conv=True, num_workers=1,
task=0, ps_tasks=0, master='local', typ=None, momentum2=None,
adam_eps=None)
# Clobber with overrides from solver str.
solver_vars = get_solver_vars(solver_str)
solver.data_loss_wt = float(solver_vars.dlw[3:].replace('x', '.'))
solver.adam_eps = float(solver_vars.adam_eps[4:].replace('x', '.').replace('n', '-'))
solver.initial_learning_rate = float(solver_vars.init_lr[2:].replace('x', '.').replace('n', '-'))
solver.reg_loss_wt = float(solver_vars.rlw[3:].replace('x', '.'))
solver.isd_k = float(solver_vars.isdk[4:].replace('x', '.'))
long = solver_vars.long
if long == 'long':
solver.steps_per_decay = 40000
solver.max_steps = 120000
elif long == 'long2':
solver.steps_per_decay = 80000
solver.max_steps = 120000
elif long == 'nolong' or long == 'nol':
solver.steps_per_decay = 20000
solver.max_steps = 60000
else:
logging.fatal('solver_vars.long should be long, long2, nolong or nol.')
assert(False)
clip = solver_vars.clip
if clip == 'noclip' or clip == 'nocl':
solver.clip_gradient_norm = 0
elif clip[:4] == 'clip':
solver.clip_gradient_norm = float(clip[4:].replace('x', '.'))
else:
logging.fatal('Unknown solver_vars.clip: %s', clip)
assert(False)
typ = solver_vars.typ
if typ == 'adam':
solver.typ = 'adam'
solver.momentum = 0.9
solver.momentum2 = 0.999
solver.learning_rate_decay = 1.0
elif typ == 'adam2':
solver.typ = 'adam'
solver.momentum = 0.9
solver.momentum2 = 0.999
solver.learning_rate_decay = 0.1
elif typ == 'sgd':
solver.typ = 'sgd'
solver.momentum = 0.99
solver.momentum2 = None
solver.learning_rate_decay = 0.1
else:
logging.fatal('Unknown solver_vars.typ: %s', typ)
assert(False)
logging.error('solver: %s', solver)
return solver
def get_navtask_vars(navtask_str):
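  # navtask_str packs up to ten '_'-separated fields in the order of ks_all;
  # any omitted suffix falls back to the defaults appended here. E.g.
  # (hypothetical) 'sbpd_rgb_r2r_h0_64_80_8_8_nv2_straug' selects the sbpd
  # dataset, RGB input, the room-to-room task, no history frames, max_dist=64,
  # num_steps=80, step_size=8, n_ori=8, 2 auxiliary views on each side, and
  # structured data augmentation.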
if navtask_str == '': vals = []
else: vals = navtask_str.split('_')
ks_all = ['dataset_name', 'modality', 'task', 'history', 'max_dist',
'num_steps', 'step_size', 'n_ori', 'aux_views', 'data_aug']
ks = ks_all[:len(vals)]
# All data or not.
if len(vals) == 0: ks.append('dataset_name'); vals.append('sbpd')
# modality
if len(vals) == 1: ks.append('modality'); vals.append('rgb')
# semantic task?
if len(vals) == 2: ks.append('task'); vals.append('r2r')
# number of history frames.
if len(vals) == 3: ks.append('history'); vals.append('h0')
# max steps
if len(vals) == 4: ks.append('max_dist'); vals.append('32')
# num steps
if len(vals) == 5: ks.append('num_steps'); vals.append('40')
# step size
if len(vals) == 6: ks.append('step_size'); vals.append('8')
# n_ori
if len(vals) == 7: ks.append('n_ori'); vals.append('4')
# Auxiliary views.
if len(vals) == 8: ks.append('aux_views'); vals.append('nv0')
  # Normal data augmentation as opposed to structured data augmentation (if
  # set to 'straug').
if len(vals) == 9: ks.append('data_aug'); vals.append('straug')
assert(len(vals) == 10)
for i in range(len(ks)):
assert(ks[i] == ks_all[i])
vars = utils.Foo()
for k, v in zip(ks, vals):
setattr(vars, k, v)
logging.error('navtask_vars: %s', vals)
return vars
def process_navtask_str(navtask_str):
navtask = nec.nav_env_base_config()
# Clobber with overrides from strings.
navtask_vars = get_navtask_vars(navtask_str)
navtask.task_params.n_ori = int(navtask_vars.n_ori)
navtask.task_params.max_dist = int(navtask_vars.max_dist)
navtask.task_params.num_steps = int(navtask_vars.num_steps)
navtask.task_params.step_size = int(navtask_vars.step_size)
navtask.task_params.data_augment.delta_xy = int(navtask_vars.step_size)/2.
n_aux_views_each = int(navtask_vars.aux_views[2])
aux_delta_thetas = np.concatenate((np.arange(n_aux_views_each) + 1,
-1 -np.arange(n_aux_views_each)))
aux_delta_thetas = aux_delta_thetas*np.deg2rad(navtask.camera_param.fov)
navtask.task_params.aux_delta_thetas = aux_delta_thetas
if navtask_vars.data_aug == 'aug':
navtask.task_params.data_augment.structured = False
elif navtask_vars.data_aug == 'straug':
navtask.task_params.data_augment.structured = True
else:
logging.fatal('Unknown navtask_vars.data_aug %s.', navtask_vars.data_aug)
assert(False)
navtask.task_params.num_history_frames = int(navtask_vars.history[1:])
navtask.task_params.n_views = 1+navtask.task_params.num_history_frames
navtask.task_params.goal_channels = int(navtask_vars.n_ori)
if navtask_vars.task == 'hard':
navtask.task_params.type = 'rng_rejection_sampling_many'
navtask.task_params.rejection_sampling_M = 2000
navtask.task_params.min_dist = 10
elif navtask_vars.task == 'r2r':
navtask.task_params.type = 'room_to_room_many'
elif navtask_vars.task == 'ST':
# Semantic task at hand.
navtask.task_params.goal_channels = \
len(navtask.task_params.semantic_task.class_map_names)
navtask.task_params.rel_goal_loc_dim = \
len(navtask.task_params.semantic_task.class_map_names)
navtask.task_params.type = 'to_nearest_obj_acc'
else:
    logging.fatal('navtask_vars.task should be hard, r2r, or ST.')
assert(False)
if navtask_vars.modality == 'rgb':
navtask.camera_param.modalities = ['rgb']
navtask.camera_param.img_channels = 3
elif navtask_vars.modality == 'd':
navtask.camera_param.modalities = ['depth']
navtask.camera_param.img_channels = 2
navtask.task_params.img_height = navtask.camera_param.height
navtask.task_params.img_width = navtask.camera_param.width
navtask.task_params.modalities = navtask.camera_param.modalities
navtask.task_params.img_channels = navtask.camera_param.img_channels
navtask.task_params.img_fov = navtask.camera_param.fov
navtask.dataset = factory.get_dataset(navtask_vars.dataset_name)
return navtask
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import pprint
import copy
import os
from tensorflow.python.platform import app
from tensorflow.python.platform import flags
import logging
import src.utils as utils
import cfgs.config_common as cc
import tensorflow as tf
rgb_resnet_v2_50_path = 'cache/resnet_v2_50_inception_preprocessed/model.ckpt-5136169'
def get_default_args():
robot = utils.Foo(radius=15, base=10, height=140, sensor_height=120,
camera_elevation_degree=-15)
camera_param = utils.Foo(width=225, height=225, z_near=0.05, z_far=20.0,
fov=60., modalities=['rgb', 'depth'])
env = utils.Foo(padding=10, resolution=5, num_point_threshold=2,
valid_min=-10, valid_max=200, n_samples_per_face=200)
data_augment = utils.Foo(lr_flip=0, delta_angle=1, delta_xy=4, relight=False,
relight_fast=False, structured=False)
task_params = utils.Foo(num_actions=4, step_size=4, num_steps=0,
batch_size=32, room_seed=0, base_class='Building',
task='mapping', n_ori=6, data_augment=data_augment,
output_transform_to_global_map=False,
output_canonical_map=False,
output_incremental_transform=False,
output_free_space=False, move_type='shortest_path',
toy_problem=0)
buildinger_args = utils.Foo(building_names=['area1_gates_wingA_floor1_westpart'],
env_class=None, robot=robot,
task_params=task_params, env=env,
camera_param=camera_param)
solver_args = utils.Foo(seed=0, learning_rate_decay=0.1,
clip_gradient_norm=0, max_steps=120000,
initial_learning_rate=0.001, momentum=0.99,
steps_per_decay=40000, logdir=None, sync=False,
adjust_lr_sync=True, wt_decay=0.0001,
data_loss_wt=1.0, reg_loss_wt=1.0,
num_workers=1, task=0, ps_tasks=0, master='local')
summary_args = utils.Foo(display_interval=1, test_iters=100)
control_args = utils.Foo(train=False, test=False,
force_batchnorm_is_training_at_test=False)
arch_args = utils.Foo(rgb_encoder='resnet_v2_50', d_encoder='resnet_v2_50')
return utils.Foo(solver=solver_args,
summary=summary_args, control=control_args, arch=arch_args,
buildinger=buildinger_args)
def get_vars(config_name):
vars = config_name.split('_')
if len(vars) == 1: # All data or not.
vars.append('noall')
if len(vars) == 2: # n_ori
vars.append('4')
logging.error('vars: %s', vars)
return vars
def get_args_for_config(config_name):
args = get_default_args()
config_name, mode = config_name.split('+')
vars = get_vars(config_name)
logging.info('config_name: %s, mode: %s', config_name, mode)
args.buildinger.task_params.n_ori = int(vars[2])
args.solver.freeze_conv = True
args.solver.pretrained_path = rgb_resnet_v2_50_path
args.buildinger.task_params.img_channels = 5
args.solver.data_loss_wt = 0.00001
  if vars[0] == 'v0':
    pass
else:
logging.error('config_name: %s undefined', config_name)
args.buildinger.task_params.height = args.buildinger.camera_param.height
args.buildinger.task_params.width = args.buildinger.camera_param.width
args.buildinger.task_params.modalities = args.buildinger.camera_param.modalities
if vars[1] == 'all':
args = cc.get_args_for_mode_building_all(args, mode)
elif vars[1] == 'noall':
args = cc.get_args_for_mode_building(args, mode)
# Log the arguments
logging.error('%s', args)
return args