Commit f5fc733a authored by Byzantine

Removing research/community models

parent 09bc9f54
#!/bin/bash
# Launches tuning jobs.
# Modify this file to launch workers with your preferred cloud API.
# The following implementation runs each worker as a subprocess on the local
# machine.
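#
# Illustrative example (not part of the original script): assuming this file is
# saved as launch_tuning.sh and tune.par has been built, a small local tuning
# study could be launched with hypothetical flag values like:
#   ./launch_tuning.sh \
#     --job_name=pg_reverse_tune \
#     --config='env=c(task="reverse"),agent=c(algorithm="pg")' \
#     --num_tuners=2 \
#     --num_workers_per_tuner=1 \
#     --hparam_space_type="pg"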
MODELS_DIR="/tmp/models"
# Get command line options.
OPTS=$(getopt -n "$0" -o "" --long "job_name:,config:,num_tuners:,num_workers_per_tuner:,num_ps_per_tuner:,max_npe:,num_repetitions:,stop_on_success:,fixed_hparams:,hparam_space_type:" -- "$@")
if [ $? != 0 ] ; then echo "Failed parsing options." >&2 ; exit 1 ; fi
eval set -- "$OPTS"
JOB_NAME="" # Name of the process and the logs directory.
CONFIG="" # Model and environment hparams.
# NUM_TUNERS: Number of tuning jobs to launch. Each tuning job trains one
# hparam combination at a time, so more tuners means more hparam combinations
# tried in parallel.
NUM_TUNERS=1
# NUM_WORKERS_PER_TUNER: Number of workers to launch for each tuning job. If
# using neural networks, each worker will be 1 replica.
NUM_WORKERS_PER_TUNER=1
# NUM_PS_PER_TUNER: Number of parameter servers to launch for this tuning job.
# Only set this if using neural networks. For 1 worker per tuner, no parameter
# servers are needed. For more than 1 worker per tuner, at least 1 parameter
# server per tuner is needed to store the global model for each tuner.
NUM_PS_PER_TUNER=0
# MAX_NPE: Maximum number of programs executed. Training will quit once this
# threshold is reached. If 0, the threshold is infinite.
MAX_NPE=0
NUM_REPETITIONS=25 # How many times to run this experiment.
STOP_ON_SUCCESS=true # Whether to halt training when a solution is found.
# FIXED_HPARAMS: Hold hparams fixed in the grid search. This reduces the search
# space.
FIXED_HPARAMS=""
# HPARAM_SPACE_TYPE: Specifies the hparam search space. See
# `define_tuner_hparam_space` functions defined in pg_train.py and ga_train.py.
HPARAM_SPACE_TYPE="pg"
# Parse options into variables.
while true; do
case "$1" in
--job_name ) JOB_NAME="$2"; shift; shift ;;
--config ) CONFIG="$2"; shift; shift ;;
--num_tuners ) NUM_TUNERS="$2"; shift; shift ;;
--num_workers_per_tuner ) NUM_WORKERS_PER_TUNER="$2"; shift; shift ;;
--num_ps_per_tuner ) NUM_PS_PER_TUNER="$2"; shift; shift ;;
--max_npe ) MAX_NPE="$2"; shift; shift ;;
--num_repetitions ) NUM_REPETITIONS="$2"; shift; shift ;;
--stop_on_success ) STOP_ON_SUCCESS="$2"; shift; shift ;;
--fixed_hparams ) FIXED_HPARAMS="$2"; shift; shift ;;
--hparam_space_type ) HPARAM_SPACE_TYPE="$2"; shift; shift ;;
-- ) shift; break ;;
* ) break ;;
esac
done
# Launch jobs.
# TODO: multi-worker RL training
LOGDIR="$MODELS_DIR/$JOB_NAME"
mkdir -p "$LOGDIR"
BIN_DIR="bazel-bin/single_task"
for ((tuner=0;tuner<NUM_TUNERS;tuner+=1)); do
for ((i=0;i<NUM_WORKERS_PER_TUNER;i++)); do
# Expecting tune.par to be built.
echo "$LOGDIR"
$BIN_DIR/tune.par \
--alsologtostderr \
--config="$CONFIG" \
--logdir="$LOGDIR" \
--max_npe="$MAX_NPE" \
--num_repetitions="$NUM_REPETITIONS" \
--stop_on_success="$STOP_ON_SUCCESS" \
--summary_tasks=1 \
--hparam_space="$HPARAM_SPACE_TYPE" \
--fixed_hparams="$FIXED_HPARAMS" \
--tuner_id=$tuner \
--num_tuners=$NUM_TUNERS \
2> "$LOGDIR/tuner_$tuner.task_$i.log" & # Run as subprocess
echo "Launched tuner $tuner, task $i. Logs: $LOGDIR/tuner_$tuner.task_$i.log"
done
done
# Use "pidof tune.par" to find jobs.
# Kill with "pkill tune.par"
"""Utilities specific to this project."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
from six import string_types
#####################
# BF-lang utilities #
#####################
BF_EOS_INT = 0 # Also used as SOS (start of sequence).
BF_EOS_CHAR = TEXT_EOS_CHAR = '_'
BF_LANG_INTS = range(1, 9)
BF_INT_TO_CHAR = [BF_EOS_CHAR, '>', '<', '+', '-', '[', ']', '.', ',']
BF_CHAR_TO_INT = dict([(c, i) for i, c in enumerate(BF_INT_TO_CHAR)])
RewardInfo = namedtuple('RewardInfo', ['episode_rewards', 'input_case',
'correct_output',
'code_output', 'reason', 'input_type',
'output_type'])
class IOType(object):
string = 'string'
integer = 'integer'
boolean = 'boolean'
class IOTuple(tuple):
pass
def flatten(lst):
return [item for row in lst for item in row]
def bf_num_tokens():
# BF tokens plus EOS.
return len(BF_INT_TO_CHAR)
def bf_char2int(bf_char):
"""Convert BF code char to int token."""
return BF_CHAR_TO_INT[bf_char]
def bf_int2char(bf_int):
"""Convert BF int token to code char."""
return BF_INT_TO_CHAR[bf_int]
def bf_tokens_to_string(bf_tokens, truncate=True):
"""Convert token list to code string. Will truncate at EOS token.
Args:
bf_tokens: Python list of ints representing the code string.
truncate: If true, the output string will end at the first EOS token.
If false, the entire token list is converted to string.
Returns:
String representation of the tokens.
Raises:
ValueError: If bf_tokens is not a python list.
"""
if not isinstance(bf_tokens, list):
raise ValueError('Only python list supported here.')
if truncate:
try:
eos_index = bf_tokens.index(BF_EOS_INT)
except ValueError:
eos_index = len(bf_tokens)
else:
eos_index = len(bf_tokens)
return ''.join([BF_INT_TO_CHAR[t] for t in bf_tokens[:eos_index]])
def bf_string_to_tokens(bf_string):
"""Convert string to token list. Will strip and append EOS token."""
tokens = [BF_CHAR_TO_INT[char] for char in bf_string.strip()]
tokens.append(BF_EOS_INT)
return tokens
def tokens_to_text(tokens):
"""Convert token list to human readable text."""
return ''.join(
[TEXT_EOS_CHAR if t == 0 else chr(t - 1 + ord('A')) for t in tokens])
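# Illustrative round trips (a minimal sketch, not part of the original module;
# values follow directly from the BF_INT_TO_CHAR mapping above):
#   >>> bf_string_to_tokens('+-.')
#   [3, 4, 7, 0]
#   >>> bf_tokens_to_string([3, 4, 7, 0])
#   '+-.'
#   >>> tokens_to_text([1, 2, 0])
#   'AB_'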
###################################
# Number representation utilities #
###################################
# https://en.wikipedia.org/wiki/Metric_prefix
si_magnitudes = {
'k': 1e3,
'm': 1e6,
'g': 1e9}
def si_to_int(s):
"""Convert string ending with SI magnitude to int.
Examples: 5K ==> 5000, 12M ==> 12000000.
Args:
s: String in the form 'xx..xP' where x is a digit and P is an SI prefix.
Returns:
Integer equivalent to the string.
"""
if isinstance(s, string_types) and s[-1].lower() in si_magnitudes.keys():
return int(int(s[:-1]) * si_magnitudes[s[-1].lower()])
return int(s)
def int_to_si(n):
"""Convert integer to string with SI magnitude.
`n` will be truncated.
Examples: 5432 ==> 5K, 12345678 ==> 12M
Args:
n: Integer to represent as a string.
Returns:
String representation of `n` containing SI magnitude.
"""
m = abs(n)
sign = -1 if n < 0 else 1
if m < 1e3:
return str(n)
if m < 1e6:
return '{0}K'.format(sign*int(m / 1e3))
if m < 1e9:
return '{0}M'.format(sign*int(m / 1e6))
if m < 1e12:
return '{0}G'.format(sign*int(m / 1e9))
return str(n)
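# Illustrative conversions (a minimal sketch, not part of the original module):
#   >>> si_to_int('5K')
#   5000
#   >>> si_to_int('12m')
#   12000000
#   >>> int_to_si(5432)
#   '5K'
#   >>> int_to_si(-12345678)
#   '-12M'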
"""Language model agent.
Agent outputs code in a sequence just like a language model. Can be trained
as a language model or using RL, or a combination of the two.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
from math import exp
from math import log
import time
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from common import rollout as rollout_lib # brain coder
from common import utils # brain coder
from single_task import misc # brain coder
# Experiments in the ICLR 2018 paper used reduce_sum instead of reduce_mean for
# some losses. We make all losses batch_size independent, and multiply the
# changed losses by 64, which was the fixed batch_size when the experiments
# were run. The loss hyperparameters still match what is reported in the paper.
MAGIC_LOSS_MULTIPLIER = 64
def rshift_time(tensor_2d, fill=misc.BF_EOS_INT):
"""Right shifts a 2D tensor along the time dimension (axis-1)."""
dim_0 = tf.shape(tensor_2d)[0]
fill_tensor = tf.fill([dim_0, 1], fill)
return tf.concat([fill_tensor, tensor_2d[:, :-1]], axis=1)
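# Illustrative behavior (assumed example, would need to be evaluated in a
# session): with fill=0, an input [[1, 2, 3], [4, 5, 6]] becomes
# [[0, 1, 2], [0, 4, 5]]; the last element of each row is dropped and `fill`
# is prepended along the time axis.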
def join(a, b):
# Concat a and b along 0-th dim.
if a is None or len(a) == 0: # pylint: disable=g-explicit-length-test
return b
if b is None or len(b) == 0: # pylint: disable=g-explicit-length-test
return a
return np.concatenate((a, b))
def make_optimizer(kind, lr):
if kind == 'sgd':
return tf.train.GradientDescentOptimizer(lr)
elif kind == 'adam':
return tf.train.AdamOptimizer(lr)
elif kind == 'rmsprop':
return tf.train.RMSPropOptimizer(learning_rate=lr, decay=0.99)
else:
raise ValueError('Optimizer type "%s" not recognized.' % kind)
class LinearWrapper(tf.contrib.rnn.RNNCell):
"""RNNCell wrapper that adds a linear layer to the output."""
def __init__(self, cell, output_size, dtype=tf.float32, suppress_index=None):
self.cell = cell
self._output_size = output_size
self._dtype = dtype
self._suppress_index = suppress_index
self.smallest_float = -2.4e38
def __call__(self, inputs, state, scope=None):
with tf.variable_scope(type(self).__name__):
outputs, state = self.cell(inputs, state, scope=scope)
logits = tf.matmul(
outputs,
tf.get_variable('w_output',
[self.cell.output_size, self.output_size],
dtype=self._dtype))
if self._suppress_index is not None:
# Replace the target index with -inf, so that it never gets selected.
batch_size = tf.shape(logits)[0]
logits = tf.concat(
[logits[:, :self._suppress_index],
tf.fill([batch_size, 1], self.smallest_float),
logits[:, self._suppress_index + 1:]],
axis=1)
return logits, state
@property
def output_size(self):
return self._output_size
@property
def state_size(self):
return self.cell.state_size
def zero_state(self, batch_size, dtype):
return self.cell.zero_state(batch_size, dtype)
UpdateStepResult = namedtuple(
'UpdateStepResult',
['global_step', 'global_npe', 'summaries_list', 'gradients_dict'])
class AttrDict(dict):
"""Dict with attributes as keys.
https://stackoverflow.com/a/14620633
"""
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
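# Illustrative usage of AttrDict (a minimal sketch, not part of the original):
#   >>> d = AttrDict(x=1)
#   >>> d.x, d['x']
#   (1, 1)
#   >>> d.y = 2
#   >>> d['y']
#   2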
class LMAgent(object):
"""Language model agent."""
action_space = misc.bf_num_tokens()
observation_space = misc.bf_num_tokens()
def __init__(self, global_config, task_id=0,
logging_file=None,
experience_replay_file=None,
global_best_reward_fn=None,
found_solution_op=None,
assign_code_solution_fn=None,
program_count=None,
do_iw_summaries=False,
stop_on_success=True,
dtype=tf.float32,
verbose_level=0,
is_local=True):
self.config = config = global_config.agent
self.logging_file = logging_file
self.experience_replay_file = experience_replay_file
self.task_id = task_id
self.verbose_level = verbose_level
self.global_best_reward_fn = global_best_reward_fn
self.found_solution_op = found_solution_op
self.assign_code_solution_fn = assign_code_solution_fn
self.parent_scope_name = tf.get_variable_scope().name
self.dtype = dtype
self.allow_eos_token = config.eos_token
self.stop_on_success = stop_on_success
self.pi_loss_hparam = config.pi_loss_hparam
self.vf_loss_hparam = config.vf_loss_hparam
self.is_local = is_local
self.top_reward = 0.0
self.embeddings_trainable = True
self.no_op = tf.no_op()
self.learning_rate = tf.constant(
config.lr, dtype=dtype, name='learning_rate')
self.initializer = tf.contrib.layers.variance_scaling_initializer(
factor=config.param_init_factor,
mode='FAN_AVG',
uniform=True,
dtype=dtype) # TF's default initializer.
tf.get_variable_scope().set_initializer(self.initializer)
self.a2c = config.ema_baseline_decay == 0
if not self.a2c:
logging.info('Using exponential moving average REINFORCE baselines.')
self.ema_baseline_decay = config.ema_baseline_decay
self.ema_by_len = [0.0] * global_config.timestep_limit
else:
logging.info('Using advantage (a2c) with learned value function.')
self.ema_baseline_decay = 0.0
self.ema_by_len = None
# Top-k
if config.topk and config.topk_loss_hparam:
self.topk_loss_hparam = config.topk_loss_hparam
self.topk_batch_size = config.topk_batch_size
if self.topk_batch_size <= 0:
raise ValueError('topk_batch_size must be a positive integer. Got %s'
% self.topk_batch_size)
self.top_episodes = utils.MaxUniquePriorityQueue(config.topk)
logging.info('Made max-priority-queue with capacity %d',
self.top_episodes.capacity)
else:
self.top_episodes = None
self.topk_loss_hparam = 0.0
logging.info('No max-priority-queue')
# Experience replay.
self.replay_temperature = config.replay_temperature
self.num_replay_per_batch = int(global_config.batch_size * config.alpha)
self.num_on_policy_per_batch = (
global_config.batch_size - self.num_replay_per_batch)
self.replay_alpha = (
self.num_replay_per_batch / float(global_config.batch_size))
logging.info('num_replay_per_batch: %d', self.num_replay_per_batch)
logging.info('num_on_policy_per_batch: %d', self.num_on_policy_per_batch)
logging.info('replay_alpha: %s', self.replay_alpha)
if self.num_replay_per_batch > 0:
# Train with off-policy episodes from replay buffer.
start_time = time.time()
self.experience_replay = utils.RouletteWheel(
unique_mode=True, save_file=experience_replay_file)
logging.info('Took %s sec to load replay buffer from disk.',
int(time.time() - start_time))
logging.info('Replay buffer file location: "%s"',
self.experience_replay.save_file)
else:
# Only train on-policy.
self.experience_replay = None
if program_count is not None:
self.program_count = program_count
self.program_count_add_ph = tf.placeholder(
tf.int64, [], 'program_count_add_ph')
self.program_count_add_op = self.program_count.assign_add(
self.program_count_add_ph)
################################
# RL policy and value networks #
################################
batch_size = global_config.batch_size
logging.info('batch_size: %d', batch_size)
self.policy_cell = LinearWrapper(
tf.contrib.rnn.MultiRNNCell(
[tf.contrib.rnn.BasicLSTMCell(cell_size)
for cell_size in config.policy_lstm_sizes]),
self.action_space,
dtype=dtype,
suppress_index=None if self.allow_eos_token else misc.BF_EOS_INT)
self.value_cell = LinearWrapper(
tf.contrib.rnn.MultiRNNCell(
[tf.contrib.rnn.BasicLSTMCell(cell_size)
for cell_size in config.value_lstm_sizes]),
1,
dtype=dtype)
obs_embedding_scope = 'obs_embed'
with tf.variable_scope(
obs_embedding_scope,
initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)):
obs_embeddings = tf.get_variable(
'embeddings',
[self.observation_space, config.obs_embedding_size],
dtype=dtype, trainable=self.embeddings_trainable)
self.obs_embeddings = obs_embeddings
initial_state = tf.fill([batch_size], misc.BF_EOS_INT)
def loop_fn(loop_time, cell_output, cell_state, loop_state):
"""Function called by tf.nn.raw_rnn to instantiate body of the while_loop.
See https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn for more
information.
When time is 0, and cell_output, cell_state, loop_state are all None,
`loop_fn` will create the initial input, internal cell state, and loop
state. When time > 0, `loop_fn` will operate on previous cell output,
state, and loop state.
Args:
loop_time: A scalar tensor holding the current timestep (zero based
counting).
cell_output: Output of the raw_rnn cell at the current timestep.
cell_state: Cell internal state at the current timestep.
loop_state: Additional loop state. These tensors were returned by the
previous call to `loop_fn`.
Returns:
elements_finished: Bool tensor of shape [batch_size] which marks each
sequence in the batch as being finished or not finished.
next_input: A tensor containing input to be fed into the cell at the
next timestep.
next_cell_state: Cell internal state to be fed into the cell at the
next timestep.
emit_output: Tensor to be added to the TensorArray returned by raw_rnn
as output from the while_loop.
next_loop_state: Additional loop state. These tensors will be fed back
into the next call to `loop_fn` as `loop_state`.
"""
if cell_output is None: # 0th time step.
next_cell_state = self.policy_cell.zero_state(batch_size, dtype)
elements_finished = tf.zeros([batch_size], tf.bool)
output_lengths = tf.ones([batch_size], dtype=tf.int32)
next_input = tf.gather(obs_embeddings, initial_state)
emit_output = None
next_loop_state = (
tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True),
output_lengths,
elements_finished
)
else:
scaled_logits = cell_output * config.softmax_tr # Scale temperature.
prev_chosen, prev_output_lengths, prev_elements_finished = loop_state
next_cell_state = cell_state
chosen_outputs = tf.to_int32(tf.where(
tf.logical_not(prev_elements_finished),
tf.multinomial(logits=scaled_logits, num_samples=1)[:, 0],
tf.zeros([batch_size], dtype=tf.int64)))
elements_finished = tf.logical_or(
tf.equal(chosen_outputs, misc.BF_EOS_INT),
loop_time >= global_config.timestep_limit)
output_lengths = tf.where(
elements_finished,
prev_output_lengths,
# length includes EOS token. empty seq has len 1.
tf.tile(tf.expand_dims(loop_time + 1, 0), [batch_size])
)
next_input = tf.gather(obs_embeddings, chosen_outputs)
emit_output = scaled_logits
next_loop_state = (prev_chosen.write(loop_time - 1, chosen_outputs),
output_lengths,
tf.logical_or(prev_elements_finished,
elements_finished))
return (elements_finished, next_input, next_cell_state, emit_output,
next_loop_state)
with tf.variable_scope('policy'):
(decoder_outputs_ta,
_, # decoder_state
(sampled_output_ta, output_lengths, _)) = tf.nn.raw_rnn(
cell=self.policy_cell,
loop_fn=loop_fn)
policy_logits = tf.transpose(decoder_outputs_ta.stack(), (1, 0, 2),
name='policy_logits')
sampled_tokens = tf.transpose(sampled_output_ta.stack(), (1, 0),
name='sampled_tokens')
# Add SOS to beginning of the sequence.
rshift_sampled_tokens = rshift_time(sampled_tokens, fill=misc.BF_EOS_INT)
# Initial state is 0, 2nd state is first token.
# Note: If value of last state is computed, this will be used as bootstrap.
if self.a2c:
with tf.variable_scope('value'):
value_output, _ = tf.nn.dynamic_rnn(
self.value_cell,
tf.gather(obs_embeddings, rshift_sampled_tokens),
sequence_length=output_lengths,
dtype=dtype)
value = tf.squeeze(value_output, axis=[2])
else:
value = tf.zeros([], dtype=dtype)
# `sampled_batch` holds the tensors used for sampling actions from the agent;
# `given_batch` (below) holds the tensors used for doing gradient updates on
# the agent.
self.sampled_batch = AttrDict(
logits=policy_logits,
value=value,
tokens=sampled_tokens,
episode_lengths=output_lengths,
probs=tf.nn.softmax(policy_logits),
log_probs=tf.nn.log_softmax(policy_logits))
# adjusted_lengths can be less than the full length of each episode.
# Use this to train on only part of an episode (starting from t=0).
self.adjusted_lengths = tf.placeholder(
tf.int32, [None], name='adjusted_lengths')
self.policy_multipliers = tf.placeholder(
dtype,
[None, None],
name='policy_multipliers')
# Empirical value, i.e. discounted sum of observed future rewards from each
# time step in the episode.
self.empirical_values = tf.placeholder(
dtype,
[None, None],
name='empirical_values')
# Off-policy training. Just add supervised loss to the RL loss.
self.off_policy_targets = tf.placeholder(
tf.int32,
[None, None],
name='off_policy_targets')
self.off_policy_target_lengths = tf.placeholder(
tf.int32, [None], name='off_policy_target_lengths')
self.actions = tf.placeholder(tf.int32, [None, None], name='actions')
# Add SOS to beginning of the sequence.
inputs = rshift_time(self.actions, fill=misc.BF_EOS_INT)
with tf.variable_scope('policy', reuse=True):
logits, _ = tf.nn.dynamic_rnn(
self.policy_cell, tf.gather(obs_embeddings, inputs),
sequence_length=self.adjusted_lengths,
dtype=dtype)
if self.a2c:
with tf.variable_scope('value', reuse=True):
value_output, _ = tf.nn.dynamic_rnn(
self.value_cell,
tf.gather(obs_embeddings, inputs),
sequence_length=self.adjusted_lengths,
dtype=dtype)
value2 = tf.squeeze(value_output, axis=[2])
else:
value2 = tf.zeros([], dtype=dtype)
self.given_batch = AttrDict(
logits=logits,
value=value2,
tokens=sampled_tokens,
episode_lengths=self.adjusted_lengths,
probs=tf.nn.softmax(logits),
log_probs=tf.nn.log_softmax(logits))
# Episode masks.
max_episode_length = tf.shape(self.actions)[1]
# range_row shape: [1, max_episode_length]
range_row = tf.expand_dims(tf.range(max_episode_length), 0)
episode_masks = tf.cast(
tf.less(range_row, tf.expand_dims(self.given_batch.episode_lengths, 1)),
dtype=dtype)
episode_masks_3d = tf.expand_dims(episode_masks, 2)
# Length adjusted episodes.
self.a_probs = a_probs = self.given_batch.probs * episode_masks_3d
self.a_log_probs = a_log_probs = (
self.given_batch.log_probs * episode_masks_3d)
self.a_value = a_value = self.given_batch.value * episode_masks
self.a_policy_multipliers = a_policy_multipliers = (
self.policy_multipliers * episode_masks)
if self.a2c:
self.a_empirical_values = a_empirical_values = (
self.empirical_values * episode_masks)
# pi_loss is scalar
acs_onehot = tf.one_hot(self.actions, self.action_space, dtype=dtype)
self.acs_onehot = acs_onehot
chosen_masked_log_probs = acs_onehot * a_log_probs
pi_target = tf.expand_dims(a_policy_multipliers, -1)
pi_loss_per_step = chosen_masked_log_probs * pi_target # Maximize.
self.pi_loss = pi_loss = (
-tf.reduce_mean(tf.reduce_sum(pi_loss_per_step, axis=[1, 2]), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Minimize.
assert len(self.pi_loss.shape) == 0 # pylint: disable=g-explicit-length-test
# shape: [batch_size, time]
self.chosen_log_probs = tf.reduce_sum(chosen_masked_log_probs, axis=2)
self.chosen_probs = tf.reduce_sum(acs_onehot * a_probs, axis=2)
# loss of value function
if self.a2c:
vf_loss_per_step = tf.square(a_value - a_empirical_values)
self.vf_loss = vf_loss = (
tf.reduce_mean(tf.reduce_sum(vf_loss_per_step, axis=1), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Minimize.
assert len(self.vf_loss.shape) == 0 # pylint: disable=g-explicit-length-test
else:
self.vf_loss = vf_loss = 0.0
# Maximize entropy regularizer
self.entropy = entropy = (
-tf.reduce_mean(
tf.reduce_sum(a_probs * a_log_probs, axis=[1, 2]), axis=0)
* MAGIC_LOSS_MULTIPLIER) # Maximize
self.negentropy = -entropy # Minimize negentropy.
assert len(self.negentropy.shape) == 0 # pylint: disable=g-explicit-length-test
# off-policy loss
self.offp_switch = tf.placeholder(dtype, [], name='offp_switch')
if self.top_episodes is not None:
# Add SOS to beginning of the sequence.
offp_inputs = tf.gather(obs_embeddings,
rshift_time(self.off_policy_targets,
fill=misc.BF_EOS_INT))
with tf.variable_scope('policy', reuse=True):
offp_logits, _ = tf.nn.dynamic_rnn(
self.policy_cell, offp_inputs, self.off_policy_target_lengths,
dtype=dtype) # shape: [batch_size, time, action_space]
topk_loss_per_step = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=self.off_policy_targets,
logits=offp_logits,
name='topk_loss_per_logit')
# Take mean over batch dimension so that the loss multiplier strength is
# independent of batch size. Sum over time dimension.
topk_loss = tf.reduce_mean(
tf.reduce_sum(topk_loss_per_step, axis=1), axis=0)
assert len(topk_loss.shape) == 0 # pylint: disable=g-explicit-length-test
self.topk_loss = topk_loss * self.offp_switch
logging.info('Including off policy loss.')
else:
self.topk_loss = topk_loss = 0.0
self.entropy_hparam = tf.constant(
config.entropy_beta, dtype=dtype, name='entropy_beta')
self.pi_loss_term = pi_loss * self.pi_loss_hparam
self.vf_loss_term = vf_loss * self.vf_loss_hparam
self.entropy_loss_term = self.negentropy * self.entropy_hparam
self.topk_loss_term = self.topk_loss_hparam * topk_loss
self.loss = (
self.pi_loss_term
+ self.vf_loss_term
+ self.entropy_loss_term
+ self.topk_loss_term)
params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
self.trainable_variables = params
self.sync_variables = self.trainable_variables
non_embedding_params = [p for p in params
if obs_embedding_scope not in p.name]
self.non_embedding_params = non_embedding_params
self.params = params
if config.regularizer:
logging.info('Adding L2 regularizer with scale %.2f.',
config.regularizer)
self.regularizer = config.regularizer * sum(
tf.nn.l2_loss(w) for w in non_embedding_params)
self.loss += self.regularizer
else:
logging.info('Skipping regularizer.')
self.regularizer = 0.0
# Only build gradients graph for local model.
if self.is_local:
unclipped_grads = tf.gradients(self.loss, params)
self.dense_unclipped_grads = [
tf.convert_to_tensor(g) for g in unclipped_grads]
self.grads, self.global_grad_norm = tf.clip_by_global_norm(
unclipped_grads, config.grad_clip_threshold)
self.gradients_dict = dict(zip(params, self.grads))
self.optimizer = make_optimizer(config.optimizer, self.learning_rate)
self.all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
tf.get_variable_scope().name)
self.do_iw_summaries = do_iw_summaries
if self.do_iw_summaries:
b = None
self.log_iw_replay_ph = tf.placeholder(tf.float32, [b],
'log_iw_replay_ph')
self.log_iw_policy_ph = tf.placeholder(tf.float32, [b],
'log_iw_policy_ph')
self.log_prob_replay_ph = tf.placeholder(tf.float32, [b],
'log_prob_replay_ph')
self.log_prob_policy_ph = tf.placeholder(tf.float32, [b],
'log_prob_policy_ph')
self.log_norm_replay_weights_ph = tf.placeholder(
tf.float32, [b], 'log_norm_replay_weights_ph')
self.iw_summary_op = tf.summary.merge([
tf.summary.histogram('is/log_iw_replay', self.log_iw_replay_ph),
tf.summary.histogram('is/log_iw_policy', self.log_iw_policy_ph),
tf.summary.histogram('is/log_prob_replay', self.log_prob_replay_ph),
tf.summary.histogram('is/log_prob_policy', self.log_prob_policy_ph),
tf.summary.histogram(
'is/log_norm_replay_weights', self.log_norm_replay_weights_ph),
])
def make_summary_ops(self):
"""Construct summary ops for the model."""
# size = number of timesteps across entire batch. Number normalized by size
# will not be affected by the amount of padding at the ends of sequences
# in the batch.
size = tf.cast(
tf.reduce_sum(self.given_batch.episode_lengths), dtype=self.dtype)
offp_size = tf.cast(tf.reduce_sum(self.off_policy_target_lengths),
dtype=self.dtype)
scope_prefix = self.parent_scope_name
def _remove_prefix(prefix, name):
assert name.startswith(prefix)
return name[len(prefix):]
# RL summaries.
self.rl_summary_op = tf.summary.merge(
[tf.summary.scalar('model/policy_loss', self.pi_loss / size),
tf.summary.scalar('model/value_loss', self.vf_loss / size),
tf.summary.scalar('model/topk_loss', self.topk_loss / offp_size),
tf.summary.scalar('model/entropy', self.entropy / size),
tf.summary.scalar('model/loss', self.loss / size),
tf.summary.scalar('model/grad_norm',
tf.global_norm(self.grads)),
tf.summary.scalar('model/unclipped_grad_norm', self.global_grad_norm),
tf.summary.scalar('model/non_embedding_var_norm',
tf.global_norm(self.non_embedding_params)),
tf.summary.scalar('hparams/entropy_beta', self.entropy_hparam),
tf.summary.scalar('hparams/topk_loss_hparam', self.topk_loss_hparam),
tf.summary.scalar('hparams/learning_rate', self.learning_rate),
tf.summary.scalar('model/trainable_var_norm',
tf.global_norm(self.trainable_variables)),
tf.summary.scalar('loss/loss', self.loss),
tf.summary.scalar('loss/entropy', self.entropy_loss_term),
tf.summary.scalar('loss/vf', self.vf_loss_term),
tf.summary.scalar('loss/policy', self.pi_loss_term),
tf.summary.scalar('loss/offp', self.topk_loss_term)] +
[tf.summary.scalar(
'param_norms/' + _remove_prefix(scope_prefix + '/', p.name),
tf.norm(p))
for p in self.params] +
[tf.summary.scalar(
'grad_norms/' + _remove_prefix(scope_prefix + '/', p.name),
tf.norm(g))
for p, g in zip(self.params, self.grads)] +
[tf.summary.scalar(
'unclipped_grad_norms/' + _remove_prefix(scope_prefix + '/',
p.name),
tf.norm(g))
for p, g in zip(self.params, self.dense_unclipped_grads)])
self.text_summary_placeholder = tf.placeholder(tf.string, shape=[])
self.rl_text_summary_op = tf.summary.text('rl',
self.text_summary_placeholder)
def _rl_text_summary(self, session, step, npe, tot_r, num_steps,
input_case, code_output, code, reason):
"""Logs summary about a single episode and creates a text_summary for TB.
Args:
session: tf.Session instance.
step: Global training step.
npe: Number of programs executed so far.
tot_r: Total reward.
num_steps: Number of timesteps in the episode (i.e. code length).
input_case: Inputs for test cases.
code_output: Outputs produced by running the code on the inputs.
code: String representation of the code.
reason: Reason for the reward assigned by the task.
Returns:
Serialized text summary data for tensorboard.
"""
if not input_case:
input_case = ' '
if not code_output:
code_output = ' '
if not code:
code = ' '
text = (
'Tot R: **%.2f**; Len: **%d**; Reason: **%s**\n\n'
'Input: **`%s`**; Output: **`%s`**\n\nCode: **`%s`**'
% (tot_r, num_steps, reason, input_case, code_output, code))
text_summary = session.run(self.rl_text_summary_op,
{self.text_summary_placeholder: text})
logging.info(
'Step %d.\t NPE: %d\t Reason: %s.\t Tot R: %.2f.\t Length: %d. '
'\tInput: %s \tOutput: %s \tProgram: %s',
step, npe, reason, tot_r, num_steps, input_case,
code_output, code)
return text_summary
def _rl_reward_summary(self, total_rewards):
"""Create summary ops that report on episode rewards.
Creates summaries for average, median, max, and min rewards in the batch.
Args:
total_rewards: Tensor of shape [batch_size] containing the total reward
from each episode in the batch.
Returns:
tf.Summary op.
"""
tr = np.asarray(total_rewards)
reward_summary = tf.Summary(value=[
tf.Summary.Value(
tag='reward/avg',
simple_value=np.mean(tr)),
tf.Summary.Value(
tag='reward/med',
simple_value=np.median(tr)),
tf.Summary.Value(
tag='reward/max',
simple_value=np.max(tr)),
tf.Summary.Value(
tag='reward/min',
simple_value=np.min(tr))])
return reward_summary
def _iw_summary(self, session, replay_iw, replay_log_probs,
norm_replay_weights, on_policy_iw,
on_policy_log_probs):
"""Compute summaries for importance weights at a given batch.
Args:
session: tf.Session instance.
replay_iw: Importance weights for episodes from replay buffer.
replay_log_probs: Total log probabilities of the replay episodes under the
current policy.
norm_replay_weights: Normalized replay weights, i.e. values in `replay_iw`
divided by the total weight in the entire replay buffer. Note, this is
also the probability of selecting each episode from the replay buffer
(in a roulette wheel replay buffer).
on_policy_iw: Importance weights for episodes sampled from the current
policy.
on_policy_log_probs: Total log probabilities of the on-policy episodes
under the current policy.
Returns:
Serialized TF summaries. Use a summary writer to write these summaries to
disk.
"""
return session.run(
self.iw_summary_op,
{self.log_iw_replay_ph: np.log(replay_iw),
self.log_iw_policy_ph: np.log(on_policy_iw),
self.log_norm_replay_weights_ph: np.log(norm_replay_weights),
self.log_prob_replay_ph: replay_log_probs,
self.log_prob_policy_ph: on_policy_log_probs})
def _compute_iw(self, policy_log_probs, replay_weights):
"""Compute importance weights for a batch of episodes.
Arguments are iterables of length batch_size.
Args:
policy_log_probs: Log probability of each episode under the current
policy.
replay_weights: Weight of each episode in the replay buffer. 0 for
episodes not sampled from the replay buffer (i.e. sampled from the
policy).
Returns:
Numpy array of shape [batch_size] containing the importance weight for
each episode in the batch.
"""
log_total_replay_weight = log(self.experience_replay.total_weight)
# importance weight
# = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
# = 1 / ((1-a) + a*q/p)
a = float(self.replay_alpha)
a_com = 1.0 - a  # complement of a
importance_weights = np.asarray(
[1.0 / (a_com
+ a * exp((log(replay_weight) - log_total_replay_weight)
- log_p))
if replay_weight > 0 else 1.0 / a_com
for log_p, replay_weight
in zip(policy_log_probs, replay_weights)])
return importance_weights
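# Worked example for _compute_iw (illustrative numbers, not from the original):
# with replay_alpha a = 0.5, an episode whose replay weight is 10% of the total
# buffer weight (q = 0.1) and whose probability under the current policy is
# p = 0.2 gets importance weight 1 / (0.5 + 0.5 * (0.1 / 0.2)) = 4/3, while an
# on-policy episode (replay_weight = 0) gets the constant weight
# 1 / (1 - a) = 2.0.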
def update_step(self, session, rl_batch, train_op, global_step_op,
return_gradients=False):
"""Perform gradient update on the model.
Args:
session: tf.Session instance.
rl_batch: RLBatch instance from data.py. Use DataManager to create a
RLBatch for each call to update_step. RLBatch contains a batch of
tasks.
train_op: A TF op which will perform the gradient update. LMAgent does not
own its training op, so that trainers can do distributed training
and construct a specialized training op.
global_step_op: A TF op which will return the current global step when
run (should not increment it).
return_gradients: If True, the gradients will be saved and returned from
this method call. This is useful for testing.
Returns:
Results from the update step in a UpdateStepResult namedtuple, including
global step, global NPE, serialized summaries, and optionally gradients.
"""
assert self.is_local
# Do update for REINFORCE or REINFORCE + replay buffer.
if self.experience_replay is None:
# Train with on-policy REINFORCE.
# Sample new programs from the policy.
num_programs_from_policy = rl_batch.batch_size
(batch_actions,
batch_values,
episode_lengths) = session.run(
[self.sampled_batch.tokens, self.sampled_batch.value,
self.sampled_batch.episode_lengths])
if episode_lengths.size == 0:
# This should not happen.
logging.warn(
'Shapes:\n'
'batch_actions.shape: %s\n'
'batch_values.shape: %s\n'
'episode_lengths.shape: %s\n',
batch_actions.shape, batch_values.shape, episode_lengths.shape)
# Compute rewards.
code_scores = compute_rewards(
rl_batch, batch_actions, episode_lengths)
code_strings = code_scores.code_strings
batch_tot_r = code_scores.total_rewards
test_cases = code_scores.test_cases
code_outputs = code_scores.code_outputs
reasons = code_scores.reasons
# Process on-policy samples.
batch_targets, batch_returns = process_episodes(
code_scores.batch_rewards, episode_lengths, a2c=self.a2c,
baselines=self.ema_by_len,
batch_values=batch_values)
batch_policy_multipliers = batch_targets
batch_emp_values = batch_returns if self.a2c else [[]]
adjusted_lengths = episode_lengths
if self.top_episodes:
assert len(self.top_episodes) > 0 # pylint: disable=g-explicit-length-test
off_policy_targets = [
item for item, _
in self.top_episodes.random_sample(self.topk_batch_size)]
off_policy_target_lengths = [len(t) for t in off_policy_targets]
off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
dtype=np.int32)
offp_switch = 1
else:
off_policy_targets = [[0]]
off_policy_target_lengths = [1]
offp_switch = 0
fetches = {
'global_step': global_step_op,
'program_count': self.program_count,
'summaries': self.rl_summary_op,
'train_op': train_op,
'gradients': self.gradients_dict if return_gradients else self.no_op}
fetched = session.run(
fetches,
{self.actions: batch_actions,
self.empirical_values: batch_emp_values,
self.policy_multipliers: batch_policy_multipliers,
self.adjusted_lengths: adjusted_lengths,
self.off_policy_targets: off_policy_targets,
self.off_policy_target_lengths: off_policy_target_lengths,
self.offp_switch: offp_switch})
combined_adjusted_lengths = adjusted_lengths
combined_returns = batch_returns
else:
# Train with REINFORCE + off-policy replay buffer by using importance
# sampling.
# Sample new programs from the policy.
# Note: batch size is constant. A full batch will be sampled, but not all
# programs will be executed and added to the replay buffer. Those which
# are not executed will be discarded and not counted.
batch_actions, batch_values, episode_lengths, log_probs = session.run(
[self.sampled_batch.tokens, self.sampled_batch.value,
self.sampled_batch.episode_lengths, self.sampled_batch.log_probs])
if episode_lengths.size == 0:
# This should not happen.
logging.warn(
'Shapes:\n'
'batch_actions.shape: %s\n'
'batch_values.shape: %s\n'
'episode_lengths.shape: %s\n',
batch_actions.shape, batch_values.shape, episode_lengths.shape)
# Sample from the experience replay buffer.
empty_replay_buffer = (
self.experience_replay.is_empty()
if self.experience_replay is not None else True)
num_programs_from_replay_buff = (
self.num_replay_per_batch if not empty_replay_buffer else 0)
num_programs_from_policy = (
rl_batch.batch_size - num_programs_from_replay_buff)
if (not empty_replay_buffer) and num_programs_from_replay_buff:
result = self.experience_replay.sample_many(
num_programs_from_replay_buff)
experience_samples, replay_weights = zip(*result)
(replay_actions,
replay_rewards,
_, # log probs
replay_adjusted_lengths) = zip(*experience_samples)
replay_batch_actions = utils.stack_pad(replay_actions, pad_axes=0,
dtype=np.int32)
# compute log probs for replay samples under current policy
all_replay_log_probs, = session.run(
[self.given_batch.log_probs],
{self.actions: replay_batch_actions,
self.adjusted_lengths: replay_adjusted_lengths})
replay_log_probs = [
np.choose(replay_actions[i], all_replay_log_probs[i, :l].T).sum()
for i, l in enumerate(replay_adjusted_lengths)]
else:
# Replay buffer is empty. Do not sample from it.
replay_actions = None
replay_policy_multipliers = None
replay_adjusted_lengths = None
replay_log_probs = None
replay_weights = None
replay_returns = None
on_policy_weights = [0] * num_programs_from_replay_buff
assert not self.a2c # TODO(danabo): Support A2C with importance sampling.
# Compute rewards.
code_scores = compute_rewards(
rl_batch, batch_actions, episode_lengths,
batch_size=num_programs_from_policy)
code_strings = code_scores.code_strings
batch_tot_r = code_scores.total_rewards
test_cases = code_scores.test_cases
code_outputs = code_scores.code_outputs
reasons = code_scores.reasons
# Process on-policy samples.
p = num_programs_from_policy
batch_targets, batch_returns = process_episodes(
code_scores.batch_rewards, episode_lengths[:p], a2c=False,
baselines=self.ema_by_len)
batch_policy_multipliers = batch_targets
batch_emp_values = [[]]
on_policy_returns = batch_returns
# Process off-policy samples.
if (not empty_replay_buffer) and num_programs_from_replay_buff:
offp_batch_rewards = [
[0.0] * (l - 1) + [r]
for l, r in zip(replay_adjusted_lengths, replay_rewards)]
assert len(offp_batch_rewards) == num_programs_from_replay_buff
assert len(replay_adjusted_lengths) == num_programs_from_replay_buff
replay_batch_targets, replay_returns = process_episodes(
offp_batch_rewards, replay_adjusted_lengths, a2c=False,
baselines=self.ema_by_len)
# Convert 2D array back into ragged 2D list.
replay_policy_multipliers = [
replay_batch_targets[i, :l]
for i, l
in enumerate(
replay_adjusted_lengths[:num_programs_from_replay_buff])]
adjusted_lengths = episode_lengths[:num_programs_from_policy]
if self.top_episodes:
assert len(self.top_episodes) > 0 # pylint: disable=g-explicit-length-test
off_policy_targets = [
item for item, _
in self.top_episodes.random_sample(self.topk_batch_size)]
off_policy_target_lengths = [len(t) for t in off_policy_targets]
off_policy_targets = utils.stack_pad(off_policy_targets, pad_axes=0,
dtype=np.int32)
offp_switch = 1
else:
off_policy_targets = [[0]]
off_policy_target_lengths = [1]
offp_switch = 0
# On-policy episodes.
if num_programs_from_policy:
separate_actions = [
batch_actions[i, :l]
for i, l in enumerate(adjusted_lengths)]
chosen_log_probs = [
np.choose(separate_actions[i], log_probs[i, :l].T)
for i, l in enumerate(adjusted_lengths)]
new_experiences = [
(separate_actions[i],
batch_tot_r[i],
chosen_log_probs[i].sum(), l)
for i, l in enumerate(adjusted_lengths)]
on_policy_policy_multipliers = [
batch_policy_multipliers[i, :l]
for i, l in enumerate(adjusted_lengths)]
(on_policy_actions,
_, # rewards
on_policy_log_probs,
on_policy_adjusted_lengths) = zip(*new_experiences)
else:
new_experiences = []
on_policy_policy_multipliers = []
on_policy_actions = []
on_policy_log_probs = []
on_policy_adjusted_lengths = []
if (not empty_replay_buffer) and num_programs_from_replay_buff:
# Look for new experiences in replay buffer. Assign weight if an episode
# is in the buffer.
on_policy_weights = [0] * num_programs_from_policy
for i, cs in enumerate(code_strings):
if self.experience_replay.has_key(cs):
on_policy_weights[i] = self.experience_replay.get_weight(cs)
# Randomly select on-policy or off policy episodes to train on.
combined_actions = join(replay_actions, on_policy_actions)
combined_policy_multipliers = join(
replay_policy_multipliers, on_policy_policy_multipliers)
combined_adjusted_lengths = join(
replay_adjusted_lengths, on_policy_adjusted_lengths)
combined_returns = join(replay_returns, on_policy_returns)
combined_actions = utils.stack_pad(combined_actions, pad_axes=0)
combined_policy_multipliers = utils.stack_pad(combined_policy_multipliers,
pad_axes=0)
# P: total log probability of each episode under the current policy.
combined_on_policy_log_probs = join(replay_log_probs, on_policy_log_probs)
# Q: weight of each episode under the replay buffer (off-policy) distribution.
# Assume weight is zero for all sequences sampled from the policy.
combined_q_weights = join(replay_weights, on_policy_weights)
# Importance adjustment. Naive formulation:
# E_{x~p}[f(x)] ~= 1/N sum_{x~p}(f(x)) ~= 1/N sum_{x~q}(f(x) * p(x)/q(x)).
# p(x) is the policy, and q(x) is the off-policy distribution, i.e. replay
# buffer distribution. Importance weight w(x) = p(x) / q(x).
# Instead of sampling from the replay buffer only, we sample from a
# mixture distribution of the policy and replay buffer.
# We are sampling from the mixture a*q(x) + (1-a)*p(x), where 0 <= a <= 1.
# Thus the importance weight w(x) = p(x) / (a*q(x) + (1-a)*p(x))
# = 1 / ((1-a) + a*q(x)/p(x)) where q(x) is 0 for x sampled from the
# policy.
# Note: a = self.replay_alpha
if empty_replay_buffer:
# The replay buffer is empty.
# Do no gradient update this step. The replay buffer will have stuff in
# it next time.
combined_policy_multipliers *= 0
elif not num_programs_from_replay_buff:
combined_policy_multipliers = np.ones([len(combined_actions), 1],
dtype=np.float32)
else:
# If a < 1 compute importance weights
# importance weight
# = 1 / [(1 - a) + a * exp(log(replay_weight / total_weight / p))]
# = 1 / ((1-a) + a*q/p)
importance_weights = self._compute_iw(combined_on_policy_log_probs,
combined_q_weights)
if self.config.iw_normalize:
importance_weights *= (
float(rl_batch.batch_size) / importance_weights.sum())
combined_policy_multipliers *= importance_weights.reshape(-1, 1)
# Train on replay batch, top-k MLE.
assert self.program_count is not None
fetches = {
'global_step': global_step_op,
'program_count': self.program_count,
'summaries': self.rl_summary_op,
'train_op': train_op,
'gradients': self.gradients_dict if return_gradients else self.no_op}
fetched = session.run(
fetches,
{self.actions: combined_actions,
self.empirical_values: [[]], # replay_emp_values,
self.policy_multipliers: combined_policy_multipliers,
self.adjusted_lengths: combined_adjusted_lengths,
self.off_policy_targets: off_policy_targets,
self.off_policy_target_lengths: off_policy_target_lengths,
self.offp_switch: offp_switch})
# Add to experience replay buffer.
self.experience_replay.add_many(
objs=new_experiences,
weights=[exp(r / self.replay_temperature) for r in batch_tot_r],
keys=code_strings)
# Update program count.
session.run(
[self.program_count_add_op],
{self.program_count_add_ph: num_programs_from_policy})
# Update EMA baselines on the mini-batch we just trained on.
if not self.a2c:
for i in xrange(rl_batch.batch_size):
episode_length = combined_adjusted_lengths[i]
empirical_returns = combined_returns[i, :episode_length]
for j in xrange(episode_length):
# Update ema_baselines in place.
self.ema_by_len[j] = (
self.ema_baseline_decay * self.ema_by_len[j]
+ (1 - self.ema_baseline_decay) * empirical_returns[j])
global_step = fetched['global_step']
global_npe = fetched['program_count']
core_summaries = fetched['summaries']
summaries_list = [core_summaries]
if num_programs_from_policy:
s_i = 0
text_summary = self._rl_text_summary(
session,
global_step,
global_npe,
batch_tot_r[s_i],
episode_lengths[s_i], test_cases[s_i],
code_outputs[s_i], code_strings[s_i], reasons[s_i])
reward_summary = self._rl_reward_summary(batch_tot_r)
is_best = False
if self.global_best_reward_fn:
# Save best reward.
best_reward = np.max(batch_tot_r)
is_best = self.global_best_reward_fn(session, best_reward)
if self.found_solution_op is not None and 'correct' in reasons:
session.run(self.found_solution_op)
# Save program to disk for record keeping.
if self.stop_on_success:
solutions = [
{'code': code_strings[i], 'reward': batch_tot_r[i],
'npe': global_npe}
for i in xrange(len(reasons)) if reasons[i] == 'correct']
elif is_best:
solutions = [
{'code': code_strings[np.argmax(batch_tot_r)],
'reward': np.max(batch_tot_r),
'npe': global_npe}]
else:
solutions = []
if solutions:
if self.assign_code_solution_fn:
self.assign_code_solution_fn(session, solutions[0]['code'])
with tf.gfile.FastGFile(self.logging_file, 'a') as writer:
for solution_dict in solutions:
writer.write(str(solution_dict) + '\n')
max_i = np.argmax(batch_tot_r)
max_tot_r = batch_tot_r[max_i]
if max_tot_r >= self.top_reward:
self.top_reward = max_tot_r
logging.info('Top code: r=%.2f, \t%s', max_tot_r, code_strings[max_i])
if self.top_episodes is not None:
self.top_episodes.push(
max_tot_r, tuple(batch_actions[max_i, :episode_lengths[max_i]]))
summaries_list += [text_summary, reward_summary]
if self.do_iw_summaries and not empty_replay_buffer:
# prob of replay samples under replay buffer sampling.
norm_replay_weights = [
w / self.experience_replay.total_weight
for w in replay_weights]
replay_iw = self._compute_iw(replay_log_probs, replay_weights)
on_policy_iw = self._compute_iw(on_policy_log_probs, on_policy_weights)
summaries_list.append(
self._iw_summary(
session, replay_iw, replay_log_probs, norm_replay_weights,
on_policy_iw, on_policy_log_probs))
return UpdateStepResult(
global_step=global_step,
global_npe=global_npe,
summaries_list=summaries_list,
gradients_dict=fetched['gradients'])
def io_to_text(io_case, io_type):
if isinstance(io_case, misc.IOTuple):
# If there are many strings, join them with ','.
return ','.join([io_to_text(e, io_type) for e in io_case])
if io_type == misc.IOType.string:
# There is one string. Return it.
return misc.tokens_to_text(io_case)
if (io_type == misc.IOType.integer
or io_type == misc.IOType.boolean):
if len(io_case) == 1:
return str(io_case[0])
return str(io_case)
CodeScoreInfo = namedtuple(
'CodeScoreInfo',
['code_strings', 'batch_rewards', 'total_rewards', 'test_cases',
'code_outputs', 'reasons'])
def compute_rewards(rl_batch, batch_actions, episode_lengths, batch_size=None):
"""Compute rewards for each episode in the batch.
Args:
rl_batch: A data.RLBatch instance. This holds information about the task
each episode is solving, and a reward function for each episode.
batch_actions: Contains batch of episodes. Each sequence of actions will be
converted into a BF program and then scored. A numpy array of shape
[batch_size, max_sequence_length].
episode_lengths: The sequence length of each episode in the batch. Iterable
of length batch_size.
batch_size: (optional) number of programs to score. Use this to limit the
number of programs executed from this batch. For example, when doing
importance sampling some of the on-policy episodes will be discarded
and they should not be executed. `batch_size` can be less than or equal
to the size of the input batch.
Returns:
CodeScoreInfo namedtuple instance. This holds not just the computed rewards,
but additional information computed during code execution which can be used
for debugging and monitoring. This includes: BF code strings, test cases
the code was executed on, code outputs from those test cases, and reasons
for success or failure.
"""
code_strings = [
''.join([misc.bf_int2char(a) for a in action_sequence[:l]])
for action_sequence, l in zip(batch_actions, episode_lengths)]
if batch_size is None:
batch_size = len(code_strings)
else:
assert batch_size <= len(code_strings)
code_strings = code_strings[:batch_size]
if isinstance(rl_batch.reward_fns, (list, tuple)):
# reward_fns is a list of functions, same length as code_strings.
assert len(rl_batch.reward_fns) >= batch_size
r_fn_results = [
rl_batch.reward_fns[i](code_strings[i]) for i in xrange(batch_size)]
else:
# reward_fns is allowed to be one function which processes a batch of code
# strings. This is useful for efficiency and batch level computation.
r_fn_results = rl_batch.reward_fns(code_strings)
# Expecting that r_fn returns a list of rewards. Length of list equals
# length of the code string (including EOS char).
batch_rewards = [r.episode_rewards for r in r_fn_results]
total_rewards = [sum(b) for b in batch_rewards]
test_cases = [io_to_text(r.input_case, r.input_type) for r in r_fn_results]
code_outputs = [io_to_text(r.code_output, r.output_type)
for r in r_fn_results]
reasons = [r.reason for r in r_fn_results]
return CodeScoreInfo(
code_strings=code_strings,
batch_rewards=batch_rewards,
total_rewards=total_rewards,
test_cases=test_cases,
code_outputs=code_outputs,
reasons=reasons)
def process_episodes(
batch_rewards, episode_lengths, a2c=False, baselines=None,
batch_values=None):
"""Compute REINFORCE targets.
REINFORCE here takes the form:
grad_t = grad[log(pi(a_t|c_t))*target_t]
where c_t is context: i.e. RNN state or environment state (or both).
Two types of targets are supported:
1) Advantage actor critic (a2c).
2) Vanilla REINFORCE with baseline.
Args:
batch_rewards: Rewards received in each episode in the batch. A numpy array
of shape [batch_size, max_sequence_length]. Note, these are per-timestep
rewards, not total reward.
episode_lengths: Length of each episode. An iterable of length batch_size.
a2c: A bool. Whether to compute a2c targets (True) or vanilla targets
(False).
baselines: If a2c is False, provide baselines for each timestep. This is a
list (or indexable container) of length max_time. Note: baselines are
shared across all episodes, which is why there is no batch dimension.
It is up to the caller to update baselines accordingly.
batch_values: If a2c is True, provide values computed by a value estimator.
A numpy array of shape [batch_size, max_sequence_length].
Returns:
batch_targets: REINFORCE targets for each episode and timestep. A numpy
array of shape [batch_size, max_sequence_length].
batch_returns: Returns computed for each episode and timestep. This is for
reference, and is not used in the REINFORCE gradient update (but was
used to compute the targets). A numpy array of shape
[batch_size, max_sequence_length].
"""
num_programs = len(batch_rewards)
assert num_programs <= len(episode_lengths)
batch_returns = [None] * num_programs
batch_targets = [None] * num_programs
for i in xrange(num_programs):
episode_length = episode_lengths[i]
assert len(batch_rewards[i]) == episode_length
# Compute target for each timestep.
# If we are computing A2C:
# target_t = advantage_t = R_t - V(c_t)
# where V(c_t) is a learned value function (provided as `values`).
# Otherwise:
# target_t = R_t - baselines[t]
# where `baselines` are provided.
# In practice we use a more generalized formulation of advantage. See docs
# for `discounted_advantage_and_rewards`.
if a2c:
# Compute advantage.
assert batch_values is not None
episode_values = batch_values[i, :episode_length]
episode_rewards = batch_rewards[i]
emp_val, gen_adv = rollout_lib.discounted_advantage_and_rewards(
episode_rewards, episode_values, gamma=1.0, lambda_=1.0)
batch_returns[i] = emp_val
batch_targets[i] = gen_adv
else:
# Compute return for each timestep. See section 3 of
# https://arxiv.org/pdf/1602.01783.pdf
assert baselines is not None
empirical_returns = rollout_lib.discount(batch_rewards[i], gamma=1.0)
targets = [None] * episode_length
for j in xrange(episode_length):
targets[j] = empirical_returns[j] - baselines[j]
batch_returns[i] = empirical_returns
batch_targets[i] = targets
batch_returns = utils.stack_pad(batch_returns, 0)
if num_programs:
batch_targets = utils.stack_pad(batch_targets, 0)
else:
batch_targets = np.array([], dtype=np.float32)
return (batch_targets, batch_returns)
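# Worked example for process_episodes (illustrative, vanilla REINFORCE branch):
# with gamma = 1, batch_rewards = [[1.0, 0.0, 2.0]], episode_lengths = [3], and
# baselines = [1.0, 1.0, 1.0], the empirical returns (sums of current and
# future rewards) are [3.0, 2.0, 2.0], and the targets are
# returns - baselines = [2.0, 1.0, 1.0].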
"""Tests for pg_agent."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import Counter
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from common import utils # brain coder
from single_task import data # brain coder
from single_task import defaults # brain coder
from single_task import misc # brain coder
from single_task import pg_agent as agent_lib # brain coder
from single_task import pg_train # brain coder
# Symmetric mean absolute percentage error (SMAPE).
# https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error
def smape(a, b):
return 2.0 * abs(a - b) / float(a + b)
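# Illustrative value (not part of the original test):
# smape(90, 110) = 2 * 20 / 200 = 0.2, i.e. a 20% symmetric relative difference.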
def onehot(dim, num_dims):
value = np.zeros(num_dims, dtype=np.float32)
value[dim] = 1
return value
def random_sequence(max_length, num_tokens, eos=0):
length = np.random.randint(1, max_length - 1)
return np.append(np.random.randint(1, num_tokens, length), eos)
def repeat_and_pad(v, rep, total_len):
return [v] * rep + [0.0] * (total_len - rep)
class AgentTest(tf.test.TestCase):
def testProcessEpisodes(self):
batch_size = 3
def reward_fn(code_string):
return misc.RewardInfo(
episode_rewards=[float(ord(c)) for c in code_string],
input_case=[],
correct_output=[],
code_output=[],
input_type=misc.IOType.integer,
output_type=misc.IOType.integer,
reason='none')
rl_batch = data.RLBatch(
reward_fns=[reward_fn for _ in range(batch_size)],
batch_size=batch_size,
good_reward=10.0)
batch_actions = np.asarray([
[4, 5, 3, 6, 8, 1, 0, 0],
[1, 2, 3, 4, 0, 0, 0, 0],
[8, 7, 6, 5, 4, 3, 2, 1]], dtype=np.int32)
batch_values = np.asarray([
[0, 1, 2, 1, 0, 1, 1, 0],
[0, 2, 1, 2, 1, 0, 0, 0],
[0, 1, 1, 0, 0, 0, 1, 1]], dtype=np.float32)
episode_lengths = np.asarray([7, 5, 8], dtype=np.int32)
scores = agent_lib.compute_rewards(
rl_batch, batch_actions, episode_lengths)
batch_targets, batch_returns = agent_lib.process_episodes(
scores.batch_rewards, episode_lengths, a2c=True,
batch_values=batch_values)
self.assertEqual(
[[473.0, 428.0, 337.0, 294.0, 201.0, 157.0, 95.0, 0.0],
[305.0, 243.0, 183.0, 140.0, 95.0, 0.0, 0.0, 0.0],
[484.0, 440.0, 394.0, 301.0, 210.0, 165.0, 122.0, 62.0]],
batch_returns.tolist())
self.assertEqual(
[[473.0, 427.0, 335.0, 293.0, 201.0, 156.0, 94.0, 0.0],
[305.0, 241.0, 182.0, 138.0, 94.0, 0.0, 0.0, 0.0],
[484.0, 439.0, 393.0, 301.0, 210.0, 165.0, 121.0, 61.0]],
batch_targets.tolist())
def testVarUpdates(self):
"""Tests that variables get updated as expected.
For the RL update, check that gradients are non-zero and that the global
model gets updated.
"""
config = defaults.default_config_with_updates(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",eos_token=True,optimizer="sgd",lr=1.0)')
lr = config.agent.lr
tf.reset_default_graph()
trainer = pg_train.AsyncTrainer(
config, task_id=0, ps_tasks=0, num_workers=1)
global_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
with tf.Session() as sess:
sess.run(global_init_op) # Initialize global copy.
trainer.initialize(sess)
model = trainer.model
global_vars = sess.run(trainer.global_model.trainable_variables)
local_vars = sess.run(model.trainable_variables)
# Make sure names match.
g_prefix = 'global/'
l_prefix = 'local/'
for g, l in zip(trainer.global_model.trainable_variables,
model.trainable_variables):
self.assertEqual(g.name[len(g_prefix):], l.name[len(l_prefix):])
# Assert that shapes and values are the same between global and local
# models.
for g, l in zip(global_vars, local_vars):
self.assertEqual(g.shape, l.shape)
self.assertTrue(np.array_equal(g, l))
# Make all gradients dense tensors.
for param, grad in model.gradients_dict.items():
if isinstance(grad, tf.IndexedSlices):
# Converts to dense tensor.
model.gradients_dict[param] = tf.multiply(grad, 1.0)
# Perform update.
results = model.update_step(
sess, trainer.data_manager.sample_rl_batch(), trainer.train_op,
trainer.global_step, return_gradients=True)
grads_dict = results.gradients_dict
for grad in grads_dict.values():
self.assertIsNotNone(grad)
self.assertTrue(np.count_nonzero(grad) > 0)
global_update = sess.run(trainer.global_model.trainable_variables)
for tf_var, var_before, var_after in zip(
model.trainable_variables, local_vars, global_update):
# Check that the params were updated.
self.assertTrue(np.allclose(
var_after,
var_before - grads_dict[tf_var] * lr))
# Test that global to local sync works.
sess.run(trainer.sync_op)
global_vars = sess.run(trainer.global_model.trainable_variables)
local_vars = sess.run(model.trainable_variables)
for l, g in zip(local_vars, global_vars):
self.assertTrue(np.allclose(l, g))
def testMonteCarloGradients(self):
"""Test Monte Carlo estimate of REINFORCE gradient.
Test that the Monte Carlo estimate of the REINFORCE gradient is
approximately equal to the true gradient. We compute the true gradient for a
toy environment with a very small action space.
Similar to section 5 of https://arxiv.org/pdf/1505.00521.pdf.
"""
# Test may have different outcome on different machines due to different
# rounding behavior of float arithmetic.
tf.reset_default_graph()
tf.set_random_seed(12345678987654321)
np.random.seed(1294024302)
max_length = 2
num_tokens = misc.bf_num_tokens()
eos = misc.BF_EOS_INT
assert eos == 0
def sequence_iterator(max_length):
"""Iterates through all sequences up to the given length."""
yield [eos]
for a in xrange(1, num_tokens):
if max_length > 1:
for sub_seq in sequence_iterator(max_length - 1):
yield [a] + sub_seq
else:
yield [a]
actions = list(sequence_iterator(max_length))
# This batch contains all possible episodes up to max_length.
actions_batch = utils.stack_pad(actions, 0)
lengths_batch = [len(s) for s in actions]
reward_map = {tuple(a): np.random.randint(-1, 7) for a in actions_batch}
# reward_map = {tuple(a): np.random.normal(3, 1)
# for a in actions_batch} # normal distribution
# reward_map = {tuple(a): 1.0
# for a in actions_batch} # expected reward is 1
n = 100000 # MC sample size.
config = defaults.default_config_with_updates(
'env=c(task="print"),'
'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
'entropy_beta=0.0,topk_loss_hparam=0.0,regularizer=0.0,'
'policy_lstm_sizes=[10],eos_token=True),'
'batch_size='+str(n)+',timestep_limit='+str(max_length))
dtype = tf.float64
trainer = pg_train.AsyncTrainer(
config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
model = trainer.model
actions_ph = model.actions
lengths_ph = model.adjusted_lengths
multipliers_ph = model.policy_multipliers
global_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
with tf.Session() as sess, sess.graph.as_default():
sess.run(global_init_op) # Initialize global copy.
trainer.initialize(sess)
# Compute exact gradients.
# exact_grads = sum(P(a) * grad(log P(a)) * R(a) for a in actions_batch)
true_loss_unnormalized = 0.0
exact_grads = [np.zeros(v.shape) for v in model.trainable_variables]
episode_probs_map = {}
grads_map = {}
for a_idx in xrange(len(actions_batch)):
a = actions_batch[a_idx]
grads_result, probs_result, loss = sess.run(
[model.dense_unclipped_grads, model.chosen_probs, model.loss],
{actions_ph: [a],
lengths_ph: [lengths_batch[a_idx]],
multipliers_ph: [
repeat_and_pad(reward_map[tuple(a)],
lengths_batch[a_idx],
max_length)]})
# Take product over time axis.
episode_probs_result = np.prod(probs_result[0, :lengths_batch[a_idx]])
for i in range(0, len(exact_grads)):
exact_grads[i] += grads_result[i] * episode_probs_result
episode_probs_map[tuple(a)] = episode_probs_result
reward_map[tuple(a)] = reward_map[tuple(a)]
grads_map[tuple(a)] = grads_result
true_loss_unnormalized += loss
      # Normalize the loss. Since each episode is fed into the model one at a
      # time, normalization needs to be done manually.
true_loss = true_loss_unnormalized / float(len(actions_batch))
# Compute Monte Carlo gradients.
      # E_a~P[grad(log P(a)) R(a)] is approximately equal to
      # sum(grad(log P(a)) R(a) for a in actions_sampled_from_P) / n
      # where len(actions_sampled_from_P) == n.
      #
      # In other words, sample from the policy and compute the gradients of the
      # log probs weighted by the returns. This exercises the code in agent.py.
sampled_actions, sampled_lengths = sess.run(
[model.sampled_tokens, model.episode_lengths])
pi_multipliers = [
repeat_and_pad(reward_map[tuple(a)], l, max_length)
for a, l in zip(sampled_actions, sampled_lengths)]
mc_grads_unnormalized, sampled_probs, mc_loss_unnormalized = sess.run(
[model.dense_unclipped_grads, model.chosen_probs, model.loss],
{actions_ph: sampled_actions,
multipliers_ph: pi_multipliers,
lengths_ph: sampled_lengths})
# Loss is already normalized across the minibatch, so no normalization
# is needed.
mc_grads = mc_grads_unnormalized
mc_loss = mc_loss_unnormalized
# Make sure true loss and MC loss are similar.
loss_error = smape(true_loss, mc_loss)
self.assertTrue(loss_error < 0.15, msg='actual: %s' % loss_error)
# Check that probs computed for episodes sampled from the model are the same
# as the recorded true probs.
for i in range(100):
acs = tuple(sampled_actions[i].tolist())
sampled_prob = np.prod(sampled_probs[i, :sampled_lengths[i]])
self.assertTrue(np.isclose(episode_probs_map[acs], sampled_prob))
# Make sure MC estimates of true probs are close.
counter = Counter(tuple(e) for e in sampled_actions)
for acs, count in counter.iteritems():
mc_prob = count / float(len(sampled_actions))
true_prob = episode_probs_map[acs]
error = smape(mc_prob, true_prob)
self.assertTrue(
error < 0.15,
msg='actual: %s; count: %s; mc_prob: %s; true_prob: %s'
% (error, count, mc_prob, true_prob))
# Manually recompute MC gradients and make sure they match MC gradients
# computed in TF.
mc_grads_recompute = [np.zeros(v.shape) for v in model.trainable_variables]
      for i in range(n):
        acs = tuple(sampled_actions[i].tolist())
        for j in range(0, len(mc_grads_recompute)):
          mc_grads_recompute[j] += grads_map[acs][j]
for i in range(0, len(mc_grads_recompute)):
self.assertTrue(np.allclose(mc_grads[i], mc_grads_recompute[i] / n))
# Check angle between gradients as fraction of pi.
for index in range(len(mc_grads)):
v1 = mc_grads[index].reshape(-1)
v2 = exact_grads[index].reshape(-1)
# angle = arccos(v1 . v2 / (|v1|*|v2|))
angle_rad = np.arccos(
np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
logging.info('angle / pi: %s', angle_rad / np.pi)
angle_frac = angle_rad / np.pi
self.assertTrue(angle_frac < 0.02, msg='actual: %s' % angle_frac)
# Check norms.
for index in range(len(mc_grads)):
v1_norm = np.linalg.norm(mc_grads[index].reshape(-1))
v2_norm = np.linalg.norm(exact_grads[index].reshape(-1))
error = smape(v1_norm, v2_norm)
self.assertTrue(error < 0.02, msg='actual: %s' % error)
# Check expected rewards.
# E_a~P[R(a)] approx eq sum(P(a) * R(a) for a in actions)
mc_expected_reward = np.mean(
[reward_map[tuple(a)] for a in sampled_actions])
exact_expected_reward = np.sum(
[episode_probs_map[k] * reward_map[k] for k in reward_map])
error = smape(mc_expected_reward, exact_expected_reward)
      self.assertTrue(error < 0.005, msg='actual: %s' % error)
def testNumericalGradChecking(self):
# Similar to
# http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization.
epsilon = 1e-4
eos = misc.BF_EOS_INT
self.assertEqual(0, eos)
config = defaults.default_config_with_updates(
'env=c(task="print"),'
'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
'entropy_beta=0.0,topk_loss_hparam=0.0,policy_lstm_sizes=[10],'
'eos_token=True),'
'batch_size=64')
dtype = tf.float64
tf.reset_default_graph()
tf.set_random_seed(12345678987654321)
np.random.seed(1294024302)
trainer = pg_train.AsyncTrainer(
config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
model = trainer.model
actions_ph = model.actions
lengths_ph = model.adjusted_lengths
multipliers_ph = model.policy_multipliers
loss = model.pi_loss
global_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
assign_add_placeholders = [None] * len(model.trainable_variables)
assign_add_ops = [None] * len(model.trainable_variables)
param_shapes = [None] * len(model.trainable_variables)
for i, param in enumerate(model.trainable_variables):
param_shapes[i] = param.get_shape().as_list()
assign_add_placeholders[i] = tf.placeholder(dtype,
np.prod(param_shapes[i]))
assign_add_ops[i] = param.assign_add(
tf.reshape(assign_add_placeholders[i], param_shapes[i]))
with tf.Session() as sess:
sess.run(global_init_op) # Initialize global copy.
trainer.initialize(sess)
actions_raw = [random_sequence(10, 9) for _ in xrange(16)]
actions_batch = utils.stack_pad(actions_raw, 0)
lengths_batch = [len(l) for l in actions_raw]
feed = {actions_ph: actions_batch,
multipliers_ph: np.ones_like(actions_batch),
lengths_ph: lengths_batch}
estimated_grads = [None] * len(model.trainable_variables)
for i, param in enumerate(model.trainable_variables):
param_size = np.prod(param_shapes[i])
estimated_grads[i] = np.zeros(param_size, dtype=np.float64)
for index in xrange(param_size):
e = onehot(index, param_size) * epsilon
sess.run(assign_add_ops[i],
{assign_add_placeholders[i]: e})
j_plus = sess.run(loss, feed)
sess.run(assign_add_ops[i],
{assign_add_placeholders[i]: -2 * e})
j_minus = sess.run(loss, feed)
sess.run(assign_add_ops[i],
{assign_add_placeholders[i]: e})
estimated_grads[i][index] = (j_plus - j_minus) / (2 * epsilon)
estimated_grads[i] = estimated_grads[i].reshape(param_shapes[i])
analytic_grads = sess.run(model.dense_unclipped_grads, feed)
for g1, g2 in zip(estimated_grads[1:], analytic_grads[1:]):
        logging.info('mean abs diff (g1 - g2): %s', np.abs(g1 - g2).mean())
self.assertTrue(np.allclose(g1, g2))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""Train RL agent on coding tasks."""
import contextlib
import cPickle
import cProfile
import marshal
import os
import time
from absl import flags
from absl import logging
import tensorflow as tf
# internal session lib import
from single_task import data # brain coder
from single_task import defaults # brain coder
from single_task import pg_agent as agent_lib # brain coder
from single_task import results_lib # brain coder
FLAGS = flags.FLAGS
flags.DEFINE_string(
'master', '',
'URL of the TensorFlow master to use.')
flags.DEFINE_integer(
'ps_tasks', 0,
    'Number of parameter server tasks. Set to 0 only for '
    'single-worker training.')
flags.DEFINE_integer(
'summary_interval', 10,
'How often to write summaries.')
flags.DEFINE_integer(
'summary_tasks', 16,
'If greater than 0 only tasks 0 through summary_tasks - 1 '
'will write summaries. If 0, all tasks will write '
'summaries.')
flags.DEFINE_bool(
'stop_on_success', True,
'If True, training will stop as soon as a solution is found. '
'If False, training will continue indefinitely until another '
'stopping condition is reached.')
flags.DEFINE_bool(
'do_profiling', False,
'If True, cProfile profiler will run and results will be '
'written to logdir. WARNING: Results will not be written if '
    'the code crashes. Make sure it exits successfully.')
flags.DEFINE_integer('model_v', 0, 'Model verbosity level.')
flags.DEFINE_bool(
'delayed_graph_cleanup', True,
'If true, container for n-th run will not be reset until the (n+1)-th run '
'is complete. This greatly reduces the chance that a worker is still '
'using the n-th container when it is cleared.')
def define_tuner_hparam_space(hparam_space_type):
"""Define tunable hparams for grid search."""
if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
# Discrete hparam space is stored as a dict from hparam name to discrete
# values.
hparam_space = {}
if hparam_space_type in ('pg', 'pg-topk', 'is'):
# Add a floating point parameter named learning rate.
hparam_space['lr'] = [1e-5, 1e-4, 1e-3]
hparam_space['entropy_beta'] = [0.005, 0.01, 0.05, 0.10]
else: # 'topk'
# Add a floating point parameter named learning rate.
hparam_space['lr'] = [1e-5, 1e-4, 1e-3]
hparam_space['entropy_beta'] = [0.0, 0.005, 0.01, 0.05, 0.10]
if hparam_space_type in ('topk', 'pg-topk'):
# topk tuning will be enabled.
hparam_space['topk'] = [10]
hparam_space['topk_loss_hparam'] = [1.0, 10.0, 50.0, 200.0]
elif hparam_space_type == 'is':
# importance sampling tuning will be enabled.
hparam_space['replay_temperature'] = [0.25, 0.5, 1.0, 2.0]
hparam_space['alpha'] = [0.5, 0.75, 63/64.]
return hparam_space
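# Illustrative only (not called by the training or tuning code): a minimal
# sketch of how the discrete space returned by `define_tuner_hparam_space`
# could be expanded into a full grid of hparam assignments. The helper name
# `enumerate_hparam_grid` is hypothetical.
def enumerate_hparam_grid(hparam_space):
  """Yields one dict per point in the cartesian product of all hparam values."""
  import itertools  # Local import to keep this sketch self-contained.
  names = sorted(hparam_space)
  for values in itertools.product(*[hparam_space[name] for name in names]):
    yield dict(zip(names, values))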
def write_hparams_to_config(config, hparams, hparam_space_type):
"""Write hparams given by the tuner into the Config object."""
if hparam_space_type not in ('pg', 'pg-topk', 'topk', 'is'):
raise ValueError('Hparam space is not valid: "%s"' % hparam_space_type)
config.agent.lr = hparams.lr
config.agent.entropy_beta = hparams.entropy_beta
if hparam_space_type in ('topk', 'pg-topk'):
# topk tuning will be enabled.
config.agent.topk = hparams.topk
config.agent.topk_loss_hparam = hparams.topk_loss_hparam
elif hparam_space_type == 'is':
# importance sampling tuning will be enabled.
config.agent.replay_temperature = hparams.replay_temperature
config.agent.alpha = hparams.alpha
def make_initialized_variable(value, name, shape=None, dtype=tf.float32):
"""Create a tf.Variable with a constant initializer.
Args:
value: Constant value to initialize the variable with. This is the value
that the variable starts with.
name: Name of the variable in the TF graph.
shape: Shape of the variable. If None, variable will be a scalar.
dtype: Data type of the variable. Should be a TF dtype. Defaults to
tf.float32.
Returns:
tf.Variable instance.
"""
if shape is None:
shape = []
return tf.get_variable(
name=name, shape=shape, initializer=tf.constant_initializer(value),
dtype=dtype, trainable=False)
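# Example (a sketch that mirrors how make_initialized_variable is used in
# AsyncTrainer.__init__ below; the variable names are hypothetical and this
# function is never called):
def _example_counters():
  example_step = make_initialized_variable(0, 'example_step', dtype=tf.int64)
  example_best_reward = make_initialized_variable(
      -10.0, 'example_best_reward', dtype=tf.float64)
  return example_step, example_best_reward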
class AsyncTrainer(object):
"""Manages graph creation and training.
This async trainer creates a global model on the parameter server, and a local
model (for this worker). Gradient updates are sent to the global model, and
the updated weights are synced to the local copy.
"""
def __init__(self, config, task_id, ps_tasks, num_workers, is_chief=True,
summary_writer=None,
dtype=tf.float32,
summary_interval=1,
run_number=0,
logging_dir='/tmp', model_v=0):
self.config = config
self.data_manager = data.DataManager(
config, run_number=run_number,
do_code_simplification=not FLAGS.stop_on_success)
self.task_id = task_id
self.ps_tasks = ps_tasks
self.is_chief = is_chief
if ps_tasks == 0:
assert task_id == 0, 'No parameter servers specified. Expecting 1 task.'
assert num_workers == 1, (
'No parameter servers specified. Expecting 1 task.')
worker_device = '/job:localhost/replica:%d/task:0/cpu:0' % task_id
# worker_device = '/cpu:0'
# ps_device = '/cpu:0'
else:
assert num_workers > 0, 'There must be at least 1 training worker.'
worker_device = '/job:worker/replica:%d/task:0/cpu:0' % task_id
# ps_device = '/job:ps/replica:0/task:0/cpu:0'
logging.info('worker_device: %s', worker_device)
logging_file = os.path.join(
logging_dir, 'solutions_%d.txt' % task_id)
experience_replay_file = os.path.join(
logging_dir, 'replay_buffer_%d.pickle' % task_id)
self.topk_file = os.path.join(
logging_dir, 'topk_buffer_%d.pickle' % task_id)
tf.get_variable_scope().set_use_resource(True)
# global model
with tf.device(tf.train.replica_device_setter(ps_tasks,
ps_device='/job:ps/replica:0',
worker_device=worker_device)):
with tf.variable_scope('global'):
global_model = agent_lib.LMAgent(config, dtype=dtype, is_local=False)
global_params_dict = {p.name: p
for p in global_model.sync_variables}
self.global_model = global_model
self.global_step = make_initialized_variable(
0, 'global_step', dtype=tf.int64)
self.global_best_reward = make_initialized_variable(
-10.0, 'global_best_reward', dtype=tf.float64)
self.is_best_model = make_initialized_variable(
False, 'is_best_model', dtype=tf.bool)
self.reset_is_best_model = self.is_best_model.assign(False)
self.global_best_reward_placeholder = tf.placeholder(
tf.float64, [], name='global_best_reward_placeholder')
self.assign_global_best_reward_op = tf.group(
self.global_best_reward.assign(
self.global_best_reward_placeholder),
self.is_best_model.assign(True))
def assign_global_best_reward_fn(session, reward):
reward = round(reward, 10)
best_reward = round(session.run(self.global_best_reward), 10)
is_best = reward > best_reward
if is_best:
session.run(self.assign_global_best_reward_op,
{self.global_best_reward_placeholder: reward})
return is_best
self.assign_global_best_reward_fn = assign_global_best_reward_fn
# Any worker will set to true when it finds a solution.
self.found_solution_flag = make_initialized_variable(
False, 'found_solution_flag', dtype=tf.bool)
self.found_solution_op = self.found_solution_flag.assign(True)
self.run_number = make_initialized_variable(
run_number, 'run_number', dtype=tf.int32)
# Store a solution when found.
self.code_solution_variable = tf.get_variable(
'code_solution', [], tf.string,
initializer=tf.constant_initializer(''))
self.code_solution_ph = tf.placeholder(
tf.string, [], name='code_solution_ph')
self.code_solution_assign_op = self.code_solution_variable.assign(
self.code_solution_ph)
def assign_code_solution_fn(session, code_solution_string):
session.run(self.code_solution_assign_op,
{self.code_solution_ph: code_solution_string})
self.assign_code_solution_fn = assign_code_solution_fn
# Count all programs sampled from policy. This does not include
# programs sampled from replay buffer.
# This equals NPE (number of programs executed). Only programs sampled
# from the policy need to be executed.
self.program_count = make_initialized_variable(
0, 'program_count', dtype=tf.int64)
# local model
with tf.device(worker_device):
with tf.variable_scope('local'):
self.model = model = agent_lib.LMAgent(
config,
task_id=task_id,
logging_file=logging_file,
experience_replay_file=experience_replay_file,
dtype=dtype,
global_best_reward_fn=self.assign_global_best_reward_fn,
found_solution_op=self.found_solution_op,
assign_code_solution_fn=self.assign_code_solution_fn,
program_count=self.program_count,
stop_on_success=FLAGS.stop_on_success,
verbose_level=model_v)
local_params = model.trainable_variables
local_params_dict = {p.name: p for p in local_params}
# Pull global params to local model.
def _global_to_local_scope(name):
assert name.startswith('global/')
return 'local' + name[6:]
sync_dict = {
local_params_dict[_global_to_local_scope(p_name)]: p
for p_name, p in global_params_dict.items()}
self.sync_op = tf.group(*[v_local.assign(v_global)
for v_local, v_global
in sync_dict.items()])
# Pair local gradients with global params.
grad_var_dict = {
gradient: sync_dict[local_var]
for local_var, gradient in model.gradients_dict.items()}
# local model
model.make_summary_ops() # Don't put summaries under 'local' scope.
with tf.variable_scope('local'):
self.train_op = model.optimizer.apply_gradients(
grad_var_dict.items(), global_step=self.global_step)
self.local_init_op = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
tf.get_variable_scope().name))
self.local_step = 0
self.last_summary_time = time.time()
self.summary_interval = summary_interval
self.summary_writer = summary_writer
self.cached_global_step = -1
self.cached_global_npe = -1
logging.info('summary_interval: %d', self.summary_interval)
# Load top-k buffer.
if self.model.top_episodes is not None and tf.gfile.Exists(self.topk_file):
try:
with tf.gfile.FastGFile(self.topk_file, 'r') as f:
self.model.top_episodes = cPickle.loads(f.read())
logging.info(
'Loaded top-k buffer from disk with %d items. Location: "%s"',
len(self.model.top_episodes), self.topk_file)
except (cPickle.UnpicklingError, EOFError) as e:
logging.warn(
'Failed to load existing top-k buffer from disk. Removing bad file.'
'\nLocation: "%s"\nException: %s', self.topk_file, str(e))
tf.gfile.Remove(self.topk_file)
def initialize(self, session):
"""Run initialization ops."""
session.run(self.local_init_op)
session.run(self.sync_op)
self.cached_global_step, self.cached_global_npe = session.run(
[self.global_step, self.program_count])
def update_global_model(self, session):
"""Run an update step.
1) Asynchronously copy global weights to local model.
2) Call into local model's update_step method, which does the following:
a) Sample batch of programs from policy.
b) Compute rewards.
c) Compute gradients and update the global model asynchronously.
3) Write tensorboard summaries to disk.
Args:
session: tf.Session instance.
"""
session.run(self.sync_op) # Copy weights from global to local.
with session.as_default():
result = self.model.update_step(
session, self.data_manager.sample_rl_batch(), self.train_op,
self.global_step)
global_step = result.global_step
global_npe = result.global_npe
summaries = result.summaries_list
self.cached_global_step = global_step
self.cached_global_npe = global_npe
self.local_step += 1
if self.summary_writer and self.local_step % self.summary_interval == 0:
if not isinstance(summaries, (tuple, list)):
summaries = [summaries]
summaries.append(self._local_step_summary())
if self.is_chief:
(global_best_reward,
found_solution_flag,
program_count) = session.run(
[self.global_best_reward,
self.found_solution_flag,
self.program_count])
summaries.append(
tf.Summary(
value=[tf.Summary.Value(
tag='model/best_reward',
simple_value=global_best_reward)]))
summaries.append(
tf.Summary(
value=[tf.Summary.Value(
tag='model/solution_found',
simple_value=int(found_solution_flag))]))
summaries.append(
tf.Summary(
value=[tf.Summary.Value(
tag='model/program_count',
simple_value=program_count)]))
for s in summaries:
self.summary_writer.add_summary(s, global_step)
self.last_summary_time = time.time()
def _local_step_summary(self):
"""Compute number of local steps per time increment."""
dt = time.time() - self.last_summary_time
steps_per_time = self.summary_interval / float(dt)
return tf.Summary(value=[
tf.Summary.Value(
tag='local_step/per_sec',
simple_value=steps_per_time),
tf.Summary.Value(
tag='local_step/step',
simple_value=self.local_step)])
def maybe_save_best_model(self, session, saver, checkpoint_file):
"""Check if this model got the highest reward and save to disk if so."""
if self.is_chief and session.run(self.is_best_model):
logging.info('Saving best model to "%s"', checkpoint_file)
saver.save(session, checkpoint_file)
session.run(self.reset_is_best_model)
def save_replay_buffer(self):
"""Save replay buffer to disk.
Call this periodically so that training can recover if jobs go down.
"""
if self.model.experience_replay is not None:
logging.info('Saving experience replay buffer to "%s".',
self.model.experience_replay.save_file)
self.model.experience_replay.incremental_save(True)
def delete_replay_buffer(self):
"""Delete replay buffer from disk.
Call this at the end of training to clean up. Replay buffer can get very
large.
"""
if self.model.experience_replay is not None:
logging.info('Deleting experience replay buffer at "%s".',
self.model.experience_replay.save_file)
tf.gfile.Remove(self.model.experience_replay.save_file)
def save_topk_buffer(self):
"""Save top-k buffer to disk.
Call this periodically so that training can recover if jobs go down.
"""
if self.model.top_episodes is not None:
logging.info('Saving top-k buffer to "%s".', self.topk_file)
# Overwrite previous data each time.
with tf.gfile.FastGFile(self.topk_file, 'w') as f:
f.write(cPickle.dumps(self.model.top_episodes))
@contextlib.contextmanager
def managed_session(sv, master='', config=None,
start_standard_services=True,
close_summary_writer=True,
max_wait_secs=7200):
# Same as Supervisor.managed_session, but with configurable timeout.
try:
sess = sv.prepare_or_wait_for_session(
master=master, config=config,
start_standard_services=start_standard_services,
max_wait_secs=max_wait_secs)
yield sess
except tf.errors.DeadlineExceededError:
raise
except Exception as e: # pylint: disable=broad-except
sv.request_stop(e)
finally:
try:
# Request all the threads to stop and wait for them to do so. Any
# exception raised by the threads is raised again from stop().
# Passing stop_grace_period_secs is for blocked enqueue/dequeue
# threads which are not checking for `should_stop()`. They
# will be stopped when we close the session further down.
sv.stop(close_summary_writer=close_summary_writer)
finally:
# Close the session to finish up all pending calls. We do not care
# about exceptions raised when closing. This takes care of
# blocked enqueue/dequeue calls.
try:
sess.close()
except Exception: # pylint: disable=broad-except
# Silently ignore exceptions raised by close().
pass
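# Illustrative only (mirrors how `train` uses managed_session below; this
# sketch is never called and `train_step_fn` is a hypothetical callback):
def _example_managed_session_loop(sv, train_step_fn, master=''):
  # Use a 60 second connection timeout instead of the 7200 second default.
  with managed_session(sv, master, max_wait_secs=60) as session:
    while not sv.should_stop():
      train_step_fn(session)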
def train(config, is_chief, tuner=None, run_dir=None, run_number=0,
results_writer=None):
"""Run training loop.
Args:
config: config_lib.Config instance containing global config (agent and env).
is_chief: True if this worker is chief. Chief worker manages writing some
data to disk and initialization of the global model.
tuner: A tuner instance. If not tuning, leave as None.
run_dir: Directory where all data for this run will be written. If None,
run_dir = FLAGS.logdir. Set this argument when doing multiple runs.
run_number: Which run is this.
    results_writer: Manages writing training results to disk. Results are a
dict of metric names and values.
Returns:
The trainer object used to run training updates.
"""
logging.info('Will run asynchronous training.')
if run_dir is None:
run_dir = FLAGS.logdir
train_dir = os.path.join(run_dir, 'train')
best_model_checkpoint = os.path.join(train_dir, 'best.ckpt')
events_dir = '%s/events_%d' % (run_dir, FLAGS.task_id)
logging.info('Events directory: %s', events_dir)
logging_dir = os.path.join(run_dir, 'logs')
if not tf.gfile.Exists(logging_dir):
tf.gfile.MakeDirs(logging_dir)
status_file = os.path.join(logging_dir, 'status.txt')
if FLAGS.summary_tasks and FLAGS.task_id < FLAGS.summary_tasks:
summary_writer = tf.summary.FileWriter(events_dir)
else:
summary_writer = None
# Only profile task 0.
if FLAGS.do_profiling:
logging.info('Profiling enabled')
profiler = cProfile.Profile()
profiler.enable()
else:
profiler = None
trainer = AsyncTrainer(
config, FLAGS.task_id, FLAGS.ps_tasks, FLAGS.num_workers,
is_chief=is_chief,
summary_interval=FLAGS.summary_interval,
summary_writer=summary_writer,
logging_dir=logging_dir,
run_number=run_number,
model_v=FLAGS.model_v)
variables_to_save = [v for v in tf.global_variables()
if v.name.startswith('global')]
global_init_op = tf.variables_initializer(variables_to_save)
saver = tf.train.Saver(variables_to_save)
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
logging.info('Trainable vars:')
for v in var_list:
logging.info(' %s, %s, %s', v.name, v.device, v.get_shape())
logging.info('All vars:')
for v in tf.global_variables():
logging.info(' %s, %s, %s', v.name, v.device, v.get_shape())
def init_fn(unused_sess):
logging.info('No checkpoint found. Initialized global params.')
sv = tf.train.Supervisor(is_chief=is_chief,
logdir=train_dir,
saver=saver,
summary_op=None,
init_op=global_init_op,
init_fn=init_fn,
summary_writer=summary_writer,
ready_op=tf.report_uninitialized_variables(
variables_to_save),
ready_for_local_init_op=None,
global_step=trainer.global_step,
save_model_secs=30,
save_summaries_secs=30)
# Add a thread that periodically checks if this Trial should stop
# based on an early stopping policy.
if tuner:
sv.Loop(60, tuner.check_for_stop, (sv.coord,))
last_replay_save_time = time.time()
global_step = -1
logging.info(
'Starting session. '
      'If this hangs, we\'re most likely waiting to connect '
'to the parameter server. One common cause is that the parameter '
'server DNS name isn\'t resolving yet, or is misspecified.')
should_retry = True
supervisor_deadline_exceeded = False
while should_retry:
try:
with managed_session(
sv, FLAGS.master, max_wait_secs=60) as session, session.as_default():
should_retry = False
do_training = True
try:
trainer.initialize(session)
if session.run(trainer.run_number) != run_number:
            # If we loaded an existing model from disk and the saved run number
            # is different, throw an exception.
raise RuntimeError(
'Expecting to be on run %d, but is actually on run %d. '
'run_dir: "%s"'
% (run_number, session.run(trainer.run_number), run_dir))
global_step = trainer.cached_global_step
logging.info('Starting training at step=%d', global_step)
while do_training:
trainer.update_global_model(session)
if is_chief:
trainer.maybe_save_best_model(
session, saver, best_model_checkpoint)
global_step = trainer.cached_global_step
global_npe = trainer.cached_global_npe
if time.time() - last_replay_save_time >= 30:
trainer.save_replay_buffer()
trainer.save_topk_buffer()
last_replay_save_time = time.time()
# Stopping conditions.
if tuner and tuner.should_trial_stop():
logging.info('Tuner requested early stopping. Finishing.')
do_training = False
if is_chief and FLAGS.stop_on_success:
found_solution = session.run(trainer.found_solution_flag)
if found_solution:
do_training = False
logging.info('Solution found. Finishing.')
if FLAGS.max_npe and global_npe >= FLAGS.max_npe:
# Max NPE (number of programs executed) reached.
logging.info('Max NPE reached. Finishing.')
do_training = False
if sv.should_stop():
logging.info('Supervisor issued stop. Finishing.')
do_training = False
except tf.errors.NotFoundError:
# Catch "Error while reading resource variable".
# The chief worker likely destroyed the container, so do not retry.
logging.info('Caught NotFoundError. Quitting.')
do_training = False
should_retry = False
break
except tf.errors.InternalError as e:
# Catch "Invalid variable reference."
if str(e).startswith('Invalid variable reference.'):
# The chief worker likely destroyed the container, so do not
# retry.
logging.info(
'Caught "InternalError: Invalid variable reference.". '
'Quitting.')
do_training = False
should_retry = False
break
else:
# Pass exception through.
raise
# Exited training loop. Write results to disk.
if is_chief and results_writer:
assert not should_retry
with tf.gfile.FastGFile(status_file, 'w') as f:
f.write('done')
(program_count,
found_solution,
code_solution,
best_reward,
global_step) = session.run(
[trainer.program_count,
trainer.found_solution_flag,
trainer.code_solution_variable,
trainer.global_best_reward,
trainer.global_step])
results_dict = {
'max_npe': FLAGS.max_npe,
'batch_size': config.batch_size,
'max_batches': FLAGS.max_npe // config.batch_size,
'npe': program_count,
'max_global_repetitions': FLAGS.num_repetitions,
'max_local_repetitions': FLAGS.num_repetitions,
'code_solution': code_solution,
'best_reward': best_reward,
'num_batches': global_step,
'found_solution': found_solution,
'task': trainer.data_manager.task_name,
'global_rep': run_number}
logging.info('results_dict: %s', results_dict)
results_writer.append(results_dict)
except tf.errors.AbortedError:
# Catch "Graph handle is not found" error due to preempted jobs.
      logging.info('Caught AbortedError. Retrying.')
should_retry = True
except tf.errors.DeadlineExceededError:
supervisor_deadline_exceeded = True
should_retry = False
if is_chief:
logging.info('This is chief worker. Stopping all workers.')
sv.stop()
if supervisor_deadline_exceeded:
logging.info('Supervisor timed out. Quitting.')
else:
logging.info('Reached %s steps. Worker stopped.', global_step)
# Dump profiling.
"""
How to use profiling data.
Download the profiler dump to your local machine, say to PROF_FILE_PATH.
In a separate script, run something like the following:
import pstats
p = pstats.Stats(PROF_FILE_PATH)
p.strip_dirs().sort_stats('cumtime').print_stats()
This will sort by 'cumtime', which "is the cumulative time spent in this and
all subfunctions (from invocation till exit)."
https://docs.python.org/2/library/profile.html#instant-user-s-manual
""" # pylint: disable=pointless-string-statement
if profiler:
prof_file = os.path.join(run_dir, 'task_%d.prof' % FLAGS.task_id)
logging.info('Done profiling.\nDumping to "%s".', prof_file)
profiler.create_stats()
with tf.gfile.Open(prof_file, 'w') as f:
f.write(marshal.dumps(profiler.stats))
return trainer
def run_training(config=None, tuner=None, logdir=None, trial_name=None,
is_chief=True):
"""Do all training runs.
This is the top level training function for policy gradient based models.
Run this from the main function.
Args:
config: config_lib.Config instance containing global config (agent and
environment hparams). If None, config will be parsed from FLAGS.config.
tuner: A tuner instance. Leave as None if not tuning.
logdir: Parent directory where all data from all runs will be written. If
None, FLAGS.logdir will be used.
trial_name: If tuning, set this to a unique string that identifies this
trial. If `tuner` is not None, this also must be set.
is_chief: True if this worker is the chief.
Returns:
List of results dicts which were written to disk. Each training run gets a
results dict. Results dict contains metrics, i.e. (name, value) pairs which
give information about the training run.
Raises:
ValueError: If results dicts read from disk contain invalid data.
"""
if not config:
# If custom config is not given, get it from flags.
config = defaults.default_config_with_updates(FLAGS.config)
if not logdir:
logdir = FLAGS.logdir
if not tf.gfile.Exists(logdir):
tf.gfile.MakeDirs(logdir)
assert FLAGS.num_repetitions > 0
results = results_lib.Results(logdir)
results_list, _ = results.read_all()
logging.info('Starting experiment. Directory: "%s"', logdir)
if results_list:
if results_list[0]['max_npe'] != FLAGS.max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s'
          % (results_list[0]['max_npe'], FLAGS.max_npe))
if results_list[0]['max_global_repetitions'] != FLAGS.num_repetitions:
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. Was %s, '
          'now %s'
          % (results_list[0]['max_global_repetitions'], FLAGS.num_repetitions))
while len(results_list) < FLAGS.num_repetitions:
run_number = len(results_list)
rep_container_name = trial_name if trial_name else 'container'
if FLAGS.num_repetitions > 1:
rep_dir = os.path.join(logdir, 'run_%d' % run_number)
rep_container_name = rep_container_name + '_run_' + str(run_number)
else:
rep_dir = logdir
logging.info(
'Starting repetition %d (%d out of %d)', run_number, run_number + 1,
FLAGS.num_repetitions)
# Train will write result to disk.
with tf.container(rep_container_name):
trainer = train(config, is_chief, tuner, rep_dir, run_number, results)
logging.info('Done training.')
if is_chief:
# Destroy current container immediately (clears current graph).
logging.info('Clearing shared variables.')
tf.Session.reset(FLAGS.master, containers=[rep_container_name])
logging.info('Shared variables cleared.')
# Delete replay buffer on disk.
assert trainer
trainer.delete_replay_buffer()
else:
# Give chief worker time to clean up.
sleep_sec = 30.0
logging.info('Sleeping for %s sec.', sleep_sec)
time.sleep(sleep_sec)
tf.reset_default_graph()
logging.info('Default graph reset.')
# Expecting that train wrote new result to disk before returning.
results_list, _ = results.read_all()
return results_list
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for pg_train.
These tests exercise code paths available through configuration options.
Training will be run for just a few steps with the goal being to check that
nothing crashes.
"""
from absl import flags
import tensorflow as tf
from single_task import defaults # brain coder
from single_task import run # brain coder
FLAGS = flags.FLAGS
class TrainTest(tf.test.TestCase):
def RunTrainingSteps(self, config_string, num_steps=10):
"""Run a few training steps with the given config.
Just check that nothing crashes.
Args:
config_string: Config encoded in a string. See
$REPO_PATH/common/config_lib.py
num_steps: Number of training steps to run. Defaults to 10.
"""
config = defaults.default_config_with_updates(config_string)
FLAGS.master = ''
FLAGS.max_npe = num_steps * config.batch_size
FLAGS.summary_interval = 1
FLAGS.logdir = tf.test.get_temp_dir()
FLAGS.config = config_string
tf.reset_default_graph()
run.main(None)
def testVanillaPolicyGradient(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg"),'
'timestep_limit=90,batch_size=64')
def testVanillaPolicyGradient_VariableLengthSequences(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",eos_token=False),'
'timestep_limit=90,batch_size=64')
def testVanillaActorCritic(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",ema_baseline_decay=0.0),'
'timestep_limit=90,batch_size=64')
def testPolicyGradientWithTopK(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10),'
'timestep_limit=90,batch_size=64')
def testVanillaActorCriticWithTopK(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",ema_baseline_decay=0.0,topk_loss_hparam=1.0,'
'topk=10),'
'timestep_limit=90,batch_size=64')
def testPolicyGradientWithTopK_VariableLengthSequences(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",topk_loss_hparam=1.0,topk=10,eos_token=False),'
'timestep_limit=90,batch_size=64')
def testPolicyGradientWithImportanceSampling(self):
self.RunTrainingSteps(
'env=c(task="reverse"),'
'agent=c(algorithm="pg",alpha=0.5),'
'timestep_limit=90,batch_size=64')
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Results object manages distributed reading and writing of results to disk."""
import ast
from collections import namedtuple
import os
import re
from six.moves import xrange
import tensorflow as tf
ShardStats = namedtuple(
'ShardStats',
['num_local_reps_completed', 'max_local_reps', 'finished'])
def ge_non_zero(a, b):
return a >= b and b > 0
def get_shard_id(file_name):
assert file_name[-4:].lower() == '.txt'
return int(file_name[file_name.rfind('_') + 1: -4])
class Results(object):
"""Manages reading and writing training results to disk asynchronously.
Each worker writes to its own file, so that there are no race conditions when
writing happens. However any worker may read any file, as is the case for
`read_all`. Writes are expected to be atomic so that workers will never
read incomplete data, and this is likely to be the case on Unix systems.
Reading out of date data is fine, as workers calling `read_all` will wait
until data from every worker has been written before proceeding.
"""
file_template = 'experiment_results_{0}.txt'
  search_regex = r'^experiment_results_([0-9]+)\.txt$'
def __init__(self, log_dir, shard_id=0):
"""Construct `Results` instance.
Args:
log_dir: Where to write results files.
shard_id: Unique id for this file (i.e. shard). Each worker that will
be writing results should use a different shard id. If there are
N shards, each shard should be numbered 0 through N-1.
"""
# Use different files for workers so that they can write to disk async.
assert 0 <= shard_id
self.file_name = self.file_template.format(shard_id)
self.log_dir = log_dir
self.results_file = os.path.join(self.log_dir, self.file_name)
def append(self, metrics):
"""Append results to results list on disk."""
with tf.gfile.FastGFile(self.results_file, 'a') as writer:
writer.write(str(metrics) + '\n')
def read_this_shard(self):
"""Read only from this shard."""
return self._read_shard(self.results_file)
def _read_shard(self, results_file):
"""Read only from the given shard file."""
try:
with tf.gfile.FastGFile(results_file, 'r') as reader:
results = [ast.literal_eval(entry) for entry in reader]
except tf.errors.NotFoundError:
# No results written to disk yet. Return empty list.
return []
return results
def _get_max_local_reps(self, shard_results):
"""Get maximum number of repetitions the given shard needs to complete.
    The worker responsible for each shard needs to complete a certain number of
    runs before it finishes. This method returns that number so that we can
    determine which shards are not done yet.
    We assume that each worker includes a 'max_local_repetitions' value in its
    results, which should be the total number of repetitions it needs to run.
Args:
shard_results: Dict mapping metric names to values. This should be read
from a shard on disk.
Returns:
Maximum number of repetitions the given shard needs to complete.
"""
mlrs = [r['max_local_repetitions'] for r in shard_results]
if not mlrs:
return 0
for n in mlrs[1:]:
assert n == mlrs[0], 'Some reps have different max rep.'
return mlrs[0]
def read_all(self, num_shards=None):
"""Read results across all shards, i.e. get global results list.
Args:
num_shards: (optional) specifies total number of shards. If the caller
wants information about which shards are incomplete, provide this
argument (so that shards which have yet to be created are still
counted as incomplete shards). Otherwise, no information about
incomplete shards will be returned.
Returns:
aggregate: Global list of results (across all shards).
shard_stats: List of ShardStats instances, one for each shard. Or None if
`num_shards` is None.
"""
try:
all_children = tf.gfile.ListDirectory(self.log_dir)
except tf.errors.NotFoundError:
if num_shards is None:
return [], None
return [], [[] for _ in xrange(num_shards)]
shard_ids = {
get_shard_id(fname): fname
for fname in all_children if re.search(self.search_regex, fname)}
if num_shards is None:
aggregate = []
shard_stats = None
for results_file in shard_ids.values():
aggregate.extend(self._read_shard(
os.path.join(self.log_dir, results_file)))
else:
results_per_shard = [None] * num_shards
for shard_id in xrange(num_shards):
if shard_id in shard_ids:
results_file = shard_ids[shard_id]
results_per_shard[shard_id] = self._read_shard(
os.path.join(self.log_dir, results_file))
else:
results_per_shard[shard_id] = []
# Compute shard stats.
shard_stats = []
for shard_results in results_per_shard:
max_local_reps = self._get_max_local_reps(shard_results)
shard_stats.append(ShardStats(
num_local_reps_completed=len(shard_results),
max_local_reps=max_local_reps,
finished=ge_non_zero(len(shard_results), max_local_reps)))
# Compute aggregate.
aggregate = [
r for shard_results in results_per_shard for r in shard_results]
return aggregate, shard_stats
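# Minimal usage sketch (hypothetical helper, not used elsewhere in this
# package): decide whether every one of `num_shards` workers has finished all
# of its repetitions, using the ShardStats returned by `read_all`.
def all_shards_finished(results, num_shards):
  """Returns True if every shard reports `finished` (see ShardStats above)."""
  _, shard_stats = results.read_all(num_shards=num_shards)
  # If the log directory does not exist yet, read_all returns empty lists in
  # place of ShardStats; treat those shards as unfinished.
  return all(getattr(stats, 'finished', False) for stats in shard_stats)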
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for results_lib."""
import contextlib
import os
import shutil
import tempfile
from six.moves import xrange
import tensorflow as tf
from single_task import results_lib # brain coder
@contextlib.contextmanager
def temporary_directory(suffix='', prefix='tmp', base_path=None):
"""A context manager to create a temporary directory and clean up on exit.
The parameters are the same ones expected by tempfile.mkdtemp.
The directory will be securely and atomically created.
Everything under it will be removed when exiting the context.
Args:
suffix: optional suffix.
    prefix: optional prefix.
base_path: the base path under which to create the temporary directory.
Yields:
The absolute path of the new temporary directory.
"""
temp_dir_path = tempfile.mkdtemp(suffix, prefix, base_path)
try:
yield temp_dir_path
finally:
try:
shutil.rmtree(temp_dir_path)
except OSError as e:
if e.message == 'Cannot call rmtree on a symbolic link':
# Interesting synthetic exception made up by shutil.rmtree.
# Means we received a symlink from mkdtemp.
# Also means must clean up the symlink instead.
os.unlink(temp_dir_path)
else:
raise
def freeze(dictionary):
"""Convert dict to hashable frozenset."""
return frozenset(dictionary.iteritems())
class ResultsLibTest(tf.test.TestCase):
def testResults(self):
with temporary_directory() as logdir:
results_obj = results_lib.Results(logdir)
self.assertEqual(results_obj.read_this_shard(), [])
results_obj.append(
{'foo': 1.5, 'bar': 2.5, 'baz': 0})
results_obj.append(
{'foo': 5.5, 'bar': -1, 'baz': 2})
self.assertEqual(
results_obj.read_this_shard(),
[{'foo': 1.5, 'bar': 2.5, 'baz': 0},
{'foo': 5.5, 'bar': -1, 'baz': 2}])
def testShardedResults(self):
with temporary_directory() as logdir:
n = 4 # Number of shards.
results_objs = [
results_lib.Results(logdir, shard_id=i) for i in xrange(n)]
for i, robj in enumerate(results_objs):
robj.append({'foo': i, 'bar': 1 + i * 2})
results_list, _ = results_objs[0].read_all()
# Check results. Order does not matter here.
self.assertEqual(
set(freeze(r) for r in results_list),
set(freeze({'foo': i, 'bar': 1 + i * 2}) for i in xrange(n)))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""Run training.
Choose training algorithm and task(s) and follow these examples.
Run synchronous policy gradient training locally:
CONFIG="agent=c(algorithm='pg'),env=c(task='reverse')"
OUT_DIR="/tmp/bf_pg_local"
rm -rf $OUT_DIR
bazel run -c opt single_task:run -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe=0 \
--logdir="$OUT_DIR" \
--summary_interval=1 \
--model_v=0
learning/brain/tensorboard/tensorboard.sh --port 12345 --logdir "$OUT_DIR"
Run genetic algorithm locally:
CONFIG="agent=c(algorithm='ga'),env=c(task='reverse')"
OUT_DIR="/tmp/bf_ga_local"
rm -rf $OUT_DIR
bazel run -c opt single_task:run -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe=0 \
--logdir="$OUT_DIR"
Run uniform random search locally:
CONFIG="agent=c(algorithm='rand'),env=c(task='reverse')"
OUT_DIR="/tmp/bf_rand_local"
rm -rf $OUT_DIR
bazel run -c opt single_task:run -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe=0 \
--logdir="$OUT_DIR"
"""
from absl import app
from absl import flags
from absl import logging
from single_task import defaults # brain coder
from single_task import ga_train # brain coder
from single_task import pg_train # brain coder
FLAGS = flags.FLAGS
flags.DEFINE_string('config', '', 'Configuration.')
flags.DEFINE_string(
'logdir', None, 'Absolute path where to write results.')
flags.DEFINE_integer('task_id', 0, 'ID for this worker.')
flags.DEFINE_integer('num_workers', 1, 'How many workers there are.')
flags.DEFINE_integer(
'max_npe', 0,
'NPE = number of programs executed. Maximum number of programs to execute '
'in each run. Training will complete when this threshold is reached. Set '
'to 0 for unlimited training.')
flags.DEFINE_integer(
'num_repetitions', 1,
'Number of times the same experiment will be run (globally across all '
'workers). Each run is independent.')
flags.DEFINE_string(
'log_level', 'INFO',
'The threshold for what messages will be logged. One of DEBUG, INFO, WARN, '
'ERROR, or FATAL.')
# To register an algorithm:
# 1) Add dependency in the BUILD file to this build rule.
# 2) Import the algorithm's module at the top of this file.
# 3) Add a new entry in the following dict. The key is the algorithm name
# (used to select the algorithm in the config). The value is the module
# defining the expected functions for training and tuning. See the docstring
# for `get_namespace` for further details.
ALGORITHM_REGISTRATION = {
'pg': pg_train,
'ga': ga_train,
'rand': ga_train,
}
def get_namespace(config_string):
"""Get namespace for the selected algorithm.
Users who want to add additional algorithm types should modify this function.
The algorithm's namespace should contain the following functions:
run_training: Run the main training loop.
define_tuner_hparam_space: Return the hparam tuning space for the algo.
write_hparams_to_config: Helper for tuning. Write hparams chosen for tuning
to the Config object.
Look at pg_train.py and ga_train.py for function signatures and
implementations.
Args:
config_string: String representation of a Config object. This will get
parsed into a Config in order to determine what algorithm to use.
Returns:
algorithm_namespace: The module corresponding to the algorithm given in the
config.
config: The Config object resulting from parsing `config_string`.
Raises:
ValueError: If config.agent.algorithm is not one of the registered
algorithms.
"""
config = defaults.default_config_with_updates(config_string)
if config.agent.algorithm not in ALGORITHM_REGISTRATION:
raise ValueError('Unknown algorithm type "%s"' % (config.agent.algorithm,))
else:
return ALGORITHM_REGISTRATION[config.agent.algorithm], config
def main(argv):
del argv # Unused.
logging.set_verbosity(FLAGS.log_level)
flags.mark_flag_as_required('logdir')
if FLAGS.num_workers <= 0:
raise ValueError('num_workers flag must be greater than 0.')
if FLAGS.task_id < 0:
raise ValueError('task_id flag must be greater than or equal to 0.')
if FLAGS.task_id >= FLAGS.num_workers:
raise ValueError(
'task_id flag must be strictly less than num_workers flag.')
ns, _ = get_namespace(FLAGS.config)
ns.run_training(is_chief=FLAGS.task_id == 0)
if __name__ == '__main__':
app.run(main)
#!/usr/bin/env python
from __future__ import print_function
r"""This script can launch any eval experiments from the paper.
This is a script. Run with python, not bazel.
Usage:
./single_task/run_eval_tasks.py \
--exp EXP --desc DESC [--tuning_tasks] [--iclr_tasks] [--task TASK] \
[--tasks TASK1 TASK2 ...]
where EXP is one of the keys in `experiments`,
and DESC is a string description of the set of experiments (such as "v0")
Set only one of these flags:
--tuning_tasks flag only runs tuning tasks.
--iclr_tasks flag only runs the tasks included in the paper.
--regression_tests flag runs tasks which function as regression tests.
--task flag manually selects a single task to run.
--tasks flag takes a custom list of tasks.
Other flags:
  --reps N specifies N repetitions per experiment. Default is 25.
--training_replicas R specifies that R workers will be launched to train one
task (for neural network algorithms). These workers will update a global
model stored on a parameter server. Defaults to 1. If R > 1, a parameter
server will also be launched.
Run everything:
exps=( pg-20M pg-topk-20M topk-20M ga-20M rand-20M )
BIN_DIR="single_task"
for exp in "${exps[@]}"
do
./$BIN_DIR/run_eval_tasks.py \
--exp "$exp" --iclr_tasks
done
"""
import argparse
from collections import namedtuple
import subprocess
S = namedtuple('S', ['length'])
default_length = 100
iclr_tasks = [
'reverse', 'remove-char', 'count-char', 'add', 'bool-logic', 'print-hello',
'echo-twice', 'echo-thrice', 'copy-reverse', 'zero-cascade', 'cascade',
'shift-left', 'shift-right', 'riffle', 'unriffle', 'middle-char',
'remove-last', 'remove-last-two', 'echo-alternating', 'echo-half', 'length',
'echo-second-seq', 'echo-nth-seq', 'substring', 'divide-2', 'dedup']
regression_test_tasks = ['reverse', 'test-hill-climb']
E = namedtuple(
'E',
['name', 'method_type', 'config', 'simplify', 'batch_size', 'max_npe'])
def make_experiment_settings(name, **kwargs):
# Unpack experiment info from name.
def split_last(string, char):
i = string.rindex(char)
return string[:i], string[i+1:]
def si_to_int(si_string):
return int(
si_string.upper().replace('K', '0'*3).replace('M', '0'*6)
.replace('G', '0'*9))
method_type, max_npe = split_last(name, '-')
assert method_type
assert max_npe
return E(
name=name, method_type=method_type, max_npe=si_to_int(max_npe), **kwargs)
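# For example, make_experiment_settings('pg-topk-20M', config='',
# simplify=False, batch_size=64) yields method_type='pg-topk' and
# max_npe=20000000: the name suffix is parsed with SI-style units (K, M, G).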
experiments_set = {
make_experiment_settings(
'pg-20M',
config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.0,topk=0,'
'pi_loss_hparam=1.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'pg-topk-20M',
config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=50.0,topk=10,'
'pi_loss_hparam=1.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'topk-20M',
config='entropy_beta=0.01,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
'pi_loss_hparam=0.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'topk-0ent-20M',
config='entropy_beta=0.000,lr=0.0001,topk_loss_hparam=200.0,topk=10,'
'pi_loss_hparam=0.0,alpha=0.0',
simplify=False,
batch_size=64),
make_experiment_settings(
'ga-20M',
config='crossover_rate=0.95,mutation_rate=0.15',
simplify=False,
batch_size=100), # Population size.
make_experiment_settings(
'rand-20M',
config='',
simplify=False,
batch_size=1),
make_experiment_settings(
'simpl-500M',
config='entropy_beta=0.05,lr=0.0001,topk_loss_hparam=0.5,topk=10,'
'pi_loss_hparam=1.0,alpha=0.0',
simplify=True,
batch_size=64),
}
experiments = {e.name: e for e in experiments_set}
# pylint: disable=redefined-outer-name
def parse_args(extra_args=()):
"""Parse arguments and extract task and experiment info."""
parser = argparse.ArgumentParser(description='Run all eval tasks.')
parser.add_argument('--exp', required=True)
parser.add_argument('--tuning_tasks', action='store_true')
parser.add_argument('--iclr_tasks', action='store_true')
parser.add_argument('--regression_tests', action='store_true')
parser.add_argument('--desc', default='v0')
parser.add_argument('--reps', default=25)
parser.add_argument('--task')
parser.add_argument('--tasks', nargs='+')
for arg_string, default in extra_args:
parser.add_argument(arg_string, default=default)
args = parser.parse_args()
print('Running experiment: %s' % (args.exp,))
if args.desc:
print('Extra description: "%s"' % (args.desc,))
if args.exp not in experiments:
raise ValueError('Experiment name is not valid')
experiment_name = args.exp
experiment_settings = experiments[experiment_name]
assert experiment_settings.name == experiment_name
if args.tasks:
print('Launching tasks from args: %s' % (args.tasks,))
tasks = {t: S(length=default_length) for t in args.tasks}
elif args.task:
print('Launching single task "%s"' % args.task)
tasks = {args.task: S(length=default_length)}
elif args.tuning_tasks:
print('Only running tuning tasks')
tasks = {name: S(length=default_length)
for name in ['reverse-tune', 'remove-char-tune']}
elif args.iclr_tasks:
print('Running eval tasks from ICLR paper.')
tasks = {name: S(length=default_length) for name in iclr_tasks}
elif args.regression_tests:
tasks = {name: S(length=default_length) for name in regression_test_tasks}
print('Tasks: %s' % tasks.keys())
print('reps = %d' % (int(args.reps),))
return args, tasks, experiment_settings
def run(command_string):
subprocess.call(command_string, shell=True)
if __name__ == '__main__':
LAUNCH_TRAINING_COMMAND = 'single_task/launch_training.sh'
COMPILE_COMMAND = 'bazel build -c opt single_task:run.par'
args, tasks, experiment_settings = parse_args(
extra_args=(('--training_replicas', 1),))
if experiment_settings.method_type in (
'pg', 'pg-topk', 'topk', 'topk-0ent', 'simpl'):
# Runs PG and TopK.
def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
batch_size, do_simplify, custom_config_str):
"""Constructs terminal command for launching NN based algorithms.
The arguments to this function will be used to create config for the
experiment.
Args:
job_name: Name of the job to launch. Should uniquely identify this
experiment run.
task: Name of the coding task to solve.
max_npe: Maximum number of programs executed. An integer.
num_reps: Number of times to run the experiment. An integer.
code_length: Maximum allowed length of synthesized code.
batch_size: Minibatch size for gradient descent.
do_simplify: Whether to run the experiment in code simplification mode.
A bool.
custom_config_str: Additional config for the model config string.
Returns:
The terminal command that launches the specified experiment.
"""
config = """
env=c(task='{0}',correct_syntax=False),
agent=c(
algorithm='pg',
policy_lstm_sizes=[35,35],value_lstm_sizes=[35,35],
grad_clip_threshold=50.0,param_init_factor=0.5,regularizer=0.0,
softmax_tr=1.0,optimizer='rmsprop',ema_baseline_decay=0.99,
eos_token={3},{4}),
timestep_limit={1},batch_size={2}
""".replace(' ', '').replace('\n', '').format(
task, code_length, batch_size, do_simplify, custom_config_str)
num_ps = 0 if args.training_replicas == 1 else 1
return (
r'{0} --job_name={1} --config="{2}" --max_npe={3} '
'--num_repetitions={4} --num_workers={5} --num_ps={6} '
'--stop_on_success={7}'
.format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
args.training_replicas, num_ps, str(not do_simplify).lower()))
else:
# Runs GA and Rand.
assert experiment_settings.method_type in ('ga', 'rand')
def make_run_cmd(job_name, task, max_npe, num_reps, code_length,
batch_size, do_simplify, custom_config_str):
"""Constructs terminal command for launching GA or uniform random search.
The arguments to this function will be used to create config for the
experiment.
Args:
job_name: Name of the job to launch. Should uniquely identify this
experiment run.
task: Name of the coding task to solve.
max_npe: Maximum number of programs executed. An integer.
num_reps: Number of times to run the experiment. An integer.
code_length: Maximum allowed length of synthesized code.
        batch_size: Number of programs per batch (the population size for GA).
do_simplify: Whether to run the experiment in code simplification mode.
A bool.
custom_config_str: Additional config for the model config string.
Returns:
The terminal command that launches the specified experiment.
"""
assert not do_simplify
if custom_config_str:
custom_config_str = ',' + custom_config_str
config = """
env=c(task='{0}',correct_syntax=False),
agent=c(
algorithm='{4}'
{3}),
timestep_limit={1},batch_size={2}
""".replace(' ', '').replace('\n', '').format(
task, code_length, batch_size, custom_config_str,
experiment_settings.method_type)
num_workers = num_reps # Do each rep in parallel.
return (
r'{0} --job_name={1} --config="{2}" --max_npe={3} '
'--num_repetitions={4} --num_workers={5} --num_ps={6} '
'--stop_on_success={7}'
.format(LAUNCH_TRAINING_COMMAND, job_name, config, max_npe, num_reps,
num_workers, 0, str(not do_simplify).lower()))
print('Compiling...')
run(COMPILE_COMMAND)
print('Launching %d coding tasks...' % len(tasks))
for task, task_settings in tasks.iteritems():
name = 'bf_rl_iclr'
desc = '{0}.{1}_{2}'.format(args.desc, experiment_settings.name, task)
job_name = '{}.{}'.format(name, desc)
print('Job name: %s' % job_name)
reps = int(args.reps) if not experiment_settings.simplify else 1
run_cmd = make_run_cmd(
job_name, task, experiment_settings.max_npe, reps,
task_settings.length, experiment_settings.batch_size,
experiment_settings.simplify,
experiment_settings.config)
print('Running command:\n' + run_cmd)
run(run_cmd)
print('Done.')
# pylint: enable=redefined-outer-name
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tasks that test correctness of algorithms."""
from six.moves import xrange
from common import reward as reward_lib # brain coder
from single_task import misc # brain coder
class BasicTaskManager(object):
"""Wraps a generic reward function."""
def __init__(self, reward_fn):
self.reward_fn = reward_fn
self.good_reward = 1.0
def _score_string(self, string):
actions = misc.bf_string_to_tokens(string)
reward, correct = self.reward_fn(actions)
return misc.RewardInfo(
episode_rewards=[0.0] * (len(string) - 1) + [reward],
input_case=None,
correct_output=None,
code_output=actions,
input_type=None,
output_type=misc.IOType.integer,
reason='correct' if correct else 'wrong')
def rl_batch(self, batch_size):
reward_fns = [self._score_string] * batch_size
return reward_fns
class Trie(object):
"""Trie for sequences."""
EOS = ()
def __init__(self):
self.trie = {}
def insert(self, sequence):
d = self.trie
for e in sequence:
if e not in d:
d[e] = {}
d = d[e]
d[self.EOS] = True # Terminate sequence.
def prefix_match(self, sequence):
"""Return prefix of `sequence` which exists in the trie."""
d = self.trie
index = 0
for i, e in enumerate(sequence + [self.EOS]):
index = i
if e in d:
d = d[e]
if e == self.EOS:
return sequence, True
else:
break
return sequence[:index], False
def next_choices(self, sequence):
d = self.trie
for e in sequence:
if e in d:
d = d[e]
else:
raise ValueError('Sequence not a prefix: %s' % (sequence,))
return d.keys()
class HillClimbingTask(object):
"""Simple task that tests reward hill climbing ability.
  There is a set of paths (sequences of tokens) that are rewarded. The total
  reward for a path is proportional to its length, so the longest path is the
  target; shorter paths can be dead ends.
"""
def __init__(self):
    # Paths are sequences of sub-sequences. Here we form unique sub-sequences
    # out of 3 arbitrary ints. We use sub-sequences instead of single entities
    # to make the task harder by making the episodes last longer, i.e. there is
    # more for the agent to remember.
a = (1, 2, 3)
b = (4, 5, 6)
c = (7, 8, 7)
d = (6, 5, 4)
e = (3, 2, 1)
f = (8, 5, 1)
g = (6, 4, 2)
h = (1, 8, 3)
self.paths = Trie()
self.paths.insert([a, b, h])
self.paths.insert([a, b, c, d, e, f, g, h])
self.paths.insert([a, b, c, d, e, b, a])
self.paths.insert([a, b, g, h])
self.paths.insert([a, e, f, g])
self.correct_sequence = misc.flatten([a, b, c, d, e, f, g, h])
def distance_fn(a, b):
len_diff = abs(len(a) - len(b))
return sum(reward_lib.mod_abs_diff(ai - 1, bi - 1, 8)
for ai, bi in zip(a, b)) + len_diff * 4 # 8 / 2 = 4
self.distance_fn = distance_fn
def __call__(self, actions):
# Compute reward for action sequence.
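    # Actions are chunked into sub-sequences of 3 tokens and matched against
    # the stored paths. The reward is the number of matched sub-sequences,
    # plus a fraction in [0, 1) for progress towards the closest valid next
    # sub-sequence (or minus up to 1 when a completed path is overshot).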
actions = [a for a in actions if a > 0]
sequence = [tuple(actions[i: i + 3]) for i in xrange(0, len(actions), 3)]
prefix, complete = self.paths.prefix_match(sequence)
if complete:
return float(len(prefix)), actions == self.correct_sequence
if len(prefix) == len(sequence):
return float(len(prefix)), False
next_pred = sequence[len(prefix)]
choices = self.paths.next_choices(prefix)
if choices == [()]:
return (len(prefix) - len(next_pred) / 3.0), False
min_dist = min(self.distance_fn(c, next_pred) for c in choices)
    # +1 reward for each correct element in the sequence, plus a fraction
    # towards the closest next element.
# Maximum distance possible is num_actions * base / 2 = 3 * 8 / 2 = 12
return (len(prefix) + (1 - min_dist / 12.0)), False
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for test_tasks."""
import numpy as np
import tensorflow as tf
from single_task import misc # brain coder
from single_task import test_tasks # brain coder
def get_reward(reward_fn, candidate):
return sum(reward_fn(misc.bf_tokens_to_string(candidate)).episode_rewards)
class TestTasksTest(tf.test.TestCase):
def testHillClimbingTask(self):
task = test_tasks.BasicTaskManager(test_tasks.HillClimbingTask())
reward_fns = task.rl_batch(1)
reward_fn = reward_fns[0]
self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 0]), 8 / 12.))
self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 2, 0]), 11 / 12.))
self.assertTrue(np.isclose(get_reward(reward_fn, [1, 2, 3, 0]), 1.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 2, 0]), 1. + 8 / 12.))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 0]), 2.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 0]), 3.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 0]), 3.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 1, 0]),
3.0 - 4 / 12.))
self.assertTrue(
np.isclose(
get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 1, 8, 3, 1, 1, 1, 1, 0]),
2.0))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 3, 0]),
3.0 + 1 / 12.))
self.assertTrue(
np.isclose(
get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1,
8, 5, 1, 6, 4, 2, 1, 8, 3, 0]),
8.0))
self.assertTrue(
np.isclose(
get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1,
8, 5, 1, 6, 4, 2, 1, 8, 3, 1, 1, 0]),
8.0 - 8 / 12.))
self.assertTrue(
np.isclose(get_reward(reward_fn, [1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3,
2, 1, 8, 5, 1, 6, 4, 2, 1, 8, 3, 1, 1,
1, 1, 1, 1, 1, 0]),
7.0))
if __name__ == '__main__':
tf.test.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
r"""Run grid search.
Look at launch_tuning.sh for details on how to tune at scale.
Usage example:
Tune with one worker on the local machine.
CONFIG="agent=c(algorithm='pg'),"
CONFIG+="env=c(task_cycle=['reverse-tune', 'remove-tune'])"
HPARAM_SPACE_TYPE="pg"
OUT_DIR="/tmp/bf_pg_tune"
MAX_NPE=5000000
NUM_REPETITIONS=50
rm -rf $OUT_DIR
mkdir $OUT_DIR
bazel run -c opt single_task:tune -- \
--alsologtostderr \
--config="$CONFIG" \
--max_npe="$MAX_NPE" \
--num_repetitions="$NUM_REPETITIONS" \
--logdir="$OUT_DIR" \
--summary_interval=1 \
--model_v=0 \
--hparam_space="$HPARAM_SPACE_TYPE" \
--tuner_id=0 \
--num_tuners=1 \
2>&1 >"$OUT_DIR/tuner_0.log"
learning/brain/tensorboard/tensorboard.sh --port 12345 --logdir "$OUT_DIR"
"""
import ast
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
from six.moves import xrange
import tensorflow as tf
from single_task import defaults # brain coder
from single_task import run as run_lib # brain coder
FLAGS = flags.FLAGS
flags.DEFINE_integer(
'tuner_id', 0,
'The unique ID for this tuning worker.')
flags.DEFINE_integer(
'num_tuners', 1,
    'How many tuners there are in total.')
flags.DEFINE_string(
'hparam_space', 'default',
'String name which denotes the hparam space to tune over. This is '
'algorithm dependent.')
flags.DEFINE_string(
'fixed_hparams', '',
'HParams string. Used to fix hparams during tuning.')
flags.DEFINE_float(
'success_rate_objective_weight', 1.0,
'How much to weight success rate vs num programs seen. By default, only '
'success rate is optimized (this is the setting used in the paper).')
def parse_hparams_string(hparams_str):
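  # Example: 'lr=0.1,entropy_beta=0.05' -> {'lr': 0.1, 'entropy_beta': 0.05}
  # (hypothetical names). Values are parsed with ast.literal_eval, so string
  # values need quotes.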
hparams = {}
for term in hparams_str.split(','):
if not term:
continue
name, value = term.split('=')
hparams[name.strip()] = ast.literal_eval(value)
return hparams
def int_to_multibase(n, bases):
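  # Mixed-radix decomposition of n, least-significant digit first; digit i
  # ranges over [0, bases[i]). E.g. int_to_multibase(7, [2, 3, 2]) == [1, 0, 1].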
digits = [0] * len(bases)
for i, b in enumerate(bases):
n, d = divmod(n, b)
digits[i] = d
return digits
def hparams_for_index(index, tuning_space):
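  # Every point in the hparam grid gets a unique flat index; the mixed-radix
  # digits select one value per hparam (keys are sorted for determinism).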
keys = sorted(tuning_space.keys())
indices = int_to_multibase(index, [len(tuning_space[k]) for k in keys])
return tf.contrib.training.HParams(
**{k: tuning_space[k][i] for k, i in zip(keys, indices)})
def run_tuner_loop(ns):
"""Run tuning loop for this worker."""
is_chief = FLAGS.task_id == 0
tuning_space = ns.define_tuner_hparam_space(
hparam_space_type=FLAGS.hparam_space)
fixed_hparams = parse_hparams_string(FLAGS.fixed_hparams)
for name, value in fixed_hparams.iteritems():
tuning_space[name] = [value]
tuning_space_size = np.prod([len(values) for values in tuning_space.values()])
  num_local_trials, remainder = divmod(tuning_space_size, FLAGS.num_tuners)
  # Compute this tuner's starting trial from the base count before adding the
  # remainder trial, so that trial ranges across tuners neither overlap nor
  # skip any index.
  starting_trial_id = (
      num_local_trials * FLAGS.tuner_id + min(remainder, FLAGS.tuner_id))
  if FLAGS.tuner_id < remainder:
    num_local_trials += 1
logging.info('tuning_space_size: %d', tuning_space_size)
logging.info('num_local_trials: %d', num_local_trials)
logging.info('starting_trial_id: %d', starting_trial_id)
for local_trial_index in xrange(num_local_trials):
trial_config = defaults.default_config_with_updates(FLAGS.config)
global_trial_index = local_trial_index + starting_trial_id
trial_name = 'trial_' + str(global_trial_index)
trial_dir = os.path.join(FLAGS.logdir, trial_name)
hparams = hparams_for_index(global_trial_index, tuning_space)
ns.write_hparams_to_config(
trial_config, hparams, hparam_space_type=FLAGS.hparam_space)
results_list = ns.run_training(
config=trial_config, tuner=None, logdir=trial_dir, is_chief=is_chief,
trial_name=trial_name)
if not is_chief:
# Only chief worker needs to write tuning results to disk.
continue
objective, metrics = compute_tuning_objective(
results_list, hparams, trial_name, num_trials=tuning_space_size)
logging.info('metrics:\n%s', metrics)
logging.info('objective: %s', objective)
logging.info('programs_seen_fraction: %s',
metrics['programs_seen_fraction'])
logging.info('success_rate: %s', metrics['success_rate'])
logging.info('success_rate_objective_weight: %s',
FLAGS.success_rate_objective_weight)
tuning_results_file = os.path.join(trial_dir, 'tuning_results.txt')
with tf.gfile.FastGFile(tuning_results_file, 'a') as writer:
writer.write(str(metrics) + '\n')
logging.info('Trial %s complete.', trial_name)
def compute_tuning_objective(results_list, hparams, trial_name, num_trials):
"""Compute tuning objective and metrics given results and trial information.
Args:
results_list: List of results dicts read from disk. These are written by
workers.
hparams: tf.contrib.training.HParams instance containing the hparams used
in this trial (only the hparams which are being tuned).
trial_name: Name of this trial. Used to create a trial directory.
num_trials: Total number of trials that need to be run. This is saved in the
metrics dict for future reference.
Returns:
objective: The objective computed for this trial. Choose the hparams for the
trial with the largest objective value.
metrics: Information about this trial. A dict.
"""
found_solution = [r['found_solution'] for r in results_list]
successful_program_counts = [
r['npe'] for r in results_list if r['found_solution']]
success_rate = sum(found_solution) / float(len(results_list))
max_programs = FLAGS.max_npe # Per run.
all_program_counts = [
r['npe'] if r['found_solution'] else max_programs
for r in results_list]
programs_seen_fraction = (
float(sum(all_program_counts))
/ (max_programs * len(all_program_counts)))
# min/max/avg stats are over successful runs.
metrics = {
'num_runs': len(results_list),
'num_succeeded': sum(found_solution),
'success_rate': success_rate,
'programs_seen_fraction': programs_seen_fraction,
'avg_programs': np.mean(successful_program_counts),
'max_possible_programs_per_run': max_programs,
'global_step': sum([r['num_batches'] for r in results_list]),
'hparams': hparams.values(),
'trial_name': trial_name,
'num_trials': num_trials}
  # Report stats per task.
tasks = [r['task'] for r in results_list]
for task in set(tasks):
task_list = [r for r in results_list if r['task'] == task]
found_solution = [r['found_solution'] for r in task_list]
successful_rewards = [
r['best_reward'] for r in task_list
if r['found_solution']]
successful_num_batches = [
r['num_batches']
for r in task_list if r['found_solution']]
successful_program_counts = [
r['npe'] for r in task_list if r['found_solution']]
metrics_append = {
task + '__num_runs': len(task_list),
task + '__num_succeeded': sum(found_solution),
task + '__success_rate': (
sum(found_solution) / float(len(task_list)))}
metrics.update(metrics_append)
if any(found_solution):
metrics_append = {
task + '__min_reward': min(successful_rewards),
task + '__max_reward': max(successful_rewards),
task + '__avg_reward': np.median(successful_rewards),
task + '__min_programs': min(successful_program_counts),
task + '__max_programs': max(successful_program_counts),
task + '__avg_programs': np.mean(successful_program_counts),
task + '__min_batches': min(successful_num_batches),
task + '__max_batches': max(successful_num_batches),
task + '__avg_batches': np.mean(successful_num_batches)}
metrics.update(metrics_append)
# Objective will be maximized.
# Maximize success rate, minimize num programs seen.
# Max objective is always 1.
weight = FLAGS.success_rate_objective_weight
objective = (
weight * success_rate
+ (1 - weight) * (1 - programs_seen_fraction))
metrics['objective'] = objective
return objective, metrics
def main(argv):
del argv
logging.set_verbosity(FLAGS.log_level)
if not FLAGS.logdir:
raise ValueError('logdir flag must be provided.')
if FLAGS.num_workers <= 0:
raise ValueError('num_workers flag must be greater than 0.')
if FLAGS.task_id < 0:
raise ValueError('task_id flag must be greater than or equal to 0.')
if FLAGS.task_id >= FLAGS.num_workers:
raise ValueError(
'task_id flag must be strictly less than num_workers flag.')
if FLAGS.num_tuners <= 0:
raise ValueError('num_tuners flag must be greater than 0.')
if FLAGS.tuner_id < 0:
raise ValueError('tuner_id flag must be greater than or equal to 0.')
if FLAGS.tuner_id >= FLAGS.num_tuners:
raise ValueError(
'tuner_id flag must be strictly less than num_tuners flag.')
ns, _ = run_lib.get_namespace(FLAGS.config)
run_tuner_loop(ns)
if __name__ == '__main__':
app.run(main)
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Cognitive Mapping and Planning for Visual Navigation
**Saurabh Gupta, James Davidson, Sergey Levine, Rahul Sukthankar, Jitendra Malik**
**Computer Vision and Pattern Recognition (CVPR) 2017.**
**[ArXiv](https://arxiv.org/abs/1702.03920),
[Project Website](https://sites.google.com/corp/view/cognitive-mapping-and-planning/)**
### Citing
If you find this code base and models useful in your research, please consider
citing the following paper:
```
@inproceedings{gupta2017cognitive,
title={Cognitive Mapping and Planning for Visual Navigation},
author={Gupta, Saurabh and Davidson, James and Levine, Sergey and
Sukthankar, Rahul and Malik, Jitendra},
booktitle={CVPR},
year={2017}
}
```
### Contents
1. [Requirements: software](#requirements-software)
2. [Requirements: data](#requirements-data)
3. [Test Pre-trained Models](#test-pre-trained-models)
4. [Train your Own Models](#train-your-own-models)
### Requirements: software
1. Python Virtual Env Setup: All code is implemented in Python but depends on a
   small number of Python packages and a couple of C libraries. We recommend
   using a virtual environment to install these Python packages and the Python
   bindings for these C libraries.
```Shell
VENV_DIR=venv
pip install virtualenv
virtualenv $VENV_DIR
source $VENV_DIR/bin/activate
# You may need to upgrade pip for installing opencv-python.
pip install --upgrade pip
# Install simple dependencies.
pip install -r requirements.txt
# Patch bugs in dependencies.
sh patches/apply_patches.sh
```
2. Install [Tensorflow](https://www.tensorflow.org/) inside this virtual
environment. You will need to use one of the latest nightly builds
(see instructions [here](https://github.com/tensorflow/tensorflow#installation)).
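   For reference, a minimal sketch of this step (the package and version pinned
   below are assumptions, not part of the original instructions, which call for
   a nightly build; this code base targets TF 1.x only):
   ```Shell
   # Run inside the virtual environment created above.
   pip install tensorflow-gpu==1.15.0   # or tensorflow==1.15.0 for CPU-only
   ```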
3. Swiftshader: We use
   [Swiftshader](https://github.com/google/swiftshader.git), a CPU-based
   renderer, to render the meshes. It is possible to use other renderers; to do
   so, replace `SwiftshaderRenderer` in `render/swiftshader_renderer.py` with
   bindings to your renderer.
```Shell
mkdir -p deps
git clone --recursive https://github.com/google/swiftshader.git deps/swiftshader-src
cd deps/swiftshader-src && git checkout 91da6b00584afd7dcaed66da88e2b617429b3950
git submodule update
mkdir build && cd build && cmake .. && make -j 16 libEGL libGLESv2
cd ../../../
cp deps/swiftshader-src/build/libEGL* libEGL.so.1
cp deps/swiftshader-src/build/libGLESv2* libGLESv2.so.2
```
4. PyAssimp: We use [PyAssimp](https://github.com/assimp/assimp.git) to load
   meshes. It is possible to use other libraries to load meshes; to do so,
   replace `Shape` in `render/swiftshader_renderer.py` with bindings to your
   library for loading meshes.
```Shell
mkdir -p deps
git clone https://github.com/assimp/assimp.git deps/assimp-src
cd deps/assimp-src
git checkout 2afeddd5cb63d14bc77b53740b38a54a97d94ee8
cmake CMakeLists.txt -G 'Unix Makefiles' && make -j 16
cd port/PyAssimp && python setup.py install
cd ../../../..
cp deps/assimp-src/lib/libassimp* .
```
5. graph-tool: We use the [graph-tool](https://git.skewed.de/count0/graph-tool)
   library for graph processing.
```Shell
mkdir -p deps
# If the following git clone command fails, you can also download the source
# from https://downloads.skewed.de/graph-tool/graph-tool-2.2.44.tar.bz2
git clone https://git.skewed.de/count0/graph-tool deps/graph-tool-src
cd deps/graph-tool-src && git checkout 178add3a571feb6666f4f119027705d95d2951ab
bash autogen.sh
./configure --disable-cairo --disable-sparsehash --prefix=$HOME/.local
make -j 16
make install
cd ../../
```
### Requirements: data
1. Download the Stanford 3D Indoor Spaces Dataset (S3DIS Dataset) and ImageNet
   Pre-trained models for initializing different models. Follow the
   instructions in `data/README.md`.
### Test Pre-trained Models
1. Download pre-trained models. See `output/README.md`.
2. Test models using `scripts/script_test_pretrained_models.sh`.
### Train Your Own Models
All models were trained asynchronously with 16 workers, each worker using data
from a single floor. The default hyper-parameters correspond to this setting.
See [distributed training with
Tensorflow](https://www.tensorflow.org/deploy/distributed) for setting up
distributed training. Training with a single worker is possible with the current
code base but will require some minor changes to allow each worker to load all
training environments.
### Contact
For questions or issues open an issue on the tensorflow/models [issues
tracker](https://github.com/tensorflow/models/issues). Please assign issues to
@s-gupta.
### Credits
This code was written by Saurabh Gupta (@s-gupta).
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os, sys
import numpy as np
from tensorflow.python.platform import app
from tensorflow.python.platform import flags
import logging
import src.utils as utils
import cfgs.config_common as cc
import tensorflow as tf
rgb_resnet_v2_50_path = 'data/init_models/resnet_v2_50/model.ckpt-5136169'
d_resnet_v2_50_path = 'data/init_models/distill_rgb_to_d_resnet_v2_50/model.ckpt-120002'
def get_default_args():
summary_args = utils.Foo(display_interval=1, test_iters=26,
arop_full_summary_iters=14)
control_args = utils.Foo(train=False, test=False,
force_batchnorm_is_training_at_test=False,
reset_rng_seed=False, only_eval_when_done=False,
test_mode=None)
return summary_args, control_args
def get_default_cmp_args():
batch_norm_param = {'center': True, 'scale': True,
'activation_fn':tf.nn.relu}
mapper_arch_args = utils.Foo(
dim_reduce_neurons=64,
fc_neurons=[1024, 1024],
fc_out_size=8,
fc_out_neurons=64,
encoder='resnet_v2_50',
deconv_neurons=[64, 32, 16, 8, 4, 2],
deconv_strides=[2, 2, 2, 2, 2, 2],
deconv_layers_per_block=2,
deconv_kernel_size=4,
fc_dropout=0.5,
combine_type='wt_avg_logits',
batch_norm_param=batch_norm_param)
readout_maps_arch_args = utils.Foo(
num_neurons=[],
strides=[],
kernel_size=None,
layers_per_block=None)
arch_args = utils.Foo(
vin_val_neurons=8, vin_action_neurons=8, vin_ks=3, vin_share_wts=False,
pred_neurons=[64, 64], pred_batch_norm_param=batch_norm_param,
conv_on_value_map=0, fr_neurons=16, fr_ver='v2', fr_inside_neurons=64,
fr_stride=1, crop_remove_each=30, value_crop_size=4,
action_sample_type='sample', action_sample_combine_type='one_or_other',
sample_gt_prob_type='inverse_sigmoid_decay', dagger_sample_bn_false=True,
vin_num_iters=36, isd_k=750., use_agent_loc=False, multi_scale=True,
readout_maps=False, rom_arch=readout_maps_arch_args)
return arch_args, mapper_arch_args
def get_arch_vars(arch_str):
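  # arch_str packs up to three '_'-separated fields, e.g. (hypothetical)
  # 'lmap_Msc_fr2': var1 picks the mapper type ('lmap' learned map vs 'pmap'
  # projected map), var2 an architecture-specific variant (planner scale for
  # 'lmap', the non-linearity for 'pmap'), and var3 the downsampling factor
  # used by the projected-map baseline.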
if arch_str == '': vals = []
else: vals = arch_str.split('_')
ks = ['var1', 'var2', 'var3']
ks = ks[:len(vals)]
# Exp Ver.
if len(vals) == 0: ks.append('var1'); vals.append('v0')
# custom arch.
if len(vals) == 1: ks.append('var2'); vals.append('')
  # map scale for projection baseline.
if len(vals) == 2: ks.append('var3'); vals.append('fr2')
assert(len(vals) == 3)
vars = utils.Foo()
for k, v in zip(ks, vals):
setattr(vars, k, v)
logging.error('arch_vars: %s', vars)
return vars
def process_arch_str(args, arch_str):
# This function modifies args.
args.arch, args.mapper_arch = get_default_cmp_args()
arch_vars = get_arch_vars(arch_str)
args.navtask.task_params.outputs.ego_maps = True
args.navtask.task_params.outputs.ego_goal_imgs = True
args.navtask.task_params.outputs.egomotion = True
args.navtask.task_params.toy_problem = False
if arch_vars.var1 == 'lmap':
args = process_arch_learned_map(args, arch_vars)
elif arch_vars.var1 == 'pmap':
args = process_arch_projected_map(args, arch_vars)
else:
logging.fatal('arch_vars.var1 should be lmap or pmap, but is %s', arch_vars.var1)
assert(False)
return args
def process_arch_learned_map(args, arch_vars):
# Multiscale vision based system.
args.navtask.task_params.input_type = 'vision'
args.navtask.task_params.outputs.images = True
if args.navtask.camera_param.modalities[0] == 'rgb':
args.solver.pretrained_path = rgb_resnet_v2_50_path
elif args.navtask.camera_param.modalities[0] == 'depth':
args.solver.pretrained_path = d_resnet_v2_50_path
if arch_vars.var2 == 'Ssc':
sc = 1./args.navtask.task_params.step_size
args.arch.vin_num_iters = 40
args.navtask.task_params.map_scales = [sc]
max_dist = args.navtask.task_params.max_dist * \
args.navtask.task_params.num_goals
args.navtask.task_params.map_crop_sizes = [2*max_dist]
args.arch.fr_stride = 1
args.arch.vin_action_neurons = 8
args.arch.vin_val_neurons = 3
args.arch.fr_inside_neurons = 32
args.mapper_arch.pad_map_with_zeros_each = [24]
args.mapper_arch.deconv_neurons = [64, 32, 16]
args.mapper_arch.deconv_strides = [1, 2, 1]
elif (arch_vars.var2 == 'Msc' or arch_vars.var2 == 'MscROMms' or
arch_vars.var2 == 'MscROMss' or arch_vars.var2 == 'MscNoVin'):
# Code for multi-scale planner.
args.arch.vin_num_iters = 8
args.arch.crop_remove_each = 4
args.arch.value_crop_size = 8
sc = 1./args.navtask.task_params.step_size
max_dist = args.navtask.task_params.max_dist * \
args.navtask.task_params.num_goals
n_scales = np.log2(float(max_dist) / float(args.arch.vin_num_iters))
n_scales = int(np.ceil(n_scales)+1)
args.navtask.task_params.map_scales = \
list(sc*(0.5**(np.arange(n_scales))[::-1]))
args.navtask.task_params.map_crop_sizes = [16 for x in range(n_scales)]
args.arch.fr_stride = 1
args.arch.vin_action_neurons = 8
args.arch.vin_val_neurons = 3
args.arch.fr_inside_neurons = 32
args.mapper_arch.pad_map_with_zeros_each = [0 for _ in range(n_scales)]
args.mapper_arch.deconv_neurons = [64*n_scales, 32*n_scales, 16*n_scales]
args.mapper_arch.deconv_strides = [1, 2, 1]
if arch_vars.var2 == 'MscNoVin':
# No planning version.
args.arch.fr_stride = [1, 2, 1, 2]
args.arch.vin_action_neurons = None
args.arch.vin_val_neurons = 16
args.arch.fr_inside_neurons = 32
args.arch.crop_remove_each = 0
args.arch.value_crop_size = 4
args.arch.vin_num_iters = 0
elif arch_vars.var2 == 'MscROMms' or arch_vars.var2 == 'MscROMss':
# Code with read outs, MscROMms flattens and reads out,
# MscROMss does not flatten and produces output at multiple scales.
args.navtask.task_params.outputs.readout_maps = True
args.navtask.task_params.map_resize_method = 'antialiasing'
args.arch.readout_maps = True
if arch_vars.var2 == 'MscROMms':
args.arch.rom_arch.num_neurons = [64, 1]
args.arch.rom_arch.kernel_size = 4
args.arch.rom_arch.strides = [2,2]
args.arch.rom_arch.layers_per_block = 2
args.navtask.task_params.readout_maps_crop_sizes = [64]
args.navtask.task_params.readout_maps_scales = [sc]
elif arch_vars.var2 == 'MscROMss':
args.arch.rom_arch.num_neurons = \
[64, len(args.navtask.task_params.map_scales)]
args.arch.rom_arch.kernel_size = 4
args.arch.rom_arch.strides = [1,1]
args.arch.rom_arch.layers_per_block = 1
args.navtask.task_params.readout_maps_crop_sizes = \
args.navtask.task_params.map_crop_sizes
args.navtask.task_params.readout_maps_scales = \
args.navtask.task_params.map_scales
else:
logging.fatal('arch_vars.var2 not one of Msc, MscROMms, MscROMss, MscNoVin.')
assert(False)
map_channels = args.mapper_arch.deconv_neurons[-1] / \
(2*len(args.navtask.task_params.map_scales))
args.navtask.task_params.map_channels = map_channels
return args
def process_arch_projected_map(args, arch_vars):
# Single scale vision based system which does not use a mapper but instead
# uses an analytically estimated map.
ds = int(arch_vars.var3[2])
args.navtask.task_params.input_type = 'analytical_counts'
args.navtask.task_params.outputs.analytical_counts = True
assert(args.navtask.task_params.modalities[0] == 'depth')
args.navtask.camera_param.img_channels = None
analytical_counts = utils.Foo(map_sizes=[512/ds],
xy_resolution=[5.*ds],
z_bins=[[-10, 10, 150, 200]],
non_linearity=[arch_vars.var2])
args.navtask.task_params.analytical_counts = analytical_counts
sc = 1./ds
args.arch.vin_num_iters = 36
args.navtask.task_params.map_scales = [sc]
args.navtask.task_params.map_crop_sizes = [512/ds]
args.arch.fr_stride = [1,2]
args.arch.vin_action_neurons = 8
args.arch.vin_val_neurons = 3
args.arch.fr_inside_neurons = 32
map_channels = len(analytical_counts.z_bins[0]) + 1
args.navtask.task_params.map_channels = map_channels
args.solver.freeze_conv = False
return args
def get_args_for_config(config_name):
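  # config_name has the form '<arch>.<solver>.<navtask>+<mode>_<imset>', e.g.
  # (hypothetical) 'lmap_Msc_fr2.clip2_dlw20.sbpd_rgb+train_train'.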
args = utils.Foo()
args.summary, args.control = get_default_args()
exp_name, mode_str = config_name.split('+')
arch_str, solver_str, navtask_str = exp_name.split('.')
logging.error('config_name: %s', config_name)
logging.error('arch_str: %s', arch_str)
logging.error('navtask_str: %s', navtask_str)
logging.error('solver_str: %s', solver_str)
logging.error('mode_str: %s', mode_str)
args.solver = cc.process_solver_str(solver_str)
args.navtask = cc.process_navtask_str(navtask_str)
args = process_arch_str(args, arch_str)
args.arch.isd_k = args.solver.isd_k
# Train, test, etc.
mode, imset = mode_str.split('_')
args = cc.adjust_args_for_mode(args, mode)
args.navtask.building_names = args.navtask.dataset.get_split(imset)
args.control.test_name = '{:s}_on_{:s}'.format(mode, imset)
# Log the arguments
logging.error('%s', args)
return args
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import numpy as np
import logging
import src.utils as utils
import datasets.nav_env_config as nec
from datasets import factory
def adjust_args_for_mode(args, mode):
if mode == 'train':
args.control.train = True
elif mode == 'val1':
# Same settings as for training, to make sure nothing wonky is happening
# there.
args.control.test = True
args.control.test_mode = 'val'
args.navtask.task_params.batch_size = 32
elif mode == 'val2':
# No data augmentation, not sampling but taking the argmax action, not
# sampling from the ground truth at all.
args.control.test = True
args.arch.action_sample_type = 'argmax'
args.arch.sample_gt_prob_type = 'zero'
args.navtask.task_params.data_augment = \
utils.Foo(lr_flip=0, delta_angle=0, delta_xy=0, relight=False,
relight_fast=False, structured=False)
args.control.test_mode = 'val'
args.navtask.task_params.batch_size = 32
elif mode == 'bench':
# Actually testing the agent in settings that are kept same between
# different runs.
args.navtask.task_params.batch_size = 16
args.control.test = True
args.arch.action_sample_type = 'argmax'
args.arch.sample_gt_prob_type = 'zero'
args.navtask.task_params.data_augment = \
utils.Foo(lr_flip=0, delta_angle=0, delta_xy=0, relight=False,
relight_fast=False, structured=False)
args.summary.test_iters = 250
args.control.only_eval_when_done = True
args.control.reset_rng_seed = True
args.control.test_mode = 'test'
else:
logging.fatal('Unknown mode: %s.', mode)
assert(False)
return args
def get_solver_vars(solver_str):
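  # solver_str packs up to eight '_'-separated fields (see `ks` below); any
  # omitted suffix falls back to the defaults appended here. E.g.
  # (hypothetical) 'clip2_dlw20_long2_adam2_rlw1_isdk415_aeps1en8_lr1en3'
  # decodes to clip_gradient_norm=2, data_loss_wt=20, 120k steps with decay
  # every 80k steps, Adam, reg_loss_wt=1, isd_k=415, adam_eps=1e-8 and
  # initial_learning_rate=1e-3 ('x' stands for '.' and 'n' for '-').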
if solver_str == '': vals = [];
else: vals = solver_str.split('_')
  ks = ['clip', 'dlw', 'long', 'typ', 'rlw', 'isdk', 'adam_eps', 'init_lr']
ks = ks[:len(vals)]
# Gradient clipping or not.
if len(vals) == 0: ks.append('clip'); vals.append('noclip');
# data loss weight.
if len(vals) == 1: ks.append('dlw'); vals.append('dlw20')
# how long to train for.
if len(vals) == 2: ks.append('long'); vals.append('nolong')
# Adam
if len(vals) == 3: ks.append('typ'); vals.append('adam2')
# reg loss wt
if len(vals) == 4: ks.append('rlw'); vals.append('rlw1')
# isd_k
if len(vals) == 5: ks.append('isdk'); vals.append('isdk415') # 415, inflexion at 2.5k.
# adam eps
if len(vals) == 6: ks.append('adam_eps'); vals.append('aeps1en8')
# init lr
if len(vals) == 7: ks.append('init_lr'); vals.append('lr1en3')
assert(len(vals) == 8)
vars = utils.Foo()
for k, v in zip(ks, vals):
setattr(vars, k, v)
logging.error('solver_vars: %s', vars)
return vars
def process_solver_str(solver_str):
solver = utils.Foo(
seed=0, learning_rate_decay=None, clip_gradient_norm=None, max_steps=None,
initial_learning_rate=None, momentum=None, steps_per_decay=None,
logdir=None, sync=False, adjust_lr_sync=True, wt_decay=0.0001,
data_loss_wt=None, reg_loss_wt=None, freeze_conv=True, num_workers=1,
task=0, ps_tasks=0, master='local', typ=None, momentum2=None,
adam_eps=None)
# Clobber with overrides from solver str.
solver_vars = get_solver_vars(solver_str)
solver.data_loss_wt = float(solver_vars.dlw[3:].replace('x', '.'))
solver.adam_eps = float(solver_vars.adam_eps[4:].replace('x', '.').replace('n', '-'))
solver.initial_learning_rate = float(solver_vars.init_lr[2:].replace('x', '.').replace('n', '-'))
solver.reg_loss_wt = float(solver_vars.rlw[3:].replace('x', '.'))
solver.isd_k = float(solver_vars.isdk[4:].replace('x', '.'))
long = solver_vars.long
if long == 'long':
solver.steps_per_decay = 40000
solver.max_steps = 120000
elif long == 'long2':
solver.steps_per_decay = 80000
solver.max_steps = 120000
elif long == 'nolong' or long == 'nol':
solver.steps_per_decay = 20000
solver.max_steps = 60000
else:
logging.fatal('solver_vars.long should be long, long2, nolong or nol.')
assert(False)
clip = solver_vars.clip
if clip == 'noclip' or clip == 'nocl':
solver.clip_gradient_norm = 0
elif clip[:4] == 'clip':
solver.clip_gradient_norm = float(clip[4:].replace('x', '.'))
else:
logging.fatal('Unknown solver_vars.clip: %s', clip)
assert(False)
typ = solver_vars.typ
if typ == 'adam':
solver.typ = 'adam'
solver.momentum = 0.9
solver.momentum2 = 0.999
solver.learning_rate_decay = 1.0
elif typ == 'adam2':
solver.typ = 'adam'
solver.momentum = 0.9
solver.momentum2 = 0.999
solver.learning_rate_decay = 0.1
elif typ == 'sgd':
solver.typ = 'sgd'
solver.momentum = 0.99
solver.momentum2 = None
solver.learning_rate_decay = 0.1
else:
logging.fatal('Unknown solver_vars.typ: %s', typ)
assert(False)
logging.error('solver: %s', solver)
return solver
def get_navtask_vars(navtask_str):
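  # navtask_str packs up to ten '_'-separated fields in the order of ks_all;
  # any omitted suffix falls back to the defaults appended here. E.g.
  # (hypothetical) 'sbpd_rgb_r2r_h0_64_80_8_8_nv2_straug' selects the sbpd
  # dataset, RGB input, the room-to-room task, no history frames, max_dist=64,
  # num_steps=80, step_size=8, n_ori=8, 2 auxiliary views on each side, and
  # structured data augmentation.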
if navtask_str == '': vals = []
else: vals = navtask_str.split('_')
ks_all = ['dataset_name', 'modality', 'task', 'history', 'max_dist',
'num_steps', 'step_size', 'n_ori', 'aux_views', 'data_aug']
ks = ks_all[:len(vals)]
# All data or not.
if len(vals) == 0: ks.append('dataset_name'); vals.append('sbpd')
# modality
if len(vals) == 1: ks.append('modality'); vals.append('rgb')
# semantic task?
if len(vals) == 2: ks.append('task'); vals.append('r2r')
# number of history frames.
if len(vals) == 3: ks.append('history'); vals.append('h0')
# max steps
if len(vals) == 4: ks.append('max_dist'); vals.append('32')
# num steps
if len(vals) == 5: ks.append('num_steps'); vals.append('40')
# step size
if len(vals) == 6: ks.append('step_size'); vals.append('8')
# n_ori
if len(vals) == 7: ks.append('n_ori'); vals.append('4')
# Auxiliary views.
if len(vals) == 8: ks.append('aux_views'); vals.append('nv0')
  # Normal data augmentation as opposed to structured data augmentation (if
  # set to 'straug').
if len(vals) == 9: ks.append('data_aug'); vals.append('straug')
assert(len(vals) == 10)
for i in range(len(ks)):
assert(ks[i] == ks_all[i])
vars = utils.Foo()
for k, v in zip(ks, vals):
setattr(vars, k, v)
logging.error('navtask_vars: %s', vals)
return vars
def process_navtask_str(navtask_str):
navtask = nec.nav_env_base_config()
# Clobber with overrides from strings.
navtask_vars = get_navtask_vars(navtask_str)
navtask.task_params.n_ori = int(navtask_vars.n_ori)
navtask.task_params.max_dist = int(navtask_vars.max_dist)
navtask.task_params.num_steps = int(navtask_vars.num_steps)
navtask.task_params.step_size = int(navtask_vars.step_size)
navtask.task_params.data_augment.delta_xy = int(navtask_vars.step_size)/2.
n_aux_views_each = int(navtask_vars.aux_views[2])
aux_delta_thetas = np.concatenate((np.arange(n_aux_views_each) + 1,
-1 -np.arange(n_aux_views_each)))
aux_delta_thetas = aux_delta_thetas*np.deg2rad(navtask.camera_param.fov)
navtask.task_params.aux_delta_thetas = aux_delta_thetas
if navtask_vars.data_aug == 'aug':
navtask.task_params.data_augment.structured = False
elif navtask_vars.data_aug == 'straug':
navtask.task_params.data_augment.structured = True
else:
logging.fatal('Unknown navtask_vars.data_aug %s.', navtask_vars.data_aug)
assert(False)
navtask.task_params.num_history_frames = int(navtask_vars.history[1:])
navtask.task_params.n_views = 1+navtask.task_params.num_history_frames
navtask.task_params.goal_channels = int(navtask_vars.n_ori)
if navtask_vars.task == 'hard':
navtask.task_params.type = 'rng_rejection_sampling_many'
navtask.task_params.rejection_sampling_M = 2000
navtask.task_params.min_dist = 10
elif navtask_vars.task == 'r2r':
navtask.task_params.type = 'room_to_room_many'
elif navtask_vars.task == 'ST':
# Semantic task at hand.
navtask.task_params.goal_channels = \
len(navtask.task_params.semantic_task.class_map_names)
navtask.task_params.rel_goal_loc_dim = \
len(navtask.task_params.semantic_task.class_map_names)
navtask.task_params.type = 'to_nearest_obj_acc'
else:
    logging.fatal('navtask_vars.task should be hard, r2r, or ST.')
assert(False)
if navtask_vars.modality == 'rgb':
navtask.camera_param.modalities = ['rgb']
navtask.camera_param.img_channels = 3
elif navtask_vars.modality == 'd':
navtask.camera_param.modalities = ['depth']
navtask.camera_param.img_channels = 2
navtask.task_params.img_height = navtask.camera_param.height
navtask.task_params.img_width = navtask.camera_param.width
navtask.task_params.modalities = navtask.camera_param.modalities
navtask.task_params.img_channels = navtask.camera_param.img_channels
navtask.task_params.img_fov = navtask.camera_param.fov
navtask.dataset = factory.get_dataset(navtask_vars.dataset_name)
return navtask
# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import pprint
import copy
import os
from tensorflow.python.platform import app
from tensorflow.python.platform import flags
import logging
import src.utils as utils
import cfgs.config_common as cc
import tensorflow as tf
rgb_resnet_v2_50_path = 'cache/resnet_v2_50_inception_preprocessed/model.ckpt-5136169'
def get_default_args():
robot = utils.Foo(radius=15, base=10, height=140, sensor_height=120,
camera_elevation_degree=-15)
camera_param = utils.Foo(width=225, height=225, z_near=0.05, z_far=20.0,
fov=60., modalities=['rgb', 'depth'])
env = utils.Foo(padding=10, resolution=5, num_point_threshold=2,
valid_min=-10, valid_max=200, n_samples_per_face=200)
data_augment = utils.Foo(lr_flip=0, delta_angle=1, delta_xy=4, relight=False,
relight_fast=False, structured=False)
task_params = utils.Foo(num_actions=4, step_size=4, num_steps=0,
batch_size=32, room_seed=0, base_class='Building',
task='mapping', n_ori=6, data_augment=data_augment,
output_transform_to_global_map=False,
output_canonical_map=False,
output_incremental_transform=False,
output_free_space=False, move_type='shortest_path',
toy_problem=0)
buildinger_args = utils.Foo(building_names=['area1_gates_wingA_floor1_westpart'],
env_class=None, robot=robot,
task_params=task_params, env=env,
camera_param=camera_param)
solver_args = utils.Foo(seed=0, learning_rate_decay=0.1,
clip_gradient_norm=0, max_steps=120000,
initial_learning_rate=0.001, momentum=0.99,
steps_per_decay=40000, logdir=None, sync=False,
adjust_lr_sync=True, wt_decay=0.0001,
data_loss_wt=1.0, reg_loss_wt=1.0,
num_workers=1, task=0, ps_tasks=0, master='local')
summary_args = utils.Foo(display_interval=1, test_iters=100)
control_args = utils.Foo(train=False, test=False,
force_batchnorm_is_training_at_test=False)
arch_args = utils.Foo(rgb_encoder='resnet_v2_50', d_encoder='resnet_v2_50')
return utils.Foo(solver=solver_args,
summary=summary_args, control=control_args, arch=arch_args,
buildinger=buildinger_args)
def get_vars(config_name):
vars = config_name.split('_')
if len(vars) == 1: # All data or not.
vars.append('noall')
if len(vars) == 2: # n_ori
vars.append('4')
logging.error('vars: %s', vars)
return vars
def get_args_for_config(config_name):
args = get_default_args()
config_name, mode = config_name.split('+')
vars = get_vars(config_name)
logging.info('config_name: %s, mode: %s', config_name, mode)
args.buildinger.task_params.n_ori = int(vars[2])
args.solver.freeze_conv = True
args.solver.pretrained_path = rgb_resnet_v2_50_path
args.buildinger.task_params.img_channels = 5
args.solver.data_loss_wt = 0.00001
  if vars[0] == 'v0':
    pass
else:
logging.error('config_name: %s undefined', config_name)
args.buildinger.task_params.height = args.buildinger.camera_param.height
args.buildinger.task_params.width = args.buildinger.camera_param.width
args.buildinger.task_params.modalities = args.buildinger.camera_param.modalities
if vars[1] == 'all':
args = cc.get_args_for_mode_building_all(args, mode)
elif vars[1] == 'noall':
args = cc.get_args_for_mode_building(args, mode)
# Log the arguments
logging.error('%s', args)
return args