"vscode:/vscode.git/clone" did not exist on "92db4bd5a09a8eff62e089ed2ef6bf1670346128"
Commit 8cedd479 authored by Lukasz Kaiser's avatar Lukasz Kaiser Committed by GitHub
Browse files

Merge pull request #1878 from ofirnachum/master

Implementation of PCL and other newly proposed algorithms
Code for several RL algorithms used in the following papers:
* "Improving Policy Gradient by Exploring Under-appreciated Rewards" by
Ofir Nachum, Mohammad Norouzi, and Dale Schuurmans.
* "Bridging the Gap Between Value and Policy Based Reinforcement Learning" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
* "Trust-PCL: An Off-Policy Trust Region Method for Continuous Control" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
Available algorithms:
* Actor Critic
* TRPO
* PCL
* Unified PCL
* Trust-PCL
* PCL + Constraint Trust Region (unpublished)
* REINFORCE
* UREX
Requirements:
* TensorFlow (see http://www.tensorflow.org for how to install/upgrade)
* OpenAI Gym (see http://gym.openai.com/docs)
* NumPy (see http://www.numpy.org/)
* SciPy (see http://www.scipy.org/)
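To confirm the requirements above are available, the following optional sanity check (a sketch only; it assumes a gym release that still ships the algorithmic tasks used below) imports the dependencies and constructs one of the example environments:
```
import gym
import numpy as np
import scipy
import tensorflow as tf

# DuplicatedInput-v0 is the environment used in the Quick Start examples below.
env = gym.make('DuplicatedInput-v0')
print(env.observation_space)
print(env.action_space)
```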
Quick Start:
Run UREX on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.1 --clip_norm=50 \
--num_samples=10 --objective=urex
```
Run REINFORCE on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.01 --clip_norm=50 \
--num_samples=10 --objective=reinforce
```
Run PCL on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl
```
Run PCL with expert trajectories on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl \
--num_expert_paths=10
```
Run a MuJoCo task with TRPO:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --rollout=10 --gamma=0.995 \
--max_step=1000 --cutoff_agent=1000 \
--objective=trpo --norecurrent --internal_dim=64 --trust_region_p \
--max_divergence=0.05 --value_opt=best_fit --critic_weight=0.0
```
Run a MuJoCo task with Trust-PCL:
```
python trainer.py --logtostderr --batch_size=1 --env=HalfCheetah-v1 \
--validation_frequency=50 --rollout=10 --critic_weight=0.0 \
--gamma=0.995 --clip_norm=40 --learning_rate=0.002 \
--replay_buffer_freq=1 --replay_buffer_size=20000 \
--replay_buffer_alpha=0.1 --norecurrent --objective=pcl \
--max_step=100 --tau=0.0 --eviction=fifo --max_divergence=0.001 \
--internal_dim=64 --cutoff_agent=1000 \
--replay_batch_size=25 --nouse_online_batch --batch_by_steps \
--sample_from=target --value_opt=grad --value_hidden_layers=2 \
--update_eps_lambda --unify_episodes --clip_adv=1.0 \
--target_network_lag=0.99 --prioritize_by=step
```
Run a MuJoCo task with PCL constraint trust region:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --tau=0.001 --rollout=50 --gamma=0.99 \
--max_step=1000 --cutoff_agent=1000 \
--objective=pcl --norecurrent --internal_dim=64 --trust_region_p \
--max_divergence=0.01 --value_opt=best_fit --critic_weight=0.0 \
--tau_decay=0.1 --tau_start=0.1
```
Maintained by Ofir Nachum (ofirnachum).
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Baseline model for value estimates.
Implements the value component of the neural network.
In some cases this is just an additional linear layer on the policy.
In other cases, it is a completely separate neural network.
"""
import tensorflow as tf
import numpy as np
class Baseline(object):
def __init__(self, env_spec, internal_policy_dim,
input_prev_actions=True,
input_time_step=False,
input_policy_state=True,
n_hidden_layers=0,
hidden_dim=64,
tau=0.0):
self.env_spec = env_spec
self.internal_policy_dim = internal_policy_dim
self.input_prev_actions = input_prev_actions
self.input_time_step = input_time_step
self.input_policy_state = input_policy_state
self.n_hidden_layers = n_hidden_layers
self.hidden_dim = hidden_dim
self.tau = tau
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
def get_inputs(self, time_step, obs, prev_actions,
internal_policy_states):
"""Get inputs to network as single tensor."""
inputs = [tf.ones_like(time_step)]
input_dim = 1
if not self.input_policy_state:
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
inputs.append(
tf.one_hot(obs[i], obs_dim))
input_dim += obs_dim
elif self.env_spec.is_box(obs_type):
cur_obs = obs[i]
inputs.append(cur_obs)
inputs.append(cur_obs ** 2)
input_dim += obs_dim * 2
else:
assert False
if self.input_prev_actions:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(act_type):
inputs.append(
tf.one_hot(prev_actions[i], act_dim))
input_dim += act_dim
elif self.env_spec.is_box(act_type):
inputs.append(prev_actions[i])
input_dim += act_dim
else:
assert False
if self.input_policy_state:
inputs.append(internal_policy_states)
input_dim += self.internal_policy_dim
if self.input_time_step:
scaled_time = 0.01 * time_step
inputs.extend([scaled_time, scaled_time ** 2, scaled_time ** 3])
input_dim += 3
return input_dim, tf.concat(inputs, 1)
def reshape_batched_inputs(self, all_obs, all_actions,
internal_policy_states, policy_logits):
"""Reshape inputs from [time_length, batch_size, ...] to
[time_length * batch_size, ...].
This allows for computing the value estimate in one go.
"""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_prev_act = []
reshaped_policy_logits = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
reshaped_policy_logits.append(
tf.reshape(policy_logits[i], [time_length * batch_size, -1]))
reshaped_internal_policy_states = tf.reshape(
internal_policy_states,
[time_length * batch_size, self.internal_policy_dim])
time_step = (float(self.input_time_step) *
tf.expand_dims(
tf.to_float(tf.range(time_length * batch_size) /
batch_size), -1))
return (time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits)
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
"""Get value estimates given input."""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
for depth in xrange(self.n_hidden_layers):
with tf.variable_scope('value_layer%d' % depth):
w = tf.get_variable('w', [input_dim, self.hidden_dim])
inputs = tf.nn.tanh(tf.matmul(inputs, w))
input_dim = self.hidden_dim
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
values = tf.matmul(inputs, w_v)
values = tf.reshape(values, [time_length, batch_size])
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
class UnifiedBaseline(Baseline):
"""Baseline for Unified PCL."""
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
def f_transform(q_values, tau):
max_q = tf.reduce_max(q_values, -1, keep_dims=True)
return tf.squeeze(max_q, [-1]) + tau * tf.log(
tf.reduce_sum(tf.exp((q_values - max_q) / tau), -1))
assert len(reshaped_policy_logits) == 1
values = f_transform((self.tau + self.eps_lambda) * reshaped_policy_logits[0],
(self.tau + self.eps_lambda))
values = tf.reshape(values, [time_length, batch_size])
# not used
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
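# Illustrative NumPy sketch of the stabilized log-sum-exp transform that
# UnifiedBaseline.f_transform above applies to the (tau + eps_lambda)-scaled
# policy logits; the q values and tau below are made-up numbers.
import numpy as np

def soft_value(q_values, tau):
  # V = tau * log sum_a exp(Q_a / tau), computed stably by factoring out the max.
  max_q = np.max(q_values, axis=-1, keepdims=True)
  return np.squeeze(max_q, -1) + tau * np.log(
      np.sum(np.exp((q_values - max_q) / tau), axis=-1))

q = np.array([[1.0, 2.0, 3.0]])
tau = 0.5
assert np.allclose(soft_value(q, tau),
                   tau * np.log(np.sum(np.exp(q / tau), axis=-1)))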
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Controller coordinates sampling and training model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import pickle
import random
flags = tf.flags
gfile = tf.gfile
FLAGS = flags.FLAGS
def find_best_eps_lambda(rewards, lengths):
"""Find the best lambda given a desired epsilon = FLAGS.max_divergence."""
# perhaps not the best way to do this
desired_div = FLAGS.max_divergence * np.mean(lengths)
def calc_divergence(eps_lambda):
max_reward = np.max(rewards)
logz = (max_reward / eps_lambda +
np.log(np.mean(np.exp((rewards - max_reward) / eps_lambda))))
exprr = np.mean(np.exp(rewards / eps_lambda - logz) *
rewards / eps_lambda)
return exprr - logz
left = 0.0
right = 1000.0
if len(rewards) <= 8:
return (left + right) / 2
num_iter = max(4, 1 + int(np.log((right - left) / 0.1) / np.log(2.0)))
for _ in xrange(num_iter):
mid = (left + right) / 2
cur_div = calc_divergence(mid)
if cur_div > desired_div:
left = mid
else:
right = mid
return (left + right) / 2
class Controller(object):
def __init__(self, env, env_spec, internal_dim,
use_online_batch=True,
batch_by_steps=False,
unify_episodes=False,
replay_batch_size=None,
max_step=None,
cutoff_agent=1,
save_trajectories_file=None,
use_trust_region=False,
use_value_opt=False,
update_eps_lambda=False,
prioritize_by='rewards',
get_model=None,
get_replay_buffer=None,
get_buffer_seeds=None):
self.env = env
self.env_spec = env_spec
self.internal_dim = internal_dim
self.use_online_batch = use_online_batch
self.batch_by_steps = batch_by_steps
self.unify_episodes = unify_episodes
self.replay_batch_size = replay_batch_size
self.max_step = max_step
self.cutoff_agent = cutoff_agent
self.save_trajectories_file = save_trajectories_file
self.use_trust_region = use_trust_region
self.use_value_opt = use_value_opt
self.update_eps_lambda = update_eps_lambda
self.prioritize_by = prioritize_by
self.model = get_model()
self.replay_buffer = get_replay_buffer()
self.seed_replay_buffer(get_buffer_seeds())
self.internal_state = np.array([self.initial_internal_state()] *
len(self.env))
self.last_obs = self.env_spec.initial_obs(len(self.env))
self.last_act = self.env_spec.initial_act(len(self.env))
self.last_pad = np.zeros(len(self.env))
self.start_episode = np.array([True] * len(self.env))
self.step_count = np.array([0] * len(self.env))
self.episode_running_rewards = np.zeros(len(self.env))
self.episode_running_lengths = np.zeros(len(self.env))
self.episode_rewards = []
self.episode_lengths = []
self.total_rewards = []
self.best_batch_rewards = None
def setup(self):
self.model.setup()
def initial_internal_state(self):
return np.zeros(self.model.policy.rnn_state_dim)
def _sample_episodes(self, sess, greedy=False):
"""Sample episodes from environment using model."""
# reset environments as necessary
obs_after_reset = self.env.reset_if(self.start_episode)
for i, obs in enumerate(obs_after_reset):
if obs is not None:
self.step_count[i] = 0
self.internal_state[i] = self.initial_internal_state()
for j in xrange(len(self.env_spec.obs_dims)):
self.last_obs[j][i] = obs[j]
for j in xrange(len(self.env_spec.act_dims)):
self.last_act[j][i] = -1
self.last_pad[i] = 0
# maintain episode as a single unit if the last sampling
# batch ended before the episode was terminated
if self.unify_episodes:
assert len(obs_after_reset) == 1
new_ep = obs_after_reset[0] is not None
else:
new_ep = True
self.start_id = 0 if new_ep else len(self.all_obs[:])
initial_state = self.internal_state
all_obs = [] if new_ep else self.all_obs[:]
all_act = ([self.last_act] if new_ep else self.all_act[:])
all_pad = [] if new_ep else self.all_pad[:]
rewards = [] if new_ep else self.rewards[:]
# start stepping in the environments
step = 0
while not self.env.all_done():
self.step_count += 1 - np.array(self.env.dones)
next_internal_state, sampled_actions = self.model.sample_step(
sess, self.last_obs, self.internal_state, self.last_act,
greedy=greedy)
env_actions = self.env_spec.convert_actions_to_env(sampled_actions)
next_obs, reward, next_dones, _ = self.env.step(env_actions)
all_obs.append(self.last_obs)
all_act.append(sampled_actions)
all_pad.append(self.last_pad)
rewards.append(reward)
self.internal_state = next_internal_state
self.last_obs = next_obs
self.last_act = sampled_actions
self.last_pad = np.array(next_dones).astype('float32')
step += 1
if self.max_step and step >= self.max_step:
break
self.all_obs = all_obs[:]
self.all_act = all_act[:]
self.all_pad = all_pad[:]
self.rewards = rewards[:]
# append final observation
all_obs.append(self.last_obs)
return initial_state, all_obs, all_act, rewards, all_pad
def sample_episodes(self, sess):
"""Sample steps from the environment until we have enough for a batch."""
# check if last batch ended with episode that was not terminated
if self.unify_episodes:
self.all_new_ep = self.start_episode[0]
# sample episodes until we either have enough episodes or enough steps
episodes = []
total_steps = 0
while total_steps < self.max_step * len(self.env):
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess)
observations = zip(*observations)
actions = zip(*actions)
terminated = np.array(self.env.dones)
self.total_rewards = np.sum(np.array(rewards[self.start_id:]) *
(1 - np.array(pads[self.start_id:])), axis=0)
self.episode_running_rewards *= 1 - self.start_episode
self.episode_running_lengths *= 1 - self.start_episode
self.episode_running_rewards += self.total_rewards
self.episode_running_lengths += np.sum(1 - np.array(pads[self.start_id:]), axis=0)
episodes.extend(self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads))
total_steps += np.sum(1 - np.array(pads))
# set next starting episodes
self.start_episode = np.logical_or(terminated,
self.step_count >= self.cutoff_agent)
episode_rewards = self.episode_running_rewards[self.start_episode].tolist()
self.episode_rewards.extend(episode_rewards)
self.episode_lengths.extend(self.episode_running_lengths[self.start_episode].tolist())
self.episode_rewards = self.episode_rewards[-100:]
self.episode_lengths = self.episode_lengths[-100:]
if (self.save_trajectories_file is not None and
(self.best_batch_rewards is None or
np.mean(self.total_rewards) > self.best_batch_rewards)):
self.best_batch_rewards = np.mean(self.total_rewards)
my_episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
with gfile.GFile(self.save_trajectories_file, 'w') as f:
pickle.dump(my_episodes, f)
if not self.batch_by_steps:
return (initial_state,
observations, actions, rewards,
terminated, pads)
return self.convert_to_batched_episodes(episodes)
def _train(self, sess,
observations, initial_state, actions,
rewards, terminated, pads):
"""Train model using batch."""
if self.use_trust_region:
# use trust region to optimize policy
loss, _, summary = self.model.trust_region_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
else: # otherwise use simple gradient descent on policy
loss, _, summary = self.model.train_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
if self.use_value_opt: # optionally perform specific value optimization
self.model.fit_values(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary
def train(self, sess):
"""Sample some episodes and train on some episodes."""
cur_step = sess.run(self.model.inc_global_step)
self.cur_step = cur_step
# on the first iteration, set target network close to online network
if self.cur_step == 0:
for _ in xrange(100):
sess.run(self.model.copy_op)
# on other iterations, just perform single target <-- online operation
sess.run(self.model.copy_op)
# sample from env
(initial_state,
observations, actions, rewards,
terminated, pads) = self.sample_episodes(sess)
# add to replay buffer
self.add_to_replay_buffer(
initial_state, observations, actions,
rewards, terminated, pads)
loss, summary = 0, None
# train on online batch
if self.use_online_batch:
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
# update relative entropy coefficient
if self.update_eps_lambda:
episode_rewards = np.array(self.episode_rewards)
episode_lengths = np.array(self.episode_lengths)
eps_lambda = find_best_eps_lambda(episode_rewards, episode_lengths)
sess.run(self.model.objective.assign_eps_lambda,
feed_dict={self.model.objective.new_eps_lambda: eps_lambda})
# train on replay batch
replay_batch, replay_probs = self.get_from_replay_buffer(
self.replay_batch_size)
if replay_batch:
(initial_state,
observations, actions, rewards,
terminated, pads) = replay_batch
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary, self.total_rewards, self.episode_rewards
def eval(self, sess):
"""Use greedy sampling."""
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess, greedy=True)
total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)
return np.mean(total_rewards)
def convert_from_batched_episodes(
self, initial_state, observations, actions, rewards,
terminated, pads):
"""Convert time-major batch of episodes to batch-major list of episodes."""
rewards = np.array(rewards)
pads = np.array(pads)
observations = [np.array(obs) for obs in observations]
actions = [np.array(act) for act in actions]
total_rewards = np.sum(rewards * (1 - pads), axis=0)
total_length = np.sum(1 - pads, axis=0).astype('int32')
episodes = []
num_episodes = rewards.shape[1]
for i in xrange(num_episodes):
length = total_length[i]
ep_initial = initial_state[i]
ep_obs = [obs[:length, i, ...] for obs in observations]
ep_act = [act[:length + 1, i, ...] for act in actions]
ep_rewards = rewards[:length, i]
episodes.append(
[ep_initial, ep_obs, ep_act, ep_rewards, terminated[i]])
return episodes
def convert_to_batched_episodes(self, episodes, max_length=None):
"""Convert batch-major list of episodes to time-major batch of episodes."""
lengths = [len(ep[-2]) for ep in episodes]
max_length = max_length or max(lengths)
new_episodes = []
for ep, length in zip(episodes, lengths):
initial, observations, actions, rewards, terminated = ep
observations = [np.resize(obs, [max_length + 1] + list(obs.shape)[1:])
for obs in observations]
actions = [np.resize(act, [max_length + 1] + list(act.shape)[1:])
for act in actions]
pads = np.array([0] * length + [1] * (max_length - length))
rewards = np.resize(rewards, [max_length]) * (1 - pads)
new_episodes.append([initial, observations, actions, rewards,
terminated, pads])
(initial, observations, actions, rewards,
terminated, pads) = zip(*new_episodes)
observations = [np.swapaxes(obs, 0, 1)
for obs in zip(*observations)]
actions = [np.swapaxes(act, 0, 1)
for act in zip(*actions)]
rewards = np.transpose(rewards)
pads = np.transpose(pads)
return (initial, observations, actions, rewards, terminated, pads)
def add_to_replay_buffer(self, initial_state,
observations, actions, rewards,
terminated, pads):
"""Add batch of episodes to replay buffer."""
if self.replay_buffer is None:
return
rewards = np.array(rewards)
pads = np.array(pads)
total_rewards = np.sum(rewards * (1 - pads), axis=0)
episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
priorities = (total_rewards if self.prioritize_by == 'reward'
else self.cur_step)
if not self.unify_episodes or self.all_new_ep:
self.last_idxs = self.replay_buffer.add(
episodes, priorities)
else:
# If we are unifying episodes, we attempt to
# keep them unified in the replay buffer.
# The first episode sampled in the current batch is a
# continuation of the last episode from the previous batch
self.replay_buffer.add(episodes[:1], priorities, self.last_idxs[-1:])
if len(episodes) > 1:
self.replay_buffer.add(episodes[1:], priorities)
def get_from_replay_buffer(self, batch_size):
"""Sample a batch of episodes from the replay buffer."""
if self.replay_buffer is None or len(self.replay_buffer) < 1 * batch_size:
return None, None
desired_count = batch_size * self.max_step
# in the case of batch_by_steps, we sample larger and larger
# amounts from the replay buffer until we have enough steps.
while True:
if batch_size > len(self.replay_buffer):
batch_size = len(self.replay_buffer)
episodes, probs = self.replay_buffer.get_batch(batch_size)
count = sum(len(ep[-2]) for ep in episodes)
if count >= desired_count or not self.batch_by_steps:
break
if batch_size == len(self.replay_buffer):
return None, None
batch_size *= 1.2
return (self.convert_to_batched_episodes(episodes), probs)
def seed_replay_buffer(self, episodes):
"""Seed the replay buffer with some episodes."""
if self.replay_buffer is None:
return
# just need to add initial state
for i in xrange(len(episodes)):
episodes[i] = [self.initial_internal_state()] + episodes[i]
self.replay_buffer.seed_buffer(episodes)
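# Illustrative NumPy sketch of the time-major layout the controller works with
# (the arrays below are made-up examples): rewards[t, b] is the reward at step
# t of episode b, and pads[t, b] == 1 marks steps past the end of episode b,
# mirroring the reductions in convert_from_batched_episodes above.
import numpy as np

rewards = np.array([[1., 1.],
                    [1., 0.],
                    [1., 0.]])   # 3 time steps, 2 episodes
pads = np.array([[0., 0.],
                 [0., 1.],
                 [0., 1.]])      # episode 1 terminated after one step
total_rewards = np.sum(rewards * (1 - pads), axis=0)       # [3., 1.]
total_length = np.sum(1 - pads, axis=0).astype('int32')    # [3, 1]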
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for environment interface with agent / tensorflow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class spaces(object):
discrete = 0
box = 1
def get_space(space):
if hasattr(space, 'n'):
return space.n, spaces.discrete, None
elif hasattr(space, 'shape'):
return np.prod(space.shape), spaces.box, (space.low, space.high)
def get_spaces(spaces):
if hasattr(spaces, 'spaces'):
return zip(*[get_space(space) for space in spaces.spaces])
else:
return [(ret,) for ret in get_space(spaces)]
class EnvSpec(object):
def __init__(self, env, try_combining_actions=True,
discretize_actions=None):
self.discretize_actions = discretize_actions
# figure out observation space
self.obs_space = env.observation_space
self.obs_dims, self.obs_types, self.obs_info = get_spaces(self.obs_space)
# figure out action space
self.act_space = env.action_space
self.act_dims, self.act_types, self.act_info = get_spaces(self.act_space)
if self.discretize_actions:
self._act_dims = self.act_dims[:]
self._act_types = self.act_types[:]
self.act_dims = []
self.act_types = []
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
self.act_dims.append(dim)
self.act_types.append(spaces.discrete)
elif typ == spaces.box:
for _ in xrange(dim):
self.act_dims.append(self.discretize_actions)
self.act_types.append(spaces.discrete)
else:
self._act_dims = None
self._act_types = None
if (try_combining_actions and
all(typ == spaces.discrete for typ in self.act_types)):
self.combine_actions = True
self.orig_act_dims = self.act_dims[:]
self.orig_act_types = self.act_types[:]
total_act_dim = 1
for dim in self.act_dims:
total_act_dim *= dim
self.act_dims = [total_act_dim]
self.act_types = [spaces.discrete]
else:
self.combine_actions = False
self.obs_dims_and_types = zip(self.obs_dims, self.obs_types)
self.act_dims_and_types = zip(self.act_dims, self.act_types)
self.total_obs_dim = sum(self.obs_dims)
self.total_sampling_act_dim = sum(self.sampling_dim(dim, typ)
for dim, typ in self.act_dims_and_types)
self.total_sampled_act_dim = sum(self.act_dims)
def sampling_dim(self, dim, typ):
if typ == spaces.discrete:
return dim
elif typ == spaces.box:
return 2 * dim # Gaussian mean and std
else:
assert False
def convert_actions_to_env(self, actions):
if self.combine_actions:
new_actions = []
actions = actions[0]
for dim in self.orig_act_dims:
new_actions.append(np.mod(actions, dim))
actions = (actions / dim).astype('int32')
actions = new_actions
if self.discretize_actions:
new_actions = []
idx = 0
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
new_actions.append(actions[idx])
idx += 1
elif typ == spaces.box:
low, high = self.act_info[i]
cur_action = []
for j in xrange(dim):
cur_action.append(
low[j] + (high[j] - low[j]) * actions[idx] /
float(self.discretize_actions))
idx += 1
new_actions.append(np.hstack(cur_action))
actions = new_actions
return actions
def convert_env_actions_to_actions(self, actions):
if not self.combine_actions:
return actions
new_actions = 0
base = 1
for act, dim in zip(actions, self.orig_act_dims):
new_actions = new_actions + base * act
base *= dim
return [new_actions]
def convert_obs_to_list(self, obs):
if len(self.obs_dims) == 1:
return [obs]
else:
return list(obs)
def convert_action_to_gym(self, action):
  if ((not self.combine_actions or len(self.orig_act_dims) == 1) and
      (len(self.act_dims) == 1 or
       (self.discretize_actions and len(self._act_dims) == 1))):
    return action[0]
  else:
    return list(action)
def initial_obs(self, batch_size):
batched = batch_size is not None
batch_size = batch_size or 1
obs = []
for dim, typ in self.obs_dims_and_types:
if typ == spaces.discrete:
obs.append(np.zeros(batch_size))
elif typ == spaces.box:
obs.append(np.zeros([batch_size, dim]))
if batched:
return obs
else:
return zip(*obs)[0]
def initial_act(self, batch_size=None):
batched = batch_size is not None
batch_size = batch_size or 1
act = []
for dim, typ in self.act_dims_and_types:
if typ == spaces.discrete:
act.append(-np.ones(batch_size))
elif typ == spaces.box:
act.append(-np.ones([batch_size, dim]))
if batched:
return act
else:
return zip(*act)[0]
def is_discrete(self, typ):
return typ == spaces.discrete
def is_box(self, typ):
return typ == spaces.box
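# Illustrative sketch of the action packing used when try_combining_actions
# collapses several discrete sub-actions into a single index (the dims and
# values below are made up); it mirrors convert_env_actions_to_actions and the
# decoding loop in convert_actions_to_env above.
orig_act_dims = [2, 2, 5]
env_action = [1, 0, 3]          # one entry per original discrete sub-space

packed = 0
base = 1
for act, dim in zip(env_action, orig_act_dims):
  packed += base * act          # mixed-radix encoding
  base *= dim                   # packed == 1 + 0*2 + 3*4 == 13

decoded = []
rest = packed
for dim in orig_act_dims:
  decoded.append(rest % dim)    # peel off one digit per sub-space
  rest //= dim
assert decoded == env_action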
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Expert paths/trajectories.
For producing or loading expert trajectories in environment.
"""
import tensorflow as tf
import random
import os
import numpy as np
import pickle
gfile = tf.gfile
def sample_expert_paths(num, env_str, env_spec,
load_trajectories_file=None):
"""Sample a number of expert paths randomly."""
if load_trajectories_file is not None:
if not gfile.Exists(load_trajectories_file):
assert False, 'trajectories file %s does not exist' % load_trajectories_file
with gfile.GFile(load_trajectories_file, 'r') as f:
episodes = pickle.load(f)
episodes = random.sample(episodes, num)
return [ep[1:] for ep in episodes]
return [sample_expert_path(env_str, env_spec)
for _ in xrange(num)]
def sample_expert_path(env_str, env_spec):
"""Algorithmic tasks have known distribution of expert paths we sample from."""
t = random.randint(2, 10) # sequence length
observations = []
actions = [env_spec.initial_act(None)]
rewards = []
if env_str in ['DuplicatedInput-v0', 'Copy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(t):
char_idx = tt // 2 if env_str == 'DuplicatedInput-v0' else tt
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1, (tt + 1) % 2, char])
rewards.append((tt + 1) % 2)
elif env_str in ['RepeatCopy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(3 * t + 2):
char_idx = (tt if tt < t else
2 * t - tt if tt <= 2 * t else
tt - 2 * t - 2)
if tt in [t, 2 * t + 1]:
char = chars
else:
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1 if tt < t else 0 if tt <= 2 * t else 1,
tt not in [t, 2 * t + 1], char])
rewards.append(actions[-1][-2])
elif env_str in ['Reverse-v0']:
chars = 2
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(2 * t + 1):
char_idx = tt if tt < t else 2 * t - tt
if tt != t:
char = random_ints[char_idx] % chars
else:
char = chars
observations.append([char])
actions.append([tt < t, tt > t, char])
rewards.append(tt > t)
elif env_str in ['ReversedAddition-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 2 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 1, 2: 2, 3: 1}
for tt in xrange(2 * t + 1):
char_idx = tt
if tt >= 2 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 2 == 1:
tot = char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 2 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 2 or tt == 2 * t, tot])
rewards.append(tt % 2 or tt == 2 * t)
elif env_str in ['ReversedAddition3-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 3 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 3, 2: 1, 3: 2, 4:2, 5: 1}
for tt in xrange(3 * t + 1):
char_idx = tt
if tt >= 3 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 3 == 2:
tot = char_history[-3] + char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 3 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 3 == 2 or tt == 3 * t, tot])
rewards.append(tt % 3 == 2 or tt == 3 * t)
else:
assert False, 'No expert trajectories for env %s' % env_str
actions = [
env_spec.convert_env_actions_to_actions(act)
for act in actions]
observations.append([chars])
observations = [np.array(obs) for obs in zip(*observations)]
actions = [np.array(act) for act in zip(*actions)]
rewards = np.array(rewards)
return [observations, actions, rewards, True]
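if __name__ == '__main__':
  # Illustrative usage sketch: it assumes a gym release that still includes
  # the algorithmic Copy-v0 task and that gym_wrapper from this package is
  # importable.
  import gym_wrapper
  env = gym_wrapper.GymWrapper('Copy-v0', distinct=1, count=1)
  paths = sample_expert_paths(2, 'Copy-v0', env.env_spec)
  # Each path is [observations, actions, rewards, terminated], matching the
  # return value of sample_expert_path above.
  assert len(paths) == 2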
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives for full-episode.
Implementations of UREX & REINFORCE. Note that these implementations
use a non-parametric baseline to reduce variance. Thus, multiple
samples with the same seed must be taken from the environment.
"""
import tensorflow as tf
import objective
class Reinforce(objective.Objective):
def __init__(self, learning_rate, clip_norm, num_samples,
tau=0.1, bonus_weight=1.0):
super(Reinforce, self).__init__(learning_rate, clip_norm=clip_norm)
self.num_samples = num_samples
assert self.num_samples > 1
self.tau = tau
self.bonus_weight = bonus_weight
self.eps_lambda = 0.0
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
return -self.tau * total_log_probs
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
seq_length = tf.shape(rewards)[0]
not_pad = tf.reshape(1 - pads, [seq_length, -1, self.num_samples])
rewards = not_pad * tf.reshape(rewards, [seq_length, -1, self.num_samples])
log_probs = not_pad * tf.reshape(sum(log_probs), [seq_length, -1, self.num_samples])
total_rewards = tf.reduce_sum(rewards, 0)
total_log_probs = tf.reduce_sum(log_probs, 0)
rewards_and_bonus = (total_rewards +
self.bonus_weight *
self.get_bonus(total_rewards, total_log_probs))
baseline = tf.reduce_mean(rewards_and_bonus, 1, keep_dims=True)
loss = -tf.stop_gradient(rewards_and_bonus - baseline) * total_log_probs
loss = tf.reduce_mean(loss)
raw_loss = loss # TODO
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', total_log_probs)
tf.summary.histogram('rewards', total_rewards)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(total_rewards))
tf.summary.scalar('loss', loss)
return loss, raw_loss, baseline, gradient_ops, tf.summary.merge_all()
class UREX(Reinforce):
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
discrepancy = total_rewards / self.tau - total_log_probs
normalized_d = self.num_samples * tf.nn.softmax(discrepancy)
return self.tau * normalized_d
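# Illustrative NumPy sketch of the UREX bonus above (the rewards and log
# probabilities below are made-up numbers): samples whose discrepancy
# r/tau - log_prob is largest, i.e. under-appreciated samples with high reward
# but low log-probability, receive the largest bonus weight.
import numpy as np

tau = 0.1
total_rewards = np.array([1.0, 1.0, 0.0, 0.0])
total_log_probs = np.array([-2.0, -8.0, -2.0, -8.0])
num_samples = len(total_rewards)

discrepancy = total_rewards / tau - total_log_probs
shifted = discrepancy - np.max(discrepancy)            # stable softmax
normalized_d = num_samples * np.exp(shifted) / np.sum(np.exp(shifted))
bonus = tau * normalized_d
assert np.argmax(bonus) == 1   # the high-reward, low-log-probability sample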
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper around gym env.
Allows for using batches of possibly identically seeded environments.
"""
import gym
import numpy as np
import random
import env_spec
def get_env(env_str):
return gym.make(env_str)
class GymWrapper(object):
def __init__(self, env_str, distinct=1, count=1, seeds=None):
self.distinct = distinct
self.count = count
self.total = self.distinct * self.count
self.seeds = seeds or [random.randint(0, 1e12)
for _ in xrange(self.distinct)]
self.envs = []
for seed in self.seeds:
for _ in xrange(self.count):
env = get_env(env_str)
env.seed(seed)
if hasattr(env, 'last'):
env.last = 100 # for algorithmic envs
self.envs.append(env)
self.dones = [True] * self.total
self.num_episodes_played = 0
one_env = self.get_one()
self.use_action_list = hasattr(one_env.action_space, 'spaces')
self.env_spec = env_spec.EnvSpec(self.get_one())
def get_seeds(self):
return self.seeds
def reset(self):
self.dones = [False] * self.total
self.num_episodes_played += len(self.envs)
# reset seeds to be synchronized
self.seeds = [random.randint(0, 1e12) for _ in xrange(self.distinct)]
counter = 0
for seed in self.seeds:
for _ in xrange(self.count):
self.envs[counter].seed(seed)
counter += 1
return [self.env_spec.convert_obs_to_list(env.reset())
for env in self.envs]
def reset_if(self, predicate=None):
if predicate is None:
predicate = self.dones
if self.count != 1:
assert np.all(predicate)
return self.reset()
self.num_episodes_played += sum(predicate)
output = [self.env_spec.convert_obs_to_list(env.reset())
if pred else None
for env, pred in zip(self.envs, predicate)]
for i, pred in enumerate(predicate):
if pred:
self.dones[i] = False
return output
def all_done(self):
return all(self.dones)
def step(self, actions):
def env_step(action):
action = self.env_spec.convert_action_to_gym(action)
obs, reward, done, tt = env.step(action)
obs = self.env_spec.convert_obs_to_list(obs)
return obs, reward, done, tt
actions = zip(*actions)
outputs = [env_step(action)
if not done else (self.env_spec.initial_obs(None), 0, True, None)
for action, env, done in zip(actions, self.envs, self.dones)]
for i, (_, _, done, _) in enumerate(outputs):
self.dones[i] = self.dones[i] or done
obs, reward, done, tt = zip(*outputs)
obs = [list(oo) for oo in zip(*obs)]
return [obs, reward, done, tt]
def get_one(self):
return random.choice(self.envs)
def __len__(self):
return len(self.envs)
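if __name__ == '__main__':
  # Illustrative sketch (assumes a gym release that still includes the
  # algorithmic DuplicatedInput-v0 task): with distinct=1 and count=10, all
  # ten copies share one seed on every reset, which is what the full-episode
  # REINFORCE/UREX objectives rely on for their non-parametric baseline.
  batch = GymWrapper('DuplicatedInput-v0', distinct=1, count=10)
  first_obs = batch.reset()
  assert len(batch) == 10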
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model is responsible for setting up Tensorflow graph.
Creates policy and value networks. Also sets up all optimization
ops, including gradient ops, trust region ops, and value optimizers.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class Model(object):
def __init__(self, env_spec, global_step,
target_network_lag=0.95,
sample_from='online',
get_policy=None,
get_baseline=None,
get_objective=None,
get_trust_region_p_opt=None,
get_value_opt=None):
self.env_spec = env_spec
self.global_step = global_step
self.inc_global_step = self.global_step.assign_add(1)
self.target_network_lag = target_network_lag
self.sample_from = sample_from
self.policy = get_policy()
self.baseline = get_baseline()
self.objective = get_objective()
self.baseline.eps_lambda = self.objective.eps_lambda # TODO: do this better
self.trust_region_policy_opt = get_trust_region_p_opt()
self.value_opt = get_value_opt()
def setup_placeholders(self):
"""Create the Tensorflow placeholders."""
# summary placeholder
self.avg_episode_reward = tf.placeholder(
tf.float32, [], 'avg_episode_reward')
# sampling placeholders
self.internal_state = tf.placeholder(tf.float32,
[None, self.policy.rnn_state_dim],
'internal_state')
self.single_observation = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.single_observation.append(
tf.placeholder(tf.int32, [None], 'obs%d' % i))
elif self.env_spec.is_box(obs_type):
self.single_observation.append(
tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
else:
assert False
self.single_action = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.single_action.append(
tf.placeholder(tf.int32, [None], 'act%d' % i))
elif self.env_spec.is_box(action_type):
self.single_action.append(
tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
else:
assert False
# training placeholders
self.observations = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.observations.append(
tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
else:
self.observations.append(
tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))
self.actions = []
self.other_logits = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.actions.append(
tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
if self.env_spec.is_box(action_type):
self.actions.append(
tf.placeholder(tf.float32, [None, None, action_dim],
'all_act%d' % i))
self.other_logits.append(
tf.placeholder(tf.float32, [None, None, None],
'other_logits%d' % i))
self.rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
self.terminated = tf.placeholder(tf.float32, [None], 'terminated')
self.pads = tf.placeholder(tf.float32, [None, None], 'pads')
self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
'prev_log_probs')
def setup(self):
"""Setup Tensorflow Graph."""
self.setup_placeholders()
tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
with tf.variable_scope('model', reuse=None):
# policy network
with tf.variable_scope('policy_net'):
(self.policy_internal_states, self.logits, self.log_probs,
self.entropies, self.self_kls) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
self.out_log_probs = sum(self.log_probs)
self.kl = self.policy.calculate_kl(self.other_logits, self.logits)
self.avg_kl = (tf.reduce_sum(sum(self.kl)[:-1] * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
# value network
with tf.variable_scope('value_net'):
(self.values,
self.regression_input,
self.regression_weight) = self.baseline.get_values(
self.observations, self.actions,
self.policy_internal_states, self.logits)
# target policy network
with tf.variable_scope('target_policy_net'):
(self.target_policy_internal_states,
self.target_logits, self.target_log_probs,
_, _) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
# target value network
with tf.variable_scope('target_value_net'):
(self.target_values, _, _) = self.baseline.get_values(
self.observations, self.actions,
self.target_policy_internal_states, self.target_logits)
# construct copy op online --> target
all_vars = tf.trainable_variables()
online_vars = [p for p in all_vars if
'/policy_net' in p.name or '/value_net' in p.name]
target_vars = [p for p in all_vars if
'target_policy_net' in p.name or 'target_value_net' in p.name]
online_vars.sort(key=lambda p: p.name)
target_vars.sort(key=lambda p: p.name)
aa = self.target_network_lag
self.copy_op = tf.group(*[
target_p.assign(aa * target_p + (1 - aa) * online_p)
for online_p, target_p in zip(online_vars, target_vars)])
# evaluate objective
(self.loss, self.raw_loss, self.regression_target,
self.gradient_ops, self.summary) = self.objective.get(
self.rewards, self.pads,
self.values[:-1, :],
self.values[-1, :] * (1 - self.terminated),
self.log_probs, self.prev_log_probs, self.target_log_probs,
self.entropies,
self.logits)
self.regression_target = tf.reshape(self.regression_target, [-1])
self.policy_vars = [
v for v in tf.trainable_variables()
if '/policy_net' in v.name]
self.value_vars = [
v for v in tf.trainable_variables()
if '/value_net' in v.name]
# trust region optimizer
if self.trust_region_policy_opt is not None:
with tf.variable_scope('trust_region_policy', reuse=None):
avg_self_kl = (
tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
self.trust_region_policy_opt.setup(
self.policy_vars, self.raw_loss, avg_self_kl,
self.avg_kl)
# value optimizer
if self.value_opt is not None:
with tf.variable_scope('trust_region_value', reuse=None):
self.value_opt.setup(
self.value_vars,
tf.reshape(self.values[:-1, :], [-1]),
self.regression_target,
tf.reshape(self.pads, [-1]),
self.regression_input, self.regression_weight)
# we re-use variables for the sampling operations
with tf.variable_scope('model', reuse=True):
scope = ('target_policy_net' if self.sample_from == 'target'
else 'policy_net')
with tf.variable_scope(scope):
self.next_internal_state, self.sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action)
self.greedy_next_internal_state, self.greedy_sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action,
greedy=True)
def sample_step(self, sess,
single_observation, internal_state, single_action,
greedy=False):
"""Sample batch of steps from policy."""
if greedy:
outputs = [self.greedy_next_internal_state, self.greedy_sampled_actions]
else:
outputs = [self.next_internal_state, self.sampled_actions]
feed_dict = {self.internal_state: internal_state}
for action_place, action in zip(self.single_action, single_action):
feed_dict[action_place] = action
for obs_place, obs in zip(self.single_observation, single_observation):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def train_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train network using standard gradient descent."""
outputs = [self.raw_loss, self.gradient_ops, self.summary]
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def trust_region_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train policy using trust region step."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
(prev_log_probs, prev_logits) = sess.run(
[self.out_log_probs, self.logits], feed_dict=feed_dict)
feed_dict[self.prev_log_probs] = prev_log_probs
for other_logit, prev_logit in zip(self.other_logits, prev_logits):
feed_dict[other_logit] = prev_logit
# fit policy
self.trust_region_policy_opt.optimize(sess, feed_dict)
ret = sess.run([self.raw_loss, self.summary], feed_dict=feed_dict)
ret = [ret[0], None, ret[1]]
return ret
def fit_values(self, sess,
observations, internal_state, actions,
rewards, terminated, pads):
"""Train value network using value-specific optimizer."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
# fit values
if self.value_opt is None:
raise ValueError('Specific value optimizer does not exist')
self.value_opt.optimize(sess, feed_dict)
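# Illustrative NumPy sketch of the soft target-network update built by setup()
# above (the lag and parameter values below are made up): each run of copy_op
# moves the target parameters a fraction (1 - lag) toward the online ones.
import numpy as np

lag = 0.99                        # target_network_lag
online = np.array([1.0, -2.0])
target = np.zeros(2)
for _ in range(100):              # mirrors the repeated copy_op runs at step 0
  target = lag * target + (1 - lag) * online
assert np.allclose(target, online * (1 - lag ** 100))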
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives to compute loss and value targets.
Implements Actor Critic, PCL (vanilla PCL, Unified PCL, Trust PCL), and TRPO.
"""
import tensorflow as tf
import numpy as np
class Objective(object):
def __init__(self, learning_rate, clip_norm):
self.learning_rate = learning_rate
self.clip_norm = clip_norm
def get_optimizer(self, learning_rate):
"""Optimizer for gradient descent ops."""
return tf.train.AdamOptimizer(learning_rate=learning_rate,
epsilon=2e-4)
def training_ops(self, loss, learning_rate=None):
"""Gradient ops."""
opt = self.get_optimizer(learning_rate)
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
if self.clip_norm:
grads, global_norm = tf.clip_by_global_norm(grads, self.clip_norm)
tf.summary.scalar('grad_global_norm', global_norm)
return opt.apply_gradients(zip(grads, params))
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
"""Get objective calculations."""
raise NotImplementedError()
def discounted_future_sum(values, discount, rollout):
"""Discounted future sum of time-major values."""
discount_filter = tf.reshape(
discount ** tf.range(float(rollout)), [-1, 1, 1])
expanded_values = tf.concat(
[values, tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
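# Illustrative NumPy reference for discounted_future_sum above (the array
# below is a made-up example):
# out[t, b] = sum_{k < rollout} discount**k * values[t + k, b],
# with values treated as zero past the final step.
import numpy as np

def discounted_future_sum_np(values, discount, rollout):
  time, batch = values.shape
  padded = np.concatenate([values, np.zeros([rollout - 1, batch])], 0)
  return np.stack([
      sum(discount ** k * padded[t + k] for k in range(rollout))
      for t in range(time)], 0)

vals = np.arange(6.0).reshape(3, 2)          # [[0, 1], [2, 3], [4, 5]]
out = discounted_future_sum_np(vals, 0.9, 2)
assert np.isclose(out[0, 0], 0.0 + 0.9 * 2.0)
assert np.isclose(out[2, 1], 5.0)            # nothing left to look ahead to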
def discounted_two_sided_sum(values, discount, rollout):
"""Discounted two-sided sum of time-major values."""
roll = float(rollout)
discount_filter = tf.reshape(
discount ** tf.abs(tf.range(-roll + 1, roll)), [-1, 1, 1])
expanded_values = tf.concat(
[tf.zeros([rollout - 1, tf.shape(values)[1]]), values,
tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
def shift_values(values, discount, rollout, final_values=0.0):
"""Shift values up by some amount of time.
Those values that shift from a value beyond the last value
are calculated using final_values.
"""
roll_range = tf.cumsum(tf.ones_like(values[:rollout, :]), 0,
exclusive=True, reverse=True)
final_pad = tf.expand_dims(final_values, 0) * discount ** roll_range
return tf.concat([discount ** rollout * values[rollout:, :],
final_pad], 0)
class ActorCritic(Objective):
"""Standard Actor-Critic."""
def __init__(self, learning_rate, clip_norm=5,
policy_weight=1.0, critic_weight=0.1,
tau=0.1, gamma=1.0, rollout=10,
eps_lambda=0.0, clip_adv=None):
super(ActorCritic, self).__init__(learning_rate, clip_norm=clip_norm)
self.policy_weight = policy_weight
self.critic_weight = critic_weight
self.tau = tau
self.gamma = gamma
self.rollout = rollout
self.clip_adv = clip_adv
self.eps_lambda = tf.get_variable( # TODO: need a better way
'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda))
self.new_eps_lambda = tf.placeholder(tf.float32, [])
self.assign_eps_lambda = self.eps_lambda.assign(
0.95 * self.eps_lambda + 0.05 * self.new_eps_lambda)
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
entropy = not_pad * sum(entropies)
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * log_probs
critic_loss = -adv * baseline_values
regularizer = -self.tau * entropy
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
regularizer = tf.reduce_mean(
tf.reduce_sum(regularizer * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss + regularizer)
raw_loss = tf.reduce_mean( # TODO
tf.reduce_sum(not_pad * policy_loss, 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
tf.summary.scalar('critic_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
class PCL(ActorCritic):
"""PCL implementation.
Implements vanilla PCL, Unified PCL, and Trust PCL depending
on provided inputs.
"""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
target_log_probs = not_pad * tf.stop_gradient(sum(target_log_probs))
relative_log_probs = not_pad * (log_probs - target_log_probs)
# Prepend.
not_pad = tf.concat([tf.ones([self.rollout - 1, batch_size]),
not_pad], 0)
rewards = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
rewards], 0)
value_estimates = tf.concat(
[self.gamma ** tf.expand_dims(
tf.range(float(self.rollout - 1), 0, -1), 1) *
tf.ones([self.rollout - 1, batch_size]) *
value_estimates[0:1, :],
value_estimates], 0)
log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
log_probs], 0)
prev_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
prev_log_probs], 0)
relative_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
relative_log_probs], 0)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
sum_log_probs = discounted_future_sum(log_probs, self.gamma, self.rollout)
sum_prev_log_probs = discounted_future_sum(prev_log_probs, self.gamma, self.rollout)
sum_relative_log_probs = discounted_future_sum(
relative_log_probs, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = (
- self.tau * sum_log_probs
- self.eps_lambda * sum_relative_log_probs
+ sum_rewards + last_values)
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * sum_log_probs
critic_loss = -adv * (baseline_values - last_values)
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
# actual quantity we're trying to minimize
raw_loss = tf.reduce_mean(
tf.reduce_sum(not_pad * adv * (-baseline_values + future_values), 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.histogram('future_values', future_values)
tf.summary.histogram('baseline_values', baseline_values)
tf.summary.histogram('advantages', adv)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss', policy_loss)
tf.summary.scalar('critic_loss', critic_loss)
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', tf.reduce_mean(raw_loss))
tf.summary.scalar('eps_lambda', self.eps_lambda)
return (loss, raw_loss,
future_values[self.rollout - 1:, :],
gradient_ops, tf.summary.merge_all())
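# --- Illustrative sketch (not part of the original objective.py) ---
# A minimal numpy version of the d-step soft-consistency target that PCL.get
# assembles with discounted_future_sum / shift_values, for a single un-padded
# episode with eps_lambda = 0 (vanilla PCL). The helper name is hypothetical
# and exists only for this illustration.
import numpy as np

def toy_pcl_targets(rewards, log_probs, values, final_value,
                    gamma=0.9, tau=0.025, rollout=3):
  """Per-step targets R_t and consistency errors R_t - V(s_t)."""
  num_steps = len(rewards)
  def value_at(t):  # bootstrap past the episode end with final_value
    return values[t] if t < num_steps else final_value
  targets, errors = [], []
  for t in range(num_steps):
    d = min(rollout, num_steps - t)
    disc_rewards = sum(gamma ** i * rewards[t + i] for i in range(d))
    disc_log_probs = sum(gamma ** i * log_probs[t + i] for i in range(d))
    target = disc_rewards - tau * disc_log_probs + gamma ** d * value_at(t + d)
    targets.append(target)
    errors.append(target - values[t])
  return np.array(targets), np.array(errors)

# The policy gradient weights each window's log-probs by the (stop-gradient)
# consistency error, while the critic regresses toward the bootstrapped target.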
class TRPO(ActorCritic):
"""TRPO."""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
prev_log_probs = not_pad * prev_log_probs
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * tf.exp(log_probs - prev_log_probs)
critic_loss = -adv * baseline_values
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
raw_loss = policy_loss
# loss for gradient calculation
if self.policy_weight == 0:
policy_loss = 0.0
elif self.critic_weight == 0:
critic_loss = 0.0
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss', policy_loss)
tf.summary.scalar('critic_loss', critic_loss)
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
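# --- Illustrative sketch (not part of the original objective.py) ---
# TRPO.get above uses the standard surrogate: advantages weighted by the
# importance ratio pi_new(a|s) / pi_old(a|s) = exp(log_prob - prev_log_prob),
# optionally clipped to [-clip_adv, clip_adv]. A numpy equivalent for a batch
# of steps, up to the sum/mean aggregation used above (hypothetical helper):
import numpy as np

def toy_trpo_surrogate(advantages, log_probs, prev_log_probs, clip_adv=0.0):
  adv = np.asarray(advantages, dtype=np.float64)
  if clip_adv:
    adv = np.clip(adv, -clip_adv, clip_adv)
  ratio = np.exp(np.asarray(log_probs) - np.asarray(prev_log_probs))
  # Loss to minimize; the trust-region step then bounds the KL change.
  return -np.mean(adv * ratio)

# Example: toy_trpo_surrogate([1.0, -0.5], [-0.9, -1.2], [-1.0, -1.0],
#                             clip_adv=1.0)
# A positive advantage together with a ratio > 1 lowers the loss, so the
# optimizer pushes probability toward actions that beat the baseline.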
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizers mostly for value estimate.
Gradient Descent optimizer
LBFGS optimizer
Best Fit optimizer
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import scipy.optimize
def var_size(v):
return int(np.prod([int(d) for d in v.shape]))
def gradients(loss, var_list):
grads = tf.gradients(loss, var_list)
return [g if g is not None else tf.zeros(v.shape)
for g, v in zip(grads, var_list)]
def flatgrad(loss, var_list):
grads = gradients(loss, var_list)
return tf.concat([tf.reshape(grad, [-1])
for (v, grad) in zip(var_list, grads)
if grad is not None], 0)
def get_flat(var_list):
return tf.concat([tf.reshape(v, [-1]) for v in var_list], 0)
def set_from_flat(var_list, flat_theta):
assigns = []
shapes = [v.shape for v in var_list]
sizes = [var_size(v) for v in var_list]
start = 0
for (shape, size, v) in zip(shapes, sizes, var_list):
assigns.append(v.assign(
tf.reshape(flat_theta[start:start + size], shape)))
start += size
assert start == sum(sizes)
return tf.group(*assigns)
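# --- Usage sketch (not part of the original module) ---
# Round trip through get_flat / set_from_flat, assuming a TF 1.x graph and
# Session. A flat theta vector holds every variable in var_list back to back,
# which is the representation the optimizers below work in. The function and
# variable names here are hypothetical, for illustration only.
def _example_flat_roundtrip():
  w = tf.get_variable('sketch_w', [2, 3], initializer=tf.zeros_initializer())
  b = tf.get_variable('sketch_b', [3], initializer=tf.zeros_initializer())
  flat_theta_ph = tf.placeholder(tf.float32, [None])
  flatten = get_flat([w, b])
  assign_flat = set_from_flat([w, b], flat_theta_ph)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    new_theta = np.arange(9, dtype=np.float32)  # 2*3 + 3 parameters, flattened
    sess.run(assign_flat, feed_dict={flat_theta_ph: new_theta})
    assert np.allclose(sess.run(flatten), new_theta)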
class LbfgsOptimization(object):
def __init__(self, max_iter=25, mix_frac=1.0):
self.max_iter = max_iter
self.mix_frac = mix_frac
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
self.loss_flat_gradient = flatgrad(self.raw_loss, var_list)
self.flat_vars = get_flat(var_list)
self.set_vars = set_from_flat(var_list, self.flat_theta)
def optimize(self, sess, feed_dict):
old_theta = sess.run(self.flat_vars)
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
def calc_loss_and_grad(theta):
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
loss, grad = sess.run([self.raw_loss, self.loss_flat_gradient],
feed_dict=feed_dict)
grad = grad.astype('float64')
return loss, grad
theta, _, _ = scipy.optimize.fmin_l_bfgs_b(
calc_loss_and_grad, old_theta, maxiter=self.max_iter)
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
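# --- Usage sketch (not part of the original module) ---
# scipy.optimize.fmin_l_bfgs_b expects a callable returning (loss, grad) with
# a float64 gradient, which is the contract calc_loss_and_grad above follows.
# A self-contained example on a simple quadratic (hypothetical helper):
def _example_lbfgs_quadratic():
  def loss_and_grad(x):
    diff = x - 1.0
    return 0.5 * np.dot(diff, diff), diff.astype('float64')
  x_opt, _, _ = scipy.optimize.fmin_l_bfgs_b(
      loss_and_grad, np.zeros(3), maxiter=25)
  assert np.allclose(x_opt, np.ones(3), atol=1e-4)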
class GradOptimization(object):
def __init__(self, learning_rate=0.001, max_iter=25, mix_frac=1.0):
self.learning_rate = learning_rate
self.max_iter = max_iter
self.mix_frac = mix_frac
def get_optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.learning_rate,
epsilon=2e-4)
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
opt = self.get_optimizer()
params = var_list
grads = tf.gradients(self.raw_loss, params)
self.gradient_ops = opt.apply_gradients(zip(grads, params))
def optimize(self, sess, feed_dict):
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
for _ in xrange(self.max_iter):
sess.run(self.gradient_ops, feed_dict=feed_dict)
class BestFitOptimization(object):
def __init__(self, mix_frac=1.0):
self.mix_frac = mix_frac
def setup_placeholders(self):
self.new_regression_weight = tf.placeholder(
tf.float32, self.regression_weight.shape)
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.values = values
self.targets = targets
self.inputs = inputs
self.regression_weight = regression_weight
self.setup_placeholders()
self.update_regression_weight = tf.assign(
self.regression_weight, self.new_regression_weight)
def optimize(self, sess, feed_dict):
reg_input, reg_weight, old_values, targets = sess.run(
[self.inputs, self.regression_weight, self.values, self.targets],
feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
# taken from rllab
reg_coeff = 1e-5
for _ in range(5):
best_fit_weight = np.linalg.lstsq(
reg_input.T.dot(reg_input) +
reg_coeff * np.identity(reg_input.shape[1]),
reg_input.T.dot(intended_values))[0]
if not np.any(np.isnan(best_fit_weight)):
break
reg_coeff *= 10
if len(best_fit_weight.shape) == 1:
best_fit_weight = np.expand_dims(best_fit_weight, -1)
sess.run(self.update_regression_weight,
feed_dict={self.new_regression_weight: best_fit_weight})
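# --- Illustrative sketch (not part of the original module) ---
# The closed-form update above is ridge regression on the value features:
# solve (X^T X + reg_coeff * I) w = X^T y, escalating reg_coeff if the solve
# produces NaNs. A minimal numpy check on noise-free data (hypothetical names):
def _example_best_fit(seed=0, reg_coeff=1e-5):
  rng = np.random.RandomState(seed)
  features = rng.randn(100, 5)               # regression inputs
  true_weight = rng.randn(5)
  intended_values = features.dot(true_weight)
  weight = np.linalg.lstsq(
      features.T.dot(features) + reg_coeff * np.identity(5),
      features.T.dot(intended_values))[0]
  assert np.allclose(weight, true_weight, atol=1e-4)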
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Policy neural network.
Implements a network that takes in inputs and produces actions
and log probabilities given a sampling distribution parameterization.
"""
import tensorflow as tf
import numpy as np
class Policy(object):
def __init__(self, env_spec, internal_dim,
fixed_std=True, recurrent=True,
input_prev_actions=True):
self.env_spec = env_spec
self.internal_dim = internal_dim
self.rnn_state_dim = self.internal_dim
self.fixed_std = fixed_std
self.recurrent = recurrent
self.input_prev_actions = input_prev_actions
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
self.vector_init = tf.constant_initializer(0.0)
@property
def input_dim(self):
return (self.env_spec.total_obs_dim +
self.env_spec.total_sampled_act_dim * self.input_prev_actions)
@property
def output_dim(self):
return self.env_spec.total_sampling_act_dim
def get_cell(self):
"""Get RNN cell."""
self.cell_input_dim = self.internal_dim // 2
cell = tf.contrib.rnn.LSTMCell(self.cell_input_dim,
state_is_tuple=False,
reuse=tf.get_variable_scope().reuse)
cell = tf.contrib.rnn.OutputProjectionWrapper(
cell, self.output_dim,
reuse=tf.get_variable_scope().reuse)
return cell
def core(self, obs, prev_internal_state, prev_actions):
"""Core neural network taking in inputs and outputting sampling
distribution parameters."""
batch_size = tf.shape(obs[0])[0]
if not self.recurrent:
prev_internal_state = tf.zeros([batch_size, self.rnn_state_dim])
cell = self.get_cell()
b = tf.get_variable('input_bias', [self.cell_input_dim],
initializer=self.vector_init)
cell_input = tf.nn.bias_add(tf.zeros([batch_size, self.cell_input_dim]), b)
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
w = tf.get_variable('w_state%d' % i, [obs_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(obs_type):
cell_input += tf.matmul(tf.one_hot(obs[i], obs_dim), w)
elif self.env_spec.is_box(obs_type):
cell_input += tf.matmul(obs[i], w)
else:
assert False
if self.input_prev_actions:
if self.env_spec.combine_actions: # TODO(ofir): clean this up
prev_action = prev_actions[0]
for i, action_dim in enumerate(self.env_spec.orig_act_dims):
act = tf.mod(prev_action, action_dim)
w = tf.get_variable('w_prev_action%d' % i, [action_dim, self.cell_input_dim],
initializer=self.matrix_init)
cell_input += tf.matmul(tf.one_hot(act, action_dim), w)
prev_action = tf.to_int32(prev_action / action_dim)
else:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
w = tf.get_variable('w_prev_action%d' % i, [act_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(act_type):
cell_input += tf.matmul(tf.one_hot(prev_actions[i], act_dim), w)
elif self.env_spec.is_box(act_type):
cell_input += tf.matmul(prev_actions[i], w)
else:
assert False
output, next_state = cell(cell_input, prev_internal_state)
return output, next_state
def sample_action(self, logits, sampling_dim,
act_dim, act_type, greedy=False):
"""Sample an action from a distribution."""
if self.env_spec.is_discrete(act_type):
if greedy:
act = tf.argmax(logits, 1)
else:
act = tf.reshape(tf.multinomial(logits, 1), [-1])
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
if greedy:
act = means
else:
batch_size = tf.shape(logits)[0]
act = means + std * tf.random_normal([batch_size, act_dim])
else:
assert False
return act
def entropy(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate entropy of distribution."""
if self.env_spec.is_discrete(act_type):
entropy = tf.reduce_sum(
-tf.nn.softmax(logits) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
entropy = tf.reduce_sum(
0.5 * (1 + tf.log(2 * np.pi * tf.square(std))), -1)
else:
assert False
return entropy
def self_kl(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate KL of distribution with itself.
Used later only for its gradients (its value is identically zero).
"""
if self.env_spec.is_discrete(act_type):
probs = tf.nn.softmax(logits)
log_probs = tf.nn.log_softmax(logits)
self_kl = tf.reduce_sum(
tf.stop_gradient(probs) *
(tf.stop_gradient(log_probs) - log_probs), -1)
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
my_means = tf.stop_gradient(means)
my_std = tf.stop_gradient(std)
self_kl = tf.reduce_sum(
tf.log(std / my_std) +
(tf.square(my_std) + tf.square(my_means - means)) /
(2.0 * tf.square(std)) - 0.5,
-1)
else:
assert False
return self_kl
def log_prob_action(self, action, logits,
sampling_dim, act_dim, act_type):
"""Calculate log-prob of action sampled from distribution."""
if self.env_spec.is_discrete(act_type):
act_log_prob = tf.reduce_sum(
tf.one_hot(action, act_dim) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
act_log_prob = (- 0.5 * tf.log(2 * np.pi * tf.square(std))
- 0.5 * tf.square(action - means) / tf.square(std))
act_log_prob = tf.reduce_sum(act_log_prob, -1)
else:
assert False
return act_log_prob
def sample_actions(self, output, actions=None, greedy=False):
"""Sample all actions given output of core network."""
sampled_actions = []
logits = []
log_probs = []
entropy = []
self_kl = []
start_idx = 0
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
if self.fixed_std and self.env_spec.is_box(act_type):
act_logits = output[:, start_idx:start_idx + act_dim]
log_std = tf.get_variable('std%d' % i, [1, sampling_dim // 2])
# fix standard deviations to variable
act_logits = tf.concat(
[act_logits,
1e-6 + tf.exp(log_std) + 0 * act_logits], 1)
else:
act_logits = output[:, start_idx:start_idx + sampling_dim]
if actions is None:
act = self.sample_action(act_logits, sampling_dim,
act_dim, act_type,
greedy=greedy)
else:
act = actions[i]
ent = self.entropy(act_logits, sampling_dim, act_dim, act_type)
kl = self.self_kl(act_logits, sampling_dim, act_dim, act_type)
act_log_prob = self.log_prob_action(
act, act_logits,
sampling_dim, act_dim, act_type)
sampled_actions.append(act)
logits.append(act_logits)
log_probs.append(act_log_prob)
entropy.append(ent)
self_kl.append(kl)
start_idx += sampling_dim
assert start_idx == self.env_spec.total_sampling_act_dim
return sampled_actions, logits, log_probs, entropy, self_kl
def get_kl(self, my_logits, other_logits):
"""Calculate KL between one policy output and another."""
kl = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
single_my_logits = my_logits[i]
single_other_logits = other_logits[i]
if self.env_spec.is_discrete(act_type):
my_probs = tf.nn.softmax(single_my_logits)
my_log_probs = tf.nn.log_softmax(single_my_logits)
other_log_probs = tf.nn.log_softmax(single_other_logits)
my_kl = tf.reduce_sum(my_probs * (my_log_probs - other_log_probs), -1)
elif self.env_spec.is_box(act_type):
my_means = single_my_logits[:, :sampling_dim // 2]
my_std = single_my_logits[:, sampling_dim // 2:]
other_means = single_other_logits[:, :sampling_dim // 2]
other_std = single_other_logits[:, sampling_dim // 2:]
my_kl = tf.reduce_sum(
tf.log(other_std / my_std) +
(tf.square(my_std) + tf.square(my_means - other_means)) /
(2.0 * tf.square(other_std)) - 0.5,
-1)
else:
assert False
kl.append(my_kl)
return kl
def single_step(self, prev, cur, greedy=False):
"""Single RNN step. Equivalently, single-time-step sampled actions."""
prev_internal_state, prev_actions, _, _, _, _ = prev
obs, actions = cur # state observed and action taken at this time step
# feed into RNN cell
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(
(prev_internal_state, prev_actions, None, None, None, None),
(obs, None), greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
initial_actions = [act[0] for act in all_actions]
all_actions = [tf.concat([act[1:], act[0:1]], 0)
for act in all_actions] # "final" action is dummy
(internal_states, _, logits, log_probs,
entropies, self_kls) = tf.scan(
self.single_step,
(all_obs, all_actions),
initializer=self.get_initializer(
batch_size, initial_state, initial_actions))
# remove "final" computations
log_probs = [log_prob[:-1] for log_prob in log_probs]
entropies = [entropy[:-1] for entropy in entropies]
self_kls = [self_kl[:-1] for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls
def get_initializer(self, batch_size, initial_state, initial_actions):
"""Get initializer for RNN."""
logits_init = []
log_probs_init = []
for act_dim, act_type in self.env_spec.act_dims_and_types:
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
logits_init.append(tf.zeros([batch_size, sampling_dim]))
log_probs_init.append(tf.zeros([batch_size]))
entropy_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
self_kl_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
return (initial_state,
tuple(initial_actions),
tuple(logits_init), tuple(log_probs_init),
tuple(entropy_init),
tuple(self_kl_init))
def calculate_kl(self, my_logits, other_logits):
"""Calculate KL between one policy and another on batch of episodes."""
batch_size = tf.shape(my_logits[0])[1]
time_length = tf.shape(my_logits[0])[0]
reshaped_my_logits = [
tf.reshape(my_logit, [batch_size * time_length, -1])
for my_logit in my_logits]
reshaped_other_logits = [
tf.reshape(other_logit, [batch_size * time_length, -1])
for other_logit in other_logits]
kl = self.get_kl(reshaped_my_logits, reshaped_other_logits)
kl = [tf.reshape(kkl, [time_length, batch_size])
for kkl in kl]
return kl
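# --- Illustrative sketch (not part of the original policy.py) ---
# Numeric sanity checks of the Gaussian formulas used by entropy(),
# log_prob_action() and get_kl() above, against scipy.stats.norm and a
# brute-force integral. The helper name is hypothetical, for illustration only.
def _check_gaussian_formulas(mean=0.3, std=1.7, action=-0.2,
                             other_mean=-0.1, other_std=0.8):
  import scipy.stats
  entropy = 0.5 * (1 + np.log(2 * np.pi * std ** 2))
  log_prob = (-0.5 * np.log(2 * np.pi * std ** 2)
              - 0.5 * (action - mean) ** 2 / std ** 2)
  kl = (np.log(other_std / std) +
        (std ** 2 + (mean - other_mean) ** 2) / (2.0 * other_std ** 2) - 0.5)
  dist = scipy.stats.norm(mean, std)
  other = scipy.stats.norm(other_mean, other_std)
  assert np.isclose(entropy, dist.entropy())
  assert np.isclose(log_prob, dist.logpdf(action))
  # KL(p || q) by a fine-grained Riemann sum as an independent reference.
  xs = np.linspace(-15.0, 15.0, 30001)
  dx = xs[1] - xs[0]
  kl_numeric = np.sum(dist.pdf(xs) * (dist.logpdf(xs) - other.logpdf(xs))) * dx
  assert np.isclose(kl, kl_numeric, atol=1e-3)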
class MLPPolicy(Policy):
"""Non-recurrent policy."""
def get_cell(self):
self.cell_input_dim = self.internal_dim
def mlp(cell_input, prev_internal_state):
w1 = tf.get_variable('w1', [self.cell_input_dim, self.internal_dim])
b1 = tf.get_variable('b1', [self.internal_dim])
w2 = tf.get_variable('w2', [self.internal_dim, self.internal_dim])
b2 = tf.get_variable('b2', [self.internal_dim])
proj = tf.get_variable(
'proj', [self.internal_dim, self.output_dim])
hidden = cell_input
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w1), b1))
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w2), b2))
output = tf.matmul(hidden, proj)
return output, hidden
return mlp
def single_step(self, obs, actions, prev_actions, greedy=False):
"""Single step."""
batch_size = tf.shape(obs[0])[0]
prev_internal_state = tf.zeros([batch_size, self.internal_dim])
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(obs, None, prev_actions,
greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
# first reshape inputs as a single batch
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_act = []
reshaped_prev_act = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
act = tf.concat([all_actions[i][1:], all_actions[i][0:1]], 0)
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_act.append(tf.reshape(act, [time_length * batch_size]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_act.append(
tf.reshape(act, [time_length * batch_size, act_dim]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
# now inputs go into single step as one large batch
(internal_states, _, logits, log_probs,
entropies, self_kls) = self.single_step(
reshaped_obs, reshaped_act, reshaped_prev_act)
# reshape the outputs back to original time-major format
internal_states = tf.reshape(internal_states, [time_length, batch_size, -1])
logits = [tf.reshape(logit, [time_length, batch_size, -1])
for logit in logits]
log_probs = [tf.reshape(log_prob, [time_length, batch_size])[:-1]
for log_prob in log_probs]
entropies = [tf.reshape(ent, [time_length, batch_size])[:-1]
for ent in entropies]
self_kls = [tf.reshape(self_kl, [time_length, batch_size])[:-1]
for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls
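# --- Illustrative sketch (not part of the original policy.py) ---
# MLPPolicy.multi_step flattens time-major [time, batch, ...] inputs into one
# large [time * batch, ...] batch, applies a single feed-forward step, and
# reshapes the outputs back. A numpy picture of that round trip (hypothetical
# helper, with an elementwise stand-in for the network):
def _example_time_major_roundtrip(time_length=4, batch_size=3, obs_dim=2):
  obs = np.arange(time_length * batch_size * obs_dim,
                  dtype=np.float32).reshape([time_length, batch_size, obs_dim])
  flat = obs.reshape([time_length * batch_size, obs_dim])  # one big batch
  processed = flat * 2.0                                   # stand-in for the MLP
  back = processed.reshape([time_length, batch_size, obs_dim])
  assert np.allclose(back, obs * 2.0)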
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Replay buffer.
Implements replay buffer in Python.
"""
import random
import numpy as np
class ReplayBuffer(object):
def __init__(self, max_size):
self.max_size = max_size
self.cur_size = 0
self.buffer = {}
self.init_length = 0
def __len__(self):
return self.cur_size
def seed_buffer(self, episodes):
self.init_length = len(episodes)
self.add(episodes, np.ones(self.init_length))
def add(self, episodes, *args):
"""Add episodes to buffer."""
idx = 0
while self.cur_size < self.max_size and idx < len(episodes):
self.buffer[self.cur_size] = episodes[idx]
self.cur_size += 1
idx += 1
if idx < len(episodes):
remove_idxs = self.remove_n(len(episodes) - idx)
for remove_idx in remove_idxs:
self.buffer[remove_idx] = episodes[idx]
idx += 1
assert len(self.buffer) == self.cur_size
def remove_n(self, n):
"""Get n items for removal."""
# random removal
idxs = random.sample(xrange(self.init_length, self.cur_size), n)
return idxs
def get_batch(self, n):
"""Get batch of episodes to train on."""
# random batch
idxs = random.sample(xrange(self.cur_size), n)
return [self.buffer[idx] for idx in idxs], None
def update_last_batch(self, delta):
pass
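# --- Usage sketch (not part of the original replay_buffer.py) ---
# Minimal usage of the uniform ReplayBuffer above, under the Python 2
# environment this module targets (remove_n and get_batch use xrange).
def _example_replay_buffer():
  buf = ReplayBuffer(max_size=3)
  buf.add(['ep0', 'ep1', 'ep2', 'ep3'])  # the fourth episode evicts a random one
  assert len(buf) == 3
  episodes, probs = buf.get_batch(2)     # uniform sampling; probs is None here
  assert len(episodes) == 2 and probs is None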
class PrioritizedReplayBuffer(ReplayBuffer):
def __init__(self, max_size, alpha=0.2,
eviction_strategy='rand'):
self.max_size = max_size
self.alpha = alpha
self.eviction_strategy = eviction_strategy
assert self.eviction_strategy in ['rand', 'fifo', 'rank']
self.remove_idx = 0
self.cur_size = 0
self.buffer = {}
self.priorities = np.zeros(self.max_size)
self.init_length = 0
def __len__(self):
return self.cur_size
def add(self, episodes, priorities, new_idxs=None):
"""Add episodes to buffer."""
if new_idxs is None:
idx = 0
new_idxs = []
while self.cur_size < self.max_size and idx < len(episodes):
self.buffer[self.cur_size] = episodes[idx]
new_idxs.append(self.cur_size)
self.cur_size += 1
idx += 1
if idx < len(episodes):
remove_idxs = self.remove_n(len(episodes) - idx)
for remove_idx in remove_idxs:
self.buffer[remove_idx] = episodes[idx]
new_idxs.append(remove_idx)
idx += 1
else:
assert len(new_idxs) == len(episodes)
for new_idx, ep in zip(new_idxs, episodes):
self.buffer[new_idx] = ep
self.priorities[new_idxs] = priorities
self.priorities[0:self.init_length] = np.max(
self.priorities[self.init_length:])
assert len(self.buffer) == self.cur_size
return new_idxs
def remove_n(self, n):
"""Get n items for removal."""
assert self.init_length + n <= self.cur_size
if self.eviction_strategy == 'rand':
# random removal
idxs = random.sample(xrange(self.init_length, self.cur_size), n)
elif self.eviction_strategy == 'fifo':
# overwrite elements in cyclical fashion
idxs = [
self.init_length +
(self.remove_idx + i) % (self.max_size - self.init_length)
for i in xrange(n)]
self.remove_idx = idxs[-1] + 1 - self.init_length
elif self.eviction_strategy == 'rank':
# remove lowest-priority indices
idxs = np.argpartition(self.priorities, n)[:n]
return idxs
def sampling_distribution(self):
p = self.priorities[:self.cur_size]
p = np.exp(self.alpha * (p - np.max(p)))
norm = np.sum(p)
if norm > 0:
uniform = 0.0
p = p / norm * (1 - uniform) + 1.0 / self.cur_size * uniform
else:
p = np.ones(self.cur_size) / self.cur_size
return p
def get_batch(self, n):
"""Get batch of episodes to train on."""
p = self.sampling_distribution()
idxs = np.random.choice(self.cur_size, size=n, replace=False, p=p)
self.last_batch = idxs
return [self.buffer[idx] for idx in idxs], p[idxs]
def update_last_batch(self, delta):
"""Update last batch idxs with new priority."""
self.priorities[self.last_batch] = np.abs(delta)
self.priorities[0:self.init_length] = np.max(
self.priorities[self.init_length:])
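# --- Illustrative sketch (not part of the original replay_buffer.py) ---
# sampling_distribution() above is a softmax over priorities scaled by alpha:
# alpha = 0 recovers uniform sampling, while larger alpha concentrates
# probability on the highest-priority episodes. A numpy picture of that
# (hypothetical helper):
def _example_priority_softmax(priorities=(1.0, 2.0, 4.0)):
  p = np.asarray(priorities, dtype=np.float64)
  for alpha in (0.0, 0.2, 2.0):
    probs = np.exp(alpha * (p - np.max(p)))
    probs /= np.sum(probs)
    # alpha=0 -> [0.333 0.333 0.333]; alpha=2 -> roughly [0.002 0.018 0.980]
    print('alpha=%g -> %s' % (alpha, np.round(probs, 3)))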
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Trainer for coordinating single or multi-replica training.
Main point of entry for running models. Specifies most of
the parameters used by different algorithms.
"""
import tensorflow as tf
import numpy as np
import random
import os
import pickle
import controller
import model
import policy
import baseline
import objective
import full_episode_objective
import trust_region
import optimizers
import replay_buffer
import expert_paths
import gym_wrapper
import env_spec
app = tf.app
flags = tf.flags
logging = tf.logging
gfile = tf.gfile
FLAGS = flags.FLAGS
flags.DEFINE_string('env', 'Copy-v0', 'environment name')
flags.DEFINE_integer('batch_size', 100, 'batch size')
flags.DEFINE_integer('replay_batch_size', None, 'replay batch size; defaults to batch_size')
flags.DEFINE_integer('num_samples', 1,
'number of samples from each random seed initialization')
flags.DEFINE_integer('max_step', 200, 'max number of steps to train on')
flags.DEFINE_integer('cutoff_agent', 0,
'number of steps at which to cut-off agent. '
'Defaults to always cutoff')
flags.DEFINE_integer('num_steps', 100000, 'number of training steps')
flags.DEFINE_integer('validation_frequency', 100,
'every so many steps, output some stats')
flags.DEFINE_float('target_network_lag', 0.95,
'exponential moving average coefficient applied to the online '
'network to produce the target network')
flags.DEFINE_string('sample_from', 'online',
'Sample actions from "online" network or "target" network')
flags.DEFINE_string('objective', 'pcl',
'pcl/upcl/a3c/trpo/reinforce/urex')
flags.DEFINE_bool('trust_region_p', False,
'use trust region for policy optimization')
flags.DEFINE_string('value_opt', None,
'leave as None to optimize it along with policy '
'(using critic_weight). Otherwise set to '
'"best_fit" (least squares regression), "lbfgs", or "grad"')
flags.DEFINE_float('max_divergence', 0.01,
'max divergence (i.e. KL) to allow during '
'trust region optimization')
flags.DEFINE_float('learning_rate', 0.01, 'learning rate')
flags.DEFINE_float('clip_norm', 5.0, 'clip norm')
flags.DEFINE_float('clip_adv', 0.0, 'Clip advantages at this value. '
'Leave as 0 to not clip at all.')
flags.DEFINE_float('critic_weight', 0.1, 'critic weight')
flags.DEFINE_float('tau', 0.1, 'entropy regularizer. '
'If using decaying tau, this is the final value.')
flags.DEFINE_float('tau_decay', None,
'decay tau by this much every 100 steps')
flags.DEFINE_float('tau_start', 0.1,
'start tau at this value')
flags.DEFINE_float('eps_lambda', 0.0, 'relative entropy regularizer.')
flags.DEFINE_bool('update_eps_lambda', False,
'Update lambda automatically based on last 100 episodes.')
flags.DEFINE_float('gamma', 1.0, 'discount')
flags.DEFINE_integer('rollout', 10, 'rollout')
flags.DEFINE_bool('fixed_std', True,
'fix the std in Gaussian distributions')
flags.DEFINE_bool('input_prev_actions', True,
'input previous actions to policy network')
flags.DEFINE_bool('recurrent', True,
'use recurrent connections')
flags.DEFINE_bool('input_time_step', False,
'input time step into value calculations')
flags.DEFINE_bool('use_online_batch', True, 'train on batches as they are sampled')
flags.DEFINE_bool('batch_by_steps', False,
'ensure each training batch has batch_size * max_step steps')
flags.DEFINE_bool('unify_episodes', False,
'Make sure replay buffer holds entire episodes, '
'even across distinct sampling steps')
flags.DEFINE_integer('replay_buffer_size', 5000, 'replay buffer size')
flags.DEFINE_float('replay_buffer_alpha', 0.5, 'replay buffer alpha param')
flags.DEFINE_integer('replay_buffer_freq', 0,
'replay buffer frequency (only supports -1/0/1)')
flags.DEFINE_string('eviction', 'rand',
'how to evict from replay buffer: rand/rank/fifo')
flags.DEFINE_string('prioritize_by', 'rewards',
'Prioritize replay buffer by "rewards" or "step"')
flags.DEFINE_integer('num_expert_paths', 0,
'number of expert paths to seed replay buffer with')
flags.DEFINE_integer('internal_dim', 256, 'RNN internal dim')
flags.DEFINE_integer('value_hidden_layers', 0,
'number of hidden layers in value estimate')
flags.DEFINE_integer('tf_seed', 42, 'random seed for tensorflow')
flags.DEFINE_string('save_trajectories_dir', None,
'directory to save trajectories to, if desired')
flags.DEFINE_string('load_trajectories_file', None,
'file to load expert trajectories from')
# supervisor flags
flags.DEFINE_bool('supervisor', False, 'use supervisor training')
flags.DEFINE_integer('task_id', 0, 'task id')
flags.DEFINE_integer('ps_tasks', 0, 'number of ps tasks')
flags.DEFINE_integer('num_replicas', 1, 'number of replicas used')
flags.DEFINE_string('master', 'local', 'name of master')
flags.DEFINE_string('save_dir', '', 'directory to save model to')
flags.DEFINE_string('load_path', '', 'path of saved model to load (if none in save_dir)')
class Trainer(object):
"""Coordinates single or multi-replica training."""
def __init__(self):
self.batch_size = FLAGS.batch_size
self.replay_batch_size = FLAGS.replay_batch_size
if self.replay_batch_size is None:
self.replay_batch_size = self.batch_size
self.num_samples = FLAGS.num_samples
self.env_str = FLAGS.env
self.env = gym_wrapper.GymWrapper(self.env_str,
distinct=FLAGS.batch_size // self.num_samples,
count=self.num_samples)
self.env_spec = env_spec.EnvSpec(self.env.get_one())
self.max_step = FLAGS.max_step
self.cutoff_agent = FLAGS.cutoff_agent
self.num_steps = FLAGS.num_steps
self.validation_frequency = FLAGS.validation_frequency
self.target_network_lag = FLAGS.target_network_lag
self.sample_from = FLAGS.sample_from
assert self.sample_from in ['online', 'target']
self.critic_weight = FLAGS.critic_weight
self.objective = FLAGS.objective
self.trust_region_p = FLAGS.trust_region_p
self.value_opt = FLAGS.value_opt
assert not self.trust_region_p or self.objective in ['pcl', 'trpo']
assert self.objective != 'trpo' or self.trust_region_p
assert self.value_opt is None or self.critic_weight == 0.0
self.max_divergence = FLAGS.max_divergence
self.learning_rate = FLAGS.learning_rate
self.clip_norm = FLAGS.clip_norm
self.clip_adv = FLAGS.clip_adv
self.tau = FLAGS.tau
self.tau_decay = FLAGS.tau_decay
self.tau_start = FLAGS.tau_start
self.eps_lambda = FLAGS.eps_lambda
self.update_eps_lambda = FLAGS.update_eps_lambda
self.gamma = FLAGS.gamma
self.rollout = FLAGS.rollout
self.fixed_std = FLAGS.fixed_std
self.input_prev_actions = FLAGS.input_prev_actions
self.recurrent = FLAGS.recurrent
assert not self.trust_region_p or not self.recurrent
self.input_time_step = FLAGS.input_time_step
assert not self.input_time_step or (self.cutoff_agent <= self.max_step)
self.use_online_batch = FLAGS.use_online_batch
self.batch_by_steps = FLAGS.batch_by_steps
self.unify_episodes = FLAGS.unify_episodes
if self.unify_episodes:
assert self.batch_size == 1
self.replay_buffer_size = FLAGS.replay_buffer_size
self.replay_buffer_alpha = FLAGS.replay_buffer_alpha
self.replay_buffer_freq = FLAGS.replay_buffer_freq
assert self.replay_buffer_freq in [-1, 0, 1]
self.eviction = FLAGS.eviction
self.prioritize_by = FLAGS.prioritize_by
assert self.prioritize_by in ['rewards', 'step']
self.num_expert_paths = FLAGS.num_expert_paths
self.internal_dim = FLAGS.internal_dim
self.value_hidden_layers = FLAGS.value_hidden_layers
self.tf_seed = FLAGS.tf_seed
self.save_trajectories_dir = (
FLAGS.save_trajectories_dir or FLAGS.save_dir)
self.save_trajectories_file = (
os.path.join(
self.save_trajectories_dir, self.env_str.replace('-', '_'))
if self.save_trajectories_dir else None)
self.load_trajectories_file = FLAGS.load_trajectories_file
self.hparams = dict((attr, getattr(self, attr))
for attr in dir(self)
if not attr.startswith('__') and
not callable(getattr(self, attr)))
def hparams_string(self):
return '\n'.join('%s: %s' % item for item in sorted(self.hparams.items()))
def get_objective(self):
tau = self.tau
if self.tau_decay is not None:
assert self.tau_start >= self.tau
tau = tf.maximum(
tf.train.exponential_decay(
self.tau_start, self.global_step, 100, self.tau_decay),
self.tau)
if self.objective in ['pcl', 'a3c', 'trpo', 'upcl']:
cls = (objective.PCL if self.objective in ['pcl', 'upcl'] else
objective.TRPO if self.objective == 'trpo' else
objective.ActorCritic)
policy_weight = 1.0
return cls(self.learning_rate,
clip_norm=self.clip_norm,
policy_weight=policy_weight,
critic_weight=self.critic_weight,
tau=tau, gamma=self.gamma, rollout=self.rollout,
eps_lambda=self.eps_lambda, clip_adv=self.clip_adv)
elif self.objective in ['reinforce', 'urex']:
cls = (full_episode_objective.Reinforce
if self.objective == 'reinforce' else
full_episode_objective.UREX)
return cls(self.learning_rate,
clip_norm=self.clip_norm,
num_samples=self.num_samples,
tau=tau, bonus_weight=1.0) # TODO: bonus weight?
else:
assert False, 'Unknown objective %s' % self.objective
def get_policy(self):
if self.recurrent:
cls = policy.Policy
else:
cls = policy.MLPPolicy
return cls(self.env_spec, self.internal_dim,
fixed_std=self.fixed_std,
recurrent=self.recurrent,
input_prev_actions=self.input_prev_actions)
def get_baseline(self):
cls = (baseline.UnifiedBaseline if self.objective == 'upcl' else
baseline.Baseline)
return cls(self.env_spec, self.internal_dim,
input_prev_actions=self.input_prev_actions,
input_time_step=self.input_time_step,
input_policy_state=self.recurrent, # may want to change this
n_hidden_layers=self.value_hidden_layers,
hidden_dim=self.internal_dim,
tau=self.tau)
def get_trust_region_p_opt(self):
if self.trust_region_p:
return trust_region.TrustRegionOptimization(
max_divergence=self.max_divergence)
else:
return None
def get_value_opt(self):
if self.value_opt == 'grad':
return optimizers.GradOptimization(
learning_rate=self.learning_rate, max_iter=5, mix_frac=0.05)
elif self.value_opt == 'lbfgs':
return optimizers.LbfgsOptimization(max_iter=25, mix_frac=0.1)
elif self.value_opt == 'best_fit':
return optimizers.BestFitOptimization(mix_frac=1.0)
else:
return None
def get_model(self):
cls = model.Model
return cls(self.env_spec, self.global_step,
target_network_lag=self.target_network_lag,
sample_from=self.sample_from,
get_policy=self.get_policy,
get_baseline=self.get_baseline,
get_objective=self.get_objective,
get_trust_region_p_opt=self.get_trust_region_p_opt,
get_value_opt=self.get_value_opt)
def get_replay_buffer(self):
if self.replay_buffer_freq <= 0:
return None
else:
assert self.objective in ['pcl', 'upcl'], 'Can\'t use replay buffer with %s' % (
self.objective)
cls = replay_buffer.PrioritizedReplayBuffer
return cls(self.replay_buffer_size,
alpha=self.replay_buffer_alpha,
eviction_strategy=self.eviction)
def get_buffer_seeds(self):
return expert_paths.sample_expert_paths(
self.num_expert_paths, self.env_str, self.env_spec,
load_trajectories_file=self.load_trajectories_file)
def get_controller(self):
"""Get controller."""
cls = controller.Controller
return cls(self.env, self.env_spec, self.internal_dim,
use_online_batch=self.use_online_batch,
batch_by_steps=self.batch_by_steps,
unify_episodes=self.unify_episodes,
replay_batch_size=self.replay_batch_size,
max_step=self.max_step,
cutoff_agent=self.cutoff_agent,
save_trajectories_file=self.save_trajectories_file,
use_trust_region=self.trust_region_p,
use_value_opt=self.value_opt is not None,
update_eps_lambda=self.update_eps_lambda,
prioritize_by=self.prioritize_by,
get_model=self.get_model,
get_replay_buffer=self.get_replay_buffer,
get_buffer_seeds=self.get_buffer_seeds)
def do_before_step(self, step):
pass
def run(self):
"""Run training."""
is_chief = FLAGS.task_id == 0 or not FLAGS.supervisor
sv = None
def init_fn(sess, saver):
ckpt = None
if FLAGS.save_dir and sv is None:
load_dir = FLAGS.save_dir
ckpt = tf.train.get_checkpoint_state(load_dir)
if ckpt and ckpt.model_checkpoint_path:
logging.info('restoring from %s', ckpt.model_checkpoint_path)
saver.restore(sess, ckpt.model_checkpoint_path)
elif FLAGS.load_path:
logging.info('restoring from %s', FLAGS.load_path)
saver.restore(sess, FLAGS.load_path)
if FLAGS.supervisor:
with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks, merge_devices=True)):
self.global_step = tf.contrib.framework.get_or_create_global_step()
tf.set_random_seed(FLAGS.tf_seed)
self.controller = self.get_controller()
self.model = self.controller.model
self.controller.setup()
saver = tf.train.Saver(max_to_keep=10)
step = self.model.global_step
sv = tf.train.Supervisor(logdir=FLAGS.save_dir,
is_chief=is_chief,
saver=saver,
save_model_secs=600,
summary_op=None, # we define it ourselves
save_summaries_secs=60,
global_step=step,
init_fn=lambda sess: init_fn(sess, saver))
sess = sv.prepare_or_wait_for_session(FLAGS.master)
else:
tf.set_random_seed(FLAGS.tf_seed)
self.global_step = tf.contrib.framework.get_or_create_global_step()
self.controller = self.get_controller()
self.model = self.controller.model
self.controller.setup()
saver = tf.train.Saver(max_to_keep=10)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
init_fn(sess, saver)
self.sv = sv
self.sess = sess
logging.info('hparams:\n%s', self.hparams_string())
model_step = sess.run(self.model.global_step)
if model_step >= self.num_steps:
logging.info('training has reached final step')
return
losses = []
rewards = []
all_ep_rewards = []
for step in xrange(1 + self.num_steps):
if sv is not None and sv.should_stop():
logging.info('stopping supervisor')
break
self.do_before_step(step)
(loss, summary,
total_rewards, episode_rewards) = self.controller.train(sess)
losses.append(loss)
rewards.append(total_rewards)
all_ep_rewards.extend(episode_rewards)
if is_chief and sv is not None and sv._summary_writer:
sv.summary_computed(sess, summary)
model_step = sess.run(self.model.global_step)
if is_chief and step % self.validation_frequency == 0:
logging.info('at training step %d, model step %d: '
'avg loss %f, avg reward %f, '
'episode rewards: %f',
step, model_step,
np.mean(losses), np.mean(rewards),
np.mean(all_ep_rewards))
losses = []
rewards = []
all_ep_rewards = []
if model_step >= self.num_steps:
logging.info('training has reached final step')
break
if is_chief and sv is not None:
logging.info('saving final model to %s', sv.save_path)
sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
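# --- Illustrative sketch (not part of the original trainer.py) ---
# The tau schedule built in Trainer.get_objective is
#   max(tau_start * tau_decay ** (global_step / 100.0), tau),
# i.e. tf.train.exponential_decay without staircasing, floored at the final
# tau. A numpy picture of that schedule (hypothetical helper):
def _example_tau_schedule(tau_start=0.1, tau_decay=0.1, tau=0.001):
  steps = np.arange(0, 401, 100)
  taus = np.maximum(tau_start * tau_decay ** (steps / 100.0), tau)
  # -> [0.1, 0.01, 0.001, 0.001, 0.001]
  return taus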
def main(unused_argv):
logging.set_verbosity(logging.INFO)
trainer = Trainer()
trainer.run()
if __name__ == '__main__':
app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Trust region optimization.
Much of this is adapted from others' code.
See Schulman's Modular RL, wojzaremba's TRPO, etc.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
def var_size(v):
return int(np.prod([int(d) for d in v.shape]))
def gradients(loss, var_list):
grads = tf.gradients(loss, var_list)
return [g if g is not None else tf.zeros(v.shape)
for g, v in zip(grads, var_list)]
def flatgrad(loss, var_list):
grads = gradients(loss, var_list)
return tf.concat([tf.reshape(grad, [-1])
for (v, grad) in zip(var_list, grads)
if grad is not None], 0)
def get_flat(var_list):
return tf.concat([tf.reshape(v, [-1]) for v in var_list], 0)
def set_from_flat(var_list, flat_theta):
assigns = []
shapes = [v.shape for v in var_list]
sizes = [var_size(v) for v in var_list]
start = 0
assigns = []
for (shape, size, v) in zip(shapes, sizes, var_list):
assigns.append(v.assign(
tf.reshape(flat_theta[start:start + size], shape)))
start += size
assert start == sum(sizes)
return tf.group(*assigns)
class TrustRegionOptimization(object):
def __init__(self, max_divergence=0.1, cg_damping=0.1):
self.max_divergence = max_divergence
self.cg_damping = cg_damping
def setup_placeholders(self):
self.flat_tangent = tf.placeholder(tf.float32, [None], 'flat_tangent')
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
def setup(self, var_list, raw_loss, self_divergence,
divergence=None):
self.setup_placeholders()
self.raw_loss = raw_loss
self.divergence = divergence
self.loss_flat_gradient = flatgrad(raw_loss, var_list)
self.divergence_gradient = gradients(self_divergence, var_list)
shapes = [var.shape for var in var_list]
sizes = [var_size(var) for var in var_list]
start = 0
tangents = []
for shape, size in zip(shapes, sizes):
param = tf.reshape(self.flat_tangent[start:start + size], shape)
tangents.append(param)
start += size
assert start == sum(sizes)
self.grad_vector_product = sum(
tf.reduce_sum(g * t) for (g, t) in zip(self.divergence_gradient, tangents))
self.fisher_vector_product = flatgrad(self.grad_vector_product, var_list)
self.flat_vars = get_flat(var_list)
self.set_vars = set_from_flat(var_list, self.flat_theta)
def optimize(self, sess, feed_dict):
old_theta = sess.run(self.flat_vars)
loss_flat_grad = sess.run(self.loss_flat_gradient,
feed_dict=feed_dict)
def calc_fisher_vector_product(tangent):
feed_dict[self.flat_tangent] = tangent
fvp = sess.run(self.fisher_vector_product,
feed_dict=feed_dict)
fvp += self.cg_damping * tangent
return fvp
step_dir = conjugate_gradient(calc_fisher_vector_product, -loss_flat_grad)
shs = 0.5 * step_dir.dot(calc_fisher_vector_product(step_dir))
lm = np.sqrt(shs / self.max_divergence)
fullstep = step_dir / lm
neggdotstepdir = -loss_flat_grad.dot(step_dir)
def calc_loss(theta):
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
if self.divergence is None:
return sess.run(self.raw_loss, feed_dict=feed_dict), True
else:
raw_loss, divergence = sess.run(
[self.raw_loss, self.divergence], feed_dict=feed_dict)
return raw_loss, divergence < self.max_divergence
# find optimal theta
theta = linesearch(calc_loss, old_theta, fullstep, neggdotstepdir / lm)
if self.divergence is not None:
final_divergence = sess.run(self.divergence, feed_dict=feed_dict)
else:
final_divergence = None
# set vars accordingly
if final_divergence is None or final_divergence < self.max_divergence:
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
else:
sess.run(self.set_vars, feed_dict={self.flat_theta: old_theta})
def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
p = b.copy()
r = b.copy()
x = np.zeros_like(b)
rdotr = r.dot(r)
for i in xrange(cg_iters):
z = f_Ax(p)
v = rdotr / p.dot(z)
x += v * p
r -= v * z
newrdotr = r.dot(r)
mu = newrdotr / rdotr
p = r + mu * p
rdotr = newrdotr
if rdotr < residual_tol:
break
return x
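# --- Illustrative sketch (not part of the original trust_region.py) ---
# Sanity check of conjugate_gradient above on a small symmetric positive
# definite system against a direct solve (run under the Python 2 environment
# this module targets, since conjugate_gradient uses xrange). Hypothetical
# helper, for illustration only.
def _example_conjugate_gradient(seed=0):
  rng = np.random.RandomState(seed)
  m = rng.randn(6, 6)
  a = m.dot(m.T) + 6.0 * np.identity(6)   # SPD matrix
  b = rng.randn(6)
  x = conjugate_gradient(lambda v: a.dot(v), b, cg_iters=20)
  assert np.allclose(x, np.linalg.solve(a, b), atol=1e-4)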
def linesearch(f, x, fullstep, expected_improve_rate):
accept_ratio = 0.1
max_backtracks = 10
fval, _ = f(x)
for (_n_backtracks, stepfrac) in enumerate(.5 ** np.arange(max_backtracks)):
xnew = x + stepfrac * fullstep
newfval, valid = f(xnew)
if not valid:
continue
actual_improve = fval - newfval
expected_improve = expected_improve_rate * stepfrac
ratio = actual_improve / expected_improve
if ratio > accept_ratio and actual_improve > 0:
return xnew
return x