Commit dff0f0c1 authored by Alexander Gorban

Merge branch 'master' of github.com:tensorflow/models

parents da341f70 36203f09
......@@ -211,9 +211,15 @@ def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
# Create ops required to initialize the model from a given checkpoint.
init_fn = None
if train_config.fine_tune_checkpoint:
init_fn = detection_model.restore_fn(
train_config.fine_tune_checkpoint,
var_map = detection_model.restore_map(
from_detection_checkpoint=train_config.from_detection_checkpoint)
available_var_map = (variables_helper.
get_variables_available_in_checkpoint(
var_map, train_config.fine_tune_checkpoint))
init_saver = tf.train.Saver(available_var_map)
def initializer_fn(sess):
init_saver.restore(sess, train_config.fine_tune_checkpoint)
init_fn = initializer_fn
with tf.device(deploy_config.optimizer_device()):
total_loss, grads_and_vars = model_deploy.optimize_clones(
......
......@@ -139,21 +139,18 @@ class FakeDetectionModel(model.DetectionModel):
}
return loss_dict
def restore_fn(self, checkpoint_path, from_detection_checkpoint=True):
"""Return callable for loading a checkpoint into the tensorflow graph.
def restore_map(self, from_detection_checkpoint=True):
"""Returns a map of variables to load from a foreign checkpoint.
Args:
checkpoint_path: path to checkpoint to restore.
from_detection_checkpoint: whether to restore from a full detection
checkpoint (with compatible variable names) or to restore from a
classification checkpoint for initialization prior to training.
Returns:
a callable which takes a tf.Session and does nothing.
A dict mapping variable names to variables.
"""
def restore(unused_sess):
return
return restore
return {var.op.name: var for var in tf.global_variables()}
class TrainerTest(tf.test.TestCase):
......
......@@ -120,6 +120,7 @@ py_library(
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow_models/object_detection/core:box_predictor",
"//tensorflow_models/object_detection/core:matcher",
"//tensorflow_models/object_detection/utils:shape_utils"
],
)
......
......@@ -22,6 +22,20 @@ from google.protobuf import text_format
from object_detection.protos import string_int_label_map_pb2
def _validate_label_map(label_map):
"""Checks if a label map is valid.
Args:
label_map: StringIntLabelMap to validate.
Raises:
ValueError: if label map is invalid.
"""
for item in label_map.item:
if item.id < 1:
raise ValueError('Label map ids should be >= 1.')
def create_category_index(categories):
"""Creates dictionary of COCO compatible categories keyed by category id.
......@@ -61,7 +75,7 @@ def convert_label_map_to_categories(label_map,
list is created with max_num_classes categories.
max_num_classes: maximum number of (consecutive) label indices to include.
use_display_name: (boolean) choose whether to load 'display_name' field
as category name. If False of if the display_name field does not exist,
as category name. If False or if the display_name field does not exist,
uses 'name' field as category names instead.
Returns:
categories: a list of dictionaries representing all possible categories.
......@@ -91,7 +105,6 @@ def convert_label_map_to_categories(label_map,
return categories
# TODO: double check documentaion.
def load_labelmap(path):
"""Loads label map proto.
......@@ -107,6 +120,7 @@ def load_labelmap(path):
text_format.Merge(label_map_string, label_map)
except text_format.ParseError:
label_map.ParseFromString(label_map_string)
_validate_label_map(label_map)
return label_map
......
......@@ -53,6 +53,28 @@ class LabelMapUtilTest(tf.test.TestCase):
self.assertEqual(label_map_dict['dog'], 1)
self.assertEqual(label_map_dict['cat'], 2)
def test_load_bad_label_map(self):
label_map_string = """
item {
id:0
name:'class that should not be indexed at zero'
}
item {
id:2
name:'cat'
}
item {
id:1
name:'dog'
}
"""
label_map_path = os.path.join(self.get_temp_dir(), 'label_map.pbtxt')
with tf.gfile.Open(label_map_path, 'wb') as f:
f.write(label_map_string)
with self.assertRaises(ValueError):
label_map_util.load_labelmap(label_map_path)
def test_keep_categories_with_unique_id(self):
label_map_proto = string_int_label_map_pb2.StringIntLabelMap()
label_map_string = """
......
......@@ -111,3 +111,26 @@ def pad_or_clip_tensor(t, length):
if not _is_tensor(length):
processed_t = _set_dim_0(processed_t, length)
return processed_t
def combined_static_and_dynamic_shape(tensor):
"""Returns a list containing static and dynamic values for the dimensions.
Returns a list of static and dynamic values for shape dimensions. This is
useful to preserve static shapes when available in reshape operation.
Args:
tensor: A tensor of any type.
Returns:
A list of size tensor.shape.ndims containing integers or a scalar tensor.
"""
static_shape = tensor.shape.as_list()
dynamic_shape = tf.shape(tensor)
combined_shape = []
for index, dim in enumerate(static_shape):
if dim is not None:
combined_shape.append(dim)
else:
combined_shape.append(dynamic_shape[index])
return combined_shape
......@@ -115,6 +115,13 @@ class UtilTest(tf.test.TestCase):
self.assertAllEqual([1, 2], tt3_result)
self.assertAllClose([[0.1, 0.2], [0.2, 0.4]], tt4_result)
def test_combines_static_dynamic_shape(self):
tensor = tf.placeholder(tf.float32, shape=(None, 2, 3))
combined_shape = shape_utils.combined_static_and_dynamic_shape(
tensor)
self.assertTrue(tf.contrib.framework.is_tensor(combined_shape[0]))
self.assertListEqual(combined_shape[1:], [2, 3])
if __name__ == '__main__':
tf.test.main()
......@@ -22,6 +22,7 @@ from object_detection.core import box_coder
from object_detection.core import box_list
from object_detection.core import box_predictor
from object_detection.core import matcher
from object_detection.utils import shape_utils
class MockBoxCoder(box_coder.BoxCoder):
......@@ -45,9 +46,10 @@ class MockBoxPredictor(box_predictor.BoxPredictor):
super(MockBoxPredictor, self).__init__(is_training, num_classes)
def _predict(self, image_features, num_predictions_per_location):
batch_size = image_features.get_shape().as_list()[0]
num_anchors = (image_features.get_shape().as_list()[1]
* image_features.get_shape().as_list()[2])
combined_feature_shape = shape_utils.combined_static_and_dynamic_shape(
image_features)
batch_size = combined_feature_shape[0]
num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
code_size = 4
zero = tf.reduce_sum(0 * image_features)
box_encodings = zero + tf.zeros(
......
......@@ -398,7 +398,7 @@ def visualize_boxes_and_labels_on_image_array(image,
classes[i] % len(STANDARD_COLORS)]
# Draw all boxes onto image.
for box, color in six.iteritems(box_to_color_map):
for box, color in box_to_color_map.items():
ymin, xmin, ymax, xmax = box
if instance_masks is not None:
draw_mask_on_image_array(
......
Code for several RL algorithms used in the following papers:
* "Improving Policy Gradient by Exploring Under-appreciated Rewards" by
Ofir Nachum, Mohammad Norouzi, and Dale Schuurmans.
* "Bridging the Gap Between Value and Policy Based Reinforcement Learning" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
* "Trust-PCL: An Off-Policy Trust Region Method for Continuous Control" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
Available algorithms:
* Actor Critic
* TRPO
* PCL
* Unified PCL
* Trust-PCL
* PCL + Constraint Trust Region (unpublished)
* REINFORCE
* UREX
Requirements:
* TensorFlow (see http://www.tensorflow.org for how to install/upgrade)
* OpenAI Gym (see http://gym.openai.com/docs)
* NumPy (see http://www.numpy.org/)
* SciPy (see http://www.scipy.org/)
Quick Start:
Run UREX on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.1 --clip_norm=50 \
--num_samples=10 --objective=urex
```
Run REINFORCE on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.01 --clip_norm=50 \
--num_samples=10 --objective=reinforce
```
Run PCL on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl
```
Run PCL with expert trajectories on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl \
--num_expert_paths=10
```
Run a MuJoCo task with TRPO:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --rollout=10 --gamma=0.995 \
--max_step=1000 --cutoff_agent=1000 \
--objective=trpo --norecurrent --internal_dim=64 --trust_region_p \
  --max_divergence=0.05 --value_opt=best_fit --critic_weight=0.0
```
Run a MuJoCo task with Trust-PCL:
```
python trainer.py --logtostderr --batch_size=1 --env=HalfCheetah-v1 \
--validation_frequency=50 --rollout=10 --critic_weight=0.0 \
--gamma=0.995 --clip_norm=40 --learning_rate=0.002 \
--replay_buffer_freq=1 --replay_buffer_size=20000 \
--replay_buffer_alpha=0.1 --norecurrent --objective=pcl \
--max_step=100 --tau=0.0 --eviction=fifo --max_divergence=0.001 \
--internal_dim=64 --cutoff_agent=1000 \
--replay_batch_size=25 --nouse_online_batch --batch_by_steps \
--sample_from=target --value_opt=grad --value_hidden_layers=2 \
--update_eps_lambda --unify_episodes --clip_adv=1.0 \
--target_network_lag=0.99 --prioritize_by=step
```
Run a MuJoCo task with PCL constraint trust region:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --tau=0.001 --rollout=50 --gamma=0.99 \
--max_step=1000 --cutoff_agent=1000 \
--objective=pcl --norecurrent --internal_dim=64 --trust_region_p \
--max_divergence=0.01 --value_opt=best_fit --critic_weight=0.0 \
--tau_decay=0.1 --tau_start=0.1
```
Maintained by Ofir Nachum (ofirnachum).
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Baseline model for value estimates.
Implements the value component of the neural network.
In some cases this is just an additional linear layer on top of the policy network.
In other cases, it is a completely separate neural network.
"""
import tensorflow as tf
import numpy as np
class Baseline(object):
def __init__(self, env_spec, internal_policy_dim,
input_prev_actions=True,
input_time_step=False,
input_policy_state=True,
n_hidden_layers=0,
hidden_dim=64,
tau=0.0):
self.env_spec = env_spec
self.internal_policy_dim = internal_policy_dim
self.input_prev_actions = input_prev_actions
self.input_time_step = input_time_step
self.input_policy_state = input_policy_state
self.n_hidden_layers = n_hidden_layers
self.hidden_dim = hidden_dim
self.tau = tau
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
def get_inputs(self, time_step, obs, prev_actions,
internal_policy_states):
"""Get inputs to network as single tensor."""
inputs = [tf.ones_like(time_step)]
input_dim = 1
if not self.input_policy_state:
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
inputs.append(
tf.one_hot(obs[i], obs_dim))
input_dim += obs_dim
elif self.env_spec.is_box(obs_type):
cur_obs = obs[i]
inputs.append(cur_obs)
inputs.append(cur_obs ** 2)
input_dim += obs_dim * 2
else:
assert False
if self.input_prev_actions:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(act_type):
inputs.append(
tf.one_hot(prev_actions[i], act_dim))
input_dim += act_dim
elif self.env_spec.is_box(act_type):
inputs.append(prev_actions[i])
input_dim += act_dim
else:
assert False
if self.input_policy_state:
inputs.append(internal_policy_states)
input_dim += self.internal_policy_dim
if self.input_time_step:
scaled_time = 0.01 * time_step
inputs.extend([scaled_time, scaled_time ** 2, scaled_time ** 3])
input_dim += 3
return input_dim, tf.concat(inputs, 1)
def reshape_batched_inputs(self, all_obs, all_actions,
internal_policy_states, policy_logits):
"""Reshape inputs from [time_length, batch_size, ...] to
[time_length * batch_size, ...].
This allows for computing the value estimate in one go.
"""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_prev_act = []
reshaped_policy_logits = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
reshaped_policy_logits.append(
tf.reshape(policy_logits[i], [time_length * batch_size, -1]))
reshaped_internal_policy_states = tf.reshape(
internal_policy_states,
[time_length * batch_size, self.internal_policy_dim])
time_step = (float(self.input_time_step) *
tf.expand_dims(
tf.to_float(tf.range(time_length * batch_size) /
batch_size), -1))
return (time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits)
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
"""Get value estimates given input."""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
for depth in xrange(self.n_hidden_layers):
with tf.variable_scope('value_layer%d' % depth):
w = tf.get_variable('w', [input_dim, self.hidden_dim])
inputs = tf.nn.tanh(tf.matmul(inputs, w))
input_dim = self.hidden_dim
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
values = tf.matmul(inputs, w_v)
values = tf.reshape(values, [time_length, batch_size])
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
class UnifiedBaseline(Baseline):
"""Baseline for Unified PCL."""
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
def f_transform(q_values, tau):
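# Numerically stable tau * logsumexp(q_values / tau): factor out the max
# before exponentiating.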
max_q = tf.reduce_max(q_values, -1, keep_dims=True)
return tf.squeeze(max_q, [-1]) + tau * tf.log(
tf.reduce_sum(tf.exp((q_values - max_q) / tau), -1))
assert len(reshaped_policy_logits) == 1
values = f_transform((self.tau + self.eps_lambda) * reshaped_policy_logits[0],
(self.tau + self.eps_lambda))
values = tf.reshape(values, [time_length, batch_size])
# not used
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Controller coordinates sampling and training model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import pickle
import random
flags = tf.flags
gfile = tf.gfile
FLAGS = flags.FLAGS
def find_best_eps_lambda(rewards, lengths):
"""Find the best lambda given a desired epsilon = FLAGS.max_divergence."""
# perhaps not the best way to do this
desired_div = FLAGS.max_divergence * np.mean(lengths)
def calc_divergence(eps_lambda):
max_reward = np.max(rewards)
logz = (max_reward / eps_lambda +
np.log(np.mean(np.exp((rewards - max_reward) / eps_lambda))))
exprr = np.mean(np.exp(rewards / eps_lambda - logz) *
rewards / eps_lambda)
return exprr - logz
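# Bisection on eps_lambda: the divergence shrinks as eps_lambda grows, so
# raise the lower bound while the computed divergence is still above the
# desired one.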
left = 0.0
right = 1000.0
if len(rewards) <= 8:
return (left + right) / 2
num_iter = max(4, 1 + int(np.log((right - left) / 0.1) / np.log(2.0)))
for _ in xrange(num_iter):
mid = (left + right) / 2
cur_div = calc_divergence(mid)
if cur_div > desired_div:
left = mid
else:
right = mid
return (left + right) / 2
class Controller(object):
def __init__(self, env, env_spec, internal_dim,
use_online_batch=True,
batch_by_steps=False,
unify_episodes=False,
replay_batch_size=None,
max_step=None,
cutoff_agent=1,
save_trajectories_file=None,
use_trust_region=False,
use_value_opt=False,
update_eps_lambda=False,
prioritize_by='rewards',
get_model=None,
get_replay_buffer=None,
get_buffer_seeds=None):
self.env = env
self.env_spec = env_spec
self.internal_dim = internal_dim
self.use_online_batch = use_online_batch
self.batch_by_steps = batch_by_steps
self.unify_episodes = unify_episodes
self.replay_batch_size = replay_batch_size
self.max_step = max_step
self.cutoff_agent = cutoff_agent
self.save_trajectories_file = save_trajectories_file
self.use_trust_region = use_trust_region
self.use_value_opt = use_value_opt
self.update_eps_lambda = update_eps_lambda
self.prioritize_by = prioritize_by
self.model = get_model()
self.replay_buffer = get_replay_buffer()
self.seed_replay_buffer(get_buffer_seeds())
self.internal_state = np.array([self.initial_internal_state()] *
len(self.env))
self.last_obs = self.env_spec.initial_obs(len(self.env))
self.last_act = self.env_spec.initial_act(len(self.env))
self.last_pad = np.zeros(len(self.env))
self.start_episode = np.array([True] * len(self.env))
self.step_count = np.array([0] * len(self.env))
self.episode_running_rewards = np.zeros(len(self.env))
self.episode_running_lengths = np.zeros(len(self.env))
self.episode_rewards = []
self.episode_lengths = []
self.total_rewards = []
self.best_batch_rewards = None
def setup(self):
self.model.setup()
def initial_internal_state(self):
return np.zeros(self.model.policy.rnn_state_dim)
def _sample_episodes(self, sess, greedy=False):
"""Sample episodes from environment using model."""
# reset environments as necessary
obs_after_reset = self.env.reset_if(self.start_episode)
for i, obs in enumerate(obs_after_reset):
if obs is not None:
self.step_count[i] = 0
self.internal_state[i] = self.initial_internal_state()
for j in xrange(len(self.env_spec.obs_dims)):
self.last_obs[j][i] = obs[j]
for j in xrange(len(self.env_spec.act_dims)):
self.last_act[j][i] = -1
self.last_pad[i] = 0
# maintain episode as a single unit if the last sampling
# batch ended before the episode was terminated
if self.unify_episodes:
assert len(obs_after_reset) == 1
new_ep = obs_after_reset[0] is not None
else:
new_ep = True
self.start_id = 0 if new_ep else len(self.all_obs[:])
initial_state = self.internal_state
all_obs = [] if new_ep else self.all_obs[:]
all_act = ([self.last_act] if new_ep else self.all_act[:])
all_pad = [] if new_ep else self.all_pad[:]
rewards = [] if new_ep else self.rewards[:]
# start stepping in the environments
step = 0
while not self.env.all_done():
self.step_count += 1 - np.array(self.env.dones)
next_internal_state, sampled_actions = self.model.sample_step(
sess, self.last_obs, self.internal_state, self.last_act,
greedy=greedy)
env_actions = self.env_spec.convert_actions_to_env(sampled_actions)
next_obs, reward, next_dones, _ = self.env.step(env_actions)
all_obs.append(self.last_obs)
all_act.append(sampled_actions)
all_pad.append(self.last_pad)
rewards.append(reward)
self.internal_state = next_internal_state
self.last_obs = next_obs
self.last_act = sampled_actions
self.last_pad = np.array(next_dones).astype('float32')
step += 1
if self.max_step and step >= self.max_step:
break
self.all_obs = all_obs[:]
self.all_act = all_act[:]
self.all_pad = all_pad[:]
self.rewards = rewards[:]
# append final observation
all_obs.append(self.last_obs)
return initial_state, all_obs, all_act, rewards, all_pad
def sample_episodes(self, sess):
"""Sample steps from the environment until we have enough for a batch."""
# check if last batch ended with episode that was not terminated
if self.unify_episodes:
self.all_new_ep = self.start_episode[0]
# sample episodes until we either have enough episodes or enough steps
episodes = []
total_steps = 0
while total_steps < self.max_step * len(self.env):
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess)
observations = zip(*observations)
actions = zip(*actions)
terminated = np.array(self.env.dones)
self.total_rewards = np.sum(np.array(rewards[self.start_id:]) *
(1 - np.array(pads[self.start_id:])), axis=0)
self.episode_running_rewards *= 1 - self.start_episode
self.episode_running_lengths *= 1 - self.start_episode
self.episode_running_rewards += self.total_rewards
self.episode_running_lengths += np.sum(1 - np.array(pads[self.start_id:]), axis=0)
episodes.extend(self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads))
total_steps += np.sum(1 - np.array(pads))
# set next starting episodes
self.start_episode = np.logical_or(terminated,
self.step_count >= self.cutoff_agent)
episode_rewards = self.episode_running_rewards[self.start_episode].tolist()
self.episode_rewards.extend(episode_rewards)
self.episode_lengths.extend(self.episode_running_lengths[self.start_episode].tolist())
self.episode_rewards = self.episode_rewards[-100:]
self.episode_lengths = self.episode_lengths[-100:]
if (self.save_trajectories_file is not None and
(self.best_batch_rewards is None or
np.mean(self.total_rewards) > self.best_batch_rewards)):
self.best_batch_rewards = np.mean(self.total_rewards)
my_episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
with gfile.GFile(self.save_trajectories_file, 'w') as f:
pickle.dump(my_episodes, f)
if not self.batch_by_steps:
return (initial_state,
observations, actions, rewards,
terminated, pads)
return self.convert_to_batched_episodes(episodes)
def _train(self, sess,
observations, initial_state, actions,
rewards, terminated, pads):
"""Train model using batch."""
if self.use_trust_region:
# use trust region to optimize policy
loss, _, summary = self.model.trust_region_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
else: # otherwise use simple gradient descent on policy
loss, _, summary = self.model.train_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
if self.use_value_opt: # optionally perform specific value optimization
self.model.fit_values(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary
def train(self, sess):
"""Sample some episodes and train on some episodes."""
cur_step = sess.run(self.model.inc_global_step)
self.cur_step = cur_step
# on the first iteration, set target network close to online network
if self.cur_step == 0:
for _ in xrange(100):
sess.run(self.model.copy_op)
# on other iterations, just perform single target <-- online operation
sess.run(self.model.copy_op)
# sample from env
(initial_state,
observations, actions, rewards,
terminated, pads) = self.sample_episodes(sess)
# add to replay buffer
self.add_to_replay_buffer(
initial_state, observations, actions,
rewards, terminated, pads)
loss, summary = 0, None
# train on online batch
if self.use_online_batch:
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
# update relative entropy coefficient
if self.update_eps_lambda:
episode_rewards = np.array(self.episode_rewards)
episode_lengths = np.array(self.episode_lengths)
eps_lambda = find_best_eps_lambda(episode_rewards, episode_lengths)
sess.run(self.model.objective.assign_eps_lambda,
feed_dict={self.model.objective.new_eps_lambda: eps_lambda})
# train on replay batch
replay_batch, replay_probs = self.get_from_replay_buffer(
self.replay_batch_size)
if replay_batch:
(initial_state,
observations, actions, rewards,
terminated, pads) = replay_batch
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary, self.total_rewards, self.episode_rewards
def eval(self, sess):
"""Use greedy sampling."""
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess, greedy=True)
total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)
return np.mean(total_rewards)
def convert_from_batched_episodes(
self, initial_state, observations, actions, rewards,
terminated, pads):
"""Convert time-major batch of episodes to batch-major list of episodes."""
rewards = np.array(rewards)
pads = np.array(pads)
observations = [np.array(obs) for obs in observations]
actions = [np.array(act) for act in actions]
total_rewards = np.sum(rewards * (1 - pads), axis=0)
total_length = np.sum(1 - pads, axis=0).astype('int32')
episodes = []
num_episodes = rewards.shape[1]
for i in xrange(num_episodes):
length = total_length[i]
ep_initial = initial_state[i]
ep_obs = [obs[:length, i, ...] for obs in observations]
ep_act = [act[:length + 1, i, ...] for act in actions]
ep_rewards = rewards[:length, i]
episodes.append(
[ep_initial, ep_obs, ep_act, ep_rewards, terminated[i]])
return episodes
def convert_to_batched_episodes(self, episodes, max_length=None):
"""Convert batch-major list of episodes to time-major batch of episodes."""
lengths = [len(ep[-2]) for ep in episodes]
max_length = max_length or max(lengths)
new_episodes = []
for ep, length in zip(episodes, lengths):
initial, observations, actions, rewards, terminated = ep
observations = [np.resize(obs, [max_length + 1] + list(obs.shape)[1:])
for obs in observations]
actions = [np.resize(act, [max_length + 1] + list(act.shape)[1:])
for act in actions]
pads = np.array([0] * length + [1] * (max_length - length))
rewards = np.resize(rewards, [max_length]) * (1 - pads)
new_episodes.append([initial, observations, actions, rewards,
terminated, pads])
(initial, observations, actions, rewards,
terminated, pads) = zip(*new_episodes)
observations = [np.swapaxes(obs, 0, 1)
for obs in zip(*observations)]
actions = [np.swapaxes(act, 0, 1)
for act in zip(*actions)]
rewards = np.transpose(rewards)
pads = np.transpose(pads)
return (initial, observations, actions, rewards, terminated, pads)
def add_to_replay_buffer(self, initial_state,
observations, actions, rewards,
terminated, pads):
"""Add batch of episodes to replay buffer."""
if self.replay_buffer is None:
return
rewards = np.array(rewards)
pads = np.array(pads)
total_rewards = np.sum(rewards * (1 - pads), axis=0)
episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
priorities = (total_rewards if self.prioritize_by == 'reward'
else self.cur_step)
if not self.unify_episodes or self.all_new_ep:
self.last_idxs = self.replay_buffer.add(
episodes, priorities)
else:
# If we are unifying episodes, we attempt to
# keep them unified in the replay buffer.
# The first episode sampled in the current batch is a
# continuation of the last episode from the previous batch
self.replay_buffer.add(episodes[:1], priorities, self.last_idxs[-1:])
if len(episodes) > 1:
self.replay_buffer.add(episodes[1:], priorities)
def get_from_replay_buffer(self, batch_size):
"""Sample a batch of episodes from the replay buffer."""
if self.replay_buffer is None or len(self.replay_buffer) < 1 * batch_size:
return None, None
desired_count = batch_size * self.max_step
# in the case of batch_by_steps, we sample larger and larger
# amounts from the replay buffer until we have enough steps.
while True:
if batch_size > len(self.replay_buffer):
batch_size = len(self.replay_buffer)
episodes, probs = self.replay_buffer.get_batch(batch_size)
count = sum(len(ep[-2]) for ep in episodes)
if count >= desired_count or not self.batch_by_steps:
break
if batch_size == len(self.replay_buffer):
return None, None
batch_size *= 1.2
return (self.convert_to_batched_episodes(episodes), probs)
def seed_replay_buffer(self, episodes):
"""Seed the replay buffer with some episodes."""
if self.replay_buffer is None:
return
# just need to add initial state
for i in xrange(len(episodes)):
episodes[i] = [self.initial_internal_state()] + episodes[i]
self.replay_buffer.seed_buffer(episodes)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for environment interface with agent / tensorflow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class spaces(object):
discrete = 0
box = 1
def get_space(space):
if hasattr(space, 'n'):
return space.n, spaces.discrete, None
elif hasattr(space, 'shape'):
return np.prod(space.shape), spaces.box, (space.low, space.high)
def get_spaces(spaces):
if hasattr(spaces, 'spaces'):
return zip(*[get_space(space) for space in spaces.spaces])
else:
return [(ret,) for ret in get_space(spaces)]
class EnvSpec(object):
def __init__(self, env, try_combining_actions=True,
discretize_actions=None):
self.discretize_actions = discretize_actions
# figure out observation space
self.obs_space = env.observation_space
self.obs_dims, self.obs_types, self.obs_info = get_spaces(self.obs_space)
# figure out action space
self.act_space = env.action_space
self.act_dims, self.act_types, self.act_info = get_spaces(self.act_space)
if self.discretize_actions:
self._act_dims = self.act_dims[:]
self._act_types = self.act_types[:]
self.act_dims = []
self.act_types = []
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
self.act_dims.append(dim)
self.act_types.append(spaces.discrete)
elif typ == spaces.box:
for _ in xrange(dim):
self.act_dims.append(self.discretize_actions)
self.act_types.append(spaces.discrete)
else:
self._act_dims = None
self._act_types = None
if (try_combining_actions and
all(typ == spaces.discrete for typ in self.act_types)):
self.combine_actions = True
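# Collapse a tuple of discrete action spaces into a single discrete space
# whose size is the product of the individual dimensions.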
self.orig_act_dims = self.act_dims[:]
self.orig_act_types = self.act_types[:]
total_act_dim = 1
for dim in self.act_dims:
total_act_dim *= dim
self.act_dims = [total_act_dim]
self.act_types = [spaces.discrete]
else:
self.combine_actions = False
self.obs_dims_and_types = zip(self.obs_dims, self.obs_types)
self.act_dims_and_types = zip(self.act_dims, self.act_types)
self.total_obs_dim = sum(self.obs_dims)
self.total_sampling_act_dim = sum(self.sampling_dim(dim, typ)
for dim, typ in self.act_dims_and_types)
self.total_sampled_act_dim = sum(self.act_dims)
def sampling_dim(self, dim, typ):
if typ == spaces.discrete:
return dim
elif typ == spaces.box:
return 2 * dim # Gaussian mean and std
else:
assert False
def convert_actions_to_env(self, actions):
if self.combine_actions:
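# Decode the single combined discrete action back into its factors via
# repeated mod / integer division.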
new_actions = []
actions = actions[0]
for dim in self.orig_act_dims:
new_actions.append(np.mod(actions, dim))
actions = (actions / dim).astype('int32')
actions = new_actions
if self.discretize_actions:
new_actions = []
idx = 0
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
new_actions.append(actions[idx])
idx += 1
elif typ == spaces.box:
low, high = self.act_info[i]
cur_action = []
for j in xrange(dim):
cur_action.append(
low[j] + (high[j] - low[j]) * actions[idx] /
float(self.discretize_actions))
idx += 1
new_actions.append(np.hstack(cur_action))
actions = new_actions
return actions
def convert_env_actions_to_actions(self, actions):
if not self.combine_actions:
return actions
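# Inverse of the decoding above: mixed-radix encode the per-factor actions
# into one combined action.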
new_actions = 0
base = 1
for act, dim in zip(actions, self.orig_act_dims):
new_actions = new_actions + base * act
base *= dim
return [new_actions]
def convert_obs_to_list(self, obs):
if len(self.obs_dims) == 1:
return [obs]
else:
return list(obs)
def convert_action_to_gym(self, action):
if len(action) == 1:
return action[0]
else:
return list(action)
if ((not self.combine_actions or len(self.orig_act_dims) == 1) and
(len(self.act_dims) == 1 or
(self.discretize_actions and len(self._act_dims) == 1))):
return action[0]
else:
return list(action)
def initial_obs(self, batch_size):
batched = batch_size is not None
batch_size = batch_size or 1
obs = []
for dim, typ in self.obs_dims_and_types:
if typ == spaces.discrete:
obs.append(np.zeros(batch_size))
elif typ == spaces.box:
obs.append(np.zeros([batch_size, dim]))
if batched:
return obs
else:
return zip(*obs)[0]
def initial_act(self, batch_size=None):
batched = batch_size is not None
batch_size = batch_size or 1
act = []
for dim, typ in self.act_dims_and_types:
if typ == spaces.discrete:
act.append(-np.ones(batch_size))
elif typ == spaces.box:
act.append(-np.ones([batch_size, dim]))
if batched:
return act
else:
return zip(*act)[0]
def is_discrete(self, typ):
return typ == spaces.discrete
def is_box(self, typ):
return typ == spaces.box
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Expert paths/trajectories.
For producing or loading expert trajectories in an environment.
"""
import tensorflow as tf
import random
import os
import numpy as np
import pickle
gfile = tf.gfile
def sample_expert_paths(num, env_str, env_spec,
load_trajectories_file=None):
"""Sample a number of expert paths randomly."""
if load_trajectories_file is not None:
if not gfile.Exists(load_trajectories_file):
assert False, 'trajectories file %s does not exist' % load_trajectories_file
with gfile.GFile(load_trajectories_file, 'r') as f:
episodes = pickle.load(f)
episodes = random.sample(episodes, num)
return [ep[1:] for ep in episodes]
return [sample_expert_path(env_str, env_spec)
for _ in xrange(num)]
def sample_expert_path(env_str, env_spec):
"""Algorithmic tasks have known distribution of expert paths we sample from."""
t = random.randint(2, 10) # sequence length
observations = []
actions = [env_spec.initial_act(None)]
rewards = []
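# gym's algorithmic tasks take tuple actions of the form
# (tape movement, write flag, predicted character).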
if env_str in ['DuplicatedInput-v0', 'Copy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(t):
char_idx = tt // 2 if env_str == 'DuplicatedInput-v0' else tt
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1, (tt + 1) % 2, char])
rewards.append((tt + 1) % 2)
elif env_str in ['RepeatCopy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(3 * t + 2):
char_idx = (tt if tt < t else
2 * t - tt if tt <= 2 * t else
tt - 2 * t - 2)
if tt in [t, 2 * t + 1]:
char = chars
else:
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1 if tt < t else 0 if tt <= 2 * t else 1,
tt not in [t, 2 * t + 1], char])
rewards.append(actions[-1][-2])
elif env_str in ['Reverse-v0']:
chars = 2
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(2 * t + 1):
char_idx = tt if tt < t else 2 * t - tt
if tt != t:
char = random_ints[char_idx] % chars
else:
char = chars
observations.append([char])
actions.append([tt < t, tt > t, char])
rewards.append(tt > t)
elif env_str in ['ReversedAddition-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 2 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 1, 2: 2, 3: 1}
for tt in xrange(2 * t + 1):
char_idx = tt
if tt >= 2 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 2 == 1:
tot = char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 2 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 2 or tt == 2 * t, tot])
rewards.append(tt % 2 or tt == 2 * t)
elif env_str in ['ReversedAddition3-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 3 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 3, 2: 1, 3: 2, 4: 2, 5: 1}
for tt in xrange(3 * t + 1):
char_idx = tt
if tt >= 3 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 3 == 2:
tot = char_history[-3] + char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 3 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 3 == 2 or tt == 3 * t, tot])
rewards.append(tt % 3 == 2 or tt == 3 * t)
else:
assert False, 'No expert trajectories for env %s' % env_str
actions = [
env_spec.convert_env_actions_to_actions(act)
for act in actions]
observations.append([chars])
observations = [np.array(obs) for obs in zip(*observations)]
actions = [np.array(act) for act in zip(*actions)]
rewards = np.array(rewards)
return [observations, actions, rewards, True]
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives for full-episode.
Implementations of UREX & REINFORCE. Note that these implementations
use a non-parametric baseline to reduce variance. Thus, multiple
samples with the same seed must be taken from the environment.
"""
import tensorflow as tf
import objective
class Reinforce(objective.Objective):
def __init__(self, learning_rate, clip_norm, num_samples,
tau=0.1, bonus_weight=1.0):
super(Reinforce, self).__init__(learning_rate, clip_norm=clip_norm)
self.num_samples = num_samples
assert self.num_samples > 1
self.tau = tau
self.bonus_weight = bonus_weight
self.eps_lambda = 0.0
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
return -self.tau * total_log_probs
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
seq_length = tf.shape(rewards)[0]
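# Rollouts that share an environment seed are adjacent in the batch, so this
# reshape groups them along the last (num_samples) axis.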
not_pad = tf.reshape(1 - pads, [seq_length, -1, self.num_samples])
rewards = not_pad * tf.reshape(rewards, [seq_length, -1, self.num_samples])
log_probs = not_pad * tf.reshape(sum(log_probs), [seq_length, -1, self.num_samples])
total_rewards = tf.reduce_sum(rewards, 0)
total_log_probs = tf.reduce_sum(log_probs, 0)
rewards_and_bonus = (total_rewards +
self.bonus_weight *
self.get_bonus(total_rewards, total_log_probs))
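# Non-parametric baseline: average the reward-plus-bonus of the num_samples
# rollouts that share an environment seed.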
baseline = tf.reduce_mean(rewards_and_bonus, 1, keep_dims=True)
loss = -tf.stop_gradient(rewards_and_bonus - baseline) * total_log_probs
loss = tf.reduce_mean(loss)
raw_loss = loss # TODO
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', total_log_probs)
tf.summary.histogram('rewards', total_rewards)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(total_rewards))
tf.summary.scalar('loss', loss)
return loss, raw_loss, baseline, gradient_ops, tf.summary.merge_all()
class UREX(Reinforce):
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
discrepancy = total_rewards / self.tau - total_log_probs
normalized_d = self.num_samples * tf.nn.softmax(discrepancy)
return self.tau * normalized_d
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper around gym env.
Allows for using batches of possibly identically seeded environments.
"""
import gym
import numpy as np
import random
import env_spec
def get_env(env_str):
return gym.make(env_str)
class GymWrapper(object):
def __init__(self, env_str, distinct=1, count=1, seeds=None):
self.distinct = distinct
self.count = count
self.total = self.distinct * self.count
self.seeds = seeds or [random.randint(0, 1e12)
for _ in xrange(self.distinct)]
self.envs = []
for seed in self.seeds:
for _ in xrange(self.count):
env = get_env(env_str)
env.seed(seed)
if hasattr(env, 'last'):
env.last = 100 # for algorithmic envs
self.envs.append(env)
self.dones = [True] * self.total
self.num_episodes_played = 0
one_env = self.get_one()
self.use_action_list = hasattr(one_env.action_space, 'spaces')
self.env_spec = env_spec.EnvSpec(self.get_one())
def get_seeds(self):
return self.seeds
def reset(self):
self.dones = [False] * self.total
self.num_episodes_played += len(self.envs)
# reset seeds to be synchronized
self.seeds = [random.randint(0, 1e12) for _ in xrange(self.distinct)]
counter = 0
for seed in self.seeds:
for _ in xrange(self.count):
self.envs[counter].seed(seed)
counter += 1
return [self.env_spec.convert_obs_to_list(env.reset())
for env in self.envs]
def reset_if(self, predicate=None):
if predicate is None:
predicate = self.dones
if self.count != 1:
assert np.all(predicate)
return self.reset()
self.num_episodes_played += sum(predicate)
output = [self.env_spec.convert_obs_to_list(env.reset())
if pred else None
for env, pred in zip(self.envs, predicate)]
for i, pred in enumerate(predicate):
if pred:
self.dones[i] = False
return output
def all_done(self):
return all(self.dones)
def step(self, actions):
def env_step(action):
action = self.env_spec.convert_action_to_gym(action)
obs, reward, done, tt = env.step(action)
obs = self.env_spec.convert_obs_to_list(obs)
return obs, reward, done, tt
actions = zip(*actions)
outputs = [env_step(action)
if not done else (self.env_spec.initial_obs(None), 0, True, None)
for action, env, done in zip(actions, self.envs, self.dones)]
for i, (_, _, done, _) in enumerate(outputs):
self.dones[i] = self.dones[i] or done
obs, reward, done, tt = zip(*outputs)
obs = [list(oo) for oo in zip(*obs)]
return [obs, reward, done, tt]
def get_one(self):
return random.choice(self.envs)
def __len__(self):
return len(self.envs)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model is responsible for setting up Tensorflow graph.
Creates policy and value networks. Also sets up all optimization
ops, including gradient ops, trust region ops, and value optimizers.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class Model(object):
def __init__(self, env_spec, global_step,
target_network_lag=0.95,
sample_from='online',
get_policy=None,
get_baseline=None,
get_objective=None,
get_trust_region_p_opt=None,
get_value_opt=None):
self.env_spec = env_spec
self.global_step = global_step
self.inc_global_step = self.global_step.assign_add(1)
self.target_network_lag = target_network_lag
self.sample_from = sample_from
self.policy = get_policy()
self.baseline = get_baseline()
self.objective = get_objective()
self.baseline.eps_lambda = self.objective.eps_lambda # TODO: do this better
self.trust_region_policy_opt = get_trust_region_p_opt()
self.value_opt = get_value_opt()
def setup_placeholders(self):
"""Create the Tensorflow placeholders."""
# summary placeholder
self.avg_episode_reward = tf.placeholder(
tf.float32, [], 'avg_episode_reward')
# sampling placeholders
self.internal_state = tf.placeholder(tf.float32,
[None, self.policy.rnn_state_dim],
'internal_state')
self.single_observation = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.single_observation.append(
tf.placeholder(tf.int32, [None], 'obs%d' % i))
elif self.env_spec.is_box(obs_type):
self.single_observation.append(
tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
else:
assert False
self.single_action = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.single_action.append(
tf.placeholder(tf.int32, [None], 'act%d' % i))
elif self.env_spec.is_box(action_type):
self.single_action.append(
tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
else:
assert False
# training placeholders
self.observations = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.observations.append(
tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
else:
self.observations.append(
tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))
self.actions = []
self.other_logits = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.actions.append(
tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
if self.env_spec.is_box(action_type):
self.actions.append(
tf.placeholder(tf.float32, [None, None, action_dim],
'all_act%d' % i))
self.other_logits.append(
tf.placeholder(tf.float32, [None, None, None],
'other_logits%d' % i))
self.rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
self.terminated = tf.placeholder(tf.float32, [None], 'terminated')
self.pads = tf.placeholder(tf.float32, [None, None], 'pads')
self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
'prev_log_probs')
def setup(self):
"""Setup Tensorflow Graph."""
self.setup_placeholders()
tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
with tf.variable_scope('model', reuse=None):
# policy network
with tf.variable_scope('policy_net'):
(self.policy_internal_states, self.logits, self.log_probs,
self.entropies, self.self_kls) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
self.out_log_probs = sum(self.log_probs)
self.kl = self.policy.calculate_kl(self.other_logits, self.logits)
self.avg_kl = (tf.reduce_sum(sum(self.kl)[:-1] * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
# value network
with tf.variable_scope('value_net'):
(self.values,
self.regression_input,
self.regression_weight) = self.baseline.get_values(
self.observations, self.actions,
self.policy_internal_states, self.logits)
# target policy network
with tf.variable_scope('target_policy_net'):
(self.target_policy_internal_states,
self.target_logits, self.target_log_probs,
_, _) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
# target value network
with tf.variable_scope('target_value_net'):
(self.target_values, _, _) = self.baseline.get_values(
self.observations, self.actions,
self.target_policy_internal_states, self.target_logits)
# construct copy op online --> target
all_vars = tf.trainable_variables()
online_vars = [p for p in all_vars if
'/policy_net' in p.name or '/value_net' in p.name]
target_vars = [p for p in all_vars if
'target_policy_net' in p.name or 'target_value_net' in p.name]
online_vars.sort(key=lambda p: p.name)
target_vars.sort(key=lambda p: p.name)
aa = self.target_network_lag
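# Soft update of the target networks: target <- aa * target + (1 - aa) * online.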
self.copy_op = tf.group(*[
target_p.assign(aa * target_p + (1 - aa) * online_p)
for online_p, target_p in zip(online_vars, target_vars)])
# evaluate objective
(self.loss, self.raw_loss, self.regression_target,
self.gradient_ops, self.summary) = self.objective.get(
self.rewards, self.pads,
self.values[:-1, :],
self.values[-1, :] * (1 - self.terminated),
self.log_probs, self.prev_log_probs, self.target_log_probs,
self.entropies,
self.logits)
self.regression_target = tf.reshape(self.regression_target, [-1])
self.policy_vars = [
v for v in tf.trainable_variables()
if '/policy_net' in v.name]
self.value_vars = [
v for v in tf.trainable_variables()
if '/value_net' in v.name]
# trust region optimizer
if self.trust_region_policy_opt is not None:
with tf.variable_scope('trust_region_policy', reuse=None):
avg_self_kl = (
tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
self.trust_region_policy_opt.setup(
self.policy_vars, self.raw_loss, avg_self_kl,
self.avg_kl)
# value optimizer
if self.value_opt is not None:
with tf.variable_scope('trust_region_value', reuse=None):
self.value_opt.setup(
self.value_vars,
tf.reshape(self.values[:-1, :], [-1]),
self.regression_target,
tf.reshape(self.pads, [-1]),
self.regression_input, self.regression_weight)
# we re-use variables for the sampling operations
with tf.variable_scope('model', reuse=True):
scope = ('target_policy_net' if self.sample_from == 'target'
else 'policy_net')
with tf.variable_scope(scope):
self.next_internal_state, self.sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action)
self.greedy_next_internal_state, self.greedy_sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action,
greedy=True)
def sample_step(self, sess,
single_observation, internal_state, single_action,
greedy=False):
"""Sample batch of steps from policy."""
if greedy:
outputs = [self.greedy_next_internal_state, self.greedy_sampled_actions]
else:
outputs = [self.next_internal_state, self.sampled_actions]
feed_dict = {self.internal_state: internal_state}
for action_place, action in zip(self.single_action, single_action):
feed_dict[action_place] = action
for obs_place, obs in zip(self.single_observation, single_observation):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def train_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train network using standard gradient descent."""
outputs = [self.raw_loss, self.gradient_ops, self.summary]
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def trust_region_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train policy using trust region step."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
(prev_log_probs, prev_logits) = sess.run(
[self.out_log_probs, self.logits], feed_dict=feed_dict)
feed_dict[self.prev_log_probs] = prev_log_probs
for other_logit, prev_logit in zip(self.other_logits, prev_logits):
feed_dict[other_logit] = prev_logit
# fit policy
self.trust_region_policy_opt.optimize(sess, feed_dict)
ret = sess.run([self.raw_loss, self.summary], feed_dict=feed_dict)
ret = [ret[0], None, ret[1]]
return ret
def fit_values(self, sess,
observations, internal_state, actions,
rewards, terminated, pads):
"""Train value network using value-specific optimizer."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
# fit values
if self.value_opt is None:
raise ValueError('Specific value optimizer does not exist')
self.value_opt.optimize(sess, feed_dict)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives to compute loss and value targets.
Implements Actor Critic, PCL (vanilla PCL, Unified PCL, Trust PCL), and TRPO.
"""
import tensorflow as tf
import numpy as np
class Objective(object):
def __init__(self, learning_rate, clip_norm):
self.learning_rate = learning_rate
self.clip_norm = clip_norm
def get_optimizer(self, learning_rate):
"""Optimizer for gradient descent ops."""
return tf.train.AdamOptimizer(learning_rate=learning_rate,
epsilon=2e-4)
def training_ops(self, loss, learning_rate=None):
"""Gradient ops."""
opt = self.get_optimizer(learning_rate)
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
if self.clip_norm:
grads, global_norm = tf.clip_by_global_norm(grads, self.clip_norm)
tf.summary.scalar('grad_global_norm', global_norm)
return opt.apply_gradients(zip(grads, params))
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
"""Get objective calculations."""
raise NotImplementedError()
def discounted_future_sum(values, discount, rollout):
"""Discounted future sum of time-major values."""
discount_filter = tf.reshape(
discount ** tf.range(float(rollout)), [-1, 1, 1])
expanded_values = tf.concat(
[values, tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
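# Illustrative NumPy cross-check (hypothetical helper, not part of the original
# file): a direct double loop equivalent to discounted_future_sum above.
def _discounted_future_sum_np(values, discount, rollout):
  """Reference implementation over a [time, batch] NumPy array."""
  time_length, batch_size = values.shape
  padded = np.concatenate([values, np.zeros((rollout - 1, batch_size))], 0)
  out = np.zeros(values.shape)
  for t in range(time_length):
    for k in range(rollout):
      out[t] += (discount ** k) * padded[t + k]
  return out
# For np.ones((3, 1)), discount=0.5, rollout=2 this yields [[1.5], [1.5], [1.0]]:
# each step adds its own value plus half of the next (zeros past the end).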
def discounted_two_sided_sum(values, discount, rollout):
"""Discounted two-sided sum of time-major values."""
roll = float(rollout)
discount_filter = tf.reshape(
discount ** tf.abs(tf.range(-roll + 1, roll)), [-1, 1, 1])
expanded_values = tf.concat(
[tf.zeros([rollout - 1, tf.shape(values)[1]]), values,
tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
def shift_values(values, discount, rollout, final_values=0.0):
"""Shift values up by some amount of time.
  Entries that would be shifted in from beyond the last time step
  are filled in using final_values.
"""
roll_range = tf.cumsum(tf.ones_like(values[:rollout, :]), 0,
exclusive=True, reverse=True)
final_pad = tf.expand_dims(final_values, 0) * discount ** roll_range
return tf.concat([discount ** rollout * values[rollout:, :],
final_pad], 0)
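# For example, with time length 3, rollout 2 and discount d, shift_values
# returns [d**2 * values[2], d * final_values, final_values]: entries shifted in
# from beyond the last step are filled from final_values with decreasing powers
# of the discount.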
class ActorCritic(Objective):
"""Standard Actor-Critic."""
def __init__(self, learning_rate, clip_norm=5,
policy_weight=1.0, critic_weight=0.1,
tau=0.1, gamma=1.0, rollout=10,
eps_lambda=0.0, clip_adv=None):
super(ActorCritic, self).__init__(learning_rate, clip_norm=clip_norm)
self.policy_weight = policy_weight
self.critic_weight = critic_weight
self.tau = tau
self.gamma = gamma
self.rollout = rollout
self.clip_adv = clip_adv
self.eps_lambda = tf.get_variable( # TODO: need a better way
'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda))
self.new_eps_lambda = tf.placeholder(tf.float32, [])
self.assign_eps_lambda = self.eps_lambda.assign(
0.95 * self.eps_lambda + 0.05 * self.new_eps_lambda)
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
entropy = not_pad * sum(entropies)
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
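    # adv is the stop-gradient d-step advantage estimate: discounted rewards
    # over the rollout plus the bootstrapped value d steps ahead, minus the
    # current value baseline.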
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * log_probs
critic_loss = -adv * baseline_values
regularizer = -self.tau * entropy
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
regularizer = tf.reduce_mean(
tf.reduce_sum(regularizer * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss + regularizer)
raw_loss = tf.reduce_mean( # TODO
tf.reduce_sum(not_pad * policy_loss, 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
    tf.summary.scalar('critic_loss',
                      tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
class PCL(ActorCritic):
"""PCL implementation.
Implements vanilla PCL, Unified PCL, and Trust PCL depending
on provided inputs.
"""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
target_log_probs = not_pad * tf.stop_gradient(sum(target_log_probs))
relative_log_probs = not_pad * (log_probs - target_log_probs)
# Prepend.
not_pad = tf.concat([tf.ones([self.rollout - 1, batch_size]),
not_pad], 0)
rewards = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
rewards], 0)
value_estimates = tf.concat(
[self.gamma ** tf.expand_dims(
tf.range(float(self.rollout - 1), 0, -1), 1) *
tf.ones([self.rollout - 1, batch_size]) *
value_estimates[0:1, :],
value_estimates], 0)
log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
log_probs], 0)
prev_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
prev_log_probs], 0)
relative_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
relative_log_probs], 0)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
sum_log_probs = discounted_future_sum(log_probs, self.gamma, self.rollout)
sum_prev_log_probs = discounted_future_sum(prev_log_probs, self.gamma, self.rollout)
sum_relative_log_probs = discounted_future_sum(
relative_log_probs, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = (
- self.tau * sum_log_probs
- self.eps_lambda * sum_relative_log_probs
+ sum_rewards + last_values)
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * sum_log_probs
critic_loss = -adv * (baseline_values - last_values)
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
# actual quantity we're trying to minimize
raw_loss = tf.reduce_mean(
tf.reduce_sum(not_pad * adv * (-baseline_values + future_values), 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.histogram('future_values', future_values)
tf.summary.histogram('baseline_values', baseline_values)
tf.summary.histogram('advantages', adv)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
    tf.summary.scalar('critic_loss',
                      tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', tf.reduce_mean(raw_loss))
tf.summary.scalar('eps_lambda', self.eps_lambda)
return (loss, raw_loss,
future_values[self.rollout - 1:, :],
gradient_ops, tf.summary.merge_all())
class TRPO(ActorCritic):
"""TRPO."""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
prev_log_probs = not_pad * prev_log_probs
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * tf.exp(log_probs - prev_log_probs)
critic_loss = -adv * baseline_values
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
raw_loss = policy_loss
# loss for gradient calculation
if self.policy_weight == 0:
policy_loss = 0.0
elif self.critic_weight == 0:
critic_loss = 0.0
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
    tf.summary.scalar('critic_loss',
                      tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizers mostly for value estimate.
Gradient Descent optimizer
LBFGS optimizer
Best Fit optimizer
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import scipy.optimize
def var_size(v):
return int(np.prod([int(d) for d in v.shape]))
def gradients(loss, var_list):
grads = tf.gradients(loss, var_list)
return [g if g is not None else tf.zeros(v.shape)
for g, v in zip(grads, var_list)]
def flatgrad(loss, var_list):
grads = gradients(loss, var_list)
return tf.concat([tf.reshape(grad, [-1])
for (v, grad) in zip(var_list, grads)
if grad is not None], 0)
def get_flat(var_list):
return tf.concat([tf.reshape(v, [-1]) for v in var_list], 0)
def set_from_flat(var_list, flat_theta):
shapes = [v.shape for v in var_list]
sizes = [var_size(v) for v in var_list]
start = 0
assigns = []
for (shape, size, v) in zip(shapes, sizes, var_list):
assigns.append(v.assign(
tf.reshape(flat_theta[start:start + size], shape)))
start += size
assert start == sum(sizes)
return tf.group(*assigns)
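# Minimal usage sketch (hypothetical, not part of the original file): round-trip
# two variables through get_flat / set_from_flat in a TF1 graph and session.
def _demo_flat_roundtrip():
  a = tf.get_variable('demo_a', [2, 3], initializer=tf.zeros_initializer())
  b = tf.get_variable('demo_b', [4], initializer=tf.zeros_initializer())
  flat_theta = tf.placeholder(tf.float32, [var_size(a) + var_size(b)])
  assign_op = set_from_flat([a, b], flat_theta)
  read_op = get_flat([a, b])
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_op, feed_dict={flat_theta: np.arange(10, dtype=np.float32)})
    return sess.run(read_op)  # -> [0., 1., ..., 9.], in variable order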
class LbfgsOptimization(object):
def __init__(self, max_iter=25, mix_frac=1.0):
self.max_iter = max_iter
self.mix_frac = mix_frac
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
self.loss_flat_gradient = flatgrad(self.raw_loss, var_list)
self.flat_vars = get_flat(var_list)
self.set_vars = set_from_flat(var_list, self.flat_theta)
def optimize(self, sess, feed_dict):
old_theta = sess.run(self.flat_vars)
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
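    # Regress the value net toward a mix of the bootstrapped targets and its own
    # previous predictions; mix_frac=1.0 means fitting the targets directly.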
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
def calc_loss_and_grad(theta):
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
loss, grad = sess.run([self.raw_loss, self.loss_flat_gradient],
feed_dict=feed_dict)
grad = grad.astype('float64')
return loss, grad
theta, _, _ = scipy.optimize.fmin_l_bfgs_b(
calc_loss_and_grad, old_theta, maxiter=self.max_iter)
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
class GradOptimization(object):
def __init__(self, learning_rate=0.001, max_iter=25, mix_frac=1.0):
self.learning_rate = learning_rate
self.max_iter = max_iter
self.mix_frac = mix_frac
def get_optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.learning_rate,
epsilon=2e-4)
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
opt = self.get_optimizer()
params = var_list
grads = tf.gradients(self.raw_loss, params)
self.gradient_ops = opt.apply_gradients(zip(grads, params))
def optimize(self, sess, feed_dict):
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
    for _ in range(self.max_iter):
sess.run(self.gradient_ops, feed_dict=feed_dict)
class BestFitOptimization(object):
def __init__(self, mix_frac=1.0):
self.mix_frac = mix_frac
def setup_placeholders(self):
self.new_regression_weight = tf.placeholder(
tf.float32, self.regression_weight.shape)
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.values = values
self.targets = targets
self.inputs = inputs
self.regression_weight = regression_weight
self.setup_placeholders()
self.update_regression_weight = tf.assign(
self.regression_weight, self.new_regression_weight)
def optimize(self, sess, feed_dict):
reg_input, reg_weight, old_values, targets = sess.run(
[self.inputs, self.regression_weight, self.values, self.targets],
feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
# taken from rllab
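    # Ridge-regularized least squares: solve (X^T X + reg * I) w = X^T y for the
    # linear value weights, increasing reg until the solution is finite.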
reg_coeff = 1e-5
for _ in range(5):
best_fit_weight = np.linalg.lstsq(
reg_input.T.dot(reg_input) +
reg_coeff * np.identity(reg_input.shape[1]),
reg_input.T.dot(intended_values))[0]
if not np.any(np.isnan(best_fit_weight)):
break
reg_coeff *= 10
if len(best_fit_weight.shape) == 1:
best_fit_weight = np.expand_dims(best_fit_weight, -1)
sess.run(self.update_regression_weight,
feed_dict={self.new_regression_weight: best_fit_weight})
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Policy neural network.
Implements a network that takes in observations and produces actions and
log-probabilities according to a sampling-distribution parameterization.
"""
import tensorflow as tf
import numpy as np
class Policy(object):
def __init__(self, env_spec, internal_dim,
fixed_std=True, recurrent=True,
input_prev_actions=True):
self.env_spec = env_spec
self.internal_dim = internal_dim
self.rnn_state_dim = self.internal_dim
self.fixed_std = fixed_std
self.recurrent = recurrent
self.input_prev_actions = input_prev_actions
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
self.vector_init = tf.constant_initializer(0.0)
@property
def input_dim(self):
return (self.env_spec.total_obs_dim +
self.env_spec.total_sampled_act_dim * self.input_prev_actions)
@property
def output_dim(self):
return self.env_spec.total_sampling_act_dim
def get_cell(self):
"""Get RNN cell."""
self.cell_input_dim = self.internal_dim // 2
cell = tf.contrib.rnn.LSTMCell(self.cell_input_dim,
state_is_tuple=False,
reuse=tf.get_variable_scope().reuse)
cell = tf.contrib.rnn.OutputProjectionWrapper(
cell, self.output_dim,
reuse=tf.get_variable_scope().reuse)
return cell
def core(self, obs, prev_internal_state, prev_actions):
"""Core neural network taking in inputs and outputting sampling
distribution parameters."""
batch_size = tf.shape(obs[0])[0]
if not self.recurrent:
prev_internal_state = tf.zeros([batch_size, self.rnn_state_dim])
cell = self.get_cell()
b = tf.get_variable('input_bias', [self.cell_input_dim],
initializer=self.vector_init)
cell_input = tf.nn.bias_add(tf.zeros([batch_size, self.cell_input_dim]), b)
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
w = tf.get_variable('w_state%d' % i, [obs_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(obs_type):
cell_input += tf.matmul(tf.one_hot(obs[i], obs_dim), w)
elif self.env_spec.is_box(obs_type):
cell_input += tf.matmul(obs[i], w)
else:
assert False
if self.input_prev_actions:
if self.env_spec.combine_actions: # TODO(ofir): clean this up
prev_action = prev_actions[0]
for i, action_dim in enumerate(self.env_spec.orig_act_dims):
act = tf.mod(prev_action, action_dim)
w = tf.get_variable('w_prev_action%d' % i, [action_dim, self.cell_input_dim],
initializer=self.matrix_init)
cell_input += tf.matmul(tf.one_hot(act, action_dim), w)
prev_action = tf.to_int32(prev_action / action_dim)
else:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
w = tf.get_variable('w_prev_action%d' % i, [act_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(act_type):
cell_input += tf.matmul(tf.one_hot(prev_actions[i], act_dim), w)
elif self.env_spec.is_box(act_type):
cell_input += tf.matmul(prev_actions[i], w)
else:
assert False
output, next_state = cell(cell_input, prev_internal_state)
return output, next_state
def sample_action(self, logits, sampling_dim,
act_dim, act_type, greedy=False):
"""Sample an action from a distribution."""
if self.env_spec.is_discrete(act_type):
if greedy:
act = tf.argmax(logits, 1)
else:
act = tf.reshape(tf.multinomial(logits, 1), [-1])
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
if greedy:
act = means
else:
batch_size = tf.shape(logits)[0]
act = means + std * tf.random_normal([batch_size, act_dim])
else:
assert False
return act
def entropy(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate entropy of distribution."""
if self.env_spec.is_discrete(act_type):
entropy = tf.reduce_sum(
-tf.nn.softmax(logits) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
entropy = tf.reduce_sum(
0.5 * (1 + tf.log(2 * np.pi * tf.square(std))), -1)
else:
assert False
return entropy
def self_kl(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate KL of distribution with itself.
    Used only for the gradients.
"""
if self.env_spec.is_discrete(act_type):
probs = tf.nn.softmax(logits)
log_probs = tf.nn.log_softmax(logits)
self_kl = tf.reduce_sum(
tf.stop_gradient(probs) *
(tf.stop_gradient(log_probs) - log_probs), -1)
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
my_means = tf.stop_gradient(means)
my_std = tf.stop_gradient(std)
self_kl = tf.reduce_sum(
tf.log(std / my_std) +
(tf.square(my_std) + tf.square(my_means - means)) /
(2.0 * tf.square(std)) - 0.5,
-1)
else:
assert False
return self_kl
def log_prob_action(self, action, logits,
sampling_dim, act_dim, act_type):
"""Calculate log-prob of action sampled from distribution."""
if self.env_spec.is_discrete(act_type):
act_log_prob = tf.reduce_sum(
tf.one_hot(action, act_dim) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
act_log_prob = (- 0.5 * tf.log(2 * np.pi * tf.square(std))
- 0.5 * tf.square(action - means) / tf.square(std))
act_log_prob = tf.reduce_sum(act_log_prob, -1)
else:
assert False
return act_log_prob
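  # For Box actions the expression above is the diagonal-Gaussian log-density
  #   log p(a) = sum_i [-0.5 * log(2 * pi * std_i**2)
  #                     - 0.5 * (a_i - mean_i)**2 / std_i**2].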
def sample_actions(self, output, actions=None, greedy=False):
"""Sample all actions given output of core network."""
sampled_actions = []
logits = []
log_probs = []
entropy = []
self_kl = []
start_idx = 0
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
if self.fixed_std and self.env_spec.is_box(act_type):
act_logits = output[:, start_idx:start_idx + act_dim]
log_std = tf.get_variable('std%d' % i, [1, sampling_dim // 2])
      # fix the standard deviations to a learned variable independent of the
      # observation (the `0 * act_logits` below just broadcasts it to the batch)
act_logits = tf.concat(
[act_logits,
1e-6 + tf.exp(log_std) + 0 * act_logits], 1)
else:
act_logits = output[:, start_idx:start_idx + sampling_dim]
if actions is None:
act = self.sample_action(act_logits, sampling_dim,
act_dim, act_type,
greedy=greedy)
else:
act = actions[i]
ent = self.entropy(act_logits, sampling_dim, act_dim, act_type)
kl = self.self_kl(act_logits, sampling_dim, act_dim, act_type)
act_log_prob = self.log_prob_action(
act, act_logits,
sampling_dim, act_dim, act_type)
sampled_actions.append(act)
logits.append(act_logits)
log_probs.append(act_log_prob)
entropy.append(ent)
self_kl.append(kl)
start_idx += sampling_dim
assert start_idx == self.env_spec.total_sampling_act_dim
return sampled_actions, logits, log_probs, entropy, self_kl
def get_kl(self, my_logits, other_logits):
"""Calculate KL between one policy output and another."""
kl = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
single_my_logits = my_logits[i]
single_other_logits = other_logits[i]
if self.env_spec.is_discrete(act_type):
my_probs = tf.nn.softmax(single_my_logits)
my_log_probs = tf.nn.log_softmax(single_my_logits)
other_log_probs = tf.nn.log_softmax(single_other_logits)
my_kl = tf.reduce_sum(my_probs * (my_log_probs - other_log_probs), -1)
elif self.env_spec.is_box(act_type):
        my_means = single_my_logits[:, :sampling_dim // 2]
        my_std = single_my_logits[:, sampling_dim // 2:]
        other_means = single_other_logits[:, :sampling_dim // 2]
        other_std = single_other_logits[:, sampling_dim // 2:]
my_kl = tf.reduce_sum(
tf.log(other_std / my_std) +
(tf.square(my_std) + tf.square(my_means - other_means)) /
(2.0 * tf.square(other_std)) - 0.5,
-1)
else:
assert False
kl.append(my_kl)
return kl
def single_step(self, prev, cur, greedy=False):
"""Single RNN step. Equivalently, single-time-step sampled actions."""
prev_internal_state, prev_actions, _, _, _, _ = prev
obs, actions = cur # state observed and action taken at this time step
# feed into RNN cell
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(
(prev_internal_state, prev_actions, None, None, None, None),
(obs, None), greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
initial_actions = [act[0] for act in all_actions]
all_actions = [tf.concat([act[1:], act[0:1]], 0)
for act in all_actions] # "final" action is dummy
(internal_states, _, logits, log_probs,
entropies, self_kls) = tf.scan(
self.single_step,
(all_obs, all_actions),
initializer=self.get_initializer(
batch_size, initial_state, initial_actions))
# remove "final" computations
log_probs = [log_prob[:-1] for log_prob in log_probs]
entropies = [entropy[:-1] for entropy in entropies]
self_kls = [self_kl[:-1] for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls
def get_initializer(self, batch_size, initial_state, initial_actions):
"""Get initializer for RNN."""
logits_init = []
log_probs_init = []
for act_dim, act_type in self.env_spec.act_dims_and_types:
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
logits_init.append(tf.zeros([batch_size, sampling_dim]))
log_probs_init.append(tf.zeros([batch_size]))
entropy_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
self_kl_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
return (initial_state,
tuple(initial_actions),
tuple(logits_init), tuple(log_probs_init),
tuple(entropy_init),
tuple(self_kl_init))
def calculate_kl(self, my_logits, other_logits):
"""Calculate KL between one policy and another on batch of episodes."""
batch_size = tf.shape(my_logits[0])[1]
time_length = tf.shape(my_logits[0])[0]
reshaped_my_logits = [
tf.reshape(my_logit, [batch_size * time_length, -1])
for my_logit in my_logits]
reshaped_other_logits = [
tf.reshape(other_logit, [batch_size * time_length, -1])
for other_logit in other_logits]
kl = self.get_kl(reshaped_my_logits, reshaped_other_logits)
kl = [tf.reshape(kkl, [time_length, batch_size])
for kkl in kl]
return kl
class MLPPolicy(Policy):
"""Non-recurrent policy."""
def get_cell(self):
self.cell_input_dim = self.internal_dim
def mlp(cell_input, prev_internal_state):
w1 = tf.get_variable('w1', [self.cell_input_dim, self.internal_dim])
b1 = tf.get_variable('b1', [self.internal_dim])
w2 = tf.get_variable('w2', [self.internal_dim, self.internal_dim])
b2 = tf.get_variable('b2', [self.internal_dim])
w3 = tf.get_variable('w3', [self.internal_dim, self.internal_dim])
b3 = tf.get_variable('b3', [self.internal_dim])
proj = tf.get_variable(
'proj', [self.internal_dim, self.output_dim])
hidden = cell_input
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w1), b1))
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w2), b2))
output = tf.matmul(hidden, proj)
return output, hidden
return mlp
def single_step(self, obs, actions, prev_actions, greedy=False):
"""Single step."""
batch_size = tf.shape(obs[0])[0]
prev_internal_state = tf.zeros([batch_size, self.internal_dim])
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(obs, None, prev_actions,
greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
# first reshape inputs as a single batch
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_act = []
reshaped_prev_act = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
act = tf.concat([all_actions[i][1:], all_actions[i][0:1]], 0)
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_act.append(tf.reshape(act, [time_length * batch_size]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_act.append(
tf.reshape(act, [time_length * batch_size, act_dim]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
# now inputs go into single step as one large batch
(internal_states, _, logits, log_probs,
entropies, self_kls) = self.single_step(
reshaped_obs, reshaped_act, reshaped_prev_act)
# reshape the outputs back to original time-major format
internal_states = tf.reshape(internal_states, [time_length, batch_size, -1])
logits = [tf.reshape(logit, [time_length, batch_size, -1])
for logit in logits]
log_probs = [tf.reshape(log_prob, [time_length, batch_size])[:-1]
for log_prob in log_probs]
entropies = [tf.reshape(ent, [time_length, batch_size])[:-1]
for ent in entropies]
self_kls = [tf.reshape(self_kl, [time_length, batch_size])[:-1]
for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls