Unverified Commit e10d986e authored by Lukasz Kaiser, committed by GitHub

Merge pull request #4642 from buckman-google/master

Addition of STEVE
parents ee0e9d11 f789dcf5
{"inherits": ["config/algos/steve.json", "config/envs/walker2d.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/walker2d.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
try:
    import roboschool  # optional dependency: registers the Roboschool environments with gym
except ImportError:
    pass
import gym
import numpy as np
from config import config
MAX_FRAMES = config["env"]["max_frames"]
gym.logger.level = 40  # only report gym errors (suppress warnings)
def get_env(env_name, *args, **kwargs):
MAPPING = {
"CartPole-v0": CartPoleWrapper,
}
if env_name in MAPPING: return MAPPING[env_name](env_name, *args, **kwargs)
else: return NoTimeLimitMujocoWrapper(env_name, *args, **kwargs)
class GymWrapper(object):
"""
Generic wrapper for OpenAI gym environments.
"""
def __init__(self, env_name):
self.internal_env = gym.make(env_name)
self.observation_space = self.internal_env.observation_space
self.action_space = self.internal_env.action_space
self.custom_init()
def custom_init(self):
pass
def reset(self):
self.clock = 0
return self.preprocess_obs(self.internal_env.reset())
# returns normalized actions
def sample(self):
return self.action_space.sample()
# this is used for converting continuous approximations back to the original domain
def normalize_actions(self, actions):
return actions
# maps actions into the form the policy predicts (e.g. a continuous range); by default applied to the output of sample()
def unnormalize_actions(self, actions):
return actions
def preprocess_obs(self, obs):
# return np.append(obs, [self.clock/float(MAX_FRAMES)])
return obs
def step(self, normalized_action):
out = self.internal_env.step(normalized_action)
self.clock += 1
obs, reward, done = self.preprocess_obs(out[0]), out[1], float(out[2])
reset = done == 1. or self.clock == MAX_FRAMES
return obs, reward, done, reset
def render_rollout(self, states):
# states: a numpy array of shape [timesteps, state_dim]
self.internal_env.reset()
for state in states:
self.internal_env.env.state = state
self.internal_env.render()
class CartPoleWrapper(GymWrapper):
"""
Wrap CartPole.
"""
def sample(self):
return np.array([np.random.uniform(0., 1.)])
def normalize_actions(self, action):
return 1 if action[0] >= 0 else 0
def unnormalize_actions(self, action):
return 2. * action - 1.
class NoTimeLimitMujocoWrapper(GymWrapper):
"""
Wrap Mujoco-style environments, removing the time-limit termination condition.
This is needed to keep the environment Markovian.
"""
def __init__(self, env_name):
self.internal_env = gym.make(env_name).env
self.observation_space = self.internal_env.observation_space
self.action_space = self.internal_env.action_space
self.custom_init()
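# Usage sketch for the wrappers above (illustrative; `envwrap` is an assumed module
# name for this file, and "CartPole-v0" is the one environment explicitly mapped in
# get_env). Note that step() returns (obs, reward, done, reset): `done` is the
# environment's terminal signal, while `reset` additionally fires when the
# MAX_FRAMES time limit is reached.
from envwrap import get_env

env = get_env("CartPole-v0")
obs = env.reset()
for _ in range(10):
    action = env.sample()                                   # sample an action
    obs, reward, done, reset = env.step(env.normalize_actions(action))
    if reset:
        obs = env.reset()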
from __future__ import division
from __future__ import print_function
from builtins import zip
from builtins import range
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import traceback, threading, time, warnings
import tensorflow as tf
import numpy as np
import util
from replay import ReplayBuffer
class Learner(object):
"""
Generic object which runs the main training loop of anything that trains using
a replay buffer. Handles updating, logging, saving/loading, batching, etc.
"""
def __init__(self, interactor_queue, lock, config, env_config, learner_config, **bonus_kwargs):
self.learner_name = self.learner_name()
self.interactor_queue = interactor_queue
self.learner_lock = lock
self.config = config
self.env_config = env_config
self.learner_config = learner_config
self.bonus_kwargs = bonus_kwargs
self.kill_threads = False
self.permit_desync = False
self.need_frames_notification = threading.Condition()
self._reset_inspections()
self.total_frames = 0
self.save_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"], self.config["save_model_path"]))
self.log_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"], self.config["log_path"])) + "/%s.log" % self.learner_name
# replay buffer to store data
self.replay_buffer_lock = threading.RLock()
self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
np.prod(self.env_config["obs_dims"]),
self.env_config["action_dim"])
# data loaders pull data from the replay buffer and put it into the tfqueue for model usage
self.data_loaders = self.make_loader_placeholders()
queue_capacity = int(np.ceil(1. / self.learner_config["frames_per_update"])) if self.learner_config["frames_per_update"] else 100
self.tf_queue = tf.FIFOQueue(capacity=queue_capacity, dtypes=[dl.dtype for dl in self.data_loaders])
self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
self.current_batch = self.tf_queue.dequeue()
# build the TF graph for the actual model to train
self.core, self.train_losses, self.train_ops, self.inspect_losses = self.make_core_model()
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
## Mandatory functions to override
def learner_name(self): raise Exception('unimplemented: learner_name')
def make_loader_placeholders(self): raise Exception('unimplemented: make_loader_placeholders')
def make_core_model(self): raise Exception('unimplemented: make_core_model')
## Optional functions to override
def initialize(self): warnings.warn('unimplemented: initialize')
def resume_from_checkpoint(self, epoch): warnings.warn('unimplemented: resume_from_checkpoint')
def checkpoint(self): warnings.warn('unimplemented: checkpoint')
def backup(self): warnings.warn('unimplemented: backup')
## Internal functions
def _start(self):
# fetch data from the interactors to pre-fill the replay buffer
self.prefetch_thread = threading.Thread(target=self._poll_interactors, args=(True, self.learner_config["frames_before_learning"],))
self.prefetch_thread.start()
self.prefetch_thread.join()
# start the interactor and data loader
self.data_load_thread = threading.Thread(target=self._run_enqueue_data)
self.data_load_thread.start()
# initialize the learner, pretraining if needed
if self.config["resume"]: self._resume_from_checkpoint()
else: self._initialize()
# re-sync everything, and start up interactions with the environment
self.interactor_poll_thread = threading.Thread(target=self._poll_interactors)
self.interactor_poll_thread.start()
# start the clock
self._last_checkpoint_time = time.time()
def _learn(self, permit_desync=False, log=True, checkpoint=True, backup=True):
# this is to keep the frames/update synced properly
if self.learner_config["frames_per_update"] is not False and not permit_desync:
if not self._have_enough_frames():
with self.need_frames_notification:
self.need_frames_notification.notify()
return
# log
if log and (self.update_i + 1) % self.learner_config["log_every_n"] == 0:
self._log()
# checkpoint
if checkpoint and (self.update_i + 1) % self.learner_config["epoch_every_n"] == 0:
self._checkpoint()
# backup
if backup and (self.update_i + 1) % self.learner_config["backup_every_n"] == 0:
self._backup()
# train
self._training_step()
def _have_enough_frames(self):
gathered_frames = self.total_frames - self.learner_config["frames_before_learning"]
return gathered_frames > self.learner_config["frames_per_update"] * self.update_i
def _initialize(self):
self.epoch = 0
self.update_i = 0
self.hours = 0
self._last_checkpoint_time = time.time()
self.initialize()
if self.learner_config["pretrain_n"]: self._pretrain()
self._checkpoint()
def _pretrain(self):
for _ in range(self.learner_config["pretrain_n"]):
self._learn(permit_desync=True, checkpoint=False, backup=False)
self.epoch = 0
self.update_i = 0
def _resume_from_checkpoint(self):
epoch = util.get_largest_epoch_in_dir(self.save_path, self.core.saveid)
if not self.config['keep_all_replay_buffers']: util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
if epoch is False:
raise Exception("Tried to reload but no model found")
with self.learner_lock:
self.core.load(self.sess, self.save_path, epoch)
self.epoch, self.update_i, self.total_frames, self.hours = self.sess.run([self.core.epoch_n, self.core.update_n, self.core.frame_n, self.core.hours])
with self.replay_buffer_lock:
self.replay_buffer.load(self.save_path, '%09d_%s' % (epoch, self.learner_name))
self.resume_from_checkpoint(epoch)
def _log(self):
logstring = "(%3.2f sec) h%-8.2f e%-8d s%-8d f%-8d\t" % (time.time() - self._log_time, self.hours, self.epoch, self.update_i + 1, self.total_frames) + ', '.join(["%8f" % x for x in (self.running_total / self.denom).tolist()])
print("%s\t%s" % (self.learner_name, logstring))
with open(self.log_path, "a") as f: f.write(logstring + "\n")
self._reset_inspections()
def _reset_inspections(self):
self.running_total = 0.
self.denom = 0.
self._log_time = time.time()
def _checkpoint(self):
self.checkpoint()
self.epoch += 1
self.hours += (time.time() - self._last_checkpoint_time) / 3600.
self._last_checkpoint_time = time.time()
self.core.update_epoch(self.sess, self.epoch, self.update_i, self.total_frames, self.hours)
with self.learner_lock: self.core.save(self.sess, self.save_path)
def _backup(self):
self.backup()
if not self.learner_config['keep_all_replay_buffers']: util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
with self.learner_lock:
self.core.save(self.sess, self.save_path, self.epoch)
with self.replay_buffer_lock:
self.replay_buffer.save(self.save_path, '%09d_%s' % (self.epoch, self.learner_name))
def _training_step(self):
train_ops = tuple([op for op, loss in zip(self.train_ops, self.train_losses) if loss is not None])
outs = self.sess.run(train_ops + self.inspect_losses)
self.running_total += np.array(outs[len(train_ops):])
self.denom += 1.
self.update_i += 1
def _poll_interactors(self, continuous_poll=False, frames_before_terminate=None):
# poll the interactors for new frames.
# the synced_condition semaphore prevents this from consuming too much CPU
while not self.kill_threads:
if self.learner_config["frames_per_update"] is not False and not continuous_poll:
with self.need_frames_notification: self.need_frames_notification.wait()
while not self.interactor_queue.empty():
new_frames = self.interactor_queue.get()
self._add_frames(new_frames)
if frames_before_terminate and self.total_frames >= frames_before_terminate: return
def _add_frames(self, frames):
with self.replay_buffer_lock:
for frame in frames:
self.replay_buffer.add_replay(*frame)
self.total_frames = self.replay_buffer.count
return self.total_frames
def _run_enqueue_data(self):
while not self.kill_threads:
data = self.replay_buffer.random_batch(self.learner_config["batch_size"])
self.sess.run(self.enqueue_op, feed_dict=dict(list(zip(self.data_loaders, data))))
def _kill_threads(self):
self.kill_threads = True
class CoreModel(object):
"""The base class for the "core" of learners."""
def __init__(self, name, env_config, learner_config):
self.name = self.saveid + "/" + name
self.env_config = env_config
self.learner_config = learner_config
with tf.variable_scope(self.name):
self.epoch_n = tf.get_variable('epoch_n', [], initializer=tf.constant_initializer(0), dtype=tf.int64, trainable=False)
self.update_n = tf.get_variable('update_n', [], initializer=tf.constant_initializer(0), dtype=tf.int64, trainable=False)
self.frame_n = tf.get_variable('frame_n', [], initializer=tf.constant_initializer(0), dtype=tf.int64, trainable=False)
self.hours = tf.get_variable('hours', [], initializer=tf.constant_initializer(0.), dtype=tf.float64, trainable=False)
self.epoch_n_placeholder = tf.placeholder(tf.int64, [])
self.update_n_placeholder = tf.placeholder(tf.int64, [])
self.frame_n_placeholder = tf.placeholder(tf.int64, [])
self.hours_placeholder = tf.placeholder(tf.float64, [])
self.assign_epoch_op = [tf.assign(self.epoch_n, self.epoch_n_placeholder), tf.assign(self.update_n, self.update_n_placeholder), tf.assign(self.frame_n, self.frame_n_placeholder), tf.assign(self.hours, self.hours_placeholder)]
self.create_params(env_config, learner_config)
self.model_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
self.saver = tf.train.Saver(self.model_params)
@property
def saveid(self):
raise Exception("specify a save ID")
def create_params(self, env_config, learner_config):
raise Exception("unimplemented")
def update_epoch(self, sess, epoch, updates, frames, hours):
sess.run(self.assign_epoch_op, feed_dict={self.epoch_n_placeholder: int(epoch), self.update_n_placeholder: int(updates), self.frame_n_placeholder: int(frames), self.hours_placeholder: float(hours)})
def save(self, sess, path, epoch=None):
if epoch is None: self.saver.save(sess, path + "/%s.params" % self.saveid)
else: self.saver.save(sess, path + "/%09d_%s.params" % (epoch, self.saveid))
def load(self, sess, path, epoch=None):
if epoch is None: self.saver.restore(sess, path + "/%s.params" % self.saveid)
else: self.saver.restore(sess, path + "/%09d_%s.params" % (epoch, self.saveid))
def run_learner(learner_subclass, queue, lock, config, env_config, learner_config, **bonus_kwargs):
learner = learner_subclass(queue, lock, config, env_config, learner_config, **bonus_kwargs)
try:
learner._start()
while True: learner._learn()
except Exception as e:
print('Caught exception in learner process')
traceback.print_exc()
learner._kill_threads()
print()
raise e
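# Minimal sketch of how a concrete learner plugs into the framework above
# (illustrative only: ToyCore/ToyLearner are hypothetical names, and the project's
# real learners live in separate modules such as valuerl_learner and
# worldmodel_learner). A subclass must provide a save ID and parameters for the
# CoreModel, plus the three mandatory Learner overrides: learner_name,
# make_loader_placeholders, and make_core_model.
class ToyCore(CoreModel):
    @property
    def saveid(self):
        return "toy"

    def create_params(self, env_config, learner_config):
        # a single trainable scalar, just so there is something to optimize
        self.value = tf.get_variable("value", [], initializer=tf.constant_initializer(0.))

class ToyLearner(Learner):
    def learner_name(self):
        return "toy"

    def make_loader_placeholders(self):
        # must match the 6-tuple produced by ReplayBuffer.random_batch
        obs_dim = int(np.prod(self.env_config["obs_dims"]))
        action_dim = self.env_config["action_dim"]
        self.obs = tf.placeholder(tf.float32, [None, obs_dim])
        self.next_obs = tf.placeholder(tf.float32, [None, obs_dim])
        self.action = tf.placeholder(tf.float32, [None, action_dim])
        self.reward = tf.placeholder(tf.float32, [None])
        self.done = tf.placeholder(tf.float32, [None])
        self.frame_count = tf.placeholder(tf.int64, [])
        return [self.obs, self.next_obs, self.action, self.reward, self.done, self.frame_count]

    def make_core_model(self):
        core = ToyCore("main", self.env_config, self.learner_config)
        obs, next_obs, action, reward, done, _ = self.current_batch
        loss = tf.reduce_mean(tf.square(reward - core.value))  # toy regression objective
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
        return core, (loss,), (train_op,), (loss,)

# A learner defined this way is launched like the real ones, e.g.
# run_learner(ToyLearner, queue, lock, config, config["env"], config["policy_config"]).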
from builtins import str
from builtins import range
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing
import os, sys, time
from config import config, log_config
import util
AGENT_COUNT = config["agent_config"]["count"]
EVALUATOR_COUNT = config["evaluator_config"]["count"]
MODEL_AUGMENTED = config["model_config"] is not False
if config["resume"]:
ROOT_PATH = "output/" + config["env"]["name"] + "/" + config["name"]
else:
ROOT_PATH = util.create_and_wipe_directory("output/" + config["env"]["name"] + "/" + config["name"])
log_config()
import learner, agent, valuerl_learner
if MODEL_AUGMENTED: import worldmodel_learner
if __name__ == '__main__':
all_procs = set([])
interaction_procs = set([])
# lock
policy_lock = multiprocessing.Lock()
model_lock = multiprocessing.Lock() if MODEL_AUGMENTED else None
# queue
policy_replay_frame_queue = multiprocessing.Queue(1)
model_replay_frame_queue = multiprocessing.Queue(1) if MODEL_AUGMENTED else None
# interactors
for interact_proc_i in range(AGENT_COUNT):
interact_proc = multiprocessing.Process(target=agent.main, args=(interact_proc_i, False, policy_replay_frame_queue, model_replay_frame_queue, policy_lock, config))
all_procs.add(interact_proc)
interaction_procs.add(interact_proc)
# evaluators
for interact_proc_i in range(EVALUATOR_COUNT):
interact_proc = multiprocessing.Process(target=agent.main, args=(interact_proc_i, True, policy_replay_frame_queue, model_replay_frame_queue, policy_lock, config))
all_procs.add(interact_proc)
interaction_procs.add(interact_proc)
# policy training
train_policy_proc = multiprocessing.Process(target=learner.run_learner, args=(valuerl_learner.ValueRLLearner, policy_replay_frame_queue, policy_lock, config, config["env"], config["policy_config"]), kwargs={"model_lock": model_lock})
all_procs.add(train_policy_proc)
# model training
if MODEL_AUGMENTED:
train_model_proc = multiprocessing.Process(target=learner.run_learner, args=(worldmodel_learner.WorldmodelLearner, model_replay_frame_queue, model_lock, config, config["env"], config["model_config"]))
all_procs.add(train_model_proc)
# start all policies
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
for i, proc in enumerate(interaction_procs):
os.environ['CUDA_VISIBLE_DEVICES'] = ''
proc.start()
os.environ['CUDA_VISIBLE_DEVICES'] = str(int(sys.argv[2]))
train_policy_proc.start()
if MODEL_AUGMENTED:
os.environ['CUDA_VISIBLE_DEVICES'] = str(1+int(sys.argv[2]))
train_model_proc.start()
# keep the main process alive without busy-waiting; join the children on interrupt
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    for proc in all_procs: proc.join()
from builtins import range
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
from itertools import product
class FeedForwardNet(object):
"""Custom feed-forward network layer."""
def __init__(self, name, in_size, out_shape, layers=1, hidden_dim=32, final_nonlinearity=None, get_uncertainty=False):
self.name = name
self.in_size = in_size
self.out_shape = out_shape
self.out_size = np.prod(out_shape)
self.layers = layers
self.hidden_dim = hidden_dim
self.final_nonlinearity = (lambda x:x) if final_nonlinearity is None else final_nonlinearity
self.get_uncertainty = get_uncertainty
self.weights = [None] * layers
self.biases = [None] * layers
self.params_list = []
with tf.variable_scope(name):
for layer_i in range(self.layers):
in_size = self.hidden_dim
out_size = self.hidden_dim
if layer_i == 0: in_size = self.in_size
if layer_i == self.layers - 1: out_size = self.out_size
self.weights[layer_i] = tf.get_variable("weights%d" % layer_i, [in_size, out_size], initializer=tf.contrib.layers.xavier_initializer())
self.biases[layer_i] = tf.get_variable("bias%d" % layer_i, [1, out_size], initializer=tf.constant_initializer(0.0))
self.params_list += [self.weights[layer_i], self.biases[layer_i]]
def __call__(self, x, stop_params_gradient=False, is_eval=True, ensemble_idxs=None, pre_expanded=None, reduce_mode="none"):
original_shape = tf.shape(x)
h = tf.reshape(x, [-1, self.in_size])
for layer_i in range(self.layers):
nonlinearity = tf.nn.relu if layer_i + 1 < self.layers else self.final_nonlinearity
if stop_params_gradient: h = nonlinearity(tf.matmul(h, tf.stop_gradient(self.weights[layer_i])) + tf.stop_gradient(self.biases[layer_i]))
else: h = nonlinearity(tf.matmul(h, self.weights[layer_i]) + self.biases[layer_i])
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, original_shape[:-1])
if pre_expanded is None: pre_expanded = ensemble_idxs is not None
if reduce_mode == "none" and not pre_expanded and self.get_uncertainty:
if len(self.out_shape) > 0: h = tf.expand_dims(h, -2)
else: h = tf.expand_dims(h, -1)
return h
def l2_loss(self):
return tf.add_n([tf.reduce_sum(.5 * tf.square(mu)) for mu in self.params_list])
class BayesianDropoutFeedForwardNet(FeedForwardNet):
"""Custom feed-forward network layer, with dropout as a Bayesian approximation."""
def __init__(self, name, in_size, out_shape, layers=1, hidden_dim=32, final_nonlinearity=None, get_uncertainty=False, keep_prob=.5, eval_sample_count=2, consistent_random_seed=False):
super(BayesianDropoutFeedForwardNet, self).__init__(name, in_size, out_shape, layers=layers, hidden_dim=hidden_dim,
final_nonlinearity=final_nonlinearity, get_uncertainty=get_uncertainty)
self.keep_prob = keep_prob
self.eval_sample_count = eval_sample_count
if eval_sample_count < 2: raise Exception("eval_sample_count must be at least 2 to estimate uncertainty")
self.dropout_seed = tf.random_uniform([layers], maxval=1e18, dtype=tf.int64) if consistent_random_seed else [None] * layers
def __call__(self, x, stop_params_gradient=False, is_eval=True, pre_expanded=False, ensemble_idxs=None, reduce_mode="none"):
if is_eval:
x = tf.tile(tf.expand_dims(x,0), tf.concat([tf.constant([self.eval_sample_count]), tf.ones_like(tf.shape(x))], 0))
original_shape = tf.shape(x)
h = tf.reshape(x, [-1, self.in_size])
for layer_i in range(self.layers):
nonlinearity = tf.nn.relu if layer_i + 1 < self.layers else self.final_nonlinearity
if layer_i > 0: h = tf.nn.dropout(h, keep_prob=self.keep_prob, seed=self.dropout_seed[layer_i])
if stop_params_gradient: h = nonlinearity(tf.matmul(h, tf.stop_gradient(self.weights[layer_i])) + tf.stop_gradient(self.biases[layer_i]))
else: h = nonlinearity(tf.matmul(h, self.weights[layer_i]) + self.biases[layer_i])
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, original_shape[:-1])
if is_eval:
h, uncertainty = tf.nn.moments(h, 0)
if self.get_uncertainty: return h, uncertainty
else: return h
else:
return h
class EnsembleFeedForwardNet(FeedForwardNet):
"""Custom feed-forward network layer with an ensemble."""
def __init__(self, name, in_size, out_shape, layers=1, hidden_dim=32, final_nonlinearity=None, get_uncertainty=False, ensemble_size=2, train_sample_count=2, eval_sample_count=2):
if train_sample_count > ensemble_size: raise Exception("train_sample_count cannot be larger than ensemble size")
if eval_sample_count > ensemble_size: raise Exception("eval_sample_count cannot be larger than ensemble size")
self.name = name
self.in_size = in_size
self.out_shape = out_shape
self.out_size = np.prod(out_shape)
self.layers = layers
self.hidden_dim = hidden_dim
self.final_nonlinearity = (lambda x:x) if final_nonlinearity is None else final_nonlinearity
self.get_uncertainty = get_uncertainty
self.ensemble_size = ensemble_size
self.train_sample_count = train_sample_count
self.eval_sample_count = eval_sample_count
self.weights = [None] * layers
self.biases = [None] * layers
self.params_list = []
with tf.variable_scope(name):
for layer_i in range(self.layers):
in_size = self.hidden_dim
out_size = self.hidden_dim
if layer_i == 0: in_size = self.in_size
if layer_i == self.layers - 1: out_size = self.out_size
self.weights[layer_i] = tf.get_variable("weights%d" % layer_i, [ensemble_size, in_size, out_size], initializer=tf.contrib.layers.xavier_initializer())
self.biases[layer_i] = tf.get_variable("bias%d" % layer_i, [ensemble_size, out_size], initializer=tf.constant_initializer(0.0))
self.params_list += [self.weights[layer_i], self.biases[layer_i]]
def __call__(self, x, stop_params_gradient=False, is_eval=True, ensemble_idxs=None, pre_expanded=None, reduce_mode="none"):
if pre_expanded is None: pre_expanded = ensemble_idxs is not None
if ensemble_idxs is None:
ensemble_idxs = tf.random_shuffle(tf.range(self.ensemble_size))
ensemble_sample_n = self.eval_sample_count if is_eval else self.train_sample_count
ensemble_idxs = ensemble_idxs[:ensemble_sample_n]
else:
ensemble_sample_n = tf.shape(ensemble_idxs)[0]
weights = [tf.gather(w, ensemble_idxs, axis=0) for w in self.weights]
biases = [tf.expand_dims(tf.gather(b, ensemble_idxs, axis=0),0) for b in self.biases]
original_shape = tf.shape(x)
if pre_expanded: h = tf.reshape(x, [-1, ensemble_sample_n, self.in_size])
else: h = tf.tile(tf.reshape(x, [-1, 1, self.in_size]), [1, ensemble_sample_n, 1])
for layer_i in range(self.layers):
nonlinearity = tf.nn.relu if layer_i + 1 < self.layers else self.final_nonlinearity
if stop_params_gradient: h = nonlinearity(tf.einsum('bri,rij->brj', h, tf.stop_gradient(weights[layer_i])) + tf.stop_gradient(biases[layer_i]))
else: h = nonlinearity(tf.einsum('bri,rij->brj', h, weights[layer_i]) + biases[layer_i])
if pre_expanded:
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, original_shape[:-1])
else:
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant([ensemble_sample_n]), tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant([ensemble_sample_n])], -1))
if reduce_mode == "none":
pass
elif reduce_mode == "random":
if len(self.out_shape) > 0: h = tf.reduce_sum(h * tf.reshape(tf.one_hot(tf.random_uniform([tf.shape(h)[0]], 0, ensemble_sample_n, dtype=tf.int64), ensemble_sample_n), tf.concat([tf.shape(h)[:1], tf.ones_like(tf.shape(h)[1:-2]), tf.constant([ensemble_sample_n]), tf.constant([1])], 0)), -2)
else: h = tf.reduce_sum(h * tf.reshape(tf.one_hot(tf.random_uniform([tf.shape(h)[0]], 0, ensemble_sample_n, dtype=tf.int64), ensemble_sample_n), tf.concat([tf.shape(h)[:1], tf.ones_like(tf.shape(h)[1:-1]), tf.constant([ensemble_sample_n])], 0)), -1)
elif reduce_mode == "mean":
if len(self.out_shape) > 0: h = tf.reduce_mean(h, -2)
else: h = tf.reduce_mean(h, -1)
else: raise Exception("use a valid reduce mode: none, random, or mean")
return h
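# Illustrative usage of EnsembleFeedForwardNet (a sketch, not part of the original
# file; the helper name _demo_ensemble_net is hypothetical). With reduce_mode="none"
# each sampled ensemble member keeps its own output, which is the raw material for
# STEVE-style uncertainty estimates; reduce_mode="mean" averages the members.
def _demo_ensemble_net():
    states = tf.placeholder(tf.float32, [None, 4])
    q_net = EnsembleFeedForwardNet("q_demo", in_size=4, out_shape=[], layers=3,
                                   hidden_dim=32, ensemble_size=8,
                                   train_sample_count=4, eval_sample_count=8)
    q_samples = q_net(states, is_eval=True, reduce_mode="none")  # shape [batch, 8]
    q_mean = q_net(states, is_eval=True, reduce_mode="mean")     # shape [batch]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        qs, qm = sess.run([q_samples, q_mean], feed_dict={states: np.random.randn(2, 4)})
        print(qs.shape, qm.shape)  # (2, 8) (2,)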
class ReparamNormal(object):
"""Wrapper to make a feedforward network that outputs both mu and logsigma,
for use in the reparameterization trick."""
def __init__(self, base_net, name, in_size, out_shape, layers=2, hidden_dim=32, final_nonlinearity=None, ls_start_bias=0.0, final_net=FeedForwardNet, logsigma_min=-5., logsigma_max=2., **kwargs):
assert layers > 1
self.main_encoder = base_net(name+"_base", in_size, [hidden_dim], layers, hidden_dim, final_nonlinearity=tf.nn.relu, **kwargs)
self.mu = final_net(name+"_mu", hidden_dim, out_shape, layers=1, final_nonlinearity=final_nonlinearity, **kwargs)
self.logsigma = final_net(name+"_logsigma", hidden_dim, out_shape, layers=1, final_nonlinearity=None, **kwargs)
self.ls_start_bias = ls_start_bias
self.params_list = self.main_encoder.params_list + self.mu.params_list + self.logsigma.params_list
self.logsigma_min = logsigma_min
self.logsigma_max = logsigma_max
def __call__(self, x):
encoded = self.main_encoder(x)
mu = self.mu(encoded)
logsigma = tf.clip_by_value(self.logsigma(encoded) + self.ls_start_bias, self.logsigma_min, self.logsigma_max)
return mu, logsigma
def l2_loss(self):
return self.main_encoder.l2_loss() + self.mu.l2_loss() + self.logsigma.l2_loss()
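# Illustrative use of ReparamNormal (a sketch, not part of the original file; the
# helper name _demo_reparam_normal is hypothetical). The wrapper outputs (mu, logsigma),
# and a differentiable sample is drawn with the reparameterization trick:
# z = mu + exp(logsigma) * eps, with eps ~ N(0, I), so gradients flow through both heads.
def _demo_reparam_normal():
    x = tf.placeholder(tf.float32, [None, 10])
    dist = ReparamNormal(FeedForwardNet, "demo_normal", in_size=10, out_shape=[3],
                         layers=2, hidden_dim=32)
    mu, logsigma = dist(x)
    eps = tf.random_normal(tf.shape(mu))
    z = mu + tf.exp(logsigma) * eps  # sample from N(mu, exp(logsigma)^2), differentiable in mu and logsigma
    return z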
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
from builtins import zip
from builtins import str
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pickle
import multiprocessing
class ReplayBuffer(object):
"""
Stores frames sampled from the environment, with the ability to sample a batch
for training.
"""
def __init__(self, max_size, obs_dim, action_dim, roundrobin=True):
self.max_size = max_size
self.obs_dim = obs_dim
self.action_dim = action_dim
self.roundrobin = roundrobin
self.obs_buffer = np.zeros([max_size, obs_dim])
self.next_obs_buffer = np.zeros([max_size, obs_dim])
self.action_buffer = np.zeros([max_size, action_dim])
self.reward_buffer = np.zeros([max_size])
self.done_buffer = np.zeros([max_size])
self.count = 0
def random_batch(self, batch_size):
indices = np.random.randint(0, min(self.count, self.max_size), batch_size)
return (
self.obs_buffer[indices],
self.next_obs_buffer[indices],
self.action_buffer[indices],
self.reward_buffer[indices],
self.done_buffer[indices],
self.count
)
def add_replay(self, obs, next_obs, action, reward, done):
if self.count >= self.max_size:
if self.roundrobin: index = self.count % self.max_size
else: index = np.random.randint(0, self.max_size)
else:
index = self.count
self.obs_buffer[index] = obs
self.next_obs_buffer[index] = next_obs
self.action_buffer[index] = action
self.reward_buffer[index] = reward
self.done_buffer[index] = done
self.count += 1
def save(self, path, name):
def _save(datas, fnames):
print("saving replay buffer...")
for data, fname in zip(datas, fnames):
with open("%s.npz"%fname, "w") as f:
pickle.dump(data, f)
with open("%s/%s.count" % (path,name), "w") as f:
f.write(str(self.count))
print("...done saving.")
datas = [
self.obs_buffer,
self.next_obs_buffer,
self.action_buffer,
self.reward_buffer,
self.done_buffer
]
fnames = [
"%s/%s.obs_buffer" % (path, name),
"%s/%s.next_obs_buffer" % (path, name),
"%s/%s.action_buffer" % (path, name),
"%s/%s.reward_buffer" % (path, name),
"%s/%s.done_buffer" % (path, name)
]
proc = multiprocessing.Process(target=_save, args=(datas, fnames))
proc.start()
def load(self, path, name):
print("Loading %s replay buffer (may take a while...)" % name)
with open("%s/%s.obs_buffer.npz" % (path,name)) as f: self.obs_buffer = pickle.load(f)
with open("%s/%s.next_obs_buffer.npz" % (path,name)) as f: self.next_obs_buffer = pickle.load(f)
with open("%s/%s.action_buffer.npz" % (path,name)) as f: self.action_buffer = pickle.load(f)
with open("%s/%s.reward_buffer.npz" % (path,name)) as f: self.reward_buffer = pickle.load(f)
with open("%s/%s.done_buffer.npz" % (path,name)) as f: self.done_buffer = pickle.load(f)
with open("%s/%s.count" % (path,name), "r") as f: self.count = int(f.read())
from __future__ import division
from __future__ import print_function
from builtins import range
from past.utils import old_div
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
### Hyperparameters
NONTERMINAL_STATE_COUNT = 100
NOISE_AMOUNT = 0.1
TRAIN_STEPS = 10000
Q_ENSEMBLE_SIZE = 8
MODEL_ENSEMBLE_SIZE = 8
HORIZON = 5
TRIAL_N = 10
### Helper functions
initial_state = 0
terminal_state = NONTERMINAL_STATE_COUNT + 1
nonterminal_state_count = NONTERMINAL_STATE_COUNT
state_count = NONTERMINAL_STATE_COUNT + 1
final_reward = NONTERMINAL_STATE_COUNT
colors = sns.color_palette('husl', 4)
plt.rcParams["figure.figsize"] = (6,5)
def step(state):
if state == terminal_state: next_state = terminal_state
else: next_state = state + 1
if state == terminal_state: reward = 0
elif state+1 == terminal_state: reward = final_reward
else: reward = -1
return next_state, reward
def noisy_step(state):
if state == terminal_state: next_state = terminal_state
elif np.random.random([]) < NOISE_AMOUNT: next_state = np.random.randint(0, state_count)
else: next_state = state + 1
if state == terminal_state: reward = 0
elif state+1 == terminal_state: reward = final_reward
else: reward = -1
return next_state, reward
def get_error(Q):
losses = np.square(np.arange(state_count) - Q[:-1])
return np.mean(losses)
def downsample(array, factor):
pad_size = np.ceil(old_div(float(array.size),factor))*factor - array.size
array_padded = np.append(array, np.zeros([pad_size.astype(np.int64)])*np.NaN)
return np.nanmean(array_padded.reshape(-1, factor), axis=1)  # np.nanmean skips the NaN padding; scipy.nanmean was removed from SciPy
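# Why get_error compares Q[:-1] against np.arange(state_count): in this chain MDP a
# rollout from nonterminal state s pays -1 per step until the final transition into
# the terminal state, which pays final_reward, so the true value is
# Q*(s) = -(NONTERMINAL_STATE_COUNT - s) + NONTERMINAL_STATE_COUNT = s.
# A quick check by rolling the deterministic dynamics forward (illustrative helper,
# not used by the experiments below):
def true_value(s):
    total, state = 0, s
    while state != terminal_state:
        state, reward = step(state)
        total += reward
    return total  # equals s for every nonterminal state s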
######################
### Main experiments
######################
# Basic Q
if True:
print("Running basic Q-learning.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[state_count+1]).astype(np.float64)
Q[state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
Q[state] = reward + Q[next_state]
losses.append(get_error(Q))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="Basic Q-learning", color=colors[0])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[0])
with open('Toy-v1/baseline.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# Ensemble Q
if True:
print("Running ensemble Q-learning.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
Q[q_ensemble_i, state] = reward + np.mean(Q[:, next_state])
losses.append(get_error(np.mean(Q, axis=0)))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="Ensemble Q-learning", color=colors[1])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[1])
# Ensemble MVE-Oracle
if True:
print("Running ensemble oracle MVE.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# MVE rollout
target = reward
for _ in range(HORIZON):
next_state, reward = step(next_state)
target += reward
target += np.mean(Q[:,next_state])
Q[q_ensemble_i, state] = target
losses.append(get_error(np.mean(Q, axis=0)))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="MVE-oracle", color=colors[2])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[2])
with open('Toy-v1/mve_oracle.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# Ensemble MVE-Noisy
if True:
print("Running ensemble noisy MVE.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# MVE rollout
targets = []
first_next_state, first_reward = next_state, reward
for model_ensemble_i in range(MODEL_ENSEMBLE_SIZE):
next_state, reward = first_next_state, first_reward
target = reward
for _ in range(HORIZON):
next_state, reward = noisy_step(next_state)
target += reward
target += np.mean(Q[:,next_state])
targets.append(target)
Q[q_ensemble_i, state] = np.mean(targets)
losses.append(get_error(np.mean(Q, axis=0)))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="MVE-noisy", color=colors[2], linestyle='dotted')
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[2])
with open('Toy-v1/mve_noisy.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# STEVE-Oracle
if True:
print("Running ensemble oracle STEVE.")
trial_results = []
oracle_q_estimate_errors = []
oracle_mve_estimate_errors = []
oracle_steve_estimate_errors = []
oracle_opt_estimate_errors = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
q_estimate_errors = []
mve_estimate_errors = []
steve_estimate_errors = []
opt_estimate_errors = []
steve_beat_freq= []
for step_i in range(TRAIN_STEPS):
_q_estimate_errors = []
_mve_estimate_errors = []
_steve_estimate_errors = []
_opt_estimate_errors = []
_steve_beat_freq = []
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# STEVE rollout
Q_est_mat = np.zeros([HORIZON + 1, Q_ENSEMBLE_SIZE])
reward_est_mat = np.zeros([HORIZON + 1, 1])
first_next_state, first_reward = next_state, reward
next_state, reward = first_next_state, first_reward
Q_est_mat[0, :] = Q[:, next_state]
reward_est_mat[0, 0] = reward
for timestep_i in range(1,HORIZON+1):
next_state, reward = step(next_state)
Q_est_mat[timestep_i, :] = Q[:, next_state]
reward_est_mat[timestep_i, 0] = reward
all_targets = Q_est_mat + np.cumsum(reward_est_mat, axis=0)
# STEVE weight calculation
estimates = np.mean(all_targets, axis=1)
confidences = old_div(1., (np.var(all_targets, axis=1) + 1e-8))
coefficients = old_div(confidences, np.sum(confidences))
target = np.sum(estimates * coefficients)
Q[q_ensemble_i, state] = target
true_target = state + 1. if state != terminal_state else 0.
_q_estimate_errors.append(np.square(estimates[0] - true_target))
_mve_estimate_errors.append(np.square(estimates[-1] - true_target))
_steve_estimate_errors.append(np.square(np.sum(estimates * coefficients) - true_target))
_opt_estimate_errors.append(np.min(np.square(estimates - true_target)))
losses.append(get_error(np.mean(Q, axis=0)))
q_estimate_errors.append(np.mean(_q_estimate_errors))
mve_estimate_errors.append(np.mean(_mve_estimate_errors))
steve_estimate_errors.append(np.mean(_steve_estimate_errors))
opt_estimate_errors.append(np.mean(_opt_estimate_errors))
trial_results.append(losses)
oracle_q_estimate_errors.append(q_estimate_errors)
oracle_mve_estimate_errors.append(mve_estimate_errors)
oracle_steve_estimate_errors.append(steve_estimate_errors)
oracle_opt_estimate_errors.append(opt_estimate_errors)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="STEVE-oracle", color=colors[3])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[3])
with open('Toy-v1/steve_oracle.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# STEVE-Noisy
if True:
print("Running ensemble noisy STEVE.")
trial_results = []
noisy_q_estimate_errors = []
noisy_mve_estimate_errors = []
noisy_steve_estimate_errors = []
noisy_opt_estimate_errors = []
noisy_steve_beat_freq = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
q_estimate_errors = []
mve_estimate_errors = []
steve_estimate_errors = []
opt_estimate_errors = []
steve_beat_freq= []
for step_i in range(TRAIN_STEPS):
_q_estimate_errors = []
_mve_estimate_errors = []
_steve_estimate_errors = []
_opt_estimate_errors = []
_steve_beat_freq = []
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# STEVE rollout
Q_est_mat = np.zeros([HORIZON + 1, MODEL_ENSEMBLE_SIZE, Q_ENSEMBLE_SIZE])
reward_est_mat = np.zeros([HORIZON + 1, MODEL_ENSEMBLE_SIZE, 1])
first_next_state, first_reward = next_state, reward
for model_ensemble_i in range(MODEL_ENSEMBLE_SIZE):
next_state, reward = first_next_state, first_reward
Q_est_mat[0, model_ensemble_i, :] = Q[:, next_state]
reward_est_mat[0, model_ensemble_i, 0] = reward
for timestep_i in range(1,HORIZON+1):
next_state, reward = noisy_step(next_state)
Q_est_mat[timestep_i, model_ensemble_i, :] = Q[:, next_state]
reward_est_mat[timestep_i, model_ensemble_i, 0] = reward
all_targets = Q_est_mat + np.cumsum(reward_est_mat, axis=0)
# STEVE weight calculation
all_targets = np.reshape(all_targets, [HORIZON+1, MODEL_ENSEMBLE_SIZE * Q_ENSEMBLE_SIZE])
estimates = np.mean(all_targets, axis=1)
confidences = old_div(1., (np.var(all_targets, axis=1) + 1e-8))
coefficients = old_div(confidences, np.sum(confidences))
target = np.sum(estimates * coefficients)
# target = estimates[0]
Q[q_ensemble_i, state] = target
true_target = state + 1. if state != terminal_state else 0.
_q_estimate_errors.append(np.square(estimates[0] - true_target))
_mve_estimate_errors.append(np.square(estimates[-1] - true_target))
_steve_estimate_errors.append(np.square(np.sum(estimates * coefficients) - true_target))
_opt_estimate_errors.append(np.min(np.square(estimates - true_target)))
_steve_beat_freq.append(float(np.square(estimates[0] - true_target) > np.square(target - true_target)))
losses.append(get_error(np.mean(Q, axis=0)))
q_estimate_errors.append(np.mean(_q_estimate_errors))
mve_estimate_errors.append(np.mean(_mve_estimate_errors))
steve_estimate_errors.append(np.mean(_steve_estimate_errors))
opt_estimate_errors.append(np.mean(_opt_estimate_errors))
steve_beat_freq.append(np.mean(_steve_beat_freq))
trial_results.append(losses)
noisy_q_estimate_errors.append(q_estimate_errors)
noisy_mve_estimate_errors.append(mve_estimate_errors)
noisy_steve_estimate_errors.append(steve_estimate_errors)
noisy_opt_estimate_errors.append(opt_estimate_errors)
noisy_steve_beat_freq.append(steve_beat_freq)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="STEVE-noisy", color=colors[3], linestyle='dotted')
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[3])
with open('Toy-v1/steve_noisy.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# ### Display results
# plt.title("Comparison of convergence rates")
# plt.legend()
# plt.savefig("comparison.pdf")
# plt.show()
#
# ### Display secondary results - error comparison
# DOWNSAMPLE = 50
# colors = sns.color_palette('husl', 8)
# for i, (error_curve, label) in enumerate([
# (oracle_q_estimate_errors, "Oracle Q error"),
# (oracle_mve_estimate_errors, "Oracle MVE error"),
# (oracle_steve_estimate_errors, "Oracle STEVE error"),
# # (oracle_opt_estimate_errors, "Oracle minimum single-estimate error"),
# ]):
# result = np.stack(error_curve, axis=1)
# means = downsample(np.mean(result, axis=1), DOWNSAMPLE)
# stdevs = downsample(np.std(result, axis=1), DOWNSAMPLE)
# plt.plot(means, label=label, color=colors[i])
# plt.fill_between(np.arange(means.shape[0]), means - stdevs, means + stdevs, alpha=.2, color=colors[i])
#
# plt.title("Comparison of errors for oracle dynamics")
# plt.legend()
# plt.show()
#
# for i, (error_curve, label) in enumerate([
# (noisy_q_estimate_errors, "Noisy Q error"),
# (noisy_mve_estimate_errors, "Noisy MVE error"),
# (noisy_steve_estimate_errors, "Noisy STEVE error"),
# # (noisy_opt_estimate_errors, "Noisy minimum single-estimate error"),
# # (trial_steve_beat_freq, "STEVE beat freq"),
# ]):
# result = np.stack(error_curve, axis=1)
# means = downsample(np.mean(result, axis=1), DOWNSAMPLE)
# stdevs = downsample(np.std(result, axis=1), DOWNSAMPLE)
# plt.plot(means, label=label, color=colors[i])
# plt.fill_between(np.arange(means.shape[0]), means - stdevs, means + stdevs, alpha=.2, color=colors[i])
#
# plt.title("Comparison of errors for noisy dynamics")
# plt.legend()
# plt.show()