Unverified Commit e10d986e authored by Lukasz Kaiser, committed by GitHub

Merge pull request #4642 from buckman-google/master

Addition of STEVE
parents ee0e9d11 f789dcf5
{"inherits": ["config/algos/steve.json", "config/envs/walker2d.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/walker2d.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/flagrun.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/ddpg.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/mve_tdk.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
{"inherits": ["config/algos/steve.json", "config/envs/humanoid.json", "config/experimental_setups/speedrun.json"]}
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
try:
    import roboschool  # optional dependency: registers the Roboschool environments with gym
except ImportError:
    pass
import gym
import numpy as np
from config import config
MAX_FRAMES = config["env"]["max_frames"]
gym.logger.level = 40  # only report gym errors (suppress warnings)
def get_env(env_name, *args, **kwargs):
MAPPING = {
"CartPole-v0": CartPoleWrapper,
}
if env_name in MAPPING: return MAPPING[env_name](env_name, *args, **kwargs)
else: return NoTimeLimitMujocoWrapper(env_name, *args, **kwargs)
class GymWrapper(object):
"""
Generic wrapper for OpenAI gym environments.
"""
def __init__(self, env_name):
self.internal_env = gym.make(env_name)
self.observation_space = self.internal_env.observation_space
self.action_space = self.internal_env.action_space
self.custom_init()
def custom_init(self):
pass
def reset(self):
self.clock = 0
return self.preprocess_obs(self.internal_env.reset())
# returns normalized actions
def sample(self):
return self.action_space.sample()
# this is used for converting continuous approximations back to the original domain
def normalize_actions(self, actions):
return actions
# maps actions into the form the policy predicts (e.g. a continuous range); by default applied to the output of sample()
def unnormalize_actions(self, actions):
return actions
def preprocess_obs(self, obs):
# return np.append(obs, [self.clock/float(MAX_FRAMES)])
return obs
def step(self, normalized_action):
out = self.internal_env.step(normalized_action)
self.clock += 1
obs, reward, done = self.preprocess_obs(out[0]), out[1], float(out[2])
reset = done == 1. or self.clock == MAX_FRAMES
return obs, reward, done, reset
def render_rollout(self, states):
# states: a numpy array of shape [timesteps, state_dim]
self.internal_env.reset()
for state in states:
self.internal_env.env.state = state
self.internal_env.render()
class CartPoleWrapper(GymWrapper):
"""
Wrap CartPole.
"""
def sample(self):
return np.array([np.random.uniform(0., 1.)])
def normalize_actions(self, action):
return 1 if action[0] >= 0 else 0
def unnormalize_actions(self, action):
return 2. * action - 1.
class NoTimeLimitMujocoWrapper(GymWrapper):
"""
Wrap Mujoco-style environments, removing the time-limit termination condition.
This is needed to keep the environment Markovian.
"""
def __init__(self, env_name):
self.internal_env = gym.make(env_name).env
self.observation_space = self.internal_env.observation_space
self.action_space = self.internal_env.action_space
self.custom_init()
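# Usage sketch for the wrappers above (illustrative; `envwrap` is an assumed module
# name for this file, and "CartPole-v0" is the one environment explicitly mapped in
# get_env). Note that step() returns (obs, reward, done, reset): `done` is the
# environment's terminal signal, while `reset` additionally fires when the
# MAX_FRAMES time limit is reached.
from envwrap import get_env

env = get_env("CartPole-v0")
obs = env.reset()
for _ in range(10):
    action = env.sample()                                   # sample an action
    obs, reward, done, reset = env.step(env.normalize_actions(action))
    if reset:
        obs = env.reset()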
from __future__ import division
from __future__ import print_function
from builtins import zip
from builtins import range
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import traceback, threading, time, warnings
import tensorflow as tf
import numpy as np
import util
from replay import ReplayBuffer
class Learner(object):
"""
Generic object which runs the main training loop of anything that trains using
a replay buffer. Handles updating, logging, saving/loading, batching, etc.
"""
def __init__(self, interactor_queue, lock, config, env_config, learner_config, **bonus_kwargs):
self.learner_name = self.learner_name()
self.interactor_queue = interactor_queue
self.learner_lock = lock
self.config = config
self.env_config = env_config
self.learner_config = learner_config
self.bonus_kwargs = bonus_kwargs
self.kill_threads = False
self.permit_desync = False
self.need_frames_notification = threading.Condition()
self._reset_inspections()
self.total_frames = 0
self.save_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"], self.config["save_model_path"]))
self.log_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"], self.config["log_path"])) + "/%s.log" % self.learner_name
# replay buffer to store data
self.replay_buffer_lock = threading.RLock()
self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
np.prod(self.env_config["obs_dims"]),
self.env_config["action_dim"])
# data loaders pull data from the replay buffer and put it into the tfqueue for model usage
self.data_loaders = self.make_loader_placeholders()
queue_capacity = int(np.ceil(1. / self.learner_config["frames_per_update"])) if self.learner_config["frames_per_update"] else 100
self.tf_queue = tf.FIFOQueue(capacity=queue_capacity, dtypes=[dl.dtype for dl in self.data_loaders])
self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
self.current_batch = self.tf_queue.dequeue()
# build the TF graph for the actual model to train
self.core, self.train_losses, self.train_ops, self.inspect_losses = self.make_core_model()
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
## Mandatory functions to override
def learner_name(self): raise Exception('unimplemented: learner_name')
def make_loader_placeholders(self): raise Exception('unimplemented: make_loader_placeholders')
def make_core_model(self): raise Exception('unimplemented: make_core_model')
## Optional functions to override
def initialize(self): warnings.warn('unimplemented: initialize')
def resume_from_checkpoint(self, epoch): warnings.warn('unimplemented: resume_from_checkpoint')
def checkpoint(self): warnings.warn('unimplemented: checkpoint')
def backup(self): warnings.warn('unimplemented: backup')
## Internal functions
def _start(self):
# fetch data from the interactors to pre-fill the replay buffer
self.prefetch_thread = threading.Thread(target=self._poll_interactors, args=(True, self.learner_config["frames_before_learning"],))
self.prefetch_thread.start()
self.prefetch_thread.join()
# start the interactor and data loader
self.data_load_thread = threading.Thread(target=self._run_enqueue_data)
self.data_load_thread.start()
# initialize the learner, pretraining if needed
if self.config["resume"]: self._resume_from_checkpoint()
else: self._initialize()
# re-sync everything, and start up interactions with the environment
self.interactor_poll_thread = threading.Thread(target=self._poll_interactors)
self.interactor_poll_thread.start()
# start the clock
self._last_checkpoint_time = time.time()
def _learn(self, permit_desync=False, log=True, checkpoint=True, backup=True):
# this is to keep the frames/update synced properly
if self.learner_config["frames_per_update"] is not False and not permit_desync:
if not self._have_enough_frames():
with self.need_frames_notification:
self.need_frames_notification.notify()
return
# log
if log and (self.update_i + 1) % self.learner_config["log_every_n"] == 0:
self._log()
# checkpoint
if checkpoint and (self.update_i + 1) % self.learner_config["epoch_every_n"] == 0:
self._checkpoint()
# backup
if backup and (self.update_i + 1) % self.learner_config["backup_every_n"] == 0:
self._backup()
# train
self._training_step()
def _have_enough_frames(self):
gathered_frames = self.total_frames - self.learner_config["frames_before_learning"]
return gathered_frames > self.learner_config["frames_per_update"] * self.update_i
def _initialize(self):
self.epoch = 0
self.update_i = 0
self.hours = 0
self._last_checkpoint_time = time.time()
self.initialize()
if self.learner_config["pretrain_n"]: self._pretrain()
self._checkpoint()
def _pretrain(self):
for _ in range(self.learner_config["pretrain_n"]):
self._learn(permit_desync=True, checkpoint=False, backup=False)
self.epoch = 0
self.update_i = 0
def _resume_from_checkpoint(self):
epoch = util.get_largest_epoch_in_dir(self.save_path, self.core.saveid)
if not self.config['keep_all_replay_buffers']: util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
if epoch is False:
raise Exception("Tried to reload but no model found")
with self.learner_lock:
self.core.load(self.sess, self.save_path, epoch)
self.epoch, self.update_i, self.total_frames, self.hours = self.sess.run([self.core.epoch_n, self.core.update_n, self.core.frame_n, self.core.hours])
with self.replay_buffer_lock:
self.replay_buffer.load(self.save_path, '%09d_%s' % (epoch, self.learner_name))
self.resume_from_checkpoint(epoch)
def _log(self):
logstring = "(%3.2f sec) h%-8.2f e%-8d s%-8d f%-8d\t" % (time.time() - self._log_time, self.hours, self.epoch, self.update_i + 1, self.total_frames) + ', '.join(["%8f" % x for x in (self.running_total / self.denom).tolist()])
print("%s\t%s" % (self.learner_name, logstring))
with open(self.log_path, "a") as f: f.write(logstring + "\n")
self._reset_inspections()
def _reset_inspections(self):
self.running_total = 0.
self.denom = 0.
self._log_time = time.time()
def _checkpoint(self):
self.checkpoint()
self.epoch += 1
self.hours += (time.time() - self._last_checkpoint_time) / 3600.
self._last_checkpoint_time = time.time()
self.core.update_epoch(self.sess, self.epoch, self.update_i, self.total_frames, self.hours)
with self.learner_lock: self.core.save(self.sess, self.save_path)
def _backup(self):
self.backup()
if not self.learner_config['keep_all_replay_buffers']: util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
with self.learner_lock:
self.core.save(self.sess, self.save_path, self.epoch)
with self.replay_buffer_lock:
self.replay_buffer.save(self.save_path, '%09d_%s' % (self.epoch, self.learner_name))
def _training_step(self):
train_ops = tuple([op for op, loss in zip(self.train_ops, self.train_losses) if loss is not None])
outs = self.sess.run(train_ops + self.inspect_losses)
self.running_total += np.array(outs[len(train_ops):])
self.denom += 1.
self.update_i += 1
def _poll_interactors(self, continuous_poll=False, frames_before_terminate=None):
# poll the interactors for new frames.
# the synced_condition semaphore prevents this from consuming too much CPU
while not self.kill_threads:
if self.learner_config["frames_per_update"] is not False and not continuous_poll:
with self.need_frames_notification: self.need_frames_notification.wait()
while not self.interactor_queue.empty():
new_frames = self.interactor_queue.get()
self._add_frames(new_frames)
if frames_before_terminate and self.total_frames >= frames_before_terminate: return
def _add_frames(self, frames):
with self.replay_buffer_lock:
for frame in frames:
self.replay_buffer.add_replay(*frame)
self.total_frames = self.replay_buffer.count
return self.total_frames
def _run_enqueue_data(self):
while not self.kill_threads:
data = self.replay_buffer.random_batch(self.learner_config["batch_size"])
self.sess.run(self.enqueue_op, feed_dict=dict(list(zip(self.data_loaders, data))))
def _kill_threads(self):
self.kill_threads = True
class CoreModel(object):
"""The base class for the "core" of learners."""
def __init__(self, name, env_config, learner_config):
self.name = self.saveid + "/" + name
self.env_config = env_config
self.learner_config = learner_config
with tf.variable_scope(self.name):
self.epoch_n = tf.get_variable('epoch_n', [], initializer=tf.constant_initializer(0), dtype=tf.int64, trainable=False)
self.update_n = tf.get_variable('update_n', [], initializer=tf.constant_initializer(0), dtype=tf.int64, trainable=False)
self.frame_n = tf.get_variable('frame_n', [], initializer=tf.constant_initializer(0), dtype=tf.int64, trainable=False)
self.hours = tf.get_variable('hours', [], initializer=tf.constant_initializer(0.), dtype=tf.float64, trainable=False)
self.epoch_n_placeholder = tf.placeholder(tf.int64, [])
self.update_n_placeholder = tf.placeholder(tf.int64, [])
self.frame_n_placeholder = tf.placeholder(tf.int64, [])
self.hours_placeholder = tf.placeholder(tf.float64, [])
self.assign_epoch_op = [tf.assign(self.epoch_n, self.epoch_n_placeholder), tf.assign(self.update_n, self.update_n_placeholder), tf.assign(self.frame_n, self.frame_n_placeholder), tf.assign(self.hours, self.hours_placeholder)]
self.create_params(env_config, learner_config)
self.model_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
self.saver = tf.train.Saver(self.model_params)
@property
def saveid(self):
raise Exception("specify a save ID")
def create_params(self, env_config, learner_config):
raise Exception("unimplemented")
def update_epoch(self, sess, epoch, updates, frames, hours):
sess.run(self.assign_epoch_op, feed_dict={self.epoch_n_placeholder: int(epoch), self.update_n_placeholder: int(updates), self.frame_n_placeholder: int(frames), self.hours_placeholder: float(hours)})
def save(self, sess, path, epoch=None):
if epoch is None: self.saver.save(sess, path + "/%s.params" % self.saveid)
else: self.saver.save(sess, path + "/%09d_%s.params" % (epoch, self.saveid))
def load(self, sess, path, epoch=None):
if epoch is None: self.saver.restore(sess, path + "/%s.params" % self.saveid)
else: self.saver.restore(sess, path + "/%09d_%s.params" % (epoch, self.saveid))
def run_learner(learner_subclass, queue, lock, config, env_config, learner_config, **bonus_kwargs):
learner = learner_subclass(queue, lock, config, env_config, learner_config, **bonus_kwargs)
try:
learner._start()
while True: learner._learn()
except Exception as e:
print('Caught exception in learner process')
traceback.print_exc()
learner._kill_threads()
print()
raise e
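# Minimal sketch of how a concrete learner plugs into the framework above
# (illustrative only: ToyCore/ToyLearner are hypothetical names, and the project's
# real learners live in separate modules such as valuerl_learner and
# worldmodel_learner). A subclass must provide a save ID and parameters for the
# CoreModel, plus the three mandatory Learner overrides: learner_name,
# make_loader_placeholders, and make_core_model.
class ToyCore(CoreModel):
    @property
    def saveid(self):
        return "toy"

    def create_params(self, env_config, learner_config):
        # a single trainable scalar, just so there is something to optimize
        self.value = tf.get_variable("value", [], initializer=tf.constant_initializer(0.))

class ToyLearner(Learner):
    def learner_name(self):
        return "toy"

    def make_loader_placeholders(self):
        # must match the 6-tuple produced by ReplayBuffer.random_batch
        obs_dim = int(np.prod(self.env_config["obs_dims"]))
        action_dim = self.env_config["action_dim"]
        self.obs = tf.placeholder(tf.float32, [None, obs_dim])
        self.next_obs = tf.placeholder(tf.float32, [None, obs_dim])
        self.action = tf.placeholder(tf.float32, [None, action_dim])
        self.reward = tf.placeholder(tf.float32, [None])
        self.done = tf.placeholder(tf.float32, [None])
        self.frame_count = tf.placeholder(tf.int64, [])
        return [self.obs, self.next_obs, self.action, self.reward, self.done, self.frame_count]

    def make_core_model(self):
        core = ToyCore("main", self.env_config, self.learner_config)
        obs, next_obs, action, reward, done, _ = self.current_batch
        loss = tf.reduce_mean(tf.square(reward - core.value))  # toy regression objective
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
        return core, (loss,), (train_op,), (loss,)

# A learner defined this way is launched like the real ones, e.g.
# run_learner(ToyLearner, queue, lock, config, config["env"], config["policy_config"]).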
from builtins import str
from builtins import range
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing
import os, sys, time
from config import config, log_config
import util
AGENT_COUNT = config["agent_config"]["count"]
EVALUATOR_COUNT = config["evaluator_config"]["count"]
MODEL_AUGMENTED = config["model_config"] is not False
if config["resume"]:
ROOT_PATH = "output/" + config["env"]["name"] + "/" + config["name"]
else:
ROOT_PATH = util.create_and_wipe_directory("output/" + config["env"]["name"] + "/" + config["name"])
log_config()
import learner, agent, valuerl_learner
if MODEL_AUGMENTED: import worldmodel_learner
if __name__ == '__main__':
all_procs = set([])
interaction_procs = set([])
# lock
policy_lock = multiprocessing.Lock()
model_lock = multiprocessing.Lock() if MODEL_AUGMENTED else None
# queue
policy_replay_frame_queue = multiprocessing.Queue(1)
model_replay_frame_queue = multiprocessing.Queue(1) if MODEL_AUGMENTED else None
# interactors
for interact_proc_i in range(AGENT_COUNT):
interact_proc = multiprocessing.Process(target=agent.main, args=(interact_proc_i, False, policy_replay_frame_queue, model_replay_frame_queue, policy_lock, config))
all_procs.add(interact_proc)
interaction_procs.add(interact_proc)
# evaluators
for interact_proc_i in range(EVALUATOR_COUNT):
interact_proc = multiprocessing.Process(target=agent.main, args=(interact_proc_i, True, policy_replay_frame_queue, model_replay_frame_queue, policy_lock, config))
all_procs.add(interact_proc)
interaction_procs.add(interact_proc)
# policy training
train_policy_proc = multiprocessing.Process(target=learner.run_learner, args=(valuerl_learner.ValueRLLearner, policy_replay_frame_queue, policy_lock, config, config["env"], config["policy_config"]), kwargs={"model_lock": model_lock})
all_procs.add(train_policy_proc)
# model training
if MODEL_AUGMENTED:
train_model_proc = multiprocessing.Process(target=learner.run_learner, args=(worldmodel_learner.WorldmodelLearner, model_replay_frame_queue, model_lock, config, config["env"], config["model_config"]))
all_procs.add(train_model_proc)
# start all policies
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
for i, proc in enumerate(interaction_procs):
os.environ['CUDA_VISIBLE_DEVICES'] = ''
proc.start()
os.environ['CUDA_VISIBLE_DEVICES'] = str(int(sys.argv[2]))
train_policy_proc.start()
if MODEL_AUGMENTED:
os.environ['CUDA_VISIBLE_DEVICES'] = str(1+int(sys.argv[2]))
train_model_proc.start()
# keep the main process alive without busy-waiting; join the children on interrupt
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    for proc in all_procs: proc.join()
from builtins import range
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
from itertools import product
class FeedForwardNet(object):
"""Custom feed-forward network layer."""
def __init__(self, name, in_size, out_shape, layers=1, hidden_dim=32, final_nonlinearity=None, get_uncertainty=False):
self.name = name
self.in_size = in_size
self.out_shape = out_shape
self.out_size = np.prod(out_shape)
self.layers = layers
self.hidden_dim = hidden_dim
self.final_nonlinearity = (lambda x:x) if final_nonlinearity is None else final_nonlinearity
self.get_uncertainty = get_uncertainty
self.weights = [None] * layers
self.biases = [None] * layers
self.params_list = []
with tf.variable_scope(name):
for layer_i in range(self.layers):
in_size = self.hidden_dim
out_size = self.hidden_dim
if layer_i == 0: in_size = self.in_size
if layer_i == self.layers - 1: out_size = self.out_size
self.weights[layer_i] = tf.get_variable("weights%d" % layer_i, [in_size, out_size], initializer=tf.contrib.layers.xavier_initializer())
self.biases[layer_i] = tf.get_variable("bias%d" % layer_i, [1, out_size], initializer=tf.constant_initializer(0.0))
self.params_list += [self.weights[layer_i], self.biases[layer_i]]
def __call__(self, x, stop_params_gradient=False, is_eval=True, ensemble_idxs=None, pre_expanded=None, reduce_mode="none"):
original_shape = tf.shape(x)
h = tf.reshape(x, [-1, self.in_size])
for layer_i in range(self.layers):
nonlinearity = tf.nn.relu if layer_i + 1 < self.layers else self.final_nonlinearity
if stop_params_gradient: h = nonlinearity(tf.matmul(h, tf.stop_gradient(self.weights[layer_i])) + tf.stop_gradient(self.biases[layer_i]))
else: h = nonlinearity(tf.matmul(h, self.weights[layer_i]) + self.biases[layer_i])
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, original_shape[:-1])
if pre_expanded is None: pre_expanded = ensemble_idxs is not None
if reduce_mode == "none" and not pre_expanded and self.get_uncertainty:
if len(self.out_shape) > 0: h = tf.expand_dims(h, -2)
else: h = tf.expand_dims(h, -1)
return h
def l2_loss(self):
return tf.add_n([tf.reduce_sum(.5 * tf.square(mu)) for mu in self.params_list])
class BayesianDropoutFeedForwardNet(FeedForwardNet):
"""Custom feed-forward network layer, with dropout as a Bayesian approximation."""
def __init__(self, name, in_size, out_shape, layers=1, hidden_dim=32, final_nonlinearity=None, get_uncertainty=False, keep_prob=.5, eval_sample_count=2, consistent_random_seed=False):
super(BayesianDropoutFeedForwardNet, self).__init__(name, in_size, out_shape, layers=layers, hidden_dim=hidden_dim,
final_nonlinearity=final_nonlinearity, get_uncertainty=get_uncertainty)
self.keep_prob = keep_prob
self.eval_sample_count = eval_sample_count
if eval_sample_count < 2: raise Exception("eval_sample_count must be at least 2 to estimate uncertainty")
self.dropout_seed = tf.random_uniform([layers], maxval=1e18, dtype=tf.int64) if consistent_random_seed else [None] * layers
def __call__(self, x, stop_params_gradient=False, is_eval=True, pre_expanded=False, ensemble_idxs=None, reduce_mode="none"):
if is_eval:
x = tf.tile(tf.expand_dims(x,0), tf.concat([tf.constant([self.eval_sample_count]), tf.ones_like(tf.shape(x))], 0))
original_shape = tf.shape(x)
h = tf.reshape(x, [-1, self.in_size])
for layer_i in range(self.layers):
nonlinearity = tf.nn.relu if layer_i + 1 < self.layers else self.final_nonlinearity
if layer_i > 0: h = tf.nn.dropout(h, keep_prob=self.keep_prob, seed=self.dropout_seed[layer_i])
if stop_params_gradient: h = nonlinearity(tf.matmul(h, tf.stop_gradient(self.weights[layer_i])) + tf.stop_gradient(self.biases[layer_i]))
else: h = nonlinearity(tf.matmul(h, self.weights[layer_i]) + self.biases[layer_i])
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, original_shape[:-1])
if is_eval:
h, uncertainty = tf.nn.moments(h, 0)
if self.get_uncertainty: return h, uncertainty
else: return h
else:
return h
class EnsembleFeedForwardNet(FeedForwardNet):
"""Custom feed-forward network layer with an ensemble."""
def __init__(self, name, in_size, out_shape, layers=1, hidden_dim=32, final_nonlinearity=None, get_uncertainty=False, ensemble_size=2, train_sample_count=2, eval_sample_count=2):
if train_sample_count > ensemble_size: raise Exception("train_sample_count cannot be larger than ensemble size")
if eval_sample_count > ensemble_size: raise Exception("eval_sample_count cannot be larger than ensemble size")
self.name = name
self.in_size = in_size
self.out_shape = out_shape
self.out_size = np.prod(out_shape)
self.layers = layers
self.hidden_dim = hidden_dim
self.final_nonlinearity = (lambda x:x) if final_nonlinearity is None else final_nonlinearity
self.get_uncertainty = get_uncertainty
self.ensemble_size = ensemble_size
self.train_sample_count = train_sample_count
self.eval_sample_count = eval_sample_count
self.weights = [None] * layers
self.biases = [None] * layers
self.params_list = []
with tf.variable_scope(name):
for layer_i in range(self.layers):
in_size = self.hidden_dim
out_size = self.hidden_dim
if layer_i == 0: in_size = self.in_size
if layer_i == self.layers - 1: out_size = self.out_size
self.weights[layer_i] = tf.get_variable("weights%d" % layer_i, [ensemble_size, in_size, out_size], initializer=tf.contrib.layers.xavier_initializer())
self.biases[layer_i] = tf.get_variable("bias%d" % layer_i, [ensemble_size, out_size], initializer=tf.constant_initializer(0.0))
self.params_list += [self.weights[layer_i], self.biases[layer_i]]
def __call__(self, x, stop_params_gradient=False, is_eval=True, ensemble_idxs=None, pre_expanded=None, reduce_mode="none"):
if pre_expanded is None: pre_expanded = ensemble_idxs is not None
if ensemble_idxs is None:
ensemble_idxs = tf.random_shuffle(tf.range(self.ensemble_size))
ensemble_sample_n = self.eval_sample_count if is_eval else self.train_sample_count
ensemble_idxs = ensemble_idxs[:ensemble_sample_n]
else:
ensemble_sample_n = tf.shape(ensemble_idxs)[0]
weights = [tf.gather(w, ensemble_idxs, axis=0) for w in self.weights]
biases = [tf.expand_dims(tf.gather(b, ensemble_idxs, axis=0),0) for b in self.biases]
original_shape = tf.shape(x)
if pre_expanded: h = tf.reshape(x, [-1, ensemble_sample_n, self.in_size])
else: h = tf.tile(tf.reshape(x, [-1, 1, self.in_size]), [1, ensemble_sample_n, 1])
for layer_i in range(self.layers):
nonlinearity = tf.nn.relu if layer_i + 1 < self.layers else self.final_nonlinearity
if stop_params_gradient: h = nonlinearity(tf.einsum('bri,rij->brj', h, tf.stop_gradient(weights[layer_i])) + tf.stop_gradient(biases[layer_i]))
else: h = nonlinearity(tf.einsum('bri,rij->brj', h, weights[layer_i]) + biases[layer_i])
if pre_expanded:
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, original_shape[:-1])
else:
if len(self.out_shape) > 0: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant([ensemble_sample_n]), tf.constant(self.out_shape)], -1))
else: h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant([ensemble_sample_n])], -1))
if reduce_mode == "none":
pass
elif reduce_mode == "random":
if len(self.out_shape) > 0: h = tf.reduce_sum(h * tf.reshape(tf.one_hot(tf.random_uniform([tf.shape(h)[0]], 0, ensemble_sample_n, dtype=tf.int64), ensemble_sample_n), tf.concat([tf.shape(h)[:1], tf.ones_like(tf.shape(h)[1:-2]), tf.constant([ensemble_sample_n]), tf.constant([1])], 0)), -2)
else: h = tf.reduce_sum(h * tf.reshape(tf.one_hot(tf.random_uniform([tf.shape(h)[0]], 0, ensemble_sample_n, dtype=tf.int64), ensemble_sample_n), tf.concat([tf.shape(h)[:1], tf.ones_like(tf.shape(h)[1:-1]), tf.constant([ensemble_sample_n])], 0)), -1)
elif reduce_mode == "mean":
if len(self.out_shape) > 0: h = tf.reduce_mean(h, -2)
else: h = tf.reduce_mean(h, -1)
else: raise Exception("use a valid reduce mode: none, random, or mean")
return h
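# Illustrative usage of EnsembleFeedForwardNet (a sketch, not part of the original
# file; the helper name _demo_ensemble_net is hypothetical). With reduce_mode="none"
# each sampled ensemble member keeps its own output, which is the raw material for
# STEVE-style uncertainty estimates; reduce_mode="mean" averages the members.
def _demo_ensemble_net():
    states = tf.placeholder(tf.float32, [None, 4])
    q_net = EnsembleFeedForwardNet("q_demo", in_size=4, out_shape=[], layers=3,
                                   hidden_dim=32, ensemble_size=8,
                                   train_sample_count=4, eval_sample_count=8)
    q_samples = q_net(states, is_eval=True, reduce_mode="none")  # shape [batch, 8]
    q_mean = q_net(states, is_eval=True, reduce_mode="mean")     # shape [batch]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        qs, qm = sess.run([q_samples, q_mean], feed_dict={states: np.random.randn(2, 4)})
        print(qs.shape, qm.shape)  # (2, 8) (2,)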
class ReparamNormal(object):
"""Wrapper to make a feedforward network that outputs both mu and logsigma,
for use in the reparameterization trick."""
def __init__(self, base_net, name, in_size, out_shape, layers=2, hidden_dim=32, final_nonlinearity=None, ls_start_bias=0.0, final_net=FeedForwardNet, logsigma_min=-5., logsigma_max=2., **kwargs):
assert layers > 1
self.main_encoder = base_net(name+"_base", in_size, [hidden_dim], layers, hidden_dim, final_nonlinearity=tf.nn.relu, **kwargs)
self.mu = final_net(name+"_mu", hidden_dim, out_shape, layers=1, final_nonlinearity=final_nonlinearity, **kwargs)
self.logsigma = final_net(name+"_logsigma", hidden_dim, out_shape, layers=1, final_nonlinearity=None, **kwargs)
self.ls_start_bias = ls_start_bias
self.params_list = self.main_encoder.params_list + self.mu.params_list + self.logsigma.params_list
self.logsigma_min = logsigma_min
self.logsigma_max = logsigma_max
def __call__(self, x):
encoded = self.main_encoder(x)
mu = self.mu(encoded)
logsigma = tf.clip_by_value(self.logsigma(encoded) + self.ls_start_bias, self.logsigma_min, self.logsigma_max)
return mu, logsigma
def l2_loss(self):
return self.main_encoder.l2_loss() + self.mu.l2_loss() + self.logsigma.l2_loss()
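# Illustrative use of ReparamNormal (a sketch, not part of the original file; the
# helper name _demo_reparam_normal is hypothetical). The wrapper outputs (mu, logsigma),
# and a differentiable sample is drawn with the reparameterization trick:
# z = mu + exp(logsigma) * eps, with eps ~ N(0, I), so gradients flow through both heads.
def _demo_reparam_normal():
    x = tf.placeholder(tf.float32, [None, 10])
    dist = ReparamNormal(FeedForwardNet, "demo_normal", in_size=10, out_shape=[3],
                         layers=2, hidden_dim=32)
    mu, logsigma = dist(x)
    eps = tf.random_normal(tf.shape(mu))
    z = mu + tf.exp(logsigma) * eps  # sample from N(mu, exp(logsigma)^2), differentiable in mu and logsigma
    return z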
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
from builtins import zip
from builtins import str
from builtins import object
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pickle
import multiprocessing
class ReplayBuffer(object):
"""
Stores frames sampled from the environment, with the ability to sample a batch
for training.
"""
def __init__(self, max_size, obs_dim, action_dim, roundrobin=True):
self.max_size = max_size
self.obs_dim = obs_dim
self.action_dim = action_dim
self.roundrobin = roundrobin
self.obs_buffer = np.zeros([max_size, obs_dim])
self.next_obs_buffer = np.zeros([max_size, obs_dim])
self.action_buffer = np.zeros([max_size, action_dim])
self.reward_buffer = np.zeros([max_size])
self.done_buffer = np.zeros([max_size])
self.count = 0
def random_batch(self, batch_size):
indices = np.random.randint(0, min(self.count, self.max_size), batch_size)
return (
self.obs_buffer[indices],
self.next_obs_buffer[indices],
self.action_buffer[indices],
self.reward_buffer[indices],
self.done_buffer[indices],
self.count
)
def add_replay(self, obs, next_obs, action, reward, done):
if self.count >= self.max_size:
if self.roundrobin: index = self.count % self.max_size
else: index = np.random.randint(0, self.max_size)
else:
index = self.count
self.obs_buffer[index] = obs
self.next_obs_buffer[index] = next_obs
self.action_buffer[index] = action
self.reward_buffer[index] = reward
self.done_buffer[index] = done
self.count += 1
def save(self, path, name):
def _save(datas, fnames):
print("saving replay buffer...")
for data, fname in zip(datas, fnames):
with open("%s.npz"%fname, "w") as f:
pickle.dump(data, f)
with open("%s/%s.count" % (path,name), "w") as f:
f.write(str(self.count))
print("...done saving.")
datas = [
self.obs_buffer,
self.next_obs_buffer,
self.action_buffer,
self.reward_buffer,
self.done_buffer
]
fnames = [
"%s/%s.obs_buffer" % (path, name),
"%s/%s.next_obs_buffer" % (path, name),
"%s/%s.action_buffer" % (path, name),
"%s/%s.reward_buffer" % (path, name),
"%s/%s.done_buffer" % (path, name)
]
proc = multiprocessing.Process(target=_save, args=(datas, fnames))
proc.start()
def load(self, path, name):
print("Loading %s replay buffer (may take a while...)" % name)
with open("%s/%s.obs_buffer.npz" % (path,name)) as f: self.obs_buffer = pickle.load(f)
with open("%s/%s.next_obs_buffer.npz" % (path,name)) as f: self.next_obs_buffer = pickle.load(f)
with open("%s/%s.action_buffer.npz" % (path,name)) as f: self.action_buffer = pickle.load(f)
with open("%s/%s.reward_buffer.npz" % (path,name)) as f: self.reward_buffer = pickle.load(f)
with open("%s/%s.done_buffer.npz" % (path,name)) as f: self.done_buffer = pickle.load(f)
with open("%s/%s.count" % (path,name), "r") as f: self.count = int(f.read())
from __future__ import division
from __future__ import print_function
from builtins import range
from past.utils import old_div
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
### Hyperparameters
NONTERMINAL_STATE_COUNT = 100
NOISE_AMOUNT = 0.1
TRAIN_STEPS = 10000
Q_ENSEMBLE_SIZE = 8
MODEL_ENSEMBLE_SIZE = 8
HORIZON = 5
TRIAL_N = 10
### Helper functions
initial_state = 0
terminal_state = NONTERMINAL_STATE_COUNT + 1
nonterminal_state_count = NONTERMINAL_STATE_COUNT
state_count = NONTERMINAL_STATE_COUNT + 1
final_reward = NONTERMINAL_STATE_COUNT
colors = sns.color_palette('husl', 4)
plt.rcParams["figure.figsize"] = (6,5)
def step(state):
if state == terminal_state: next_state = terminal_state
else: next_state = state + 1
if state == terminal_state: reward = 0
elif state+1 == terminal_state: reward = final_reward
else: reward = -1
return next_state, reward
def noisy_step(state):
if state == terminal_state: next_state = terminal_state
elif np.random.random([]) < NOISE_AMOUNT: next_state = np.random.randint(0, state_count)
else: next_state = state + 1
if state == terminal_state: reward = 0
elif state+1 == terminal_state: reward = final_reward
else: reward = -1
return next_state, reward
def get_error(Q):
losses = np.square(np.arange(state_count) - Q[:-1])
return np.mean(losses)
def downsample(array, factor):
pad_size = np.ceil(old_div(float(array.size),factor))*factor - array.size
array_padded = np.append(array, np.zeros([pad_size.astype(np.int64)])*np.NaN)
return np.nanmean(array_padded.reshape(-1, factor), axis=1)  # np.nanmean skips the NaN padding; scipy.nanmean was removed from SciPy
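# Why get_error compares Q[:-1] against np.arange(state_count): in this chain MDP a
# rollout from nonterminal state s pays -1 per step until the final transition into
# the terminal state, which pays final_reward, so the true value is
# Q*(s) = -(NONTERMINAL_STATE_COUNT - s) + NONTERMINAL_STATE_COUNT = s.
# A quick check by rolling the deterministic dynamics forward (illustrative helper,
# not used by the experiments below):
def true_value(s):
    total, state = 0, s
    while state != terminal_state:
        state, reward = step(state)
        total += reward
    return total  # equals s for every nonterminal state s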
######################
### Main experiments
######################
# Basic Q
if True:
print("Running basic Q-learning.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[state_count+1]).astype(np.float64)
Q[state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
Q[state] = reward + Q[next_state]
losses.append(get_error(Q))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="Basic Q-learning", color=colors[0])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[0])
with open('Toy-v1/baseline.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# Ensemble Q
if True:
print("Running ensemble Q-learning.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
Q[q_ensemble_i, state] = reward + np.mean(Q[:, next_state])
losses.append(get_error(np.mean(Q, axis=0)))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="Ensemble Q-learning", color=colors[1])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[1])
# Ensemble MVE-Oracle
if True:
print("Running ensemble oracle MVE.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# MVE rollout
target = reward
for _ in range(HORIZON):
next_state, reward = step(next_state)
target += reward
target += np.mean(Q[:,next_state])
Q[q_ensemble_i, state] = target
losses.append(get_error(np.mean(Q, axis=0)))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="MVE-oracle", color=colors[2])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[2])
with open('Toy-v1/mve_oracle.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# Ensemble MVE-Noisy
if True:
print("Running ensemble noisy MVE.")
trial_results = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
for step_i in range(TRAIN_STEPS):
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# MVE rollout
targets = []
first_next_state, first_reward = next_state, reward
for model_ensemble_i in range(MODEL_ENSEMBLE_SIZE):
next_state, reward = first_next_state, first_reward
target = reward
for _ in range(HORIZON):
next_state, reward = noisy_step(next_state)
target += reward
target += np.mean(Q[:,next_state])
targets.append(target)
Q[q_ensemble_i, state] = np.mean(targets)
losses.append(get_error(np.mean(Q, axis=0)))
trial_results.append(losses)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="MVE-noisy", color=colors[2], linestyle='dotted')
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[2])
with open('Toy-v1/mve_noisy.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# STEVE-Oracle
if True:
print("Running ensemble oracle STEVE.")
trial_results = []
oracle_q_estimate_errors = []
oracle_mve_estimate_errors = []
oracle_steve_estimate_errors = []
oracle_opt_estimate_errors = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
q_estimate_errors = []
mve_estimate_errors = []
steve_estimate_errors = []
opt_estimate_errors = []
steve_beat_freq= []
for step_i in range(TRAIN_STEPS):
_q_estimate_errors = []
_mve_estimate_errors = []
_steve_estimate_errors = []
_opt_estimate_errors = []
_steve_beat_freq = []
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# STEVE rollout
Q_est_mat = np.zeros([HORIZON + 1, Q_ENSEMBLE_SIZE])
reward_est_mat = np.zeros([HORIZON + 1, 1])
first_next_state, first_reward = next_state, reward
next_state, reward = first_next_state, first_reward
Q_est_mat[0, :] = Q[:, next_state]
reward_est_mat[0, 0] = reward
for timestep_i in range(1,HORIZON+1):
next_state, reward = step(next_state)
Q_est_mat[timestep_i, :] = Q[:, next_state]
reward_est_mat[timestep_i, 0] = reward
all_targets = Q_est_mat + np.cumsum(reward_est_mat, axis=0)
# STEVE weight calculation
estimates = np.mean(all_targets, axis=1)
confidences = old_div(1., (np.var(all_targets, axis=1) + 1e-8))
coefficients = old_div(confidences, np.sum(confidences))
target = np.sum(estimates * coefficients)
Q[q_ensemble_i, state] = target
true_target = state + 1. if state != terminal_state else 0.
_q_estimate_errors.append(np.square(estimates[0] - true_target))
_mve_estimate_errors.append(np.square(estimates[-1] - true_target))
_steve_estimate_errors.append(np.square(np.sum(estimates * coefficients) - true_target))
_opt_estimate_errors.append(np.min(np.square(estimates - true_target)))
losses.append(get_error(np.mean(Q, axis=0)))
q_estimate_errors.append(np.mean(_q_estimate_errors))
mve_estimate_errors.append(np.mean(_mve_estimate_errors))
steve_estimate_errors.append(np.mean(_steve_estimate_errors))
opt_estimate_errors.append(np.mean(_opt_estimate_errors))
trial_results.append(losses)
oracle_q_estimate_errors.append(q_estimate_errors)
oracle_mve_estimate_errors.append(mve_estimate_errors)
oracle_steve_estimate_errors.append(steve_estimate_errors)
oracle_opt_estimate_errors.append(opt_estimate_errors)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="STEVE-oracle", color=colors[3])
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[3])
with open('Toy-v1/steve_oracle.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# STEVE-Noisy
if True:
print("Running ensemble noisy STEVE.")
trial_results = []
noisy_q_estimate_errors = []
noisy_mve_estimate_errors = []
noisy_steve_estimate_errors = []
noisy_opt_estimate_errors = []
noisy_steve_beat_freq = []
for run_i in range(TRIAL_N):
print("Trial %d" % run_i)
Q = np.random.randint(0,state_count,[Q_ENSEMBLE_SIZE, state_count+1]).astype(np.float64)
Q[:, state_count] = 0
losses = []
q_estimate_errors = []
mve_estimate_errors = []
steve_estimate_errors = []
opt_estimate_errors = []
steve_beat_freq= []
for step_i in range(TRAIN_STEPS):
_q_estimate_errors = []
_mve_estimate_errors = []
_steve_estimate_errors = []
_opt_estimate_errors = []
_steve_beat_freq = []
for q_ensemble_i in range(Q_ENSEMBLE_SIZE):
state = np.random.randint(0,state_count)
next_state, reward = step(state)
# STEVE rollout
Q_est_mat = np.zeros([HORIZON + 1, MODEL_ENSEMBLE_SIZE, Q_ENSEMBLE_SIZE])
reward_est_mat = np.zeros([HORIZON + 1, MODEL_ENSEMBLE_SIZE, 1])
first_next_state, first_reward = next_state, reward
for model_ensemble_i in range(MODEL_ENSEMBLE_SIZE):
next_state, reward = first_next_state, first_reward
Q_est_mat[0, model_ensemble_i, :] = Q[:, next_state]
reward_est_mat[0, model_ensemble_i, 0] = reward
for timestep_i in range(1,HORIZON+1):
next_state, reward = noisy_step(next_state)
Q_est_mat[timestep_i, model_ensemble_i, :] = Q[:, next_state]
reward_est_mat[timestep_i, model_ensemble_i, 0] = reward
all_targets = Q_est_mat + np.cumsum(reward_est_mat, axis=0)
# STEVE weight calculation
all_targets = np.reshape(all_targets, [HORIZON+1, MODEL_ENSEMBLE_SIZE * Q_ENSEMBLE_SIZE])
estimates = np.mean(all_targets, axis=1)
confidences = old_div(1., (np.var(all_targets, axis=1) + 1e-8))
coefficients = old_div(confidences, np.sum(confidences))
target = np.sum(estimates * coefficients)
# target = estimates[0]
Q[q_ensemble_i, state] = target
true_target = state + 1. if state != terminal_state else 0.
_q_estimate_errors.append(np.square(estimates[0] - true_target))
_mve_estimate_errors.append(np.square(estimates[-1] - true_target))
_steve_estimate_errors.append(np.square(np.sum(estimates * coefficients) - true_target))
_opt_estimate_errors.append(np.min(np.square(estimates - true_target)))
_steve_beat_freq.append(float(np.square(estimates[0] - true_target) > np.square(target - true_target)))
losses.append(get_error(np.mean(Q, axis=0)))
q_estimate_errors.append(np.mean(_q_estimate_errors))
mve_estimate_errors.append(np.mean(_mve_estimate_errors))
steve_estimate_errors.append(np.mean(_steve_estimate_errors))
opt_estimate_errors.append(np.mean(_opt_estimate_errors))
steve_beat_freq.append(np.mean(_steve_beat_freq))
trial_results.append(losses)
noisy_q_estimate_errors.append(q_estimate_errors)
noisy_mve_estimate_errors.append(mve_estimate_errors)
noisy_steve_estimate_errors.append(steve_estimate_errors)
noisy_opt_estimate_errors.append(opt_estimate_errors)
noisy_steve_beat_freq.append(steve_beat_freq)
print("...complete.\n")
result = np.stack(trial_results, axis=1)
means = np.mean(result, axis=1)
stdevs = np.std(result, axis=1)
plt.plot(means, label="STEVE-noisy", color=colors[3], linestyle='dotted')
plt.fill_between(np.arange(TRAIN_STEPS), means - stdevs, means + stdevs, alpha=.2, color=colors[3])
with open('Toy-v1/steve_noisy.csv', 'w') as f:
data = []
for frame_i in range(result.shape[0]):
for loss in result[frame_i]:
data.append("%f,%f,%f,%f" % (frame_i, frame_i, frame_i, loss))
f.write("\n".join(data))
# ### Display results
# plt.title("Comparison of convergence rates")
# plt.legend()
# plt.savefig("comparison.pdf")
# plt.show()
#
# ### Display secondary results - error comparison
# DOWNSAMPLE = 50
# colors = sns.color_palette('husl', 8)
# for i, (error_curve, label) in enumerate([
# (oracle_q_estimate_errors, "Oracle Q error"),
# (oracle_mve_estimate_errors, "Oracle MVE error"),
# (oracle_steve_estimate_errors, "Oracle STEVE error"),
# # (oracle_opt_estimate_errors, "Oracle minimum single-estimate error"),
# ]):
# result = np.stack(error_curve, axis=1)
# means = downsample(np.mean(result, axis=1), DOWNSAMPLE)
# stdevs = downsample(np.std(result, axis=1), DOWNSAMPLE)
# plt.plot(means, label=label, color=colors[i])
# plt.fill_between(np.arange(means.shape[0]), means - stdevs, means + stdevs, alpha=.2, color=colors[i])
#
# plt.title("Comparison of errors for oracle dynamics")
# plt.legend()
# plt.show()
#
# for i, (error_curve, label) in enumerate([
# (noisy_q_estimate_errors, "Noisy Q error"),
# (noisy_mve_estimate_errors, "Noisy MVE error"),
# (noisy_steve_estimate_errors, "Noisy STEVE error"),
# # (noisy_opt_estimate_errors, "Noisy minimum single-estimate error"),
# # (trial_steve_beat_freq, "STEVE beat freq"),
# ]):
# result = np.stack(error_curve, axis=1)
# means = downsample(np.mean(result, axis=1), DOWNSAMPLE)
# stdevs = downsample(np.std(result, axis=1), DOWNSAMPLE)
# plt.plot(means, label=label, color=colors[i])
# plt.fill_between(np.arange(means.shape[0]), means - stdevs, means + stdevs, alpha=.2, color=colors[i])
#
# plt.title("Comparison of errors for noisy dynamics")
# plt.legend()
# plt.show()