Unverified commit e10d986e authored by Lukasz Kaiser, committed by GitHub

Merge pull request #4642 from buckman-google/master

Addition of STEVE
parents ee0e9d11 f789dcf5
from __future__ import division
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import tensorflow as tf
import os, random, gc, math, re
import multiprocessing, types, shutil, pickle, json
from collections import defaultdict, MutableMapping
def tanh_sample_info(mu, logsigma, stop_action_gradient=False, n_samples=1):
if n_samples > 1:
mu = tf.expand_dims(mu, 2)
logsigma = tf.expand_dims(logsigma, 2)
sample_shape = tf.concat([tf.shape(mu), [n_samples]], 0)
else:
sample_shape = tf.shape(mu)
flat_act = mu + tf.random_normal(sample_shape) * tf.exp(logsigma)
if stop_action_gradient: flat_act = tf.stop_gradient(flat_act)
normalized_dist_t = (flat_act - mu) * tf.exp(-logsigma) # ... x D
quadratic = - 0.5 * tf.reduce_sum(normalized_dist_t ** 2, axis=-1) # ... x (None)
log_z = tf.reduce_sum(logsigma, axis=-1) # ... x (None)
D_t = tf.cast(tf.shape(mu)[-1], tf.float32)
log_z += 0.5 * D_t * np.log(2 * np.pi)
flat_ll = quadratic - log_z
scaled_act = tf.tanh(flat_act)
corr = tf.reduce_sum(tf.log(1. - tf.square(scaled_act) + 1e-6), axis=-1)
scaled_ll = flat_ll - corr
return flat_act, flat_ll, scaled_act, scaled_ll
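# Illustrative sketch (the helper below is hypothetical and not used elsewhere in this codebase):
# the `corr` term above is the tanh change-of-variables correction, i.e. for a = tanh(u),
# log p(a) = log p(u) - sum_d log(1 - tanh(u_d)**2). A minimal NumPy version for a scalar u drawn
# from a standard normal (mu = 0, logsigma = 0), mirroring the TF computation in tanh_sample_info:
def _tanh_logprob_sketch(u):
  flat_ll = -0.5 * u ** 2 - 0.5 * np.log(2 * np.pi)  # log N(u; 0, 1)
  corr = np.log(1. - np.tanh(u) ** 2 + 1e-6)         # log |d tanh(u) / du|, same epsilon as above
  return flat_ll, flat_ll - corr                     # (pre-tanh log-lik, post-tanh log-lik)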
def tf_cheating_contcartpole(state, action):
gravity = 9.8
masscart = 1.0
masspole = 0.1
total_mass = (masspole + masscart)
length = 0.5 # actually half the pole's length
polemass_length = (masspole * length)
force_mag = 10.0
tau = 0.02 # seconds between state updates
# Angle at which to fail the episode
theta_threshold_radians = 12 * 2 * math.pi / 360
x_threshold = 2.4
x, x_dot, theta, theta_dot = tf.split(state, 4, axis=-1)
done = tf.logical_or(x < -x_threshold,
tf.logical_or(x > x_threshold,
tf.logical_or(theta < -theta_threshold_radians,
theta > theta_threshold_radians)))
force = force_mag * action
costheta = tf.cos(theta)
sintheta = tf.sin(theta)
temp = (force + polemass_length * theta_dot * theta_dot * sintheta) / total_mass
thetaacc = (gravity * sintheta - costheta * temp) / (length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass))
xacc = temp - polemass_length * thetaacc * costheta / total_mass
x = x + tau * x_dot
x_dot = x_dot + tau * xacc
theta = theta + tau * theta_dot
theta_dot = theta_dot + tau * thetaacc
state = tf.concat([x,x_dot,theta,theta_dot], -1)
done = tf.squeeze(tf.cast(done, tf.float32), -1)
reward = 1.0 - done
done *= 0.
return state, reward, done
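# Note: `done` is zeroed out just before returning, so a differentiable rollout through this
# function never terminates on its own; the termination signal only enters via reward = 1 - done.
# Illustrative usage sketch (placeholder names are hypothetical, not part of the original script):
def _example_cartpole_step_graph():
  state_ph = tf.placeholder(tf.float32, [None, 4])   # [x, x_dot, theta, theta_dot]
  action_ph = tf.placeholder(tf.float32, [None, 1])  # continuous force in [-1, 1]
  next_state, reward, done = tf_cheating_contcartpole(state_ph, action_ph)
  return state_ph, action_ph, next_state, reward, done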
def create_directory(dir):
dir_chunks = dir.split("/")
for i in range(len(dir_chunks)):
partial_dir = "/".join(dir_chunks[:i+1])
try:
os.makedirs(partial_dir)
except OSError:
pass
return dir
def create_and_wipe_directory(dir):
shutil.rmtree(create_directory(dir))
create_directory(dir)
def wipe_file(fname):
with open(fname, "w") as f:
f.write("")
return fname
def get_largest_epoch_in_dir(dir, saveid):
reg_matches = [re.findall(r'\d+_%s' % saveid, filename) for filename in os.listdir(dir)]
epoch_labels = [int(regmatch[0].split("_")[0]) for regmatch in reg_matches if regmatch]
if len(epoch_labels) == 0: return False
return max(epoch_labels)
def wipe_all_but_largest_epoch_in_dir(dir, saveid):
largest = get_largest_epoch_in_dir(dir, saveid)
reg_matches = [(filename, re.findall(r'\d+_%s' % saveid, filename)) for filename in os.listdir(dir)]
for filename, regmatch in reg_matches:
if regmatch and int(regmatch[0].split("_")[0]) != largest:
os.remove(os.path.join(dir,filename))
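# Example of the filename convention these two helpers assume: for saveid "valuerl", a checkpoint
# file whose name contains "120_valuerl" matches the regex and yields epoch 120, and
# wipe_all_but_largest_epoch_in_dir deletes every matching file except those of the largest epoch.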
class ConfigDict(dict):
def __init__(self, loc=None, ghost=False):
self._dict = defaultdict(lambda :False)
self.ghost = ghost
if loc:
with open(loc) as f: raw = json.load(f)
if "inherits" in raw and raw["inherits"]:
for dep_loc in raw["inherits"]:
self.update(ConfigDict(dep_loc))
if "updates" in raw and raw["updates"]:
self.update(raw["updates"], include_all=True)
def __getitem__(self, key):
return self._dict[key]
def __setitem__(self, key, value):
self._dict[key] = value
def __str__(self):
return str(dict(self._dict))
def __repr__(self):
return str(dict(self._dict))
def __iter__(self):
return self._dict.__iter__()
def __bool__(self):
return bool(self._dict)
def __nonzero__(self):
return bool(self._dict)
def update(self, dictlike, include_all=False):
for key in dictlike:
value = dictlike[key]
if isinstance(value, dict):
if key[0] == "*": # this means only override, do not set
key = key[1:]
ghost = True
else:
ghost = False
if not include_all and isinstance(value, ConfigDict) and key not in self._dict and value.ghost: continue
if key not in self._dict: self._dict[key] = ConfigDict(ghost=ghost)
self._dict[key].update(value)
else:
self._dict[key] = value
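# Illustrative example of the config format ConfigDict consumes (file names and keys below are
# hypothetical). Given:
#   base.json:  {"updates": {"policy_config": {"hidden_dim": 128}}}
#   child.json: {"inherits": ["base.json"],
#                "updates": {"policy_config": {"hidden_dim": 256},
#                            "*model_config": {"rollout_len": 3}}}
# loading ConfigDict("child.json") first merges base.json, then applies its own updates, overriding
# hidden_dim to 256. The "*" prefix marks a ghost sub-dict: when this config is later merged into
# another ConfigDict without include_all, the ghost block only overrides a "model_config" section
# that already exists in the target and never creates one.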
from __future__ import division
from builtins import zip
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
import nn
import util
from learner import CoreModel
class ValueRL(CoreModel):
"""
Learn a state-action value function and its corresponding policy.
"""
@property
def saveid(self):
return "valuerl"
def create_params(self, env_config, learner_config):
self.obs_dim = np.prod(env_config["obs_dims"])
self.action_dim = env_config["action_dim"]
self.reward_scale = env_config["reward_scale"]
self.discount = env_config["discount"]
self.hidden_dim = learner_config["hidden_dim"]
self.bayesian_config = learner_config["bayesian"]
self.value_expansion = learner_config["value_expansion"]
self.explore_chance = learner_config["ddpg_explore_chance"]
with tf.variable_scope(self.name):
self.policy = nn.FeedForwardNet('policy', self.obs_dim, [self.action_dim], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=False)
if self.bayesian_config:
self.Q = nn.EnsembleFeedForwardNet('Q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["ensemble_size"], train_sample_count=self.bayesian_config["train_sample_count"], eval_sample_count=self.bayesian_config["eval_sample_count"])
self.old_Q = nn.EnsembleFeedForwardNet('old_q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["ensemble_size"], train_sample_count=self.bayesian_config["train_sample_count"], eval_sample_count=self.bayesian_config["eval_sample_count"])
else:
self.Q = nn.FeedForwardNet('Q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True)
self.old_Q = nn.FeedForwardNet('old_q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True)
self.policy_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) if "policy" in v.name]
self.Q_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) if "Q" in v.name]
self.agent_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
self.copy_to_old_ops = [tf.assign(p_old, p) for p_old, p in zip(self.old_Q.params_list, self.Q.params_list)]
self.assign_epoch_op = [tf.assign(self.epoch_n, self.epoch_n_placeholder), tf.assign(self.update_n, self.update_n_placeholder), tf.assign(self.frame_n, self.frame_n_placeholder), tf.assign(self.hours, self.hours_placeholder)]
def update_epoch(self, sess, epoch, updates, frames, hours):
sess.run(self.assign_epoch_op, feed_dict={self.epoch_n_placeholder: int(epoch), self.update_n_placeholder: int(updates), self.frame_n_placeholder: int(frames), self.hours_placeholder: float(hours)})
def copy_to_old(self, sess):
sess.run(self.copy_to_old_ops)
def build_evalution_graph(self, obs, get_full_info=False, mode="regular", n_samples=1):
assert mode in {"regular", "explore", "exploit"}
policy_actions_pretanh = self.policy(obs)
if mode == "regular" or mode == "exploit":
policy_actions = tf.tanh(policy_actions_pretanh)
elif mode == "explore":
_, _, exploring_policy_actions, _ = util.tanh_sample_info(policy_actions_pretanh, tf.zeros_like(policy_actions_pretanh), n_samples=n_samples)
policy_actions = tf.where(tf.random_uniform(tf.shape(exploring_policy_actions)) < self.explore_chance, x=exploring_policy_actions, y=tf.tanh(policy_actions_pretanh))
else: raise Exception('this should never happen')
if get_full_info: return policy_actions_pretanh, policy_actions
else: return policy_actions
def build_training_graph(self, obs, next_obs, empirical_actions, rewards, dones, data_size, worldmodel=None):
average_model_use = tf.constant(0.)
empirical_Q_info = tf.concat([obs, empirical_actions], 1)
if worldmodel is None:
policy_action_pretanh, policy_actions = self.build_evalution_graph(obs, get_full_info=True)
policy_Q_info = tf.concat([obs, policy_actions], 1)
state_value_estimate = self.Q(policy_Q_info, reduce_mode="mean")
next_policy_actions = self.build_evalution_graph(next_obs)
policy_next_Q_info = tf.concat([next_obs, next_policy_actions], 1)
next_Q_estimate = self.old_Q(policy_next_Q_info, reduce_mode="mean")
Q_guess = self.Q(empirical_Q_info, is_eval=False, reduce_mode="random")
Q_target = rewards * self.reward_scale + self.discount * next_Q_estimate * (1. - dones)
policy_losses = -state_value_estimate
Q_losses = .5 * tf.square( Q_guess - tf.stop_gradient(Q_target) )
else:
targets, confidence, Q_guesses, reach_probs = self.build_Q_expansion_graph(next_obs, rewards, dones, worldmodel, rollout_len=self.value_expansion["rollout_len"], model_ensembling=worldmodel.bayesian_config is not False)
# targets is a 3D matrix: [batch_i, start_timestep, end_timestep]. here, we reduce out the last dimension, turning
# it into a [batch_i, start_timestep] matrix. in other words, we are taking a bunch of candidate targets and reducing
# them into a single target. the four options here correspond to the four ways to do that reduction.
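# Worked example with illustrative numbers: with rollout_len=2 there are 3 candidate targets for
# the first start timestep, T = [T_0, T_1, T_2], and the reductions below give
#   mean_k_return:   (T_0 + T_1 + T_2) / 3
#   lambda_return:   (1-lam)*T_0 + (1-lam)*lam*T_1 + lam**2*T_2
#   steve_reweight:  w_0*T_0 + w_1*T_1 + w_2*T_2 with w_k proportional to 1/Var(T_k)
#                    (e.g. ensemble variances [1, 4, 4] give weights [2/3, 1/6, 1/6])
#   otherwise (MVE): T_2, i.e. only the longest-horizon target.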
if self.value_expansion["mean_k_return"]:
target_counts = self.value_expansion["rollout_len"]+1 - tf.reshape(tf.range(self.value_expansion["rollout_len"]+1), [1, self.value_expansion["rollout_len"]+1])
k_returns = tf.reduce_sum(targets, 2) / tf.cast(target_counts, tf.float32)
elif self.value_expansion["lambda_return"]:
cont_coeffs = self.value_expansion["lambda_return"] ** tf.cast(tf.reshape(tf.range(self.value_expansion["rollout_len"]+1), [1,1,self.value_expansion["rollout_len"]+1]), tf.float32)
stop_coeffs = tf.concat([(1 - self.value_expansion["lambda_return"]) * tf.ones_like(targets)[:,:,:-1], tf.ones_like(targets)[:,:,-1:]], 2)
k_returns = tf.reduce_sum(targets * stop_coeffs * cont_coeffs, 2)
elif self.value_expansion["steve_reweight"]:
k_returns = tf.reduce_sum(targets * confidence, 2)
average_model_use = 1. - tf.reduce_mean(confidence[:,0,0])
else:
# MVE objective: just take the last one
k_returns = targets[:,:,-1]
# now we have [batch_i, start_timestep]. if we are using the TDK trick, then we want to use all of the targets,
# so we construct a corresponding [batch_i, start_timestep] matrix of guesses. otherwise, we just take the targets
# for the first timestep.
Q_guess = self.Q(empirical_Q_info, is_eval=False, reduce_mode="random")
if self.value_expansion["tdk_trick"]:
Q_guess = tf.concat([tf.expand_dims(Q_guess, 1), Q_guesses], 1)
reach_probs = tf.concat([tf.expand_dims(tf.ones_like(reach_probs[:,0]), 1), reach_probs[:,:-1]], 1)
Q_target = k_returns
else:
# non-TDK trick means we just take the first one
Q_target = k_returns[:,0]
policy_action_pretanh, policy_actions = self.build_evalution_graph(obs, get_full_info=True)
policy_Q_info = tf.concat([obs, policy_actions], 1)
state_value_estimate = self.Q(policy_Q_info, stop_params_gradient=True, reduce_mode="mean")
policy_losses = -state_value_estimate
Q_losses = .5 * tf.square( Q_guess - tf.stop_gradient(Q_target) )
if self.value_expansion["tdk_trick"]: Q_losses *= reach_probs # we downscale the various TDK-trick losses by
# the likelihood of actually reaching the state
# from which the guess was made
policy_loss = tf.reduce_mean(policy_losses)
Q_loss = tf.reduce_mean(Q_losses)
policy_reg_loss = tf.reduce_mean(tf.square(policy_action_pretanh)) * .001 # a small regularization to make sure the
# tanh does not saturate
# anything in inspect gets logged
inspect = (policy_loss, Q_loss, policy_reg_loss, average_model_use)
return (policy_loss + policy_reg_loss, Q_loss), inspect
def build_Q_expansion_graph(self, obs, first_rewards, first_done, worldmodel, rollout_len=1, model_ensembling=False):
### this sets up the machinery for having multiple parallel rollouts, each of which has a single consistent transition
ensemble_idxs, transition_sample_n, reward_sample_n = worldmodel.get_ensemble_idx_info()
q_sample_n = self.bayesian_config["eval_sample_count"] if self.bayesian_config is not False else 1
first_rewards = tf.tile(tf.expand_dims(tf.expand_dims(first_rewards,1),1), [1,transition_sample_n,reward_sample_n])
first_rewards.set_shape([None, transition_sample_n, reward_sample_n])
if model_ensembling:
obs = tf.tile(tf.expand_dims(obs,1), [1,transition_sample_n,1])
obs.set_shape([None, transition_sample_n, self.obs_dim])
first_done = tf.tile(tf.expand_dims(first_done, 1), [1, transition_sample_n])
first_done.set_shape([None, transition_sample_n])
### below, we use a while loop to actually do the iterative model rollout
extra_info = worldmodel.init_extra_info(obs)
action_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
obs_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
done_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
extra_info_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
def rollout_loop_body(r_i, loop_state):
(obs, done, extra_info, action_ta, obs_ta, dones_ta, extra_info_ta) = loop_state
action_pretanh, action = self.build_evalution_graph(tf.stop_gradient(obs), get_full_info=True)
if model_ensembling:
next_obs, next_dones, next_extra_info = worldmodel.transition(obs, action, extra_info, ensemble_idxs=ensemble_idxs)
else:
next_obs, next_dones, next_extra_info = worldmodel.transition(obs, action, extra_info)
next_obs = tf.reduce_mean(next_obs, -2)
next_dones = tf.reduce_mean(next_dones, -1)
action_ta = action_ta.write(r_i, action)
obs_ta = obs_ta.write(r_i, obs)
dones_ta = dones_ta.write(r_i, done)
extra_info_ta = extra_info_ta.write(r_i, extra_info)
return r_i+1, (next_obs, next_dones, next_extra_info, action_ta, obs_ta, dones_ta, extra_info_ta)
_, (final_obs, final_done, final_extra_info, action_ta, obs_ta, done_ta, extra_info_ta) = tf.while_loop(
lambda r_i, _: r_i < rollout_len,
rollout_loop_body,
[0, (obs, first_done, extra_info, action_ta, obs_ta, done_ta, extra_info_ta)]
)
final_action_pretanh, final_action = self.build_evalution_graph(tf.stop_gradient(final_obs), get_full_info=True)
### compile the TensorArrays into useful tensors
obss = obs_ta.stack()
obss = tf.reshape(obss, tf.stack([rollout_len, -1, transition_sample_n, self.obs_dim]))
obss = tf.transpose(obss, [1, 0, 2, 3])
final_obs = tf.reshape(final_obs, tf.stack([-1, 1, transition_sample_n, self.obs_dim]))
all_obss = tf.concat([obss, final_obs],1)
next_obss = all_obss[:,1:]
dones = done_ta.stack()
dones = tf.reshape(dones, tf.stack([rollout_len, -1, transition_sample_n]))
dones = tf.transpose(dones, [1, 0, 2])
final_done = tf.reshape(final_done, tf.stack([-1, 1, transition_sample_n]))
all_dones = tf.concat([dones, final_done],1)
actions = action_ta.stack()
actions = tf.reshape(actions, tf.stack([rollout_len, -1, transition_sample_n, self.action_dim]))
actions = tf.transpose(actions , [1, 0, 2, 3])
final_action = tf.reshape(final_action, tf.stack([-1, 1, transition_sample_n, self.action_dim]))
all_actions = tf.concat([actions, final_action],1)
continue_probs = tf.cumprod(1. - all_dones, axis=1)
rewards = worldmodel.get_rewards(obss, actions, next_obss)
rawrew = rewards = tf.concat([tf.expand_dims(first_rewards, 1), rewards],1)
### TDK trick means we have to guess at every timestep
if self.value_expansion["tdk_trick"]:
guess_info = tf.concat([obss,actions], -1)
Q_guesses = self.Q(guess_info, reduce_mode="random")
Q_guesses = tf.reduce_mean(Q_guesses, -1) # make it so there's only one guess per rollout length, which is the mean of the guesses under all the various model rollouts
reached_this_point_to_guess_prob = tf.reduce_mean(continue_probs, -1)
else:
Q_guesses = None
reached_this_point_to_guess_prob = None
### use the Q function at every timestep to get value estimates
target_info = tf.concat([all_obss, all_actions], -1)
Q_targets = self.old_Q(target_info, reduce_mode="none")
rollout_frames = rollout_len + 1 # if we take N steps, we have N+1 frames
### create "decay-exponent matrix" of size [1,ROLLOUT_FRAMES,ROLLOUT_FRAMES,1]. the first ROLLOUT_FRAMES corresponds to the index of the source, the second to the target.
ts_count_mat = (tf.cast(tf.reshape(tf.range(rollout_frames), [1, rollout_frames]) - tf.reshape(tf.range(rollout_frames), [rollout_frames, 1]), tf.float32))
reward_coeff_matrix = tf.matrix_band_part(tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount ** ts_count_mat
value_coeff_matrix = tf.matrix_band_part(tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount ** (1. + ts_count_mat)
reward_coeff_matrix = tf.reshape(reward_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])
value_coeff_matrix = tf.reshape(value_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])
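# Worked example: with rollout_len=2 (rollout_frames=3) and discount g, the core of
# reward_coeff_matrix is upper-triangular with entry g**(j-i) at (row i = source frame,
# column j = target frame):
#   [[1, g, g**2],
#    [0, 1, g   ],
#    [0, 0, 1   ]]
# and value_coeff_matrix is the same matrix times an extra factor of g, since the bootstrap value
# at frame j is discounted one step further than the reward collected at frame j.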
### similarly, create a "done" matrix
shifted_continue_probs = tf.concat([tf.expand_dims(tf.ones_like(continue_probs[:,0]),1), continue_probs[:,:-1]], 1)
reward_continue_matrix = tf.expand_dims(shifted_continue_probs, 1) / tf.expand_dims(shifted_continue_probs+1e-8, 2)
value_continue_matrix = tf.expand_dims(continue_probs, 1) / tf.expand_dims(shifted_continue_probs+1e-8, 2)
reward_continue_matrix = tf.expand_dims(reward_continue_matrix, -1)
value_continue_matrix = tf.expand_dims(value_continue_matrix, -1)
### apply the discounting factors to the rewards and values
rewards = tf.expand_dims(rewards, 1) * reward_coeff_matrix * reward_continue_matrix
rewards = tf.cumsum(rewards, axis=2)
values = tf.expand_dims(Q_targets, 1) * value_coeff_matrix * value_continue_matrix
### compute the targets using the Bellman equation
sampled_targets = tf.expand_dims(rewards,-2) * self.reward_scale + tf.expand_dims(values,-1)
### flatten out the various sources of variance (transition, reward, and Q-function ensembles) to get a set of estimates for each candidate target
sampled_targets = tf.reshape(sampled_targets, tf.stack([-1, rollout_frames, rollout_frames, transition_sample_n * reward_sample_n * q_sample_n]))
### compute the mean and variance for each candidate target
target_means, target_variances = tf.nn.moments(sampled_targets, 3)
### compute the confidence, either using the full covariance matrix, or approximating all the estimators as independent
if self.value_expansion["covariances"]:
targetdiffs = sampled_targets - tf.expand_dims(target_means,3)
target_covariances = tf.einsum("abij,abjk->abik", targetdiffs, tf.transpose(targetdiffs, [0,1,3,2]))
target_confidence = tf.squeeze(tf.matrix_solve(target_covariances + tf.expand_dims(tf.expand_dims(tf.matrix_band_part(tf.ones(tf.shape(target_covariances)[-2:]),0,0) * 1e-3,0),0), tf.ones(tf.concat([tf.shape(target_covariances)[:-1], tf.constant([1])],0))),-1)
else:
target_confidence = 1./(target_variances + 1e-8)
### normalize so weights sum to 1
target_confidence *= tf.matrix_band_part(tf.ones([1, rollout_frames, rollout_frames]), 0, -1)
target_confidence = target_confidence / tf.reduce_sum(target_confidence, axis=2, keepdims=True)
### below here is a bunch of debugging Print statements that I use as a sanity check:
# target_confidence = tf.Print(target_confidence, [], message="raw rewards")
# target_confidence = tf.Print(target_confidence, [rawrew[0,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="\n", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [(1. - all_dones)[0,:,0]], message="contin", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [continue_probs[0,:,0]], message="cum_contin", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [shifted_continue_probs[0,:,0]], message="shifted contin", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="reward_coeff")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="reward_continue")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="value_coeff")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="value_continue")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="rewards")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [rewards[0,i,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target Qs")
# target_confidence = tf.Print(target_confidence, [Q_targets[0,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="values")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [values[0,i,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target_means")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_means[0,i,:]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target_variance")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_variances[0,i,:]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target_confidence")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_confidence[0,i,:]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [target_confidence, action_lls, tf.shape(Q_targets)], message="\n\n", summarize=10)
return target_means, target_confidence, Q_guesses, reached_this_point_to_guess_prob
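# Summary of the quantities returned above, restated as formulas (for the first start frame):
# candidate target j is the (j+1)-step expansion
#   T_j = reward_scale * sum_{t=0..j} g**t * r_t + g**(j+1) * Q_old(s_j, pi(s_j)),
# where g is the discount, r_0 / s_0 are the real reward and next observation from the replay
# batch, r_1..r_j and s_1..s_j come from imagined worldmodel rollouts, and the continue/done
# matrices above downweight anything past a predicted episode termination. Each T_j is computed
# under every combination of transition, reward, and Q ensemble members; target_means and
# target_confidence are the per-target mean and normalized inverse-variance (or covariance-based)
# weight over those samples, which build_training_graph uses as the STEVE interpolation weights.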
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
import os
from learner import Learner
from valuerl import ValueRL
from worldmodel import DeterministicWorldModel
class ValueRLLearner(Learner):
"""
ValueRL-specific training loop details.
"""
def learner_name(self): return "valuerl"
def make_loader_placeholders(self):
self.obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.next_obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.action_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], self.env_config["action_dim"]])
self.reward_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.done_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.datasize_loader = tf.placeholder(tf.float64, [])
return [self.obs_loader, self.next_obs_loader, self.action_loader, self.reward_loader, self.done_loader, self.datasize_loader]
def make_core_model(self):
if self.config["model_config"] is not False:
self.worldmodel = DeterministicWorldModel(self.config["name"], self.env_config, self.config["model_config"])
else:
self.worldmodel = None
valuerl = ValueRL(self.config["name"], self.env_config, self.learner_config)
(policy_loss, Q_loss), inspect_losses = valuerl.build_training_graph(*self.current_batch, worldmodel=self.worldmodel)
policy_optimizer = tf.train.AdamOptimizer(3e-4)
policy_gvs = policy_optimizer.compute_gradients(policy_loss, var_list=valuerl.policy_params)
capped_policy_gvs = policy_gvs
policy_train_op = policy_optimizer.apply_gradients(capped_policy_gvs)
Q_optimizer = tf.train.AdamOptimizer(3e-4)
Q_gvs = Q_optimizer.compute_gradients(Q_loss, var_list=valuerl.Q_params)
capped_Q_gvs = Q_gvs
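# NOTE: gradient "capping" is a pass-through for both the policy and Q optimizers here. A typical
# alternative (not used in this code) would clip each gradient by norm, e.g.
#   capped_Q_gvs = [(tf.clip_by_norm(g, 10.0), v) for g, v in Q_gvs if g is not None]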
Q_train_op = Q_optimizer.apply_gradients(capped_Q_gvs)
return valuerl, (policy_loss, Q_loss), (policy_train_op, Q_train_op), inspect_losses
## Optional functions to override
def initialize(self):
if self.config["model_config"] is not False:
while not self.load_worldmodel(): pass
def resume_from_checkpoint(self, epoch):
if self.config["model_config"] is not False:
with self.bonus_kwargs["model_lock"]: self.worldmodel.load(self.sess, self.save_path, epoch)
def checkpoint(self):
self.core.copy_to_old(self.sess)
if self.config["model_config"] is not False:
self.load_worldmodel()
def backup(self): pass
# Other functions
def load_worldmodel(self):
if not os.path.exists("%s/%s.params.index" % (self.save_path, self.worldmodel.saveid)): return False
with self.bonus_kwargs["model_lock"]: self.worldmodel.load(self.sess, self.save_path)
return True
from __future__ import print_function
from builtins import range
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import tensorflow as tf
# import moviepy.editor as mpy
import time, os, traceback, multiprocessing, portalocker, sys
import envwrap
import util
import valuerl, worldmodel
from config import config
MODEL_NAME = config["name"]
LOG_PATH = util.create_directory("output/" + config["env"] + "/" + MODEL_NAME + "/" + config["log_path"]) + "/" + MODEL_NAME
LOAD_PATH = util.create_directory("output/" + config["env"] + "/" + MODEL_NAME + "/" + config["save_model_path"])
OBS_DIM = np.prod(config["obs_dims"])
HIDDEN_DIM = config["hidden_dim"]
ACTION_DIM = config["action_dim"]
MAX_FRAMES = config["max_frames"]
REWARD_SCALE = config["reward_scale"]
DISCOUNT = config["discount"]
ALGO = config["policy_config"]["algo"]
AGENT_BATCH_SIZE = config["agent_config"]["batch_size"]
EVALUATOR_BATCH_SIZE = config["evaluator_config"]["batch_size"]
RELOAD_EVERY_N = config["agent_config"]["reload_every_n"]
FRAMES_BEFORE_LEARNING = config["policy_config"]["frames_before_learning"]
FRAMES_PER_UPDATE = config["policy_config"]["frames_per_update"]
LEARNER_EPOCH_N = config["policy_config"]["epoch_n"]
SYNC_UPDATES = config["policy_config"]["frames_per_update"] >= 0
POLICY_BAYESIAN_CONFIG = config["policy_config"]["bayesian"]
AUX_CONFIG = config["aux_config"]
DDPG_EXPLORE_CHANCE = config["policy_config"]["explore_chance"] if ALGO == "ddpg" else 0.
MODEL_AUGMENTED = config["model_config"] is not False
if MODEL_AUGMENTED: MODEL_BAYESIAN_CONFIG = config["model_config"]["bayesian"]
FILENAME = sys.argv[3]
if __name__ == '__main__':
oprl = valuerl.ValueRL(MODEL_NAME, ALGO, OBS_DIM, ACTION_DIM, HIDDEN_DIM, REWARD_SCALE, DISCOUNT, POLICY_BAYESIAN_CONFIG, AUX_CONFIG, DDPG_EXPLORE_CHANCE)
obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
policy_actions, _ = oprl.build_evalution_graph(obs_loader, mode="exploit")
if MODEL_AUGMENTED:
next_obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
reward_loader = tf.placeholder(tf.float32, [1])
done_loader = tf.placeholder(tf.float32, [1])
worldmodel = worldmodel.DeterministicWorldModel(MODEL_NAME, OBS_DIM, ACTION_DIM, HIDDEN_DIM, REWARD_SCALE, DISCOUNT, MODEL_BAYESIAN_CONFIG)
_, _, _, _, _, confidence, _ = oprl.build_Q_expansion_graph(next_obs_loader, reward_loader, done_loader, worldmodel, rollout_len=3, model_ensembling=True)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
oprl.load(sess, FILENAME)
if MODEL_AUGMENTED: worldmodel.load(sess, FILENAME)
env = envwrap.get_env(config["env"])
hist = np.zeros([4, 10])
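# hist[h, b] counts how often the confidence weight placed on candidate target h (4 candidates,
# since rollout_len=3 above) falls into decile b; rows are normalized and printed after the runs.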
for _ in range(10):
ts = 0
rgb_frames = []
obs, reward, done, reset = env.reset(), 0, False, False
while not reset:
# env.internal_env.render()
# rgb_frames.append(env.internal_env.render(mode='rgb_array'))
# action = env.action_space.sample()
all_actions = sess.run(policy_actions, feed_dict={obs_loader: np.array([obs])})
all_actions = np.clip(all_actions, -1., 1.)
action = all_actions[0]
obs, _reward, done, reset = env.step(action)
if MODEL_AUGMENTED:
_confidences = sess.run(confidence, feed_dict={next_obs_loader: np.expand_dims(obs,0),
reward_loader: np.expand_dims(_reward,0),
done_loader: np.expand_dims(done,0)})
# print "%.02f %.02f %.02f %.02f" % tuple(_confidences[0,0])
for h in range(4):
bucket = int((_confidences[0,0,h]-1e-5)*10)
hist[h,bucket] += 1
reward += _reward
ts += 1
# print ts, _reward, reward
print(ts, reward)
hist /= np.sum(hist, axis=1, keepdims=True)
for row in reversed(hist.T): print(' '.join(["%.02f"] * 4) % tuple(row))
#clip = mpy.ImageSequenceClip(rgb_frames, fps=100)
#clip.write_videofile(FILENAME + "/movie.mp4")
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
import nn
from learner import CoreModel
class DeterministicWorldModel(CoreModel):
"""
A simple feed-forward neural network world model, with an option for an ensemble.
"""
@property
def saveid(self):
return "worldmodel"
def create_params(self, env_config, learner_config):
self.obs_dim = np.prod(env_config["obs_dims"])
self.action_dim = env_config["action_dim"]
self.reward_scale = env_config["reward_scale"]
self.discount = env_config["discount"]
self.aux_hidden_dim = self.learner_config["aux_hidden_dim"]
self.transition_hidden_dim = self.learner_config["transition_hidden_dim"]
self.bayesian_config = self.learner_config["bayesian"]
with tf.variable_scope(self.name):
if self.bayesian_config:
self.transition_predictor = nn.EnsembleFeedForwardNet('transition_predictor', self.obs_dim + self.action_dim, [self.obs_dim], layers=8, hidden_dim=self.transition_hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["transition"]["ensemble_size"], train_sample_count=self.bayesian_config["transition"]["train_sample_count"], eval_sample_count=self.bayesian_config["transition"]["eval_sample_count"])
self.done_predictor = nn.EnsembleFeedForwardNet('done_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["transition"]["ensemble_size"], train_sample_count=self.bayesian_config["transition"]["train_sample_count"], eval_sample_count=self.bayesian_config["transition"]["eval_sample_count"])
self.reward_predictor = nn.EnsembleFeedForwardNet('reward_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["reward"]["ensemble_size"], train_sample_count=self.bayesian_config["reward"]["train_sample_count"], eval_sample_count=self.bayesian_config["reward"]["eval_sample_count"])
else:
self.transition_predictor = nn.FeedForwardNet('transition_predictor', self.obs_dim + self.action_dim, [self.obs_dim], layers=8, hidden_dim=self.transition_hidden_dim, get_uncertainty=True)
self.done_predictor = nn.FeedForwardNet('done_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True)
self.reward_predictor = nn.FeedForwardNet('reward_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True)
def get_ensemble_idx_info(self):
if self.bayesian_config is not False:
ensemble_idxs = tf.random_shuffle(tf.range(self.transition_predictor.ensemble_size))
transition_ensemble_sample_n = self.transition_predictor.eval_sample_count
reward_ensemble_sample_n = self.reward_predictor.eval_sample_count
ensemble_idxs = ensemble_idxs[:transition_ensemble_sample_n]
return ensemble_idxs, transition_ensemble_sample_n, reward_ensemble_sample_n
else:
return None, 1, 1
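# Example: with ensemble_size=8 and eval_sample_count=4 for the transition predictor, this returns
# a random 4-element subset of {0, ..., 7} as ensemble_idxs, so that each parallel rollout in
# ValueRL.build_Q_expansion_graph is simulated end-to-end by a single, consistent ensemble member.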
def build_training_graph(self, obs, next_obs, actions, rewards, dones, data_size):
info = tf.concat([obs, actions], -1)
predicted_next_obs = self.transition_predictor(info, is_eval=False, reduce_mode="random") + obs
next_info = tf.concat([next_obs, info], -1)
predicted_dones = self.done_predictor(next_info, is_eval=False, reduce_mode="random")
predicted_rewards = self.reward_predictor(next_info, is_eval=False, reduce_mode="random")
done_losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=dones, logits=predicted_dones)
reward_losses = .5 * tf.square(rewards - predicted_rewards)
next_obs_losses = .5 * tf.reduce_sum(tf.square(next_obs - predicted_next_obs), -1)
done_loss = tf.reduce_mean(done_losses)
reward_loss = tf.reduce_mean(reward_losses)
next_obs_loss = tf.reduce_mean(next_obs_losses)
reg_loss = .0001 * (self.done_predictor.l2_loss() +
self.reward_predictor.l2_loss() +
self.transition_predictor.l2_loss())
total_loss = done_loss + reward_loss + next_obs_loss + reg_loss
inspect = (total_loss, done_loss, reward_loss, next_obs_loss, reg_loss)
return total_loss, inspect
def init_extra_info(self, obs):
return tf.zeros_like(obs)
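# extra_info is a placeholder for per-rollout model state carried between transition() calls;
# this deterministic feed-forward model has none, so it is simply zeros of the observation shape
# (and is reset to zeros again inside transition()).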
def transition(self, obs, action, extra_info, ensemble_idxs=None, pre_expanded=None):
info = tf.concat([obs, action], -1)
next_obs_delta = self.transition_predictor(info, reduce_mode="none", ensemble_idxs=ensemble_idxs, pre_expanded=pre_expanded)
if ensemble_idxs is None:
next_obs = tf.expand_dims(obs,-2) + next_obs_delta
next_info = tf.concat([next_obs, tf.expand_dims(info,-2)], -1)
else:
next_obs = obs + next_obs_delta
next_info = tf.concat([next_obs, info], -1)
done = tf.nn.sigmoid(self.done_predictor(next_info, reduce_mode="none", ensemble_idxs=ensemble_idxs, pre_expanded=True))
extra_info = tf.zeros_like(obs)
return next_obs, done, extra_info
def get_rewards(self, obs, action, next_obs):
next_info = tf.concat([next_obs, obs, action], -1)
reward = self.reward_predictor(next_info, reduce_mode="none")
return reward
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
from learner import Learner
from worldmodel import DeterministicWorldModel
class WorldmodelLearner(Learner):
"""
Worldmodel-specific training loop details.
"""
def learner_name(self): return "worldmodel"
def make_loader_placeholders(self):
self.obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.next_obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.action_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], self.env_config["action_dim"]])
self.reward_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.done_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.datasize_loader = tf.placeholder(tf.float64, [])
return [self.obs_loader, self.next_obs_loader, self.action_loader, self.reward_loader, self.done_loader, self.datasize_loader]
def make_core_model(self):
worldmodel = DeterministicWorldModel(self.config["name"], self.env_config, self.learner_config)
worldmodel_loss, inspect_losses = worldmodel.build_training_graph(*self.current_batch)
model_optimizer = tf.train.AdamOptimizer(3e-4)
model_gvs = model_optimizer.compute_gradients(worldmodel_loss, var_list=worldmodel.model_params)
capped_model_gvs = model_gvs
worldmodel_train_op = model_optimizer.apply_gradients(capped_model_gvs)
return worldmodel, (worldmodel_loss,), (worldmodel_train_op,), inspect_losses
## Optional functions to override
def initialize(self): pass
def resume_from_checkpoint(self, epoch): pass
def checkpoint(self): pass
def backup(self): pass