Unverified commit e10d986e authored by Lukasz Kaiser, committed by GitHub

Merge pull request #4642 from buckman-google/master

Addition of STEVE
parents ee0e9d11 f789dcf5
from __future__ import division
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import tensorflow as tf
import os, random, gc, math, re
import multiprocessing, types, shutil, pickle, json
from collections import defaultdict, MutableMapping
def tanh_sample_info(mu, logsigma, stop_action_gradient=False, n_samples=1):
if n_samples > 1:
mu = tf.expand_dims(mu, 2)
logsigma = tf.expand_dims(logsigma, 2)
sample_shape = tf.concat([tf.shape(mu), [n_samples]], 0)
else:
sample_shape = tf.shape(mu)
flat_act = mu + tf.random_normal(sample_shape) * tf.exp(logsigma)
if stop_action_gradient: flat_act = tf.stop_gradient(flat_act)
normalized_dist_t = (flat_act - mu) * tf.exp(-logsigma) # ... x D
quadratic = - 0.5 * tf.reduce_sum(normalized_dist_t ** 2, axis=-1) # ... x (None)
log_z = tf.reduce_sum(logsigma, axis=-1) # ... x (None)
D_t = tf.cast(tf.shape(mu)[-1], tf.float32)
log_z += 0.5 * D_t * np.log(2 * np.pi)
flat_ll = quadratic - log_z
scaled_act = tf.tanh(flat_act)
corr = tf.reduce_sum(tf.log(1. - tf.square(scaled_act) + 1e-6), axis=-1)
scaled_ll = flat_ll - corr
return flat_act, flat_ll, scaled_act, scaled_ll
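# Illustrative sketch (the helper below is hypothetical and not used elsewhere in this codebase):
# the `corr` term above is the tanh change-of-variables correction, i.e. for a = tanh(u),
# log p(a) = log p(u) - sum_d log(1 - tanh(u_d)**2). A minimal NumPy version for a scalar u drawn
# from a standard normal (mu = 0, logsigma = 0), mirroring the TF computation in tanh_sample_info:
def _tanh_logprob_sketch(u):
  flat_ll = -0.5 * u ** 2 - 0.5 * np.log(2 * np.pi)  # log N(u; 0, 1)
  corr = np.log(1. - np.tanh(u) ** 2 + 1e-6)         # log |d tanh(u) / du|, same epsilon as above
  return flat_ll, flat_ll - corr                     # (pre-tanh log-lik, post-tanh log-lik)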
def tf_cheating_contcartpole(state, action):
gravity = 9.8
masscart = 1.0
masspole = 0.1
total_mass = (masspole + masscart)
length = 0.5 # actually half the pole's length
polemass_length = (masspole * length)
force_mag = 10.0
tau = 0.02 # seconds between state updates
# Angle at which to fail the episode
theta_threshold_radians = 12 * 2 * math.pi / 360
x_threshold = 2.4
x, x_dot, theta, theta_dot = tf.split(state, 4, axis=-1)
done = tf.logical_or(x < -x_threshold,
tf.logical_or(x > x_threshold,
tf.logical_or(theta < -theta_threshold_radians,
theta > theta_threshold_radians)))
force = force_mag * action
costheta = tf.cos(theta)
sintheta = tf.sin(theta)
temp = (force + polemass_length * theta_dot * theta_dot * sintheta) / total_mass
thetaacc = (gravity * sintheta - costheta * temp) / (length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass))
xacc = temp - polemass_length * thetaacc * costheta / total_mass
x = x + tau * x_dot
x_dot = x_dot + tau * xacc
theta = theta + tau * theta_dot
theta_dot = theta_dot + tau * thetaacc
state = tf.concat([x,x_dot,theta,theta_dot], -1)
done = tf.squeeze(tf.cast(done, tf.float32), -1)
reward = 1.0 - done
done *= 0.
return state, reward, done
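# Note: `done` is zeroed out just before returning, so a differentiable rollout through this
# function never terminates on its own; the termination signal only enters via reward = 1 - done.
# Illustrative usage sketch (placeholder names are hypothetical, not part of the original script):
def _example_cartpole_step_graph():
  state_ph = tf.placeholder(tf.float32, [None, 4])   # [x, x_dot, theta, theta_dot]
  action_ph = tf.placeholder(tf.float32, [None, 1])  # continuous force in [-1, 1]
  next_state, reward, done = tf_cheating_contcartpole(state_ph, action_ph)
  return state_ph, action_ph, next_state, reward, done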
def create_directory(dir):
dir_chunks = dir.split("/")
for i in range(len(dir_chunks)):
partial_dir = "/".join(dir_chunks[:i+1])
try:
os.makedirs(partial_dir)
except OSError:
pass
return dir
def create_and_wipe_directory(dir):
shutil.rmtree(create_directory(dir))
create_directory(dir)
def wipe_file(fname):
with open(fname, "w") as f:
f.write("")
return fname
def get_largest_epoch_in_dir(dir, saveid):
reg_matches = [re.findall(r'\d+_%s' % saveid, filename) for filename in os.listdir(dir)]
epoch_labels = [int(regmatch[0].split("_")[0]) for regmatch in reg_matches if regmatch]
if len(epoch_labels) == 0: return False
return max(epoch_labels)
def wipe_all_but_largest_epoch_in_dir(dir, saveid):
largest = get_largest_epoch_in_dir(dir, saveid)
reg_matches = [(filename, re.findall(r'\d+_%s' % saveid, filename)) for filename in os.listdir(dir)]
for filename, regmatch in reg_matches:
if regmatch and int(regmatch[0].split("_")[0]) != largest:
os.remove(os.path.join(dir,filename))
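# Example of the filename convention these two helpers assume: for saveid "valuerl", a checkpoint
# file whose name contains "120_valuerl" matches the regex and yields epoch 120, and
# wipe_all_but_largest_epoch_in_dir deletes every matching file except those of the largest epoch.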
class ConfigDict(dict):
def __init__(self, loc=None, ghost=False):
self._dict = defaultdict(lambda :False)
self.ghost = ghost
if loc:
with open(loc) as f: raw = json.load(f)
if "inherits" in raw and raw["inherits"]:
for dep_loc in raw["inherits"]:
self.update(ConfigDict(dep_loc))
if "updates" in raw and raw["updates"]:
self.update(raw["updates"], include_all=True)
def __getitem__(self, key):
return self._dict[key]
def __setitem__(self, key, value):
self._dict[key] = value
def __str__(self):
return str(dict(self._dict))
def __repr__(self):
return str(dict(self._dict))
def __iter__(self):
return self._dict.__iter__()
def __bool__(self):
return bool(self._dict)
def __nonzero__(self):
return bool(self._dict)
def update(self, dictlike, include_all=False):
for key in dictlike:
value = dictlike[key]
if isinstance(value, dict):
if key[0] == "*": # this means only override, do not set
key = key[1:]
ghost = True
else:
ghost = False
if not include_all and isinstance(value, ConfigDict) and key not in self._dict and value.ghost: continue
if key not in self._dict: self._dict[key] = ConfigDict(ghost=ghost)
self._dict[key].update(value)
else:
self._dict[key] = value
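# Illustrative example of the config format ConfigDict consumes (file names and keys below are
# hypothetical). Given:
#   base.json:  {"updates": {"policy_config": {"hidden_dim": 128}}}
#   child.json: {"inherits": ["base.json"],
#                "updates": {"policy_config": {"hidden_dim": 256},
#                            "*model_config": {"rollout_len": 3}}}
# loading ConfigDict("child.json") first merges base.json, then applies its own updates, overriding
# hidden_dim to 256. The "*" prefix marks a ghost sub-dict: when this config is later merged into
# another ConfigDict without include_all, the ghost block only overrides a "model_config" section
# that already exists in the target and never creates one.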
from __future__ import division
from builtins import zip
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
import nn
import util
from learner import CoreModel
class ValueRL(CoreModel):
"""
Learn a state-action value function and its corresponding policy.
"""
@property
def saveid(self):
return "valuerl"
def create_params(self, env_config, learner_config):
self.obs_dim = np.prod(env_config["obs_dims"])
self.action_dim = env_config["action_dim"]
self.reward_scale = env_config["reward_scale"]
self.discount = env_config["discount"]
self.hidden_dim = learner_config["hidden_dim"]
self.bayesian_config = learner_config["bayesian"]
self.value_expansion = learner_config["value_expansion"]
self.explore_chance = learner_config["ddpg_explore_chance"]
with tf.variable_scope(self.name):
self.policy = nn.FeedForwardNet('policy', self.obs_dim, [self.action_dim], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=False)
if self.bayesian_config:
self.Q = nn.EnsembleFeedForwardNet('Q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["ensemble_size"], train_sample_count=self.bayesian_config["train_sample_count"], eval_sample_count=self.bayesian_config["eval_sample_count"])
self.old_Q = nn.EnsembleFeedForwardNet('old_q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["ensemble_size"], train_sample_count=self.bayesian_config["train_sample_count"], eval_sample_count=self.bayesian_config["eval_sample_count"])
else:
self.Q = nn.FeedForwardNet('Q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True)
self.old_Q = nn.FeedForwardNet('old_q', self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.hidden_dim, get_uncertainty=True)
self.policy_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) if "policy" in v.name]
self.Q_params = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) if "Q" in v.name]
self.agent_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
self.copy_to_old_ops = [tf.assign(p_old, p) for p_old, p in zip(self.old_Q.params_list, self.Q.params_list)]
self.assign_epoch_op = [tf.assign(self.epoch_n, self.epoch_n_placeholder), tf.assign(self.update_n, self.update_n_placeholder), tf.assign(self.frame_n, self.frame_n_placeholder), tf.assign(self.hours, self.hours_placeholder)]
def update_epoch(self, sess, epoch, updates, frames, hours):
sess.run(self.assign_epoch_op, feed_dict={self.epoch_n_placeholder: int(epoch), self.update_n_placeholder: int(updates), self.frame_n_placeholder: int(frames), self.hours_placeholder: float(hours)})
def copy_to_old(self, sess):
sess.run(self.copy_to_old_ops)
def build_evalution_graph(self, obs, get_full_info=False, mode="regular", n_samples=1):
assert mode in {"regular", "explore", "exploit"}
policy_actions_pretanh = self.policy(obs)
if mode == "regular" or mode == "exploit":
policy_actions = tf.tanh(policy_actions_pretanh)
elif mode == "explore":
_, _, exploring_policy_actions, _ = util.tanh_sample_info(policy_actions_pretanh, tf.zeros_like(policy_actions_pretanh), n_samples=n_samples)
policy_actions = tf.where(tf.random_uniform(tf.shape(exploring_policy_actions)) < self.explore_chance, x=exploring_policy_actions, y=tf.tanh(policy_actions_pretanh))
else: raise Exception('this should never happen')
if get_full_info: return policy_actions_pretanh, policy_actions
else: return policy_actions
def build_training_graph(self, obs, next_obs, empirical_actions, rewards, dones, data_size, worldmodel=None):
average_model_use = tf.constant(0.)
empirical_Q_info = tf.concat([obs, empirical_actions], 1)
if worldmodel is None:
policy_action_pretanh, policy_actions = self.build_evalution_graph(obs, get_full_info=True)
policy_Q_info = tf.concat([obs, policy_actions], 1)
state_value_estimate = self.Q(policy_Q_info, reduce_mode="mean")
next_policy_actions = self.build_evalution_graph(next_obs)
policy_next_Q_info = tf.concat([next_obs, next_policy_actions], 1)
next_Q_estimate = self.old_Q(policy_next_Q_info, reduce_mode="mean")
Q_guess = self.Q(empirical_Q_info, is_eval=False, reduce_mode="random")
Q_target = rewards * self.reward_scale + self.discount * next_Q_estimate * (1. - dones)
policy_losses = -state_value_estimate
Q_losses = .5 * tf.square( Q_guess - tf.stop_gradient(Q_target) )
else:
targets, confidence, Q_guesses, reach_probs = self.build_Q_expansion_graph(next_obs, rewards, dones, worldmodel, rollout_len=self.value_expansion["rollout_len"], model_ensembling=worldmodel.bayesian_config is not False)
# targets is a 3D matrix: [batch_i, start_timestep, end_timestep]. here, we reduce out the last dimension, turning
# it into a [batch_i, start_timestep] matrix. in other words, we are taking a bunch of candidate targets and reducing
# them into a single target. the four options here correspond to the four ways to do that reduction.
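# Worked example with illustrative numbers: with rollout_len=2 there are 3 candidate targets for
# the first start timestep, T = [T_0, T_1, T_2], and the reductions below give
#   mean_k_return:   (T_0 + T_1 + T_2) / 3
#   lambda_return:   (1-lam)*T_0 + (1-lam)*lam*T_1 + lam**2*T_2
#   steve_reweight:  w_0*T_0 + w_1*T_1 + w_2*T_2 with w_k proportional to 1/Var(T_k)
#                    (e.g. ensemble variances [1, 4, 4] give weights [2/3, 1/6, 1/6])
#   otherwise (MVE): T_2, i.e. only the longest-horizon target.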
if self.value_expansion["mean_k_return"]:
target_counts = self.value_expansion["rollout_len"]+1 - tf.reshape(tf.range(self.value_expansion["rollout_len"]+1), [1, self.value_expansion["rollout_len"]+1])
k_returns = tf.reduce_sum(targets, 2) / tf.cast(target_counts, tf.float32)
elif self.value_expansion["lambda_return"]:
cont_coeffs = self.value_expansion["lambda_return"] ** tf.cast(tf.reshape(tf.range(self.value_expansion["rollout_len"]+1), [1,1,self.value_expansion["rollout_len"]+1]), tf.float32)
stop_coeffs = tf.concat([(1 - self.value_expansion["lambda_return"]) * tf.ones_like(targets)[:,:,:-1], tf.ones_like(targets)[:,:,-1:]], 2)
k_returns = tf.reduce_sum(targets * stop_coeffs * cont_coeffs, 2)
elif self.value_expansion["steve_reweight"]:
k_returns = tf.reduce_sum(targets * confidence, 2)
average_model_use = 1. - tf.reduce_mean(confidence[:,0,0])
else:
# MVE objective: just take the last one
k_returns = targets[:,:,-1]
# now we have [batch_i, start_timestep]. if we are using the TDK trick, then we want to use all of the targets,
# so we construct a corresponding [batch_i, start_timestep] matrix of guesses. otherwise, we just take the targets
# for the first timestep.
Q_guess = self.Q(empirical_Q_info, is_eval=False, reduce_mode="random")
if self.value_expansion["tdk_trick"]:
Q_guess = tf.concat([tf.expand_dims(Q_guess, 1), Q_guesses], 1)
reach_probs = tf.concat([tf.expand_dims(tf.ones_like(reach_probs[:,0]), 1), reach_probs[:,:-1]], 1)
Q_target = k_returns
else:
# non-TDK trick means we just take the first one
Q_target = k_returns[:,0]
policy_action_pretanh, policy_actions = self.build_evalution_graph(obs, get_full_info=True)
policy_Q_info = tf.concat([obs, policy_actions], 1)
state_value_estimate = self.Q(policy_Q_info, stop_params_gradient=True, reduce_mode="mean")
policy_losses = -state_value_estimate
Q_losses = .5 * tf.square( Q_guess - tf.stop_gradient(Q_target) )
if self.value_expansion["tdk_trick"]: Q_losses *= reach_probs # we downscale the various TDK-trick losses by
# the likelihood of actually reaching the state
# from which the guess was made
policy_loss = tf.reduce_mean(policy_losses)
Q_loss = tf.reduce_mean(Q_losses)
policy_reg_loss = tf.reduce_mean(tf.square(policy_action_pretanh)) * .001 # a small regularization to make sure the
# tanh does not saturate
# anything in inspect gets logged
inspect = (policy_loss, Q_loss, policy_reg_loss, average_model_use)
return (policy_loss + policy_reg_loss, Q_loss), inspect
def build_Q_expansion_graph(self, obs, first_rewards, first_done, worldmodel, rollout_len=1, model_ensembling=False):
### this sets up the machinery for having multiple parallel rollouts, each of which has a single consistent transition
ensemble_idxs, transition_sample_n, reward_sample_n = worldmodel.get_ensemble_idx_info()
q_sample_n = self.bayesian_config["eval_sample_count"] if self.bayesian_config is not False else 1
first_rewards = tf.tile(tf.expand_dims(tf.expand_dims(first_rewards,1),1), [1,transition_sample_n,reward_sample_n])
first_rewards.set_shape([None, transition_sample_n, reward_sample_n])
if model_ensembling:
obs = tf.tile(tf.expand_dims(obs,1), [1,transition_sample_n,1])
obs.set_shape([None, transition_sample_n, self.obs_dim])
first_done = tf.tile(tf.expand_dims(first_done, 1), [1, transition_sample_n])
first_done.set_shape([None, transition_sample_n])
### below, we use a while loop to actually do the iterative model rollout
extra_info = worldmodel.init_extra_info(obs)
action_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
obs_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
done_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
extra_info_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
def rollout_loop_body(r_i, loop_state):
(obs, done, extra_info, action_ta, obs_ta, dones_ta, extra_info_ta) = loop_state
action_pretanh, action = self.build_evalution_graph(tf.stop_gradient(obs), get_full_info=True)
if model_ensembling:
next_obs, next_dones, next_extra_info = worldmodel.transition(obs, action, extra_info, ensemble_idxs=ensemble_idxs)
else:
next_obs, next_dones, next_extra_info = worldmodel.transition(obs, action, extra_info)
next_obs = tf.reduce_mean(next_obs, -2)
next_dones = tf.reduce_mean(next_dones, -1)
action_ta = action_ta.write(r_i, action)
obs_ta = obs_ta.write(r_i, obs)
dones_ta = dones_ta.write(r_i, done)
extra_info_ta = extra_info_ta.write(r_i, extra_info)
return r_i+1, (next_obs, next_dones, next_extra_info, action_ta, obs_ta, dones_ta, extra_info_ta)
_, (final_obs, final_done, final_extra_info, action_ta, obs_ta, done_ta, extra_info_ta) = tf.while_loop(
lambda r_i, _: r_i < rollout_len,
rollout_loop_body,
[0, (obs, first_done, extra_info, action_ta, obs_ta, done_ta, extra_info_ta)]
)
final_action_pretanh, final_action = self.build_evalution_graph(tf.stop_gradient(final_obs), get_full_info=True)
### compile the TensorArrays into useful tensors
obss = obs_ta.stack()
obss = tf.reshape(obss, tf.stack([rollout_len, -1, transition_sample_n, self.obs_dim]))
obss = tf.transpose(obss, [1, 0, 2, 3])
final_obs = tf.reshape(final_obs, tf.stack([-1, 1, transition_sample_n, self.obs_dim]))
all_obss = tf.concat([obss, final_obs],1)
next_obss = all_obss[:,1:]
dones = done_ta.stack()
dones = tf.reshape(dones, tf.stack([rollout_len, -1, transition_sample_n]))
dones = tf.transpose(dones, [1, 0, 2])
final_done = tf.reshape(final_done, tf.stack([-1, 1, transition_sample_n]))
all_dones = tf.concat([dones, final_done],1)
actions = action_ta.stack()
actions = tf.reshape(actions, tf.stack([rollout_len, -1, transition_sample_n, self.action_dim]))
actions = tf.transpose(actions , [1, 0, 2, 3])
final_action = tf.reshape(final_action, tf.stack([-1, 1, transition_sample_n, self.action_dim]))
all_actions = tf.concat([actions, final_action],1)
continue_probs = tf.cumprod(1. - all_dones, axis=1)
rewards = worldmodel.get_rewards(obss, actions, next_obss)
rawrew = rewards = tf.concat([tf.expand_dims(first_rewards, 1), rewards],1)
### TDK trick means we have to guess at every timestep
if self.value_expansion["tdk_trick"]:
guess_info = tf.concat([obss,actions], -1)
Q_guesses = self.Q(guess_info, reduce_mode="random")
Q_guesses = tf.reduce_mean(Q_guesses, -1) # make it so there's only one guess per rollout length, which is the mean of the guesses under all the various model rollouts
reached_this_point_to_guess_prob = tf.reduce_mean(continue_probs, -1)
else:
Q_guesses = None
reached_this_point_to_guess_prob = None
### use the Q function at every timestep to get value estimates
target_info = tf.concat([all_obss, all_actions], -1)
Q_targets = self.old_Q(target_info, reduce_mode="none")
rollout_frames = rollout_len + 1 # if we take N steps, we have N+1 frames
### create "decay-exponent matrix" of size [1,ROLLOUT_FRAMES,ROLLOUT_FRAMES,1]. the first ROLLOUT_FRAMES corresponds to the index of the source, the second to the target.
ts_count_mat = (tf.cast(tf.reshape(tf.range(rollout_frames), [1, rollout_frames]) - tf.reshape(tf.range(rollout_frames), [rollout_frames, 1]), tf.float32))
reward_coeff_matrix = tf.matrix_band_part(tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount ** ts_count_mat
value_coeff_matrix = tf.matrix_band_part(tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount ** (1. + ts_count_mat)
reward_coeff_matrix = tf.reshape(reward_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])
value_coeff_matrix = tf.reshape(value_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])
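# Worked example: with rollout_len=2 (rollout_frames=3) and discount g, the core of
# reward_coeff_matrix is upper-triangular with entry g**(j-i) at (row i = source frame,
# column j = target frame):
#   [[1, g, g**2],
#    [0, 1, g   ],
#    [0, 0, 1   ]]
# and value_coeff_matrix is the same matrix times an extra factor of g, since the bootstrap value
# at frame j is discounted one step further than the reward collected at frame j.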
### similarly, create a "done" matrix
shifted_continue_probs = tf.concat([tf.expand_dims(tf.ones_like(continue_probs[:,0]),1), continue_probs[:,:-1]], 1)
reward_continue_matrix = tf.expand_dims(shifted_continue_probs, 1) / tf.expand_dims(shifted_continue_probs+1e-8, 2)
value_continue_matrix = tf.expand_dims(continue_probs, 1) / tf.expand_dims(shifted_continue_probs+1e-8, 2)
reward_continue_matrix = tf.expand_dims(reward_continue_matrix, -1)
value_continue_matrix = tf.expand_dims(value_continue_matrix, -1)
### apply the discounting factors to the rewards and values
rewards = tf.expand_dims(rewards, 1) * reward_coeff_matrix * reward_continue_matrix
rewards = tf.cumsum(rewards, axis=2)
values = tf.expand_dims(Q_targets, 1) * value_coeff_matrix * value_continue_matrix
### compute the targets using the Bellman equation
sampled_targets = tf.expand_dims(rewards,-2) * self.reward_scale + tf.expand_dims(values,-1)
### flatten out the various sources of variance (transition, reward, and Q-function ensembles) to get a set of estimates for each candidate target
sampled_targets = tf.reshape(sampled_targets, tf.stack([-1, rollout_frames, rollout_frames, transition_sample_n * reward_sample_n * q_sample_n]))
### compute the mean and variance for each candidate target
target_means, target_variances = tf.nn.moments(sampled_targets, 3)
### compute the confidence, either using the full covariance matrix, or approximating all the estimators as independent
if self.value_expansion["covariances"]:
targetdiffs = sampled_targets - tf.expand_dims(target_means,3)
target_covariances = tf.einsum("abij,abjk->abik", targetdiffs, tf.transpose(targetdiffs, [0,1,3,2]))
target_confidence = tf.squeeze(tf.matrix_solve(target_covariances + tf.expand_dims(tf.expand_dims(tf.matrix_band_part(tf.ones(tf.shape(target_covariances)[-2:]),0,0) * 1e-3,0),0), tf.ones(tf.concat([tf.shape(target_covariances)[:-1], tf.constant([1])],0))),-1)
else:
target_confidence = 1./(target_variances + 1e-8)
### normalize so weights sum to 1
target_confidence *= tf.matrix_band_part(tf.ones([1, rollout_frames, rollout_frames]), 0, -1)
target_confidence = target_confidence / tf.reduce_sum(target_confidence, axis=2, keepdims=True)
### below here is a bunch of debugging Print statements that I use as a sanity check:
# target_confidence = tf.Print(target_confidence, [], message="raw rewards")
# target_confidence = tf.Print(target_confidence, [rawrew[0,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="\n", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [(1. - all_dones)[0,:,0]], message="contin", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [continue_probs[0,:,0]], message="cum_contin", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [shifted_continue_probs[0,:,0]], message="shifted contin", summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="reward_coeff")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="reward_continue")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="value_coeff")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [], message="value_continue")
# for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="rewards")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [rewards[0,i,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target Qs")
# target_confidence = tf.Print(target_confidence, [Q_targets[0,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="values")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [values[0,i,:,0,0]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target_means")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_means[0,i,:]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target_variance")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_variances[0,i,:]], summarize=rollout_len+1)
# target_confidence = tf.Print(target_confidence, [], message="target_confidence")
# for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_confidence[0,i,:]], summarize=rollout_len+1)
# target_means = tf.Print(target_means, [target_confidence, action_lls, tf.shape(Q_targets)], message="\n\n", summarize=10)
return target_means, target_confidence, Q_guesses, reached_this_point_to_guess_prob
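# Summary of the quantities returned above, restated as formulas (for the first start frame):
# candidate target j is the (j+1)-step expansion
#   T_j = reward_scale * sum_{t=0..j} g**t * r_t + g**(j+1) * Q_old(s_j, pi(s_j)),
# where g is the discount, r_0 / s_0 are the real reward and next observation from the replay
# batch, r_1..r_j and s_1..s_j come from imagined worldmodel rollouts, and the continue/done
# matrices above downweight anything past a predicted episode termination. Each T_j is computed
# under every combination of transition, reward, and Q ensemble members; target_means and
# target_confidence are the per-target mean and normalized inverse-variance (or covariance-based)
# weight over those samples, which build_training_graph uses as the STEVE interpolation weights.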
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
import os
from learner import Learner
from valuerl import ValueRL
from worldmodel import DeterministicWorldModel
class ValueRLLearner(Learner):
"""
ValueRL-specific training loop details.
"""
def learner_name(self): return "valuerl"
def make_loader_placeholders(self):
self.obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.next_obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.action_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], self.env_config["action_dim"]])
self.reward_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.done_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.datasize_loader = tf.placeholder(tf.float64, [])
return [self.obs_loader, self.next_obs_loader, self.action_loader, self.reward_loader, self.done_loader, self.datasize_loader]
def make_core_model(self):
if self.config["model_config"] is not False:
self.worldmodel = DeterministicWorldModel(self.config["name"], self.env_config, self.config["model_config"])
else:
self.worldmodel = None
valuerl = ValueRL(self.config["name"], self.env_config, self.learner_config)
(policy_loss, Q_loss), inspect_losses = valuerl.build_training_graph(*self.current_batch, worldmodel=self.worldmodel)
policy_optimizer = tf.train.AdamOptimizer(3e-4)
policy_gvs = policy_optimizer.compute_gradients(policy_loss, var_list=valuerl.policy_params)
capped_policy_gvs = policy_gvs
policy_train_op = policy_optimizer.apply_gradients(capped_policy_gvs)
Q_optimizer = tf.train.AdamOptimizer(3e-4)
Q_gvs = Q_optimizer.compute_gradients(Q_loss, var_list=valuerl.Q_params)
capped_Q_gvs = Q_gvs
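# NOTE: gradient "capping" is a pass-through for both the policy and Q optimizers here. A typical
# alternative (not used in this code) would clip each gradient by norm, e.g.
#   capped_Q_gvs = [(tf.clip_by_norm(g, 10.0), v) for g, v in Q_gvs if g is not None]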
Q_train_op = Q_optimizer.apply_gradients(capped_Q_gvs)
return valuerl, (policy_loss, Q_loss), (policy_train_op, Q_train_op), inspect_losses
## Optional functions to override
def initialize(self):
if self.config["model_config"] is not False:
while not self.load_worldmodel(): pass
def resume_from_checkpoint(self, epoch):
if self.config["model_config"] is not False:
with self.bonus_kwargs["model_lock"]: self.worldmodel.load(self.sess, self.save_path, epoch)
def checkpoint(self):
self.core.copy_to_old(self.sess)
if self.config["model_config"] is not False:
self.load_worldmodel()
def backup(self): pass
# Other functions
def load_worldmodel(self):
if not os.path.exists("%s/%s.params.index" % (self.save_path, self.worldmodel.saveid)): return False
with self.bonus_kwargs["model_lock"]: self.worldmodel.load(self.sess, self.save_path)
return True
from __future__ import print_function
from builtins import range
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import tensorflow as tf
# import moviepy.editor as mpy
import time, os, traceback, multiprocessing, portalocker, sys
import envwrap
import util
import valuerl, worldmodel
from config import config
MODEL_NAME = config["name"]
LOG_PATH = util.create_directory("output/" + config["env"] + "/" + MODEL_NAME + "/" + config["log_path"]) + "/" + MODEL_NAME
LOAD_PATH = util.create_directory("output/" + config["env"] + "/" + MODEL_NAME + "/" + config["save_model_path"])
OBS_DIM = np.prod(config["obs_dims"])
HIDDEN_DIM = config["hidden_dim"]
ACTION_DIM = config["action_dim"]
MAX_FRAMES = config["max_frames"]
REWARD_SCALE = config["reward_scale"]
DISCOUNT = config["discount"]
ALGO = config["policy_config"]["algo"]
AGENT_BATCH_SIZE = config["agent_config"]["batch_size"]
EVALUATOR_BATCH_SIZE = config["evaluator_config"]["batch_size"]
RELOAD_EVERY_N = config["agent_config"]["reload_every_n"]
FRAMES_BEFORE_LEARNING = config["policy_config"]["frames_before_learning"]
FRAMES_PER_UPDATE = config["policy_config"]["frames_per_update"]
LEARNER_EPOCH_N = config["policy_config"]["epoch_n"]
SYNC_UPDATES = config["policy_config"]["frames_per_update"] >= 0
POLICY_BAYESIAN_CONFIG = config["policy_config"]["bayesian"]
AUX_CONFIG = config["aux_config"]
DDPG_EXPLORE_CHANCE = config["policy_config"]["explore_chance"] if ALGO == "ddpg" else 0.
MODEL_AUGMENTED = config["model_config"] is not False
if MODEL_AUGMENTED: MODEL_BAYESIAN_CONFIG = config["model_config"]["bayesian"]
FILENAME = sys.argv[3]
if __name__ == '__main__':
oprl = valuerl.ValueRL(MODEL_NAME, ALGO, OBS_DIM, ACTION_DIM, HIDDEN_DIM, REWARD_SCALE, DISCOUNT, POLICY_BAYESIAN_CONFIG, AUX_CONFIG, DDPG_EXPLORE_CHANCE)
obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
policy_actions, _ = oprl.build_evalution_graph(obs_loader, mode="exploit")
if MODEL_AUGMENTED:
next_obs_loader = tf.placeholder(tf.float32, [1, OBS_DIM])
reward_loader = tf.placeholder(tf.float32, [1])
done_loader = tf.placeholder(tf.float32, [1])
worldmodel = worldmodel.DeterministicWorldModel(MODEL_NAME, OBS_DIM, ACTION_DIM, HIDDEN_DIM, REWARD_SCALE, DISCOUNT, MODEL_BAYESIAN_CONFIG)
_, _, _, _, _, confidence, _ = oprl.build_Q_expansion_graph(next_obs_loader, reward_loader, done_loader, worldmodel, rollout_len=3, model_ensembling=True)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
oprl.load(sess, FILENAME)
if MODEL_AUGMENTED: worldmodel.load(sess, FILENAME)
env = envwrap.get_env(config["env"])
hist = np.zeros([4, 10])
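# hist[h, b] counts how often the confidence weight placed on candidate target h (4 candidates,
# since rollout_len=3 above) falls into decile b; rows are normalized and printed after the runs.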
for _ in range(10):
ts = 0
rgb_frames = []
obs, reward, done, reset = env.reset(), 0, False, False
while not reset:
# env.internal_env.render()
# rgb_frames.append(env.internal_env.render(mode='rgb_array'))
# action = env.action_space.sample()
all_actions = sess.run(policy_actions, feed_dict={obs_loader: np.array([obs])})
all_actions = np.clip(all_actions, -1., 1.)
action = all_actions[0]
obs, _reward, done, reset = env.step(action)
if MODEL_AUGMENTED:
_confidences = sess.run(confidence, feed_dict={next_obs_loader: np.expand_dims(obs,0),
reward_loader: np.expand_dims(_reward,0),
done_loader: np.expand_dims(done,0)})
# print "%.02f %.02f %.02f %.02f" % tuple(_confidences[0,0])
for h in range(4):
bucket = int((_confidences[0,0,h]-1e-5)*10)
hist[h,bucket] += 1
reward += _reward
ts += 1
# print ts, _reward, reward
print(ts, reward)
hist /= np.sum(hist, axis=1, keepdims=True)
for row in reversed(hist.T): print(' '.join(["%.02f"] * 4) % tuple(row))
#clip = mpy.ImageSequenceClip(rgb_frames, fps=100)
#clip.write_videofile(FILENAME + "/movie.mp4")
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
import nn
from learner import CoreModel
class DeterministicWorldModel(CoreModel):
"""
A simple feed-forward neural network world model, with an option for an ensemble.
"""
@property
def saveid(self):
return "worldmodel"
def create_params(self, env_config, learner_config):
self.obs_dim = np.prod(env_config["obs_dims"])
self.action_dim = env_config["action_dim"]
self.reward_scale = env_config["reward_scale"]
self.discount = env_config["discount"]
self.aux_hidden_dim = self.learner_config["aux_hidden_dim"]
self.transition_hidden_dim = self.learner_config["transition_hidden_dim"]
self.bayesian_config = self.learner_config["bayesian"]
with tf.variable_scope(self.name):
if self.bayesian_config:
self.transition_predictor = nn.EnsembleFeedForwardNet('transition_predictor', self.obs_dim + self.action_dim, [self.obs_dim], layers=8, hidden_dim=self.transition_hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["transition"]["ensemble_size"], train_sample_count=self.bayesian_config["transition"]["train_sample_count"], eval_sample_count=self.bayesian_config["transition"]["eval_sample_count"])
self.done_predictor = nn.EnsembleFeedForwardNet('done_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["transition"]["ensemble_size"], train_sample_count=self.bayesian_config["transition"]["train_sample_count"], eval_sample_count=self.bayesian_config["transition"]["eval_sample_count"])
self.reward_predictor = nn.EnsembleFeedForwardNet('reward_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True, ensemble_size=self.bayesian_config["reward"]["ensemble_size"], train_sample_count=self.bayesian_config["reward"]["train_sample_count"], eval_sample_count=self.bayesian_config["reward"]["eval_sample_count"])
else:
self.transition_predictor = nn.FeedForwardNet('transition_predictor', self.obs_dim + self.action_dim, [self.obs_dim], layers=8, hidden_dim=self.transition_hidden_dim, get_uncertainty=True)
self.done_predictor = nn.FeedForwardNet('done_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True)
self.reward_predictor = nn.FeedForwardNet('reward_predictor', self.obs_dim + self.obs_dim + self.action_dim, [], layers=4, hidden_dim=self.aux_hidden_dim, get_uncertainty=True)
def get_ensemble_idx_info(self):
if self.bayesian_config is not False:
ensemble_idxs = tf.random_shuffle(tf.range(self.transition_predictor.ensemble_size))
transition_ensemble_sample_n = self.transition_predictor.eval_sample_count
reward_ensemble_sample_n = self.reward_predictor.eval_sample_count
ensemble_idxs = ensemble_idxs[:transition_ensemble_sample_n]
return ensemble_idxs, transition_ensemble_sample_n, reward_ensemble_sample_n
else:
return None, 1, 1
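# Example: with ensemble_size=8 and eval_sample_count=4 for the transition predictor, this returns
# a random 4-element subset of {0, ..., 7} as ensemble_idxs, so that each parallel rollout in
# ValueRL.build_Q_expansion_graph is simulated end-to-end by a single, consistent ensemble member.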
def build_training_graph(self, obs, next_obs, actions, rewards, dones, data_size):
info = tf.concat([obs, actions], -1)
predicted_next_obs = self.transition_predictor(info, is_eval=False, reduce_mode="random") + obs
next_info = tf.concat([next_obs, info], -1)
predicted_dones = self.done_predictor(next_info, is_eval=False, reduce_mode="random")
predicted_rewards = self.reward_predictor(next_info, is_eval=False, reduce_mode="random")
done_losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=dones, logits=predicted_dones)
reward_losses = .5 * tf.square(rewards - predicted_rewards)
next_obs_losses = .5 * tf.reduce_sum(tf.square(next_obs - predicted_next_obs), -1)
done_loss = tf.reduce_mean(done_losses)
reward_loss = tf.reduce_mean(reward_losses)
next_obs_loss = tf.reduce_mean(next_obs_losses)
reg_loss = .0001 * (self.done_predictor.l2_loss() +
self.reward_predictor.l2_loss() +
self.transition_predictor.l2_loss())
total_loss = done_loss + reward_loss + next_obs_loss + reg_loss
inspect = (total_loss, done_loss, reward_loss, next_obs_loss, reg_loss)
return total_loss, inspect
def init_extra_info(self, obs):
return tf.zeros_like(obs)
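# extra_info is a placeholder for per-rollout model state carried between transition() calls;
# this deterministic feed-forward model has none, so it is simply zeros of the observation shape
# (and is reset to zeros again inside transition()).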
def transition(self, obs, action, extra_info, ensemble_idxs=None, pre_expanded=None):
info = tf.concat([obs, action], -1)
next_obs_delta = self.transition_predictor(info, reduce_mode="none", ensemble_idxs=ensemble_idxs, pre_expanded=pre_expanded)
if ensemble_idxs is None:
next_obs = tf.expand_dims(obs,-2) + next_obs_delta
next_info = tf.concat([next_obs, tf.expand_dims(info,-2)], -1)
else:
next_obs = obs + next_obs_delta
next_info = tf.concat([next_obs, info], -1)
done = tf.nn.sigmoid(self.done_predictor(next_info, reduce_mode="none", ensemble_idxs=ensemble_idxs, pre_expanded=True))
extra_info = tf.zeros_like(obs)
return next_obs, done, extra_info
def get_rewards(self, obs, action, next_obs):
next_info = tf.concat([next_obs, obs, action], -1)
reward = self.reward_predictor(next_info, reduce_mode="none")
return reward
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
import numpy as np
from learner import Learner
from worldmodel import DeterministicWorldModel
class WorldmodelLearner(Learner):
"""
Worldmodel-specific training loop details.
"""
def learner_name(self): return "worldmodel"
def make_loader_placeholders(self):
self.obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.next_obs_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], np.prod(self.env_config["obs_dims"])])
self.action_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"], self.env_config["action_dim"]])
self.reward_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.done_loader = tf.placeholder(tf.float32, [self.learner_config["batch_size"]])
self.datasize_loader = tf.placeholder(tf.float64, [])
return [self.obs_loader, self.next_obs_loader, self.action_loader, self.reward_loader, self.done_loader, self.datasize_loader]
def make_core_model(self):
worldmodel = DeterministicWorldModel(self.config["name"], self.env_config, self.learner_config)
worldmodel_loss, inspect_losses = worldmodel.build_training_graph(*self.current_batch)
model_optimizer = tf.train.AdamOptimizer(3e-4)
model_gvs = model_optimizer.compute_gradients(worldmodel_loss, var_list=worldmodel.model_params)
capped_model_gvs = model_gvs
worldmodel_train_op = model_optimizer.apply_gradients(capped_model_gvs)
return worldmodel, (worldmodel_loss,), (worldmodel_train_op,), inspect_losses
## Optional functions to override
def initialize(self): pass
def resume_from_checkpoint(self, epoch): pass
def checkpoint(self): pass
def backup(self): pass