"vscode:/vscode.git/clone" did not exist on "92db4bd5a09a8eff62e089ed2ef6bf1670346128"
Commit 8cedd479 authored by Lukasz Kaiser's avatar Lukasz Kaiser Committed by GitHub
Browse files

Merge pull request #1878 from ofirnachum/master

Implementation of PCL and other newly proposed algorithms
Code for several RL algorithms used in the following papers:
* "Improving Policy Gradient by Exploring Under-appreciated Rewards" by
Ofir Nachum, Mohammad Norouzi, and Dale Schuurmans.
* "Bridging the Gap Between Value and Policy Based Reinforcement Learning" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
* "Trust-PCL: An Off-Policy Trust Region Method for Continuous Control" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
Available algorithms:
* Actor Critic
* TRPO
* PCL
* Unified PCL
* Trust-PCL
* PCL + Constraint Trust Region (unpublished)
* REINFORCE
* UREX
Requirements:
* TensorFlow (see http://www.tensorflow.org for how to install/upgrade)
* OpenAI Gym (see http://gym.openai.com/docs)
* NumPy (see http://www.numpy.org/)
* SciPy (see http://www.scipy.org/)
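To confirm the requirements above are available, the following optional sanity check (a sketch only; it assumes a gym release that still ships the algorithmic tasks used below) imports the dependencies and constructs one of the example environments:
```
import gym
import numpy as np
import scipy
import tensorflow as tf

# DuplicatedInput-v0 is the environment used in the Quick Start examples below.
env = gym.make('DuplicatedInput-v0')
print(env.observation_space)
print(env.action_space)
```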
Quick Start:
Run UREX on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.1 --clip_norm=50 \
--num_samples=10 --objective=urex
```
Run REINFORCE on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.01 --clip_norm=50 \
--num_samples=10 --objective=reinforce
```
Run PCL on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl
```
Run PCL with expert trajectories on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl \
--num_expert_paths=10
```
Run a MuJoCo task with TRPO:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --rollout=10 --gamma=0.995 \
--max_step=1000 --cutoff_agent=1000 \
--objective=trpo --norecurrent --internal_dim=64 --trust_region_p \
--max_divergence=0.05 --value_opt=best_fit --critic_weight=0.0
```
Run a MuJoCo task with Trust-PCL:
```
python trainer.py --logtostderr --batch_size=1 --env=HalfCheetah-v1 \
--validation_frequency=50 --rollout=10 --critic_weight=0.0 \
--gamma=0.995 --clip_norm=40 --learning_rate=0.002 \
--replay_buffer_freq=1 --replay_buffer_size=20000 \
--replay_buffer_alpha=0.1 --norecurrent --objective=pcl \
--max_step=100 --tau=0.0 --eviction=fifo --max_divergence=0.001 \
--internal_dim=64 --cutoff_agent=1000 \
--replay_batch_size=25 --nouse_online_batch --batch_by_steps \
--sample_from=target --value_opt=grad --value_hidden_layers=2 \
--update_eps_lambda --unify_episodes --clip_adv=1.0 \
--target_network_lag=0.99 --prioritize_by=step
```
Run a MuJoCo task with PCL constraint trust region:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --tau=0.001 --rollout=50 --gamma=0.99 \
--max_step=1000 --cutoff_agent=1000 \
--objective=pcl --norecurrent --internal_dim=64 --trust_region_p \
--max_divergence=0.01 --value_opt=best_fit --critic_weight=0.0 \
--tau_decay=0.1 --tau_start=0.1
```
Maintained by Ofir Nachum (ofirnachum).
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Baseline model for value estimates.
Implements the value component of the neural network.
In some cases this is just an additional linear layer on the policy.
In other cases, it is a completely separate neural network.
"""
import tensorflow as tf
import numpy as np
class Baseline(object):
def __init__(self, env_spec, internal_policy_dim,
input_prev_actions=True,
input_time_step=False,
input_policy_state=True,
n_hidden_layers=0,
hidden_dim=64,
tau=0.0):
self.env_spec = env_spec
self.internal_policy_dim = internal_policy_dim
self.input_prev_actions = input_prev_actions
self.input_time_step = input_time_step
self.input_policy_state = input_policy_state
self.n_hidden_layers = n_hidden_layers
self.hidden_dim = hidden_dim
self.tau = tau
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
def get_inputs(self, time_step, obs, prev_actions,
internal_policy_states):
"""Get inputs to network as single tensor."""
inputs = [tf.ones_like(time_step)]
input_dim = 1
if not self.input_policy_state:
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
inputs.append(
tf.one_hot(obs[i], obs_dim))
input_dim += obs_dim
elif self.env_spec.is_box(obs_type):
cur_obs = obs[i]
inputs.append(cur_obs)
inputs.append(cur_obs ** 2)
input_dim += obs_dim * 2
else:
assert False
if self.input_prev_actions:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(act_type):
inputs.append(
tf.one_hot(prev_actions[i], act_dim))
input_dim += act_dim
elif self.env_spec.is_box(act_type):
inputs.append(prev_actions[i])
input_dim += act_dim
else:
assert False
if self.input_policy_state:
inputs.append(internal_policy_states)
input_dim += self.internal_policy_dim
if self.input_time_step:
scaled_time = 0.01 * time_step
inputs.extend([scaled_time, scaled_time ** 2, scaled_time ** 3])
input_dim += 3
return input_dim, tf.concat(inputs, 1)
def reshape_batched_inputs(self, all_obs, all_actions,
internal_policy_states, policy_logits):
"""Reshape inputs from [time_length, batch_size, ...] to
[time_length * batch_size, ...].
This allows for computing the value estimate in one go.
"""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_prev_act = []
reshaped_policy_logits = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
reshaped_policy_logits.append(
tf.reshape(policy_logits[i], [time_length * batch_size, -1]))
reshaped_internal_policy_states = tf.reshape(
internal_policy_states,
[time_length * batch_size, self.internal_policy_dim])
time_step = (float(self.input_time_step) *
tf.expand_dims(
tf.to_float(tf.range(time_length * batch_size) /
batch_size), -1))
return (time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits)
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
"""Get value estimates given input."""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
for depth in xrange(self.n_hidden_layers):
with tf.variable_scope('value_layer%d' % depth):
w = tf.get_variable('w', [input_dim, self.hidden_dim])
inputs = tf.nn.tanh(tf.matmul(inputs, w))
input_dim = self.hidden_dim
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
values = tf.matmul(inputs, w_v)
values = tf.reshape(values, [time_length, batch_size])
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
class UnifiedBaseline(Baseline):
"""Baseline for Unified PCL."""
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
def f_transform(q_values, tau):
max_q = tf.reduce_max(q_values, -1, keep_dims=True)
return tf.squeeze(max_q, [-1]) + tau * tf.log(
tf.reduce_sum(tf.exp((q_values - max_q) / tau), -1))
assert len(reshaped_policy_logits) == 1
values = f_transform((self.tau + self.eps_lambda) * reshaped_policy_logits[0],
(self.tau + self.eps_lambda))
values = tf.reshape(values, [time_length, batch_size])
# not used
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
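# Illustrative NumPy sketch of the stabilized log-sum-exp transform that
# UnifiedBaseline.f_transform above applies to the (tau + eps_lambda)-scaled
# policy logits; the q values and tau below are made-up numbers.
import numpy as np

def soft_value(q_values, tau):
  # V = tau * log sum_a exp(Q_a / tau), computed stably by factoring out the max.
  max_q = np.max(q_values, axis=-1, keepdims=True)
  return np.squeeze(max_q, -1) + tau * np.log(
      np.sum(np.exp((q_values - max_q) / tau), axis=-1))

q = np.array([[1.0, 2.0, 3.0]])
tau = 0.5
assert np.allclose(soft_value(q, tau),
                   tau * np.log(np.sum(np.exp(q / tau), axis=-1)))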
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Controller coordinates sampling and training model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import pickle
import random
flags = tf.flags
gfile = tf.gfile
FLAGS = flags.FLAGS
def find_best_eps_lambda(rewards, lengths):
"""Find the best lambda given a desired epsilon = FLAGS.max_divergence."""
# perhaps not the best way to do this
desired_div = FLAGS.max_divergence * np.mean(lengths)
def calc_divergence(eps_lambda):
max_reward = np.max(rewards)
logz = (max_reward / eps_lambda +
np.log(np.mean(np.exp((rewards - max_reward) / eps_lambda))))
exprr = np.mean(np.exp(rewards / eps_lambda - logz) *
rewards / eps_lambda)
return exprr - logz
left = 0.0
right = 1000.0
if len(rewards) <= 8:
return (left + right) / 2
num_iter = max(4, 1 + int(np.log((right - left) / 0.1) / np.log(2.0)))
for _ in xrange(num_iter):
mid = (left + right) / 2
cur_div = calc_divergence(mid)
if cur_div > desired_div:
left = mid
else:
right = mid
return (left + right) / 2
class Controller(object):
def __init__(self, env, env_spec, internal_dim,
use_online_batch=True,
batch_by_steps=False,
unify_episodes=False,
replay_batch_size=None,
max_step=None,
cutoff_agent=1,
save_trajectories_file=None,
use_trust_region=False,
use_value_opt=False,
update_eps_lambda=False,
prioritize_by='rewards',
get_model=None,
get_replay_buffer=None,
get_buffer_seeds=None):
self.env = env
self.env_spec = env_spec
self.internal_dim = internal_dim
self.use_online_batch = use_online_batch
self.batch_by_steps = batch_by_steps
self.unify_episodes = unify_episodes
self.replay_batch_size = replay_batch_size
self.max_step = max_step
self.cutoff_agent = cutoff_agent
self.save_trajectories_file = save_trajectories_file
self.use_trust_region = use_trust_region
self.use_value_opt = use_value_opt
self.update_eps_lambda = update_eps_lambda
self.prioritize_by = prioritize_by
self.model = get_model()
self.replay_buffer = get_replay_buffer()
self.seed_replay_buffer(get_buffer_seeds())
self.internal_state = np.array([self.initial_internal_state()] *
len(self.env))
self.last_obs = self.env_spec.initial_obs(len(self.env))
self.last_act = self.env_spec.initial_act(len(self.env))
self.last_pad = np.zeros(len(self.env))
self.start_episode = np.array([True] * len(self.env))
self.step_count = np.array([0] * len(self.env))
self.episode_running_rewards = np.zeros(len(self.env))
self.episode_running_lengths = np.zeros(len(self.env))
self.episode_rewards = []
self.episode_lengths = []
self.total_rewards = []
self.best_batch_rewards = None
def setup(self):
self.model.setup()
def initial_internal_state(self):
return np.zeros(self.model.policy.rnn_state_dim)
def _sample_episodes(self, sess, greedy=False):
"""Sample episodes from environment using model."""
# reset environments as necessary
obs_after_reset = self.env.reset_if(self.start_episode)
for i, obs in enumerate(obs_after_reset):
if obs is not None:
self.step_count[i] = 0
self.internal_state[i] = self.initial_internal_state()
for j in xrange(len(self.env_spec.obs_dims)):
self.last_obs[j][i] = obs[j]
for j in xrange(len(self.env_spec.act_dims)):
self.last_act[j][i] = -1
self.last_pad[i] = 0
# maintain episode as a single unit if the last sampling
# batch ended before the episode was terminated
if self.unify_episodes:
assert len(obs_after_reset) == 1
new_ep = obs_after_reset[0] is not None
else:
new_ep = True
self.start_id = 0 if new_ep else len(self.all_obs[:])
initial_state = self.internal_state
all_obs = [] if new_ep else self.all_obs[:]
all_act = ([self.last_act] if new_ep else self.all_act[:])
all_pad = [] if new_ep else self.all_pad[:]
rewards = [] if new_ep else self.rewards[:]
# start stepping in the environments
step = 0
while not self.env.all_done():
self.step_count += 1 - np.array(self.env.dones)
next_internal_state, sampled_actions = self.model.sample_step(
sess, self.last_obs, self.internal_state, self.last_act,
greedy=greedy)
env_actions = self.env_spec.convert_actions_to_env(sampled_actions)
next_obs, reward, next_dones, _ = self.env.step(env_actions)
all_obs.append(self.last_obs)
all_act.append(sampled_actions)
all_pad.append(self.last_pad)
rewards.append(reward)
self.internal_state = next_internal_state
self.last_obs = next_obs
self.last_act = sampled_actions
self.last_pad = np.array(next_dones).astype('float32')
step += 1
if self.max_step and step >= self.max_step:
break
self.all_obs = all_obs[:]
self.all_act = all_act[:]
self.all_pad = all_pad[:]
self.rewards = rewards[:]
# append final observation
all_obs.append(self.last_obs)
return initial_state, all_obs, all_act, rewards, all_pad
def sample_episodes(self, sess):
"""Sample steps from the environment until we have enough for a batch."""
# check if last batch ended with episode that was not terminated
if self.unify_episodes:
self.all_new_ep = self.start_episode[0]
# sample episodes until we either have enough episodes or enough steps
episodes = []
total_steps = 0
while total_steps < self.max_step * len(self.env):
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess)
observations = zip(*observations)
actions = zip(*actions)
terminated = np.array(self.env.dones)
self.total_rewards = np.sum(np.array(rewards[self.start_id:]) *
(1 - np.array(pads[self.start_id:])), axis=0)
self.episode_running_rewards *= 1 - self.start_episode
self.episode_running_lengths *= 1 - self.start_episode
self.episode_running_rewards += self.total_rewards
self.episode_running_lengths += np.sum(1 - np.array(pads[self.start_id:]), axis=0)
episodes.extend(self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads))
total_steps += np.sum(1 - np.array(pads))
# set next starting episodes
self.start_episode = np.logical_or(terminated,
self.step_count >= self.cutoff_agent)
episode_rewards = self.episode_running_rewards[self.start_episode].tolist()
self.episode_rewards.extend(episode_rewards)
self.episode_lengths.extend(self.episode_running_lengths[self.start_episode].tolist())
self.episode_rewards = self.episode_rewards[-100:]
self.episode_lengths = self.episode_lengths[-100:]
if (self.save_trajectories_file is not None and
(self.best_batch_rewards is None or
np.mean(self.total_rewards) > self.best_batch_rewards)):
self.best_batch_rewards = np.mean(self.total_rewards)
my_episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
with gfile.GFile(self.save_trajectories_file, 'w') as f:
pickle.dump(my_episodes, f)
if not self.batch_by_steps:
return (initial_state,
observations, actions, rewards,
terminated, pads)
return self.convert_to_batched_episodes(episodes)
def _train(self, sess,
observations, initial_state, actions,
rewards, terminated, pads):
"""Train model using batch."""
if self.use_trust_region:
# use trust region to optimize policy
loss, _, summary = self.model.trust_region_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
else: # otherwise use simple gradient descent on policy
loss, _, summary = self.model.train_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
if self.use_value_opt: # optionally perform specific value optimization
self.model.fit_values(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary
def train(self, sess):
"""Sample some episodes and train on some episodes."""
cur_step = sess.run(self.model.inc_global_step)
self.cur_step = cur_step
# on the first iteration, set target network close to online network
if self.cur_step == 0:
for _ in xrange(100):
sess.run(self.model.copy_op)
# on other iterations, just perform single target <-- online operation
sess.run(self.model.copy_op)
# sample from env
(initial_state,
observations, actions, rewards,
terminated, pads) = self.sample_episodes(sess)
# add to replay buffer
self.add_to_replay_buffer(
initial_state, observations, actions,
rewards, terminated, pads)
loss, summary = 0, None
# train on online batch
if self.use_online_batch:
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
# update relative entropy coefficient
if self.update_eps_lambda:
episode_rewards = np.array(self.episode_rewards)
episode_lengths = np.array(self.episode_lengths)
eps_lambda = find_best_eps_lambda(episode_rewards, episode_lengths)
sess.run(self.model.objective.assign_eps_lambda,
feed_dict={self.model.objective.new_eps_lambda: eps_lambda})
# train on replay batch
replay_batch, replay_probs = self.get_from_replay_buffer(
self.replay_batch_size)
if replay_batch:
(initial_state,
observations, actions, rewards,
terminated, pads) = replay_batch
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary, self.total_rewards, self.episode_rewards
def eval(self, sess):
"""Use greedy sampling."""
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess, greedy=True)
total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)
return np.mean(total_rewards)
def convert_from_batched_episodes(
self, initial_state, observations, actions, rewards,
terminated, pads):
"""Convert time-major batch of episodes to batch-major list of episodes."""
rewards = np.array(rewards)
pads = np.array(pads)
observations = [np.array(obs) for obs in observations]
actions = [np.array(act) for act in actions]
total_rewards = np.sum(rewards * (1 - pads), axis=0)
total_length = np.sum(1 - pads, axis=0).astype('int32')
episodes = []
num_episodes = rewards.shape[1]
for i in xrange(num_episodes):
length = total_length[i]
ep_initial = initial_state[i]
ep_obs = [obs[:length, i, ...] for obs in observations]
ep_act = [act[:length + 1, i, ...] for act in actions]
ep_rewards = rewards[:length, i]
episodes.append(
[ep_initial, ep_obs, ep_act, ep_rewards, terminated[i]])
return episodes
def convert_to_batched_episodes(self, episodes, max_length=None):
"""Convert batch-major list of episodes to time-major batch of episodes."""
lengths = [len(ep[-2]) for ep in episodes]
max_length = max_length or max(lengths)
new_episodes = []
for ep, length in zip(episodes, lengths):
initial, observations, actions, rewards, terminated = ep
observations = [np.resize(obs, [max_length + 1] + list(obs.shape)[1:])
for obs in observations]
actions = [np.resize(act, [max_length + 1] + list(act.shape)[1:])
for act in actions]
pads = np.array([0] * length + [1] * (max_length - length))
rewards = np.resize(rewards, [max_length]) * (1 - pads)
new_episodes.append([initial, observations, actions, rewards,
terminated, pads])
(initial, observations, actions, rewards,
terminated, pads) = zip(*new_episodes)
observations = [np.swapaxes(obs, 0, 1)
for obs in zip(*observations)]
actions = [np.swapaxes(act, 0, 1)
for act in zip(*actions)]
rewards = np.transpose(rewards)
pads = np.transpose(pads)
return (initial, observations, actions, rewards, terminated, pads)
def add_to_replay_buffer(self, initial_state,
observations, actions, rewards,
terminated, pads):
"""Add batch of episodes to replay buffer."""
if self.replay_buffer is None:
return
rewards = np.array(rewards)
pads = np.array(pads)
total_rewards = np.sum(rewards * (1 - pads), axis=0)
episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
priorities = (total_rewards if self.prioritize_by == 'reward'
else self.cur_step)
if not self.unify_episodes or self.all_new_ep:
self.last_idxs = self.replay_buffer.add(
episodes, priorities)
else:
# If we are unifying episodes, we attempt to
# keep them unified in the replay buffer.
# The first episode sampled in the current batch is a
# continuation of the last episode from the previous batch
self.replay_buffer.add(episodes[:1], priorities, self.last_idxs[-1:])
if len(episodes) > 1:
self.replay_buffer.add(episodes[1:], priorities)
def get_from_replay_buffer(self, batch_size):
"""Sample a batch of episodes from the replay buffer."""
if self.replay_buffer is None or len(self.replay_buffer) < 1 * batch_size:
return None, None
desired_count = batch_size * self.max_step
# in the case of batch_by_steps, we sample larger and larger
# amounts from the replay buffer until we have enough steps.
while True:
if batch_size > len(self.replay_buffer):
batch_size = len(self.replay_buffer)
episodes, probs = self.replay_buffer.get_batch(batch_size)
count = sum(len(ep[-2]) for ep in episodes)
if count >= desired_count or not self.batch_by_steps:
break
if batch_size == len(self.replay_buffer):
return None, None
batch_size *= 1.2
return (self.convert_to_batched_episodes(episodes), probs)
def seed_replay_buffer(self, episodes):
"""Seed the replay buffer with some episodes."""
if self.replay_buffer is None:
return
# just need to add initial state
for i in xrange(len(episodes)):
episodes[i] = [self.initial_internal_state()] + episodes[i]
self.replay_buffer.seed_buffer(episodes)
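# Illustrative NumPy sketch of the time-major layout the controller works with
# (the arrays below are made-up examples): rewards[t, b] is the reward at step
# t of episode b, and pads[t, b] == 1 marks steps past the end of episode b,
# mirroring the reductions in convert_from_batched_episodes above.
import numpy as np

rewards = np.array([[1., 1.],
                    [1., 0.],
                    [1., 0.]])   # 3 time steps, 2 episodes
pads = np.array([[0., 0.],
                 [0., 1.],
                 [0., 1.]])      # episode 1 terminated after one step
total_rewards = np.sum(rewards * (1 - pads), axis=0)       # [3., 1.]
total_length = np.sum(1 - pads, axis=0).astype('int32')    # [3, 1]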
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for environment interface with agent / tensorflow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class spaces(object):
discrete = 0
box = 1
def get_space(space):
if hasattr(space, 'n'):
return space.n, spaces.discrete, None
elif hasattr(space, 'shape'):
return np.prod(space.shape), spaces.box, (space.low, space.high)
def get_spaces(spaces):
if hasattr(spaces, 'spaces'):
return zip(*[get_space(space) for space in spaces.spaces])
else:
return [(ret,) for ret in get_space(spaces)]
class EnvSpec(object):
def __init__(self, env, try_combining_actions=True,
discretize_actions=None):
self.discretize_actions = discretize_actions
# figure out observation space
self.obs_space = env.observation_space
self.obs_dims, self.obs_types, self.obs_info = get_spaces(self.obs_space)
# figure out action space
self.act_space = env.action_space
self.act_dims, self.act_types, self.act_info = get_spaces(self.act_space)
if self.discretize_actions:
self._act_dims = self.act_dims[:]
self._act_types = self.act_types[:]
self.act_dims = []
self.act_types = []
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
self.act_dims.append(dim)
self.act_types.append(spaces.discrete)
elif typ == spaces.box:
for _ in xrange(dim):
self.act_dims.append(self.discretize_actions)
self.act_types.append(spaces.discrete)
else:
self._act_dims = None
self._act_types = None
if (try_combining_actions and
all(typ == spaces.discrete for typ in self.act_types)):
self.combine_actions = True
self.orig_act_dims = self.act_dims[:]
self.orig_act_types = self.act_types[:]
total_act_dim = 1
for dim in self.act_dims:
total_act_dim *= dim
self.act_dims = [total_act_dim]
self.act_types = [spaces.discrete]
else:
self.combine_actions = False
self.obs_dims_and_types = zip(self.obs_dims, self.obs_types)
self.act_dims_and_types = zip(self.act_dims, self.act_types)
self.total_obs_dim = sum(self.obs_dims)
self.total_sampling_act_dim = sum(self.sampling_dim(dim, typ)
for dim, typ in self.act_dims_and_types)
self.total_sampled_act_dim = sum(self.act_dims)
def sampling_dim(self, dim, typ):
if typ == spaces.discrete:
return dim
elif typ == spaces.box:
return 2 * dim # Gaussian mean and std
else:
assert False
def convert_actions_to_env(self, actions):
if self.combine_actions:
new_actions = []
actions = actions[0]
for dim in self.orig_act_dims:
new_actions.append(np.mod(actions, dim))
actions = (actions / dim).astype('int32')
actions = new_actions
if self.discretize_actions:
new_actions = []
idx = 0
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
new_actions.append(actions[idx])
idx += 1
elif typ == spaces.box:
low, high = self.act_info[i]
cur_action = []
for j in xrange(dim):
cur_action.append(
low[j] + (high[j] - low[j]) * actions[idx] /
float(self.discretize_actions))
idx += 1
new_actions.append(np.hstack(cur_action))
actions = new_actions
return actions
def convert_env_actions_to_actions(self, actions):
if not self.combine_actions:
return actions
new_actions = 0
base = 1
for act, dim in zip(actions, self.orig_act_dims):
new_actions = new_actions + base * act
base *= dim
return [new_actions]
def convert_obs_to_list(self, obs):
if len(self.obs_dims) == 1:
return [obs]
else:
return list(obs)
def convert_action_to_gym(self, action):
  if ((not self.combine_actions or len(self.orig_act_dims) == 1) and
      (len(self.act_dims) == 1 or
       (self.discretize_actions and len(self._act_dims) == 1))):
    return action[0]
  else:
    return list(action)
def initial_obs(self, batch_size):
batched = batch_size is not None
batch_size = batch_size or 1
obs = []
for dim, typ in self.obs_dims_and_types:
if typ == spaces.discrete:
obs.append(np.zeros(batch_size))
elif typ == spaces.box:
obs.append(np.zeros([batch_size, dim]))
if batched:
return obs
else:
return zip(*obs)[0]
def initial_act(self, batch_size=None):
batched = batch_size is not None
batch_size = batch_size or 1
act = []
for dim, typ in self.act_dims_and_types:
if typ == spaces.discrete:
act.append(-np.ones(batch_size))
elif typ == spaces.box:
act.append(-np.ones([batch_size, dim]))
if batched:
return act
else:
return zip(*act)[0]
def is_discrete(self, typ):
return typ == spaces.discrete
def is_box(self, typ):
return typ == spaces.box
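# Illustrative sketch of the action packing used when try_combining_actions
# collapses several discrete sub-actions into a single index (the dims and
# values below are made up); it mirrors convert_env_actions_to_actions and the
# decoding loop in convert_actions_to_env above.
orig_act_dims = [2, 2, 5]
env_action = [1, 0, 3]          # one entry per original discrete sub-space

packed = 0
base = 1
for act, dim in zip(env_action, orig_act_dims):
  packed += base * act          # mixed-radix encoding
  base *= dim                   # packed == 1 + 0*2 + 3*4 == 13

decoded = []
rest = packed
for dim in orig_act_dims:
  decoded.append(rest % dim)    # peel off one digit per sub-space
  rest //= dim
assert decoded == env_action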
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Expert paths/trajectories.
For producing or loading expert trajectories in environment.
"""
import tensorflow as tf
import random
import os
import numpy as np
import pickle
gfile = tf.gfile
def sample_expert_paths(num, env_str, env_spec,
load_trajectories_file=None):
"""Sample a number of expert paths randomly."""
if load_trajectories_file is not None:
if not gfile.Exists(load_trajectories_file):
assert False, 'trajectories file %s does not exist' % load_trajectories_file
with gfile.GFile(load_trajectories_file, 'r') as f:
episodes = pickle.load(f)
episodes = random.sample(episodes, num)
return [ep[1:] for ep in episodes]
return [sample_expert_path(env_str, env_spec)
for _ in xrange(num)]
def sample_expert_path(env_str, env_spec):
"""Algorithmic tasks have known distribution of expert paths we sample from."""
t = random.randint(2, 10) # sequence length
observations = []
actions = [env_spec.initial_act(None)]
rewards = []
if env_str in ['DuplicatedInput-v0', 'Copy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(t):
char_idx = tt // 2 if env_str == 'DuplicatedInput-v0' else tt
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1, (tt + 1) % 2, char])
rewards.append((tt + 1) % 2)
elif env_str in ['RepeatCopy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(3 * t + 2):
char_idx = (tt if tt < t else
2 * t - tt if tt <= 2 * t else
tt - 2 * t - 2)
if tt in [t, 2 * t + 1]:
char = chars
else:
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1 if tt < t else 0 if tt <= 2 * t else 1,
tt not in [t, 2 * t + 1], char])
rewards.append(actions[-1][-2])
elif env_str in ['Reverse-v0']:
chars = 2
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(2 * t + 1):
char_idx = tt if tt < t else 2 * t - tt
if tt != t:
char = random_ints[char_idx] % chars
else:
char = chars
observations.append([char])
actions.append([tt < t, tt > t, char])
rewards.append(tt > t)
elif env_str in ['ReversedAddition-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 2 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 1, 2: 2, 3: 1}
for tt in xrange(2 * t + 1):
char_idx = tt
if tt >= 2 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 2 == 1:
tot = char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 2 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 2 or tt == 2 * t, tot])
rewards.append(tt % 2 or tt == 2 * t)
elif env_str in ['ReversedAddition3-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 3 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 3, 2: 1, 3: 2, 4:2, 5: 1}
for tt in xrange(3 * t + 1):
char_idx = tt
if tt >= 3 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 3 == 2:
tot = char_history[-3] + char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 3 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 3 == 2 or tt == 3 * t, tot])
rewards.append(tt % 3 == 2 or tt == 3 * t)
else:
assert False, 'No expert trajectories for env %s' % env_str
actions = [
env_spec.convert_env_actions_to_actions(act)
for act in actions]
observations.append([chars])
observations = [np.array(obs) for obs in zip(*observations)]
actions = [np.array(act) for act in zip(*actions)]
rewards = np.array(rewards)
return [observations, actions, rewards, True]
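if __name__ == '__main__':
  # Illustrative usage sketch: it assumes a gym release that still includes
  # the algorithmic Copy-v0 task and that gym_wrapper from this package is
  # importable.
  import gym_wrapper
  env = gym_wrapper.GymWrapper('Copy-v0', distinct=1, count=1)
  paths = sample_expert_paths(2, 'Copy-v0', env.env_spec)
  # Each path is [observations, actions, rewards, terminated], matching the
  # return value of sample_expert_path above.
  assert len(paths) == 2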
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives for full-episode.
Implementations of UREX & REINFORCE. Note that these implementations
use a non-parametric baseline to reduce variance. Thus, multiple
samples with the same seed must be taken from the environment.
"""
import tensorflow as tf
import objective
class Reinforce(objective.Objective):
def __init__(self, learning_rate, clip_norm, num_samples,
tau=0.1, bonus_weight=1.0):
super(Reinforce, self).__init__(learning_rate, clip_norm=clip_norm)
self.num_samples = num_samples
assert self.num_samples > 1
self.tau = tau
self.bonus_weight = bonus_weight
self.eps_lambda = 0.0
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
return -self.tau * total_log_probs
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
seq_length = tf.shape(rewards)[0]
not_pad = tf.reshape(1 - pads, [seq_length, -1, self.num_samples])
rewards = not_pad * tf.reshape(rewards, [seq_length, -1, self.num_samples])
log_probs = not_pad * tf.reshape(sum(log_probs), [seq_length, -1, self.num_samples])
total_rewards = tf.reduce_sum(rewards, 0)
total_log_probs = tf.reduce_sum(log_probs, 0)
rewards_and_bonus = (total_rewards +
self.bonus_weight *
self.get_bonus(total_rewards, total_log_probs))
baseline = tf.reduce_mean(rewards_and_bonus, 1, keep_dims=True)
loss = -tf.stop_gradient(rewards_and_bonus - baseline) * total_log_probs
loss = tf.reduce_mean(loss)
raw_loss = loss # TODO
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', total_log_probs)
tf.summary.histogram('rewards', total_rewards)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(total_rewards))
tf.summary.scalar('loss', loss)
return loss, raw_loss, baseline, gradient_ops, tf.summary.merge_all()
class UREX(Reinforce):
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
discrepancy = total_rewards / self.tau - total_log_probs
normalized_d = self.num_samples * tf.nn.softmax(discrepancy)
return self.tau * normalized_d
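# Illustrative NumPy sketch of the UREX bonus above (the rewards and log
# probabilities below are made-up numbers): samples whose discrepancy
# r/tau - log_prob is largest, i.e. under-appreciated samples with high reward
# but low log-probability, receive the largest bonus weight.
import numpy as np

tau = 0.1
total_rewards = np.array([1.0, 1.0, 0.0, 0.0])
total_log_probs = np.array([-2.0, -8.0, -2.0, -8.0])
num_samples = len(total_rewards)

discrepancy = total_rewards / tau - total_log_probs
shifted = discrepancy - np.max(discrepancy)            # stable softmax
normalized_d = num_samples * np.exp(shifted) / np.sum(np.exp(shifted))
bonus = tau * normalized_d
assert np.argmax(bonus) == 1   # the high-reward, low-log-probability sample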
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper around gym env.
Allows for using batches of possibly identically seeded environments.
"""
import gym
import numpy as np
import random
import env_spec
def get_env(env_str):
return gym.make(env_str)
class GymWrapper(object):
def __init__(self, env_str, distinct=1, count=1, seeds=None):
self.distinct = distinct
self.count = count
self.total = self.distinct * self.count
self.seeds = seeds or [random.randint(0, 1e12)
for _ in xrange(self.distinct)]
self.envs = []
for seed in self.seeds:
for _ in xrange(self.count):
env = get_env(env_str)
env.seed(seed)
if hasattr(env, 'last'):
env.last = 100 # for algorithmic envs
self.envs.append(env)
self.dones = [True] * self.total
self.num_episodes_played = 0
one_env = self.get_one()
self.use_action_list = hasattr(one_env.action_space, 'spaces')
self.env_spec = env_spec.EnvSpec(self.get_one())
def get_seeds(self):
return self.seeds
def reset(self):
self.dones = [False] * self.total
self.num_episodes_played += len(self.envs)
# reset seeds to be synchronized
self.seeds = [random.randint(0, 1e12) for _ in xrange(self.distinct)]
counter = 0
for seed in self.seeds:
for _ in xrange(self.count):
self.envs[counter].seed(seed)
counter += 1
return [self.env_spec.convert_obs_to_list(env.reset())
for env in self.envs]
def reset_if(self, predicate=None):
if predicate is None:
predicate = self.dones
if self.count != 1:
assert np.all(predicate)
return self.reset()
self.num_episodes_played += sum(predicate)
output = [self.env_spec.convert_obs_to_list(env.reset())
if pred else None
for env, pred in zip(self.envs, predicate)]
for i, pred in enumerate(predicate):
if pred:
self.dones[i] = False
return output
def all_done(self):
return all(self.dones)
def step(self, actions):
def env_step(action):
action = self.env_spec.convert_action_to_gym(action)
obs, reward, done, tt = env.step(action)
obs = self.env_spec.convert_obs_to_list(obs)
return obs, reward, done, tt
actions = zip(*actions)
outputs = [env_step(action)
if not done else (self.env_spec.initial_obs(None), 0, True, None)
for action, env, done in zip(actions, self.envs, self.dones)]
for i, (_, _, done, _) in enumerate(outputs):
self.dones[i] = self.dones[i] or done
obs, reward, done, tt = zip(*outputs)
obs = [list(oo) for oo in zip(*obs)]
return [obs, reward, done, tt]
def get_one(self):
return random.choice(self.envs)
def __len__(self):
return len(self.envs)
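if __name__ == '__main__':
  # Illustrative sketch (assumes a gym release that still includes the
  # algorithmic DuplicatedInput-v0 task): with distinct=1 and count=10, all
  # ten copies share one seed on every reset, which is what the full-episode
  # REINFORCE/UREX objectives rely on for their non-parametric baseline.
  batch = GymWrapper('DuplicatedInput-v0', distinct=1, count=10)
  first_obs = batch.reset()
  assert len(batch) == 10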
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model is responsible for setting up Tensorflow graph.
Creates policy and value networks. Also sets up all optimization
ops, including gradient ops, trust region ops, and value optimizers.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class Model(object):
def __init__(self, env_spec, global_step,
target_network_lag=0.95,
sample_from='online',
get_policy=None,
get_baseline=None,
get_objective=None,
get_trust_region_p_opt=None,
get_value_opt=None):
self.env_spec = env_spec
self.global_step = global_step
self.inc_global_step = self.global_step.assign_add(1)
self.target_network_lag = target_network_lag
self.sample_from = sample_from
self.policy = get_policy()
self.baseline = get_baseline()
self.objective = get_objective()
self.baseline.eps_lambda = self.objective.eps_lambda # TODO: do this better
self.trust_region_policy_opt = get_trust_region_p_opt()
self.value_opt = get_value_opt()
def setup_placeholders(self):
"""Create the Tensorflow placeholders."""
# summary placeholder
self.avg_episode_reward = tf.placeholder(
tf.float32, [], 'avg_episode_reward')
# sampling placeholders
self.internal_state = tf.placeholder(tf.float32,
[None, self.policy.rnn_state_dim],
'internal_state')
self.single_observation = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.single_observation.append(
tf.placeholder(tf.int32, [None], 'obs%d' % i))
elif self.env_spec.is_box(obs_type):
self.single_observation.append(
tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
else:
assert False
self.single_action = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.single_action.append(
tf.placeholder(tf.int32, [None], 'act%d' % i))
elif self.env_spec.is_box(action_type):
self.single_action.append(
tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
else:
assert False
# training placeholders
self.observations = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.observations.append(
tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
else:
self.observations.append(
tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))
self.actions = []
self.other_logits = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.actions.append(
tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
if self.env_spec.is_box(action_type):
self.actions.append(
tf.placeholder(tf.float32, [None, None, action_dim],
'all_act%d' % i))
self.other_logits.append(
tf.placeholder(tf.float32, [None, None, None],
'other_logits%d' % i))
self.rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
self.terminated = tf.placeholder(tf.float32, [None], 'terminated')
self.pads = tf.placeholder(tf.float32, [None, None], 'pads')
self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
'prev_log_probs')
def setup(self):
"""Setup Tensorflow Graph."""
self.setup_placeholders()
tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
with tf.variable_scope('model', reuse=None):
# policy network
with tf.variable_scope('policy_net'):
(self.policy_internal_states, self.logits, self.log_probs,
self.entropies, self.self_kls) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
self.out_log_probs = sum(self.log_probs)
self.kl = self.policy.calculate_kl(self.other_logits, self.logits)
self.avg_kl = (tf.reduce_sum(sum(self.kl)[:-1] * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
# value network
with tf.variable_scope('value_net'):
(self.values,
self.regression_input,
self.regression_weight) = self.baseline.get_values(
self.observations, self.actions,
self.policy_internal_states, self.logits)
# target policy network
with tf.variable_scope('target_policy_net'):
(self.target_policy_internal_states,
self.target_logits, self.target_log_probs,
_, _) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
# target value network
with tf.variable_scope('target_value_net'):
(self.target_values, _, _) = self.baseline.get_values(
self.observations, self.actions,
self.target_policy_internal_states, self.target_logits)
# construct copy op online --> target
all_vars = tf.trainable_variables()
online_vars = [p for p in all_vars if
'/policy_net' in p.name or '/value_net' in p.name]
target_vars = [p for p in all_vars if
'target_policy_net' in p.name or 'target_value_net' in p.name]
online_vars.sort(key=lambda p: p.name)
target_vars.sort(key=lambda p: p.name)
aa = self.target_network_lag
self.copy_op = tf.group(*[
target_p.assign(aa * target_p + (1 - aa) * online_p)
for online_p, target_p in zip(online_vars, target_vars)])
# evaluate objective
(self.loss, self.raw_loss, self.regression_target,
self.gradient_ops, self.summary) = self.objective.get(
self.rewards, self.pads,
self.values[:-1, :],
self.values[-1, :] * (1 - self.terminated),
self.log_probs, self.prev_log_probs, self.target_log_probs,
self.entropies,
self.logits)
self.regression_target = tf.reshape(self.regression_target, [-1])
self.policy_vars = [
v for v in tf.trainable_variables()
if '/policy_net' in v.name]
self.value_vars = [
v for v in tf.trainable_variables()
if '/value_net' in v.name]
# trust region optimizer
if self.trust_region_policy_opt is not None:
with tf.variable_scope('trust_region_policy', reuse=None):
avg_self_kl = (
tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
self.trust_region_policy_opt.setup(
self.policy_vars, self.raw_loss, avg_self_kl,
self.avg_kl)
# value optimizer
if self.value_opt is not None:
with tf.variable_scope('trust_region_value', reuse=None):
self.value_opt.setup(
self.value_vars,
tf.reshape(self.values[:-1, :], [-1]),
self.regression_target,
tf.reshape(self.pads, [-1]),
self.regression_input, self.regression_weight)
# we re-use variables for the sampling operations
with tf.variable_scope('model', reuse=True):
scope = ('target_policy_net' if self.sample_from == 'target'
else 'policy_net')
with tf.variable_scope(scope):
self.next_internal_state, self.sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action)
self.greedy_next_internal_state, self.greedy_sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action,
greedy=True)
def sample_step(self, sess,
single_observation, internal_state, single_action,
greedy=False):
"""Sample batch of steps from policy."""
if greedy:
outputs = [self.greedy_next_internal_state, self.greedy_sampled_actions]
else:
outputs = [self.next_internal_state, self.sampled_actions]
feed_dict = {self.internal_state: internal_state}
for action_place, action in zip(self.single_action, single_action):
feed_dict[action_place] = action
for obs_place, obs in zip(self.single_observation, single_observation):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def train_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train network using standard gradient descent."""
outputs = [self.raw_loss, self.gradient_ops, self.summary]
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def trust_region_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train policy using trust region step."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
(prev_log_probs, prev_logits) = sess.run(
[self.out_log_probs, self.logits], feed_dict=feed_dict)
feed_dict[self.prev_log_probs] = prev_log_probs
for other_logit, prev_logit in zip(self.other_logits, prev_logits):
feed_dict[other_logit] = prev_logit
# fit policy
self.trust_region_policy_opt.optimize(sess, feed_dict)
ret = sess.run([self.raw_loss, self.summary], feed_dict=feed_dict)
ret = [ret[0], None, ret[1]]
return ret
def fit_values(self, sess,
observations, internal_state, actions,
rewards, terminated, pads):
"""Train value network using value-specific optimizer."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
# fit values
if self.value_opt is None:
raise ValueError('Specific value optimizer does not exist')
self.value_opt.optimize(sess, feed_dict)
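# Illustrative NumPy sketch of the soft target-network update built by setup()
# above (the lag and parameter values below are made up): each run of copy_op
# moves the target parameters a fraction (1 - lag) toward the online ones.
import numpy as np

lag = 0.99                        # target_network_lag
online = np.array([1.0, -2.0])
target = np.zeros(2)
for _ in range(100):              # mirrors the repeated copy_op runs at step 0
  target = lag * target + (1 - lag) * online
assert np.allclose(target, online * (1 - lag ** 100))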
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives to compute loss and value targets.
Implements Actor Critic, PCL (vanilla PCL, Unified PCL, Trust PCL), and TRPO.
"""
import tensorflow as tf
import numpy as np
class Objective(object):
def __init__(self, learning_rate, clip_norm):
self.learning_rate = learning_rate
self.clip_norm = clip_norm
def get_optimizer(self, learning_rate):
"""Optimizer for gradient descent ops."""
return tf.train.AdamOptimizer(learning_rate=learning_rate,
epsilon=2e-4)
def training_ops(self, loss, learning_rate=None):
"""Gradient ops."""
opt = self.get_optimizer(learning_rate)
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
if self.clip_norm:
grads, global_norm = tf.clip_by_global_norm(grads, self.clip_norm)
tf.summary.scalar('grad_global_norm', global_norm)
return opt.apply_gradients(zip(grads, params))
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
"""Get objective calculations."""
raise NotImplementedError()
def discounted_future_sum(values, discount, rollout):
"""Discounted future sum of time-major values."""
discount_filter = tf.reshape(
discount ** tf.range(float(rollout)), [-1, 1, 1])
expanded_values = tf.concat(
[values, tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
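# Illustrative NumPy reference for discounted_future_sum above (the array
# below is a made-up example):
# out[t, b] = sum_{k < rollout} discount**k * values[t + k, b],
# with values treated as zero past the final step.
import numpy as np

def discounted_future_sum_np(values, discount, rollout):
  time, batch = values.shape
  padded = np.concatenate([values, np.zeros([rollout - 1, batch])], 0)
  return np.stack([
      sum(discount ** k * padded[t + k] for k in range(rollout))
      for t in range(time)], 0)

vals = np.arange(6.0).reshape(3, 2)          # [[0, 1], [2, 3], [4, 5]]
out = discounted_future_sum_np(vals, 0.9, 2)
assert np.isclose(out[0, 0], 0.0 + 0.9 * 2.0)
assert np.isclose(out[2, 1], 5.0)            # nothing left to look ahead to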
def discounted_two_sided_sum(values, discount, rollout):
"""Discounted two-sided sum of time-major values."""
roll = float(rollout)
discount_filter = tf.reshape(
discount ** tf.abs(tf.range(-roll + 1, roll)), [-1, 1, 1])
expanded_values = tf.concat(
[tf.zeros([rollout - 1, tf.shape(values)[1]]), values,
tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
def shift_values(values, discount, rollout, final_values=0.0):
"""Shift values up by some amount of time.
Those values that shift from a value beyond the last value
are calculated using final_values.
"""
roll_range = tf.cumsum(tf.ones_like(values[:rollout, :]), 0,
exclusive=True, reverse=True)
final_pad = tf.expand_dims(final_values, 0) * discount ** roll_range
return tf.concat([discount ** rollout * values[rollout:, :],
final_pad], 0)
class ActorCritic(Objective):
"""Standard Actor-Critic."""
def __init__(self, learning_rate, clip_norm=5,
policy_weight=1.0, critic_weight=0.1,
tau=0.1, gamma=1.0, rollout=10,
eps_lambda=0.0, clip_adv=None):
super(ActorCritic, self).__init__(learning_rate, clip_norm=clip_norm)
self.policy_weight = policy_weight
self.critic_weight = critic_weight
self.tau = tau
self.gamma = gamma
self.rollout = rollout
self.clip_adv = clip_adv
self.eps_lambda = tf.get_variable( # TODO: need a better way
'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda))
self.new_eps_lambda = tf.placeholder(tf.float32, [])
self.assign_eps_lambda = self.eps_lambda.assign(
0.95 * self.eps_lambda + 0.05 * self.new_eps_lambda)
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
entropy = not_pad * sum(entropies)
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * log_probs
critic_loss = -adv * baseline_values
regularizer = -self.tau * entropy
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
regularizer = tf.reduce_mean(
tf.reduce_sum(regularizer * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss + regularizer)
raw_loss = tf.reduce_mean( # TODO
tf.reduce_sum(not_pad * policy_loss, 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
tf.summary.scalar('critic_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
class PCL(ActorCritic):
"""PCL implementation.
Implements vanilla PCL, Unified PCL, and Trust PCL depending
on provided inputs.
"""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
target_log_probs = not_pad * tf.stop_gradient(sum(target_log_probs))
relative_log_probs = not_pad * (log_probs - target_log_probs)
# Prepend.
not_pad = tf.concat([tf.ones([self.rollout - 1, batch_size]),
not_pad], 0)
rewards = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
rewards], 0)
value_estimates = tf.concat(
[self.gamma ** tf.expand_dims(
tf.range(float(self.rollout - 1), 0, -1), 1) *
tf.ones([self.rollout - 1, batch_size]) *
value_estimates[0:1, :],
value_estimates], 0)
log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
log_probs], 0)
prev_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
prev_log_probs], 0)
relative_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
relative_log_probs], 0)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
sum_log_probs = discounted_future_sum(log_probs, self.gamma, self.rollout)
sum_prev_log_probs = discounted_future_sum(prev_log_probs, self.gamma, self.rollout)
sum_relative_log_probs = discounted_future_sum(
relative_log_probs, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = (
- self.tau * sum_log_probs
- self.eps_lambda * sum_relative_log_probs
+ sum_rewards + last_values)
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * sum_log_probs
critic_loss = -adv * (baseline_values - last_values)
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
# actual quantity we're trying to minimize
raw_loss = tf.reduce_mean(
tf.reduce_sum(not_pad * adv * (-baseline_values + future_values), 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.histogram('future_values', future_values)
tf.summary.histogram('baseline_values', baseline_values)
tf.summary.histogram('advantages', adv)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss', policy_loss)
tf.summary.scalar('critic_loss', critic_loss)
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', tf.reduce_mean(raw_loss))
tf.summary.scalar('eps_lambda', self.eps_lambda)
return (loss, raw_loss,
future_values[self.rollout - 1:, :],
gradient_ops, tf.summary.merge_all())
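# --- Illustrative sketch (not part of the original objective.py) ---
# A minimal numpy version of the d-step soft-consistency target that PCL.get
# assembles with discounted_future_sum / shift_values, for a single un-padded
# episode with eps_lambda = 0 (vanilla PCL). The helper name is hypothetical
# and exists only for this illustration.
import numpy as np

def toy_pcl_targets(rewards, log_probs, values, final_value,
                    gamma=0.9, tau=0.025, rollout=3):
  """Per-step targets R_t and consistency errors R_t - V(s_t)."""
  num_steps = len(rewards)
  def value_at(t):  # bootstrap past the episode end with final_value
    return values[t] if t < num_steps else final_value
  targets, errors = [], []
  for t in range(num_steps):
    d = min(rollout, num_steps - t)
    disc_rewards = sum(gamma ** i * rewards[t + i] for i in range(d))
    disc_log_probs = sum(gamma ** i * log_probs[t + i] for i in range(d))
    target = disc_rewards - tau * disc_log_probs + gamma ** d * value_at(t + d)
    targets.append(target)
    errors.append(target - values[t])
  return np.array(targets), np.array(errors)

# The policy gradient weights each window's log-probs by the (stop-gradient)
# consistency error, while the critic regresses toward the bootstrapped target.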
class TRPO(ActorCritic):
"""TRPO."""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
prev_log_probs = not_pad * prev_log_probs
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * tf.exp(log_probs - prev_log_probs)
critic_loss = -adv * baseline_values
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
raw_loss = policy_loss
# loss for gradient calculation
if self.policy_weight == 0:
policy_loss = 0.0
elif self.critic_weight == 0:
critic_loss = 0.0
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss', policy_loss)
tf.summary.scalar('critic_loss', critic_loss)
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
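# --- Illustrative sketch (not part of the original objective.py) ---
# TRPO.get above uses the standard surrogate: advantages weighted by the
# importance ratio pi_new(a|s) / pi_old(a|s) = exp(log_prob - prev_log_prob),
# optionally clipped to [-clip_adv, clip_adv]. A numpy equivalent for a batch
# of steps, up to the sum/mean aggregation used above (hypothetical helper):
import numpy as np

def toy_trpo_surrogate(advantages, log_probs, prev_log_probs, clip_adv=0.0):
  adv = np.asarray(advantages, dtype=np.float64)
  if clip_adv:
    adv = np.clip(adv, -clip_adv, clip_adv)
  ratio = np.exp(np.asarray(log_probs) - np.asarray(prev_log_probs))
  # Loss to minimize; the trust-region step then bounds the KL change.
  return -np.mean(adv * ratio)

# Example: toy_trpo_surrogate([1.0, -0.5], [-0.9, -1.2], [-1.0, -1.0],
#                             clip_adv=1.0)
# A positive advantage together with a ratio > 1 lowers the loss, so the
# optimizer pushes probability toward actions that beat the baseline.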
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizers mostly for value estimate.
Gradient Descent optimizer
LBFGS optimizer
Best Fit optimizer
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import scipy.optimize
def var_size(v):
return int(np.prod([int(d) for d in v.shape]))
def gradients(loss, var_list):
grads = tf.gradients(loss, var_list)
return [g if g is not None else tf.zeros(v.shape)
for g, v in zip(grads, var_list)]
def flatgrad(loss, var_list):
grads = gradients(loss, var_list)
return tf.concat([tf.reshape(grad, [-1])
for (v, grad) in zip(var_list, grads)
if grad is not None], 0)
def get_flat(var_list):
return tf.concat([tf.reshape(v, [-1]) for v in var_list], 0)
def set_from_flat(var_list, flat_theta):
assigns = []
shapes = [v.shape for v in var_list]
sizes = [var_size(v) for v in var_list]
start = 0
for (shape, size, v) in zip(shapes, sizes, var_list):
assigns.append(v.assign(
tf.reshape(flat_theta[start:start + size], shape)))
start += size
assert start == sum(sizes)
return tf.group(*assigns)
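# --- Usage sketch (not part of the original module) ---
# Round trip through get_flat / set_from_flat, assuming a TF 1.x graph and
# Session. A flat theta vector holds every variable in var_list back to back,
# which is the representation the optimizers below work in. The function and
# variable names here are hypothetical, for illustration only.
def _example_flat_roundtrip():
  w = tf.get_variable('sketch_w', [2, 3], initializer=tf.zeros_initializer())
  b = tf.get_variable('sketch_b', [3], initializer=tf.zeros_initializer())
  flat_theta_ph = tf.placeholder(tf.float32, [None])
  flatten = get_flat([w, b])
  assign_flat = set_from_flat([w, b], flat_theta_ph)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    new_theta = np.arange(9, dtype=np.float32)  # 2*3 + 3 parameters, flattened
    sess.run(assign_flat, feed_dict={flat_theta_ph: new_theta})
    assert np.allclose(sess.run(flatten), new_theta)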
class LbfgsOptimization(object):
def __init__(self, max_iter=25, mix_frac=1.0):
self.max_iter = max_iter
self.mix_frac = mix_frac
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
self.loss_flat_gradient = flatgrad(self.raw_loss, var_list)
self.flat_vars = get_flat(var_list)
self.set_vars = set_from_flat(var_list, self.flat_theta)
def optimize(self, sess, feed_dict):
old_theta = sess.run(self.flat_vars)
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
def calc_loss_and_grad(theta):
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
loss, grad = sess.run([self.raw_loss, self.loss_flat_gradient],
feed_dict=feed_dict)
grad = grad.astype('float64')
return loss, grad
theta, _, _ = scipy.optimize.fmin_l_bfgs_b(
calc_loss_and_grad, old_theta, maxiter=self.max_iter)
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
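# --- Usage sketch (not part of the original module) ---
# scipy.optimize.fmin_l_bfgs_b expects a callable returning (loss, grad) with
# a float64 gradient, which is the contract calc_loss_and_grad above follows.
# A self-contained example on a simple quadratic (hypothetical helper):
def _example_lbfgs_quadratic():
  def loss_and_grad(x):
    diff = x - 1.0
    return 0.5 * np.dot(diff, diff), diff.astype('float64')
  x_opt, _, _ = scipy.optimize.fmin_l_bfgs_b(
      loss_and_grad, np.zeros(3), maxiter=25)
  assert np.allclose(x_opt, np.ones(3), atol=1e-4)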
class GradOptimization(object):
def __init__(self, learning_rate=0.001, max_iter=25, mix_frac=1.0):
self.learning_rate = learning_rate
self.max_iter = max_iter
self.mix_frac = mix_frac
def get_optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.learning_rate,
epsilon=2e-4)
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
opt = self.get_optimizer()
params = var_list
grads = tf.gradients(self.raw_loss, params)
self.gradient_ops = opt.apply_gradients(zip(grads, params))
def optimize(self, sess, feed_dict):
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
for _ in xrange(self.max_iter):
sess.run(self.gradient_ops, feed_dict=feed_dict)
class BestFitOptimization(object):
def __init__(self, mix_frac=1.0):
self.mix_frac = mix_frac
def setup_placeholders(self):
self.new_regression_weight = tf.placeholder(
tf.float32, self.regression_weight.shape)
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.values = values
self.targets = targets
self.inputs = inputs
self.regression_weight = regression_weight
self.setup_placeholders()
self.update_regression_weight = tf.assign(
self.regression_weight, self.new_regression_weight)
def optimize(self, sess, feed_dict):
reg_input, reg_weight, old_values, targets = sess.run(
[self.inputs, self.regression_weight, self.values, self.targets],
feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
# taken from rllab
reg_coeff = 1e-5
for _ in range(5):
best_fit_weight = np.linalg.lstsq(
reg_input.T.dot(reg_input) +
reg_coeff * np.identity(reg_input.shape[1]),
reg_input.T.dot(intended_values))[0]
if not np.any(np.isnan(best_fit_weight)):
break
reg_coeff *= 10
if len(best_fit_weight.shape) == 1:
best_fit_weight = np.expand_dims(best_fit_weight, -1)
sess.run(self.update_regression_weight,
feed_dict={self.new_regression_weight: best_fit_weight})
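# --- Illustrative sketch (not part of the original module) ---
# The closed-form update above is ridge regression on the value features:
# solve (X^T X + reg_coeff * I) w = X^T y, escalating reg_coeff if the solve
# produces NaNs. A minimal numpy check on noise-free data (hypothetical names):
def _example_best_fit(seed=0, reg_coeff=1e-5):
  rng = np.random.RandomState(seed)
  features = rng.randn(100, 5)               # regression inputs
  true_weight = rng.randn(5)
  intended_values = features.dot(true_weight)
  weight = np.linalg.lstsq(
      features.T.dot(features) + reg_coeff * np.identity(5),
      features.T.dot(intended_values))[0]
  assert np.allclose(weight, true_weight, atol=1e-4)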
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Policy neural network.
Implements a network that takes in inputs and produces actions
and log probabilities given a sampling distribution parameterization.
"""
import tensorflow as tf
import numpy as np
class Policy(object):
def __init__(self, env_spec, internal_dim,
fixed_std=True, recurrent=True,
input_prev_actions=True):
self.env_spec = env_spec
self.internal_dim = internal_dim
self.rnn_state_dim = self.internal_dim
self.fixed_std = fixed_std
self.recurrent = recurrent
self.input_prev_actions = input_prev_actions
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
self.vector_init = tf.constant_initializer(0.0)
@property
def input_dim(self):
return (self.env_spec.total_obs_dim +
self.env_spec.total_sampled_act_dim * self.input_prev_actions)
@property
def output_dim(self):
return self.env_spec.total_sampling_act_dim
def get_cell(self):
"""Get RNN cell."""
self.cell_input_dim = self.internal_dim // 2
cell = tf.contrib.rnn.LSTMCell(self.cell_input_dim,
state_is_tuple=False,
reuse=tf.get_variable_scope().reuse)
cell = tf.contrib.rnn.OutputProjectionWrapper(
cell, self.output_dim,
reuse=tf.get_variable_scope().reuse)
return cell
def core(self, obs, prev_internal_state, prev_actions):
"""Core neural network taking in inputs and outputting sampling
distribution parameters."""
batch_size = tf.shape(obs[0])[0]
if not self.recurrent:
prev_internal_state = tf.zeros([batch_size, self.rnn_state_dim])
cell = self.get_cell()
b = tf.get_variable('input_bias', [self.cell_input_dim],
initializer=self.vector_init)
cell_input = tf.nn.bias_add(tf.zeros([batch_size, self.cell_input_dim]), b)
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
w = tf.get_variable('w_state%d' % i, [obs_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(obs_type):
cell_input += tf.matmul(tf.one_hot(obs[i], obs_dim), w)
elif self.env_spec.is_box(obs_type):
cell_input += tf.matmul(obs[i], w)
else:
assert False
if self.input_prev_actions:
if self.env_spec.combine_actions: # TODO(ofir): clean this up
prev_action = prev_actions[0]
for i, action_dim in enumerate(self.env_spec.orig_act_dims):
act = tf.mod(prev_action, action_dim)
w = tf.get_variable('w_prev_action%d' % i, [action_dim, self.cell_input_dim],
initializer=self.matrix_init)
cell_input += tf.matmul(tf.one_hot(act, action_dim), w)
prev_action = tf.to_int32(prev_action / action_dim)
else:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
w = tf.get_variable('w_prev_action%d' % i, [act_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(act_type):
cell_input += tf.matmul(tf.one_hot(prev_actions[i], act_dim), w)
elif self.env_spec.is_box(act_type):
cell_input += tf.matmul(prev_actions[i], w)
else:
assert False
output, next_state = cell(cell_input, prev_internal_state)
return output, next_state
def sample_action(self, logits, sampling_dim,
act_dim, act_type, greedy=False):
"""Sample an action from a distribution."""
if self.env_spec.is_discrete(act_type):
if greedy:
act = tf.argmax(logits, 1)
else:
act = tf.reshape(tf.multinomial(logits, 1), [-1])
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
if greedy:
act = means
else:
batch_size = tf.shape(logits)[0]
act = means + std * tf.random_normal([batch_size, act_dim])
else:
assert False
return act
def entropy(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate entropy of distribution."""
if self.env_spec.is_discrete(act_type):
entropy = tf.reduce_sum(
-tf.nn.softmax(logits) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
entropy = tf.reduce_sum(
0.5 * (1 + tf.log(2 * np.pi * tf.square(std))), -1)
else:
assert False
return entropy
def self_kl(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate KL of distribution with itself.
Used later only for its gradients (its value is identically zero).
"""
if self.env_spec.is_discrete(act_type):
probs = tf.nn.softmax(logits)
log_probs = tf.nn.log_softmax(logits)
self_kl = tf.reduce_sum(
tf.stop_gradient(probs) *
(tf.stop_gradient(log_probs) - log_probs), -1)
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
my_means = tf.stop_gradient(means)
my_std = tf.stop_gradient(std)
self_kl = tf.reduce_sum(
tf.log(std / my_std) +
(tf.square(my_std) + tf.square(my_means - means)) /
(2.0 * tf.square(std)) - 0.5,
-1)
else:
assert False
return self_kl
def log_prob_action(self, action, logits,
sampling_dim, act_dim, act_type):
"""Calculate log-prob of action sampled from distribution."""
if self.env_spec.is_discrete(act_type):
act_log_prob = tf.reduce_sum(
tf.one_hot(action, act_dim) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
means = logits[:, :sampling_dim // 2]
std = logits[:, sampling_dim // 2:]
act_log_prob = (- 0.5 * tf.log(2 * np.pi * tf.square(std))
- 0.5 * tf.square(action - means) / tf.square(std))
act_log_prob = tf.reduce_sum(act_log_prob, -1)
else:
assert False
return act_log_prob
def sample_actions(self, output, actions=None, greedy=False):
"""Sample all actions given output of core network."""
sampled_actions = []
logits = []
log_probs = []
entropy = []
self_kl = []
start_idx = 0
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
if self.fixed_std and self.env_spec.is_box(act_type):
act_logits = output[:, start_idx:start_idx + act_dim]
log_std = tf.get_variable('std%d' % i, [1, sampling_dim // 2])
# fix standard deviations to variable
act_logits = tf.concat(
[act_logits,
1e-6 + tf.exp(log_std) + 0 * act_logits], 1)
else:
act_logits = output[:, start_idx:start_idx + sampling_dim]
if actions is None:
act = self.sample_action(act_logits, sampling_dim,
act_dim, act_type,
greedy=greedy)
else:
act = actions[i]
ent = self.entropy(act_logits, sampling_dim, act_dim, act_type)
kl = self.self_kl(act_logits, sampling_dim, act_dim, act_type)
act_log_prob = self.log_prob_action(
act, act_logits,
sampling_dim, act_dim, act_type)
sampled_actions.append(act)
logits.append(act_logits)
log_probs.append(act_log_prob)
entropy.append(ent)
self_kl.append(kl)
start_idx += sampling_dim
assert start_idx == self.env_spec.total_sampling_act_dim
return sampled_actions, logits, log_probs, entropy, self_kl
def get_kl(self, my_logits, other_logits):
"""Calculate KL between one policy output and another."""
kl = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
single_my_logits = my_logits[i]
single_other_logits = other_logits[i]
if self.env_spec.is_discrete(act_type):
my_probs = tf.nn.softmax(single_my_logits)
my_log_probs = tf.nn.log_softmax(single_my_logits)
other_log_probs = tf.nn.log_softmax(single_other_logits)
my_kl = tf.reduce_sum(my_probs * (my_log_probs - other_log_probs), -1)
elif self.env_spec.is_box(act_type):
my_means = single_my_logits[:, :sampling_dim // 2]
my_std = single_my_logits[:, sampling_dim // 2:]
other_means = single_other_logits[:, :sampling_dim // 2]
other_std = single_other_logits[:, sampling_dim // 2:]
my_kl = tf.reduce_sum(
tf.log(other_std / my_std) +
(tf.square(my_std) + tf.square(my_means - other_means)) /
(2.0 * tf.square(other_std)) - 0.5,
-1)
else:
assert False
kl.append(my_kl)
return kl
def single_step(self, prev, cur, greedy=False):
"""Single RNN step. Equivalently, single-time-step sampled actions."""
prev_internal_state, prev_actions, _, _, _, _ = prev
obs, actions = cur # state observed and action taken at this time step
# feed into RNN cell
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(
(prev_internal_state, prev_actions, None, None, None, None),
(obs, None), greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
initial_actions = [act[0] for act in all_actions]
all_actions = [tf.concat([act[1:], act[0:1]], 0)
for act in all_actions] # "final" action is dummy
(internal_states, _, logits, log_probs,
entropies, self_kls) = tf.scan(
self.single_step,
(all_obs, all_actions),
initializer=self.get_initializer(
batch_size, initial_state, initial_actions))
# remove "final" computations
log_probs = [log_prob[:-1] for log_prob in log_probs]
entropies = [entropy[:-1] for entropy in entropies]
self_kls = [self_kl[:-1] for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls
def get_initializer(self, batch_size, initial_state, initial_actions):
"""Get initializer for RNN."""
logits_init = []
log_probs_init = []
for act_dim, act_type in self.env_spec.act_dims_and_types:
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
logits_init.append(tf.zeros([batch_size, sampling_dim]))
log_probs_init.append(tf.zeros([batch_size]))
entropy_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
self_kl_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
return (initial_state,
tuple(initial_actions),
tuple(logits_init), tuple(log_probs_init),
tuple(entropy_init),
tuple(self_kl_init))
def calculate_kl(self, my_logits, other_logits):
"""Calculate KL between one policy and another on batch of episodes."""
batch_size = tf.shape(my_logits[0])[1]
time_length = tf.shape(my_logits[0])[0]
reshaped_my_logits = [
tf.reshape(my_logit, [batch_size * time_length, -1])
for my_logit in my_logits]
reshaped_other_logits = [
tf.reshape(other_logit, [batch_size * time_length, -1])
for other_logit in other_logits]
kl = self.get_kl(reshaped_my_logits, reshaped_other_logits)
kl = [tf.reshape(kkl, [time_length, batch_size])
for kkl in kl]
return kl
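# --- Illustrative sketch (not part of the original policy.py) ---
# Numeric sanity checks of the Gaussian formulas used by entropy(),
# log_prob_action() and get_kl() above, against scipy.stats.norm and a
# brute-force integral. The helper name is hypothetical, for illustration only.
def _check_gaussian_formulas(mean=0.3, std=1.7, action=-0.2,
                             other_mean=-0.1, other_std=0.8):
  import scipy.stats
  entropy = 0.5 * (1 + np.log(2 * np.pi * std ** 2))
  log_prob = (-0.5 * np.log(2 * np.pi * std ** 2)
              - 0.5 * (action - mean) ** 2 / std ** 2)
  kl = (np.log(other_std / std) +
        (std ** 2 + (mean - other_mean) ** 2) / (2.0 * other_std ** 2) - 0.5)
  dist = scipy.stats.norm(mean, std)
  other = scipy.stats.norm(other_mean, other_std)
  assert np.isclose(entropy, dist.entropy())
  assert np.isclose(log_prob, dist.logpdf(action))
  # KL(p || q) by a fine-grained Riemann sum as an independent reference.
  xs = np.linspace(-15.0, 15.0, 30001)
  dx = xs[1] - xs[0]
  kl_numeric = np.sum(dist.pdf(xs) * (dist.logpdf(xs) - other.logpdf(xs))) * dx
  assert np.isclose(kl, kl_numeric, atol=1e-3)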
class MLPPolicy(Policy):
"""Non-recurrent policy."""
def get_cell(self):
self.cell_input_dim = self.internal_dim
def mlp(cell_input, prev_internal_state):
w1 = tf.get_variable('w1', [self.cell_input_dim, self.internal_dim])
b1 = tf.get_variable('b1', [self.internal_dim])
w2 = tf.get_variable('w2', [self.internal_dim, self.internal_dim])
b2 = tf.get_variable('b2', [self.internal_dim])
proj = tf.get_variable(
'proj', [self.internal_dim, self.output_dim])
hidden = cell_input
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w1), b1))
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w2), b2))
output = tf.matmul(hidden, proj)
return output, hidden
return mlp
def single_step(self, obs, actions, prev_actions, greedy=False):
"""Single step."""
batch_size = tf.shape(obs[0])[0]
prev_internal_state = tf.zeros([batch_size, self.internal_dim])
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(obs, None, prev_actions,
greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
# first reshape inputs as a single batch
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_act = []
reshaped_prev_act = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
act = tf.concat([all_actions[i][1:], all_actions[i][0:1]], 0)
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_act.append(tf.reshape(act, [time_length * batch_size]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_act.append(
tf.reshape(act, [time_length * batch_size, act_dim]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
# now inputs go into single step as one large batch
(internal_states, _, logits, log_probs,
entropies, self_kls) = self.single_step(
reshaped_obs, reshaped_act, reshaped_prev_act)
# reshape the outputs back to original time-major format
internal_states = tf.reshape(internal_states, [time_length, batch_size, -1])
logits = [tf.reshape(logit, [time_length, batch_size, -1])
for logit in logits]
log_probs = [tf.reshape(log_prob, [time_length, batch_size])[:-1]
for log_prob in log_probs]
entropies = [tf.reshape(ent, [time_length, batch_size])[:-1]
for ent in entropies]
self_kls = [tf.reshape(self_kl, [time_length, batch_size])[:-1]
for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls
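# --- Illustrative sketch (not part of the original policy.py) ---
# MLPPolicy.multi_step flattens time-major [time, batch, ...] inputs into one
# large [time * batch, ...] batch, applies a single feed-forward step, and
# reshapes the outputs back. A numpy picture of that round trip (hypothetical
# helper, with an elementwise stand-in for the network):
def _example_time_major_roundtrip(time_length=4, batch_size=3, obs_dim=2):
  obs = np.arange(time_length * batch_size * obs_dim,
                  dtype=np.float32).reshape([time_length, batch_size, obs_dim])
  flat = obs.reshape([time_length * batch_size, obs_dim])  # one big batch
  processed = flat * 2.0                                   # stand-in for the MLP
  back = processed.reshape([time_length, batch_size, obs_dim])
  assert np.allclose(back, obs * 2.0)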
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Replay buffer.
Implements replay buffer in Python.
"""
import random
import numpy as np
class ReplayBuffer(object):
def __init__(self, max_size):
self.max_size = max_size
self.cur_size = 0
self.buffer = {}
self.init_length = 0
def __len__(self):
return self.cur_size
def seed_buffer(self, episodes):
self.init_length = len(episodes)
self.add(episodes, np.ones(self.init_length))
def add(self, episodes, *args):
"""Add episodes to buffer."""
idx = 0
while self.cur_size < self.max_size and idx < len(episodes):
self.buffer[self.cur_size] = episodes[idx]
self.cur_size += 1
idx += 1
if idx < len(episodes):
remove_idxs = self.remove_n(len(episodes) - idx)
for remove_idx in remove_idxs:
self.buffer[remove_idx] = episodes[idx]
idx += 1
assert len(self.buffer) == self.cur_size
def remove_n(self, n):
"""Get n items for removal."""
# random removal
idxs = random.sample(xrange(self.init_length, self.cur_size), n)
return idxs
def get_batch(self, n):
"""Get batch of episodes to train on."""
# random batch
idxs = random.sample(xrange(self.cur_size), n)
return [self.buffer[idx] for idx in idxs], None
def update_last_batch(self, delta):
pass
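# --- Usage sketch (not part of the original replay_buffer.py) ---
# Minimal usage of the uniform ReplayBuffer above, under the Python 2
# environment this module targets (remove_n and get_batch use xrange).
def _example_replay_buffer():
  buf = ReplayBuffer(max_size=3)
  buf.add(['ep0', 'ep1', 'ep2', 'ep3'])  # the fourth episode evicts a random one
  assert len(buf) == 3
  episodes, probs = buf.get_batch(2)     # uniform sampling; probs is None here
  assert len(episodes) == 2 and probs is None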
class PrioritizedReplayBuffer(ReplayBuffer):
def __init__(self, max_size, alpha=0.2,
eviction_strategy='rand'):
self.max_size = max_size
self.alpha = alpha
self.eviction_strategy = eviction_strategy
assert self.eviction_strategy in ['rand', 'fifo', 'rank']
self.remove_idx = 0
self.cur_size = 0
self.buffer = {}
self.priorities = np.zeros(self.max_size)
self.init_length = 0
def __len__(self):
return self.cur_size
def add(self, episodes, priorities, new_idxs=None):
"""Add episodes to buffer."""
if new_idxs is None:
idx = 0
new_idxs = []
while self.cur_size < self.max_size and idx < len(episodes):
self.buffer[self.cur_size] = episodes[idx]
new_idxs.append(self.cur_size)
self.cur_size += 1
idx += 1
if idx < len(episodes):
remove_idxs = self.remove_n(len(episodes) - idx)
for remove_idx in remove_idxs:
self.buffer[remove_idx] = episodes[idx]
new_idxs.append(remove_idx)
idx += 1
else:
assert len(new_idxs) == len(episodes)
for new_idx, ep in zip(new_idxs, episodes):
self.buffer[new_idx] = ep
self.priorities[new_idxs] = priorities
self.priorities[0:self.init_length] = np.max(
self.priorities[self.init_length:])
assert len(self.buffer) == self.cur_size
return new_idxs
def remove_n(self, n):
"""Get n items for removal."""
assert self.init_length + n <= self.cur_size
if self.eviction_strategy == 'rand':
# random removal
idxs = random.sample(xrange(self.init_length, self.cur_size), n)
elif self.eviction_strategy == 'fifo':
# overwrite elements in cyclical fashion
idxs = [
self.init_length +
(self.remove_idx + i) % (self.max_size - self.init_length)
for i in xrange(n)]
self.remove_idx = idxs[-1] + 1 - self.init_length
elif self.eviction_strategy == 'rank':
# remove lowest-priority indices
idxs = np.argpartition(self.priorities, n)[:n]
return idxs
def sampling_distribution(self):
p = self.priorities[:self.cur_size]
p = np.exp(self.alpha * (p - np.max(p)))
norm = np.sum(p)
if norm > 0:
uniform = 0.0
p = p / norm * (1 - uniform) + 1.0 / self.cur_size * uniform
else:
p = np.ones(self.cur_size) / self.cur_size
return p
def get_batch(self, n):
"""Get batch of episodes to train on."""
p = self.sampling_distribution()
idxs = np.random.choice(self.cur_size, size=n, replace=False, p=p)
self.last_batch = idxs
return [self.buffer[idx] for idx in idxs], p[idxs]
def update_last_batch(self, delta):
"""Update last batch idxs with new priority."""
self.priorities[self.last_batch] = np.abs(delta)
self.priorities[0:self.init_length] = np.max(
self.priorities[self.init_length:])
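# --- Illustrative sketch (not part of the original replay_buffer.py) ---
# sampling_distribution() above is a softmax over priorities scaled by alpha:
# alpha = 0 recovers uniform sampling, while larger alpha concentrates
# probability on the highest-priority episodes. A numpy picture of that
# (hypothetical helper):
def _example_priority_softmax(priorities=(1.0, 2.0, 4.0)):
  p = np.asarray(priorities, dtype=np.float64)
  for alpha in (0.0, 0.2, 2.0):
    probs = np.exp(alpha * (p - np.max(p)))
    probs /= np.sum(probs)
    # alpha=0 -> [0.333 0.333 0.333]; alpha=2 -> roughly [0.002 0.018 0.980]
    print('alpha=%g -> %s' % (alpha, np.round(probs, 3)))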
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Trainer for coordinating single or multi-replica training.
Main point of entry for running models. Specifies most of
the parameters used by different algorithms.
"""
import tensorflow as tf
import numpy as np
import random
import os
import pickle
import controller
import model
import policy
import baseline
import objective
import full_episode_objective
import trust_region
import optimizers
import replay_buffer
import expert_paths
import gym_wrapper
import env_spec
app = tf.app
flags = tf.flags
logging = tf.logging
gfile = tf.gfile
FLAGS = flags.FLAGS
flags.DEFINE_string('env', 'Copy-v0', 'environment name')
flags.DEFINE_integer('batch_size', 100, 'batch size')
flags.DEFINE_integer('replay_batch_size', None, 'replay batch size; defaults to batch_size')
flags.DEFINE_integer('num_samples', 1,
'number of samples from each random seed initialization')
flags.DEFINE_integer('max_step', 200, 'max number of steps to train on')
flags.DEFINE_integer('cutoff_agent', 0,
'number of steps at which to cut-off agent. '
'Defaults to always cutoff')
flags.DEFINE_integer('num_steps', 100000, 'number of training steps')
flags.DEFINE_integer('validation_frequency', 100,
'every so many steps, output some stats')
flags.DEFINE_float('target_network_lag', 0.95,
'exponential moving average coefficient applied to the online '
'network to produce the target network')
flags.DEFINE_string('sample_from', 'online',
'Sample actions from "online" network or "target" network')
flags.DEFINE_string('objective', 'pcl',
'pcl/upcl/a3c/trpo/reinforce/urex')
flags.DEFINE_bool('trust_region_p', False,
'use trust region for policy optimization')
flags.DEFINE_string('value_opt', None,
'leave as None to optimize it along with policy '
'(using critic_weight). Otherwise set to '
'"best_fit" (least squares regression), "lbfgs", or "grad"')
flags.DEFINE_float('max_divergence', 0.01,
'max divergence (i.e. KL) to allow during '
'trust region optimization')
flags.DEFINE_float('learning_rate', 0.01, 'learning rate')
flags.DEFINE_float('clip_norm', 5.0, 'clip norm')
flags.DEFINE_float('clip_adv', 0.0, 'Clip advantages at this value. '
'Leave as 0 to not clip at all.')
flags.DEFINE_float('critic_weight', 0.1, 'critic weight')
flags.DEFINE_float('tau', 0.1, 'entropy regularizer. '
'If using decaying tau, this is the final value.')
flags.DEFINE_float('tau_decay', None,
'decay tau by this much every 100 steps')
flags.DEFINE_float('tau_start', 0.1,
'start tau at this value')
flags.DEFINE_float('eps_lambda', 0.0, 'relative entropy regularizer.')
flags.DEFINE_bool('update_eps_lambda', False,
'Update lambda automatically based on last 100 episodes.')
flags.DEFINE_float('gamma', 1.0, 'discount')
flags.DEFINE_integer('rollout', 10, 'rollout')
flags.DEFINE_bool('fixed_std', True,
'fix the std in Gaussian distributions')
flags.DEFINE_bool('input_prev_actions', True,
'input previous actions to policy network')
flags.DEFINE_bool('recurrent', True,
'use recurrent connections')
flags.DEFINE_bool('input_time_step', False,
'input time step into value calculations')
flags.DEFINE_bool('use_online_batch', True, 'train on batches as they are sampled')
flags.DEFINE_bool('batch_by_steps', False,
'ensure each training batch has batch_size * max_step steps')
flags.DEFINE_bool('unify_episodes', False,
'Make sure replay buffer holds entire episodes, '
'even across distinct sampling steps')
flags.DEFINE_integer('replay_buffer_size', 5000, 'replay buffer size')
flags.DEFINE_float('replay_buffer_alpha', 0.5, 'replay buffer alpha param')
flags.DEFINE_integer('replay_buffer_freq', 0,
'replay buffer frequency (only supports -1/0/1)')
flags.DEFINE_string('eviction', 'rand',
'how to evict from replay buffer: rand/rank/fifo')
flags.DEFINE_string('prioritize_by', 'rewards',
'Prioritize replay buffer by "rewards" or "step"')
flags.DEFINE_integer('num_expert_paths', 0,
'number of expert paths to seed replay buffer with')
flags.DEFINE_integer('internal_dim', 256, 'RNN internal dim')
flags.DEFINE_integer('value_hidden_layers', 0,
'number of hidden layers in value estimate')
flags.DEFINE_integer('tf_seed', 42, 'random seed for tensorflow')
flags.DEFINE_string('save_trajectories_dir', None,
'directory to save trajectories to, if desired')
flags.DEFINE_string('load_trajectories_file', None,
'file to load expert trajectories from')
# supervisor flags
flags.DEFINE_bool('supervisor', False, 'use supervisor training')
flags.DEFINE_integer('task_id', 0, 'task id')
flags.DEFINE_integer('ps_tasks', 0, 'number of ps tasks')
flags.DEFINE_integer('num_replicas', 1, 'number of replicas used')
flags.DEFINE_string('master', 'local', 'name of master')
flags.DEFINE_string('save_dir', '', 'directory to save model to')
flags.DEFINE_string('load_path', '', 'path of saved model to load (if none in save_dir)')
class Trainer(object):
"""Coordinates single or multi-replica training."""
def __init__(self):
self.batch_size = FLAGS.batch_size
self.replay_batch_size = FLAGS.replay_batch_size
if self.replay_batch_size is None:
self.replay_batch_size = self.batch_size
self.num_samples = FLAGS.num_samples
self.env_str = FLAGS.env
self.env = gym_wrapper.GymWrapper(self.env_str,
distinct=FLAGS.batch_size // self.num_samples,
count=self.num_samples)
self.env_spec = env_spec.EnvSpec(self.env.get_one())
self.max_step = FLAGS.max_step
self.cutoff_agent = FLAGS.cutoff_agent
self.num_steps = FLAGS.num_steps
self.validation_frequency = FLAGS.validation_frequency
self.target_network_lag = FLAGS.target_network_lag
self.sample_from = FLAGS.sample_from
assert self.sample_from in ['online', 'target']
self.critic_weight = FLAGS.critic_weight
self.objective = FLAGS.objective
self.trust_region_p = FLAGS.trust_region_p
self.value_opt = FLAGS.value_opt
assert not self.trust_region_p or self.objective in ['pcl', 'trpo']
assert self.objective != 'trpo' or self.trust_region_p
assert self.value_opt is None or self.critic_weight == 0.0
self.max_divergence = FLAGS.max_divergence
self.learning_rate = FLAGS.learning_rate
self.clip_norm = FLAGS.clip_norm
self.clip_adv = FLAGS.clip_adv
self.tau = FLAGS.tau
self.tau_decay = FLAGS.tau_decay
self.tau_start = FLAGS.tau_start
self.eps_lambda = FLAGS.eps_lambda
self.update_eps_lambda = FLAGS.update_eps_lambda
self.gamma = FLAGS.gamma
self.rollout = FLAGS.rollout
self.fixed_std = FLAGS.fixed_std
self.input_prev_actions = FLAGS.input_prev_actions
self.recurrent = FLAGS.recurrent
assert not self.trust_region_p or not self.recurrent
self.input_time_step = FLAGS.input_time_step
assert not self.input_time_step or (self.cutoff_agent <= self.max_step)
self.use_online_batch = FLAGS.use_online_batch
self.batch_by_steps = FLAGS.batch_by_steps
self.unify_episodes = FLAGS.unify_episodes
if self.unify_episodes:
assert self.batch_size == 1
self.replay_buffer_size = FLAGS.replay_buffer_size
self.replay_buffer_alpha = FLAGS.replay_buffer_alpha
self.replay_buffer_freq = FLAGS.replay_buffer_freq
assert self.replay_buffer_freq in [-1, 0, 1]
self.eviction = FLAGS.eviction
self.prioritize_by = FLAGS.prioritize_by
assert self.prioritize_by in ['rewards', 'step']
self.num_expert_paths = FLAGS.num_expert_paths
self.internal_dim = FLAGS.internal_dim
self.value_hidden_layers = FLAGS.value_hidden_layers
self.tf_seed = FLAGS.tf_seed
self.save_trajectories_dir = (
FLAGS.save_trajectories_dir or FLAGS.save_dir)
self.save_trajectories_file = (
os.path.join(
self.save_trajectories_dir, self.env_str.replace('-', '_'))
if self.save_trajectories_dir else None)
self.load_trajectories_file = FLAGS.load_trajectories_file
self.hparams = dict((attr, getattr(self, attr))
for attr in dir(self)
if not attr.startswith('__') and
not callable(getattr(self, attr)))
def hparams_string(self):
return '\n'.join('%s: %s' % item for item in sorted(self.hparams.items()))
def get_objective(self):
tau = self.tau
if self.tau_decay is not None:
assert self.tau_start >= self.tau
tau = tf.maximum(
tf.train.exponential_decay(
self.tau_start, self.global_step, 100, self.tau_decay),
self.tau)
if self.objective in ['pcl', 'a3c', 'trpo', 'upcl']:
cls = (objective.PCL if self.objective in ['pcl', 'upcl'] else
objective.TRPO if self.objective == 'trpo' else
objective.ActorCritic)
policy_weight = 1.0
return cls(self.learning_rate,
clip_norm=self.clip_norm,
policy_weight=policy_weight,
critic_weight=self.critic_weight,
tau=tau, gamma=self.gamma, rollout=self.rollout,
eps_lambda=self.eps_lambda, clip_adv=self.clip_adv)
elif self.objective in ['reinforce', 'urex']:
cls = (full_episode_objective.Reinforce
if self.objective == 'reinforce' else
full_episode_objective.UREX)
return cls(self.learning_rate,
clip_norm=self.clip_norm,
num_samples=self.num_samples,
tau=tau, bonus_weight=1.0) # TODO: bonus weight?
else:
assert False, 'Unknown objective %s' % self.objective
def get_policy(self):
if self.recurrent:
cls = policy.Policy
else:
cls = policy.MLPPolicy
return cls(self.env_spec, self.internal_dim,
fixed_std=self.fixed_std,
recurrent=self.recurrent,
input_prev_actions=self.input_prev_actions)
def get_baseline(self):
cls = (baseline.UnifiedBaseline if self.objective == 'upcl' else
baseline.Baseline)
return cls(self.env_spec, self.internal_dim,
input_prev_actions=self.input_prev_actions,
input_time_step=self.input_time_step,
input_policy_state=self.recurrent, # may want to change this
n_hidden_layers=self.value_hidden_layers,
hidden_dim=self.internal_dim,
tau=self.tau)
def get_trust_region_p_opt(self):
if self.trust_region_p:
return trust_region.TrustRegionOptimization(
max_divergence=self.max_divergence)
else:
return None
def get_value_opt(self):
if self.value_opt == 'grad':
return optimizers.GradOptimization(
learning_rate=self.learning_rate, max_iter=5, mix_frac=0.05)
elif self.value_opt == 'lbfgs':
return optimizers.LbfgsOptimization(max_iter=25, mix_frac=0.1)
elif self.value_opt == 'best_fit':
return optimizers.BestFitOptimization(mix_frac=1.0)
else:
return None
def get_model(self):
cls = model.Model
return cls(self.env_spec, self.global_step,
target_network_lag=self.target_network_lag,
sample_from=self.sample_from,
get_policy=self.get_policy,
get_baseline=self.get_baseline,
get_objective=self.get_objective,
get_trust_region_p_opt=self.get_trust_region_p_opt,
get_value_opt=self.get_value_opt)
def get_replay_buffer(self):
if self.replay_buffer_freq <= 0:
return None
else:
assert self.objective in ['pcl', 'upcl'], 'Can\'t use replay buffer with %s' % (
self.objective)
cls = replay_buffer.PrioritizedReplayBuffer
return cls(self.replay_buffer_size,
alpha=self.replay_buffer_alpha,
eviction_strategy=self.eviction)
def get_buffer_seeds(self):
return expert_paths.sample_expert_paths(
self.num_expert_paths, self.env_str, self.env_spec,
load_trajectories_file=self.load_trajectories_file)
def get_controller(self):
"""Get controller."""
cls = controller.Controller
return cls(self.env, self.env_spec, self.internal_dim,
use_online_batch=self.use_online_batch,
batch_by_steps=self.batch_by_steps,
unify_episodes=self.unify_episodes,
replay_batch_size=self.replay_batch_size,
max_step=self.max_step,
cutoff_agent=self.cutoff_agent,
save_trajectories_file=self.save_trajectories_file,
use_trust_region=self.trust_region_p,
use_value_opt=self.value_opt is not None,
update_eps_lambda=self.update_eps_lambda,
prioritize_by=self.prioritize_by,
get_model=self.get_model,
get_replay_buffer=self.get_replay_buffer,
get_buffer_seeds=self.get_buffer_seeds)
def do_before_step(self, step):
pass
def run(self):
"""Run training."""
is_chief = FLAGS.task_id == 0 or not FLAGS.supervisor
sv = None
def init_fn(sess, saver):
ckpt = None
if FLAGS.save_dir and sv is None:
load_dir = FLAGS.save_dir
ckpt = tf.train.get_checkpoint_state(load_dir)
if ckpt and ckpt.model_checkpoint_path:
logging.info('restoring from %s', ckpt.model_checkpoint_path)
saver.restore(sess, ckpt.model_checkpoint_path)
elif FLAGS.load_path:
logging.info('restoring from %s', FLAGS.load_path)
saver.restore(sess, FLAGS.load_path)
if FLAGS.supervisor:
with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks, merge_devices=True)):
self.global_step = tf.contrib.framework.get_or_create_global_step()
tf.set_random_seed(FLAGS.tf_seed)
self.controller = self.get_controller()
self.model = self.controller.model
self.controller.setup()
saver = tf.train.Saver(max_to_keep=10)
step = self.model.global_step
sv = tf.train.Supervisor(logdir=FLAGS.save_dir,
is_chief=is_chief,
saver=saver,
save_model_secs=600,
summary_op=None, # we define it ourselves
save_summaries_secs=60,
global_step=step,
init_fn=lambda sess: init_fn(sess, saver))
sess = sv.prepare_or_wait_for_session(FLAGS.master)
else:
tf.set_random_seed(FLAGS.tf_seed)
self.global_step = tf.contrib.framework.get_or_create_global_step()
self.controller = self.get_controller()
self.model = self.controller.model
self.controller.setup()
saver = tf.train.Saver(max_to_keep=10)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
init_fn(sess, saver)
self.sv = sv
self.sess = sess
logging.info('hparams:\n%s', self.hparams_string())
model_step = sess.run(self.model.global_step)
if model_step >= self.num_steps:
logging.info('training has reached final step')
return
losses = []
rewards = []
all_ep_rewards = []
for step in xrange(1 + self.num_steps):
if sv is not None and sv.should_stop():
logging.info('stopping supervisor')
break
self.do_before_step(step)
(loss, summary,
total_rewards, episode_rewards) = self.controller.train(sess)
losses.append(loss)
rewards.append(total_rewards)
all_ep_rewards.extend(episode_rewards)
if is_chief and sv is not None and sv._summary_writer:
sv.summary_computed(sess, summary)
model_step = sess.run(self.model.global_step)
if is_chief and step % self.validation_frequency == 0:
logging.info('at training step %d, model step %d: '
'avg loss %f, avg reward %f, '
'episode rewards: %f',
step, model_step,
np.mean(losses), np.mean(rewards),
np.mean(all_ep_rewards))
losses = []
rewards = []
all_ep_rewards = []
if model_step >= self.num_steps:
logging.info('training has reached final step')
break
if is_chief and sv is not None:
logging.info('saving final model to %s', sv.save_path)
sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
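# --- Illustrative sketch (not part of the original trainer.py) ---
# The tau schedule built in Trainer.get_objective is
#   max(tau_start * tau_decay ** (global_step / 100.0), tau),
# i.e. tf.train.exponential_decay without staircasing, floored at the final
# tau. A numpy picture of that schedule (hypothetical helper):
def _example_tau_schedule(tau_start=0.1, tau_decay=0.1, tau=0.001):
  steps = np.arange(0, 401, 100)
  taus = np.maximum(tau_start * tau_decay ** (steps / 100.0), tau)
  # -> [0.1, 0.01, 0.001, 0.001, 0.001]
  return taus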
def main(unused_argv):
logging.set_verbosity(logging.INFO)
trainer = Trainer()
trainer.run()
if __name__ == '__main__':
app.run()
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Trust region optimization.
Much of this is adapted from others' code.
See Schulman's Modular RL, wojzaremba's TRPO, etc.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
def var_size(v):
return int(np.prod([int(d) for d in v.shape]))
def gradients(loss, var_list):
grads = tf.gradients(loss, var_list)
return [g if g is not None else tf.zeros(v.shape)
for g, v in zip(grads, var_list)]
def flatgrad(loss, var_list):
grads = gradients(loss, var_list)
return tf.concat([tf.reshape(grad, [-1])
for (v, grad) in zip(var_list, grads)
if grad is not None], 0)
def get_flat(var_list):
return tf.concat([tf.reshape(v, [-1]) for v in var_list], 0)
def set_from_flat(var_list, flat_theta):
assigns = []
shapes = [v.shape for v in var_list]
sizes = [var_size(v) for v in var_list]
start = 0
assigns = []
for (shape, size, v) in zip(shapes, sizes, var_list):
assigns.append(v.assign(
tf.reshape(flat_theta[start:start + size], shape)))
start += size
assert start == sum(sizes)
return tf.group(*assigns)
class TrustRegionOptimization(object):
def __init__(self, max_divergence=0.1, cg_damping=0.1):
self.max_divergence = max_divergence
self.cg_damping = cg_damping
def setup_placeholders(self):
self.flat_tangent = tf.placeholder(tf.float32, [None], 'flat_tangent')
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
def setup(self, var_list, raw_loss, self_divergence,
divergence=None):
self.setup_placeholders()
self.raw_loss = raw_loss
self.divergence = divergence
self.loss_flat_gradient = flatgrad(raw_loss, var_list)
self.divergence_gradient = gradients(self_divergence, var_list)
shapes = [var.shape for var in var_list]
sizes = [var_size(var) for var in var_list]
start = 0
tangents = []
for shape, size in zip(shapes, sizes):
param = tf.reshape(self.flat_tangent[start:start + size], shape)
tangents.append(param)
start += size
assert start == sum(sizes)
self.grad_vector_product = sum(
tf.reduce_sum(g * t) for (g, t) in zip(self.divergence_gradient, tangents))
self.fisher_vector_product = flatgrad(self.grad_vector_product, var_list)
self.flat_vars = get_flat(var_list)
self.set_vars = set_from_flat(var_list, self.flat_theta)
def optimize(self, sess, feed_dict):
old_theta = sess.run(self.flat_vars)
loss_flat_grad = sess.run(self.loss_flat_gradient,
feed_dict=feed_dict)
def calc_fisher_vector_product(tangent):
feed_dict[self.flat_tangent] = tangent
fvp = sess.run(self.fisher_vector_product,
feed_dict=feed_dict)
fvp += self.cg_damping * tangent
return fvp
step_dir = conjugate_gradient(calc_fisher_vector_product, -loss_flat_grad)
shs = 0.5 * step_dir.dot(calc_fisher_vector_product(step_dir))
lm = np.sqrt(shs / self.max_divergence)
fullstep = step_dir / lm
neggdotstepdir = -loss_flat_grad.dot(step_dir)
def calc_loss(theta):
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
if self.divergence is None:
return sess.run(self.raw_loss, feed_dict=feed_dict), True
else:
raw_loss, divergence = sess.run(
[self.raw_loss, self.divergence], feed_dict=feed_dict)
return raw_loss, divergence < self.max_divergence
# find optimal theta
theta = linesearch(calc_loss, old_theta, fullstep, neggdotstepdir / lm)
if self.divergence is not None:
final_divergence = sess.run(self.divergence, feed_dict=feed_dict)
else:
final_divergence = None
# set vars accordingly
if final_divergence is None or final_divergence < self.max_divergence:
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
else:
sess.run(self.set_vars, feed_dict={self.flat_theta: old_theta})
def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
p = b.copy()
r = b.copy()
x = np.zeros_like(b)
rdotr = r.dot(r)
for i in xrange(cg_iters):
z = f_Ax(p)
v = rdotr / p.dot(z)
x += v * p
r -= v * z
newrdotr = r.dot(r)
mu = newrdotr / rdotr
p = r + mu * p
rdotr = newrdotr
if rdotr < residual_tol:
break
return x
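# --- Illustrative sketch (not part of the original trust_region.py) ---
# Sanity check of conjugate_gradient above on a small symmetric positive
# definite system against a direct solve (run under the Python 2 environment
# this module targets, since conjugate_gradient uses xrange). Hypothetical
# helper, for illustration only.
def _example_conjugate_gradient(seed=0):
  rng = np.random.RandomState(seed)
  m = rng.randn(6, 6)
  a = m.dot(m.T) + 6.0 * np.identity(6)   # SPD matrix
  b = rng.randn(6)
  x = conjugate_gradient(lambda v: a.dot(v), b, cg_iters=20)
  assert np.allclose(x, np.linalg.solve(a, b), atol=1e-4)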
def linesearch(f, x, fullstep, expected_improve_rate):
accept_ratio = 0.1
max_backtracks = 10
fval, _ = f(x)
for (_n_backtracks, stepfrac) in enumerate(.5 ** np.arange(max_backtracks)):
xnew = x + stepfrac * fullstep
newfval, valid = f(xnew)
if not valid:
continue
actual_improve = fval - newfval
expected_improve = expected_improve_rate * stepfrac
ratio = actual_improve / expected_improve
if ratio > accept_ratio and actual_improve > 0:
return xnew
return x