Commit dff0f0c1 authored by Alexander Gorban

Merge branch 'master' of github.com:tensorflow/models

parents da341f70 36203f09
......@@ -211,9 +211,15 @@ def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
# Create ops required to initialize the model from a given checkpoint.
init_fn = None
if train_config.fine_tune_checkpoint:
init_fn = detection_model.restore_fn(
train_config.fine_tune_checkpoint,
var_map = detection_model.restore_map(
from_detection_checkpoint=train_config.from_detection_checkpoint)
available_var_map = (variables_helper.
get_variables_available_in_checkpoint(
var_map, train_config.fine_tune_checkpoint))
init_saver = tf.train.Saver(available_var_map)
def initializer_fn(sess):
init_saver.restore(sess, train_config.fine_tune_checkpoint)
init_fn = initializer_fn
with tf.device(deploy_config.optimizer_device()):
total_loss, grads_and_vars = model_deploy.optimize_clones(
......
......@@ -139,21 +139,18 @@ class FakeDetectionModel(model.DetectionModel):
}
return loss_dict
def restore_fn(self, checkpoint_path, from_detection_checkpoint=True):
"""Return callable for loading a checkpoint into the tensorflow graph.
def restore_map(self, from_detection_checkpoint=True):
"""Returns a map of variables to load from a foreign checkpoint.
Args:
checkpoint_path: path to checkpoint to restore.
from_detection_checkpoint: whether to restore from a full detection
checkpoint (with compatible variable names) or to restore from a
classification checkpoint for initialization prior to training.
Returns:
a callable which takes a tf.Session and does nothing.
A dict mapping variable names to variables.
"""
def restore(unused_sess):
return
return restore
return {var.op.name: var for var in tf.global_variables()}
class TrainerTest(tf.test.TestCase):
......
......@@ -120,6 +120,7 @@ py_library(
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow_models/object_detection/core:box_predictor",
"//tensorflow_models/object_detection/core:matcher",
"//tensorflow_models/object_detection/utils:shape_utils"
],
)
......
......@@ -22,6 +22,20 @@ from google.protobuf import text_format
from object_detection.protos import string_int_label_map_pb2
def _validate_label_map(label_map):
"""Checks if a label map is valid.
Args:
label_map: StringIntLabelMap to validate.
Raises:
ValueError: if label map is invalid.
"""
for item in label_map.item:
if item.id < 1:
raise ValueError('Label map ids should be >= 1.')
def create_category_index(categories):
"""Creates dictionary of COCO compatible categories keyed by category id.
......@@ -61,7 +75,7 @@ def convert_label_map_to_categories(label_map,
list is created with max_num_classes categories.
max_num_classes: maximum number of (consecutive) label indices to include.
use_display_name: (boolean) choose whether to load 'display_name' field
as category name. If False of if the display_name field does not exist,
as category name. If False or if the display_name field does not exist,
uses 'name' field as category names instead.
Returns:
categories: a list of dictionaries representing all possible categories.
......@@ -91,7 +105,6 @@ def convert_label_map_to_categories(label_map,
return categories
# TODO: double check documentaion.
def load_labelmap(path):
"""Loads label map proto.
......@@ -107,6 +120,7 @@ def load_labelmap(path):
text_format.Merge(label_map_string, label_map)
except text_format.ParseError:
label_map.ParseFromString(label_map_string)
_validate_label_map(label_map)
return label_map
......
......@@ -53,6 +53,28 @@ class LabelMapUtilTest(tf.test.TestCase):
self.assertEqual(label_map_dict['dog'], 1)
self.assertEqual(label_map_dict['cat'], 2)
def test_load_bad_label_map(self):
label_map_string = """
item {
id:0
name:'class that should not be indexed at zero'
}
item {
id:2
name:'cat'
}
item {
id:1
name:'dog'
}
"""
label_map_path = os.path.join(self.get_temp_dir(), 'label_map.pbtxt')
with tf.gfile.Open(label_map_path, 'wb') as f:
f.write(label_map_string)
with self.assertRaises(ValueError):
label_map_util.load_labelmap(label_map_path)
def test_keep_categories_with_unique_id(self):
label_map_proto = string_int_label_map_pb2.StringIntLabelMap()
label_map_string = """
......
......@@ -111,3 +111,26 @@ def pad_or_clip_tensor(t, length):
if not _is_tensor(length):
processed_t = _set_dim_0(processed_t, length)
return processed_t
def combined_static_and_dynamic_shape(tensor):
"""Returns a list containing static and dynamic values for the dimensions.
Returns a list of static and dynamic values for shape dimensions. This is
useful to preserve static shapes when available in reshape operation.
Args:
tensor: A tensor of any type.
Returns:
A list of size tensor.shape.ndims containing integers or a scalar tensor.
"""
static_shape = tensor.shape.as_list()
dynamic_shape = tf.shape(tensor)
combined_shape = []
for index, dim in enumerate(static_shape):
if dim is not None:
combined_shape.append(dim)
else:
combined_shape.append(dynamic_shape[index])
return combined_shape
......@@ -115,6 +115,13 @@ class UtilTest(tf.test.TestCase):
self.assertAllEqual([1, 2], tt3_result)
self.assertAllClose([[0.1, 0.2], [0.2, 0.4]], tt4_result)
def test_combines_static_dynamic_shape(self):
tensor = tf.placeholder(tf.float32, shape=(None, 2, 3))
combined_shape = shape_utils.combined_static_and_dynamic_shape(
tensor)
self.assertTrue(tf.contrib.framework.is_tensor(combined_shape[0]))
self.assertListEqual(combined_shape[1:], [2, 3])
if __name__ == '__main__':
tf.test.main()
......@@ -22,6 +22,7 @@ from object_detection.core import box_coder
from object_detection.core import box_list
from object_detection.core import box_predictor
from object_detection.core import matcher
from object_detection.utils import shape_utils
class MockBoxCoder(box_coder.BoxCoder):
......@@ -45,9 +46,10 @@ class MockBoxPredictor(box_predictor.BoxPredictor):
super(MockBoxPredictor, self).__init__(is_training, num_classes)
def _predict(self, image_features, num_predictions_per_location):
batch_size = image_features.get_shape().as_list()[0]
num_anchors = (image_features.get_shape().as_list()[1]
* image_features.get_shape().as_list()[2])
combined_feature_shape = shape_utils.combined_static_and_dynamic_shape(
image_features)
batch_size = combined_feature_shape[0]
num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
code_size = 4
zero = tf.reduce_sum(0 * image_features)
box_encodings = zero + tf.zeros(
......
......@@ -398,7 +398,7 @@ def visualize_boxes_and_labels_on_image_array(image,
classes[i] % len(STANDARD_COLORS)]
# Draw all boxes onto image.
for box, color in six.iteritems(box_to_color_map):
for box, color in box_to_color_map.items():
ymin, xmin, ymax, xmax = box
if instance_masks is not None:
draw_mask_on_image_array(
......
Code for several RL algorithms used in the following papers:
* "Improving Policy Gradient by Exploring Under-appreciated Rewards" by
Ofir Nachum, Mohammad Norouzi, and Dale Schuurmans.
* "Bridging the Gap Between Value and Policy Based Reinforcement Learning" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
* "Trust-PCL: An Off-Policy Trust Region Method for Continuous Control" by
Ofir Nachum, Mohammad Norouzi, Kelvin Xu, and Dale Schuurmans.
Available algorithms:
* Actor Critic
* TRPO
* PCL
* Unified PCL
* Trust-PCL
* PCL + Constraint Trust Region (unpublished)
* REINFORCE
* UREX
Requirements:
* TensorFlow (see http://www.tensorflow.org for how to install/upgrade)
* OpenAI Gym (see http://gym.openai.com/docs)
* NumPy (see http://www.numpy.org/)
* SciPy (see http://www.scipy.org/)
Quick Start:
Run UREX on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.1 --clip_norm=50 \
--num_samples=10 --objective=urex
```
Run REINFORCE on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.01 --clip_norm=50 \
--num_samples=10 --objective=reinforce
```
Run PCL on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl
```
Run PCL with expert trajectories on a simple environment:
```
python trainer.py --logtostderr --batch_size=400 --env=DuplicatedInput-v0 \
--validation_frequency=25 --tau=0.025 --rollout=10 --critic_weight=1.0 \
--gamma=0.9 --clip_norm=10 --replay_buffer_freq=1 --objective=pcl \
--num_expert_paths=10
```
Run a MuJoCo task with TRPO:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --rollout=10 --gamma=0.995 \
--max_step=1000 --cutoff_agent=1000 \
--objective=trpo --norecurrent --internal_dim=64 --trust_region_p \
  --max_divergence=0.05 --value_opt=best_fit --critic_weight=0.0
```
Run a MuJoCo task with Trust-PCL:
```
python trainer.py --logtostderr --batch_size=1 --env=HalfCheetah-v1 \
--validation_frequency=50 --rollout=10 --critic_weight=0.0 \
--gamma=0.995 --clip_norm=40 --learning_rate=0.002 \
--replay_buffer_freq=1 --replay_buffer_size=20000 \
--replay_buffer_alpha=0.1 --norecurrent --objective=pcl \
--max_step=100 --tau=0.0 --eviction=fifo --max_divergence=0.001 \
--internal_dim=64 --cutoff_agent=1000 \
--replay_batch_size=25 --nouse_online_batch --batch_by_steps \
--sample_from=target --value_opt=grad --value_hidden_layers=2 \
--update_eps_lambda --unify_episodes --clip_adv=1.0 \
--target_network_lag=0.99 --prioritize_by=step
```
Run a MuJoCo task with PCL constraint trust region:
```
python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
--validation_frequency=5 --tau=0.001 --rollout=50 --gamma=0.99 \
--max_step=1000 --cutoff_agent=1000 \
--objective=pcl --norecurrent --internal_dim=64 --trust_region_p \
--max_divergence=0.01 --value_opt=best_fit --critic_weight=0.0 \
--tau_decay=0.1 --tau_start=0.1
```
Maintained by Ofir Nachum (ofirnachum).
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Baseline model for value estimates.
Implements the value component of the neural network.
In some cases this is just an additional linear layer on top of the policy network.
In other cases, it is a completely separate neural network.
"""
import tensorflow as tf
import numpy as np
class Baseline(object):
def __init__(self, env_spec, internal_policy_dim,
input_prev_actions=True,
input_time_step=False,
input_policy_state=True,
n_hidden_layers=0,
hidden_dim=64,
tau=0.0):
self.env_spec = env_spec
self.internal_policy_dim = internal_policy_dim
self.input_prev_actions = input_prev_actions
self.input_time_step = input_time_step
self.input_policy_state = input_policy_state
self.n_hidden_layers = n_hidden_layers
self.hidden_dim = hidden_dim
self.tau = tau
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
def get_inputs(self, time_step, obs, prev_actions,
internal_policy_states):
"""Get inputs to network as single tensor."""
inputs = [tf.ones_like(time_step)]
input_dim = 1
if not self.input_policy_state:
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
inputs.append(
tf.one_hot(obs[i], obs_dim))
input_dim += obs_dim
elif self.env_spec.is_box(obs_type):
cur_obs = obs[i]
inputs.append(cur_obs)
inputs.append(cur_obs ** 2)
input_dim += obs_dim * 2
else:
assert False
if self.input_prev_actions:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(act_type):
inputs.append(
tf.one_hot(prev_actions[i], act_dim))
input_dim += act_dim
elif self.env_spec.is_box(act_type):
inputs.append(prev_actions[i])
input_dim += act_dim
else:
assert False
if self.input_policy_state:
inputs.append(internal_policy_states)
input_dim += self.internal_policy_dim
if self.input_time_step:
scaled_time = 0.01 * time_step
inputs.extend([scaled_time, scaled_time ** 2, scaled_time ** 3])
input_dim += 3
return input_dim, tf.concat(inputs, 1)
def reshape_batched_inputs(self, all_obs, all_actions,
internal_policy_states, policy_logits):
"""Reshape inputs from [time_length, batch_size, ...] to
[time_length * batch_size, ...].
This allows for computing the value estimate in one go.
"""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_prev_act = []
reshaped_policy_logits = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
reshaped_policy_logits.append(
tf.reshape(policy_logits[i], [time_length * batch_size, -1]))
reshaped_internal_policy_states = tf.reshape(
internal_policy_states,
[time_length * batch_size, self.internal_policy_dim])
time_step = (float(self.input_time_step) *
tf.expand_dims(
tf.to_float(tf.range(time_length * batch_size) /
batch_size), -1))
return (time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits)
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
"""Get value estimates given input."""
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
for depth in xrange(self.n_hidden_layers):
with tf.variable_scope('value_layer%d' % depth):
w = tf.get_variable('w', [input_dim, self.hidden_dim])
inputs = tf.nn.tanh(tf.matmul(inputs, w))
input_dim = self.hidden_dim
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
values = tf.matmul(inputs, w_v)
values = tf.reshape(values, [time_length, batch_size])
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
class UnifiedBaseline(Baseline):
"""Baseline for Unified PCL."""
def get_values(self, all_obs, all_actions, internal_policy_states,
policy_logits):
batch_size = tf.shape(all_obs[0])[1]
time_length = tf.shape(all_obs[0])[0]
(time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states,
reshaped_policy_logits) = self.reshape_batched_inputs(
all_obs, all_actions, internal_policy_states, policy_logits)
def f_transform(q_values, tau):
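# Numerically stable tau * logsumexp(q_values / tau): factor out the max
# before exponentiating.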
max_q = tf.reduce_max(q_values, -1, keep_dims=True)
return tf.squeeze(max_q, [-1]) + tau * tf.log(
tf.reduce_sum(tf.exp((q_values - max_q) / tau), -1))
assert len(reshaped_policy_logits) == 1
values = f_transform((self.tau + self.eps_lambda) * reshaped_policy_logits[0],
(self.tau + self.eps_lambda))
values = tf.reshape(values, [time_length, batch_size])
# not used
input_dim, inputs = self.get_inputs(
time_step, reshaped_obs, reshaped_prev_act,
reshaped_internal_policy_states)
w_v = tf.get_variable('w_v', [input_dim, 1],
initializer=self.matrix_init)
inputs = inputs[:-batch_size] # remove "final vals"
return values, inputs, w_v
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Controller coordinates sampling and training model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import pickle
import random
flags = tf.flags
gfile = tf.gfile
FLAGS = flags.FLAGS
def find_best_eps_lambda(rewards, lengths):
"""Find the best lambda given a desired epsilon = FLAGS.max_divergence."""
# perhaps not the best way to do this
desired_div = FLAGS.max_divergence * np.mean(lengths)
def calc_divergence(eps_lambda):
max_reward = np.max(rewards)
logz = (max_reward / eps_lambda +
np.log(np.mean(np.exp((rewards - max_reward) / eps_lambda))))
exprr = np.mean(np.exp(rewards / eps_lambda - logz) *
rewards / eps_lambda)
return exprr - logz
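# Bisection on eps_lambda: the divergence shrinks as eps_lambda grows, so
# raise the lower bound while the computed divergence is still above the
# desired one.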
left = 0.0
right = 1000.0
if len(rewards) <= 8:
return (left + right) / 2
num_iter = max(4, 1 + int(np.log((right - left) / 0.1) / np.log(2.0)))
for _ in xrange(num_iter):
mid = (left + right) / 2
cur_div = calc_divergence(mid)
if cur_div > desired_div:
left = mid
else:
right = mid
return (left + right) / 2
class Controller(object):
def __init__(self, env, env_spec, internal_dim,
use_online_batch=True,
batch_by_steps=False,
unify_episodes=False,
replay_batch_size=None,
max_step=None,
cutoff_agent=1,
save_trajectories_file=None,
use_trust_region=False,
use_value_opt=False,
update_eps_lambda=False,
prioritize_by='rewards',
get_model=None,
get_replay_buffer=None,
get_buffer_seeds=None):
self.env = env
self.env_spec = env_spec
self.internal_dim = internal_dim
self.use_online_batch = use_online_batch
self.batch_by_steps = batch_by_steps
self.unify_episodes = unify_episodes
self.replay_batch_size = replay_batch_size
self.max_step = max_step
self.cutoff_agent = cutoff_agent
self.save_trajectories_file = save_trajectories_file
self.use_trust_region = use_trust_region
self.use_value_opt = use_value_opt
self.update_eps_lambda = update_eps_lambda
self.prioritize_by = prioritize_by
self.model = get_model()
self.replay_buffer = get_replay_buffer()
self.seed_replay_buffer(get_buffer_seeds())
self.internal_state = np.array([self.initial_internal_state()] *
len(self.env))
self.last_obs = self.env_spec.initial_obs(len(self.env))
self.last_act = self.env_spec.initial_act(len(self.env))
self.last_pad = np.zeros(len(self.env))
self.start_episode = np.array([True] * len(self.env))
self.step_count = np.array([0] * len(self.env))
self.episode_running_rewards = np.zeros(len(self.env))
self.episode_running_lengths = np.zeros(len(self.env))
self.episode_rewards = []
self.episode_lengths = []
self.total_rewards = []
self.best_batch_rewards = None
def setup(self):
self.model.setup()
def initial_internal_state(self):
return np.zeros(self.model.policy.rnn_state_dim)
def _sample_episodes(self, sess, greedy=False):
"""Sample episodes from environment using model."""
# reset environments as necessary
obs_after_reset = self.env.reset_if(self.start_episode)
for i, obs in enumerate(obs_after_reset):
if obs is not None:
self.step_count[i] = 0
self.internal_state[i] = self.initial_internal_state()
for j in xrange(len(self.env_spec.obs_dims)):
self.last_obs[j][i] = obs[j]
for j in xrange(len(self.env_spec.act_dims)):
self.last_act[j][i] = -1
self.last_pad[i] = 0
# maintain episode as a single unit if the last sampling
# batch ended before the episode was terminated
if self.unify_episodes:
assert len(obs_after_reset) == 1
new_ep = obs_after_reset[0] is not None
else:
new_ep = True
self.start_id = 0 if new_ep else len(self.all_obs[:])
initial_state = self.internal_state
all_obs = [] if new_ep else self.all_obs[:]
all_act = ([self.last_act] if new_ep else self.all_act[:])
all_pad = [] if new_ep else self.all_pad[:]
rewards = [] if new_ep else self.rewards[:]
# start stepping in the environments
step = 0
while not self.env.all_done():
self.step_count += 1 - np.array(self.env.dones)
next_internal_state, sampled_actions = self.model.sample_step(
sess, self.last_obs, self.internal_state, self.last_act,
greedy=greedy)
env_actions = self.env_spec.convert_actions_to_env(sampled_actions)
next_obs, reward, next_dones, _ = self.env.step(env_actions)
all_obs.append(self.last_obs)
all_act.append(sampled_actions)
all_pad.append(self.last_pad)
rewards.append(reward)
self.internal_state = next_internal_state
self.last_obs = next_obs
self.last_act = sampled_actions
self.last_pad = np.array(next_dones).astype('float32')
step += 1
if self.max_step and step >= self.max_step:
break
self.all_obs = all_obs[:]
self.all_act = all_act[:]
self.all_pad = all_pad[:]
self.rewards = rewards[:]
# append final observation
all_obs.append(self.last_obs)
return initial_state, all_obs, all_act, rewards, all_pad
def sample_episodes(self, sess):
"""Sample steps from the environment until we have enough for a batch."""
# check if last batch ended with episode that was not terminated
if self.unify_episodes:
self.all_new_ep = self.start_episode[0]
# sample episodes until we either have enough episodes or enough steps
episodes = []
total_steps = 0
while total_steps < self.max_step * len(self.env):
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess)
observations = zip(*observations)
actions = zip(*actions)
terminated = np.array(self.env.dones)
self.total_rewards = np.sum(np.array(rewards[self.start_id:]) *
(1 - np.array(pads[self.start_id:])), axis=0)
self.episode_running_rewards *= 1 - self.start_episode
self.episode_running_lengths *= 1 - self.start_episode
self.episode_running_rewards += self.total_rewards
self.episode_running_lengths += np.sum(1 - np.array(pads[self.start_id:]), axis=0)
episodes.extend(self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads))
total_steps += np.sum(1 - np.array(pads))
# set next starting episodes
self.start_episode = np.logical_or(terminated,
self.step_count >= self.cutoff_agent)
episode_rewards = self.episode_running_rewards[self.start_episode].tolist()
self.episode_rewards.extend(episode_rewards)
self.episode_lengths.extend(self.episode_running_lengths[self.start_episode].tolist())
self.episode_rewards = self.episode_rewards[-100:]
self.episode_lengths = self.episode_lengths[-100:]
if (self.save_trajectories_file is not None and
(self.best_batch_rewards is None or
np.mean(self.total_rewards) > self.best_batch_rewards)):
self.best_batch_rewards = np.mean(self.total_rewards)
my_episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
with gfile.GFile(self.save_trajectories_file, 'w') as f:
pickle.dump(my_episodes, f)
if not self.batch_by_steps:
return (initial_state,
observations, actions, rewards,
terminated, pads)
return self.convert_to_batched_episodes(episodes)
def _train(self, sess,
observations, initial_state, actions,
rewards, terminated, pads):
"""Train model using batch."""
if self.use_trust_region:
# use trust region to optimize policy
loss, _, summary = self.model.trust_region_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
else: # otherwise use simple gradient descent on policy
loss, _, summary = self.model.train_step(
sess,
observations, initial_state, actions,
rewards, terminated, pads,
avg_episode_reward=np.mean(self.episode_rewards))
if self.use_value_opt: # optionally perform specific value optimization
self.model.fit_values(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary
def train(self, sess):
"""Sample some episodes and train on some episodes."""
cur_step = sess.run(self.model.inc_global_step)
self.cur_step = cur_step
# on the first iteration, set target network close to online network
if self.cur_step == 0:
for _ in xrange(100):
sess.run(self.model.copy_op)
# on other iterations, just perform single target <-- online operation
sess.run(self.model.copy_op)
# sample from env
(initial_state,
observations, actions, rewards,
terminated, pads) = self.sample_episodes(sess)
# add to replay buffer
self.add_to_replay_buffer(
initial_state, observations, actions,
rewards, terminated, pads)
loss, summary = 0, None
# train on online batch
if self.use_online_batch:
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
# update relative entropy coefficient
if self.update_eps_lambda:
episode_rewards = np.array(self.episode_rewards)
episode_lengths = np.array(self.episode_lengths)
eps_lambda = find_best_eps_lambda(episode_rewards, episode_lengths)
sess.run(self.model.objective.assign_eps_lambda,
feed_dict={self.model.objective.new_eps_lambda: eps_lambda})
# train on replay batch
replay_batch, replay_probs = self.get_from_replay_buffer(
self.replay_batch_size)
if replay_batch:
(initial_state,
observations, actions, rewards,
terminated, pads) = replay_batch
loss, summary = self._train(
sess,
observations, initial_state, actions,
rewards, terminated, pads)
return loss, summary, self.total_rewards, self.episode_rewards
def eval(self, sess):
"""Use greedy sampling."""
(initial_state,
observations, actions, rewards,
pads) = self._sample_episodes(sess, greedy=True)
total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)
return np.mean(total_rewards)
def convert_from_batched_episodes(
self, initial_state, observations, actions, rewards,
terminated, pads):
"""Convert time-major batch of episodes to batch-major list of episodes."""
rewards = np.array(rewards)
pads = np.array(pads)
observations = [np.array(obs) for obs in observations]
actions = [np.array(act) for act in actions]
total_rewards = np.sum(rewards * (1 - pads), axis=0)
total_length = np.sum(1 - pads, axis=0).astype('int32')
episodes = []
num_episodes = rewards.shape[1]
for i in xrange(num_episodes):
length = total_length[i]
ep_initial = initial_state[i]
ep_obs = [obs[:length, i, ...] for obs in observations]
ep_act = [act[:length + 1, i, ...] for act in actions]
ep_rewards = rewards[:length, i]
episodes.append(
[ep_initial, ep_obs, ep_act, ep_rewards, terminated[i]])
return episodes
def convert_to_batched_episodes(self, episodes, max_length=None):
"""Convert batch-major list of episodes to time-major batch of episodes."""
lengths = [len(ep[-2]) for ep in episodes]
max_length = max_length or max(lengths)
new_episodes = []
for ep, length in zip(episodes, lengths):
initial, observations, actions, rewards, terminated = ep
observations = [np.resize(obs, [max_length + 1] + list(obs.shape)[1:])
for obs in observations]
actions = [np.resize(act, [max_length + 1] + list(act.shape)[1:])
for act in actions]
pads = np.array([0] * length + [1] * (max_length - length))
rewards = np.resize(rewards, [max_length]) * (1 - pads)
new_episodes.append([initial, observations, actions, rewards,
terminated, pads])
(initial, observations, actions, rewards,
terminated, pads) = zip(*new_episodes)
observations = [np.swapaxes(obs, 0, 1)
for obs in zip(*observations)]
actions = [np.swapaxes(act, 0, 1)
for act in zip(*actions)]
rewards = np.transpose(rewards)
pads = np.transpose(pads)
return (initial, observations, actions, rewards, terminated, pads)
def add_to_replay_buffer(self, initial_state,
observations, actions, rewards,
terminated, pads):
"""Add batch of episodes to replay buffer."""
if self.replay_buffer is None:
return
rewards = np.array(rewards)
pads = np.array(pads)
total_rewards = np.sum(rewards * (1 - pads), axis=0)
episodes = self.convert_from_batched_episodes(
initial_state, observations, actions, rewards,
terminated, pads)
priorities = (total_rewards if self.prioritize_by == 'reward'
else self.cur_step)
if not self.unify_episodes or self.all_new_ep:
self.last_idxs = self.replay_buffer.add(
episodes, priorities)
else:
# If we are unifying episodes, we attempt to
# keep them unified in the replay buffer.
# The first episode sampled in the current batch is a
# continuation of the last episode from the previous batch
self.replay_buffer.add(episodes[:1], priorities, self.last_idxs[-1:])
if len(episodes) > 1:
self.replay_buffer.add(episodes[1:], priorities)
def get_from_replay_buffer(self, batch_size):
"""Sample a batch of episodes from the replay buffer."""
if self.replay_buffer is None or len(self.replay_buffer) < 1 * batch_size:
return None, None
desired_count = batch_size * self.max_step
# in the case of batch_by_steps, we sample larger and larger
# amounts from the replay buffer until we have enough steps.
while True:
if batch_size > len(self.replay_buffer):
batch_size = len(self.replay_buffer)
episodes, probs = self.replay_buffer.get_batch(batch_size)
count = sum(len(ep[-2]) for ep in episodes)
if count >= desired_count or not self.batch_by_steps:
break
if batch_size == len(self.replay_buffer):
return None, None
batch_size *= 1.2
return (self.convert_to_batched_episodes(episodes), probs)
def seed_replay_buffer(self, episodes):
"""Seed the replay buffer with some episodes."""
if self.replay_buffer is None:
return
# just need to add initial state
for i in xrange(len(episodes)):
episodes[i] = [self.initial_internal_state()] + episodes[i]
self.replay_buffer.seed_buffer(episodes)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for environment interface with agent / tensorflow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class spaces(object):
discrete = 0
box = 1
def get_space(space):
if hasattr(space, 'n'):
return space.n, spaces.discrete, None
elif hasattr(space, 'shape'):
return np.prod(space.shape), spaces.box, (space.low, space.high)
def get_spaces(spaces):
if hasattr(spaces, 'spaces'):
return zip(*[get_space(space) for space in spaces.spaces])
else:
return [(ret,) for ret in get_space(spaces)]
class EnvSpec(object):
def __init__(self, env, try_combining_actions=True,
discretize_actions=None):
self.discretize_actions = discretize_actions
# figure out observation space
self.obs_space = env.observation_space
self.obs_dims, self.obs_types, self.obs_info = get_spaces(self.obs_space)
# figure out action space
self.act_space = env.action_space
self.act_dims, self.act_types, self.act_info = get_spaces(self.act_space)
if self.discretize_actions:
self._act_dims = self.act_dims[:]
self._act_types = self.act_types[:]
self.act_dims = []
self.act_types = []
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
self.act_dims.append(dim)
self.act_types.append(spaces.discrete)
elif typ == spaces.box:
for _ in xrange(dim):
self.act_dims.append(self.discretize_actions)
self.act_types.append(spaces.discrete)
else:
self._act_dims = None
self._act_types = None
if (try_combining_actions and
all(typ == spaces.discrete for typ in self.act_types)):
self.combine_actions = True
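# Collapse a tuple of discrete action spaces into a single discrete space
# whose size is the product of the individual dimensions.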
self.orig_act_dims = self.act_dims[:]
self.orig_act_types = self.act_types[:]
total_act_dim = 1
for dim in self.act_dims:
total_act_dim *= dim
self.act_dims = [total_act_dim]
self.act_types = [spaces.discrete]
else:
self.combine_actions = False
self.obs_dims_and_types = zip(self.obs_dims, self.obs_types)
self.act_dims_and_types = zip(self.act_dims, self.act_types)
self.total_obs_dim = sum(self.obs_dims)
self.total_sampling_act_dim = sum(self.sampling_dim(dim, typ)
for dim, typ in self.act_dims_and_types)
self.total_sampled_act_dim = sum(self.act_dims)
def sampling_dim(self, dim, typ):
if typ == spaces.discrete:
return dim
elif typ == spaces.box:
return 2 * dim # Gaussian mean and std
else:
assert False
def convert_actions_to_env(self, actions):
if self.combine_actions:
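# Decode the single combined discrete action back into its factors via
# repeated mod / integer division.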
new_actions = []
actions = actions[0]
for dim in self.orig_act_dims:
new_actions.append(np.mod(actions, dim))
actions = (actions / dim).astype('int32')
actions = new_actions
if self.discretize_actions:
new_actions = []
idx = 0
for i, (dim, typ) in enumerate(zip(self._act_dims, self._act_types)):
if typ == spaces.discrete:
new_actions.append(actions[idx])
idx += 1
elif typ == spaces.box:
low, high = self.act_info[i]
cur_action = []
for j in xrange(dim):
cur_action.append(
low[j] + (high[j] - low[j]) * actions[idx] /
float(self.discretize_actions))
idx += 1
new_actions.append(np.hstack(cur_action))
actions = new_actions
return actions
def convert_env_actions_to_actions(self, actions):
if not self.combine_actions:
return actions
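# Inverse of the decoding above: mixed-radix encode the per-factor actions
# into one combined action.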
new_actions = 0
base = 1
for act, dim in zip(actions, self.orig_act_dims):
new_actions = new_actions + base * act
base *= dim
return [new_actions]
def convert_obs_to_list(self, obs):
if len(self.obs_dims) == 1:
return [obs]
else:
return list(obs)
def convert_action_to_gym(self, action):
if len(action) == 1:
return action[0]
else:
return list(action)
if ((not self.combine_actions or len(self.orig_act_dims) == 1) and
(len(self.act_dims) == 1 or
(self.discretize_actions and len(self._act_dims) == 1))):
return action[0]
else:
return list(action)
def initial_obs(self, batch_size):
batched = batch_size is not None
batch_size = batch_size or 1
obs = []
for dim, typ in self.obs_dims_and_types:
if typ == spaces.discrete:
obs.append(np.zeros(batch_size))
elif typ == spaces.box:
obs.append(np.zeros([batch_size, dim]))
if batched:
return obs
else:
return zip(*obs)[0]
def initial_act(self, batch_size=None):
batched = batch_size is not None
batch_size = batch_size or 1
act = []
for dim, typ in self.act_dims_and_types:
if typ == spaces.discrete:
act.append(-np.ones(batch_size))
elif typ == spaces.box:
act.append(-np.ones([batch_size, dim]))
if batched:
return act
else:
return zip(*act)[0]
def is_discrete(self, typ):
return typ == spaces.discrete
def is_box(self, typ):
return typ == spaces.box
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Expert paths/trajectories.
For producing or loading expert trajectories in an environment.
"""
import tensorflow as tf
import random
import os
import numpy as np
import pickle
gfile = tf.gfile
def sample_expert_paths(num, env_str, env_spec,
load_trajectories_file=None):
"""Sample a number of expert paths randomly."""
if load_trajectories_file is not None:
if not gfile.Exists(load_trajectories_file):
assert False, 'trajectories file %s does not exist' % load_trajectories_file
with gfile.GFile(load_trajectories_file, 'r') as f:
episodes = pickle.load(f)
episodes = random.sample(episodes, num)
return [ep[1:] for ep in episodes]
return [sample_expert_path(env_str, env_spec)
for _ in xrange(num)]
def sample_expert_path(env_str, env_spec):
"""Algorithmic tasks have known distribution of expert paths we sample from."""
t = random.randint(2, 10) # sequence length
observations = []
actions = [env_spec.initial_act(None)]
rewards = []
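# gym's algorithmic tasks take tuple actions of the form
# (tape movement, write flag, predicted character).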
if env_str in ['DuplicatedInput-v0', 'Copy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(t):
char_idx = tt // 2 if env_str == 'DuplicatedInput-v0' else tt
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1, (tt + 1) % 2, char])
rewards.append((tt + 1) % 2)
elif env_str in ['RepeatCopy-v0']:
chars = 5
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(3 * t + 2):
char_idx = (tt if tt < t else
2 * t - tt if tt <= 2 * t else
tt - 2 * t - 2)
if tt in [t, 2 * t + 1]:
char = chars
else:
char = random_ints[char_idx] % chars
observations.append([char])
actions.append([1 if tt < t else 0 if tt <= 2 * t else 1,
tt not in [t, 2 * t + 1], char])
rewards.append(actions[-1][-2])
elif env_str in ['Reverse-v0']:
chars = 2
random_ints = [int(random.random() * 1000) for _ in xrange(t)]
for tt in xrange(2 * t + 1):
char_idx = tt if tt < t else 2 * t - tt
if tt != t:
char = random_ints[char_idx] % chars
else:
char = chars
observations.append([char])
actions.append([tt < t, tt > t, char])
rewards.append(tt > t)
elif env_str in ['ReversedAddition-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 2 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 1, 2: 2, 3: 1}
for tt in xrange(2 * t + 1):
char_idx = tt
if tt >= 2 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 2 == 1:
tot = char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 2 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 2 or tt == 2 * t, tot])
rewards.append(tt % 2 or tt == 2 * t)
elif env_str in ['ReversedAddition3-v0']:
chars = 3
random_ints = [int(random.random() * 1000) for _ in xrange(1 + 3 * t)]
carry = 0
char_history = []
move_map = {0: 3, 1: 3, 2: 1, 3: 2, 4: 2, 5: 1}
for tt in xrange(3 * t + 1):
char_idx = tt
if tt >= 3 * t:
char = chars
else:
char = random_ints[char_idx] % chars
char_history.append(char)
if tt % 3 == 2:
tot = char_history[-3] + char_history[-2] + char_history[-1] + carry
carry = tot // chars
tot = tot % chars
elif tt == 3 * t:
tot = carry
else:
tot = 0
observations.append([char])
actions.append([move_map[tt % len(move_map)],
tt % 3 == 2 or tt == 3 * t, tot])
rewards.append(tt % 3 == 2 or tt == 3 * t)
else:
assert False, 'No expert trajectories for env %s' % env_str
actions = [
env_spec.convert_env_actions_to_actions(act)
for act in actions]
observations.append([chars])
observations = [np.array(obs) for obs in zip(*observations)]
actions = [np.array(act) for act in zip(*actions)]
rewards = np.array(rewards)
return [observations, actions, rewards, True]
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives for full-episode.
Implementations of UREX & REINFORCE. Note that these implementations
use a non-parametric baseline to reduce variance. Thus, multiple
samples with the same seed must be taken from the environment.
"""
import tensorflow as tf
import objective
class Reinforce(objective.Objective):
def __init__(self, learning_rate, clip_norm, num_samples,
tau=0.1, bonus_weight=1.0):
super(Reinforce, self).__init__(learning_rate, clip_norm=clip_norm)
self.num_samples = num_samples
assert self.num_samples > 1
self.tau = tau
self.bonus_weight = bonus_weight
self.eps_lambda = 0.0
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
return -self.tau * total_log_probs
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
seq_length = tf.shape(rewards)[0]
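# Rollouts that share an environment seed are adjacent in the batch, so this
# reshape groups them along the last (num_samples) axis.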
not_pad = tf.reshape(1 - pads, [seq_length, -1, self.num_samples])
rewards = not_pad * tf.reshape(rewards, [seq_length, -1, self.num_samples])
log_probs = not_pad * tf.reshape(sum(log_probs), [seq_length, -1, self.num_samples])
total_rewards = tf.reduce_sum(rewards, 0)
total_log_probs = tf.reduce_sum(log_probs, 0)
rewards_and_bonus = (total_rewards +
self.bonus_weight *
self.get_bonus(total_rewards, total_log_probs))
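# Non-parametric baseline: average the reward-plus-bonus of the num_samples
# rollouts that share an environment seed.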
baseline = tf.reduce_mean(rewards_and_bonus, 1, keep_dims=True)
loss = -tf.stop_gradient(rewards_and_bonus - baseline) * total_log_probs
loss = tf.reduce_mean(loss)
raw_loss = loss # TODO
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', total_log_probs)
tf.summary.histogram('rewards', total_rewards)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(total_rewards))
tf.summary.scalar('loss', loss)
return loss, raw_loss, baseline, gradient_ops, tf.summary.merge_all()
class UREX(Reinforce):
def get_bonus(self, total_rewards, total_log_probs):
"""Exploration bonus."""
discrepancy = total_rewards / self.tau - total_log_probs
normalized_d = self.num_samples * tf.nn.softmax(discrepancy)
return self.tau * normalized_d
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper around gym env.
Allows for using batches of possibly identically seeded environments.
"""
import gym
import numpy as np
import random
import env_spec
def get_env(env_str):
return gym.make(env_str)
class GymWrapper(object):
def __init__(self, env_str, distinct=1, count=1, seeds=None):
self.distinct = distinct
self.count = count
self.total = self.distinct * self.count
self.seeds = seeds or [random.randint(0, 1e12)
for _ in xrange(self.distinct)]
self.envs = []
for seed in self.seeds:
for _ in xrange(self.count):
env = get_env(env_str)
env.seed(seed)
if hasattr(env, 'last'):
env.last = 100 # for algorithmic envs
self.envs.append(env)
self.dones = [True] * self.total
self.num_episodes_played = 0
one_env = self.get_one()
self.use_action_list = hasattr(one_env.action_space, 'spaces')
self.env_spec = env_spec.EnvSpec(self.get_one())
def get_seeds(self):
return self.seeds
def reset(self):
self.dones = [False] * self.total
self.num_episodes_played += len(self.envs)
# reset seeds to be synchronized
self.seeds = [random.randint(0, 1e12) for _ in xrange(self.distinct)]
counter = 0
for seed in self.seeds:
for _ in xrange(self.count):
self.envs[counter].seed(seed)
counter += 1
return [self.env_spec.convert_obs_to_list(env.reset())
for env in self.envs]
def reset_if(self, predicate=None):
if predicate is None:
predicate = self.dones
if self.count != 1:
assert np.all(predicate)
return self.reset()
self.num_episodes_played += sum(predicate)
output = [self.env_spec.convert_obs_to_list(env.reset())
if pred else None
for env, pred in zip(self.envs, predicate)]
for i, pred in enumerate(predicate):
if pred:
self.dones[i] = False
return output
def all_done(self):
return all(self.dones)
def step(self, actions):
def env_step(action):
action = self.env_spec.convert_action_to_gym(action)
obs, reward, done, tt = env.step(action)
obs = self.env_spec.convert_obs_to_list(obs)
return obs, reward, done, tt
actions = zip(*actions)
outputs = [env_step(action)
if not done else (self.env_spec.initial_obs(None), 0, True, None)
for action, env, done in zip(actions, self.envs, self.dones)]
for i, (_, _, done, _) in enumerate(outputs):
self.dones[i] = self.dones[i] or done
obs, reward, done, tt = zip(*outputs)
obs = [list(oo) for oo in zip(*obs)]
return [obs, reward, done, tt]
def get_one(self):
return random.choice(self.envs)
def __len__(self):
return len(self.envs)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model is responsible for setting up Tensorflow graph.
Creates policy and value networks. Also sets up all optimization
ops, including gradient ops, trust region ops, and value optimizers.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class Model(object):
def __init__(self, env_spec, global_step,
target_network_lag=0.95,
sample_from='online',
get_policy=None,
get_baseline=None,
get_objective=None,
get_trust_region_p_opt=None,
get_value_opt=None):
self.env_spec = env_spec
self.global_step = global_step
self.inc_global_step = self.global_step.assign_add(1)
self.target_network_lag = target_network_lag
self.sample_from = sample_from
self.policy = get_policy()
self.baseline = get_baseline()
self.objective = get_objective()
self.baseline.eps_lambda = self.objective.eps_lambda # TODO: do this better
self.trust_region_policy_opt = get_trust_region_p_opt()
self.value_opt = get_value_opt()
def setup_placeholders(self):
"""Create the Tensorflow placeholders."""
# summary placeholder
self.avg_episode_reward = tf.placeholder(
tf.float32, [], 'avg_episode_reward')
# sampling placeholders
self.internal_state = tf.placeholder(tf.float32,
[None, self.policy.rnn_state_dim],
'internal_state')
self.single_observation = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.single_observation.append(
tf.placeholder(tf.int32, [None], 'obs%d' % i))
elif self.env_spec.is_box(obs_type):
self.single_observation.append(
tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
else:
assert False
self.single_action = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.single_action.append(
tf.placeholder(tf.int32, [None], 'act%d' % i))
elif self.env_spec.is_box(action_type):
self.single_action.append(
tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
else:
assert False
# training placeholders
self.observations = []
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
self.observations.append(
tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
else:
self.observations.append(
tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))
self.actions = []
self.other_logits = []
for i, (action_dim, action_type) in \
enumerate(self.env_spec.act_dims_and_types):
if self.env_spec.is_discrete(action_type):
self.actions.append(
tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
if self.env_spec.is_box(action_type):
self.actions.append(
tf.placeholder(tf.float32, [None, None, action_dim],
'all_act%d' % i))
self.other_logits.append(
tf.placeholder(tf.float32, [None, None, None],
'other_logits%d' % i))
self.rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
self.terminated = tf.placeholder(tf.float32, [None], 'terminated')
self.pads = tf.placeholder(tf.float32, [None, None], 'pads')
self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
'prev_log_probs')
def setup(self):
"""Setup Tensorflow Graph."""
self.setup_placeholders()
tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
with tf.variable_scope('model', reuse=None):
# policy network
with tf.variable_scope('policy_net'):
(self.policy_internal_states, self.logits, self.log_probs,
self.entropies, self.self_kls) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
self.out_log_probs = sum(self.log_probs)
self.kl = self.policy.calculate_kl(self.other_logits, self.logits)
self.avg_kl = (tf.reduce_sum(sum(self.kl)[:-1] * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
# value network
with tf.variable_scope('value_net'):
(self.values,
self.regression_input,
self.regression_weight) = self.baseline.get_values(
self.observations, self.actions,
self.policy_internal_states, self.logits)
# target policy network
with tf.variable_scope('target_policy_net'):
(self.target_policy_internal_states,
self.target_logits, self.target_log_probs,
_, _) = \
self.policy.multi_step(self.observations,
self.internal_state,
self.actions)
# target value network
with tf.variable_scope('target_value_net'):
(self.target_values, _, _) = self.baseline.get_values(
self.observations, self.actions,
self.target_policy_internal_states, self.target_logits)
# construct copy op online --> target
all_vars = tf.trainable_variables()
online_vars = [p for p in all_vars if
'/policy_net' in p.name or '/value_net' in p.name]
target_vars = [p for p in all_vars if
'target_policy_net' in p.name or 'target_value_net' in p.name]
online_vars.sort(key=lambda p: p.name)
target_vars.sort(key=lambda p: p.name)
aa = self.target_network_lag
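# Soft update of the target networks: target <- aa * target + (1 - aa) * online.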
self.copy_op = tf.group(*[
target_p.assign(aa * target_p + (1 - aa) * online_p)
for online_p, target_p in zip(online_vars, target_vars)])
# evaluate objective
(self.loss, self.raw_loss, self.regression_target,
self.gradient_ops, self.summary) = self.objective.get(
self.rewards, self.pads,
self.values[:-1, :],
self.values[-1, :] * (1 - self.terminated),
self.log_probs, self.prev_log_probs, self.target_log_probs,
self.entropies,
self.logits)
self.regression_target = tf.reshape(self.regression_target, [-1])
self.policy_vars = [
v for v in tf.trainable_variables()
if '/policy_net' in v.name]
self.value_vars = [
v for v in tf.trainable_variables()
if '/value_net' in v.name]
# trust region optimizer
if self.trust_region_policy_opt is not None:
with tf.variable_scope('trust_region_policy', reuse=None):
avg_self_kl = (
tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
tf.reduce_sum(1 - self.pads))
self.trust_region_policy_opt.setup(
self.policy_vars, self.raw_loss, avg_self_kl,
self.avg_kl)
# value optimizer
if self.value_opt is not None:
with tf.variable_scope('trust_region_value', reuse=None):
self.value_opt.setup(
self.value_vars,
tf.reshape(self.values[:-1, :], [-1]),
self.regression_target,
tf.reshape(self.pads, [-1]),
self.regression_input, self.regression_weight)
# we re-use variables for the sampling operations
with tf.variable_scope('model', reuse=True):
scope = ('target_policy_net' if self.sample_from == 'target'
else 'policy_net')
with tf.variable_scope(scope):
self.next_internal_state, self.sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action)
self.greedy_next_internal_state, self.greedy_sampled_actions = \
self.policy.sample_step(self.single_observation,
self.internal_state,
self.single_action,
greedy=True)
def sample_step(self, sess,
single_observation, internal_state, single_action,
greedy=False):
"""Sample batch of steps from policy."""
if greedy:
outputs = [self.greedy_next_internal_state, self.greedy_sampled_actions]
else:
outputs = [self.next_internal_state, self.sampled_actions]
feed_dict = {self.internal_state: internal_state}
for action_place, action in zip(self.single_action, single_action):
feed_dict[action_place] = action
for obs_place, obs in zip(self.single_observation, single_observation):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def train_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train network using standard gradient descent."""
outputs = [self.raw_loss, self.gradient_ops, self.summary]
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
return sess.run(outputs, feed_dict=feed_dict)
def trust_region_step(self, sess,
observations, internal_state, actions,
rewards, terminated, pads,
avg_episode_reward=0):
"""Train policy using trust region step."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads,
self.avg_episode_reward: avg_episode_reward}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
(prev_log_probs, prev_logits) = sess.run(
[self.out_log_probs, self.logits], feed_dict=feed_dict)
feed_dict[self.prev_log_probs] = prev_log_probs
for other_logit, prev_logit in zip(self.other_logits, prev_logits):
feed_dict[other_logit] = prev_logit
# fit policy
self.trust_region_policy_opt.optimize(sess, feed_dict)
ret = sess.run([self.raw_loss, self.summary], feed_dict=feed_dict)
ret = [ret[0], None, ret[1]]
return ret
def fit_values(self, sess,
observations, internal_state, actions,
rewards, terminated, pads):
"""Train value network using value-specific optimizer."""
feed_dict = {self.internal_state: internal_state,
self.rewards: rewards,
self.terminated: terminated,
self.pads: pads}
for action_place, action in zip(self.actions, actions):
feed_dict[action_place] = action
for obs_place, obs in zip(self.observations, observations):
feed_dict[obs_place] = obs
# fit values
if self.value_opt is None:
raise ValueError('Specific value optimizer does not exist')
self.value_opt.optimize(sess, feed_dict)
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Objectives to compute loss and value targets.
Implements Actor Critic, PCL (vanilla PCL, Unified PCL, Trust PCL), and TRPO.
"""
import tensorflow as tf
import numpy as np
class Objective(object):
def __init__(self, learning_rate, clip_norm):
self.learning_rate = learning_rate
self.clip_norm = clip_norm
def get_optimizer(self, learning_rate):
"""Optimizer for gradient descent ops."""
return tf.train.AdamOptimizer(learning_rate=learning_rate,
epsilon=2e-4)
def training_ops(self, loss, learning_rate=None):
"""Gradient ops."""
opt = self.get_optimizer(learning_rate)
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
if self.clip_norm:
grads, global_norm = tf.clip_by_global_norm(grads, self.clip_norm)
tf.summary.scalar('grad_global_norm', global_norm)
return opt.apply_gradients(zip(grads, params))
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
"""Get objective calculations."""
raise NotImplementedError()
def discounted_future_sum(values, discount, rollout):
"""Discounted future sum of time-major values."""
discount_filter = tf.reshape(
discount ** tf.range(float(rollout)), [-1, 1, 1])
expanded_values = tf.concat(
[values, tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
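# Illustrative NumPy cross-check (hypothetical helper, not part of the original
# file): a direct double loop equivalent to discounted_future_sum above.
def _discounted_future_sum_np(values, discount, rollout):
  """Reference implementation over a [time, batch] NumPy array."""
  time_length, batch_size = values.shape
  padded = np.concatenate([values, np.zeros((rollout - 1, batch_size))], 0)
  out = np.zeros(values.shape)
  for t in range(time_length):
    for k in range(rollout):
      out[t] += (discount ** k) * padded[t + k]
  return out
# For np.ones((3, 1)), discount=0.5, rollout=2 this yields [[1.5], [1.5], [1.0]]:
# each step adds its own value plus half of the next (zeros past the end).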
def discounted_two_sided_sum(values, discount, rollout):
"""Discounted two-sided sum of time-major values."""
roll = float(rollout)
discount_filter = tf.reshape(
discount ** tf.abs(tf.range(-roll + 1, roll)), [-1, 1, 1])
expanded_values = tf.concat(
[tf.zeros([rollout - 1, tf.shape(values)[1]]), values,
tf.zeros([rollout - 1, tf.shape(values)[1]])], 0)
conv_values = tf.transpose(tf.squeeze(tf.nn.conv1d(
tf.expand_dims(tf.transpose(expanded_values), -1), discount_filter,
stride=1, padding='VALID'), -1))
return conv_values
def shift_values(values, discount, rollout, final_values=0.0):
"""Shift values up by some amount of time.
  Entries that would be shifted in from beyond the last time step
  are filled in using final_values.
"""
roll_range = tf.cumsum(tf.ones_like(values[:rollout, :]), 0,
exclusive=True, reverse=True)
final_pad = tf.expand_dims(final_values, 0) * discount ** roll_range
return tf.concat([discount ** rollout * values[rollout:, :],
final_pad], 0)
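# For example, with time length 3, rollout 2 and discount d, shift_values
# returns [d**2 * values[2], d * final_values, final_values]: entries shifted in
# from beyond the last step are filled from final_values with decreasing powers
# of the discount.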
class ActorCritic(Objective):
"""Standard Actor-Critic."""
def __init__(self, learning_rate, clip_norm=5,
policy_weight=1.0, critic_weight=0.1,
tau=0.1, gamma=1.0, rollout=10,
eps_lambda=0.0, clip_adv=None):
super(ActorCritic, self).__init__(learning_rate, clip_norm=clip_norm)
self.policy_weight = policy_weight
self.critic_weight = critic_weight
self.tau = tau
self.gamma = gamma
self.rollout = rollout
self.clip_adv = clip_adv
self.eps_lambda = tf.get_variable( # TODO: need a better way
'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda))
self.new_eps_lambda = tf.placeholder(tf.float32, [])
self.assign_eps_lambda = self.eps_lambda.assign(
0.95 * self.eps_lambda + 0.05 * self.new_eps_lambda)
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
entropy = not_pad * sum(entropies)
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
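    # adv is the stop-gradient d-step advantage estimate: discounted rewards
    # over the rollout plus the bootstrapped value d steps ahead, minus the
    # current value baseline.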
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * log_probs
critic_loss = -adv * baseline_values
regularizer = -self.tau * entropy
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
regularizer = tf.reduce_mean(
tf.reduce_sum(regularizer * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss + regularizer)
raw_loss = tf.reduce_mean( # TODO
tf.reduce_sum(not_pad * policy_loss, 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
    tf.summary.scalar('critic_loss',
                      tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
class PCL(ActorCritic):
"""PCL implementation.
Implements vanilla PCL, Unified PCL, and Trust PCL depending
on provided inputs.
"""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
target_log_probs = not_pad * tf.stop_gradient(sum(target_log_probs))
relative_log_probs = not_pad * (log_probs - target_log_probs)
# Prepend.
not_pad = tf.concat([tf.ones([self.rollout - 1, batch_size]),
not_pad], 0)
rewards = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
rewards], 0)
value_estimates = tf.concat(
[self.gamma ** tf.expand_dims(
tf.range(float(self.rollout - 1), 0, -1), 1) *
tf.ones([self.rollout - 1, batch_size]) *
value_estimates[0:1, :],
value_estimates], 0)
log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
log_probs], 0)
prev_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
prev_log_probs], 0)
relative_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
relative_log_probs], 0)
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
sum_log_probs = discounted_future_sum(log_probs, self.gamma, self.rollout)
sum_prev_log_probs = discounted_future_sum(prev_log_probs, self.gamma, self.rollout)
sum_relative_log_probs = discounted_future_sum(
relative_log_probs, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = (
- self.tau * sum_log_probs
- self.eps_lambda * sum_relative_log_probs
+ sum_rewards + last_values)
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * sum_log_probs
critic_loss = -adv * (baseline_values - last_values)
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
# loss for gradient calculation
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
# actual quantity we're trying to minimize
raw_loss = tf.reduce_mean(
tf.reduce_sum(not_pad * adv * (-baseline_values + future_values), 0))
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.histogram('future_values', future_values)
tf.summary.histogram('baseline_values', baseline_values)
tf.summary.histogram('advantages', adv)
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
    tf.summary.scalar('critic_loss',
                      tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', tf.reduce_mean(raw_loss))
tf.summary.scalar('eps_lambda', self.eps_lambda)
return (loss, raw_loss,
future_values[self.rollout - 1:, :],
gradient_ops, tf.summary.merge_all())
class TRPO(ActorCritic):
"""TRPO."""
def get(self, rewards, pads, values, final_values,
log_probs, prev_log_probs, target_log_probs,
entropies, logits):
not_pad = 1 - pads
batch_size = tf.shape(rewards)[1]
rewards = not_pad * rewards
value_estimates = not_pad * values
log_probs = not_pad * sum(log_probs)
prev_log_probs = not_pad * prev_log_probs
sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
last_values = shift_values(value_estimates, self.gamma, self.rollout,
final_values)
future_values = sum_rewards + last_values
baseline_values = value_estimates
adv = tf.stop_gradient(-baseline_values + future_values)
if self.clip_adv:
adv = tf.minimum(self.clip_adv, tf.maximum(-self.clip_adv, adv))
policy_loss = -adv * tf.exp(log_probs - prev_log_probs)
critic_loss = -adv * baseline_values
policy_loss = tf.reduce_mean(
tf.reduce_sum(policy_loss * not_pad, 0))
critic_loss = tf.reduce_mean(
tf.reduce_sum(critic_loss * not_pad, 0))
raw_loss = policy_loss
# loss for gradient calculation
if self.policy_weight == 0:
policy_loss = 0.0
elif self.critic_weight == 0:
critic_loss = 0.0
loss = (self.policy_weight * policy_loss +
self.critic_weight * critic_loss)
gradient_ops = self.training_ops(
loss, learning_rate=self.learning_rate)
tf.summary.histogram('log_probs', tf.reduce_sum(log_probs, 0))
tf.summary.histogram('rewards', tf.reduce_sum(rewards, 0))
tf.summary.scalar('avg_rewards',
tf.reduce_mean(tf.reduce_sum(rewards, 0)))
tf.summary.scalar('policy_loss',
tf.reduce_mean(tf.reduce_sum(not_pad * policy_loss)))
    tf.summary.scalar('critic_loss',
                      tf.reduce_mean(tf.reduce_sum(not_pad * critic_loss)))
tf.summary.scalar('loss', loss)
tf.summary.scalar('raw_loss', raw_loss)
return (loss, raw_loss, future_values,
gradient_ops, tf.summary.merge_all())
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizers mostly for value estimate.
Gradient Descent optimizer
LBFGS optimizer
Best Fit optimizer
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import scipy.optimize
def var_size(v):
return int(np.prod([int(d) for d in v.shape]))
def gradients(loss, var_list):
grads = tf.gradients(loss, var_list)
return [g if g is not None else tf.zeros(v.shape)
for g, v in zip(grads, var_list)]
def flatgrad(loss, var_list):
grads = gradients(loss, var_list)
return tf.concat([tf.reshape(grad, [-1])
for (v, grad) in zip(var_list, grads)
if grad is not None], 0)
def get_flat(var_list):
return tf.concat([tf.reshape(v, [-1]) for v in var_list], 0)
def set_from_flat(var_list, flat_theta):
shapes = [v.shape for v in var_list]
sizes = [var_size(v) for v in var_list]
start = 0
assigns = []
for (shape, size, v) in zip(shapes, sizes, var_list):
assigns.append(v.assign(
tf.reshape(flat_theta[start:start + size], shape)))
start += size
assert start == sum(sizes)
return tf.group(*assigns)
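# Minimal usage sketch (hypothetical, not part of the original file): round-trip
# two variables through get_flat / set_from_flat in a TF1 graph and session.
def _demo_flat_roundtrip():
  a = tf.get_variable('demo_a', [2, 3], initializer=tf.zeros_initializer())
  b = tf.get_variable('demo_b', [4], initializer=tf.zeros_initializer())
  flat_theta = tf.placeholder(tf.float32, [var_size(a) + var_size(b)])
  assign_op = set_from_flat([a, b], flat_theta)
  read_op = get_flat([a, b])
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_op, feed_dict={flat_theta: np.arange(10, dtype=np.float32)})
    return sess.run(read_op)  # -> [0., 1., ..., 9.], in variable order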
class LbfgsOptimization(object):
def __init__(self, max_iter=25, mix_frac=1.0):
self.max_iter = max_iter
self.mix_frac = mix_frac
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
self.loss_flat_gradient = flatgrad(self.raw_loss, var_list)
self.flat_vars = get_flat(var_list)
self.set_vars = set_from_flat(var_list, self.flat_theta)
def optimize(self, sess, feed_dict):
old_theta = sess.run(self.flat_vars)
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
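    # Regress the value net toward a mix of the bootstrapped targets and its own
    # previous predictions; mix_frac=1.0 means fitting the targets directly.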
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
def calc_loss_and_grad(theta):
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
loss, grad = sess.run([self.raw_loss, self.loss_flat_gradient],
feed_dict=feed_dict)
grad = grad.astype('float64')
return loss, grad
theta, _, _ = scipy.optimize.fmin_l_bfgs_b(
calc_loss_and_grad, old_theta, maxiter=self.max_iter)
sess.run(self.set_vars, feed_dict={self.flat_theta: theta})
class GradOptimization(object):
def __init__(self, learning_rate=0.001, max_iter=25, mix_frac=1.0):
self.learning_rate = learning_rate
self.max_iter = max_iter
self.mix_frac = mix_frac
def get_optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.learning_rate,
epsilon=2e-4)
def setup_placeholders(self):
self.flat_theta = tf.placeholder(tf.float32, [None], 'flat_theta')
self.intended_values = tf.placeholder(tf.float32, [None], 'intended_values')
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.setup_placeholders()
self.values = values
self.targets = targets
self.raw_loss = (tf.reduce_sum((1 - pads) * tf.square(values - self.intended_values))
/ tf.reduce_sum(1 - pads))
opt = self.get_optimizer()
params = var_list
grads = tf.gradients(self.raw_loss, params)
self.gradient_ops = opt.apply_gradients(zip(grads, params))
def optimize(self, sess, feed_dict):
old_values, targets = sess.run([self.values, self.targets], feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
feed_dict = dict(feed_dict)
feed_dict[self.intended_values] = intended_values
    for _ in range(self.max_iter):
sess.run(self.gradient_ops, feed_dict=feed_dict)
class BestFitOptimization(object):
def __init__(self, mix_frac=1.0):
self.mix_frac = mix_frac
def setup_placeholders(self):
self.new_regression_weight = tf.placeholder(
tf.float32, self.regression_weight.shape)
def setup(self, var_list, values, targets, pads,
inputs, regression_weight):
self.values = values
self.targets = targets
self.inputs = inputs
self.regression_weight = regression_weight
self.setup_placeholders()
self.update_regression_weight = tf.assign(
self.regression_weight, self.new_regression_weight)
def optimize(self, sess, feed_dict):
reg_input, reg_weight, old_values, targets = sess.run(
[self.inputs, self.regression_weight, self.values, self.targets],
feed_dict=feed_dict)
intended_values = targets * self.mix_frac + old_values * (1 - self.mix_frac)
# taken from rllab
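    # Ridge-regularized least squares: solve (X^T X + reg * I) w = X^T y for the
    # linear value weights, increasing reg until the solution is finite.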
reg_coeff = 1e-5
for _ in range(5):
best_fit_weight = np.linalg.lstsq(
reg_input.T.dot(reg_input) +
reg_coeff * np.identity(reg_input.shape[1]),
reg_input.T.dot(intended_values))[0]
if not np.any(np.isnan(best_fit_weight)):
break
reg_coeff *= 10
if len(best_fit_weight.shape) == 1:
best_fit_weight = np.expand_dims(best_fit_weight, -1)
sess.run(self.update_regression_weight,
feed_dict={self.new_regression_weight: best_fit_weight})
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Policy neural network.
Implements a network that takes in observations and produces actions and
log-probabilities according to a sampling-distribution parameterization.
"""
import tensorflow as tf
import numpy as np
class Policy(object):
def __init__(self, env_spec, internal_dim,
fixed_std=True, recurrent=True,
input_prev_actions=True):
self.env_spec = env_spec
self.internal_dim = internal_dim
self.rnn_state_dim = self.internal_dim
self.fixed_std = fixed_std
self.recurrent = recurrent
self.input_prev_actions = input_prev_actions
self.matrix_init = tf.truncated_normal_initializer(stddev=0.01)
self.vector_init = tf.constant_initializer(0.0)
@property
def input_dim(self):
return (self.env_spec.total_obs_dim +
self.env_spec.total_sampled_act_dim * self.input_prev_actions)
@property
def output_dim(self):
return self.env_spec.total_sampling_act_dim
def get_cell(self):
"""Get RNN cell."""
self.cell_input_dim = self.internal_dim // 2
cell = tf.contrib.rnn.LSTMCell(self.cell_input_dim,
state_is_tuple=False,
reuse=tf.get_variable_scope().reuse)
cell = tf.contrib.rnn.OutputProjectionWrapper(
cell, self.output_dim,
reuse=tf.get_variable_scope().reuse)
return cell
def core(self, obs, prev_internal_state, prev_actions):
"""Core neural network taking in inputs and outputting sampling
distribution parameters."""
batch_size = tf.shape(obs[0])[0]
if not self.recurrent:
prev_internal_state = tf.zeros([batch_size, self.rnn_state_dim])
cell = self.get_cell()
b = tf.get_variable('input_bias', [self.cell_input_dim],
initializer=self.vector_init)
cell_input = tf.nn.bias_add(tf.zeros([batch_size, self.cell_input_dim]), b)
for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
w = tf.get_variable('w_state%d' % i, [obs_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(obs_type):
cell_input += tf.matmul(tf.one_hot(obs[i], obs_dim), w)
elif self.env_spec.is_box(obs_type):
cell_input += tf.matmul(obs[i], w)
else:
assert False
if self.input_prev_actions:
if self.env_spec.combine_actions: # TODO(ofir): clean this up
prev_action = prev_actions[0]
for i, action_dim in enumerate(self.env_spec.orig_act_dims):
act = tf.mod(prev_action, action_dim)
w = tf.get_variable('w_prev_action%d' % i, [action_dim, self.cell_input_dim],
initializer=self.matrix_init)
cell_input += tf.matmul(tf.one_hot(act, action_dim), w)
prev_action = tf.to_int32(prev_action / action_dim)
else:
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
w = tf.get_variable('w_prev_action%d' % i, [act_dim, self.cell_input_dim],
initializer=self.matrix_init)
if self.env_spec.is_discrete(act_type):
cell_input += tf.matmul(tf.one_hot(prev_actions[i], act_dim), w)
elif self.env_spec.is_box(act_type):
cell_input += tf.matmul(prev_actions[i], w)
else:
assert False
output, next_state = cell(cell_input, prev_internal_state)
return output, next_state
def sample_action(self, logits, sampling_dim,
act_dim, act_type, greedy=False):
"""Sample an action from a distribution."""
if self.env_spec.is_discrete(act_type):
if greedy:
act = tf.argmax(logits, 1)
else:
act = tf.reshape(tf.multinomial(logits, 1), [-1])
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
if greedy:
act = means
else:
batch_size = tf.shape(logits)[0]
act = means + std * tf.random_normal([batch_size, act_dim])
else:
assert False
return act
def entropy(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate entropy of distribution."""
if self.env_spec.is_discrete(act_type):
entropy = tf.reduce_sum(
-tf.nn.softmax(logits) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
entropy = tf.reduce_sum(
0.5 * (1 + tf.log(2 * np.pi * tf.square(std))), -1)
else:
assert False
return entropy
def self_kl(self, logits,
sampling_dim, act_dim, act_type):
"""Calculate KL of distribution with itself.
    Used only for the gradients.
"""
if self.env_spec.is_discrete(act_type):
probs = tf.nn.softmax(logits)
log_probs = tf.nn.log_softmax(logits)
self_kl = tf.reduce_sum(
tf.stop_gradient(probs) *
(tf.stop_gradient(log_probs) - log_probs), -1)
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
my_means = tf.stop_gradient(means)
my_std = tf.stop_gradient(std)
self_kl = tf.reduce_sum(
tf.log(std / my_std) +
(tf.square(my_std) + tf.square(my_means - means)) /
(2.0 * tf.square(std)) - 0.5,
-1)
else:
assert False
return self_kl
def log_prob_action(self, action, logits,
sampling_dim, act_dim, act_type):
"""Calculate log-prob of action sampled from distribution."""
if self.env_spec.is_discrete(act_type):
act_log_prob = tf.reduce_sum(
tf.one_hot(action, act_dim) * tf.nn.log_softmax(logits), -1)
elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
act_log_prob = (- 0.5 * tf.log(2 * np.pi * tf.square(std))
- 0.5 * tf.square(action - means) / tf.square(std))
act_log_prob = tf.reduce_sum(act_log_prob, -1)
else:
assert False
return act_log_prob
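  # For Box actions the expression above is the diagonal-Gaussian log-density
  #   log p(a) = sum_i [-0.5 * log(2 * pi * std_i**2)
  #                     - 0.5 * (a_i - mean_i)**2 / std_i**2].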
def sample_actions(self, output, actions=None, greedy=False):
"""Sample all actions given output of core network."""
sampled_actions = []
logits = []
log_probs = []
entropy = []
self_kl = []
start_idx = 0
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
if self.fixed_std and self.env_spec.is_box(act_type):
act_logits = output[:, start_idx:start_idx + act_dim]
log_std = tf.get_variable('std%d' % i, [1, sampling_dim // 2])
      # fix the standard deviations to a learned variable independent of the
      # observation (the `0 * act_logits` below just broadcasts it to the batch)
act_logits = tf.concat(
[act_logits,
1e-6 + tf.exp(log_std) + 0 * act_logits], 1)
else:
act_logits = output[:, start_idx:start_idx + sampling_dim]
if actions is None:
act = self.sample_action(act_logits, sampling_dim,
act_dim, act_type,
greedy=greedy)
else:
act = actions[i]
ent = self.entropy(act_logits, sampling_dim, act_dim, act_type)
kl = self.self_kl(act_logits, sampling_dim, act_dim, act_type)
act_log_prob = self.log_prob_action(
act, act_logits,
sampling_dim, act_dim, act_type)
sampled_actions.append(act)
logits.append(act_logits)
log_probs.append(act_log_prob)
entropy.append(ent)
self_kl.append(kl)
start_idx += sampling_dim
assert start_idx == self.env_spec.total_sampling_act_dim
return sampled_actions, logits, log_probs, entropy, self_kl
def get_kl(self, my_logits, other_logits):
"""Calculate KL between one policy output and another."""
kl = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
single_my_logits = my_logits[i]
single_other_logits = other_logits[i]
if self.env_spec.is_discrete(act_type):
my_probs = tf.nn.softmax(single_my_logits)
my_log_probs = tf.nn.log_softmax(single_my_logits)
other_log_probs = tf.nn.log_softmax(single_other_logits)
my_kl = tf.reduce_sum(my_probs * (my_log_probs - other_log_probs), -1)
elif self.env_spec.is_box(act_type):
        my_means = single_my_logits[:, :sampling_dim // 2]
        my_std = single_my_logits[:, sampling_dim // 2:]
        other_means = single_other_logits[:, :sampling_dim // 2]
        other_std = single_other_logits[:, sampling_dim // 2:]
my_kl = tf.reduce_sum(
tf.log(other_std / my_std) +
(tf.square(my_std) + tf.square(my_means - other_means)) /
(2.0 * tf.square(other_std)) - 0.5,
-1)
else:
assert False
kl.append(my_kl)
return kl
def single_step(self, prev, cur, greedy=False):
"""Single RNN step. Equivalently, single-time-step sampled actions."""
prev_internal_state, prev_actions, _, _, _, _ = prev
obs, actions = cur # state observed and action taken at this time step
# feed into RNN cell
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(
(prev_internal_state, prev_actions, None, None, None, None),
(obs, None), greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
initial_actions = [act[0] for act in all_actions]
all_actions = [tf.concat([act[1:], act[0:1]], 0)
for act in all_actions] # "final" action is dummy
(internal_states, _, logits, log_probs,
entropies, self_kls) = tf.scan(
self.single_step,
(all_obs, all_actions),
initializer=self.get_initializer(
batch_size, initial_state, initial_actions))
# remove "final" computations
log_probs = [log_prob[:-1] for log_prob in log_probs]
entropies = [entropy[:-1] for entropy in entropies]
self_kls = [self_kl[:-1] for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls
def get_initializer(self, batch_size, initial_state, initial_actions):
"""Get initializer for RNN."""
logits_init = []
log_probs_init = []
for act_dim, act_type in self.env_spec.act_dims_and_types:
sampling_dim = self.env_spec.sampling_dim(act_dim, act_type)
logits_init.append(tf.zeros([batch_size, sampling_dim]))
log_probs_init.append(tf.zeros([batch_size]))
entropy_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
self_kl_init = [tf.zeros([batch_size]) for _ in self.env_spec.act_dims]
return (initial_state,
tuple(initial_actions),
tuple(logits_init), tuple(log_probs_init),
tuple(entropy_init),
tuple(self_kl_init))
def calculate_kl(self, my_logits, other_logits):
"""Calculate KL between one policy and another on batch of episodes."""
batch_size = tf.shape(my_logits[0])[1]
time_length = tf.shape(my_logits[0])[0]
reshaped_my_logits = [
tf.reshape(my_logit, [batch_size * time_length, -1])
for my_logit in my_logits]
reshaped_other_logits = [
tf.reshape(other_logit, [batch_size * time_length, -1])
for other_logit in other_logits]
kl = self.get_kl(reshaped_my_logits, reshaped_other_logits)
kl = [tf.reshape(kkl, [time_length, batch_size])
for kkl in kl]
return kl
class MLPPolicy(Policy):
"""Non-recurrent policy."""
def get_cell(self):
self.cell_input_dim = self.internal_dim
def mlp(cell_input, prev_internal_state):
w1 = tf.get_variable('w1', [self.cell_input_dim, self.internal_dim])
b1 = tf.get_variable('b1', [self.internal_dim])
w2 = tf.get_variable('w2', [self.internal_dim, self.internal_dim])
b2 = tf.get_variable('b2', [self.internal_dim])
w3 = tf.get_variable('w3', [self.internal_dim, self.internal_dim])
b3 = tf.get_variable('b3', [self.internal_dim])
proj = tf.get_variable(
'proj', [self.internal_dim, self.output_dim])
hidden = cell_input
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w1), b1))
hidden = tf.tanh(tf.nn.bias_add(tf.matmul(hidden, w2), b2))
output = tf.matmul(hidden, proj)
return output, hidden
return mlp
def single_step(self, obs, actions, prev_actions, greedy=False):
"""Single step."""
batch_size = tf.shape(obs[0])[0]
prev_internal_state = tf.zeros([batch_size, self.internal_dim])
output, next_state = self.core(
obs, prev_internal_state, prev_actions)
# sample actions with values and log-probs
(actions, logits, log_probs,
entropy, self_kl) = self.sample_actions(
output, actions=actions, greedy=greedy)
return (next_state, tuple(actions), tuple(logits), tuple(log_probs),
tuple(entropy), tuple(self_kl))
def sample_step(self, obs, prev_internal_state, prev_actions, greedy=False):
"""Sample single step from policy."""
(next_state, sampled_actions, logits, log_probs,
entropies, self_kls) = self.single_step(obs, None, prev_actions,
greedy=greedy)
return next_state, sampled_actions
def multi_step(self, all_obs, initial_state, all_actions):
"""Calculate log-probs and other calculations on batch of episodes."""
batch_size = tf.shape(initial_state)[0]
time_length = tf.shape(all_obs[0])[0]
# first reshape inputs as a single batch
reshaped_obs = []
for obs, (obs_dim, obs_type) in zip(all_obs, self.env_spec.obs_dims_and_types):
if self.env_spec.is_discrete(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size]))
elif self.env_spec.is_box(obs_type):
reshaped_obs.append(tf.reshape(obs, [time_length * batch_size, obs_dim]))
reshaped_act = []
reshaped_prev_act = []
for i, (act_dim, act_type) in enumerate(self.env_spec.act_dims_and_types):
act = tf.concat([all_actions[i][1:], all_actions[i][0:1]], 0)
prev_act = all_actions[i]
if self.env_spec.is_discrete(act_type):
reshaped_act.append(tf.reshape(act, [time_length * batch_size]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size]))
elif self.env_spec.is_box(act_type):
reshaped_act.append(
tf.reshape(act, [time_length * batch_size, act_dim]))
reshaped_prev_act.append(
tf.reshape(prev_act, [time_length * batch_size, act_dim]))
# now inputs go into single step as one large batch
(internal_states, _, logits, log_probs,
entropies, self_kls) = self.single_step(
reshaped_obs, reshaped_act, reshaped_prev_act)
# reshape the outputs back to original time-major format
internal_states = tf.reshape(internal_states, [time_length, batch_size, -1])
logits = [tf.reshape(logit, [time_length, batch_size, -1])
for logit in logits]
log_probs = [tf.reshape(log_prob, [time_length, batch_size])[:-1]
for log_prob in log_probs]
entropies = [tf.reshape(ent, [time_length, batch_size])[:-1]
for ent in entropies]
self_kls = [tf.reshape(self_kl, [time_length, batch_size])[:-1]
for self_kl in self_kls]
return internal_states, logits, log_probs, entropies, self_kls