Unverified Commit c9f03bf6 authored by Neal Wu, committed by GitHub

Merge pull request #5870 from ofirnachum/master

Add training and eval code for efficient-hrl
parents 2c181308 052361de
#-*-Python-*-
create_maze_env.env_name = "AntPush"
context_range = (%CONTEXT_RANGE_MIN, %CONTEXT_RANGE_MAX)
meta_context_range = ((-16, -4), (16, 20))
RESET_EPISODE_PERIOD = 500
RESET_ENV_PERIOD = 1
# End episode every N steps
UvfAgent.reset_episode_cond_fn = @every_n_steps
every_n_steps.n = %RESET_EPISODE_PERIOD
train_uvf.max_steps_per_episode = %RESET_EPISODE_PERIOD
# Do a manual reset every N episodes
UvfAgent.reset_env_cond_fn = @every_n_episodes
every_n_episodes.n = %RESET_ENV_PERIOD
every_n_episodes.steps_per_episode = %RESET_EPISODE_PERIOD
## Config defaults
EVAL_MODES = ["eval2"]
## Config agent
CONTEXT = @agent/Context
META_CONTEXT = @meta/Context
## Config agent context
agent/Context.context_ranges = [%context_range]
agent/Context.context_shapes = [%SUBGOAL_DIM]
agent/Context.meta_action_every_n = 10
agent/Context.samplers = {
"train": [@train/DirectionSampler],
"explore": [@train/DirectionSampler],
}
agent/Context.context_transition_fn = @relative_context_transition_fn
agent/Context.context_multi_transition_fn = @relative_context_multi_transition_fn
agent/Context.reward_fn = @uvf/negative_distance
## Config meta context
meta/Context.context_ranges = [%meta_context_range]
meta/Context.context_shapes = [2]
meta/Context.samplers = {
"train": [@eval2/ConstantSampler],
"explore": [@eval2/ConstantSampler],
"eval2": [@eval2/ConstantSampler],
}
meta/Context.reward_fn = @task/negative_distance
## Config rewards
task/negative_distance.state_indices = [0, 1]
task/negative_distance.relative_context = False
task/negative_distance.diff = False
task/negative_distance.offset = 0.0
## Config samplers
train/RandomSampler.context_range = %meta_context_range
train/DirectionSampler.context_range = %context_range
train/DirectionSampler.k = %SUBGOAL_DIM
relative_context_transition_fn.k = %SUBGOAL_DIM
relative_context_multi_transition_fn.k = %SUBGOAL_DIM
MetaAgent.k = %SUBGOAL_DIM
eval2/ConstantSampler.value = [0, 19]
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = False
CONTEXT_RANGE_MIN = -10
CONTEXT_RANGE_MAX = 10
SUBGOAL_DIM = 2
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = True
IMAGES = False
CONTEXT_RANGE_MIN = (-10, -10, -0.5, -1, -1, -1, -1, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3)
CONTEXT_RANGE_MAX = ( 10, 10, 0.5, 1, 1, 1, 1, 0.5, 0.3, 0.5, 0.3, 0.5, 0.3, 0.5, 0.3)
SUBGOAL_DIM = 15
META_EXPLORE_NOISE = 1.0
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = False
IMAGES = False
CONTEXT_RANGE_MIN = -10
CONTEXT_RANGE_MAX = 10
SUBGOAL_DIM = 2
META_EXPLORE_NOISE = 5.0
StatePreprocess.trainable = True
StatePreprocess.state_preprocess_net = @state_preprocess_net
StatePreprocess.action_embed_net = @action_embed_net
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = False
IMAGES = False
CONTEXT_RANGE_MIN = -10
CONTEXT_RANGE_MAX = 10
SUBGOAL_DIM = 2
META_EXPLORE_NOISE = 1.0
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
# NOTE: For best training, low-level exploration (uvf_add_noise_fn.stddev)
# should be reduced to around 0.1.
create_maze_env.env_name = "PointMaze"
context_range_min = -10
context_range_max = 10
context_range = (%context_range_min, %context_range_max)
meta_context_range = ((-2, -2), (10, 10))
RESET_EPISODE_PERIOD = 500
RESET_ENV_PERIOD = 1
# End episode every N steps
UvfAgent.reset_episode_cond_fn = @every_n_steps
every_n_steps.n = %RESET_EPISODE_PERIOD
train_uvf.max_steps_per_episode = %RESET_EPISODE_PERIOD
# Do a manual reset every N episodes
UvfAgent.reset_env_cond_fn = @every_n_episodes
every_n_episodes.n = %RESET_ENV_PERIOD
every_n_episodes.steps_per_episode = %RESET_EPISODE_PERIOD
## Config defaults
EVAL_MODES = ["eval1", "eval2", "eval3"]
## Config agent
CONTEXT = @agent/Context
META_CONTEXT = @meta/Context
## Config agent context
agent/Context.context_ranges = [%context_range]
agent/Context.context_shapes = [%SUBGOAL_DIM]
agent/Context.meta_action_every_n = 10
agent/Context.samplers = {
"train": [@train/DirectionSampler],
"explore": [@train/DirectionSampler],
"eval1": [@uvf_eval1/ConstantSampler],
"eval2": [@uvf_eval2/ConstantSampler],
"eval3": [@uvf_eval3/ConstantSampler],
}
agent/Context.context_transition_fn = @relative_context_transition_fn
agent/Context.context_multi_transition_fn = @relative_context_multi_transition_fn
agent/Context.reward_fn = @uvf/negative_distance
## Config meta context
meta/Context.context_ranges = [%meta_context_range]
meta/Context.context_shapes = [2]
meta/Context.samplers = {
"train": [@train/RandomSampler],
"explore": [@train/RandomSampler],
"eval1": [@eval1/ConstantSampler],
"eval2": [@eval2/ConstantSampler],
"eval3": [@eval3/ConstantSampler],
}
meta/Context.reward_fn = @task/negative_distance
## Config rewards
task/negative_distance.state_indices = [0, 1]
task/negative_distance.relative_context = False
task/negative_distance.diff = False
task/negative_distance.offset = 0.0
## Config samplers
train/RandomSampler.context_range = %meta_context_range
train/DirectionSampler.context_range = %context_range
train/DirectionSampler.k = %SUBGOAL_DIM
relative_context_transition_fn.k = %SUBGOAL_DIM
relative_context_multi_transition_fn.k = %SUBGOAL_DIM
MetaAgent.k = %SUBGOAL_DIM
eval1/ConstantSampler.value = [8, 0]
eval2/ConstantSampler.value = [8, 8]
eval3/ConstantSampler.value = [0, 8]
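# Editor's note: a minimal sketch of how a launcher could consume the gin
# configs above with gin-config. The file names, the extra binding, and the
# assumption that the configurable modules (context, samplers, agents) have
# already been imported so their gin names are registered are illustrative,
# not part of this commit.
import gin.tf

gin.parse_config_files_and_bindings(
    config_files=['base_uvf.gin', 'point_maze.gin'],  # hypothetical paths
    bindings=['create_maze_env.env_name = "PointMaze"'])
# After parsing, configurables such as agent/Context, meta/Context and the
# samplers referenced above are constructed with these bindings when the
# training code instantiates them.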
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Context for Universal Value Function agents.
A context specifies a list of contextual variables, each with its
own sampling and reward computation methods.
Examples of contextual variables include
goal states, reward combination vectors, etc.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from tf_agents import specs
import gin.tf
from utils import utils as uvf_utils
@gin.configurable
class Context(object):
"""Base context."""
VAR_NAME = 'action'
def __init__(self,
tf_env,
context_ranges=None,
context_shapes=None,
state_indices=None,
variable_indices=None,
gamma_index=None,
settable_context=False,
timers=None,
samplers=None,
reward_weights=None,
reward_fn=None,
random_sampler_mode='random',
normalizers=None,
context_transition_fn=None,
context_multi_transition_fn=None,
meta_action_every_n=None):
self._tf_env = tf_env
self.variable_indices = variable_indices
self.gamma_index = gamma_index
self._settable_context = settable_context
self.timers = timers
self._context_transition_fn = context_transition_fn
self._context_multi_transition_fn = context_multi_transition_fn
self._random_sampler_mode = random_sampler_mode
# assign specs
self._obs_spec = self._tf_env.observation_spec()
self._context_shapes = tuple([
shape if shape is not None else self._obs_spec.shape
for shape in context_shapes
])
self.context_specs = tuple([
specs.TensorSpec(dtype=self._obs_spec.dtype, shape=shape)
for shape in self._context_shapes
])
if context_ranges is not None:
self.context_ranges = context_ranges
else:
self.context_ranges = [None] * len(self._context_shapes)
self.context_as_action_specs = tuple([
specs.BoundedTensorSpec(
shape=shape,
dtype=(tf.float32 if self._obs_spec.dtype in
[tf.float32, tf.float64] else self._obs_spec.dtype),
minimum=context_range[0],
maximum=context_range[-1])
for shape, context_range in zip(self._context_shapes, self.context_ranges)
])
if state_indices is not None:
self.state_indices = state_indices
else:
self.state_indices = [None] * len(self._context_shapes)
if self.variable_indices is not None and self.n != len(
self.variable_indices):
raise ValueError(
'variable_indices (%s) must have the same length as contexts (%s).' %
(self.variable_indices, self.context_specs))
assert self.n == len(self.context_ranges)
assert self.n == len(self.state_indices)
# assign reward/sampler fns
self._sampler_fns = dict()
self._samplers = dict()
self._reward_fns = dict()
# assign reward fns
self._add_custom_reward_fns()
reward_weights = reward_weights or None
self._reward_fn = self._make_reward_fn(reward_fn, reward_weights)
# assign samplers
self._add_custom_sampler_fns()
for mode, sampler_fns in samplers.items():
self._make_sampler_fn(sampler_fns, mode)
# create normalizers
if normalizers is None:
self._normalizers = [None] * len(self.context_specs)
else:
self._normalizers = [
normalizer(tf.zeros(shape=spec.shape, dtype=spec.dtype))
if normalizer is not None else None
for normalizer, spec in zip(normalizers, self.context_specs)
]
assert self.n == len(self._normalizers)
self.meta_action_every_n = meta_action_every_n
# create vars
self.context_vars = {}
self.timer_vars = {}
self.create_vars(self.VAR_NAME)
self.t = tf.Variable(
tf.zeros(shape=(), dtype=tf.int32), name='num_timer_steps')
def _add_custom_reward_fns(self):
pass
def _add_custom_sampler_fns(self):
pass
def sample_random_contexts(self, batch_size):
"""Sample random batch contexts."""
assert self._random_sampler_mode is not None
return self.sample_contexts(self._random_sampler_mode, batch_size)[0]
def sample_contexts(self, mode, batch_size, state=None, next_state=None,
**kwargs):
"""Sample a batch of contexts.
Args:
mode: A string representing the mode [`train`, `explore`, `eval`].
batch_size: Batch size.
state: Optional [batch_size, num_state_dims] tensor of current states.
next_state: Optional [batch_size, num_state_dims] tensor of next states.
**kwargs: Additional keyword arguments forwarded to the sampler fns.
Returns:
Two lists of [batch_size, num_context_dims] contexts.
"""
contexts, next_contexts = self._sampler_fns[mode](
batch_size, state=state, next_state=next_state,
**kwargs)
self._validate_contexts(contexts)
self._validate_contexts(next_contexts)
return contexts, next_contexts
def compute_rewards(self, mode, states, actions, rewards, next_states,
contexts):
"""Compute context-based rewards.
Args:
mode: A string representing the mode ['uvf', 'task'].
states: A [batch_size, num_state_dims] tensor.
actions: A [batch_size, num_action_dims] tensor.
rewards: A [batch_size] tensor representing unmodified rewards.
next_states: A [batch_size, num_state_dims] tensor.
contexts: A list of [batch_size, num_context_dims] tensors.
Returns:
A [batch_size] tensor representing rewards.
"""
return self._reward_fn(states, actions, rewards, next_states,
contexts)
def _make_reward_fn(self, reward_fns_list, reward_weights):
"""Returns a fn that computes rewards.
Args:
reward_fns_list: A fn or a list of reward fns.
reward_weights: A list of reward weights.
"""
if not isinstance(reward_fns_list, (list, tuple)):
reward_fns_list = [reward_fns_list]
if reward_weights is None:
reward_weights = [1.0] * len(reward_fns_list)
assert len(reward_fns_list) == len(reward_weights)
reward_fns_list = [
self._custom_reward_fns[fn] if isinstance(fn, (str,)) else fn
for fn in reward_fns_list
]
def reward_fn(*args, **kwargs):
"""Returns rewards, discounts."""
reward_tuples = [
reward_fn(*args, **kwargs) for reward_fn in reward_fns_list
]
rewards_list = [reward_tuple[0] for reward_tuple in reward_tuples]
discounts_list = [reward_tuple[1] for reward_tuple in reward_tuples]
ndims = max([r.shape.ndims for r in rewards_list])
if ndims > 1:  # expand reward shapes to allow broadcasting
for i in range(len(rewards_list)):
for _ in range(ndims - rewards_list[i].shape.ndims):
rewards_list[i] = tf.expand_dims(rewards_list[i], axis=-1)
for _ in range(ndims - discounts_list[i].shape.ndims):
discounts_list[i] = tf.expand_dims(discounts_list[i], axis=-1)
rewards = tf.add_n(
[r * tf.to_float(w) for r, w in zip(rewards_list, reward_weights)])
discounts = discounts_list[0]
for d in discounts_list[1:]:
discounts *= d
return rewards, discounts
return reward_fn
def _make_sampler_fn(self, sampler_cls_list, mode):
"""Returns a fn that samples a list of context vars.
Args:
sampler_cls_list: A list of sampler classes.
mode: A string representing the operating mode.
"""
if not isinstance(sampler_cls_list, (list, tuple)):
sampler_cls_list = [sampler_cls_list]
self._samplers[mode] = []
sampler_fns = []
for spec, sampler in zip(self.context_specs, sampler_cls_list):
if isinstance(sampler, (str,)):
sampler_fn = self._custom_sampler_fns[sampler]
else:
sampler_fn = sampler(context_spec=spec)
self._samplers[mode].append(sampler_fn)
sampler_fns.append(sampler_fn)
def batch_sampler_fn(batch_size, state=None, next_state=None, **kwargs):
"""Sampler fn."""
contexts_tuples = [
sampler(batch_size, state=state, next_state=next_state, **kwargs)
for sampler in sampler_fns]
contexts = [c[0] for c in contexts_tuples]
next_contexts = [c[1] for c in contexts_tuples]
contexts = [
normalizer.update_apply(c) if normalizer is not None else c
for normalizer, c in zip(self._normalizers, contexts)
]
next_contexts = [
normalizer.apply(c) if normalizer is not None else c
for normalizer, c in zip(self._normalizers, next_contexts)
]
return contexts, next_contexts
self._sampler_fns[mode] = batch_sampler_fn
def set_env_context_op(self, context, disable_unnormalizer=False):
"""Returns a TensorFlow op that sets the environment context.
Args:
context: A list of context Tensor variables.
disable_unnormalizer: Disable unnormalization.
Returns:
A TensorFlow op that sets the environment context.
"""
ret_val = np.array(1.0, dtype=np.float32)
if not self._settable_context:
return tf.identity(ret_val)
if not disable_unnormalizer:
context = [
normalizer.unapply(tf.expand_dims(c, 0))[0]
if normalizer is not None else c
for normalizer, c in zip(self._normalizers, context)
]
def set_context_func(*env_context_values):
tf.logging.info('[set_env_context_op] Setting gym environment context.')
# pylint: disable=protected-access
self.gym_env.set_context(*env_context_values)
return ret_val
# pylint: enable=protected-access
with tf.name_scope('set_env_context'):
set_op = tf.py_func(set_context_func, context, tf.float32,
name='set_env_context_py_func')
set_op.set_shape([])
return set_op
def set_replay(self, replay):
"""Set replay buffer for samplers.
Args:
replay: A replay buffer.
"""
for _, samplers in self._samplers.items():
for sampler in samplers:
sampler.set_replay(replay)
def get_clip_fns(self):
"""Returns a list of clip fns for contexts.
Returns:
A list of fns that clip context tensors.
"""
clip_fns = []
for context_range in self.context_ranges:
def clip_fn(var_, range_=context_range):
"""Clip a tensor."""
if range_ is None:
clipped_var = tf.identity(var_)
elif isinstance(range_[0], (int, long, float, list, np.ndarray)):
clipped_var = tf.clip_by_value(
var_,
range_[0],
range_[1],)
else: raise NotImplementedError(range_)
return clipped_var
clip_fns.append(clip_fn)
return clip_fns
def _validate_contexts(self, contexts):
"""Validate if contexts have right specs.
Args:
contexts: A list of [batch_size, num_context_dim] tensors.
Raises:
ValueError: If shape or dtype mismatches that of spec.
"""
for i, (context, spec) in enumerate(zip(contexts, self.context_specs)):
if context[0].shape != spec.shape:
raise ValueError('contexts[%d] has invalid shape %s wrt spec shape %s' %
(i, context[0].shape, spec.shape))
if context.dtype != spec.dtype:
raise ValueError('contexts[%d] has invalid dtype %s wrt spec dtype %s' %
(i, context.dtype, spec.dtype))
def context_multi_transition_fn(self, contexts, **kwargs):
"""Returns multiple future contexts starting from a batch."""
assert self._context_multi_transition_fn
return self._context_multi_transition_fn(contexts, None, None, **kwargs)
def step(self, mode, agent=None, action_fn=None, **kwargs):
"""Returns [next_contexts..., next_timer] list of ops.
Args:
mode: a string representing the mode=[train, explore, eval].
agent: Optional meta agent; if provided, its context is stepped first and
action_fn is used to sample a new goal every `meta_action_every_n` steps.
action_fn: Optional fn mapping (next_state, context) to a meta action (goal).
**kwargs: kwargs for context_transition_fn.
Returns:
a list of ops that set the context.
"""
if agent is None:
ops = []
if self._context_transition_fn is not None:
def sampler_fn():
samples = self.sample_contexts(mode, 1)[0]
return [s[0] for s in samples]
values = self._context_transition_fn(self.vars, self.t, sampler_fn, **kwargs)
ops += [tf.assign(var, value) for var, value in zip(self.vars, values)]
ops.append(tf.assign_add(self.t, 1)) # increment timer
return ops
else:
ops = agent.tf_context.step(mode, **kwargs)
state = kwargs['state']
next_state = kwargs['next_state']
state_repr = kwargs['state_repr']
next_state_repr = kwargs['next_state_repr']
with tf.control_dependencies(ops): # Step high level context before computing low level one.
# Get the context transition function output.
values = self._context_transition_fn(self.vars, self.t, None,
state=state_repr,
next_state=next_state_repr)
# Select a new goal every C steps, otherwise use context transition.
low_level_context = [
tf.cond(tf.equal(self.t % self.meta_action_every_n, 0),
lambda: tf.cast(action_fn(next_state, context=None), tf.float32),
lambda: values)]
ops = [tf.assign(var, value)
for var, value in zip(self.vars, low_level_context)]
with tf.control_dependencies(ops):
return [tf.assign_add(self.t, 1)] # increment timer
return ops
def reset(self, mode, agent=None, action_fn=None, state=None):
"""Returns ops that reset the context.
Args:
mode: a string representing the mode=[train, explore, eval].
Returns:
a list of ops that reset the context.
"""
if agent is None:
values = self.sample_contexts(mode=mode, batch_size=1)[0]
if values is None:
return []
values = [value[0] for value in values]
values[0] = uvf_utils.tf_print(
values[0],
values,
message='context:reset, mode=%s' % mode,
first_n=10,
name='context:reset:%s' % mode)
all_ops = []
for _, context_vars in sorted(self.context_vars.items()):
ops = [tf.assign(var, value) for var, value in zip(context_vars, values)]
all_ops += ops
all_ops.append(self.set_env_context_op(values))
all_ops.append(tf.assign(self.t, 0)) # reset timer
return all_ops
else:
ops = agent.tf_context.reset(mode)
# NOTE: The code is currently written in such a way that the higher level
# policy does not provide a low-level context until the second
# observation. Instead, we just zero-out low-level contexts.
for key, context_vars in sorted(self.context_vars.items()):
ops += [tf.assign(var, tf.zeros_like(var)) for var, meta_var in
zip(context_vars, agent.tf_context.context_vars[key])]
ops.append(tf.assign(self.t, 0)) # reset timer
return ops
def create_vars(self, name, agent=None):
"""Create tf variables for contexts.
Args:
name: Name of the variables.
Returns:
A list of [num_context_dims] tensors.
"""
if agent is not None:
meta_vars = agent.create_vars(name)
else:
meta_vars = {}
assert name not in self.context_vars, ('Conflict! %s is already '
'initialized.') % name
self.context_vars[name] = tuple([
tf.Variable(
tf.zeros(shape=spec.shape, dtype=spec.dtype),
name='%s_context_%d' % (name, i))
for i, spec in enumerate(self.context_specs)
])
return self.context_vars[name], meta_vars
@property
def n(self):
return len(self.context_specs)
@property
def vars(self):
return self.context_vars[self.VAR_NAME]
# pylint: disable=protected-access
@property
def gym_env(self):
return self._tf_env.pyenv._gym_env
@property
def tf_env(self):
return self._tf_env
# pylint: enable=protected-access
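# Editor's sketch: a plain-numpy illustration of the low-level goal update that
# Context.step performs when a meta agent is present -- a new goal is requested
# from the meta policy every `meta_action_every_n` steps, otherwise the
# configured relative transition g' = g + s - s' is applied. The function name
# and arguments are illustrative; this is not used by the class above.
def _example_low_level_goal_update(goal, state, next_state, t,
                                   meta_action_every_n, meta_policy_fn):
  """Illustrative numpy version of the goal scheduling in Context.step."""
  import numpy as np
  goal, state, next_state = (np.asarray(x, dtype=np.float32)
                             for x in (goal, state, next_state))
  k = goal.shape[-1]
  if t % meta_action_every_n == 0:
    # Every C steps the meta policy proposes a fresh goal.
    return np.asarray(meta_policy_fn(next_state), dtype=np.float32)
  # In between, the goal is carried over relative to the new state.
  return goal + state[:k] - next_state[:k]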
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Context functions.
Given the current contexts, timer and context sampler, returns new contexts
after an environment step. This can be used to define a high-level policy
that controls contexts as its actions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import gin.tf
import utils as uvf_utils
@gin.configurable
def periodic_context_fn(contexts, timer, sampler_fn, period=1):
"""Periodically samples contexts.
Args:
contexts: a list of [num_context_dims] tensor variables representing
current contexts.
timer: a scalar integer tensor variable holding the current time step.
sampler_fn: a sampler function that samples a list of [num_context_dims]
tensors.
period: (integer) period of update.
Returns:
a list of [num_context_dims] tensors.
"""
contexts = list(contexts[:]) # create copy
return tf.cond(tf.equal(tf.mod(timer, period), 0), sampler_fn, lambda: contexts)
@gin.configurable
def timer_context_fn(contexts,
timer,
sampler_fn,
period=1,
timer_index=-1,
debug=False):
"""Samples contexts based on timer in contexts.
Args:
contexts: a list of [num_context_dims] tensor variables representing
current contexts.
timer: a scalar integer tensor variable holding the current time step.
sampler_fn: a sampler function that samples a list of [num_context_dims]
tensors.
period: (integer) period of update; actual period = `period` + 1.
timer_index: (integer) Index of the context list entry that holds the timer.
debug: (boolean) Print debug messages.
Returns:
a list of [num_context_dims] tensors.
"""
contexts = list(contexts[:]) # create copy
cond = tf.equal(contexts[timer_index][0], 0)
def reset():
"""Sample context and reset the timer."""
new_contexts = sampler_fn()
new_contexts[timer_index] = tf.zeros_like(
contexts[timer_index]) + period
return new_contexts
def update():
"""Decrement the timer."""
contexts[timer_index] -= 1
return contexts
values = tf.cond(cond, reset, update)
if debug:
values[0] = uvf_utils.tf_print(
values[0],
values + [timer],
'timer_context_fn',
first_n=200,
name='timer_context_fn:contexts')
return values
@gin.configurable
def relative_context_transition_fn(
contexts, timer, sampler_fn,
k=2, state=None, next_state=None,
**kwargs):
"""Contexts updated to be relative to next state.
"""
contexts = list(contexts[:]) # create copy
assert len(contexts) == 1
new_contexts = [
tf.concat(
[contexts[0][:k] + state[:k] - next_state[:k],
contexts[0][k:]], -1)]
return new_contexts
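# Editor's note: the update above keeps the absolute target s + g fixed while
# re-expressing the goal relative to the new state. A small numpy check of that
# invariant (illustrative only, not part of the original code):
def _example_relative_goal_invariant(goal, state, next_state, k=2):
  """Checks that state[:k] + goal[:k] is preserved by the relative transition."""
  import numpy as np
  goal, state, next_state = (np.asarray(x, dtype=np.float32)
                             for x in (goal, state, next_state))
  new_goal = goal[:k] + state[:k] - next_state[:k]
  assert np.allclose(next_state[:k] + new_goal, state[:k] + goal[:k])
  return new_goal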
@gin.configurable
def relative_context_multi_transition_fn(
contexts, timer, sampler_fn,
k=2, states=None,
**kwargs):
"""Given contexts at first state and sequence of states, derives sequence of all contexts.
"""
contexts = list(contexts[:]) # create copy
assert len(contexts) == 1
contexts = [
tf.concat(
[tf.expand_dims(contexts[0][:, :k] + states[:, 0, :k], 1) - states[:, :, :k],
contexts[0][:, None, k:] * tf.ones_like(states[:, :, :1])], -1)]
return contexts
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Import gin configurable modules.
"""
# pylint: disable=unused-import
from context import context
from context import context_transition_functions
from context import gin_utils
from context import rewards_functions
from context import samplers
# pylint: disable=unused-import
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gin configurable utility functions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import gin.tf
@gin.configurable
def gin_sparse_array(size, values, indices, fill_value=0):
arr = np.zeros(size)
arr.fill(fill_value)
arr[indices] = values
return arr
@gin.configurable
def gin_sum(values):
result = values[0]
for value in values[1:]:
result += value
return result
@gin.configurable
def gin_range(n):
return range(n)
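# Editor's note: illustrative outputs of the three utilities above (the helper
# below is an editorial example, not part of the original file).
def _example_gin_utils():
  """Shows sample outputs of gin_sparse_array, gin_sum and gin_range."""
  arr = gin_sparse_array(size=5, values=[1.0, 2.0], indices=[0, 3],
                         fill_value=0)  # -> array([1., 0., 0., 2., 0.])
  total = gin_sum([1, 2, 3])            # -> 6
  indices = list(gin_range(4))          # -> [0, 1, 2, 3]
  return arr, total, indices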
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Reward shaping functions used by Contexts.
Each reward function should take the following inputs and return new rewards
and discounts.
new_rewards, discounts = reward_fn(states, actions, rewards,
next_states, contexts)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import gin.tf
def summarize_stats(stats):
"""Summarize a dictionary of variables.
Args:
stats: a dictionary of {name: tensor} to compute stats over.
"""
for name, stat in stats.items():
mean = tf.reduce_mean(stat)
tf.summary.scalar('mean_%s' % name, mean)
tf.summary.scalar('max_%s' % name, tf.reduce_max(stat))
tf.summary.scalar('min_%s' % name, tf.reduce_min(stat))
std = tf.sqrt(tf.reduce_mean(tf.square(stat)) - tf.square(mean) + 1e-10)
tf.summary.scalar('std_%s' % name, std)
tf.summary.histogram(name, stat)
def index_states(states, indices):
"""Return indexed states.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
indices: (a list of Numpy integer array) Indices of states dimensions
to be mapped.
Returns:
A [batch_size, num_indices] Tensor representing the batch of indexed states.
"""
if indices is None:
return states
indices = tf.constant(indices, dtype=tf.int32)
return tf.gather(states, indices=indices, axis=1)
def record_tensor(tensor, indices, stats, name='states'):
"""Record specified tensor dimensions into stats.
Args:
tensor: A [batch_size, num_dims] Tensor.
indices: (a list of integers) Indices of dimensions to record.
stats: A dictionary holding stats.
name: (string) Name of tensor.
"""
if indices is None:
indices = range(tensor.shape.as_list()[1])
for index in indices:
stats['%s_%02d' % (name, index)] = tensor[:, index]
@gin.configurable
def potential_rewards(states,
actions,
rewards,
next_states,
contexts,
gamma=1.0,
reward_fn=None):
"""Return the potential-based rewards.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
gamma: Reward discount.
reward_fn: A reward function.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions # unused args
gamma = tf.to_float(gamma)
rewards_tp1, discounts = reward_fn(None, None, rewards, next_states, contexts)
rewards, _ = reward_fn(None, None, rewards, states, contexts)
return -rewards + gamma * rewards_tp1, discounts
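# Editor's note: potential_rewards above implements potential-based shaping,
# r_shaped = gamma * phi(s') - phi(s), with the wrapped reward_fn acting as the
# potential phi. With gamma = 1 the shaped rewards telescope over a trajectory;
# the small numpy check below is illustrative only.
def _example_potential_shaping_telescopes(potentials):
  """Checks that sum_t (phi(s_{t+1}) - phi(s_t)) equals phi(s_T) - phi(s_0)."""
  import numpy as np
  potentials = np.asarray(potentials, dtype=np.float32)
  shaped = potentials[1:] - potentials[:-1]  # gamma = 1 case
  assert np.isclose(shaped.sum(), potentials[-1] - potentials[0])
  return shaped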
@gin.configurable
def timed_rewards(states,
actions,
rewards,
next_states,
contexts,
reward_fn=None,
dense=False,
timer_index=-1):
"""Return the timed rewards.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
reward_fn: A reward function.
dense: (boolean) Provide dense rewards or sparse rewards at time = 0.
timer_index: (integer) The context list index that specifies timer.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
assert contexts[timer_index].get_shape().as_list()[1] == 1
timers = contexts[timer_index][:, 0]
rewards, discounts = reward_fn(states, actions, rewards, next_states,
contexts)
terminates = tf.to_float(timers <= 0) # if terminate set 1, else set 0
for _ in range(rewards.shape.ndims - 1):
terminates = tf.expand_dims(terminates, axis=-1)
if not dense:
rewards *= terminates # if terminate, return rewards, else return 0
discounts *= (tf.to_float(1.0) - terminates)
return rewards, discounts
@gin.configurable
def reset_rewards(states,
actions,
rewards,
next_states,
contexts,
reset_index=0,
reset_state=None,
reset_reward_function=None,
include_forward_rewards=True,
include_reset_rewards=True):
"""Returns the rewards for a forward/reset agent.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
reset_index: (integer) The context list index that specifies reset.
reset_state: Reset state.
reset_reward_function: Reward function for reset step.
include_forward_rewards: Include the rewards from the forward pass.
include_reset_rewards: Include the rewards from the reset pass.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
reset_state = tf.constant(
reset_state, dtype=next_states.dtype, shape=next_states.shape)
reset_states = tf.expand_dims(reset_state, 0)
def true_fn():
if include_reset_rewards:
return reset_reward_function(states, actions, rewards, next_states,
[reset_states] + contexts[1:])
else:
return tf.zeros_like(rewards), tf.ones_like(rewards)
def false_fn():
if include_forward_rewards:
return plain_rewards(states, actions, rewards, next_states, contexts)
else:
return tf.zeros_like(rewards), tf.ones_like(rewards)
rewards, discounts = tf.cond(
tf.cast(contexts[reset_index][0, 0], dtype=tf.bool), true_fn, false_fn)
return rewards, discounts
@gin.configurable
def tanh_similarity(states,
actions,
rewards,
next_states,
contexts,
mse_scale=1.0,
state_scales=1.0,
goal_scales=1.0,
summarize=False):
"""Returns the similarity between next_states and contexts using tanh and mse.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
mse_scale: A float, to scale mse before tanh.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
summarize: (boolean) enable summary ops.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, rewards # Unused
mse = tf.reduce_mean(tf.squared_difference(next_states * state_scales,
contexts[0] * goal_scales), -1)
tanh = tf.tanh(mse_scale * mse)
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
tf.summary.histogram('mse', mse)
tf.summary.scalar('mean_tanh', tf.reduce_mean(tanh))
tf.summary.histogram('tanh', tanh)
rewards = tf.to_float(1 - tanh)
return rewards, tf.ones_like(rewards)
@gin.configurable
def negative_mse(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
summarize=False):
"""Returns the negative mean square error between next_states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
summarize: (boolean) enable summary ops.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, rewards # Unused
mse = tf.reduce_mean(tf.squared_difference(next_states * state_scales,
contexts[0] * goal_scales), -1)
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
tf.summary.histogram('mse', mse)
rewards = tf.to_float(-mse)
return rewards, tf.ones_like(rewards)
@gin.configurable
def negative_distance(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
reward_scales=1.0,
weight_index=None,
weight_vector=None,
summarize=False,
termination_epsilon=1e-4,
state_indices=None,
goal_indices=None,
vectorize=False,
relative_context=False,
diff=False,
norm='L2',
epsilon=1e-10,
bonus_epsilon=0., #5.,
offset=0.0):
"""Returns the negative euclidean distance between next_states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
weight_index: (integer) The context list index that specifies weight.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
summarize: (boolean) enable summary ops.
termination_epsilon: terminate if dist is less than this quantity.
state_indices: (a list of integers) list of state indices to select.
goal_indices: (a list of integers) list of goal indices to select.
vectorize: Return a vectorized form.
relative_context: (boolean) treat the goal as relative to the current state.
diff: (boolean) return the decrease in distance (old_dist - dist) rather
than the negative distance itself.
norm: L1 or L2.
epsilon: small offset to ensure non-negative/zero distance.
bonus_epsilon: if positive, adds a +1 bonus whenever dist < bonus_epsilon.
offset: constant added to the returned reward.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions, rewards # Unused
stats = {}
record_tensor(next_states, state_indices, stats, 'next_states')
states = index_states(states, state_indices)
next_states = index_states(next_states, state_indices)
goals = index_states(contexts[0], goal_indices)
if relative_context:
goals = states + goals
sq_dists = tf.squared_difference(next_states * state_scales,
goals * goal_scales)
old_sq_dists = tf.squared_difference(states * state_scales,
goals * goal_scales)
record_tensor(sq_dists, None, stats, 'sq_dists')
if weight_vector is not None:
sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
old_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
if weight_index is not None:
#sq_dists *= contexts[weight_index]
weights = tf.abs(index_states(contexts[0], weight_index))
#weights /= tf.reduce_sum(weights, -1, keepdims=True)
sq_dists *= weights
old_sq_dists *= weights
if norm == 'L1':
dist = tf.sqrt(sq_dists + epsilon)
old_dist = tf.sqrt(old_sq_dists + epsilon)
if not vectorize:
dist = tf.reduce_sum(dist, -1)
old_dist = tf.reduce_sum(old_dist, -1)
elif norm == 'L2':
if vectorize:
dist = sq_dists
old_dist = old_sq_dists
else:
dist = tf.reduce_sum(sq_dists, -1)
old_dist = tf.reduce_sum(old_sq_dists, -1)
dist = tf.sqrt(dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
old_dist = tf.sqrt(old_dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
else:
raise NotImplementedError(norm)
discounts = dist > termination_epsilon
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
tf.summary.histogram('dist', dist)
summarize_stats(stats)
bonus = tf.to_float(dist < bonus_epsilon)
dist *= reward_scales
old_dist *= reward_scales
if diff:
return bonus + offset + tf.to_float(old_dist - dist), tf.to_float(discounts)
return bonus + offset + tf.to_float(-dist), tf.to_float(discounts)
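# Editor's sketch: a numpy rendering of the default path of negative_distance
# above (L2 norm, no weighting, vectorize=False), assuming the inputs have
# already been index-selected. In the gin configs, uvf/negative_distance uses
# relative_context=True while task/negative_distance uses absolute goals over
# state dims [0, 1]. Discounts are omitted; illustrative only.
def _example_negative_distance(states, next_states, goal,
                               relative_context=False, diff=False,
                               offset=0.0, epsilon=1e-10):
  """Numpy sketch of the default L2 branch of negative_distance."""
  import numpy as np
  states, next_states, goal = (np.asarray(x, dtype=np.float32)
                               for x in (states, next_states, goal))
  if relative_context:
    goal = states + goal
  dist = np.sqrt(np.sum((next_states - goal) ** 2, axis=-1) + epsilon)
  old_dist = np.sqrt(np.sum((states - goal) ** 2, axis=-1) + epsilon)
  if diff:
    return offset + (old_dist - dist)
  return offset - dist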
@gin.configurable
def cosine_similarity(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
reward_scales=1.0,
normalize_states=True,
normalize_goals=True,
weight_index=None,
weight_vector=None,
summarize=False,
state_indices=None,
goal_indices=None,
offset=0.0):
"""Returns the cosine similarity between next_states - states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
weight_index: (integer) The context list index that specifies weight.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
normalize_states: (boolean) l2-normalize the state displacement.
normalize_goals: (boolean) l2-normalize the goal direction.
summarize: (boolean) enable summary ops.
state_indices: (a list of integers) list of state indices to select.
goal_indices: (a list of integers) list of goal indices to select.
offset: constant added to the similarity reward.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions, rewards # Unused
stats = {}
record_tensor(next_states, state_indices, stats, 'next_states')
states = index_states(states, state_indices)
next_states = index_states(next_states, state_indices)
goals = index_states(contexts[0], goal_indices)
if weight_vector is not None:
goals *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
if weight_index is not None:
weights = tf.abs(index_states(contexts[0], weight_index))
goals *= weights
direction_vec = next_states - states
if normalize_states:
direction_vec = tf.nn.l2_normalize(direction_vec, -1)
goal_vec = goals
if normalize_goals:
goal_vec = tf.nn.l2_normalize(goal_vec, -1)
similarity = tf.reduce_sum(goal_vec * direction_vec, -1)
discounts = tf.ones_like(similarity)
return offset + tf.to_float(similarity), tf.to_float(discounts)
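# Editor's sketch: the core of cosine_similarity above with default scales and
# no weighting -- the cosine between the displacement s' - s and the goal
# direction. Illustrative numpy only; the discounts (all ones) are omitted.
def _example_cosine_similarity(states, next_states, goal, offset=0.0):
  """Cosine between next_states - states and the goal direction."""
  import numpy as np
  states, next_states, goal = (np.asarray(x, dtype=np.float32)
                               for x in (states, next_states, goal))
  direction = next_states - states
  direction = direction / (np.linalg.norm(direction, axis=-1, keepdims=True)
                           + 1e-12)
  goal_dir = goal / (np.linalg.norm(goal, axis=-1, keepdims=True) + 1e-12)
  return offset + np.sum(goal_dir * direction, axis=-1)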
@gin.configurable
def diff_distance(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
reward_scales=1.0,
weight_index=None,
weight_vector=None,
summarize=False,
termination_epsilon=1e-4,
state_indices=None,
goal_indices=None,
norm='L2',
epsilon=1e-10):
"""Returns the difference in euclidean distance between states/next_states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
weight_index: (integer) The context list index that specifies weight.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
summarize: (boolean) enable summary ops.
termination_epsilon: terminate if dist is less than this quantity.
state_indices: (a list of integers) list of state indices to select.
goal_indices: (a list of integers) list of goal indices to select.
norm: L1 or L2.
epsilon: small offset to ensure non-negative/zero distance.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions, rewards # Unused
stats = {}
record_tensor(next_states, state_indices, stats, 'next_states')
next_states = index_states(next_states, state_indices)
states = index_states(states, state_indices)
goals = index_states(contexts[0], goal_indices)
next_sq_dists = tf.squared_difference(next_states * state_scales,
goals * goal_scales)
sq_dists = tf.squared_difference(states * state_scales,
goals * goal_scales)
record_tensor(sq_dists, None, stats, 'sq_dists')
if weight_vector is not None:
next_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
if weight_index is not None:
next_sq_dists *= contexts[weight_index]
sq_dists *= contexts[weight_index]
if norm == 'L1':
next_dist = tf.sqrt(next_sq_dists + epsilon)
dist = tf.sqrt(sq_dists + epsilon)
next_dist = tf.reduce_sum(next_dist, -1)
dist = tf.reduce_sum(dist, -1)
elif norm == 'L2':
next_dist = tf.reduce_sum(next_sq_dists, -1)
next_dist = tf.sqrt(next_dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
dist = tf.reduce_sum(sq_dists, -1)
dist = tf.sqrt(dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
else:
raise NotImplementedError(norm)
discounts = next_dist > termination_epsilon
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
tf.summary.histogram('dist', dist)
summarize_stats(stats)
diff = dist - next_dist
diff *= reward_scales
return tf.to_float(diff), tf.to_float(discounts)
@gin.configurable
def binary_indicator(states,
actions,
rewards,
next_states,
contexts,
termination_epsilon=1e-4,
offset=0,
epsilon=1e-10,
state_indices=None,
summarize=False):
"""Returns 0/1 by checking if next_states and contexts overlap.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
termination_epsilon: terminate if dist is less than this quantity.
offset: Offset the rewards.
epsilon: small offset to ensure non-negative/zero distance.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions # unused args
next_states = index_states(next_states, state_indices)
dist = tf.reduce_sum(tf.squared_difference(next_states, contexts[0]), -1)
dist = tf.sqrt(dist + epsilon)
discounts = dist > termination_epsilon
rewards = tf.logical_not(discounts)
rewards = tf.to_float(rewards) + offset
return tf.to_float(rewards), tf.ones_like(tf.to_float(discounts)) #tf.to_float(discounts)
@gin.configurable
def plain_rewards(states, actions, rewards, next_states, contexts):
"""Returns the given rewards.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, next_states, contexts # Unused
return rewards, tf.ones_like(rewards)
@gin.configurable
def ctrl_rewards(states,
actions,
rewards,
next_states,
contexts,
reward_scales=1.0):
"""Returns the negative control cost.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, rewards, contexts # Unused
if actions is None:
rewards = tf.to_float(tf.zeros(shape=next_states.shape[:1]))
else:
rewards = -tf.reduce_sum(tf.square(actions), axis=1)
rewards *= reward_scales
rewards = tf.to_float(rewards)
return rewards, tf.ones_like(rewards)
@gin.configurable
def diff_rewards(
states,
actions,
rewards,
next_states,
contexts,
state_indices=None,
goal_index=0,):
"""Returns (next_states - goals) as a batched vector reward."""
del states, rewards, actions # Unused
if state_indices is not None:
next_states = index_states(next_states, state_indices)
rewards = tf.to_float(next_states - contexts[goal_index])
return rewards, tf.ones_like(rewards)
@gin.configurable
def state_rewards(states,
actions,
rewards,
next_states,
contexts,
weight_index=None,
state_indices=None,
weight_vector=1.0,
offset_vector=0.0,
summarize=False):
"""Returns the rewards that are linear mapping of next_states.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
weight_index: (integer) Index of contexts lists that specify weighting.
state_indices: (a list of Numpy integer array) Indices of states dimensions
to be mapped.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
offset_vector: (a number or a list or Numpy array) The offset vector.
summarize: (boolean) enable summary ops.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, rewards # unused args
stats = {}
record_tensor(next_states, state_indices, stats)
next_states = index_states(next_states, state_indices)
weight = tf.constant(
weight_vector, dtype=next_states.dtype, shape=next_states[0].shape)
weights = tf.expand_dims(weight, 0)
offset = tf.constant(
offset_vector, dtype=next_states.dtype, shape=next_states[0].shape)
offsets = tf.expand_dims(offset, 0)
if weight_index is not None:
weights *= contexts[weight_index]
rewards = tf.to_float(tf.reduce_sum(weights * (next_states+offsets), axis=1))
if summarize:
with tf.name_scope('RewardFn/'):
summarize_stats(stats)
return rewards, tf.ones_like(rewards)
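# Editor's note: Context._make_reward_fn (context.py above) combines several of
# the reward fns defined in this file by taking a weight-scaled sum of rewards
# and the product of discounts. The numpy illustration below mirrors that
# combination; it is an editorial sketch, not part of the original file.
def _example_combine_rewards(reward_tuples, weights=None):
  """Weighted sum of rewards and product of discounts, as in _make_reward_fn."""
  import numpy as np
  rewards_list = [np.asarray(r, dtype=np.float32) for r, _ in reward_tuples]
  discounts_list = [np.asarray(d, dtype=np.float32) for _, d in reward_tuples]
  if weights is None:
    weights = [1.0] * len(rewards_list)
  combined_rewards = sum(w * r for w, r in zip(weights, rewards_list))
  combined_discounts = discounts_list[0]
  for d in discounts_list[1:]:
    combined_discounts = combined_discounts * d
  return combined_rewards, combined_discounts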
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Samplers for Contexts.
Each sampler class should define __call__(batch_size).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
slim = tf.contrib.slim
import gin.tf
@gin.configurable
class BaseSampler(object):
"""Base sampler."""
def __init__(self, context_spec, context_range=None, k=2, scope='sampler'):
"""Construct a base sampler.
Args:
context_spec: A context spec.
context_range: A tuple of (minval, maxval), where minval and maxval are
floats or Numpy arrays with the same shape as the context.
k: (integer) Subgoal dimension used by some samplers (e.g. DirectionSampler).
scope: A string denoting scope.
"""
self._context_spec = context_spec
self._context_range = context_range
self._k = k
self._scope = scope
def __call__(self, batch_size, **kwargs):
raise NotImplementedError
def set_replay(self, replay=None):
pass
def _validate_contexts(self, contexts):
"""Validate if contexts have right spec.
Args:
contexts: A [batch_size, num_contexts_dim] tensor.
Raises:
ValueError: If shape or dtype mismatches that of spec.
"""
if contexts[0].shape != self._context_spec.shape:
raise ValueError('contexts has invalid shape %s wrt spec shape %s' %
(contexts[0].shape, self._context_spec.shape))
if contexts.dtype != self._context_spec.dtype:
raise ValueError('contexts has invalid dtype %s wrt spec dtype %s' %
(contexts.dtype, self._context_spec.dtype))
@gin.configurable
class ZeroSampler(BaseSampler):
"""Zero sampler."""
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
contexts = tf.zeros(
dtype=self._context_spec.dtype,
shape=[
batch_size,
] + self._context_spec.shape.as_list())
return contexts, contexts
@gin.configurable
class BinarySampler(BaseSampler):
"""Binary sampler."""
def __init__(self, probs=0.5, *args, **kwargs):
"""Constructor."""
super(BinarySampler, self).__init__(*args, **kwargs)
self._probs = probs
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context."""
spec = self._context_spec
contexts = tf.random_uniform(
shape=[
batch_size,
] + spec.shape.as_list(), dtype=tf.float32)
contexts = tf.cast(tf.greater(contexts, self._probs), dtype=spec.dtype)
return contexts, contexts
@gin.configurable
class RandomSampler(BaseSampler):
"""Random sampler."""
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
context_range = self._context_range
if isinstance(context_range[0], (int, float)):
contexts = tf.random_uniform(
shape=[
batch_size,
] + spec.shape.as_list(),
minval=context_range[0],
maxval=context_range[1],
dtype=spec.dtype)
elif isinstance(context_range[0], (list, tuple, np.ndarray)):
assert len(spec.shape.as_list()) == 1
assert spec.shape.as_list()[0] == len(context_range[0])
assert spec.shape.as_list()[0] == len(context_range[1])
contexts = tf.concat(
[
tf.random_uniform(
shape=[
batch_size, 1,
] + spec.shape.as_list()[1:],
minval=context_range[0][i],
maxval=context_range[1][i],
dtype=spec.dtype) for i in range(spec.shape.as_list()[0])
],
axis=1)
else: raise NotImplementedError(context_range)
self._validate_contexts(contexts)
state, next_state = kwargs['state'], kwargs['next_state']
if state is not None and next_state is not None:
pass
#contexts = tf.concat(
# [tf.random_normal(tf.shape(state[:, :self._k]), dtype=tf.float64) +
# tf.random_shuffle(state[:, :self._k]),
# contexts[:, self._k:]], 1)
return contexts, contexts
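# Editor's sketch: a numpy analogue of the per-dimension branch above, for a
# context_range given as a (min_vector, max_vector) pair such as the 15-D
# subgoal bounds in the Ant configs. Illustrative only.
def _example_random_context_batch(batch_size, context_range, seed=None):
  """Uniformly samples a [batch_size, dim] batch within per-dim (min, max)."""
  import numpy as np
  rng = np.random.RandomState(seed)
  minval = np.asarray(context_range[0], dtype=np.float32)
  maxval = np.asarray(context_range[1], dtype=np.float32)
  samples = rng.uniform(minval, maxval, size=(batch_size,) + minval.shape)
  return samples.astype(np.float32)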
@gin.configurable
class ScheduledSampler(BaseSampler):
"""Scheduled sampler."""
def __init__(self,
scope='default',
values=None,
scheduler='cycle',
scheduler_params=None,
*args, **kwargs):
"""Construct sampler.
Args:
scope: Scope name.
values: A list of numbers or [num_context_dim] Numpy arrays
representing the values to cycle.
scheduler: scheduler type.
scheduler_params: scheduler parameters.
*args: arguments.
**kwargs: keyword arguments.
"""
super(ScheduledSampler, self).__init__(*args, **kwargs)
self._scope = scope
self._values = values
self._scheduler = scheduler
self._scheduler_params = scheduler_params or {}
assert self._values is not None and len(
self._values), 'must provide non-empty values.'
self._n = len(self._values)
# TODO(shanegu): move variable creation outside. resolve tf.cond problem.
self._count = 0
self._i = tf.Variable(
tf.zeros(shape=(), dtype=tf.int32),
name='%s-scheduled_sampler_%d' % (self._scope, self._count))
self._values = tf.constant(self._values, dtype=self._context_spec.dtype)
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
next_op = self._next(self._i)
with tf.control_dependencies([next_op]):
value = self._values[self._i]
if value.get_shape().as_list():
values = tf.tile(
tf.expand_dims(value, 0), (batch_size,) + (1,) * spec.shape.ndims)
else:
values = value + tf.zeros(
shape=[
batch_size,
] + spec.shape.as_list(), dtype=spec.dtype)
self._validate_contexts(values)
self._count += 1
return values, values
def _next(self, i):
"""Return op that increments pointer to next value.
Args:
i: A tensorflow integer variable.
Returns:
Op that increments pointer.
"""
if self._scheduler == 'cycle':
inc = ('inc' in self._scheduler_params and
self._scheduler_params['inc']) or 1
return tf.assign(i, tf.mod(i+inc, self._n))
else:
raise NotImplementedError(self._scheduler)
@gin.configurable
class ReplaySampler(BaseSampler):
"""Replay sampler."""
def __init__(self,
prefetch_queue_capacity=2,
override_indices=None,
state_indices=None,
*args,
**kwargs):
"""Construct sampler.
Args:
prefetch_queue_capacity: Capacity for prefetch queue.
override_indices: Override indices.
state_indices: Select certain indices from state dimension.
*args: arguments.
**kwargs: keyword arguments.
"""
super(ReplaySampler, self).__init__(*args, **kwargs)
self._prefetch_queue_capacity = prefetch_queue_capacity
self._override_indices = override_indices
self._state_indices = state_indices
def set_replay(self, replay):
"""Set replay.
Args:
replay: A replay buffer.
"""
self._replay = replay
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
batch = self._replay.GetRandomBatch(batch_size)
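    # Assumption (not documented in the original): GetRandomBatch is expected to
    # return the stored transition tuple as
    # (states, actions, rewards, discounts, next_states, ...), so index 4
    # selects the next-state tensor used as the sampled context.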
next_states = batch[4]
if self._prefetch_queue_capacity > 0:
batch_queue = slim.prefetch_queue.prefetch_queue(
[next_states],
capacity=self._prefetch_queue_capacity,
name='%s/batch_context_queue' % self._scope)
next_states = batch_queue.dequeue()
if self._override_indices is not None:
      assert self._context_range is not None and isinstance(
          self._context_range[0], (int, float))
next_states = tf.concat(
[
tf.random_uniform(
shape=next_states[:, :1].shape,
minval=self._context_range[0],
maxval=self._context_range[1],
dtype=next_states.dtype)
if i in self._override_indices else next_states[:, i:i + 1]
for i in range(self._context_spec.shape.as_list()[0])
],
axis=1)
if self._state_indices is not None:
next_states = tf.concat(
[
next_states[:, i:i + 1]
for i in range(self._context_spec.shape.as_list()[0])
],
axis=1)
self._validate_contexts(next_states)
return next_states, next_states
@gin.configurable
class TimeSampler(BaseSampler):
"""Time Sampler."""
def __init__(self, minval=0, maxval=1, timestep=-1, *args, **kwargs):
"""Construct sampler.
Args:
minval: Min value integer.
maxval: Max value integer.
timestep: Time step between states and next_states.
*args: arguments.
**kwargs: keyword arguments.
"""
super(TimeSampler, self).__init__(*args, **kwargs)
assert self._context_spec.shape.as_list() == [1]
self._minval = minval
self._maxval = maxval
self._timestep = timestep
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
if self._maxval == self._minval:
contexts = tf.constant(
self._maxval, shape=[batch_size, 1], dtype=tf.int32)
else:
contexts = tf.random_uniform(
shape=[batch_size, 1],
dtype=tf.int32,
maxval=self._maxval,
minval=self._minval)
next_contexts = tf.maximum(contexts + self._timestep, 0)
return tf.cast(
contexts, dtype=self._context_spec.dtype), tf.cast(
next_contexts, dtype=self._context_spec.dtype)
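# Illustrative note (not part of the original file): with e.g. minval=0,
# maxval=100 and timestep=-1, each sample is an integer "remaining time"
#
#   contexts      ~ Uniform{0, ..., 99}        (shape [batch_size, 1])
#   next_contexts = max(contexts - 1, 0)
#
# cast to the context spec's dtype; when maxval == minval the value is constant.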
@gin.configurable
class ConstantSampler(BaseSampler):
"""Constant sampler."""
def __init__(self, value=None, *args, **kwargs):
"""Construct sampler.
Args:
value: A list or Numpy array for values of the constant.
*args: arguments.
**kwargs: keyword arguments.
"""
super(ConstantSampler, self).__init__(*args, **kwargs)
self._value = value
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
value_ = tf.constant(self._value, shape=spec.shape, dtype=spec.dtype)
values = tf.tile(
tf.expand_dims(value_, 0), (batch_size,) + (1,) * spec.shape.ndims)
self._validate_contexts(values)
return values, values
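# Illustrative sketch (hypothetical gin binding, not one of the shipped
# configs): a ConstantSampler can be pinned to a fixed goal, e.g.
#
#   my_eval/ConstantSampler.value = [0.0, 16.0]
#
# after which every sampled batch is that goal tiled batch_size times.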
@gin.configurable
class DirectionSampler(RandomSampler):
"""Direction sampler."""
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
context_range = self._context_range
if isinstance(context_range[0], (int, float)):
contexts = tf.random_uniform(
shape=[
batch_size,
] + spec.shape.as_list(),
minval=context_range[0],
maxval=context_range[1],
dtype=spec.dtype)
elif isinstance(context_range[0], (list, tuple, np.ndarray)):
assert len(spec.shape.as_list()) == 1
assert spec.shape.as_list()[0] == len(context_range[0])
assert spec.shape.as_list()[0] == len(context_range[1])
contexts = tf.concat(
[
tf.random_uniform(
shape=[
batch_size, 1,
] + spec.shape.as_list()[1:],
minval=context_range[0][i],
maxval=context_range[1][i],
dtype=spec.dtype) for i in range(spec.shape.as_list()[0])
],
axis=1)
else: raise NotImplementedError(context_range)
self._validate_contexts(contexts)
if 'sampler_fn' in kwargs:
other_contexts = kwargs['sampler_fn']()
else:
other_contexts = contexts
state, next_state = kwargs['state'], kwargs['next_state']
if state is not None and next_state is not None:
my_context_range = (np.array(context_range[1]) - np.array(context_range[0])) / 2 * np.ones(spec.shape.as_list())
contexts = tf.concat(
[0.1 * my_context_range[:self._k] *
tf.random_normal(tf.shape(state[:, :self._k]), dtype=state.dtype) +
tf.random_shuffle(state[:, :self._k]) - state[:, :self._k],
other_contexts[:, self._k:]], 1)
#contexts = tf.Print(contexts,
# [contexts, tf.reduce_max(contexts, 0),
# tf.reduce_min(state, 0), tf.reduce_max(state, 0)], 'contexts', summarize=15)
      next_contexts = tf.concat(
          [state[:, :self._k] + contexts[:, :self._k] - next_state[:, :self._k],
           other_contexts[:, self._k:]], 1)
      # Override the relative update above: the same sampled context is reused
      # for the next step as well.
      next_contexts = contexts
else:
next_contexts = contexts
return tf.stop_gradient(contexts), tf.stop_gradient(next_contexts)
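# Illustrative summary (not part of the original file): when state and
# next_state are provided, the first k context dimensions are resampled as a
# noisy displacement towards another state in the batch,
#
#   g = shuffle(s)[:k] + 0.1 * half_range * noise - s[:k]
#
# the relative update g' = s[:k] + g - s'[:k] is computed, and then overridden
# so that the same g is reused at the next step.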
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Random policy on an environment."""
import tensorflow as tf
import numpy as np
import random
import create_maze_env
app = tf.app
flags = tf.flags
logging = tf.logging
FLAGS = flags.FLAGS
flags.DEFINE_string('env', 'AntMaze', 'environment name: AntMaze, AntPush, or AntFall')
flags.DEFINE_integer('episode_length', 500, 'episode length')
flags.DEFINE_integer('num_episodes', 50, 'number of episodes')
def get_goal_sample_fn(env_name):
if env_name == 'AntMaze':
    # NOTE: When evaluating (i.e., for the metrics reported in the paper),
    # we use the commented-out goal sampling function below. The uncommented
    # one is only used for training.
#return lambda: np.array([0., 16.])
return lambda: np.random.uniform((-4, -4), (20, 20))
elif env_name == 'AntPush':
return lambda: np.array([0., 19.])
elif env_name == 'AntFall':
return lambda: np.array([0., 27., 4.5])
else:
assert False, 'Unknown env'
def get_reward_fn(env_name):
if env_name == 'AntMaze':
return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5
elif env_name == 'AntPush':
return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5
elif env_name == 'AntFall':
return lambda obs, goal: -np.sum(np.square(obs[:3] - goal)) ** 0.5
else:
assert False, 'Unknown env'
def success_fn(last_reward):
return last_reward > -5.0
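# Illustrative note (not part of the original file): because the reward is the
# negative Euclidean distance to the goal, success_fn marks an episode as
# successful when the final step ends within 5 distance units of the goal.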
class EnvWithGoal(object):
def __init__(self, base_env, env_name):
self.base_env = base_env
self.goal_sample_fn = get_goal_sample_fn(env_name)
self.reward_fn = get_reward_fn(env_name)
self.goal = None
def reset(self):
obs = self.base_env.reset()
self.goal = self.goal_sample_fn()
return np.concatenate([obs, self.goal])
def step(self, a):
obs, _, done, info = self.base_env.step(a)
reward = self.reward_fn(obs, self.goal)
return np.concatenate([obs, self.goal]), reward, done, info
@property
def action_space(self):
return self.base_env.action_space
def run_environment(env_name, episode_length, num_episodes):
env = EnvWithGoal(
create_maze_env.create_maze_env(env_name),
env_name)
def action_fn(obs):
action_space = env.action_space
action_space_mean = (action_space.low + action_space.high) / 2.0
action_space_magn = (action_space.high - action_space.low) / 2.0
random_action = (action_space_mean +
action_space_magn *
np.random.uniform(low=-1.0, high=1.0,
size=action_space.shape))
return random_action
rewards = []
successes = []
for ep in range(num_episodes):
rewards.append(0.0)
successes.append(False)
obs = env.reset()
for _ in range(episode_length):
obs, reward, done, _ = env.step(action_fn(obs))
rewards[-1] += reward
successes[-1] = success_fn(reward)
if done:
break
logging.info('Episode %d reward: %.2f, Success: %d', ep + 1, rewards[-1], successes[-1])
logging.info('Average Reward over %d episodes: %.2f',
num_episodes, np.mean(rewards))
logging.info('Average Success over %d episodes: %.2f',
num_episodes, np.mean(successes))
def main(unused_argv):
logging.set_verbosity(logging.INFO)
run_environment(FLAGS.env, FLAGS.episode_length, FLAGS.num_episodes)
if __name__ == '__main__':
app.run()
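# Example invocation (the script name is an assumption; use the actual file
# name in the repository):
#
#   python random_policy.py --env=AntPush --episode_length=500 --num_episodes=10
#
# This rolls out a uniform-random policy and logs per-episode reward plus the
# average reward and success rate over all episodes.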
@@ -21,8 +21,21 @@ from gym import utils
from gym.envs.mujoco import mujoco_env
def q_inv(a):
return [a[0], -a[1], -a[2], -a[3]]
def q_mult(a, b):  # Multiply two quaternions.
w = a[0] * b[0] - a[1] * b[1] - a[2] * b[2] - a[3] * b[3]
i = a[0] * b[1] + a[1] * b[0] + a[2] * b[3] - a[3] * b[2]
j = a[0] * b[2] - a[1] * b[3] + a[2] * b[0] + a[3] * b[1]
k = a[0] * b[3] + a[1] * b[2] - a[2] * b[1] + a[3] * b[0]
return [w, i, j, k]
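# Illustrative sketch (not part of the original file): rotating a vector v by a
# unit quaternion q uses the sandwich product q * v * q^{-1}, with v embedded as
# the pure quaternion [0, vx, vy, vz]:
#
#   rotated = q_mult(q_mult(q, [0, vx, vy, vz]), q_inv(q))[1:]
#
# get_ori below applies this to the body-frame x-axis [0, 1, 0, 0] and keeps
# the x/y components to recover the heading angle via atan2.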
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "ant.xml"
ORI_IND = 3
def __init__(self, file_path=None, expose_all_qpos=True,
expose_body_coms=None, expose_body_comvels=None):
@@ -101,3 +114,21 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def viewer_setup(self):
self.viewer.cam.distance = self.model.stat.extent * 0.5
def get_ori(self):
ori = [0, 1, 0, 0]
rot = self.model.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND + 4] # take the quaternion
ori = q_mult(q_mult(rot, ori), q_inv(rot))[1:3] # project onto x-y plane
ori = math.atan2(ori[1], ori[0])
return ori
def set_xy(self, xy):
qpos = np.copy(self.physics.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.physics.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
return self.physics.data.qpos[:2]
@@ -13,8 +13,8 @@
# limitations under the License.
# ==============================================================================
from maze_env import MazeEnv
from ant import AntEnv
from environments.maze_env import MazeEnv
from environments.ant import AntEnv
class AntMazeEnv(MazeEnv):
@@ -13,18 +13,85 @@
# limitations under the License.
# ==============================================================================
from ant_maze_env import AntMazeEnv
from environments.ant_maze_env import AntMazeEnv
from environments.point_maze_env import PointMazeEnv
import tensorflow as tf
import gin.tf
from tf_agents.environments import gym_wrapper
from tf_agents.environments import tf_py_environment
@gin.configurable
def create_maze_env(env_name=None, top_down_view=False):
n_bins = 0
manual_collision = False
if env_name.startswith('Ego'):
n_bins = 8
env_name = env_name[3:]
if env_name.startswith('Ant'):
cls = AntMazeEnv
env_name = env_name[3:]
maze_size_scaling = 8
elif env_name.startswith('Point'):
cls = PointMazeEnv
manual_collision = True
env_name = env_name[5:]
maze_size_scaling = 4
else:
assert False, 'unknown env %s' % env_name
def create_maze_env(env_name=None):
maze_id = None
if env_name.startswith('AntMaze'):
observe_blocks = False
put_spin_near_agent = False
if env_name == 'Maze':
maze_id = 'Maze'
elif env_name.startswith('AntPush'):
elif env_name == 'Push':
maze_id = 'Push'
elif env_name.startswith('AntFall'):
elif env_name == 'Fall':
maze_id = 'Fall'
elif env_name == 'Block':
maze_id = 'Block'
put_spin_near_agent = True
observe_blocks = True
elif env_name == 'BlockMaze':
maze_id = 'BlockMaze'
put_spin_near_agent = True
observe_blocks = True
else:
raise ValueError('Unknown maze environment %s' % env_name)
return AntMazeEnv(maze_id=maze_id)
gym_mujoco_kwargs = {
'maze_id': maze_id,
'n_bins': n_bins,
'observe_blocks': observe_blocks,
'put_spin_near_agent': put_spin_near_agent,
'top_down_view': top_down_view,
'manual_collision': manual_collision,
'maze_size_scaling': maze_size_scaling
}
gym_env = cls(**gym_mujoco_kwargs)
gym_env.reset()
wrapped_env = gym_wrapper.GymWrapper(gym_env)
return wrapped_env
class TFPyEnvironment(tf_py_environment.TFPyEnvironment):
def __init__(self, *args, **kwargs):
super(TFPyEnvironment, self).__init__(*args, **kwargs)
def start_collect(self):
pass
def current_obs(self):
time_step = self.current_time_step()
    return time_step.observation[0]  # Strip the leading batch dimension added by the batched TF environment.
def step(self, actions):
actions = tf.expand_dims(actions, 0)
next_step = super(TFPyEnvironment, self).step(actions)
return next_step.is_last()[0], next_step.reward[0], next_step.discount[0]
def reset(self):
return super(TFPyEnvironment, self).reset()
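# Illustrative note (assumed usage, not from the original file): the wrapper
# above strips the leading batch dimension added by the batched TF environment,
# so a single-environment training loop can use it as
#
#   done, reward, discount = tf_env.step(action)   # scalars, not [1]-tensors
#   obs = tf_env.current_obs()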
@@ -22,7 +22,7 @@ import math
import numpy as np
import gym
import maze_env_utils
from environments import maze_env_utils
# Directory that contains mujoco xml files.
MODEL_DIR = 'environments/assets'
@@ -39,6 +39,13 @@ class MazeEnv(gym.Env):
maze_id=None,
maze_height=0.5,
maze_size_scaling=8,
n_bins=0,
sensor_range=3.,
sensor_span=2 * math.pi,
observe_blocks=False,
put_spin_near_agent=False,
top_down_view=False,
manual_collision=False,
*args,
**kwargs):
self._maze_id = maze_id
@@ -52,6 +59,14 @@
self.MAZE_HEIGHT = height = maze_height
self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling
self._n_bins = n_bins
self._sensor_range = sensor_range * size_scaling
self._sensor_span = sensor_span
self._observe_blocks = observe_blocks
self._put_spin_near_agent = put_spin_near_agent
self._top_down_view = top_down_view
self._manual_collision = manual_collision
self.MAZE_STRUCTURE = structure = maze_env_utils.construct_maze(maze_id=self._maze_id)
self.elevated = any(-1 in row for row in structure) # Elevate the maze to allow for falling.
self.blocks = any(
@@ -61,6 +76,13 @@
torso_x, torso_y = self._find_robot()
self._init_torso_x = torso_x
self._init_torso_y = torso_y
self._init_positions = [
(x - torso_x, y - torso_y)
for x, y in self._find_all_robots()]
self._xy_to_rowcol = lambda x, y: (2 + (y + size_scaling / 2) / size_scaling,
2 + (x + size_scaling / 2) / size_scaling)
self._view = np.zeros([5, 5, 3]) # walls (immovable), chasms (fall), movable blocks
height_offset = 0.
if self.elevated:
@@ -74,9 +96,13 @@
default = tree.find(".//default")
default.find('.//geom').set('solimp', '.995 .995 .01')
self.movable_blocks = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if self.elevated and structure[i][j] not in [-1]:
struct = structure[i][j]
if struct == 'r' and self._put_spin_near_agent:
struct = maze_env_utils.Move.SpinXY
if self.elevated and struct not in [-1]:
# Create elevated platform.
ET.SubElement(
worldbody, "geom",
@@ -93,7 +119,7 @@
conaffinity="1",
rgba="0.9 0.9 0.9 1",
)
if structure[i][j] == 1: # Unmovable block.
if struct == 1: # Unmovable block.
# Offset all coordinates so that robot starts at the origin.
ET.SubElement(
worldbody, "geom",
@@ -111,26 +137,32 @@
conaffinity="1",
rgba="0.4 0.4 0.4 1",
)
elif maze_env_utils.can_move(structure[i][j]): # Movable block.
elif maze_env_utils.can_move(struct): # Movable block.
# The "falling" blocks are shrunk slightly and increased in mass to
# ensure that it can fall easily through a gap in the platform blocks.
falling = maze_env_utils.can_move_z(structure[i][j])
shrink = 0.99 if falling else 1.0
moveable_body = ET.SubElement(
name = "movable_%d_%d" % (i, j)
self.movable_blocks.append((name, struct))
falling = maze_env_utils.can_move_z(struct)
spinning = maze_env_utils.can_spin(struct)
x_offset = 0.25 * size_scaling if spinning else 0.0
y_offset = 0.0
shrink = 0.1 if spinning else 0.99 if falling else 1.0
height_shrink = 0.1 if spinning else 1.0
movable_body = ET.SubElement(
worldbody, "body",
name="moveable_%d_%d" % (i, j),
pos="%f %f %f" % (j * size_scaling - torso_x,
i * size_scaling - torso_y,
name=name,
pos="%f %f %f" % (j * size_scaling - torso_x + x_offset,
i * size_scaling - torso_y + y_offset,
height_offset +
height / 2 * size_scaling),
height / 2 * size_scaling * height_shrink),
)
ET.SubElement(
moveable_body, "geom",
movable_body, "geom",
name="block_%d_%d" % (i, j),
pos="0 0 0",
size="%f %f %f" % (0.5 * size_scaling * shrink,
0.5 * size_scaling * shrink,
height / 2 * size_scaling),
height / 2 * size_scaling * height_shrink),
type="box",
material="",
mass="0.001" if falling else "0.0002",
@@ -138,45 +170,56 @@
conaffinity="1",
rgba="0.9 0.1 0.1 1"
)
if maze_env_utils.can_move_x(structure[i][j]):
if maze_env_utils.can_move_x(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="1 0 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="moveable_x_%d_%d" % (i, j),
name="movable_x_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_y(structure[i][j]):
if maze_env_utils.can_move_y(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="0 1 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="moveable_y_%d_%d" % (i, j),
name="movable_y_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_z(structure[i][j]):
if maze_env_utils.can_move_z(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="true",
range="%f 0" % (-height_offset),
margin="0.01",
name="moveable_z_%d_%d" % (i, j),
name="movable_z_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_spin(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="false",
name="spinable_%d_%d" % (i, j),
pos="0 0 0",
type="ball"
)
torso = tree.find(".//body[@name='torso']")
geoms = torso.findall(".//geom")
@@ -190,13 +233,203 @@
self.wrapped_env = model_cls(*args, file_path=file_path, **kwargs)
def get_ori(self):
return self.wrapped_env.get_ori()
def get_top_down_view(self):
self._view = np.zeros_like(self._view)
def valid(row, col):
return self._view.shape[0] > row >= 0 and self._view.shape[1] > col >= 0
def update_view(x, y, d, row=None, col=None):
if row is None or col is None:
x = x - self._robot_x
y = y - self._robot_y
th = self._robot_ori
row, col = self._xy_to_rowcol(x, y)
update_view(x, y, d, row=row, col=col)
return
row, row_frac, col, col_frac = int(row), row % 1, int(col), col % 1
if row_frac < 0:
row_frac += 1
if col_frac < 0:
col_frac += 1
if valid(row, col):
self._view[row, col, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row - 1, col):
self._view[row - 1, col, d] += (
(max(0., 0.5 - row_frac)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row + 1, col):
self._view[row + 1, col, d] += (
(max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row, col - 1):
self._view[row, col - 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., 0.5 - col_frac)))
if valid(row, col + 1):
self._view[row, col + 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., col_frac - 0.5)))
if valid(row - 1, col - 1):
self._view[row - 1, col - 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., 0.5 - col_frac))
if valid(row - 1, col + 1):
self._view[row - 1, col + 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., col_frac - 0.5))
if valid(row + 1, col + 1):
self._view[row + 1, col + 1, d] += (
(max(0., row_frac - 0.5)) * max(0., col_frac - 0.5))
if valid(row + 1, col - 1):
self._view[row + 1, col - 1, d] += (
(max(0., row_frac - 0.5)) * max(0., 0.5 - col_frac))
# Draw ant.
robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2]
self._robot_x = robot_x
self._robot_y = robot_y
self._robot_ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
# Draw immovable blocks and chasms.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1: # Wall.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
0)
if structure[i][j] == -1: # Chasm.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
1)
# Draw movable blocks.
for block_name, block_type in self.movable_blocks:
block_x, block_y = self.wrapped_env.get_body_com(block_name)[:2]
update_view(block_x, block_y, 2)
return self._view
def get_range_sensor_obs(self):
"""Returns egocentric range sensor observations of maze."""
robot_x, robot_y, robot_z = self.wrapped_env.get_body_com("torso")[:3]
ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
segments = []
# Get line segments (corresponding to outer boundary) of each immovable
# block or drop-off.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] in [1, -1]: # There's a wall or drop-off.
cx = j * size_scaling - self._init_torso_x
cy = i * size_scaling - self._init_torso_y
x1 = cx - 0.5 * size_scaling
x2 = cx + 0.5 * size_scaling
y1 = cy - 0.5 * size_scaling
y2 = cy + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=structure[i][j],
))
# Get line segments (corresponding to outer boundary) of each movable
# block within the agent's z-view.
for block_name, block_type in self.movable_blocks:
block_x, block_y, block_z = self.wrapped_env.get_body_com(block_name)[:3]
if (block_z + height * size_scaling / 2 >= robot_z and
robot_z >= block_z - height * size_scaling / 2): # Block in view.
x1 = block_x - 0.5 * size_scaling
x2 = block_x + 0.5 * size_scaling
y1 = block_y - 0.5 * size_scaling
y2 = block_y + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=block_type,
))
sensor_readings = np.zeros((self._n_bins, 3)) # 3 for wall, drop-off, block
for ray_idx in range(self._n_bins):
ray_ori = (ori - self._sensor_span * 0.5 +
(2 * ray_idx + 1.0) / (2 * self._n_bins) * self._sensor_span)
ray_segments = []
# Get all segments that intersect with ray.
for seg in segments:
p = maze_env_utils.ray_segment_intersect(
ray=((robot_x, robot_y), ray_ori),
segment=seg["segment"])
if p is not None:
ray_segments.append(dict(
segment=seg["segment"],
type=seg["type"],
ray_ori=ray_ori,
distance=maze_env_utils.point_distance(p, (robot_x, robot_y)),
))
if len(ray_segments) > 0:
# Find out which segment is intersected first.
first_seg = sorted(ray_segments, key=lambda x: x["distance"])[0]
seg_type = first_seg["type"]
idx = (0 if seg_type == 1 else # Wall.
1 if seg_type == -1 else # Drop-off.
2 if maze_env_utils.can_move(seg_type) else # Block.
None)
if first_seg["distance"] <= self._sensor_range:
sensor_readings[ray_idx][idx] = (self._sensor_range - first_seg["distance"]) / self._sensor_range
return sensor_readings
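  # Illustrative note (not part of the original file): each of the n_bins rays
  # yields a 3-vector over (wall, drop-off, movable block); for the first
  # segment hit within range the reading is
  #
  #   (sensor_range - distance) / sensor_range
  #
  # i.e. 1.0 at contact, decaying linearly to 0.0 at the sensor range.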
def _get_obs(self):
return np.concatenate([self.wrapped_env._get_obs(),
[self.t * 0.001]])
wrapped_obs = self.wrapped_env._get_obs()
if self._top_down_view:
view = [self.get_top_down_view().flat]
else:
view = []
if self._observe_blocks:
additional_obs = []
for block_name, block_type in self.movable_blocks:
additional_obs.append(self.wrapped_env.get_body_com(block_name))
wrapped_obs = np.concatenate([wrapped_obs[:3]] + additional_obs +
[wrapped_obs[3:]])
range_sensor_obs = self.get_range_sensor_obs()
return np.concatenate([wrapped_obs,
range_sensor_obs.flat] +
view + [[self.t * 0.001]])
def reset(self):
self.t = 0
self.trajectory = []
self.wrapped_env.reset()
if len(self._init_positions) > 1:
xy = random.choice(self._init_positions)
self.wrapped_env.set_xy(xy)
return self._get_obs()
@property
@@ -226,9 +459,41 @@
return j * size_scaling, i * size_scaling
assert False, 'No robot in maze specification.'
def _find_all_robots(self):
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
coords = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 'r':
coords.append((j * size_scaling, i * size_scaling))
return coords
def _is_in_collision(self, pos):
x, y = pos
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1:
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
if minx <= x <= maxx and miny <= y <= maxy:
return True
return False
def step(self, action):
self.t += 1
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
if self._manual_collision:
old_pos = self.wrapped_env.get_xy()
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
new_pos = self.wrapped_env.get_xy()
if self._is_in_collision(new_pos):
self.wrapped_env.set_xy(old_pos)
else:
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
next_obs = self._get_obs()
done = False
return next_obs, inner_reward, done, info
@@ -26,20 +26,27 @@ class Move(object):
XZ = 15
YZ = 16
XYZ = 17
SpinXY = 18
def can_move_x(movable):
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ]
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ,
Move.SpinXY]
def can_move_y(movable):
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ]
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ,
Move.SpinXY]
def can_move_z(movable):
return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ]
def can_spin(movable):
return movable in [Move.SpinXY]
def can_move(movable):
return can_move_x(movable) or can_move_y(movable) or can_move_z(movable)
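# Illustrative examples (not part of the original file):
#
#   can_move_x(Move.XY)    -> True
#   can_move_z(Move.XY)    -> False
#   can_spin(Move.SpinXY)  -> True
#   can_move(Move.SpinXY)  -> True   (spinning blocks also translate in x/y)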
@@ -70,7 +77,88 @@ def construct_maze(maze_id='Maze'):
[1, 0, 0, 1],
[1, 1, 1, 1],
]
elif maze_id == 'Block':
O = 'r'
structure = [
[1, 1, 1, 1, 1],
[1, O, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'BlockMaze':
O = 'r'
structure = [
[1, 1, 1, 1],
[1, O, 0, 1],
[1, 1, 0, 1],
[1, 0, 0, 1],
[1, 1, 1, 1],
]
else:
raise NotImplementedError('The provided MazeId %s is not recognized' % maze_id)
return structure
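# Legend for the structures above (inferred from their use in maze_env.py):
# 1 marks an immovable wall, 0 free space, 'r' the robot start position,
# -1 a chasm the robot can fall into, and Move.* codes mark movable blocks.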
def line_intersect(pt1, pt2, ptA, ptB):
"""
Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html
this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB)
"""
DET_TOLERANCE = 0.00000001
# the first line is pt1 + r*(pt2-pt1)
# in component form:
x1, y1 = pt1
x2, y2 = pt2
dx1 = x2 - x1
dy1 = y2 - y1
# the second line is ptA + s*(ptB-ptA)
x, y = ptA
xB, yB = ptB
dx = xB - x
dy = yB - y
DET = (-dx1 * dy + dy1 * dx)
if math.fabs(DET) < DET_TOLERANCE: return (0, 0, 0, 0, 0)
# now, the determinant should be OK
DETinv = 1.0 / DET
# find the scalar amount along the "self" segment
r = DETinv * (-dy * (x - x1) + dx * (y - y1))
# find the scalar amount along the input line
s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1))
# return the average of the two descriptions
xi = (x1 + r * dx1 + x + s * dx) / 2.0
yi = (y1 + r * dy1 + y + s * dy) / 2.0
return (xi, yi, 1, r, s)
def ray_segment_intersect(ray, segment):
"""
Check if the ray originated from (x, y) with direction theta intersects the line segment (x1, y1) -- (x2, y2),
and return the intersection point if there is one
"""
(x, y), theta = ray
# (x1, y1), (x2, y2) = segment
pt1 = (x, y)
  ray_len = 1  # Arbitrary positive length; the test below only needs the ray parameter r >= 0.
  pt2 = (x + ray_len * math.cos(theta), y + ray_len * math.sin(theta))
xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment)
if valid and r >= 0 and 0 <= s <= 1:
return (xo, yo)
return None
def point_distance(p1, p2):
x1, y1 = p1
x2, y2 = p2
return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
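# Worked example (not part of the original file): a ray from the origin along
# the +x axis hits the vertical segment x = 1 at (1.0, 0.0):
#
#   ray_segment_intersect(ray=((0.0, 0.0), 0.0),
#                         segment=((1.0, -1.0), (1.0, 1.0)))  -> (1.0, 0.0)
#   point_distance((0.0, 0.0), (1.0, 0.0))                    -> 1.0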
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper for creating the ant environment in gym_mujoco."""
import math
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "point.xml"
ORI_IND = 2
def __init__(self, file_path=None, expose_all_qpos=True):
self._expose_all_qpos = expose_all_qpos
mujoco_env.MujocoEnv.__init__(self, file_path, 1)
utils.EzPickle.__init__(self)
@property
def physics(self):
return self.model
def _step(self, a):
return self.step(a)
def step(self, action):
action[0] = 0.2 * action[0]
qpos = np.copy(self.physics.data.qpos)
qpos[2] += action[1]
ori = qpos[2]
# compute increment in each direction
dx = math.cos(ori) * action[0]
dy = math.sin(ori) * action[0]
# ensure that the robot is within reasonable range
qpos[0] = np.clip(qpos[0] + dx, -100, 100)
qpos[1] = np.clip(qpos[1] + dy, -100, 100)
qvel = self.physics.data.qvel
self.set_state(qpos, qvel)
for _ in range(0, self.frame_skip):
self.physics.step()
next_obs = self._get_obs()
reward = 0
done = False
info = {}
return next_obs, reward, done, info
def _get_obs(self):
if self._expose_all_qpos:
return np.concatenate([
self.physics.data.qpos.flat[:3], # Only point-relevant coords.
self.physics.data.qvel.flat[:3]])
return np.concatenate([
self.physics.data.qpos.flat[2:3],
self.physics.data.qvel.flat[:3]])
def reset_model(self):
qpos = self.init_qpos + self.np_random.uniform(
size=self.physics.model.nq, low=-.1, high=.1)
qvel = self.init_qvel + self.np_random.randn(self.physics.model.nv) * .1
# Set everything other than point to original position and 0 velocity.
qpos[3:] = self.init_qpos[3:]
qvel[3:] = 0.
self.set_state(qpos, qvel)
return self._get_obs()
def get_ori(self):
return self.model.data.qpos[self.__class__.ORI_IND]
def set_xy(self, xy):
qpos = np.copy(self.physics.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.physics.data.qvel
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from environments.maze_env import MazeEnv
from environments.point import PointEnv
class PointMazeEnv(MazeEnv):
MODEL_CLASS = PointEnv