Unverified Commit c9f03bf6 authored by Neal Wu, committed by GitHub

Merge pull request #5870 from ofirnachum/master

Add training and eval code for efficient-hrl
parents 2c181308 052361de
#-*-Python-*-
create_maze_env.env_name = "AntPush"
context_range = (%CONTEXT_RANGE_MIN, %CONTEXT_RANGE_MAX)
meta_context_range = ((-16, -4), (16, 20))
RESET_EPISODE_PERIOD = 500
RESET_ENV_PERIOD = 1
# End episode every N steps
UvfAgent.reset_episode_cond_fn = @every_n_steps
every_n_steps.n = %RESET_EPISODE_PERIOD
train_uvf.max_steps_per_episode = %RESET_EPISODE_PERIOD
# Do a manual reset every N episodes
UvfAgent.reset_env_cond_fn = @every_n_episodes
every_n_episodes.n = %RESET_ENV_PERIOD
every_n_episodes.steps_per_episode = %RESET_EPISODE_PERIOD
## Config defaults
EVAL_MODES = ["eval2"]
## Config agent
CONTEXT = @agent/Context
META_CONTEXT = @meta/Context
## Config agent context
agent/Context.context_ranges = [%context_range]
agent/Context.context_shapes = [%SUBGOAL_DIM]
agent/Context.meta_action_every_n = 10
agent/Context.samplers = {
"train": [@train/DirectionSampler],
"explore": [@train/DirectionSampler],
}
agent/Context.context_transition_fn = @relative_context_transition_fn
agent/Context.context_multi_transition_fn = @relative_context_multi_transition_fn
agent/Context.reward_fn = @uvf/negative_distance
## Config meta context
meta/Context.context_ranges = [%meta_context_range]
meta/Context.context_shapes = [2]
meta/Context.samplers = {
"train": [@eval2/ConstantSampler],
"explore": [@eval2/ConstantSampler],
"eval2": [@eval2/ConstantSampler],
}
meta/Context.reward_fn = @task/negative_distance
## Config rewards
task/negative_distance.state_indices = [0, 1]
task/negative_distance.relative_context = False
task/negative_distance.diff = False
task/negative_distance.offset = 0.0
## Config samplers
train/RandomSampler.context_range = %meta_context_range
train/DirectionSampler.context_range = %context_range
train/DirectionSampler.k = %SUBGOAL_DIM
relative_context_transition_fn.k = %SUBGOAL_DIM
relative_context_multi_transition_fn.k = %SUBGOAL_DIM
MetaAgent.k = %SUBGOAL_DIM
eval2/ConstantSampler.value = [0, 19]
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = False
CONTEXT_RANGE_MIN = -10
CONTEXT_RANGE_MAX = 10
SUBGOAL_DIM = 2
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = True
IMAGES = False
CONTEXT_RANGE_MIN = (-10, -10, -0.5, -1, -1, -1, -1, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3)
CONTEXT_RANGE_MAX = ( 10, 10, 0.5, 1, 1, 1, 1, 0.5, 0.3, 0.5, 0.3, 0.5, 0.3, 0.5, 0.3)
SUBGOAL_DIM = 15
META_EXPLORE_NOISE = 1.0
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = False
IMAGES = False
CONTEXT_RANGE_MIN = -10
CONTEXT_RANGE_MAX = 10
SUBGOAL_DIM = 2
META_EXPLORE_NOISE = 5.0
StatePreprocess.trainable = True
StatePreprocess.state_preprocess_net = @state_preprocess_net
StatePreprocess.action_embed_net = @action_embed_net
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
ENV_CONTEXT = None
EVAL_MODES = ["eval"]
TARGET_Q_CLIPPING = None
RESET_EPISODE_PERIOD = None
ZERO_OBS = False
IMAGES = False
CONTEXT_RANGE_MIN = -10
CONTEXT_RANGE_MAX = 10
SUBGOAL_DIM = 2
META_EXPLORE_NOISE = 1.0
uvf/negative_distance.summarize = False
uvf/negative_distance.relative_context = True
#-*-Python-*-
# NOTE: For best training, low-level exploration (uvf_add_noise_fn.stddev)
# should be reduced to around 0.1.
create_maze_env.env_name = "PointMaze"
context_range_min = -10
context_range_max = 10
context_range = (%context_range_min, %context_range_max)
meta_context_range = ((-2, -2), (10, 10))
RESET_EPISODE_PERIOD = 500
RESET_ENV_PERIOD = 1
# End episode every N steps
UvfAgent.reset_episode_cond_fn = @every_n_steps
every_n_steps.n = %RESET_EPISODE_PERIOD
train_uvf.max_steps_per_episode = %RESET_EPISODE_PERIOD
# Do a manual reset every N episodes
UvfAgent.reset_env_cond_fn = @every_n_episodes
every_n_episodes.n = %RESET_ENV_PERIOD
every_n_episodes.steps_per_episode = %RESET_EPISODE_PERIOD
## Config defaults
EVAL_MODES = ["eval1", "eval2", "eval3"]
## Config agent
CONTEXT = @agent/Context
META_CONTEXT = @meta/Context
## Config agent context
agent/Context.context_ranges = [%context_range]
agent/Context.context_shapes = [%SUBGOAL_DIM]
agent/Context.meta_action_every_n = 10
agent/Context.samplers = {
"train": [@train/DirectionSampler],
"explore": [@train/DirectionSampler],
"eval1": [@uvf_eval1/ConstantSampler],
"eval2": [@uvf_eval2/ConstantSampler],
"eval3": [@uvf_eval3/ConstantSampler],
}
agent/Context.context_transition_fn = @relative_context_transition_fn
agent/Context.context_multi_transition_fn = @relative_context_multi_transition_fn
agent/Context.reward_fn = @uvf/negative_distance
## Config meta context
meta/Context.context_ranges = [%meta_context_range]
meta/Context.context_shapes = [2]
meta/Context.samplers = {
"train": [@train/RandomSampler],
"explore": [@train/RandomSampler],
"eval1": [@eval1/ConstantSampler],
"eval2": [@eval2/ConstantSampler],
"eval3": [@eval3/ConstantSampler],
}
meta/Context.reward_fn = @task/negative_distance
## Config rewards
task/negative_distance.state_indices = [0, 1]
task/negative_distance.relative_context = False
task/negative_distance.diff = False
task/negative_distance.offset = 0.0
## Config samplers
train/RandomSampler.context_range = %meta_context_range
train/DirectionSampler.context_range = %context_range
train/DirectionSampler.k = %SUBGOAL_DIM
relative_context_transition_fn.k = %SUBGOAL_DIM
relative_context_multi_transition_fn.k = %SUBGOAL_DIM
MetaAgent.k = %SUBGOAL_DIM
eval1/ConstantSampler.value = [8, 0]
eval2/ConstantSampler.value = [8, 8]
eval3/ConstantSampler.value = [0, 8]
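# Editor's note: a minimal sketch of how a launcher could consume the gin
# configs above with gin-config. The file names, the extra binding, and the
# assumption that the configurable modules (context, samplers, agents) have
# already been imported so their gin names are registered are illustrative,
# not part of this commit.
import gin.tf

gin.parse_config_files_and_bindings(
    config_files=['base_uvf.gin', 'point_maze.gin'],  # hypothetical paths
    bindings=['create_maze_env.env_name = "PointMaze"'])
# After parsing, configurables such as agent/Context, meta/Context and the
# samplers referenced above are constructed with these bindings when the
# training code instantiates them.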
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Context for Universal Value Function agents.
A context specifies a list of contextual variables, each with its
own sampling and reward computation methods.
Examples of contextual variables include
goal states, reward combination vectors, etc.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from tf_agents import specs
import gin.tf
from utils import utils as uvf_utils
@gin.configurable
class Context(object):
"""Base context."""
VAR_NAME = 'action'
def __init__(self,
tf_env,
context_ranges=None,
context_shapes=None,
state_indices=None,
variable_indices=None,
gamma_index=None,
settable_context=False,
timers=None,
samplers=None,
reward_weights=None,
reward_fn=None,
random_sampler_mode='random',
normalizers=None,
context_transition_fn=None,
context_multi_transition_fn=None,
meta_action_every_n=None):
self._tf_env = tf_env
self.variable_indices = variable_indices
self.gamma_index = gamma_index
self._settable_context = settable_context
self.timers = timers
self._context_transition_fn = context_transition_fn
self._context_multi_transition_fn = context_multi_transition_fn
self._random_sampler_mode = random_sampler_mode
# assign specs
self._obs_spec = self._tf_env.observation_spec()
self._context_shapes = tuple([
shape if shape is not None else self._obs_spec.shape
for shape in context_shapes
])
self.context_specs = tuple([
specs.TensorSpec(dtype=self._obs_spec.dtype, shape=shape)
for shape in self._context_shapes
])
if context_ranges is not None:
self.context_ranges = context_ranges
else:
self.context_ranges = [None] * len(self._context_shapes)
self.context_as_action_specs = tuple([
specs.BoundedTensorSpec(
shape=shape,
dtype=(tf.float32 if self._obs_spec.dtype in
[tf.float32, tf.float64] else self._obs_spec.dtype),
minimum=context_range[0],
maximum=context_range[-1])
for shape, context_range in zip(self._context_shapes, self.context_ranges)
])
if state_indices is not None:
self.state_indices = state_indices
else:
self.state_indices = [None] * len(self._context_shapes)
if self.variable_indices is not None and self.n != len(
self.variable_indices):
raise ValueError(
'variable_indices (%s) must have the same length as contexts (%s).' %
(self.variable_indices, self.context_specs))
assert self.n == len(self.context_ranges)
assert self.n == len(self.state_indices)
# assign reward/sampler fns
self._sampler_fns = dict()
self._samplers = dict()
self._reward_fns = dict()
# assign reward fns
self._add_custom_reward_fns()
reward_weights = reward_weights or None
self._reward_fn = self._make_reward_fn(reward_fn, reward_weights)
# assign samplers
self._add_custom_sampler_fns()
for mode, sampler_fns in samplers.items():
self._make_sampler_fn(sampler_fns, mode)
# create normalizers
if normalizers is None:
self._normalizers = [None] * len(self.context_specs)
else:
self._normalizers = [
normalizer(tf.zeros(shape=spec.shape, dtype=spec.dtype))
if normalizer is not None else None
for normalizer, spec in zip(normalizers, self.context_specs)
]
assert self.n == len(self._normalizers)
self.meta_action_every_n = meta_action_every_n
# create vars
self.context_vars = {}
self.timer_vars = {}
self.create_vars(self.VAR_NAME)
self.t = tf.Variable(
tf.zeros(shape=(), dtype=tf.int32), name='num_timer_steps')
def _add_custom_reward_fns(self):
pass
def _add_custom_sampler_fns(self):
pass
def sample_random_contexts(self, batch_size):
"""Sample random batch contexts."""
assert self._random_sampler_mode is not None
return self.sample_contexts(self._random_sampler_mode, batch_size)[0]
def sample_contexts(self, mode, batch_size, state=None, next_state=None,
**kwargs):
"""Sample a batch of contexts.
Args:
mode: A string representing the mode [`train`, `explore`, `eval`].
batch_size: Batch size.
state: Optional [batch_size, num_state_dims] tensor of current states.
next_state: Optional [batch_size, num_state_dims] tensor of next states.
**kwargs: Additional keyword arguments forwarded to the sampler fns.
Returns:
Two lists of [batch_size, num_context_dims] contexts.
"""
contexts, next_contexts = self._sampler_fns[mode](
batch_size, state=state, next_state=next_state,
**kwargs)
self._validate_contexts(contexts)
self._validate_contexts(next_contexts)
return contexts, next_contexts
def compute_rewards(self, mode, states, actions, rewards, next_states,
contexts):
"""Compute context-based rewards.
Args:
mode: A string representing the mode ['uvf', 'task'].
states: A [batch_size, num_state_dims] tensor.
actions: A [batch_size, num_action_dims] tensor.
rewards: A [batch_size] tensor representing unmodified rewards.
next_states: A [batch_size, num_state_dims] tensor.
contexts: A list of [batch_size, num_context_dims] tensors.
Returns:
A [batch_size] tensor representing rewards.
"""
return self._reward_fn(states, actions, rewards, next_states,
contexts)
def _make_reward_fn(self, reward_fns_list, reward_weights):
"""Returns a fn that computes rewards.
Args:
reward_fns_list: A fn or a list of reward fns.
reward_weights: A list of reward weights.
"""
if not isinstance(reward_fns_list, (list, tuple)):
reward_fns_list = [reward_fns_list]
if reward_weights is None:
reward_weights = [1.0] * len(reward_fns_list)
assert len(reward_fns_list) == len(reward_weights)
reward_fns_list = [
self._custom_reward_fns[fn] if isinstance(fn, (str,)) else fn
for fn in reward_fns_list
]
def reward_fn(*args, **kwargs):
"""Returns rewards, discounts."""
reward_tuples = [
reward_fn(*args, **kwargs) for reward_fn in reward_fns_list
]
rewards_list = [reward_tuple[0] for reward_tuple in reward_tuples]
discounts_list = [reward_tuple[1] for reward_tuple in reward_tuples]
ndims = max([r.shape.ndims for r in rewards_list])
if ndims > 1:  # expand reward shapes to allow broadcasting
for i in range(len(rewards_list)):
for _ in range(ndims - rewards_list[i].shape.ndims):
rewards_list[i] = tf.expand_dims(rewards_list[i], axis=-1)
for _ in range(ndims - discounts_list[i].shape.ndims):
discounts_list[i] = tf.expand_dims(discounts_list[i], axis=-1)
rewards = tf.add_n(
[r * tf.to_float(w) for r, w in zip(rewards_list, reward_weights)])
discounts = discounts_list[0]
for d in discounts_list[1:]:
discounts *= d
return rewards, discounts
return reward_fn
def _make_sampler_fn(self, sampler_cls_list, mode):
"""Returns a fn that samples a list of context vars.
Args:
sampler_cls_list: A list of sampler classes.
mode: A string representing the operating mode.
"""
if not isinstance(sampler_cls_list, (list, tuple)):
sampler_cls_list = [sampler_cls_list]
self._samplers[mode] = []
sampler_fns = []
for spec, sampler in zip(self.context_specs, sampler_cls_list):
if isinstance(sampler, (str,)):
sampler_fn = self._custom_sampler_fns[sampler]
else:
sampler_fn = sampler(context_spec=spec)
self._samplers[mode].append(sampler_fn)
sampler_fns.append(sampler_fn)
def batch_sampler_fn(batch_size, state=None, next_state=None, **kwargs):
"""Sampler fn."""
contexts_tuples = [
sampler(batch_size, state=state, next_state=next_state, **kwargs)
for sampler in sampler_fns]
contexts = [c[0] for c in contexts_tuples]
next_contexts = [c[1] for c in contexts_tuples]
contexts = [
normalizer.update_apply(c) if normalizer is not None else c
for normalizer, c in zip(self._normalizers, contexts)
]
next_contexts = [
normalizer.apply(c) if normalizer is not None else c
for normalizer, c in zip(self._normalizers, next_contexts)
]
return contexts, next_contexts
self._sampler_fns[mode] = batch_sampler_fn
def set_env_context_op(self, context, disable_unnormalizer=False):
"""Returns a TensorFlow op that sets the environment context.
Args:
context: A list of context Tensor variables.
disable_unnormalizer: Disable unnormalization.
Returns:
A TensorFlow op that sets the environment context.
"""
ret_val = np.array(1.0, dtype=np.float32)
if not self._settable_context:
return tf.identity(ret_val)
if not disable_unnormalizer:
context = [
normalizer.unapply(tf.expand_dims(c, 0))[0]
if normalizer is not None else c
for normalizer, c in zip(self._normalizers, context)
]
def set_context_func(*env_context_values):
tf.logging.info('[set_env_context_op] Setting gym environment context.')
# pylint: disable=protected-access
self.gym_env.set_context(*env_context_values)
return ret_val
# pylint: enable=protected-access
with tf.name_scope('set_env_context'):
set_op = tf.py_func(set_context_func, context, tf.float32,
name='set_env_context_py_func')
set_op.set_shape([])
return set_op
def set_replay(self, replay):
"""Set replay buffer for samplers.
Args:
replay: A replay buffer.
"""
for _, samplers in self._samplers.items():
for sampler in samplers:
sampler.set_replay(replay)
def get_clip_fns(self):
"""Returns a list of clip fns for contexts.
Returns:
A list of fns that clip context tensors.
"""
clip_fns = []
for context_range in self.context_ranges:
def clip_fn(var_, range_=context_range):
"""Clip a tensor."""
if range_ is None:
clipped_var = tf.identity(var_)
elif isinstance(range_[0], (int, long, float, list, np.ndarray)):
clipped_var = tf.clip_by_value(
var_,
range_[0],
range_[1],)
else: raise NotImplementedError(range_)
return clipped_var
clip_fns.append(clip_fn)
return clip_fns
def _validate_contexts(self, contexts):
"""Validate if contexts have right specs.
Args:
contexts: A list of [batch_size, num_context_dim] tensors.
Raises:
ValueError: If shape or dtype mismatches that of spec.
"""
for i, (context, spec) in enumerate(zip(contexts, self.context_specs)):
if context[0].shape != spec.shape:
raise ValueError('contexts[%d] has invalid shape %s wrt spec shape %s' %
(i, context[0].shape, spec.shape))
if context.dtype != spec.dtype:
raise ValueError('contexts[%d] has invalid dtype %s wrt spec dtype %s' %
(i, context.dtype, spec.dtype))
def context_multi_transition_fn(self, contexts, **kwargs):
"""Returns multiple future contexts starting from a batch."""
assert self._context_multi_transition_fn
return self._context_multi_transition_fn(contexts, None, None, **kwargs)
def step(self, mode, agent=None, action_fn=None, **kwargs):
"""Returns [next_contexts..., next_timer] list of ops.
Args:
mode: a string representing the mode=[train, explore, eval].
agent: Optional meta agent; if provided, its context is stepped first and
action_fn is used to sample a new goal every `meta_action_every_n` steps.
action_fn: Optional fn mapping (next_state, context) to a meta action (goal).
**kwargs: kwargs for context_transition_fn.
Returns:
a list of ops that set the context.
"""
if agent is None:
ops = []
if self._context_transition_fn is not None:
def sampler_fn():
samples = self.sample_contexts(mode, 1)[0]
return [s[0] for s in samples]
values = self._context_transition_fn(self.vars, self.t, sampler_fn, **kwargs)
ops += [tf.assign(var, value) for var, value in zip(self.vars, values)]
ops.append(tf.assign_add(self.t, 1)) # increment timer
return ops
else:
ops = agent.tf_context.step(mode, **kwargs)
state = kwargs['state']
next_state = kwargs['next_state']
state_repr = kwargs['state_repr']
next_state_repr = kwargs['next_state_repr']
with tf.control_dependencies(ops): # Step high level context before computing low level one.
# Get the context transition function output.
values = self._context_transition_fn(self.vars, self.t, None,
state=state_repr,
next_state=next_state_repr)
# Select a new goal every C steps, otherwise use context transition.
low_level_context = [
tf.cond(tf.equal(self.t % self.meta_action_every_n, 0),
lambda: tf.cast(action_fn(next_state, context=None), tf.float32),
lambda: values)]
ops = [tf.assign(var, value)
for var, value in zip(self.vars, low_level_context)]
with tf.control_dependencies(ops):
return [tf.assign_add(self.t, 1)] # increment timer
return ops
def reset(self, mode, agent=None, action_fn=None, state=None):
"""Returns ops that reset the context.
Args:
mode: a string representing the mode=[train, explore, eval].
Returns:
a list of ops that reset the context.
"""
if agent is None:
values = self.sample_contexts(mode=mode, batch_size=1)[0]
if values is None:
return []
values = [value[0] for value in values]
values[0] = uvf_utils.tf_print(
values[0],
values,
message='context:reset, mode=%s' % mode,
first_n=10,
name='context:reset:%s' % mode)
all_ops = []
for _, context_vars in sorted(self.context_vars.items()):
ops = [tf.assign(var, value) for var, value in zip(context_vars, values)]
all_ops += ops
all_ops.append(self.set_env_context_op(values))
all_ops.append(tf.assign(self.t, 0)) # reset timer
return all_ops
else:
ops = agent.tf_context.reset(mode)
# NOTE: The code is currently written in such a way that the higher level
# policy does not provide a low-level context until the second
# observation. Instead, we just zero-out low-level contexts.
for key, context_vars in sorted(self.context_vars.items()):
ops += [tf.assign(var, tf.zeros_like(var)) for var, meta_var in
zip(context_vars, agent.tf_context.context_vars[key])]
ops.append(tf.assign(self.t, 0)) # reset timer
return ops
def create_vars(self, name, agent=None):
"""Create tf variables for contexts.
Args:
name: Name of the variables.
Returns:
A list of [num_context_dims] tensors.
"""
if agent is not None:
meta_vars = agent.create_vars(name)
else:
meta_vars = {}
assert name not in self.context_vars, ('Conflict! %s is already '
'initialized.') % name
self.context_vars[name] = tuple([
tf.Variable(
tf.zeros(shape=spec.shape, dtype=spec.dtype),
name='%s_context_%d' % (name, i))
for i, spec in enumerate(self.context_specs)
])
return self.context_vars[name], meta_vars
@property
def n(self):
return len(self.context_specs)
@property
def vars(self):
return self.context_vars[self.VAR_NAME]
# pylint: disable=protected-access
@property
def gym_env(self):
return self._tf_env.pyenv._gym_env
@property
def tf_env(self):
return self._tf_env
# pylint: enable=protected-access
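# Editor's sketch: a plain-numpy illustration of the low-level goal update that
# Context.step performs when a meta agent is present -- a new goal is requested
# from the meta policy every `meta_action_every_n` steps, otherwise the
# configured relative transition g' = g + s - s' is applied. The function name
# and arguments are illustrative; this is not used by the class above.
def _example_low_level_goal_update(goal, state, next_state, t,
                                   meta_action_every_n, meta_policy_fn):
  """Illustrative numpy version of the goal scheduling in Context.step."""
  import numpy as np
  goal, state, next_state = (np.asarray(x, dtype=np.float32)
                             for x in (goal, state, next_state))
  k = goal.shape[-1]
  if t % meta_action_every_n == 0:
    # Every C steps the meta policy proposes a fresh goal.
    return np.asarray(meta_policy_fn(next_state), dtype=np.float32)
  # In between, the goal is carried over relative to the new state.
  return goal + state[:k] - next_state[:k]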
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Context functions.
Given the current contexts, timer and context sampler, returns new contexts
after an environment step. This can be used to define a high-level policy
that controls contexts as its actions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import gin.tf
import utils as uvf_utils
@gin.configurable
def periodic_context_fn(contexts, timer, sampler_fn, period=1):
"""Periodically samples contexts.
Args:
contexts: a list of [num_context_dims] tensor variables representing
current contexts.
timer: a scalar integer tensor variable holding the current time step.
sampler_fn: a sampler function that samples a list of [num_context_dims]
tensors.
period: (integer) period of update.
Returns:
a list of [num_context_dims] tensors.
"""
contexts = list(contexts[:]) # create copy
return tf.cond(tf.equal(tf.mod(timer, period), 0), sampler_fn, lambda: contexts)
@gin.configurable
def timer_context_fn(contexts,
timer,
sampler_fn,
period=1,
timer_index=-1,
debug=False):
"""Samples contexts based on timer in contexts.
Args:
contexts: a list of [num_context_dims] tensor variables representing
current contexts.
timer: a scalar integer tensor variable holding the current time step.
sampler_fn: a sampler function that samples a list of [num_context_dims]
tensors.
period: (integer) period of update; actual period = `period` + 1.
timer_index: (integer) Index of the context list entry that holds the timer.
debug: (boolean) Print debug messages.
Returns:
a list of [num_context_dims] tensors.
"""
contexts = list(contexts[:]) # create copy
cond = tf.equal(contexts[timer_index][0], 0)
def reset():
"""Sample context and reset the timer."""
new_contexts = sampler_fn()
new_contexts[timer_index] = tf.zeros_like(
contexts[timer_index]) + period
return new_contexts
def update():
"""Decrement the timer."""
contexts[timer_index] -= 1
return contexts
values = tf.cond(cond, reset, update)
if debug:
values[0] = uvf_utils.tf_print(
values[0],
values + [timer],
'timer_context_fn',
first_n=200,
name='timer_context_fn:contexts')
return values
@gin.configurable
def relative_context_transition_fn(
contexts, timer, sampler_fn,
k=2, state=None, next_state=None,
**kwargs):
"""Contexts updated to be relative to next state.
"""
contexts = list(contexts[:]) # create copy
assert len(contexts) == 1
new_contexts = [
tf.concat(
[contexts[0][:k] + state[:k] - next_state[:k],
contexts[0][k:]], -1)]
return new_contexts
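# Editor's note: the update above keeps the absolute target s + g fixed while
# re-expressing the goal relative to the new state. A small numpy check of that
# invariant (illustrative only, not part of the original code):
def _example_relative_goal_invariant(goal, state, next_state, k=2):
  """Checks that state[:k] + goal[:k] is preserved by the relative transition."""
  import numpy as np
  goal, state, next_state = (np.asarray(x, dtype=np.float32)
                             for x in (goal, state, next_state))
  new_goal = goal[:k] + state[:k] - next_state[:k]
  assert np.allclose(next_state[:k] + new_goal, state[:k] + goal[:k])
  return new_goal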
@gin.configurable
def relative_context_multi_transition_fn(
contexts, timer, sampler_fn,
k=2, states=None,
**kwargs):
"""Given contexts at first state and sequence of states, derives sequence of all contexts.
"""
contexts = list(contexts[:]) # create copy
assert len(contexts) == 1
contexts = [
tf.concat(
[tf.expand_dims(contexts[0][:, :k] + states[:, 0, :k], 1) - states[:, :, :k],
contexts[0][:, None, k:] * tf.ones_like(states[:, :, :1])], -1)]
return contexts
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Import gin configurable modules.
"""
# pylint: disable=unused-import
from context import context
from context import context_transition_functions
from context import gin_utils
from context import rewards_functions
from context import samplers
# pylint: disable=unused-import
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gin configurable utility functions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import gin.tf
@gin.configurable
def gin_sparse_array(size, values, indices, fill_value=0):
arr = np.zeros(size)
arr.fill(fill_value)
arr[indices] = values
return arr
@gin.configurable
def gin_sum(values):
result = values[0]
for value in values[1:]:
result += value
return result
@gin.configurable
def gin_range(n):
return range(n)
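# Editor's note: illustrative outputs of the three utilities above (the helper
# below is an editorial example, not part of the original file).
def _example_gin_utils():
  """Shows sample outputs of gin_sparse_array, gin_sum and gin_range."""
  arr = gin_sparse_array(size=5, values=[1.0, 2.0], indices=[0, 3],
                         fill_value=0)  # -> array([1., 0., 0., 2., 0.])
  total = gin_sum([1, 2, 3])            # -> 6
  indices = list(gin_range(4))          # -> [0, 1, 2, 3]
  return arr, total, indices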
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Reward shaping functions used by Contexts.
Each reward function should take the following inputs and return new rewards
and discounts.
new_rewards, discounts = reward_fn(states, actions, rewards,
next_states, contexts)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import gin.tf
def summarize_stats(stats):
"""Summarize a dictionary of variables.
Args:
stats: a dictionary of {name: tensor} to compute stats over.
"""
for name, stat in stats.items():
mean = tf.reduce_mean(stat)
tf.summary.scalar('mean_%s' % name, mean)
tf.summary.scalar('max_%s' % name, tf.reduce_max(stat))
tf.summary.scalar('min_%s' % name, tf.reduce_min(stat))
std = tf.sqrt(tf.reduce_mean(tf.square(stat)) - tf.square(mean) + 1e-10)
tf.summary.scalar('std_%s' % name, std)
tf.summary.histogram(name, stat)
def index_states(states, indices):
"""Return indexed states.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
indices: (a list of Numpy integer array) Indices of states dimensions
to be mapped.
Returns:
A [batch_size, num_indices] Tensor representing the batch of indexed states.
"""
if indices is None:
return states
indices = tf.constant(indices, dtype=tf.int32)
return tf.gather(states, indices=indices, axis=1)
def record_tensor(tensor, indices, stats, name='states'):
"""Record specified tensor dimensions into stats.
Args:
tensor: A [batch_size, num_dims] Tensor.
indices: (a list of integers) Indices of dimensions to record.
stats: A dictionary holding stats.
name: (string) Name of tensor.
"""
if indices is None:
indices = range(tensor.shape.as_list()[1])
for index in indices:
stats['%s_%02d' % (name, index)] = tensor[:, index]
@gin.configurable
def potential_rewards(states,
actions,
rewards,
next_states,
contexts,
gamma=1.0,
reward_fn=None):
"""Return the potential-based rewards.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
gamma: Reward discount.
reward_fn: A reward function.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions # unused args
gamma = tf.to_float(gamma)
rewards_tp1, discounts = reward_fn(None, None, rewards, next_states, contexts)
rewards, _ = reward_fn(None, None, rewards, states, contexts)
return -rewards + gamma * rewards_tp1, discounts
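# Editor's note: potential_rewards above implements potential-based shaping,
# r_shaped = gamma * phi(s') - phi(s), with the wrapped reward_fn acting as the
# potential phi. With gamma = 1 the shaped rewards telescope over a trajectory;
# the small numpy check below is illustrative only.
def _example_potential_shaping_telescopes(potentials):
  """Checks that sum_t (phi(s_{t+1}) - phi(s_t)) equals phi(s_T) - phi(s_0)."""
  import numpy as np
  potentials = np.asarray(potentials, dtype=np.float32)
  shaped = potentials[1:] - potentials[:-1]  # gamma = 1 case
  assert np.isclose(shaped.sum(), potentials[-1] - potentials[0])
  return shaped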
@gin.configurable
def timed_rewards(states,
actions,
rewards,
next_states,
contexts,
reward_fn=None,
dense=False,
timer_index=-1):
"""Return the timed rewards.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
reward_fn: A reward function.
dense: (boolean) Provide dense rewards or sparse rewards at time = 0.
timer_index: (integer) The context list index that specifies timer.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
assert contexts[timer_index].get_shape().as_list()[1] == 1
timers = contexts[timer_index][:, 0]
rewards, discounts = reward_fn(states, actions, rewards, next_states,
contexts)
terminates = tf.to_float(timers <= 0) # if terminate set 1, else set 0
for _ in range(rewards.shape.ndims - 1):
terminates = tf.expand_dims(terminates, axis=-1)
if not dense:
rewards *= terminates # if terminate, return rewards, else return 0
discounts *= (tf.to_float(1.0) - terminates)
return rewards, discounts
@gin.configurable
def reset_rewards(states,
actions,
rewards,
next_states,
contexts,
reset_index=0,
reset_state=None,
reset_reward_function=None,
include_forward_rewards=True,
include_reset_rewards=True):
"""Returns the rewards for a forward/reset agent.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
reset_index: (integer) The context list index that specifies reset.
reset_state: Reset state.
reset_reward_function: Reward function for reset step.
include_forward_rewards: Include the rewards from the forward pass.
include_reset_rewards: Include the rewards from the reset pass.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
reset_state = tf.constant(
reset_state, dtype=next_states.dtype, shape=next_states.shape)
reset_states = tf.expand_dims(reset_state, 0)
def true_fn():
if include_reset_rewards:
return reset_reward_function(states, actions, rewards, next_states,
[reset_states] + contexts[1:])
else:
return tf.zeros_like(rewards), tf.ones_like(rewards)
def false_fn():
if include_forward_rewards:
return plain_rewards(states, actions, rewards, next_states, contexts)
else:
return tf.zeros_like(rewards), tf.ones_like(rewards)
rewards, discounts = tf.cond(
tf.cast(contexts[reset_index][0, 0], dtype=tf.bool), true_fn, false_fn)
return rewards, discounts
@gin.configurable
def tanh_similarity(states,
actions,
rewards,
next_states,
contexts,
mse_scale=1.0,
state_scales=1.0,
goal_scales=1.0,
summarize=False):
"""Returns the similarity between next_states and contexts using tanh and mse.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
mse_scale: A float, to scale mse before tanh.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
summarize: (boolean) enable summary ops.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, rewards # Unused
mse = tf.reduce_mean(tf.squared_difference(next_states * state_scales,
contexts[0] * goal_scales), -1)
tanh = tf.tanh(mse_scale * mse)
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
tf.summary.histogram('mse', mse)
tf.summary.scalar('mean_tanh', tf.reduce_mean(tanh))
tf.summary.histogram('tanh', tanh)
rewards = tf.to_float(1 - tanh)
return rewards, tf.ones_like(rewards)
@gin.configurable
def negative_mse(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
summarize=False):
"""Returns the negative mean square error between next_states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
summarize: (boolean) enable summary ops.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, rewards # Unused
mse = tf.reduce_mean(tf.squared_difference(next_states * state_scales,
contexts[0] * goal_scales), -1)
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
tf.summary.histogram('mse', mse)
rewards = tf.to_float(-mse)
return rewards, tf.ones_like(rewards)
@gin.configurable
def negative_distance(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
reward_scales=1.0,
weight_index=None,
weight_vector=None,
summarize=False,
termination_epsilon=1e-4,
state_indices=None,
goal_indices=None,
vectorize=False,
relative_context=False,
diff=False,
norm='L2',
epsilon=1e-10,
bonus_epsilon=0., #5.,
offset=0.0):
"""Returns the negative euclidean distance between next_states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
weight_index: (integer) The context list index that specifies weight.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
summarize: (boolean) enable summary ops.
termination_epsilon: terminate if dist is less than this quantity.
state_indices: (a list of integers) list of state indices to select.
goal_indices: (a list of integers) list of goal indices to select.
vectorize: Return a vectorized form.
relative_context: (boolean) treat the goal as relative to the current state.
diff: (boolean) return the decrease in distance (old_dist - dist) rather
than the negative distance itself.
norm: L1 or L2.
epsilon: small offset to ensure non-negative/zero distance.
bonus_epsilon: if positive, adds a +1 bonus whenever dist < bonus_epsilon.
offset: constant added to the returned reward.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions, rewards # Unused
stats = {}
record_tensor(next_states, state_indices, stats, 'next_states')
states = index_states(states, state_indices)
next_states = index_states(next_states, state_indices)
goals = index_states(contexts[0], goal_indices)
if relative_context:
goals = states + goals
sq_dists = tf.squared_difference(next_states * state_scales,
goals * goal_scales)
old_sq_dists = tf.squared_difference(states * state_scales,
goals * goal_scales)
record_tensor(sq_dists, None, stats, 'sq_dists')
if weight_vector is not None:
sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
old_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
if weight_index is not None:
#sq_dists *= contexts[weight_index]
weights = tf.abs(index_states(contexts[0], weight_index))
#weights /= tf.reduce_sum(weights, -1, keepdims=True)
sq_dists *= weights
old_sq_dists *= weights
if norm == 'L1':
dist = tf.sqrt(sq_dists + epsilon)
old_dist = tf.sqrt(old_sq_dists + epsilon)
if not vectorize:
dist = tf.reduce_sum(dist, -1)
old_dist = tf.reduce_sum(old_dist, -1)
elif norm == 'L2':
if vectorize:
dist = sq_dists
old_dist = old_sq_dists
else:
dist = tf.reduce_sum(sq_dists, -1)
old_dist = tf.reduce_sum(old_sq_dists, -1)
dist = tf.sqrt(dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
old_dist = tf.sqrt(old_dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
else:
raise NotImplementedError(norm)
discounts = dist > termination_epsilon
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
tf.summary.histogram('dist', dist)
summarize_stats(stats)
bonus = tf.to_float(dist < bonus_epsilon)
dist *= reward_scales
old_dist *= reward_scales
if diff:
return bonus + offset + tf.to_float(old_dist - dist), tf.to_float(discounts)
return bonus + offset + tf.to_float(-dist), tf.to_float(discounts)
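# Editor's sketch: a numpy rendering of the default path of negative_distance
# above (L2 norm, no weighting, vectorize=False), assuming the inputs have
# already been index-selected. In the gin configs, uvf/negative_distance uses
# relative_context=True while task/negative_distance uses absolute goals over
# state dims [0, 1]. Discounts are omitted; illustrative only.
def _example_negative_distance(states, next_states, goal,
                               relative_context=False, diff=False,
                               offset=0.0, epsilon=1e-10):
  """Numpy sketch of the default L2 branch of negative_distance."""
  import numpy as np
  states, next_states, goal = (np.asarray(x, dtype=np.float32)
                               for x in (states, next_states, goal))
  if relative_context:
    goal = states + goal
  dist = np.sqrt(np.sum((next_states - goal) ** 2, axis=-1) + epsilon)
  old_dist = np.sqrt(np.sum((states - goal) ** 2, axis=-1) + epsilon)
  if diff:
    return offset + (old_dist - dist)
  return offset - dist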
@gin.configurable
def cosine_similarity(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
reward_scales=1.0,
normalize_states=True,
normalize_goals=True,
weight_index=None,
weight_vector=None,
summarize=False,
state_indices=None,
goal_indices=None,
offset=0.0):
"""Returns the cosine similarity between next_states - states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
weight_index: (integer) The context list index that specifies weight.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
normalize_states: (boolean) l2-normalize the state displacement.
normalize_goals: (boolean) l2-normalize the goal direction.
summarize: (boolean) enable summary ops.
state_indices: (a list of integers) list of state indices to select.
goal_indices: (a list of integers) list of goal indices to select.
offset: constant added to the similarity reward.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions, rewards # Unused
stats = {}
record_tensor(next_states, state_indices, stats, 'next_states')
states = index_states(states, state_indices)
next_states = index_states(next_states, state_indices)
goals = index_states(contexts[0], goal_indices)
if weight_vector is not None:
goals *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
if weight_index is not None:
weights = tf.abs(index_states(contexts[0], weight_index))
goals *= weights
direction_vec = next_states - states
if normalize_states:
direction_vec = tf.nn.l2_normalize(direction_vec, -1)
goal_vec = goals
if normalize_goals:
goal_vec = tf.nn.l2_normalize(goal_vec, -1)
similarity = tf.reduce_sum(goal_vec * direction_vec, -1)
discounts = tf.ones_like(similarity)
return offset + tf.to_float(similarity), tf.to_float(discounts)
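# Editor's sketch: the core of cosine_similarity above with default scales and
# no weighting -- the cosine between the displacement s' - s and the goal
# direction. Illustrative numpy only; the discounts (all ones) are omitted.
def _example_cosine_similarity(states, next_states, goal, offset=0.0):
  """Cosine between next_states - states and the goal direction."""
  import numpy as np
  states, next_states, goal = (np.asarray(x, dtype=np.float32)
                               for x in (states, next_states, goal))
  direction = next_states - states
  direction = direction / (np.linalg.norm(direction, axis=-1, keepdims=True)
                           + 1e-12)
  goal_dir = goal / (np.linalg.norm(goal, axis=-1, keepdims=True) + 1e-12)
  return offset + np.sum(goal_dir * direction, axis=-1)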
@gin.configurable
def diff_distance(states,
actions,
rewards,
next_states,
contexts,
state_scales=1.0,
goal_scales=1.0,
reward_scales=1.0,
weight_index=None,
weight_vector=None,
summarize=False,
termination_epsilon=1e-4,
state_indices=None,
goal_indices=None,
norm='L2',
epsilon=1e-10):
"""Returns the difference in euclidean distance between states/next_states and contexts.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
must be broadcastable to number of state dimensions.
goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
must be broadcastable to number of goal dimensions.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
weight_index: (integer) The context list index that specifies weight.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
summarize: (boolean) enable summary ops.
termination_epsilon: terminate if dist is less than this quantity.
state_indices: (a list of integers) list of state indices to select.
goal_indices: (a list of integers) list of goal indices to select.
norm: L1 or L2.
epsilon: small offset to ensure non-negative/zero distance.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del actions, rewards # Unused
stats = {}
record_tensor(next_states, state_indices, stats, 'next_states')
next_states = index_states(next_states, state_indices)
states = index_states(states, state_indices)
goals = index_states(contexts[0], goal_indices)
next_sq_dists = tf.squared_difference(next_states * state_scales,
goals * goal_scales)
sq_dists = tf.squared_difference(states * state_scales,
goals * goal_scales)
record_tensor(sq_dists, None, stats, 'sq_dists')
if weight_vector is not None:
next_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
if weight_index is not None:
next_sq_dists *= contexts[weight_index]
sq_dists *= contexts[weight_index]
if norm == 'L1':
next_dist = tf.sqrt(next_sq_dists + epsilon)
dist = tf.sqrt(sq_dists + epsilon)
next_dist = tf.reduce_sum(next_dist, -1)
dist = tf.reduce_sum(dist, -1)
elif norm == 'L2':
next_dist = tf.reduce_sum(next_sq_dists, -1)
next_dist = tf.sqrt(next_dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
dist = tf.reduce_sum(sq_dists, -1)
dist = tf.sqrt(dist + epsilon) # tf.gradients fails when tf.sqrt(-0.0)
else:
raise NotImplementedError(norm)
discounts = next_dist > termination_epsilon
if summarize:
with tf.name_scope('RewardFn/'):
tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
tf.summary.histogram('dist', dist)
summarize_stats(stats)
diff = dist - next_dist
diff *= reward_scales
return tf.to_float(diff), tf.to_float(discounts)
@gin.configurable
def binary_indicator(states,
actions,
rewards,
next_states,
contexts,
termination_epsilon=1e-4,
offset=0,
epsilon=1e-10,
state_indices=None,
summarize=False):
"""Returns 0/1 by checking if next_states and contexts overlap.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
termination_epsilon: terminate if dist is less than this quantity.
offset: Offset the rewards.
epsilon: small offset to ensure non-negative/zero distance.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions # unused args
next_states = index_states(next_states, state_indices)
dist = tf.reduce_sum(tf.squared_difference(next_states, contexts[0]), -1)
dist = tf.sqrt(dist + epsilon)
discounts = dist > termination_epsilon
rewards = tf.logical_not(discounts)
rewards = tf.to_float(rewards) + offset
return tf.to_float(rewards), tf.ones_like(tf.to_float(discounts)) #tf.to_float(discounts)
@gin.configurable
def plain_rewards(states, actions, rewards, next_states, contexts):
"""Returns the given rewards.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, next_states, contexts # Unused
return rewards, tf.ones_like(rewards)
@gin.configurable
def ctrl_rewards(states,
actions,
rewards,
next_states,
contexts,
reward_scales=1.0):
"""Returns the negative control cost.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
must be broadcastable to number of reward dimensions.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, rewards, contexts # Unused
if actions is None:
rewards = tf.to_float(tf.zeros(shape=next_states.shape[:1]))
else:
rewards = -tf.reduce_sum(tf.square(actions), axis=1)
rewards *= reward_scales
rewards = tf.to_float(rewards)
return rewards, tf.ones_like(rewards)
@gin.configurable
def diff_rewards(
states,
actions,
rewards,
next_states,
contexts,
state_indices=None,
goal_index=0,):
"""Returns (next_states - goals) as a batched vector reward."""
del states, rewards, actions # Unused
if state_indices is not None:
next_states = index_states(next_states, state_indices)
rewards = tf.to_float(next_states - contexts[goal_index])
return rewards, tf.ones_like(rewards)
@gin.configurable
def state_rewards(states,
actions,
rewards,
next_states,
contexts,
weight_index=None,
state_indices=None,
weight_vector=1.0,
offset_vector=0.0,
summarize=False):
"""Returns the rewards that are linear mapping of next_states.
Args:
states: A [batch_size, num_state_dims] Tensor representing a batch
of states.
actions: A [batch_size, num_action_dims] Tensor representing a batch
of actions.
rewards: A [batch_size] Tensor representing a batch of rewards.
next_states: A [batch_size, num_state_dims] Tensor representing a batch
of next states.
contexts: A list of [batch_size, num_context_dims] Tensor representing
a batch of contexts.
weight_index: (integer) Index of contexts lists that specify weighting.
state_indices: (a list of Numpy integer array) Indices of states dimensions
to be mapped.
weight_vector: (a number or a list or Numpy array) The weighting vector,
broadcastable to `next_states`.
offset_vector: (a number or a list or Numpy array) The offset vector.
summarize: (boolean) enable summary ops.
Returns:
A new tf.float32 [batch_size] rewards Tensor, and
tf.float32 [batch_size] discounts tensor.
"""
del states, actions, rewards # unused args
stats = {}
record_tensor(next_states, state_indices, stats)
next_states = index_states(next_states, state_indices)
weight = tf.constant(
weight_vector, dtype=next_states.dtype, shape=next_states[0].shape)
weights = tf.expand_dims(weight, 0)
offset = tf.constant(
offset_vector, dtype=next_states.dtype, shape=next_states[0].shape)
offsets = tf.expand_dims(offset, 0)
if weight_index is not None:
weights *= contexts[weight_index]
rewards = tf.to_float(tf.reduce_sum(weights * (next_states+offsets), axis=1))
if summarize:
with tf.name_scope('RewardFn/'):
summarize_stats(stats)
return rewards, tf.ones_like(rewards)
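# Editor's note: Context._make_reward_fn (context.py above) combines several of
# the reward fns defined in this file by taking a weight-scaled sum of rewards
# and the product of discounts. The numpy illustration below mirrors that
# combination; it is an editorial sketch, not part of the original file.
def _example_combine_rewards(reward_tuples, weights=None):
  """Weighted sum of rewards and product of discounts, as in _make_reward_fn."""
  import numpy as np
  rewards_list = [np.asarray(r, dtype=np.float32) for r, _ in reward_tuples]
  discounts_list = [np.asarray(d, dtype=np.float32) for _, d in reward_tuples]
  if weights is None:
    weights = [1.0] * len(rewards_list)
  combined_rewards = sum(w * r for w, r in zip(weights, rewards_list))
  combined_discounts = discounts_list[0]
  for d in discounts_list[1:]:
    combined_discounts = combined_discounts * d
  return combined_rewards, combined_discounts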
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Samplers for Contexts.
Each sampler class should define __call__(batch_size).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
slim = tf.contrib.slim
import gin.tf
@gin.configurable
class BaseSampler(object):
"""Base sampler."""
def __init__(self, context_spec, context_range=None, k=2, scope='sampler'):
"""Construct a base sampler.
Args:
context_spec: A context spec.
context_range: A tuple of (minval, maxval), where minval and maxval are
floats or Numpy arrays with the same shape as the context.
k: (integer) Subgoal dimension used by some samplers (e.g. DirectionSampler).
scope: A string denoting scope.
"""
self._context_spec = context_spec
self._context_range = context_range
self._k = k
self._scope = scope
def __call__(self, batch_size, **kwargs):
raise NotImplementedError
def set_replay(self, replay=None):
pass
def _validate_contexts(self, contexts):
"""Validate if contexts have right spec.
Args:
contexts: A [batch_size, num_contexts_dim] tensor.
Raises:
ValueError: If shape or dtype mismatches that of spec.
"""
if contexts[0].shape != self._context_spec.shape:
raise ValueError('contexts has invalid shape %s wrt spec shape %s' %
(contexts[0].shape, self._context_spec.shape))
if contexts.dtype != self._context_spec.dtype:
raise ValueError('contexts has invalid dtype %s wrt spec dtype %s' %
(contexts.dtype, self._context_spec.dtype))
@gin.configurable
class ZeroSampler(BaseSampler):
"""Zero sampler."""
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
contexts = tf.zeros(
dtype=self._context_spec.dtype,
shape=[
batch_size,
] + self._context_spec.shape.as_list())
return contexts, contexts
@gin.configurable
class BinarySampler(BaseSampler):
"""Binary sampler."""
def __init__(self, probs=0.5, *args, **kwargs):
"""Constructor."""
super(BinarySampler, self).__init__(*args, **kwargs)
self._probs = probs
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context."""
spec = self._context_spec
contexts = tf.random_uniform(
shape=[
batch_size,
] + spec.shape.as_list(), dtype=tf.float32)
contexts = tf.cast(tf.greater(contexts, self._probs), dtype=spec.dtype)
return contexts, contexts
@gin.configurable
class RandomSampler(BaseSampler):
"""Random sampler."""
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
context_range = self._context_range
if isinstance(context_range[0], (int, float)):
contexts = tf.random_uniform(
shape=[
batch_size,
] + spec.shape.as_list(),
minval=context_range[0],
maxval=context_range[1],
dtype=spec.dtype)
elif isinstance(context_range[0], (list, tuple, np.ndarray)):
assert len(spec.shape.as_list()) == 1
assert spec.shape.as_list()[0] == len(context_range[0])
assert spec.shape.as_list()[0] == len(context_range[1])
contexts = tf.concat(
[
tf.random_uniform(
shape=[
batch_size, 1,
] + spec.shape.as_list()[1:],
minval=context_range[0][i],
maxval=context_range[1][i],
dtype=spec.dtype) for i in range(spec.shape.as_list()[0])
],
axis=1)
else: raise NotImplementedError(context_range)
self._validate_contexts(contexts)
state, next_state = kwargs['state'], kwargs['next_state']
if state is not None and next_state is not None:
pass
#contexts = tf.concat(
# [tf.random_normal(tf.shape(state[:, :self._k]), dtype=tf.float64) +
# tf.random_shuffle(state[:, :self._k]),
# contexts[:, self._k:]], 1)
return contexts, contexts
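# Editor's sketch: a numpy analogue of the per-dimension branch above, for a
# context_range given as a (min_vector, max_vector) pair such as the 15-D
# subgoal bounds in the Ant configs. Illustrative only.
def _example_random_context_batch(batch_size, context_range, seed=None):
  """Uniformly samples a [batch_size, dim] batch within per-dim (min, max)."""
  import numpy as np
  rng = np.random.RandomState(seed)
  minval = np.asarray(context_range[0], dtype=np.float32)
  maxval = np.asarray(context_range[1], dtype=np.float32)
  samples = rng.uniform(minval, maxval, size=(batch_size,) + minval.shape)
  return samples.astype(np.float32)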
@gin.configurable
class ScheduledSampler(BaseSampler):
"""Scheduled sampler."""
def __init__(self,
scope='default',
values=None,
scheduler='cycle',
scheduler_params=None,
*args, **kwargs):
"""Construct sampler.
Args:
scope: Scope name.
values: A list of numbers or [num_context_dim] Numpy arrays
representing the values to cycle.
scheduler: scheduler type.
scheduler_params: scheduler parameters.
*args: arguments.
**kwargs: keyword arguments.
"""
super(ScheduledSampler, self).__init__(*args, **kwargs)
self._scope = scope
self._values = values
self._scheduler = scheduler
self._scheduler_params = scheduler_params or {}
assert self._values is not None and len(
self._values), 'must provide non-empty values.'
self._n = len(self._values)
# TODO(shanegu): move variable creation outside. resolve tf.cond problem.
self._count = 0
self._i = tf.Variable(
tf.zeros(shape=(), dtype=tf.int32),
name='%s-scheduled_sampler_%d' % (self._scope, self._count))
self._values = tf.constant(self._values, dtype=self._context_spec.dtype)
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
next_op = self._next(self._i)
with tf.control_dependencies([next_op]):
value = self._values[self._i]
if value.get_shape().as_list():
values = tf.tile(
tf.expand_dims(value, 0), (batch_size,) + (1,) * spec.shape.ndims)
else:
values = value + tf.zeros(
shape=[
batch_size,
] + spec.shape.as_list(), dtype=spec.dtype)
self._validate_contexts(values)
self._count += 1
return values, values
def _next(self, i):
"""Return op that increments pointer to next value.
Args:
i: A tensorflow integer variable.
Returns:
Op that increments pointer.
"""
if self._scheduler == 'cycle':
inc = ('inc' in self._scheduler_params and
self._scheduler_params['inc']) or 1
return tf.assign(i, tf.mod(i+inc, self._n))
else:
raise NotImplementedError(self._scheduler)
@gin.configurable
class ReplaySampler(BaseSampler):
"""Replay sampler."""
def __init__(self,
prefetch_queue_capacity=2,
override_indices=None,
state_indices=None,
*args,
**kwargs):
"""Construct sampler.
Args:
prefetch_queue_capacity: Capacity for prefetch queue.
override_indices: Override indices.
state_indices: Select certain indices from state dimension.
*args: arguments.
**kwargs: keyword arguments.
"""
super(ReplaySampler, self).__init__(*args, **kwargs)
self._prefetch_queue_capacity = prefetch_queue_capacity
self._override_indices = override_indices
self._state_indices = state_indices
def set_replay(self, replay):
"""Set replay.
Args:
replay: A replay buffer.
"""
self._replay = replay
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
batch = self._replay.GetRandomBatch(batch_size)
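    # Assumption (not documented in the original): GetRandomBatch is expected to
    # return the stored transition tuple as
    # (states, actions, rewards, discounts, next_states, ...), so index 4
    # selects the next-state tensor used as the sampled context.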
next_states = batch[4]
if self._prefetch_queue_capacity > 0:
batch_queue = slim.prefetch_queue.prefetch_queue(
[next_states],
capacity=self._prefetch_queue_capacity,
name='%s/batch_context_queue' % self._scope)
next_states = batch_queue.dequeue()
if self._override_indices is not None:
      assert self._context_range is not None and isinstance(
          self._context_range[0], (int, float))
next_states = tf.concat(
[
tf.random_uniform(
shape=next_states[:, :1].shape,
minval=self._context_range[0],
maxval=self._context_range[1],
dtype=next_states.dtype)
if i in self._override_indices else next_states[:, i:i + 1]
for i in range(self._context_spec.shape.as_list()[0])
],
axis=1)
if self._state_indices is not None:
next_states = tf.concat(
[
next_states[:, i:i + 1]
for i in range(self._context_spec.shape.as_list()[0])
],
axis=1)
self._validate_contexts(next_states)
return next_states, next_states
@gin.configurable
class TimeSampler(BaseSampler):
"""Time Sampler."""
def __init__(self, minval=0, maxval=1, timestep=-1, *args, **kwargs):
"""Construct sampler.
Args:
minval: Min value integer.
maxval: Max value integer.
timestep: Time step between states and next_states.
*args: arguments.
**kwargs: keyword arguments.
"""
super(TimeSampler, self).__init__(*args, **kwargs)
assert self._context_spec.shape.as_list() == [1]
self._minval = minval
self._maxval = maxval
self._timestep = timestep
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
if self._maxval == self._minval:
contexts = tf.constant(
self._maxval, shape=[batch_size, 1], dtype=tf.int32)
else:
contexts = tf.random_uniform(
shape=[batch_size, 1],
dtype=tf.int32,
maxval=self._maxval,
minval=self._minval)
next_contexts = tf.maximum(contexts + self._timestep, 0)
return tf.cast(
contexts, dtype=self._context_spec.dtype), tf.cast(
next_contexts, dtype=self._context_spec.dtype)
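# Illustrative note (not part of the original file): with e.g. minval=0,
# maxval=100 and timestep=-1, each sample is an integer "remaining time"
#
#   contexts      ~ Uniform{0, ..., 99}        (shape [batch_size, 1])
#   next_contexts = max(contexts - 1, 0)
#
# cast to the context spec's dtype; when maxval == minval the value is constant.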
@gin.configurable
class ConstantSampler(BaseSampler):
"""Constant sampler."""
def __init__(self, value=None, *args, **kwargs):
"""Construct sampler.
Args:
value: A list or Numpy array for values of the constant.
*args: arguments.
**kwargs: keyword arguments.
"""
super(ConstantSampler, self).__init__(*args, **kwargs)
self._value = value
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
value_ = tf.constant(self._value, shape=spec.shape, dtype=spec.dtype)
values = tf.tile(
tf.expand_dims(value_, 0), (batch_size,) + (1,) * spec.shape.ndims)
self._validate_contexts(values)
return values, values
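# Illustrative sketch (hypothetical gin binding, not one of the shipped
# configs): a ConstantSampler can be pinned to a fixed goal, e.g.
#
#   my_eval/ConstantSampler.value = [0.0, 16.0]
#
# after which every sampled batch is that goal tiled batch_size times.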
@gin.configurable
class DirectionSampler(RandomSampler):
"""Direction sampler."""
def __call__(self, batch_size, **kwargs):
"""Sample a batch of context.
Args:
batch_size: Batch size.
Returns:
Two [batch_size, num_context_dims] tensors.
"""
spec = self._context_spec
context_range = self._context_range
if isinstance(context_range[0], (int, float)):
contexts = tf.random_uniform(
shape=[
batch_size,
] + spec.shape.as_list(),
minval=context_range[0],
maxval=context_range[1],
dtype=spec.dtype)
elif isinstance(context_range[0], (list, tuple, np.ndarray)):
assert len(spec.shape.as_list()) == 1
assert spec.shape.as_list()[0] == len(context_range[0])
assert spec.shape.as_list()[0] == len(context_range[1])
contexts = tf.concat(
[
tf.random_uniform(
shape=[
batch_size, 1,
] + spec.shape.as_list()[1:],
minval=context_range[0][i],
maxval=context_range[1][i],
dtype=spec.dtype) for i in range(spec.shape.as_list()[0])
],
axis=1)
else: raise NotImplementedError(context_range)
self._validate_contexts(contexts)
if 'sampler_fn' in kwargs:
other_contexts = kwargs['sampler_fn']()
else:
other_contexts = contexts
state, next_state = kwargs['state'], kwargs['next_state']
if state is not None and next_state is not None:
my_context_range = (np.array(context_range[1]) - np.array(context_range[0])) / 2 * np.ones(spec.shape.as_list())
contexts = tf.concat(
[0.1 * my_context_range[:self._k] *
tf.random_normal(tf.shape(state[:, :self._k]), dtype=state.dtype) +
tf.random_shuffle(state[:, :self._k]) - state[:, :self._k],
other_contexts[:, self._k:]], 1)
#contexts = tf.Print(contexts,
# [contexts, tf.reduce_max(contexts, 0),
# tf.reduce_min(state, 0), tf.reduce_max(state, 0)], 'contexts', summarize=15)
      next_contexts = tf.concat(
          [state[:, :self._k] + contexts[:, :self._k] - next_state[:, :self._k],
           other_contexts[:, self._k:]], 1)
      # Override the relative update above: the same sampled context is reused
      # for the next step as well.
      next_contexts = contexts
else:
next_contexts = contexts
return tf.stop_gradient(contexts), tf.stop_gradient(next_contexts)
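# Illustrative summary (not part of the original file): when state and
# next_state are provided, the first k context dimensions are resampled as a
# noisy displacement towards another state in the batch,
#
#   g = shuffle(s)[:k] + 0.1 * half_range * noise - s[:k]
#
# the relative update g' = s[:k] + g - s'[:k] is computed, and then overridden
# so that the same g is reused at the next step.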
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Random policy on an environment."""
import tensorflow as tf
import numpy as np
import random
import create_maze_env
app = tf.app
flags = tf.flags
logging = tf.logging
FLAGS = flags.FLAGS
flags.DEFINE_string('env', 'AntMaze', 'environment name: AntMaze, AntPush, or AntFall')
flags.DEFINE_integer('episode_length', 500, 'episode length')
flags.DEFINE_integer('num_episodes', 50, 'number of episodes')
def get_goal_sample_fn(env_name):
if env_name == 'AntMaze':
    # NOTE: When evaluating (i.e., for the metrics reported in the paper),
    # we use the commented-out goal sampling function below. The uncommented
    # one is only used for training.
#return lambda: np.array([0., 16.])
return lambda: np.random.uniform((-4, -4), (20, 20))
elif env_name == 'AntPush':
return lambda: np.array([0., 19.])
elif env_name == 'AntFall':
return lambda: np.array([0., 27., 4.5])
else:
assert False, 'Unknown env'
def get_reward_fn(env_name):
if env_name == 'AntMaze':
return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5
elif env_name == 'AntPush':
return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5
elif env_name == 'AntFall':
return lambda obs, goal: -np.sum(np.square(obs[:3] - goal)) ** 0.5
else:
assert False, 'Unknown env'
def success_fn(last_reward):
return last_reward > -5.0
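# Illustrative note (not part of the original file): because the reward is the
# negative Euclidean distance to the goal, success_fn marks an episode as
# successful when the final step ends within 5 distance units of the goal.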
class EnvWithGoal(object):
def __init__(self, base_env, env_name):
self.base_env = base_env
self.goal_sample_fn = get_goal_sample_fn(env_name)
self.reward_fn = get_reward_fn(env_name)
self.goal = None
def reset(self):
obs = self.base_env.reset()
self.goal = self.goal_sample_fn()
return np.concatenate([obs, self.goal])
def step(self, a):
obs, _, done, info = self.base_env.step(a)
reward = self.reward_fn(obs, self.goal)
return np.concatenate([obs, self.goal]), reward, done, info
@property
def action_space(self):
return self.base_env.action_space
def run_environment(env_name, episode_length, num_episodes):
env = EnvWithGoal(
create_maze_env.create_maze_env(env_name),
env_name)
def action_fn(obs):
action_space = env.action_space
action_space_mean = (action_space.low + action_space.high) / 2.0
action_space_magn = (action_space.high - action_space.low) / 2.0
random_action = (action_space_mean +
action_space_magn *
np.random.uniform(low=-1.0, high=1.0,
size=action_space.shape))
return random_action
rewards = []
successes = []
for ep in range(num_episodes):
rewards.append(0.0)
successes.append(False)
obs = env.reset()
for _ in range(episode_length):
obs, reward, done, _ = env.step(action_fn(obs))
rewards[-1] += reward
successes[-1] = success_fn(reward)
if done:
break
logging.info('Episode %d reward: %.2f, Success: %d', ep + 1, rewards[-1], successes[-1])
logging.info('Average Reward over %d episodes: %.2f',
num_episodes, np.mean(rewards))
logging.info('Average Success over %d episodes: %.2f',
num_episodes, np.mean(successes))
def main(unused_argv):
logging.set_verbosity(logging.INFO)
run_environment(FLAGS.env, FLAGS.episode_length, FLAGS.num_episodes)
if __name__ == '__main__':
app.run()
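# Example invocation (the script name is an assumption; use the actual file
# name in the repository):
#
#   python random_policy.py --env=AntPush --episode_length=500 --num_episodes=10
#
# This rolls out a uniform-random policy and logs per-episode reward plus the
# average reward and success rate over all episodes.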
@@ -21,8 +21,21 @@ from gym import utils
from gym.envs.mujoco import mujoco_env
def q_inv(a):
return [a[0], -a[1], -a[2], -a[3]]
def q_mult(a, b):  # Multiply two quaternions.
w = a[0] * b[0] - a[1] * b[1] - a[2] * b[2] - a[3] * b[3]
i = a[0] * b[1] + a[1] * b[0] + a[2] * b[3] - a[3] * b[2]
j = a[0] * b[2] - a[1] * b[3] + a[2] * b[0] + a[3] * b[1]
k = a[0] * b[3] + a[1] * b[2] - a[2] * b[1] + a[3] * b[0]
return [w, i, j, k]
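# Illustrative sketch (not part of the original file): rotating a vector v by a
# unit quaternion q uses the sandwich product q * v * q^{-1}, with v embedded as
# the pure quaternion [0, vx, vy, vz]:
#
#   rotated = q_mult(q_mult(q, [0, vx, vy, vz]), q_inv(q))[1:]
#
# get_ori below applies this to the body-frame x-axis [0, 1, 0, 0] and keeps
# the x/y components to recover the heading angle via atan2.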
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "ant.xml"
ORI_IND = 3
def __init__(self, file_path=None, expose_all_qpos=True,
expose_body_coms=None, expose_body_comvels=None):
@@ -101,3 +114,21 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
def viewer_setup(self):
self.viewer.cam.distance = self.model.stat.extent * 0.5
def get_ori(self):
ori = [0, 1, 0, 0]
rot = self.model.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND + 4] # take the quaternion
ori = q_mult(q_mult(rot, ori), q_inv(rot))[1:3] # project onto x-y plane
ori = math.atan2(ori[1], ori[0])
return ori
def set_xy(self, xy):
qpos = np.copy(self.physics.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.physics.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
return self.physics.data.qpos[:2]
@@ -13,8 +13,8 @@
# limitations under the License.
# ==============================================================================
from maze_env import MazeEnv
from ant import AntEnv
from environments.maze_env import MazeEnv
from environments.ant import AntEnv
class AntMazeEnv(MazeEnv):
@@ -13,18 +13,85 @@
# limitations under the License.
# ==============================================================================
from ant_maze_env import AntMazeEnv
from environments.ant_maze_env import AntMazeEnv
from environments.point_maze_env import PointMazeEnv
import tensorflow as tf
import gin.tf
from tf_agents.environments import gym_wrapper
from tf_agents.environments import tf_py_environment
@gin.configurable
def create_maze_env(env_name=None, top_down_view=False):
n_bins = 0
manual_collision = False
if env_name.startswith('Ego'):
n_bins = 8
env_name = env_name[3:]
if env_name.startswith('Ant'):
cls = AntMazeEnv
env_name = env_name[3:]
maze_size_scaling = 8
elif env_name.startswith('Point'):
cls = PointMazeEnv
manual_collision = True
env_name = env_name[5:]
maze_size_scaling = 4
else:
assert False, 'unknown env %s' % env_name
def create_maze_env(env_name=None):
maze_id = None
if env_name.startswith('AntMaze'):
observe_blocks = False
put_spin_near_agent = False
if env_name == 'Maze':
maze_id = 'Maze'
elif env_name.startswith('AntPush'):
elif env_name == 'Push':
maze_id = 'Push'
elif env_name.startswith('AntFall'):
elif env_name == 'Fall':
maze_id = 'Fall'
elif env_name == 'Block':
maze_id = 'Block'
put_spin_near_agent = True
observe_blocks = True
elif env_name == 'BlockMaze':
maze_id = 'BlockMaze'
put_spin_near_agent = True
observe_blocks = True
else:
raise ValueError('Unknown maze environment %s' % env_name)
return AntMazeEnv(maze_id=maze_id)
gym_mujoco_kwargs = {
'maze_id': maze_id,
'n_bins': n_bins,
'observe_blocks': observe_blocks,
'put_spin_near_agent': put_spin_near_agent,
'top_down_view': top_down_view,
'manual_collision': manual_collision,
'maze_size_scaling': maze_size_scaling
}
gym_env = cls(**gym_mujoco_kwargs)
gym_env.reset()
wrapped_env = gym_wrapper.GymWrapper(gym_env)
return wrapped_env
class TFPyEnvironment(tf_py_environment.TFPyEnvironment):
def __init__(self, *args, **kwargs):
super(TFPyEnvironment, self).__init__(*args, **kwargs)
def start_collect(self):
pass
def current_obs(self):
time_step = self.current_time_step()
    return time_step.observation[0]  # Strip the leading batch dimension added by the batched TF environment.
def step(self, actions):
actions = tf.expand_dims(actions, 0)
next_step = super(TFPyEnvironment, self).step(actions)
return next_step.is_last()[0], next_step.reward[0], next_step.discount[0]
def reset(self):
return super(TFPyEnvironment, self).reset()
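# Illustrative note (assumed usage, not from the original file): the wrapper
# above strips the leading batch dimension added by the batched TF environment,
# so a single-environment training loop can use it as
#
#   done, reward, discount = tf_env.step(action)   # scalars, not [1]-tensors
#   obs = tf_env.current_obs()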
@@ -22,7 +22,7 @@ import math
import numpy as np
import gym
import maze_env_utils
from environments import maze_env_utils
# Directory that contains mujoco xml files.
MODEL_DIR = 'environments/assets'
@@ -39,6 +39,13 @@ class MazeEnv(gym.Env):
maze_id=None,
maze_height=0.5,
maze_size_scaling=8,
n_bins=0,
sensor_range=3.,
sensor_span=2 * math.pi,
observe_blocks=False,
put_spin_near_agent=False,
top_down_view=False,
manual_collision=False,
*args,
**kwargs):
self._maze_id = maze_id
@@ -52,6 +59,14 @@
self.MAZE_HEIGHT = height = maze_height
self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling
self._n_bins = n_bins
self._sensor_range = sensor_range * size_scaling
self._sensor_span = sensor_span
self._observe_blocks = observe_blocks
self._put_spin_near_agent = put_spin_near_agent
self._top_down_view = top_down_view
self._manual_collision = manual_collision
self.MAZE_STRUCTURE = structure = maze_env_utils.construct_maze(maze_id=self._maze_id)
self.elevated = any(-1 in row for row in structure) # Elevate the maze to allow for falling.
self.blocks = any(
@@ -61,6 +76,13 @@
torso_x, torso_y = self._find_robot()
self._init_torso_x = torso_x
self._init_torso_y = torso_y
self._init_positions = [
(x - torso_x, y - torso_y)
for x, y in self._find_all_robots()]
self._xy_to_rowcol = lambda x, y: (2 + (y + size_scaling / 2) / size_scaling,
2 + (x + size_scaling / 2) / size_scaling)
self._view = np.zeros([5, 5, 3]) # walls (immovable), chasms (fall), movable blocks
height_offset = 0.
if self.elevated:
@@ -74,9 +96,13 @@
default = tree.find(".//default")
default.find('.//geom').set('solimp', '.995 .995 .01')
self.movable_blocks = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if self.elevated and structure[i][j] not in [-1]:
struct = structure[i][j]
if struct == 'r' and self._put_spin_near_agent:
struct = maze_env_utils.Move.SpinXY
if self.elevated and struct not in [-1]:
# Create elevated platform.
ET.SubElement(
worldbody, "geom",
@@ -93,7 +119,7 @@
conaffinity="1",
rgba="0.9 0.9 0.9 1",
)
if structure[i][j] == 1: # Unmovable block.
if struct == 1: # Unmovable block.
# Offset all coordinates so that robot starts at the origin.
ET.SubElement(
worldbody, "geom",
@@ -111,26 +137,32 @@
conaffinity="1",
rgba="0.4 0.4 0.4 1",
)
elif maze_env_utils.can_move(structure[i][j]): # Movable block.
elif maze_env_utils.can_move(struct): # Movable block.
# The "falling" blocks are shrunk slightly and increased in mass to
# ensure that it can fall easily through a gap in the platform blocks.
falling = maze_env_utils.can_move_z(structure[i][j])
shrink = 0.99 if falling else 1.0
moveable_body = ET.SubElement(
name = "movable_%d_%d" % (i, j)
self.movable_blocks.append((name, struct))
falling = maze_env_utils.can_move_z(struct)
spinning = maze_env_utils.can_spin(struct)
x_offset = 0.25 * size_scaling if spinning else 0.0
y_offset = 0.0
shrink = 0.1 if spinning else 0.99 if falling else 1.0
height_shrink = 0.1 if spinning else 1.0
movable_body = ET.SubElement(
worldbody, "body",
name="moveable_%d_%d" % (i, j),
pos="%f %f %f" % (j * size_scaling - torso_x,
i * size_scaling - torso_y,
name=name,
pos="%f %f %f" % (j * size_scaling - torso_x + x_offset,
i * size_scaling - torso_y + y_offset,
height_offset +
height / 2 * size_scaling),
height / 2 * size_scaling * height_shrink),
)
ET.SubElement(
moveable_body, "geom",
movable_body, "geom",
name="block_%d_%d" % (i, j),
pos="0 0 0",
size="%f %f %f" % (0.5 * size_scaling * shrink,
0.5 * size_scaling * shrink,
height / 2 * size_scaling),
height / 2 * size_scaling * height_shrink),
type="box",
material="",
mass="0.001" if falling else "0.0002",
@@ -138,45 +170,56 @@
conaffinity="1",
rgba="0.9 0.1 0.1 1"
)
if maze_env_utils.can_move_x(structure[i][j]):
if maze_env_utils.can_move_x(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="1 0 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="moveable_x_%d_%d" % (i, j),
name="movable_x_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_y(structure[i][j]):
if maze_env_utils.can_move_y(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="0 1 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="moveable_y_%d_%d" % (i, j),
name="movable_y_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_z(structure[i][j]):
if maze_env_utils.can_move_z(struct):
ET.SubElement(
moveable_body, "joint",
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="true",
range="%f 0" % (-height_offset),
margin="0.01",
name="moveable_z_%d_%d" % (i, j),
name="movable_z_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_spin(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="false",
name="spinable_%d_%d" % (i, j),
pos="0 0 0",
type="ball"
)
torso = tree.find(".//body[@name='torso']")
geoms = torso.findall(".//geom")
@@ -190,13 +233,203 @@
self.wrapped_env = model_cls(*args, file_path=file_path, **kwargs)
def get_ori(self):
return self.wrapped_env.get_ori()
def get_top_down_view(self):
self._view = np.zeros_like(self._view)
def valid(row, col):
return self._view.shape[0] > row >= 0 and self._view.shape[1] > col >= 0
def update_view(x, y, d, row=None, col=None):
if row is None or col is None:
x = x - self._robot_x
y = y - self._robot_y
th = self._robot_ori
row, col = self._xy_to_rowcol(x, y)
update_view(x, y, d, row=row, col=col)
return
row, row_frac, col, col_frac = int(row), row % 1, int(col), col % 1
if row_frac < 0:
row_frac += 1
if col_frac < 0:
col_frac += 1
if valid(row, col):
self._view[row, col, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row - 1, col):
self._view[row - 1, col, d] += (
(max(0., 0.5 - row_frac)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row + 1, col):
self._view[row + 1, col, d] += (
(max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row, col - 1):
self._view[row, col - 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., 0.5 - col_frac)))
if valid(row, col + 1):
self._view[row, col + 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., col_frac - 0.5)))
if valid(row - 1, col - 1):
self._view[row - 1, col - 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., 0.5 - col_frac))
if valid(row - 1, col + 1):
self._view[row - 1, col + 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., col_frac - 0.5))
if valid(row + 1, col + 1):
self._view[row + 1, col + 1, d] += (
(max(0., row_frac - 0.5)) * max(0., col_frac - 0.5))
if valid(row + 1, col - 1):
self._view[row + 1, col - 1, d] += (
(max(0., row_frac - 0.5)) * max(0., 0.5 - col_frac))
# Draw ant.
robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2]
self._robot_x = robot_x
self._robot_y = robot_y
self._robot_ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
# Draw immovable blocks and chasms.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1: # Wall.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
0)
if structure[i][j] == -1: # Chasm.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
1)
# Draw movable blocks.
for block_name, block_type in self.movable_blocks:
block_x, block_y = self.wrapped_env.get_body_com(block_name)[:2]
update_view(block_x, block_y, 2)
return self._view
def get_range_sensor_obs(self):
"""Returns egocentric range sensor observations of maze."""
robot_x, robot_y, robot_z = self.wrapped_env.get_body_com("torso")[:3]
ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
segments = []
# Get line segments (corresponding to outer boundary) of each immovable
# block or drop-off.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] in [1, -1]: # There's a wall or drop-off.
cx = j * size_scaling - self._init_torso_x
cy = i * size_scaling - self._init_torso_y
x1 = cx - 0.5 * size_scaling
x2 = cx + 0.5 * size_scaling
y1 = cy - 0.5 * size_scaling
y2 = cy + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=structure[i][j],
))
# Get line segments (corresponding to outer boundary) of each movable
# block within the agent's z-view.
for block_name, block_type in self.movable_blocks:
block_x, block_y, block_z = self.wrapped_env.get_body_com(block_name)[:3]
if (block_z + height * size_scaling / 2 >= robot_z and
robot_z >= block_z - height * size_scaling / 2): # Block in view.
x1 = block_x - 0.5 * size_scaling
x2 = block_x + 0.5 * size_scaling
y1 = block_y - 0.5 * size_scaling
y2 = block_y + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=block_type,
))
sensor_readings = np.zeros((self._n_bins, 3)) # 3 for wall, drop-off, block
for ray_idx in range(self._n_bins):
ray_ori = (ori - self._sensor_span * 0.5 +
(2 * ray_idx + 1.0) / (2 * self._n_bins) * self._sensor_span)
ray_segments = []
# Get all segments that intersect with ray.
for seg in segments:
p = maze_env_utils.ray_segment_intersect(
ray=((robot_x, robot_y), ray_ori),
segment=seg["segment"])
if p is not None:
ray_segments.append(dict(
segment=seg["segment"],
type=seg["type"],
ray_ori=ray_ori,
distance=maze_env_utils.point_distance(p, (robot_x, robot_y)),
))
if len(ray_segments) > 0:
# Find out which segment is intersected first.
first_seg = sorted(ray_segments, key=lambda x: x["distance"])[0]
seg_type = first_seg["type"]
idx = (0 if seg_type == 1 else # Wall.
1 if seg_type == -1 else # Drop-off.
2 if maze_env_utils.can_move(seg_type) else # Block.
None)
if first_seg["distance"] <= self._sensor_range:
sensor_readings[ray_idx][idx] = (self._sensor_range - first_seg["distance"]) / self._sensor_range
return sensor_readings
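  # Illustrative note (not part of the original file): each of the n_bins rays
  # yields a 3-vector over (wall, drop-off, movable block); for the first
  # segment hit within range the reading is
  #
  #   (sensor_range - distance) / sensor_range
  #
  # i.e. 1.0 at contact, decaying linearly to 0.0 at the sensor range.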
def _get_obs(self):
return np.concatenate([self.wrapped_env._get_obs(),
[self.t * 0.001]])
wrapped_obs = self.wrapped_env._get_obs()
if self._top_down_view:
view = [self.get_top_down_view().flat]
else:
view = []
if self._observe_blocks:
additional_obs = []
for block_name, block_type in self.movable_blocks:
additional_obs.append(self.wrapped_env.get_body_com(block_name))
wrapped_obs = np.concatenate([wrapped_obs[:3]] + additional_obs +
[wrapped_obs[3:]])
range_sensor_obs = self.get_range_sensor_obs()
return np.concatenate([wrapped_obs,
range_sensor_obs.flat] +
view + [[self.t * 0.001]])
def reset(self):
self.t = 0
self.trajectory = []
self.wrapped_env.reset()
if len(self._init_positions) > 1:
xy = random.choice(self._init_positions)
self.wrapped_env.set_xy(xy)
return self._get_obs()
@property
@@ -226,9 +459,41 @@
return j * size_scaling, i * size_scaling
assert False, 'No robot in maze specification.'
def _find_all_robots(self):
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
coords = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 'r':
coords.append((j * size_scaling, i * size_scaling))
return coords
def _is_in_collision(self, pos):
x, y = pos
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1:
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
if minx <= x <= maxx and miny <= y <= maxy:
return True
return False
def step(self, action):
self.t += 1
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
if self._manual_collision:
old_pos = self.wrapped_env.get_xy()
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
new_pos = self.wrapped_env.get_xy()
if self._is_in_collision(new_pos):
self.wrapped_env.set_xy(old_pos)
else:
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(action)
next_obs = self._get_obs()
done = False
return next_obs, inner_reward, done, info
@@ -26,20 +26,27 @@ class Move(object):
XZ = 15
YZ = 16
XYZ = 17
SpinXY = 18
def can_move_x(movable):
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ]
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ,
Move.SpinXY]
def can_move_y(movable):
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ]
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ,
Move.SpinXY]
def can_move_z(movable):
return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ]
def can_spin(movable):
return movable in [Move.SpinXY]
def can_move(movable):
return can_move_x(movable) or can_move_y(movable) or can_move_z(movable)
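# Illustrative examples (not part of the original file):
#
#   can_move_x(Move.XY)    -> True
#   can_move_z(Move.XY)    -> False
#   can_spin(Move.SpinXY)  -> True
#   can_move(Move.SpinXY)  -> True   (spinning blocks also translate in x/y)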
@@ -70,7 +77,88 @@ def construct_maze(maze_id='Maze'):
[1, 0, 0, 1],
[1, 1, 1, 1],
]
elif maze_id == 'Block':
O = 'r'
structure = [
[1, 1, 1, 1, 1],
[1, O, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'BlockMaze':
O = 'r'
structure = [
[1, 1, 1, 1],
[1, O, 0, 1],
[1, 1, 0, 1],
[1, 0, 0, 1],
[1, 1, 1, 1],
]
else:
raise NotImplementedError('The provided MazeId %s is not recognized' % maze_id)
return structure
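# Legend for the structures above (inferred from their use in maze_env.py):
# 1 marks an immovable wall, 0 free space, 'r' the robot start position,
# -1 a chasm the robot can fall into, and Move.* codes mark movable blocks.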
def line_intersect(pt1, pt2, ptA, ptB):
"""
Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html
this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB)
"""
DET_TOLERANCE = 0.00000001
# the first line is pt1 + r*(pt2-pt1)
# in component form:
x1, y1 = pt1
x2, y2 = pt2
dx1 = x2 - x1
dy1 = y2 - y1
# the second line is ptA + s*(ptB-ptA)
x, y = ptA
xB, yB = ptB
dx = xB - x
dy = yB - y
DET = (-dx1 * dy + dy1 * dx)
if math.fabs(DET) < DET_TOLERANCE: return (0, 0, 0, 0, 0)
# now, the determinant should be OK
DETinv = 1.0 / DET
# find the scalar amount along the "self" segment
r = DETinv * (-dy * (x - x1) + dx * (y - y1))
# find the scalar amount along the input line
s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1))
# return the average of the two descriptions
xi = (x1 + r * dx1 + x + s * dx) / 2.0
yi = (y1 + r * dy1 + y + s * dy) / 2.0
return (xi, yi, 1, r, s)
def ray_segment_intersect(ray, segment):
"""
Check if the ray originated from (x, y) with direction theta intersects the line segment (x1, y1) -- (x2, y2),
and return the intersection point if there is one
"""
(x, y), theta = ray
# (x1, y1), (x2, y2) = segment
pt1 = (x, y)
  ray_len = 1  # Arbitrary positive length; the test below only needs the ray parameter r >= 0.
  pt2 = (x + ray_len * math.cos(theta), y + ray_len * math.sin(theta))
xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment)
if valid and r >= 0 and 0 <= s <= 1:
return (xo, yo)
return None
def point_distance(p1, p2):
x1, y1 = p1
x2, y2 = p2
return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
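# Worked example (not part of the original file): a ray from the origin along
# the +x axis hits the vertical segment x = 1 at (1.0, 0.0):
#
#   ray_segment_intersect(ray=((0.0, 0.0), 0.0),
#                         segment=((1.0, -1.0), (1.0, 1.0)))  -> (1.0, 0.0)
#   point_distance((0.0, 0.0), (1.0, 0.0))                    -> 1.0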
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper for creating the ant environment in gym_mujoco."""
import math
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "point.xml"
ORI_IND = 2
def __init__(self, file_path=None, expose_all_qpos=True):
self._expose_all_qpos = expose_all_qpos
mujoco_env.MujocoEnv.__init__(self, file_path, 1)
utils.EzPickle.__init__(self)
@property
def physics(self):
return self.model
def _step(self, a):
return self.step(a)
def step(self, action):
action[0] = 0.2 * action[0]
qpos = np.copy(self.physics.data.qpos)
qpos[2] += action[1]
ori = qpos[2]
# compute increment in each direction
dx = math.cos(ori) * action[0]
dy = math.sin(ori) * action[0]
# ensure that the robot is within reasonable range
qpos[0] = np.clip(qpos[0] + dx, -100, 100)
qpos[1] = np.clip(qpos[1] + dy, -100, 100)
qvel = self.physics.data.qvel
self.set_state(qpos, qvel)
for _ in range(0, self.frame_skip):
self.physics.step()
next_obs = self._get_obs()
reward = 0
done = False
info = {}
return next_obs, reward, done, info
def _get_obs(self):
if self._expose_all_qpos:
return np.concatenate([
self.physics.data.qpos.flat[:3], # Only point-relevant coords.
self.physics.data.qvel.flat[:3]])
return np.concatenate([
self.physics.data.qpos.flat[2:3],
self.physics.data.qvel.flat[:3]])
def reset_model(self):
qpos = self.init_qpos + self.np_random.uniform(
size=self.physics.model.nq, low=-.1, high=.1)
qvel = self.init_qvel + self.np_random.randn(self.physics.model.nv) * .1
# Set everything other than point to original position and 0 velocity.
qpos[3:] = self.init_qpos[3:]
qvel[3:] = 0.
self.set_state(qpos, qvel)
return self._get_obs()
def get_ori(self):
return self.model.data.qpos[self.__class__.ORI_IND]
def set_xy(self, xy):
qpos = np.copy(self.physics.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.physics.data.qvel
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from environments.maze_env import MazeEnv
from environments.point import PointEnv
class PointMazeEnv(MazeEnv):
MODEL_CLASS = PointEnv