Commit 27b4acd4 authored by Aman Gupta

Merge remote-tracking branch 'upstream/master'

parents 5133522f d4e1f97f
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A module with utility functions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def trajectory_to_deltas(trajectory, state):
"""Computes a sequence of deltas of a state to traverse a trajectory in 2D.
The initial state of the agent contains its pose -- location in 2D and
orientation. When the computed deltas are incrementally added to it, it
traverses the specified trajectory while keeping its orientation parallel to
the trajectory.
Args:
trajectory: a np.array of floats of shape n x 2. The n-th row contains the
n-th point.
state: a 3 element np.array of floats containing agent's location and
orientation in radians.
Returns:
A np.array of floats of size n x 3.
"""
state = np.reshape(state, [-1])
init_xy = state[0:2]
init_theta = state[2]
delta_xy = trajectory - np.concatenate(
[np.reshape(init_xy, [1, 2]), trajectory[:-1, :]], axis=0)
thetas = np.reshape(np.arctan2(delta_xy[:, 1], delta_xy[:, 0]), [-1, 1])
thetas = np.concatenate([np.reshape(init_theta, [1, 1]), thetas], axis=0)
delta_thetas = thetas[1:] - thetas[:-1]
deltas = np.concatenate([delta_xy, delta_thetas], axis=1)
return deltas
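A minimal usage sketch (the trajectory and initial state below are illustrative, not from the source): starting at the origin facing along +x, the deltas for a two-point trajectory are one unit along x with no turn, then one unit along y with a +pi/2 change in heading.
if __name__ == '__main__':
  example_trajectory = np.array([[1.0, 0.0], [1.0, 1.0]])
  example_state = np.array([0.0, 0.0, 0.0])  # x, y, theta
  # Expected output: [[1., 0., 0.], [0., 1., pi/2]].
  print(trajectory_to_deltas(example_trajectory, example_state))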
item {
name: "/m/01g317"
id: 1
display_name: "person"
}
item {
name: "/m/0199g"
id: 2
display_name: "bicycle"
}
item {
name: "/m/0k4j"
id: 3
display_name: "car"
}
item {
name: "/m/04_sv"
id: 4
display_name: "motorcycle"
}
item {
name: "/m/05czz6l"
id: 5
display_name: "airplane"
}
item {
name: "/m/01bjv"
id: 6
display_name: "bus"
}
item {
name: "/m/07jdr"
id: 7
display_name: "train"
}
item {
name: "/m/07r04"
id: 8
display_name: "truck"
}
item {
name: "/m/019jd"
id: 9
display_name: "boat"
}
item {
name: "/m/015qff"
id: 10
display_name: "traffic light"
}
item {
name: "/m/01pns0"
id: 11
display_name: "fire hydrant"
}
item {
name: "/m/02pv19"
id: 13
display_name: "stop sign"
}
item {
name: "/m/015qbp"
id: 14
display_name: "parking meter"
}
item {
name: "/m/0cvnqh"
id: 15
display_name: "bench"
}
item {
name: "/m/015p6"
id: 16
display_name: "bird"
}
item {
name: "/m/01yrx"
id: 17
display_name: "cat"
}
item {
name: "/m/0bt9lr"
id: 18
display_name: "dog"
}
item {
name: "/m/03k3r"
id: 19
display_name: "horse"
}
item {
name: "/m/07bgp"
id: 20
display_name: "sheep"
}
item {
name: "/m/01xq0k1"
id: 21
display_name: "cow"
}
item {
name: "/m/0bwd_0j"
id: 22
display_name: "elephant"
}
item {
name: "/m/01dws"
id: 23
display_name: "bear"
}
item {
name: "/m/0898b"
id: 24
display_name: "zebra"
}
item {
name: "/m/03bk1"
id: 25
display_name: "giraffe"
}
item {
name: "/m/01940j"
id: 27
display_name: "backpack"
}
item {
name: "/m/0hnnb"
id: 28
display_name: "umbrella"
}
item {
name: "/m/080hkjn"
id: 31
display_name: "handbag"
}
item {
name: "/m/01rkbr"
id: 32
display_name: "tie"
}
item {
name: "/m/01s55n"
id: 33
display_name: "suitcase"
}
item {
name: "/m/02wmf"
id: 34
display_name: "frisbee"
}
item {
name: "/m/071p9"
id: 35
display_name: "skis"
}
item {
name: "/m/06__v"
id: 36
display_name: "snowboard"
}
item {
name: "/m/018xm"
id: 37
display_name: "sports ball"
}
item {
name: "/m/02zt3"
id: 38
display_name: "kite"
}
item {
name: "/m/03g8mr"
id: 39
display_name: "baseball bat"
}
item {
name: "/m/03grzl"
id: 40
display_name: "baseball glove"
}
item {
name: "/m/06_fw"
id: 41
display_name: "skateboard"
}
item {
name: "/m/019w40"
id: 42
display_name: "surfboard"
}
item {
name: "/m/0dv9c"
id: 43
display_name: "tennis racket"
}
item {
name: "/m/04dr76w"
id: 44
display_name: "bottle"
}
item {
name: "/m/09tvcd"
id: 46
display_name: "wine glass"
}
item {
name: "/m/08gqpm"
id: 47
display_name: "cup"
}
item {
name: "/m/0dt3t"
id: 48
display_name: "fork"
}
item {
name: "/m/04ctx"
id: 49
display_name: "knife"
}
item {
name: "/m/0cmx8"
id: 50
display_name: "spoon"
}
item {
name: "/m/04kkgm"
id: 51
display_name: "bowl"
}
item {
name: "/m/09qck"
id: 52
display_name: "banana"
}
item {
name: "/m/014j1m"
id: 53
display_name: "apple"
}
item {
name: "/m/0l515"
id: 54
display_name: "sandwich"
}
item {
name: "/m/0cyhj_"
id: 55
display_name: "orange"
}
item {
name: "/m/0hkxq"
id: 56
display_name: "broccoli"
}
item {
name: "/m/0fj52s"
id: 57
display_name: "carrot"
}
item {
name: "/m/01b9xk"
id: 58
display_name: "hot dog"
}
item {
name: "/m/0663v"
id: 59
display_name: "pizza"
}
item {
name: "/m/0jy4k"
id: 60
display_name: "donut"
}
item {
name: "/m/0fszt"
id: 61
display_name: "cake"
}
item {
name: "/m/01mzpv"
id: 62
display_name: "chair"
}
item {
name: "/m/02crq1"
id: 63
display_name: "couch"
}
item {
name: "/m/03fp41"
id: 64
display_name: "potted plant"
}
item {
name: "/m/03ssj5"
id: 65
display_name: "bed"
}
item {
name: "/m/04bcr3"
id: 67
display_name: "dining table"
}
item {
name: "/m/09g1w"
id: 70
display_name: "toilet"
}
item {
name: "/m/07c52"
id: 72
display_name: "tv"
}
item {
name: "/m/01c648"
id: 73
display_name: "laptop"
}
item {
name: "/m/020lf"
id: 74
display_name: "mouse"
}
item {
name: "/m/0qjjc"
id: 75
display_name: "remote"
}
item {
name: "/m/01m2v"
id: 76
display_name: "keyboard"
}
item {
name: "/m/050k8"
id: 77
display_name: "cell phone"
}
item {
name: "/m/0fx9l"
id: 78
display_name: "microwave"
}
item {
name: "/m/029bxz"
id: 79
display_name: "oven"
}
item {
name: "/m/01k6s3"
id: 80
display_name: "toaster"
}
item {
name: "/m/0130jx"
id: 81
display_name: "sink"
}
item {
name: "/m/040b_t"
id: 82
display_name: "refrigerator"
}
item {
name: "/m/0bt_c3"
id: 84
display_name: "book"
}
item {
name: "/m/01x3z"
id: 85
display_name: "clock"
}
item {
name: "/m/02s195"
id: 86
display_name: "vase"
}
item {
name: "/m/01lsmm"
id: 87
display_name: "scissors"
}
item {
name: "/m/0kmg4"
id: 88
display_name: "teddy bear"
}
item {
name: "/m/03wvsk"
id: 89
display_name: "hair drier"
}
item {
name: "/m/012xff"
id: 90
display_name: "toothbrush"
}
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Label map utility functions."""
import logging
import tensorflow as tf
from google.protobuf import text_format
import string_int_label_map_pb2
def _validate_label_map(label_map):
"""Checks if a label map is valid.
Args:
label_map: StringIntLabelMap to validate.
Raises:
ValueError: if label map is invalid.
"""
for item in label_map.item:
if item.id < 0:
raise ValueError('Label map ids should be >= 0.')
if (item.id == 0 and item.name != 'background' and
item.display_name != 'background'):
raise ValueError('Label map id 0 is reserved for the background label')
def create_category_index(categories):
"""Creates dictionary of COCO compatible categories keyed by category id.
Args:
categories: a list of dicts, each of which has the following keys:
'id': (required) an integer id uniquely identifying this category.
'name': (required) string representing category name
e.g., 'cat', 'dog', 'pizza'.
Returns:
category_index: a dict containing the same entries as categories, but keyed
by the 'id' field of each category.
"""
category_index = {}
for cat in categories:
category_index[cat['id']] = cat
return category_index
def get_max_label_map_index(label_map):
"""Get maximum index in label map.
Args:
label_map: a StringIntLabelMapProto
Returns:
an integer
"""
return max([item.id for item in label_map.item])
def convert_label_map_to_categories(label_map,
max_num_classes,
use_display_name=True):
"""Loads label map proto and returns categories list compatible with eval.
This function loads a label map and returns a list of dicts, each of which
has the following keys:
'id': (required) an integer id uniquely identifying this category.
'name': (required) string representing category name
e.g., 'cat', 'dog', 'pizza'.
We only allow a class into the list if its id-label_id_offset is
between 0 (inclusive) and max_num_classes (exclusive).
If there are several items mapping to the same id in the label map,
we will only keep the first one in the categories list.
Args:
label_map: a StringIntLabelMapProto or None. If None, a default categories
list is created with max_num_classes categories.
max_num_classes: maximum number of (consecutive) label indices to include.
use_display_name: (boolean) choose whether to load 'display_name' field
as category name. If False or if the display_name field does not exist,
uses 'name' field as category names instead.
Returns:
categories: a list of dictionaries representing all possible categories.
"""
categories = []
list_of_ids_already_added = []
if not label_map:
label_id_offset = 1
for class_id in range(max_num_classes):
categories.append({
'id': class_id + label_id_offset,
'name': 'category_{}'.format(class_id + label_id_offset)
})
return categories
for item in label_map.item:
if not 0 < item.id <= max_num_classes:
logging.info('Ignore item %d since it falls outside of requested '
'label range.', item.id)
continue
if use_display_name and item.HasField('display_name'):
name = item.display_name
else:
name = item.name
if item.id not in list_of_ids_already_added:
list_of_ids_already_added.append(item.id)
categories.append({'id': item.id, 'name': name})
return categories
def load_labelmap(path):
"""Loads label map proto.
Args:
path: path to StringIntLabelMap proto text file.
Returns:
a StringIntLabelMapProto
"""
with tf.gfile.GFile(path, 'r') as fid:
label_map_string = fid.read()
label_map = string_int_label_map_pb2.StringIntLabelMap()
try:
text_format.Merge(label_map_string, label_map)
except text_format.ParseError:
label_map.ParseFromString(label_map_string)
_validate_label_map(label_map)
return label_map
def get_label_map_dict(label_map_path, use_display_name=False):
"""Reads a label map and returns a dictionary of label names to id.
Args:
label_map_path: path to label_map.
use_display_name: whether to use the label map items' display names as keys.
Returns:
A dictionary mapping label names to id.
"""
label_map = load_labelmap(label_map_path)
label_map_dict = {}
for item in label_map.item:
if use_display_name:
label_map_dict[item.display_name] = item.id
else:
label_map_dict[item.name] = item.id
return label_map_dict
def create_category_index_from_labelmap(label_map_path):
"""Reads a label map and returns a category index.
Args:
label_map_path: Path to `StringIntLabelMap` proto text file.
Returns:
A category index, which is a dictionary that maps integer ids to dicts
containing categories, e.g.
{1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...}
"""
label_map = load_labelmap(label_map_path)
max_num_classes = max(item.id for item in label_map.item)
categories = convert_label_map_to_categories(label_map, max_num_classes)
return create_category_index(categories)
def create_class_agnostic_category_index():
"""Creates a category index with a single `object` class."""
return {1: {'id': 1, 'name': 'object'}}
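A minimal usage sketch, assuming the COCO-style label map shown earlier in this commit is saved at a hypothetical path 'mscoco_label_map.pbtxt': it builds a category index keyed by id and a display-name-to-id dictionary.
if __name__ == '__main__':
  label_map_path = 'mscoco_label_map.pbtxt'  # hypothetical path to the pbtxt above
  category_index = create_category_index_from_labelmap(label_map_path)
  print(category_index[18])  # {'id': 18, 'name': 'dog'}
  name_to_id = get_label_map_dict(label_map_path, use_display_name=True)
  print(name_to_id['dog'])  # 18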
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Interface for the policy of the agents use for navigation."""
import abc
import tensorflow as tf
from absl import logging
import embedders
from envs import task_env
slim = tf.contrib.slim
def _print_debug_ios(history, goal, output):
"""Prints sizes of history, goal and outputs."""
if history is not None:
shape = history.get_shape().as_list()
# logging.info('history embedding shape ')
# logging.info(shape)
if len(shape) != 3:
raise ValueError('history Tensor must have rank=3')
if goal is not None:
logging.info('goal embedding shape ')
logging.info(goal.get_shape().as_list())
if output is not None:
logging.info('targets shape ')
logging.info(output.get_shape().as_list())
class Policy(object):
"""Represents the policy of the agent for navigation tasks.
Instantiates a policy that takes embedders for each modality and builds a
model to infer the actions.
"""
__metaclass__ = abc.ABCMeta
def __init__(self, embedders_dict, action_size):
"""Instantiates the policy.
Args:
embedders_dict: Dictionary of embedders for different modalities. Keys
should be identical to keys of observation modality.
action_size: Number of possible actions.
"""
self._embedders = embedders_dict
self._action_size = action_size
@abc.abstractmethod
def build(self, observations, prev_state):
"""Builds the model that represents the policy of the agent.
Args:
observations: Dictionary of observations from different modalities. Keys
are the names of the modalities.
prev_state: The tensor of the previous state of the model. Should be set
to None if the policy is stateless.
Returns:
Tuple of (action, state) where action is the action logits and state is
the state of the model after taking new observation.
"""
raise NotImplementedError(
'Needs implementation as part of Policy interface')
class LSTMPolicy(Policy):
"""Represents the implementation of the LSTM based policy.
The architecture of the model is as follows. It embeds all the observations
using the embedders and concatenates the embeddings of all the modalities.
The concatenated embedding is fed through two fully connected layers. The
LSTM takes the features from the fully connected layers, together with the
previous action and a bit indicating whether it succeeded, and the value for
each action is predicted afterwards.
Although the class name has the word LSTM in it, it also supports a mode that
builds the network without LSTM just for comparison purposes.
"""
def __init__(self,
modality_names,
embedders_dict,
action_size,
params,
max_episode_length,
feedforward_mode=False):
"""Instantiates the LSTM policy.
Args:
modality_names: List of modality names. Makes sure the ordering in
concatenation remains the same as modality_names list. Each modality
needs to be in the embedders_dict.
embedders_dict: Dictionary of embedders for different modalities. Keys
should be identical to keys of observation modality. Values should be
instance of Embedder class. All the observations except PREV_ACTION
requires embedder.
action_size: Number of possible actions.
params: an instance of tf.HParams containing the hyperparameters for the
policy network.
max_episode_length: integer, specifying the maximum length of each
episode.
feedforward_mode: If True, it does not add LSTM to the model. It should
only be set True for comparison between LSTM and feedforward models.
"""
super(LSTMPolicy, self).__init__(embedders_dict, action_size)
self._modality_names = modality_names
self._lstm_state_size = params.lstm_state_size
self._fc_channels = params.fc_channels
self._weight_decay = params.weight_decay
self._target_embedding_size = params.target_embedding_size
self._max_episode_length = max_episode_length
self._feedforward_mode = feedforward_mode
def _build_lstm(self, encoded_inputs, prev_state, episode_length,
prev_action=None):
"""Builds an LSTM on top of the encoded inputs.
If prev_action is not None then it concatenates them to the input of LSTM.
Args:
encoded_inputs: The embedding of the observations and goal.
prev_state: previous state of LSTM.
episode_length: The tensor that contains the length of the sequence for
each element of the batch.
prev_action: tensor of the previously chosen action and an additional bit
indicating whether the previous action was successful or not.
Returns:
a tuple of (lstm output, lstm state).
"""
# Adding prev action and success in addition to the embeddings of the
# modalities.
if prev_action is not None:
encoded_inputs = tf.concat([encoded_inputs, prev_action], axis=-1)
with tf.variable_scope('LSTM'):
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_state_size)
if prev_state is None:
# If prev state is set to None, a state of all zeros will be
# passed as a previous value for the cell. Should be used for the
# first step of each episode.
tf_prev_state = lstm_cell.zero_state(
encoded_inputs.get_shape().as_list()[0], dtype=tf.float32)
else:
tf_prev_state = tf.nn.rnn_cell.LSTMStateTuple(prev_state[0],
prev_state[1])
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
cell=lstm_cell,
inputs=encoded_inputs,
sequence_length=episode_length,
initial_state=tf_prev_state,
dtype=tf.float32,
)
lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])
return lstm_outputs, lstm_state
def build(
self,
observations,
prev_state,
):
"""Builds the model that represents the policy of the agent.
Args:
observations: Dictionary of observations from different modalities. Keys
are the names of the modalities. Observations should have the following
key-values.
observations['goal']: One-hot tensor that indicates the semantic
category of the goal. The shape should be
(batch_size x max_sequence_length x goals).
observations[task_env.ModalityTypes.PREV_ACTION]: has action_size + 1
elements where the first action_size numbers are the one hot vector
of the previous action and the last element indicates whether the
previous action was successful or not. If
task_env.ModalityTypes.PREV_ACTION is not in the observation, it
will not be used in the policy.
prev_state: Previous state of the model. It should be a tuple of (c,h)
where c and h are the previous cell value and hidden state of the lstm.
Each element of tuple has shape of (batch_size x lstm_cell_size).
If it is set to None, then it initializes the state of the lstm with all
zeros.
Returns:
Tuple of (action, state) where action is the action logits and state is
the state of the model after taking new observation.
Raises:
ValueError: If any of the modality names is not in observations or
embedders_dict.
ValueError: If 'goal' is not in the observations.
"""
for modality_name in self._modality_names:
if modality_name not in observations:
raise ValueError('modality name does not exist in observations: {} not '
'in {}'.format(modality_name, observations.keys()))
if modality_name not in self._embedders:
if modality_name == task_env.ModalityTypes.PREV_ACTION:
continue
raise ValueError('modality name does not have corresponding embedder'
' {} not in {}'.format(modality_name,
self._embedders.keys()))
if task_env.ModalityTypes.GOAL not in observations:
raise ValueError('goal should be provided in the observations')
goal = observations[task_env.ModalityTypes.GOAL]
prev_action = None
if task_env.ModalityTypes.PREV_ACTION in observations:
prev_action = observations[task_env.ModalityTypes.PREV_ACTION]
with tf.variable_scope('policy'):
with slim.arg_scope(
[slim.fully_connected],
activation_fn=tf.nn.relu,
weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
weights_regularizer=slim.l2_regularizer(self._weight_decay)):
all_inputs = []
# Concatenating the embedding of each modality by applying the embedders
# to corresponding observations.
def embed(name):
with tf.variable_scope('embed_{}'.format(name)):
# logging.info('Policy uses embedding %s', name)
return self._embedders[name].build(observations[name])
all_inputs = list(map(embed, [
x for x in self._modality_names
if x != task_env.ModalityTypes.PREV_ACTION
]))
# Computing goal embedding.
shape = goal.get_shape().as_list()
with tf.variable_scope('embed_goal'):
encoded_goal = tf.reshape(goal, [shape[0] * shape[1], -1])
encoded_goal = slim.fully_connected(encoded_goal,
self._target_embedding_size)
encoded_goal = tf.reshape(encoded_goal, [shape[0], shape[1], -1])
all_inputs.append(encoded_goal)
# Concatenating all the modalities and goal.
all_inputs = tf.concat(all_inputs, axis=-1, name='concat_embeddings')
shape = all_inputs.get_shape().as_list()
all_inputs = tf.reshape(all_inputs, [shape[0] * shape[1], shape[2]])
# Applying fully connected layers.
encoded_inputs = slim.fully_connected(all_inputs, self._fc_channels)
encoded_inputs = slim.fully_connected(encoded_inputs, self._fc_channels)
if not self._feedforward_mode:
encoded_inputs = tf.reshape(encoded_inputs,
[shape[0], shape[1], self._fc_channels])
lstm_outputs, lstm_state = self._build_lstm(
encoded_inputs=encoded_inputs,
prev_state=prev_state,
episode_length=tf.ones((shape[0],), dtype=tf.float32) *
self._max_episode_length,
prev_action=prev_action,
)
else:
# If feedforward_mode=True, bypass the LSTM computation entirely.
lstm_outputs = encoded_inputs
lstm_outputs = slim.fully_connected(lstm_outputs, self._fc_channels)
action_values = slim.fully_connected(
lstm_outputs, self._action_size, activation_fn=None)
action_values = tf.reshape(action_values, [shape[0], shape[1], -1])
if not self._feedforward_mode:
return action_values, lstm_state
else:
return action_values, None
class TaskPolicy(Policy):
"""A covenience abstract class providing functionality to deal with Tasks."""
def __init__(self,
task_config,
model_hparams=None,
embedder_hparams=None,
train_hparams=None):
"""Constructs a policy which knows how to work with tasks (see tasks.py).
It allows reading task history, goal and outputs consistently with the
task config.
Args:
task_config: an object of type tasks.TaskIOConfig (see tasks.py)
model_hparams: a tf.HParams object containing parameters pertaining to the
model (these are implementation specific)
embedder_hparams: a tf.HParams object containing parameters pertaining to
the history and goal embedders (these are implementation specific)
train_hparams: a tf.HParams object containing parameters pertaining to
training (these are implementation specific)
"""
super(TaskPolicy, self).__init__(None, None)
self._model_hparams = model_hparams
self._embedder_hparams = embedder_hparams
self._train_hparams = train_hparams
self._task_config = task_config
self._extra_train_ops = []
@property
def extra_train_ops(self):
"""Training ops in addition to the loss, e.g. batch norm updates.
Returns:
A list of tf ops.
"""
return self._extra_train_ops
def _embed_task_ios(self, streams):
"""Embeds a list of heterogenous streams.
These streams correspond to task history, goal and output. The number of
streams is equal to the total number of history, plus one for the goal if
present, plus one for the output. If the number of history is k, then the
first k streams are the history.
The used embedders depend on the input (or goal) types. If an input is an
image, then a ResNet embedder is used, otherwise
MLPEmbedder (see embedders.py).
Args:
streams: a list of Tensors.
Returns:
Three float Tensors history, goal, output. If there is no history, or no
goal, then the corresponding returned values are None. The shape of the
embedded history is batch_size x sequence_length x the sum of the embedding
dimensions of all history inputs. The shape of the goal is the embedding
dimension.
"""
# EMBED history.
index = 0
inps = []
scopes = []
for c in self._task_config.inputs:
if c == task_env.ModalityTypes.IMAGE:
scope_name = 'image_embedder/image'
reuse = scope_name in scopes
scopes.append(scope_name)
with tf.variable_scope(scope_name, reuse=reuse):
resnet_embedder = embedders.ResNet(self._embedder_hparams.image)
image_embeddings = resnet_embedder.build(streams[index])
# Uncover batch norm ops.
if self._embedder_hparams.image.is_train:
self._extra_train_ops += resnet_embedder.extra_train_ops
inps.append(image_embeddings)
index += 1
else:
scope_name = 'input_embedder/vector'
reuse = scope_name in scopes
scopes.append(scope_name)
with tf.variable_scope(scope_name, reuse=reuse):
input_vector_embedder = embedders.MLPEmbedder(
layers=self._embedder_hparams.vector)
vector_embedder = input_vector_embedder.build(streams[index])
inps.append(vector_embedder)
index += 1
history = tf.concat(inps, axis=2) if inps else None
# EMBED goal.
goal = None
if self._task_config.query is not None:
scope_name = 'image_embedder/query'
reuse = scope_name in scopes
scopes.append(scope_name)
with tf.variable_scope(scope_name, reuse=reuse):
resnet_goal_embedder = embedders.ResNet(self._embedder_hparams.goal)
goal = resnet_goal_embedder.build(streams[index])
if self._embedder_hparams.goal.is_train:
self._extra_train_ops += resnet_goal_embedder.extra_train_ops
index += 1
# Embed true targets if needed (tbd).
true_target = streams[index]
return history, goal, true_target
@abc.abstractmethod
def build(self, feeds, prev_state):
pass
class ReactivePolicy(TaskPolicy):
"""A policy which ignores history.
It processes only the current observation (last element in history) and the
goal to output a prediction.
"""
def __init__(self, *args, **kwargs):
super(ReactivePolicy, self).__init__(*args, **kwargs)
# The current implementation ignores the prev_state as it is purely reactive.
# It returns None for the current state.
def build(self, feeds, prev_state):
history, goal, _ = self._embed_task_ios(feeds)
_print_debug_ios(history, goal, None)
with tf.variable_scope('output_decoder'):
# Concatenate the embeddings of the current observation and the goal.
reactive_input = tf.concat([tf.squeeze(history[:, -1, :]), goal], axis=1)
oconfig = self._task_config.output.shape
assert len(oconfig) == 1
decoder = embedders.MLPEmbedder(
layers=self._embedder_hparams.predictions.layer_sizes + oconfig)
predictions = decoder.build(reactive_input)
return predictions, None
class RNNPolicy(TaskPolicy):
"""A policy which takes into account the full history via RNN.
The implementation might (and likely will) change.
The history, together with the goal, is processed using a stacked LSTM. The
output of the last LSTM step is used to produce a prediction. Currently, only
a single step output is supported.
"""
def __init__(self, lstm_hparams, *args, **kwargs):
super(RNNPolicy, self).__init__(*args, **kwargs)
self._lstm_hparams = lstm_hparams
# The prev_state is ignored as for now the full history is specified as first
# element of the feeds. It might turn out to be beneficial to keep the state
# as part of the policy object.
def build(self, feeds, state):
history, goal, _ = self._embed_task_ios(feeds)
_print_debug_ios(history, goal, None)
params = self._lstm_hparams
cell = lambda: tf.contrib.rnn.BasicLSTMCell(params.cell_size)
stacked_lstm = tf.contrib.rnn.MultiRNNCell(
[cell() for _ in range(params.num_layers)])
# history is of shape batch_size x seq_len x embedding_dimension
batch_size, seq_len, _ = tuple(history.get_shape().as_list())
if state is None:
state = stacked_lstm.zero_state(batch_size, tf.float32)
for t in range(seq_len):
if params.concat_goal_everywhere:
lstm_input = tf.concat([tf.squeeze(history[:, t, :]), goal], axis=1)
else:
lstm_input = tf.squeeze(history[:, t, :])
output, state = stacked_lstm(lstm_input, state)
with tf.variable_scope('output_decoder'):
oconfig = self._task_config.output.shape
assert len(oconfig) == 1
features = tf.concat([output, goal], axis=1)
assert len(output.get_shape().as_list()) == 2
assert len(goal.get_shape().as_list()) == 2
decoder = embedders.MLPEmbedder(
layers=self._embedder_hparams.predictions.layer_sizes + oconfig)
# Prediction is done off the last step lstm output and the goal.
predictions = decoder.build(features)
return predictions, state
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images in CIFAR-10.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_PADDING = 4
slim = tf.contrib.slim
def preprocess_for_train(image,
output_height,
output_width,
padding=_PADDING,
add_image_summaries=True):
"""Preprocesses the given image for training.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
padding: The amount of padding before and after each dimension of the image.
add_image_summaries: Enable image summaries.
Returns:
A preprocessed image.
"""
if add_image_summaries:
tf.summary.image('image', tf.expand_dims(image, 0))
# Transform the image to floats.
image = tf.to_float(image)
if padding > 0:
image = tf.pad(image, [[padding, padding], [padding, padding], [0, 0]])
# Randomly crop a [height, width] section of the image.
distorted_image = tf.random_crop(image,
[output_height, output_width, 3])
# Randomly flip the image horizontally.
distorted_image = tf.image.random_flip_left_right(distorted_image)
if add_image_summaries:
tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0))
# Because these operations are not commutative, consider randomizing
# the order of their operation.
distorted_image = tf.image.random_brightness(distorted_image,
max_delta=63)
distorted_image = tf.image.random_contrast(distorted_image,
lower=0.2, upper=1.8)
# Subtract off the mean and divide by the variance of the pixels.
return tf.image.per_image_standardization(distorted_image)
def preprocess_for_eval(image, output_height, output_width,
add_image_summaries=True):
"""Preprocesses the given image for evaluation.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
add_image_summaries: Enable image summaries.
Returns:
A preprocessed image.
"""
if add_image_summaries:
tf.summary.image('image', tf.expand_dims(image, 0))
# Transform the image to floats.
image = tf.to_float(image)
# Resize and crop if needed.
resized_image = tf.image.resize_image_with_crop_or_pad(image,
output_width,
output_height)
if add_image_summaries:
tf.summary.image('resized_image', tf.expand_dims(resized_image, 0))
# Subtract off the mean and divide by the variance of the pixels.
return tf.image.per_image_standardization(resized_image)
def preprocess_image(image, output_height, output_width, is_training=False,
add_image_summaries=True):
"""Preprocesses the given image.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
add_image_summaries: Enable image summaries.
Returns:
A preprocessed image.
"""
if is_training:
return preprocess_for_train(
image, output_height, output_width,
add_image_summaries=add_image_summaries)
else:
return preprocess_for_eval(
image, output_height, output_width,
add_image_summaries=add_image_summaries)
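A minimal usage sketch (the input tensor is illustrative): preprocess a single 32x32 CIFAR-10 image for training inside a TF1 graph.
if __name__ == '__main__':
  example_image = tf.cast(
      tf.random_uniform([32, 32, 3], maxval=256, dtype=tf.int32), tf.uint8)
  train_image = preprocess_image(example_image, 24, 24, is_training=True,
                                 add_image_summaries=False)
  print(train_image)  # A float32 Tensor of shape (24, 24, 3).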
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images for the Inception networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
def apply_with_random_selector(x, func, num_cases):
"""Computes func(x, sel), with sel sampled from [0...num_cases-1].
Args:
x: input Tensor.
func: Python function to apply.
num_cases: Python int32, number of cases to sample sel from.
Returns:
The result of func(x, sel), where func receives the value of the
selector as a python integer, but sel is sampled dynamically.
"""
sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
# Pass the real x only to one of the func calls.
return control_flow_ops.merge([
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
for case in range(num_cases)])[0]
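A minimal usage sketch (image tensor and sizes are illustrative), mirroring how this helper is used further down in this file: one of four resize methods is chosen at graph-execution time rather than at graph-definition time.
example_image = tf.random_uniform([224, 224, 3])  # float image in [0, 1)
resized = apply_with_random_selector(
    example_image,
    lambda x, method: tf.image.resize_images(x, [128, 128], method),
    num_cases=4)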
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
"""Distort the color of a Tensor image.
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
Rather than adding that level of complication, we select a distinct ordering
of color ops for each preprocessing thread.
Args:
image: 3-D Tensor containing single image in [0, 1].
color_ordering: Python int, a type of distortion (valid values: 0-3).
fast_mode: Avoids slower ops (random_hue and random_contrast)
scope: Optional scope for name_scope.
Returns:
3-D Tensor color-distorted image on range [0, 1]
Raises:
ValueError: if color_ordering not in [0, 3]
"""
with tf.name_scope(scope, 'distort_color', [image]):
if fast_mode:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
elif color_ordering == 1:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
elif color_ordering == 2:
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
elif color_ordering == 3:
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
raise ValueError('color_ordering must be in [0, 3]')
# The random_* ops do not necessarily clamp.
return tf.clip_by_value(image, 0.0, 1.0)
def distorted_bounding_box_crop(image,
bbox,
min_object_covered=0.1,
aspect_ratio_range=(0.75, 1.33),
area_range=(0.05, 1.0),
max_attempts=100,
scope=None):
"""Generates cropped_image using a one of the bboxes randomly distorted.
See `tf.image.sample_distorted_bounding_box` for more documentation.
Args:
image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole
image.
min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
area of the image must contain at least this fraction of any bounding box
supplied.
aspect_ratio_range: An optional list of `floats`. The cropped area of the
image must have an aspect ratio = width / height within this range.
area_range: An optional list of `floats`. The cropped area of the image
must contain a fraction of the supplied image within this range.
max_attempts: An optional `int`. Number of attempts at generating a cropped
region of the image of the specified constraints. After `max_attempts`
failures, return the entire image.
scope: Optional scope for name_scope.
Returns:
A tuple, a 3-D Tensor cropped_image and the distorted bbox
"""
with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bbox]):
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
# A large fraction of image datasets contain a human-annotated bounding
# box delineating the region of the image containing the object of interest.
# We choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.shape(image),
bounding_boxes=bbox,
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
# Crop the image to the specified bounding box.
cropped_image = tf.slice(image, bbox_begin, bbox_size)
return cropped_image, distort_bbox
def preprocess_for_train(image, height, width, bbox,
fast_mode=True,
scope=None,
add_image_summaries=True):
"""Distort one image for training a network.
Distorting images provides a useful technique for augmenting the data
set during training in order to make the network invariant to aspects
of the image that do not affect the label.
Additionally it would create image_summaries to display the different
transformations applied to the image.
Args:
image: 3-D Tensor of image. If dtype is tf.float32 then the range should be
[0, 1], otherwise it would be converted to tf.float32 assuming that the range
is [0, MAX], where MAX is the largest positive representable number for the
int(8/16/32) data type (see `tf.image.convert_image_dtype` for details).
height: integer
width: integer
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax].
fast_mode: Optional boolean, if True avoids slower transformations (i.e.
bi-cubic resizing, random_hue or random_contrast).
scope: Optional scope for name_scope.
add_image_summaries: Enable image summaries.
Returns:
3-D float Tensor of distorted image used for training with range [-1, 1].
"""
with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]):
if bbox is None:
bbox = tf.constant([0.0, 0.0, 1.0, 1.0],
dtype=tf.float32,
shape=[1, 1, 4])
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
# Each bounding box has shape [1, num_boxes, box coords] and
# the coordinates are ordered [ymin, xmin, ymax, xmax].
image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
bbox)
if add_image_summaries:
tf.summary.image('image_with_bounding_boxes', image_with_box)
distorted_image, distorted_bbox = distorted_bounding_box_crop(image, bbox)
# Restore the shape since the dynamic slice based upon the bbox_size loses
# the third dimension.
distorted_image.set_shape([None, None, 3])
image_with_distorted_box = tf.image.draw_bounding_boxes(
tf.expand_dims(image, 0), distorted_bbox)
if add_image_summaries:
tf.summary.image('images_with_distorted_bounding_box',
image_with_distorted_box)
# This resizing operation may distort the images because the aspect
# ratio is not respected. We select a resize method in a round robin
# fashion based on the thread number.
# Note that ResizeMethod contains 4 enumerated resizing methods.
# We select only 1 case for fast_mode bilinear.
num_resize_cases = 1 if fast_mode else 4
distorted_image = apply_with_random_selector(
distorted_image,
lambda x, method: tf.image.resize_images(x, [height, width], method),
num_cases=num_resize_cases)
if add_image_summaries:
tf.summary.image('cropped_resized_image',
tf.expand_dims(distorted_image, 0))
# Randomly flip the image horizontally.
distorted_image = tf.image.random_flip_left_right(distorted_image)
# Randomly distort the colors. There are 1 or 4 ways to do it.
num_distort_cases = 1 if fast_mode else 4
distorted_image = apply_with_random_selector(
distorted_image,
lambda x, ordering: distort_color(x, ordering, fast_mode),
num_cases=num_distort_cases)
if add_image_summaries:
tf.summary.image('final_distorted_image',
tf.expand_dims(distorted_image, 0))
distorted_image = tf.subtract(distorted_image, 0.5)
distorted_image = tf.multiply(distorted_image, 2.0)
return distorted_image
def preprocess_for_eval(image, height, width,
central_fraction=0.875, scope=None):
"""Prepare one image for evaluation.
If height and width are specified it would output an image with that size by
applying resize_bilinear.
If central_fraction is specified it would crop the central fraction of the
input image.
Args:
image: 3-D Tensor of image. If dtype is tf.float32 then the range should be
[0, 1], otherwise it would be converted to tf.float32 assuming that the range
is [0, MAX], where MAX is the largest positive representable number for the
int(8/16/32) data type (see `tf.image.convert_image_dtype` for details).
height: integer
width: integer
central_fraction: Optional Float, fraction of the image to crop.
scope: Optional scope for name_scope.
Returns:
3-D float Tensor of prepared image.
"""
with tf.name_scope(scope, 'eval_image', [image, height, width]):
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
# Crop the central region of the image with an area containing 87.5% of
# the original image.
if central_fraction:
image = tf.image.central_crop(image, central_fraction=central_fraction)
if height and width:
# Resize the image to the specified height and width.
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image, [height, width],
align_corners=False)
image = tf.squeeze(image, [0])
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.0)
return image
def preprocess_image(image, height, width,
is_training=False,
bbox=None,
fast_mode=True,
add_image_summaries=True):
"""Pre-process one image for training or evaluation.
Args:
image: 3-D Tensor [height, width, channels] with the image. If dtype is
tf.float32 then the range should be [0, 1], otherwise it would be converted
to tf.float32 assuming that the range is [0, MAX], where MAX is the largest
positive representable number for the int(8/16/32) data type (see
`tf.image.convert_image_dtype` for details).
height: integer, image expected height.
width: integer, image expected width.
is_training: Boolean. If true it would transform an image for training,
otherwise it would transform it for evaluation.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as
[ymin, xmin, ymax, xmax].
fast_mode: Optional boolean, if True avoids slower transformations.
add_image_summaries: Enable image summaries.
Returns:
3-D float Tensor containing an appropriately scaled image
Raises:
ValueError: if user does not provide bounding box
"""
if is_training:
return preprocess_for_train(image, height, width, bbox, fast_mode,
add_image_summaries=add_image_summaries)
else:
return preprocess_for_eval(image, height, width)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities for preprocessing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
def preprocess_image(image, output_height, output_width, is_training):
"""Preprocesses the given image.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
Returns:
A preprocessed image.
"""
image = tf.to_float(image)
image = tf.image.resize_image_with_crop_or_pad(
image, output_width, output_height)
image = tf.subtract(image, 128.0)
image = tf.div(image, 128.0)
return image
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains a factory for building various models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from preprocessing import cifarnet_preprocessing
from preprocessing import inception_preprocessing
from preprocessing import lenet_preprocessing
from preprocessing import vgg_preprocessing
slim = tf.contrib.slim
def get_preprocessing(name, is_training=False):
"""Returns preprocessing_fn(image, height, width, **kwargs).
Args:
name: The name of the preprocessing function.
is_training: `True` if the model is being used for training and `False`
otherwise.
Returns:
preprocessing_fn: A function that preprocesses a single image (pre-batch).
It has the following signature:
image = preprocessing_fn(image, output_height, output_width, ...).
Raises:
ValueError: If Preprocessing `name` is not recognized.
"""
preprocessing_fn_map = {
'cifarnet': cifarnet_preprocessing,
'inception': inception_preprocessing,
'inception_v1': inception_preprocessing,
'inception_v2': inception_preprocessing,
'inception_v3': inception_preprocessing,
'inception_v4': inception_preprocessing,
'inception_resnet_v2': inception_preprocessing,
'lenet': lenet_preprocessing,
'mobilenet_v1': inception_preprocessing,
'nasnet_mobile': inception_preprocessing,
'nasnet_large': inception_preprocessing,
'pnasnet_large': inception_preprocessing,
'resnet_v1_50': vgg_preprocessing,
'resnet_v1_101': vgg_preprocessing,
'resnet_v1_152': vgg_preprocessing,
'resnet_v1_200': vgg_preprocessing,
'resnet_v2_50': vgg_preprocessing,
'resnet_v2_101': vgg_preprocessing,
'resnet_v2_152': vgg_preprocessing,
'resnet_v2_200': vgg_preprocessing,
'vgg': vgg_preprocessing,
'vgg_a': vgg_preprocessing,
'vgg_16': vgg_preprocessing,
'vgg_19': vgg_preprocessing,
}
if name not in preprocessing_fn_map:
raise ValueError('Preprocessing name [%s] was not recognized' % name)
def preprocessing_fn(image, output_height, output_width, **kwargs):
return preprocessing_fn_map[name].preprocess_image(
image, output_height, output_width, is_training=is_training, **kwargs)
return preprocessing_fn
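A minimal usage sketch (the image tensor is illustrative): fetch the Inception preprocessing function from the factory and apply it to one image for evaluation.
if __name__ == '__main__':
  example_image = tf.cast(
      tf.random_uniform([300, 400, 3], maxval=256, dtype=tf.int32), tf.uint8)
  preprocessing_fn = get_preprocessing('inception_v3', is_training=False)
  eval_image = preprocessing_fn(example_image, 299, 299)
  print(eval_image)  # A float32 Tensor of shape (299, 299, 3) in [-1, 1].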
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images.
The preprocessing steps for VGG were introduced in the following technical
report:
Very Deep Convolutional Networks For Large-Scale Image Recognition
Karen Simonyan and Andrew Zisserman
arXiv technical report, 2015
PDF: http://arxiv.org/pdf/1409.1556.pdf
ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
CC-BY-4.0
More information can be obtained from the VGG website:
www.robots.ox.ac.uk/~vgg/research/very_deep/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
slim = tf.contrib.slim
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_RESIZE_SIDE_MIN = 256
_RESIZE_SIDE_MAX = 512
def _crop(image, offset_height, offset_width, crop_height, crop_width):
"""Crops the given image using the provided offsets and sizes.
Note that the method doesn't assume we know the input image size but it does
assume we know the input image rank.
Args:
image: an image of shape [height, width, channels].
offset_height: a scalar tensor indicating the height offset.
offset_width: a scalar tensor indicating the width offset.
crop_height: the height of the cropped image.
crop_width: the width of the cropped image.
Returns:
the cropped (and resized) image.
Raises:
InvalidArgumentError: if the rank is not 3 or if the image dimensions are
less than the crop size.
"""
original_shape = tf.shape(image)
rank_assertion = tf.Assert(
tf.equal(tf.rank(image), 3),
['Rank of image must be equal to 3.'])
with tf.control_dependencies([rank_assertion]):
cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])
size_assertion = tf.Assert(
tf.logical_and(
tf.greater_equal(original_shape[0], crop_height),
tf.greater_equal(original_shape[1], crop_width)),
['Crop size greater than the image size.'])
offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0]))
# Use tf.slice instead of crop_to_bounding_box as it accepts tensors to
# define the crop size.
with tf.control_dependencies([size_assertion]):
image = tf.slice(image, offsets, cropped_shape)
return tf.reshape(image, cropped_shape)
def _random_crop(image_list, crop_height, crop_width):
"""Crops the given list of images.
The function applies the same crop to each image in the list. This can be
effectively applied when there are multiple image inputs of the same
dimension such as:
image, depths, normals = _random_crop([image, depths, normals], 120, 150)
Args:
image_list: a list of image tensors of the same dimension but possibly
varying channel.
crop_height: the new height.
crop_width: the new width.
Returns:
the image_list with cropped images.
Raises:
ValueError: if there are multiple image inputs provided with different size
or the images are smaller than the crop dimensions.
"""
if not image_list:
raise ValueError('Empty image_list.')
# Compute the rank assertions.
rank_assertions = []
for i in range(len(image_list)):
image_rank = tf.rank(image_list[i])
rank_assert = tf.Assert(
tf.equal(image_rank, 3),
['Wrong rank for tensor %s [expected] [actual]',
image_list[i].name, 3, image_rank])
rank_assertions.append(rank_assert)
with tf.control_dependencies([rank_assertions[0]]):
image_shape = tf.shape(image_list[0])
image_height = image_shape[0]
image_width = image_shape[1]
crop_size_assert = tf.Assert(
tf.logical_and(
tf.greater_equal(image_height, crop_height),
tf.greater_equal(image_width, crop_width)),
['Crop size greater than the image size.'])
asserts = [rank_assertions[0], crop_size_assert]
for i in range(1, len(image_list)):
image = image_list[i]
asserts.append(rank_assertions[i])
with tf.control_dependencies([rank_assertions[i]]):
shape = tf.shape(image)
height = shape[0]
width = shape[1]
height_assert = tf.Assert(
tf.equal(height, image_height),
['Wrong height for tensor %s [expected][actual]',
image.name, height, image_height])
width_assert = tf.Assert(
tf.equal(width, image_width),
['Wrong width for tensor %s [expected][actual]',
image.name, width, image_width])
asserts.extend([height_assert, width_assert])
# Create a random bounding box.
#
# Use tf.random_uniform and not numpy.random.rand as doing the former would
# generate random numbers at graph eval time, unlike the latter which
# generates random numbers at graph definition time.
with tf.control_dependencies(asserts):
max_offset_height = tf.reshape(image_height - crop_height + 1, [])
with tf.control_dependencies(asserts):
max_offset_width = tf.reshape(image_width - crop_width + 1, [])
offset_height = tf.random_uniform(
[], maxval=max_offset_height, dtype=tf.int32)
offset_width = tf.random_uniform(
[], maxval=max_offset_width, dtype=tf.int32)
return [_crop(image, offset_height, offset_width,
crop_height, crop_width) for image in image_list]
def _central_crop(image_list, crop_height, crop_width):
"""Performs central crops of the given image list.
Args:
image_list: a list of image tensors of the same dimension but possibly
varying channel.
crop_height: the height of the image following the crop.
crop_width: the width of the image following the crop.
Returns:
the list of cropped images.
"""
outputs = []
for image in image_list:
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
offset_height = (image_height - crop_height) / 2
offset_width = (image_width - crop_width) / 2
outputs.append(_crop(image, offset_height, offset_width,
crop_height, crop_width))
return outputs
def _mean_image_subtraction(image, means):
"""Subtracts the given means from each image channel.
For example:
means = [123.68, 116.779, 103.939]
image = _mean_image_subtraction(image, means)
Note that the rank of `image` must be known.
Args:
image: a tensor of size [height, width, C].
means: a C-vector of values to subtract from each channel.
Returns:
the centered image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `means`.
"""
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
num_channels = image.get_shape().as_list()[-1]
if len(means) != num_channels:
raise ValueError('len(means) must match the number of channels')
channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
for i in range(num_channels):
channels[i] -= means[i]
return tf.concat(axis=2, values=channels)
def _smallest_size_at_least(height, width, smallest_side):
"""Computes new shape with the smallest side equal to `smallest_side`.
Computes new shape with the smallest side equal to `smallest_side` while
preserving the original aspect ratio.
Args:
height: an int32 scalar tensor indicating the current height.
width: an int32 scalar tensor indicating the current width.
smallest_side: A python integer or scalar `Tensor` indicating the size of
the smallest side after resize.
Returns:
new_height: an int32 scalar tensor indicating the new height.
new_width: an int32 scalar tensor indicating the new width.
"""
smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
height = tf.to_float(height)
width = tf.to_float(width)
smallest_side = tf.to_float(smallest_side)
scale = tf.cond(tf.greater(height, width),
lambda: smallest_side / width,
lambda: smallest_side / height)
new_height = tf.to_int32(tf.rint(height * scale))
new_width = tf.to_int32(tf.rint(width * scale))
return new_height, new_width
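# Worked example (illustrative): for an image of height 400 and width 600 with
# smallest_side 256, height <= width, so scale = 256 / 400 = 0.64,
# new_height = rint(400 * 0.64) = 256 and new_width = rint(600 * 0.64) = 384;
# the 2:3 aspect ratio is preserved.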
def _aspect_preserving_resize(image, smallest_side):
"""Resize images preserving the original aspect ratio.
Args:
image: A 3-D image `Tensor`.
smallest_side: A python integer or scalar `Tensor` indicating the size of
the smallest side after resize.
Returns:
resized_image: A 3-D tensor containing the resized image.
"""
smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)
shape = tf.shape(image)
height = shape[0]
width = shape[1]
new_height, new_width = _smallest_size_at_least(height, width, smallest_side)
image = tf.expand_dims(image, 0)
resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
align_corners=False)
resized_image = tf.squeeze(resized_image)
resized_image.set_shape([None, None, 3])
return resized_image
def preprocess_for_train(image,
output_height,
output_width,
resize_side_min=_RESIZE_SIDE_MIN,
resize_side_max=_RESIZE_SIDE_MAX):
"""Preprocesses the given image for training.
  Note that the smallest side used for the aspect-preserving resize is sampled
  uniformly from [`resize_side_min`, `resize_side_max`].
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
resize_side_min: The lower bound for the smallest side of the image for
aspect-preserving resizing.
resize_side_max: The upper bound for the smallest side of the image for
aspect-preserving resizing.
Returns:
A preprocessed image.
"""
resize_side = tf.random_uniform(
[], minval=resize_side_min, maxval=resize_side_max+1, dtype=tf.int32)
image = _aspect_preserving_resize(image, resize_side)
image = _random_crop([image], output_height, output_width)[0]
image.set_shape([output_height, output_width, 3])
image = tf.to_float(image)
image = tf.image.random_flip_left_right(image)
return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
def preprocess_for_eval(image, output_height, output_width, resize_side):
"""Preprocesses the given image for evaluation.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
resize_side: The smallest side of the image for aspect-preserving resizing.
Returns:
A preprocessed image.
"""
image = _aspect_preserving_resize(image, resize_side)
image = _central_crop([image], output_height, output_width)[0]
image.set_shape([output_height, output_width, 3])
image = tf.to_float(image)
return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN])
def preprocess_image(image, output_height, output_width, is_training=False,
resize_side_min=_RESIZE_SIDE_MIN,
resize_side_max=_RESIZE_SIDE_MAX):
"""Preprocesses the given image.
Args:
image: A `Tensor` representing an image of arbitrary size.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
resize_side_min: The lower bound for the smallest side of the image for
aspect-preserving resizing. If `is_training` is `False`, then this value
is used for rescaling.
resize_side_max: The upper bound for the smallest side of the image for
aspect-preserving resizing. If `is_training` is `False`, this value is
      ignored. Otherwise, the resize side is sampled from
      [resize_side_min, resize_side_max].
Returns:
A preprocessed image.
"""
if is_training:
return preprocess_for_train(image, output_height, output_width,
resize_side_min, resize_side_max)
else:
return preprocess_for_eval(image, output_height, output_width,
resize_side_min)
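# A minimal usage sketch, assuming a decoded 3-channel uint8 image tensor and
# an illustrative 224x224 output size; the helper name
# `_example_vgg_preprocessing` is hypothetical and only shows how the entry
# point above might be wired into a graph.
def _example_vgg_preprocessing():
  """Builds train- and eval-time preprocessing ops for a placeholder image."""
  raw_image = tf.placeholder(tf.uint8, shape=[None, None, 3])
  train_image = preprocess_image(raw_image, 224, 224, is_training=True)
  eval_image = preprocess_image(raw_image, 224, 224, is_training=False)
  return train_image, eval_image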
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes specifying naming conventions used for object detection.
Specifies:
InputDataFields: standard fields used by reader/preprocessor/batcher.
DetectionResultFields: standard fields returned by object detector.
  BoxListFields: standard fields used by BoxLists.
TfExampleFields: standard fields for tf-example data format (go/tf-example).
"""
class InputDataFields(object):
"""Names for the input tensors.
  Holds the standard data field names to use for identifying input tensors.
  These names should be used by the decoder to identify keys for the returned
  tensor_dict containing input tensors, and by the model to identify the
  tensors it needs.
Attributes:
image: image.
image_additional_channels: additional channels.
original_image: image in the original input size.
key: unique key corresponding to image.
source_id: source of the original image.
filename: original filename of the dataset (without common path).
groundtruth_image_classes: image-level class labels.
groundtruth_boxes: coordinates of the ground truth boxes in the image.
groundtruth_classes: box-level class labels.
groundtruth_label_types: box-level label types (e.g. explicit negative).
groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead]
is the groundtruth a single object or a crowd.
groundtruth_area: area of a groundtruth segment.
groundtruth_difficult: is a `difficult` object
groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of the
same class, forming a connected group, where instances are heavily
occluding each other.
proposal_boxes: coordinates of object proposal boxes.
proposal_objectness: objectness score of each proposal.
groundtruth_instance_masks: ground truth instance masks.
groundtruth_instance_boundaries: ground truth instance boundaries.
groundtruth_instance_classes: instance mask-level class labels.
groundtruth_keypoints: ground truth keypoints.
groundtruth_keypoint_visibilities: ground truth keypoint visibilities.
groundtruth_label_scores: groundtruth label scores.
groundtruth_weights: groundtruth weight factor for bounding boxes.
num_groundtruth_boxes: number of groundtruth boxes.
    true_image_shape: true shape of the image within the resized image, as
      resized images can be padded with zeros.
multiclass_scores: the label score per class for each box.
"""
image = 'image'
image_additional_channels = 'image_additional_channels'
original_image = 'original_image'
key = 'key'
source_id = 'source_id'
filename = 'filename'
groundtruth_image_classes = 'groundtruth_image_classes'
groundtruth_boxes = 'groundtruth_boxes'
groundtruth_classes = 'groundtruth_classes'
groundtruth_label_types = 'groundtruth_label_types'
groundtruth_is_crowd = 'groundtruth_is_crowd'
groundtruth_area = 'groundtruth_area'
groundtruth_difficult = 'groundtruth_difficult'
groundtruth_group_of = 'groundtruth_group_of'
proposal_boxes = 'proposal_boxes'
proposal_objectness = 'proposal_objectness'
groundtruth_instance_masks = 'groundtruth_instance_masks'
groundtruth_instance_boundaries = 'groundtruth_instance_boundaries'
groundtruth_instance_classes = 'groundtruth_instance_classes'
groundtruth_keypoints = 'groundtruth_keypoints'
groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'
groundtruth_label_scores = 'groundtruth_label_scores'
groundtruth_weights = 'groundtruth_weights'
num_groundtruth_boxes = 'num_groundtruth_boxes'
true_image_shape = 'true_image_shape'
multiclass_scores = 'multiclass_scores'
class DetectionResultFields(object):
"""Naming conventions for storing the output of the detector.
Attributes:
source_id: source of the original image.
key: unique key corresponding to image.
detection_boxes: coordinates of the detection boxes in the image.
detection_scores: detection scores for the detection boxes in the image.
detection_classes: detection-level class labels.
detection_masks: contains a segmentation mask for each detection box.
detection_boundaries: contains an object boundary for each detection box.
detection_keypoints: contains detection keypoints for each detection box.
num_detections: number of detections in the batch.
"""
source_id = 'source_id'
key = 'key'
detection_boxes = 'detection_boxes'
detection_scores = 'detection_scores'
detection_classes = 'detection_classes'
detection_masks = 'detection_masks'
detection_boundaries = 'detection_boundaries'
detection_keypoints = 'detection_keypoints'
num_detections = 'num_detections'
class BoxListFields(object):
"""Naming conventions for BoxLists.
Attributes:
boxes: bounding box coordinates.
classes: classes per bounding box.
scores: scores per bounding box.
weights: sample weights per bounding box.
objectness: objectness score per bounding box.
masks: masks per bounding box.
boundaries: boundaries per bounding box.
keypoints: keypoints per bounding box.
keypoint_heatmaps: keypoint heatmaps per bounding box.
is_crowd: is_crowd annotation per bounding box.
"""
boxes = 'boxes'
classes = 'classes'
scores = 'scores'
weights = 'weights'
objectness = 'objectness'
masks = 'masks'
boundaries = 'boundaries'
keypoints = 'keypoints'
keypoint_heatmaps = 'keypoint_heatmaps'
is_crowd = 'is_crowd'
class TfExampleFields(object):
"""TF-example proto feature names for object detection.
Holds the standard feature names to load from an Example proto for object
detection.
Attributes:
image_encoded: JPEG encoded string
image_format: image format, e.g. "JPEG"
filename: filename
channels: number of channels of image
colorspace: colorspace, e.g. "RGB"
height: height of image in pixels, e.g. 462
width: width of image in pixels, e.g. 581
source_id: original source of the image
image_class_text: image-level label in text format
image_class_label: image-level label in numerical format
object_class_text: labels in text format, e.g. ["person", "cat"]
object_class_label: labels in numbers, e.g. [16, 8]
object_bbox_xmin: xmin coordinates of groundtruth box, e.g. 10, 30
object_bbox_xmax: xmax coordinates of groundtruth box, e.g. 50, 40
object_bbox_ymin: ymin coordinates of groundtruth box, e.g. 40, 50
object_bbox_ymax: ymax coordinates of groundtruth box, e.g. 80, 70
object_view: viewpoint of object, e.g. ["frontal", "left"]
object_truncated: is object truncated, e.g. [true, false]
object_occluded: is object occluded, e.g. [true, false]
object_difficult: is object difficult, e.g. [true, false]
object_group_of: is object a single object or a group of objects
object_depiction: is object a depiction
object_is_crowd: [DEPRECATED, use object_group_of instead]
is the object a single object or a crowd
object_segment_area: the area of the segment.
object_weight: a weight factor for the object's bounding box.
instance_masks: instance segmentation masks.
instance_boundaries: instance boundaries.
instance_classes: Classes for each instance segmentation mask.
detection_class_label: class label in numbers.
detection_bbox_ymin: ymin coordinates of a detection box.
detection_bbox_xmin: xmin coordinates of a detection box.
detection_bbox_ymax: ymax coordinates of a detection box.
detection_bbox_xmax: xmax coordinates of a detection box.
detection_score: detection score for the class label and box.
"""
image_encoded = 'image/encoded'
image_format = 'image/format' # format is reserved keyword
filename = 'image/filename'
channels = 'image/channels'
colorspace = 'image/colorspace'
height = 'image/height'
width = 'image/width'
source_id = 'image/source_id'
image_class_text = 'image/class/text'
image_class_label = 'image/class/label'
object_class_text = 'image/object/class/text'
object_class_label = 'image/object/class/label'
object_bbox_ymin = 'image/object/bbox/ymin'
object_bbox_xmin = 'image/object/bbox/xmin'
object_bbox_ymax = 'image/object/bbox/ymax'
object_bbox_xmax = 'image/object/bbox/xmax'
object_view = 'image/object/view'
object_truncated = 'image/object/truncated'
object_occluded = 'image/object/occluded'
object_difficult = 'image/object/difficult'
object_group_of = 'image/object/group_of'
object_depiction = 'image/object/depiction'
object_is_crowd = 'image/object/is_crowd'
object_segment_area = 'image/object/segment/area'
object_weight = 'image/object/weight'
instance_masks = 'image/segmentation/object'
instance_boundaries = 'image/boundaries/object'
instance_classes = 'image/segmentation/object/class'
detection_class_label = 'image/detection/label'
detection_bbox_ymin = 'image/detection/bbox/ymin'
detection_bbox_xmin = 'image/detection/bbox/xmin'
detection_bbox_ymax = 'image/detection/bbox/ymax'
detection_bbox_xmax = 'image/detection/bbox/xmax'
detection_score = 'image/detection/score'
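# Illustrative sketch: the string constants in TfExampleFields are meant to be
# used as feature keys, e.g. when assembling a tf.train.Example. The import,
# the literal values, and the `jpeg_bytes` placeholder below are assumptions.
#
#   import tensorflow as tf
#   feature = {
#       TfExampleFields.image_encoded:
#           tf.train.Feature(bytes_list=tf.train.BytesList(value=[jpeg_bytes])),
#       TfExampleFields.height:
#           tf.train.Feature(int64_list=tf.train.Int64List(value=[462])),
#   }
#   example = tf.train.Example(features=tf.train.Features(feature=feature))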
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: object_detection/protos/string_int_label_map.proto
import sys
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='object_detection/protos/string_int_label_map.proto',
package='object_detection.protos',
syntax='proto2',
serialized_pb=_b('\n2object_detection/protos/string_int_label_map.proto\x12\x17object_detection.protos\"G\n\x15StringIntLabelMapItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\"Q\n\x11StringIntLabelMap\x12<\n\x04item\x18\x01 \x03(\x0b\x32..object_detection.protos.StringIntLabelMapItem')
)
_STRINGINTLABELMAPITEM = _descriptor.Descriptor(
name='StringIntLabelMapItem',
full_name='object_detection.protos.StringIntLabelMapItem',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='name', full_name='object_detection.protos.StringIntLabelMapItem.name', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='id', full_name='object_detection.protos.StringIntLabelMapItem.id', index=1,
number=2, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='display_name', full_name='object_detection.protos.StringIntLabelMapItem.display_name', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[
],
serialized_start=79,
serialized_end=150,
)
_STRINGINTLABELMAP = _descriptor.Descriptor(
name='StringIntLabelMap',
full_name='object_detection.protos.StringIntLabelMap',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='item', full_name='object_detection.protos.StringIntLabelMap.item', index=0,
number=1, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto2',
extension_ranges=[],
oneofs=[
],
serialized_start=152,
serialized_end=233,
)
_STRINGINTLABELMAP.fields_by_name['item'].message_type = _STRINGINTLABELMAPITEM
DESCRIPTOR.message_types_by_name['StringIntLabelMapItem'] = _STRINGINTLABELMAPITEM
DESCRIPTOR.message_types_by_name['StringIntLabelMap'] = _STRINGINTLABELMAP
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
StringIntLabelMapItem = _reflection.GeneratedProtocolMessageType('StringIntLabelMapItem', (_message.Message,), dict(
DESCRIPTOR = _STRINGINTLABELMAPITEM,
__module__ = 'object_detection.protos.string_int_label_map_pb2'
# @@protoc_insertion_point(class_scope:object_detection.protos.StringIntLabelMapItem)
))
_sym_db.RegisterMessage(StringIntLabelMapItem)
StringIntLabelMap = _reflection.GeneratedProtocolMessageType('StringIntLabelMap', (_message.Message,), dict(
DESCRIPTOR = _STRINGINTLABELMAP,
__module__ = 'object_detection.protos.string_int_label_map_pb2'
# @@protoc_insertion_point(class_scope:object_detection.protos.StringIntLabelMap)
))
_sym_db.RegisterMessage(StringIntLabelMap)
# @@protoc_insertion_point(module_scope)
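# Usage sketch (illustrative, appended after the generated code): a label map
# in text format, such as the item entries earlier in this change, can be
# parsed into the message defined above; 'label_map.pbtxt' is a hypothetical
# path.
#
#   from google.protobuf import text_format
#   label_map = StringIntLabelMap()
#   with open('label_map.pbtxt') as f:
#     text_format.Merge(f.read(), label_map)
#   name_to_id = {item.display_name: item.id for item in label_map.item}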
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A library of tasks.
This interface is intended to implement a wide variety of navigation
tasks. See go/navigation_tasks for a list.
"""
import abc
import collections
import math
import threading
import networkx as nx
import numpy as np
import tensorflow as tf
#from pyglib import logging
#import gin
from envs import task_env
from envs import util as envs_util
# Utility functions.
def _pad_or_clip_array(np_arr, arr_len, is_front_clip=True, output_mask=False):
"""Make np_arr array to have length arr_len.
If the array is shorter than arr_len, then it is padded from the front with
zeros. If it is longer, then it is clipped either from the back or from the
front. Only the first dimension is modified.
Args:
np_arr: numpy array.
arr_len: integer scalar.
is_front_clip: a boolean. If true then clipping is done in the front,
otherwise in the back.
output_mask: If True, outputs a numpy array of rank 1 which represents
a mask of which values have been added (0 - added, 1 - actual output).
Returns:
    A numpy array and the size of the padding (as a python int). This size is
    negative if the array is clipped. If output_mask is True, a mask array is
    returned as a third element.
"""
shape = list(np_arr.shape)
pad_size = arr_len - shape[0]
padded_or_clipped = None
if pad_size < 0:
if is_front_clip:
padded_or_clipped = np_arr[-pad_size:, :]
else:
padded_or_clipped = np_arr[:arr_len, :]
elif pad_size > 0:
padding = np.zeros([pad_size] + shape[1:], dtype=np_arr.dtype)
padded_or_clipped = np.concatenate([np_arr, padding], axis=0)
else:
padded_or_clipped = np_arr
if output_mask:
mask = np.ones((arr_len,), dtype=np.int)
if pad_size > 0:
mask[-pad_size:] = 0
return padded_or_clipped, pad_size, mask
else:
return padded_or_clipped, pad_size
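# Worked example (illustrative): a 2 x 3 array padded to length 4 gains two
# zero rows at the back; pad_size is 2 and the mask marks the padded rows
# with 0.
#
#   arr = np.arange(6).reshape([2, 3])
#   padded, pad_size, mask = _pad_or_clip_array(arr, 4, output_mask=True)
#   # padded.shape == (4, 3), pad_size == 2, mask == [1, 1, 0, 0]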
def classification_loss(truth, predicted, weights=None, is_one_hot=True):
"""A cross entropy loss.
Computes the mean of cross entropy losses for all pairs of true labels and
predictions. It wraps around a tf implementation of the cross entropy loss
  with additional reformatting of the inputs. If truth and predicted are
  n-rank Tensors with n > 2, they are reshaped to 2-rank Tensors. It allows
  truth to be specified either as one-hot vectors or as class indices. Finally,
  a weight can be specified for each element in truth and predicted.
Args:
    truth: an n-rank or (n-1)-rank Tensor containing labels. If is_one_hot is
      True, an n-rank Tensor is expected, otherwise an (n-1)-rank one.
predicted: an n-rank float Tensor containing prediction probabilities.
weights: an (n-1)-rank float Tensor of weights
is_one_hot: a boolean.
Returns:
A TF float scalar.
"""
num_labels = predicted.get_shape().as_list()[-1]
if not is_one_hot:
truth = tf.reshape(truth, [-1])
truth = tf.one_hot(
truth, depth=num_labels, on_value=1.0, off_value=0.0, axis=-1)
else:
truth = tf.reshape(truth, [-1, num_labels])
predicted = tf.reshape(predicted, [-1, num_labels])
losses = tf.nn.softmax_cross_entropy_with_logits(
labels=truth, logits=predicted)
if weights is not None:
losses = tf.boolean_mask(losses,
tf.cast(tf.reshape(weights, [-1]), dtype=tf.bool))
return tf.reduce_mean(losses)
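# Minimal sketch (illustrative): with one-hot truth of shape
# [batch, seq_len, num_labels] and matching logits, the loss reduces to a
# scalar; the tensor values below are assumptions.
#
#   truth = tf.one_hot([[0, 2], [1, 1]], depth=3)   # shape [2, 2, 3]
#   logits = tf.zeros([2, 2, 3])
#   loss = classification_loss(truth, logits)       # scalar Tensor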
class UnrolledTaskIOConfig(object):
"""Configuration of task inputs and outputs.
A task can have multiple inputs, which define the context, and a task query
which defines what is to be executed in this context. The desired execution
is encoded in an output. The config defines the shapes of the inputs, the
query and the outputs.
"""
def __init__(self, inputs, output, query=None):
"""Constructs a Task input/output config.
Args:
      inputs: an OrderedDict mapping modality types to tuples. Each tuple
        represents the configuration of an input, with the first element being
        the type (a tf.DType) and the second element the shape (a list).
output: a tuple representing the configuration of the output.
query: a tuple representing the configuration of the query. If no query,
then None.
"""
    # A configuration of a single input, output or query. Consists of the type,
    # which must be a tf.DType, and a shape. The shape must be consistent with
    # the type, e.g. for an image input the shape is a 3-valued list.
io_config = collections.namedtuple('IOConfig', ['type', 'shape'])
def assert_config(config):
if not isinstance(config, tuple):
raise ValueError('config must be a tuple. Received {}'.format(
type(config)))
if len(config) != 2:
raise ValueError('config must have 2 elements, has %d' % len(config))
if not isinstance(config[0], tf.DType):
raise ValueError('First element of config must be a tf.DType.')
if not isinstance(config[1], list):
raise ValueError('Second element of config must be a list.')
assert isinstance(inputs, collections.OrderedDict)
for modality_type in inputs:
assert_config(inputs[modality_type])
self._inputs = collections.OrderedDict(
[(k, io_config(*value)) for k, value in inputs.iteritems()])
if query is not None:
assert_config(query)
self._query = io_config(*query)
else:
self._query = None
assert_config(output)
self._output = io_config(*output)
@property
def inputs(self):
return self._inputs
@property
def output(self):
return self._output
@property
def query(self):
return self._query
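# Construction sketch (illustrative): a config with a single image input
# sequence, an image query and a one-hot output; all shapes below are
# assumptions.
#
#   config = UnrolledTaskIOConfig(
#       inputs=collections.OrderedDict(
#           [('image', (tf.float32, [8, 64, 64, 3]))]),
#       query=(tf.float32, [64, 64, 3]),
#       output=(tf.float32, [8, 4]))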
class UnrolledTask(object):
"""An interface for a Task which can be unrolled during training.
  Each example is called an episode and consists of inputs and a target output,
  where the output can be considered as the desired unrolled sequence of
  actions for the inputs. For the specified tasks, these action sequences must
  be unambiguously definable.
"""
__metaclass__ = abc.ABCMeta
def __init__(self, config):
assert isinstance(config, UnrolledTaskIOConfig)
self._config = config
# A dict of bookkeeping variables.
self.info = {}
    # TensorFlow input is multithreaded and this lock is needed to prevent
    # race conditions in the environment. Without the lock, non-thread-safe
    # environments crash.
self._lock = threading.Lock()
@property
def config(self):
return self._config
@abc.abstractmethod
def episode(self):
"""Returns data needed to train and test a single episode.
Each episode consists of inputs, which define the context of the task, a
query which defines the task, and a target output, which defines a
sequence of actions to be executed for this query. This sequence should not
    require feedback, i.e. it can be predicted purely from the input and query.
Returns:
inputs, query, output, where inputs is a list of numpy arrays and query
and output are numpy arrays. These arrays must be of shape and type as
specified in the task configuration.
"""
pass
def reset(self, observation):
"""Called after the environment is reset."""
pass
def episode_batch(self, batch_size):
"""Returns a batch of episodes.
Args:
batch_size: size of batch.
Returns:
(inputs, query, output, masks) where inputs is list of numpy arrays and
query, output, and mask are numpy arrays. These arrays must be of shape
and type as specified in the task configuration with one additional
preceding dimension corresponding to the batch.
Raises:
ValueError: if self.episode() returns illegal values.
"""
batched_inputs = collections.OrderedDict(
[[mtype, []] for mtype in self.config.inputs])
batched_queries = []
batched_outputs = []
batched_masks = []
for _ in range(int(batch_size)):
with self._lock:
        # The episode function needs to be thread-safe. Since the current
        # implementations of the envs are not thread-safe, we need to lock
        # the operations here.
inputs, query, outputs = self.episode()
if not isinstance(outputs, tuple):
raise ValueError('Outputs return value must be tuple.')
if len(outputs) != 2:
raise ValueError('Output tuple must be of size 2.')
if inputs is not None:
for modality_type in batched_inputs:
batched_inputs[modality_type].append(
np.expand_dims(inputs[modality_type], axis=0))
if query is not None:
batched_queries.append(np.expand_dims(query, axis=0))
batched_outputs.append(np.expand_dims(outputs[0], axis=0))
if outputs[1] is not None:
batched_masks.append(np.expand_dims(outputs[1], axis=0))
batched_inputs = {
k: np.concatenate(i, axis=0) for k, i in batched_inputs.iteritems()
}
if batched_queries:
batched_queries = np.concatenate(batched_queries, axis=0)
batched_outputs = np.concatenate(batched_outputs, axis=0)
if batched_masks:
batched_masks = np.concatenate(batched_masks, axis=0).astype(np.float32)
else:
# When the array is empty, the default np.dtype is float64 which causes
# py_func to crash in the tests.
batched_masks = np.array([], dtype=np.float32)
batched_inputs = [batched_inputs[k] for k in self._config.inputs]
return batched_inputs, batched_queries, batched_outputs, batched_masks
def tf_episode_batch(self, batch_size):
"""A batch of episodes as TF Tensors.
Same as episode_batch with the difference that the return values are TF
Tensors.
Args:
batch_size: a python float for the batch size.
Returns:
inputs, query, output, mask where inputs is a dictionary of tf.Tensor
where the keys are the modality types specified in the config.inputs.
query, output, and mask are TF Tensors. These tensors must
be of shape and type as specified in the task configuration with one
      additional preceding dimension corresponding to the batch. The mask has
      the same batch and sequence dimensions as output, but no trailing label
      dimension.
"""
# Define TF outputs.
touts = []
shapes = []
for _, i in self._config.inputs.iteritems():
touts.append(i.type)
shapes.append(i.shape)
if self._config.query is not None:
touts.append(self._config.query.type)
shapes.append(self._config.query.shape)
# Shapes and types for batched_outputs.
touts.append(self._config.output.type)
shapes.append(self._config.output.shape)
# Shapes and types for batched_masks.
touts.append(self._config.output.type)
shapes.append(self._config.output.shape[0:1])
def episode_batch_func():
if self.config.query is None:
inp, _, output, masks = self.episode_batch(int(batch_size))
return tuple(inp) + (output, masks)
else:
inp, query, output, masks = self.episode_batch(int(batch_size))
return tuple(inp) + (query, output, masks)
tf_episode_batch = tf.py_func(episode_batch_func, [], touts,
stateful=True, name='taskdata')
for episode, shape in zip(tf_episode_batch, shapes):
episode.set_shape([batch_size] + shape)
tf_episode_batch_dict = collections.OrderedDict([
(mtype, episode)
for mtype, episode in zip(self.config.inputs.keys(), tf_episode_batch)
])
cur_index = len(self.config.inputs.keys())
tf_query = None
if self.config.query is not None:
tf_query = tf_episode_batch[cur_index]
cur_index += 1
tf_outputs = tf_episode_batch[cur_index]
tf_masks = tf_episode_batch[cur_index + 1]
return tf_episode_batch_dict, tf_query, tf_outputs, tf_masks
@abc.abstractmethod
def target_loss(self, true_targets, targets, weights=None):
"""A loss for training a task model.
This loss measures the discrepancy between the task outputs, the true and
predicted ones.
Args:
true_targets: tf.Tensor of shape and type as defined in the task config
containing the true outputs.
targets: tf.Tensor of shape and type as defined in the task config
containing the predicted outputs.
      weights: a bool tf.Tensor of the same shape as targets. Only true values
        are
considered when formulating the loss.
"""
pass
def reward(self, obs, done, info):
"""Returns a reward.
    The task has to compute a reward based on the state of the environment. The
    reward computation, though, is task specific. The task should use the
    environment interface, as defined in task_env.py, to compute the reward. If
this interface does not expose enough information, it is to be updated.
Args:
obs: Observation from environment's step function.
done: Done flag from environment's step function.
info: Info dict from environment's step function.
Returns:
obs: Observation.
reward: Floating point value.
done: Done flag.
info: Info dict.
"""
# Default implementation does not do anything.
return obs, 0.0, done, info
class RandomExplorationBasedTask(UnrolledTask):
"""A Task which starts with a random exploration of the environment."""
def __init__(self,
env,
seed,
add_query_noise=False,
query_noise_var=0.0,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
"""Initializes a Task using a random exploration runs.
Args:
env: an instance of type TaskEnv and gym.Env.
seed: a random seed.
add_query_noise: boolean, if True then whatever queries are generated,
they are randomly perturbed. The semantics of the queries depends on the
concrete task implementation.
query_noise_var: float, the variance of Gaussian noise used for query
perturbation. Used iff add_query_noise==True.
*args: see super class.
**kwargs: see super class.
"""
super(RandomExplorationBasedTask, self).__init__(*args, **kwargs)
assert isinstance(env, task_env.TaskEnv)
self._env = env
self._env.set_task(self)
self._rng = np.random.RandomState(seed)
self._add_query_noise = add_query_noise
self._query_noise_var = query_noise_var
# GoToStaticXTask can also take empty config but for the rest of the classes
# the number of modality types is 1.
if len(self.config.inputs.keys()) > 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type or less.')
def _exploration(self):
"""Generates a random exploration run.
The function uses the environment to generate a run.
Returns:
      A dict of numpy arrays keyed by modality type. Each array contains the
      observations of that modality, with type and shape as specified in
      config.inputs.
A list of states along the exploration path.
A list of vertex indices corresponding to the path of the exploration.
"""
in_seq_len = self._config.inputs.values()[0].shape[0]
path, _, states, step_outputs = self._env.random_step_sequence(
min_len=in_seq_len)
obs = {modality_type: [] for modality_type in self._config.inputs}
for o in step_outputs:
step_obs, _, done, _ = o
# It is expected that each value of step_obs is a dict of observations,
# whose dimensions are consistent with the config.inputs sizes.
for modality_type in self._config.inputs:
assert modality_type in step_obs, '{}'.format(type(step_obs))
o = step_obs[modality_type]
i = self._config.inputs[modality_type]
assert len(o.shape) == len(i.shape) - 1
for dim_o, dim_i in zip(o.shape, i.shape[1:]):
assert dim_o == dim_i, '{} != {}'.format(dim_o, dim_i)
obs[modality_type].append(o)
if done:
break
if not obs:
return obs, states, path
max_path_len = int(
round(in_seq_len * float(len(path)) / float(len(obs.values()[0]))))
path = path[-max_path_len:]
states = states[-in_seq_len:]
    # The above obs is a dict of lists of np.arrays. Re-format it as a dict of
    # np.arrays, each array containing all observations from all steps.
def regroup(obs, i):
"""Regroups observations.
Args:
        obs: a dict mapping each modality type to a list of observations, where
          the k-th element of each list is the observation from the k-th step.
          Each observation is a numpy array.
        i: the modality type whose observations are to be grouped.
      Returns:
        A numpy array of shape config.inputs[i].shape which contains all
        observations of modality i from all steps, concatenated along the first
        dimension. In addition, if the number of observations differs from
        the one specified in config.inputs[i].shape[0], the array is either
        padded at the back or clipped from the front.
"""
grouped_obs = np.concatenate(
[np.expand_dims(o, axis=0) for o in obs[i]], axis=0)
in_seq_len = self._config.inputs[i].shape[0]
# pylint: disable=unbalanced-tuple-unpacking
grouped_obs, _ = _pad_or_clip_array(
grouped_obs, in_seq_len, is_front_clip=True)
return grouped_obs
all_obs = {i: regroup(obs, i) for i in self._config.inputs}
return all_obs, states, path
def _obs_to_state(self, path, states):
"""Computes mapping between path nodes and states."""
# Generate a numpy array of locations corresponding to the path vertices.
path_coordinates = map(self._env.vertex_to_pose, path)
path_coordinates = np.concatenate(
[np.reshape(p, [1, 2]) for p in path_coordinates])
    # The observations are taken along a smoothed trajectory following the path.
    # We compute a mapping between the observations and the map vertices.
path_to_obs = collections.defaultdict(list)
obs_to_state = []
for i, s in enumerate(states):
location = np.reshape(s[0:2], [1, 2])
index = np.argmin(
np.reshape(
np.sum(np.power(path_coordinates - location, 2), axis=1), [-1]))
index = path[index]
path_to_obs[index].append(i)
obs_to_state.append(index)
return path_to_obs, obs_to_state
def _perturb_state(self, state, noise_var):
"""Perturbes the state.
The location are purturbed using a Gaussian noise with variance
noise_var. The orientation is uniformly sampled.
Args:
state: a numpy array containing an env state (x, y locations).
noise_var: float
Returns:
The perturbed state.
"""
def normal(v, std):
if std > 0:
n = self._rng.normal(0.0, std)
n = min(n, 2.0 * std)
n = max(n, -2.0 * std)
return v + n
else:
return v
state = state.copy()
state[0] = normal(state[0], noise_var)
state[1] = normal(state[1], noise_var)
if state.size > 2:
state[2] = self._rng.uniform(-math.pi, math.pi)
return state
def _sample_obs(self,
indices,
observations,
observation_states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=True):
"""Samples one observation which corresponds to vertex_index in path.
In addition, the sampled observation must have index in observations less
than max_obs_index. If these two conditions cannot be satisfied the
function returns None.
Args:
indices: a list of integers.
observations: a list of numpy arrays containing all the observations.
observation_states: a list of numpy arrays, each array representing the
state of the observation.
path_to_obs: a dict of path indices to lists of observation indices.
max_obs_index: an integer.
use_exploration_obs: if True, then the observation is sampled among the
specified observations, otherwise it is obtained from the environment.
Returns:
A tuple of:
-- A numpy array of size width x height x 3 representing the sampled
observation.
      -- The index of the sampled observation among the input observations.
-- The state at which the observation is captured.
Raises:
ValueError: if the observation and observation_states lists are of
different lengths.
"""
if len(observations) != len(observation_states):
raise ValueError('observation and observation_states lists must have '
'equal lengths')
if not indices:
return None, None, None
vertex_index = self._rng.choice(indices)
if use_exploration_obs:
obs_indices = path_to_obs[vertex_index]
if max_obs_index is not None:
obs_indices = [i for i in obs_indices if i < max_obs_index]
if obs_indices:
index = self._rng.choice(obs_indices)
if self._add_query_noise:
xytheta = self._perturb_state(observation_states[index],
self._query_noise_var)
return self._env.observation(xytheta), index, xytheta
else:
return observations[index], index, observation_states[index]
else:
return None, None, None
else:
xy = self._env.vertex_to_pose(vertex_index)
xytheta = np.array([xy[0], xy[1], 0.0])
xytheta = self._perturb_state(xytheta, self._query_noise_var)
return self._env.observation(xytheta), None, xytheta
class AreNearbyTask(RandomExplorationBasedTask):
"""A task of identifying whether a query is nearby current location or not.
The query is guaranteed to be in proximity of an already visited location,
i.e. close to one of the observations. For each observation we have one
query, which is either close or not to this observation.
"""
def __init__(
self,
max_distance=0,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
super(AreNearbyTask, self).__init__(*args, **kwargs)
self._max_distance = max_distance
if len(self.config.inputs.keys()) != 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type')
def episode(self):
"""Episode data.
Returns:
observations: a tuple with one element. This element is a numpy array of
size in_seq_len x observation_size x observation_size x 3 containing
in_seq_len images.
query: a numpy array of size
        in_seq_len x observation_size x observation_size x 3 containing query
        images, one per observation.
      A tuple of size two. The first element is an in_seq_len x 3 numpy array of
        one-hot label vectors. The i-th row denotes whether the i-th query image
        is visited and nearby (label 1), visited but not nearby (label 0), or
        not visited (label 2) relative to the i-th observation.
        The second element in the tuple is a mask, a numpy array of size
        in_seq_len x 1 with values 1.0 or 0.0 denoting whether the query is
        valid or not (it can happen that the query is not valid, e.g. there are
        not enough observations to form a meaningful query).
"""
observations, states, path = self._exploration()
assert len(observations.values()[0]) == len(states)
    # The observations are taken along a smoothed trajectory following the path.
    # We compute a mapping between the observations and the map vertices.
path_to_obs, obs_to_path = self._obs_to_state(path, states)
# Go over all observations, and sample a query. With probability 0.5 this
# query is a nearby observation (defined as belonging to the same vertex
# in path).
g = self._env.graph
queries = []
labels = []
validity_masks = []
query_index_in_observations = []
for i, curr_o in enumerate(observations.values()[0]):
p = obs_to_path[i]
low = max(0, i - self._max_distance)
# A list of lists of vertex indices. Each list in this group corresponds
# to one possible label.
index_groups = [[], [], []]
# Nearby visited indices, label 1.
nearby_visited = [
ii for ii in path[low:i + 1] + g[p].keys() if ii in obs_to_path[:i]
]
      nearby_visited = [ii for ii in nearby_visited if ii in path_to_obs]
# NOT Nearby visited indices, label 0.
not_nearby_visited = [ii for ii in path[:low] if ii not in g[p].keys()]
      not_nearby_visited = [ii for ii in not_nearby_visited if ii in path_to_obs]
# NOT visited indices, label 2.
not_visited = [
ii for ii in range(g.number_of_nodes()) if ii not in path[:i + 1]
]
index_groups = [not_nearby_visited, nearby_visited, not_visited]
# Consider only labels for which there are indices.
allowed_labels = [ii for ii, group in enumerate(index_groups) if group]
label = self._rng.choice(allowed_labels)
indices = list(set(index_groups[label]))
max_obs_index = None if label == 2 else i
use_exploration_obs = False if label == 2 else True
o, obs_index, _ = self._sample_obs(
indices=indices,
observations=observations.values()[0],
observation_states=states,
path_to_obs=path_to_obs,
max_obs_index=max_obs_index,
use_exploration_obs=use_exploration_obs)
query_index_in_observations.append(obs_index)
# If we cannot sample a valid query, we mark it as not valid in mask.
if o is None:
label = 0.0
o = curr_o
validity_masks.append(0)
else:
validity_masks.append(1)
queries.append(o.values()[0])
labels.append(label)
query = np.concatenate([np.expand_dims(q, axis=0) for q in queries], axis=0)
def one_hot(label, num_labels=3):
a = np.zeros((num_labels,), dtype=np.float)
a[int(label)] = 1.0
return a
outputs = np.stack([one_hot(l) for l in labels], axis=0)
validity_mask = np.reshape(
np.array(validity_masks, dtype=np.int32), [-1, 1])
self.info['query_index_in_observations'] = query_index_in_observations
self.info['observation_states'] = states
return observations, query, (outputs, validity_mask)
def target_loss(self, truth, predicted, weights=None):
pass
class NeighboringQueriesTask(RandomExplorationBasedTask):
"""A task of identifying whether two queries are closeby or not.
The proximity between queries is defined by the length of the shorest path
between them.
"""
def __init__(
self,
max_distance=1,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
"""Initializes a NeighboringQueriesTask.
Args:
max_distance: integer, the maximum distance in terms of number of vertices
between the two queries, so that they are considered neighboring.
*args: for super class.
**kwargs: for super class.
"""
super(NeighboringQueriesTask, self).__init__(*args, **kwargs)
self._max_distance = max_distance
if len(self.config.inputs.keys()) != 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type')
def episode(self):
"""Episode data.
Returns:
observations: a tuple with one element. This element is a numpy array of
size in_seq_len x observation_size x observation_size x 3 containing
in_seq_len images.
query: a numpy array of size
        2 x observation_size x observation_size x 3 containing a pair of query
        images.
      A tuple of size two. The first element is a numpy array of size 2
        containing a one-hot vector of whether the two observations are
        neighboring. The second element is a boolean numpy value denoting
        whether this is a valid
episode.
"""
observations, states, path = self._exploration()
assert len(observations.values()[0]) == len(states)
path_to_obs, _ = self._obs_to_state(path, states)
# Restrict path to ones for which observations have been generated.
path = [p for p in path if p in path_to_obs]
# Sample first query.
query1_index = self._rng.choice(path)
# Sample label.
label = self._rng.randint(2)
# Sample second query.
# If label == 1, then second query must be nearby, otherwise not.
closest_indices = nx.single_source_shortest_path(
self._env.graph, query1_index, self._max_distance).keys()
    if label == 0:
      # Path indices which are not close to the first query.
      indices = [p for p in path if p not in closest_indices]
    else:
      # Close indices which are on the path.
      indices = [p for p in closest_indices if p in path]
query2_index = self._rng.choice(indices)
# Generate an observation.
query1, query1_index, _ = self._sample_obs(
[query1_index],
observations.values()[0],
states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=True)
query2, query2_index, _ = self._sample_obs(
[query2_index],
observations.values()[0],
states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=True)
queries = np.concatenate(
[np.expand_dims(q, axis=0) for q in [query1, query2]])
labels = np.array([0, 0])
labels[label] = 1
is_valid = np.array([1])
self.info['observation_states'] = states
self.info['query_indices_in_observations'] = [query1_index, query2_index]
return observations, queries, (labels, is_valid)
def target_loss(self, truth, predicted, weights=None):
pass
#@gin.configurable
class GotoStaticXTask(RandomExplorationBasedTask):
"""Task go to a static X.
If continuous reward is used only one goal is allowed so that the reward can
be computed as a delta-distance to that goal..
"""
def __init__(self,
step_reward=0.0,
goal_reward=1.0,
hit_wall_reward=-1.0,
done_at_target=False,
use_continuous_reward=False,
*args,
**kwargs): # pylint: disable=keyword-arg-before-vararg
super(GotoStaticXTask, self).__init__(*args, **kwargs)
if len(self.config.inputs.keys()) > 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type or less.')
self._step_reward = step_reward
self._goal_reward = goal_reward
self._hit_wall_reward = hit_wall_reward
self._done_at_target = done_at_target
self._use_continuous_reward = use_continuous_reward
self._previous_path_length = None
def episode(self):
observations, _, path = self._exploration()
if len(path) < 2:
raise ValueError('The exploration path has only one node.')
g = self._env.graph
start = path[-1]
while True:
goal = self._rng.choice(path[:-1])
if goal != start:
break
goal_path = nx.shortest_path(g, start, goal)
init_orientation = self._rng.uniform(0, np.pi, (1,))
trajectory = np.array(
[list(self._env.vertex_to_pose(p)) for p in goal_path])
init_xy = np.reshape(trajectory[0, :], [-1])
init_state = np.concatenate([init_xy, init_orientation], 0)
trajectory = trajectory[1:, :]
deltas = envs_util.trajectory_to_deltas(trajectory, init_state)
output_seq_len = self._config.output.shape[0]
arr = _pad_or_clip_array(deltas, output_seq_len, output_mask=True)
# pylint: disable=unbalanced-tuple-unpacking
thetas, _, thetas_mask = arr
query = self._env.observation(self._env.vertex_to_pose(goal)).values()[0]
return observations, query, (thetas, thetas_mask)
def reward(self, obs, done, info):
if 'wall_collision' in info and info['wall_collision']:
return obs, self._hit_wall_reward, done, info
reward = 0.0
current_vertex = self._env.pose_to_vertex(self._env.state)
if current_vertex in self._env.targets():
if self._done_at_target:
done = True
else:
obs = self._env.reset()
reward = self._goal_reward
else:
if self._use_continuous_reward:
if len(self._env.targets()) != 1:
raise ValueError(
'FindX task with continuous reward is assuming only one target.')
goal_vertex = self._env.targets()[0]
path_length = self._compute_path_length(goal_vertex)
reward = self._previous_path_length - path_length
self._previous_path_length = path_length
else:
reward = self._step_reward
return obs, reward, done, info
def _compute_path_length(self, goal_vertex):
current_vertex = self._env.pose_to_vertex(self._env.state)
path = nx.shortest_path(self._env.graph, current_vertex, goal_vertex)
assert len(path) >= 2
curr_xy = np.array(self._env.state[:2])
next_xy = np.array(self._env.vertex_to_pose(path[1]))
last_step_distance = np.linalg.norm(next_xy - curr_xy)
return (len(path) - 2) * self._env.cell_size_px + last_step_distance
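  # Worked example (illustrative): for a shortest path of 4 vertices and a
  # cell size of 10 px, the returned length is (4 - 2) * 10 plus the Euclidean
  # distance from the current pose to the next vertex on the path.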
def reset(self, observation):
if self._use_continuous_reward:
if len(self._env.targets()) != 1:
raise ValueError(
'FindX task with continuous reward is assuming only one target.')
goal_vertex = self._env.targets()[0]
self._previous_path_length = self._compute_path_length(goal_vertex)
def target_loss(self, truth, predicted, weights=None):
"""Action classification loss.
Args:
truth: a batch_size x sequence length x number of labels float
Tensor containing a one hot vector for each label in each batch and
time.
predicted: a batch_size x sequence length x number of labels float
Tensor containing a predicted distribution over all actions.
weights: a batch_size x sequence_length float Tensor of bool
denoting which actions are valid.
Returns:
An average cross entropy over all batches and elements in sequence.
"""
return classification_loss(
truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class RelativeLocationTask(RandomExplorationBasedTask):
"""A task of estimating the relative location of a query w.r.t current.
It is to be used for debugging. It is designed such that the output is a
single value, out of a discrete set of values, so that it can be phrased as
a classification problem.
"""
def __init__(self, num_labels, *args, **kwargs):
"""Initializes a relative location task.
Args:
num_labels: integer, number of orientations to bin the relative
orientation into.
*args: see super class.
**kwargs: see super class.
"""
super(RelativeLocationTask, self).__init__(*args, **kwargs)
self._num_labels = num_labels
if len(self.config.inputs.keys()) != 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type')
def episode(self):
observations, states, path = self._exploration()
# Select a random element from history.
path_to_obs, _ = self._obs_to_state(path, states)
use_exploration_obs = not self._add_query_noise
query, _, query_state = self._sample_obs(
path[:-1],
observations.values()[0],
states,
path_to_obs,
max_obs_index=None,
use_exploration_obs=use_exploration_obs)
x, y, theta = tuple(states[-1])
q_x, q_y, _ = tuple(query_state)
t_x, t_y = q_x - x, q_y - y
(rt_x, rt_y) = (np.sin(theta) * t_x - np.cos(theta) * t_y,
np.cos(theta) * t_x + np.sin(theta) * t_y)
# Bins are [a(i), a(i+1)] for a(i) = -pi + 0.5 * bin_size + i * bin_size.
shift = np.pi * (1 - 1.0 / (2.0 * self._num_labels))
orientation = np.arctan2(rt_y, rt_x) + shift
if orientation < 0:
orientation += 2 * np.pi
label = int(np.floor(self._num_labels * orientation / (2 * np.pi)))
out_shape = self._config.output.shape
if len(out_shape) != 1:
raise ValueError('Output shape should be of rank 1.')
if out_shape[0] != self._num_labels:
raise ValueError('Output shape must be of size %d' % self._num_labels)
output = np.zeros(out_shape, dtype=np.float32)
output[label] = 1
return observations, query, (output, None)
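  # Worked example (illustrative): with num_labels = 4, shift is
  # pi * (1 - 1/8) = 7*pi/8, so a relative direction of arctan2(rt_y, rt_x) = 0
  # maps to orientation 7*pi/8 and to label
  # floor(4 * (7*pi/8) / (2*pi)) = floor(1.75) = 1.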
def target_loss(self, truth, predicted, weights=None):
return classification_loss(
truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class LocationClassificationTask(UnrolledTask):
"""A task of classifying a location as one of several classes.
The task does not have an input, but just a query and an output. The query
is an observation of the current location, e.g. an image taken from the
current state. The output is a label classifying this location in one of
predefined set of locations (or landmarks).
  The current implementation classifies locations as intersections based on the
  number and directions of bifurcations. It is expected that a location can have
  at most 4 different directions, aligned with the axes. As each of these four
  directions might be present or not, the number of possible intersection types
  is 2^4 = 16.
"""
def __init__(self, env, seed, *args, **kwargs):
super(LocationClassificationTask, self).__init__(*args, **kwargs)
self._env = env
self._rng = np.random.RandomState(seed)
# A location property which can be set. If not set, a random one is
# generated.
self._location = None
if len(self.config.inputs.keys()) > 1:
raise NotImplementedError('current implementation supports input '
'with only one modality type or less.')
@property
def location(self):
return self._location
@location.setter
def location(self, location):
self._location = location
def episode(self):
    # Get a location. If not set, sample one at a vertex with a random
    # orientation.
location = self._location
if location is None:
num_nodes = self._env.graph.number_of_nodes()
vertex = int(math.floor(self._rng.uniform(0, num_nodes)))
xy = self._env.vertex_to_pose(vertex)
theta = self._rng.uniform(0, 2 * math.pi)
location = np.concatenate(
[np.reshape(xy, [-1]), np.array([theta])], axis=0)
else:
vertex = self._env.pose_to_vertex(location)
theta = location[2]
neighbors = self._env.graph.neighbors(vertex)
xy_s = [self._env.vertex_to_pose(n) for n in neighbors]
def rotate(xy, theta):
"""Rotates a vector around the origin by angle theta.
Args:
        xy: a numpy ndarray of shape (2,) of floats containing the x and y
          coordinates of a vector.
        theta: a python float containing the rotation angle in radians.
      Returns:
        A numpy ndarray of floats of shape (2,) containing the x and y
        coordinates of the rotated xy.
"""
rotated_x = np.cos(theta) * xy[0] - np.sin(theta) * xy[1]
rotated_y = np.sin(theta) * xy[0] + np.cos(theta) * xy[1]
return np.array([rotated_x, rotated_y])
    # Rotate all intersection bifurcations by the orientation of the agent, as
    # the intersection label is defined in an agent-centered fashion.
xy_s = [
rotate(xy - location[0:2], -location[2] - math.pi / 4) for xy in xy_s
]
th_s = [np.arctan2(xy[1], xy[0]) for xy in xy_s]
out_shape = self._config.output.shape
if len(out_shape) != 1:
raise ValueError('Output shape should be of rank 1.')
num_labels = out_shape[0]
if num_labels != 16:
raise ValueError('Currently only 16 labels are supported '
'(there are 16 different 4 way intersection types).')
th_s = set([int(math.floor(4 * (th / (2 * np.pi) + 0.5))) for th in th_s])
one_hot_label = np.zeros((num_labels,), dtype=np.float32)
label = 0
for th in th_s:
label += pow(2, th)
one_hot_label[int(label)] = 1.0
query = self._env.observation(location).values()[0]
return [], query, (one_hot_label, None)
def reward(self, obs, done, info):
raise ValueError('Do not call.')
def target_loss(self, truth, predicted, weights=None):
return classification_loss(
truth=truth, predicted=predicted, weights=weights, is_one_hot=True)
class GotoStaticXNoExplorationTask(UnrolledTask):
"""An interface for findX tasks without exploration.
  The agent is initialized at a random location in a random world with a random
  goal, and the objective is for the agent to move toward the goal. This class
  generates episodes for such a task. Each episode consists of a sequence of
  observations x and target outputs y. x is the observations, an OrderedDict
  with keys provided from config.inputs.keys() and the shapes provided in
  config.inputs. The output is a numpy array with the shape specified in
  config.output. The shape of the array is (sequence_length x action_size),
  where action_size is the number of actions that can be taken in the
  environment. Note that config.output.shape should be set according to the
  number of actions that can be taken in the env.
  The target outputs y are the groundtruth values of each action, computed
  from the environment graph. The target output for each action is proportional
  to the progress that the action makes. A target value of 1 means that the
  action takes the agent one step closer to the goal, -1 means the action takes
  the agent one step farther. A value of -2 means that the action should not be
  taken at all, either because it leads to a collision or because it would
  terminate the episode prematurely.
"""
def __init__(self, env, *args, **kwargs):
super(GotoStaticXNoExplorationTask, self).__init__(*args, **kwargs)
if self._config.query is not None:
raise ValueError('query should be None.')
if len(self._config.output.shape) != 2:
raise ValueError('output should only have two dimensions:'
'(sequence_length x number_of_actions)')
for input_config in self._config.inputs.values():
if input_config.shape[0] != self._config.output.shape[0]:
        raise ValueError('the first dimension of the input and output should '
                         'be the same.')
if len(self._config.output.shape) != 2:
raise ValueError('output shape should be '
'(sequence_length x number_of_actions)')
self._env = env
def _compute_shortest_path_length(self, vertex, target_vertices):
"""Computes length of the shortest path from vertex to any target vertexes.
Args:
vertex: integer, index of the vertex in the environment graph.
target_vertices: list of the target vertexes
Returns:
integer, minimum distance from the vertex to any of the target_vertices.
Raises:
ValueError: if there is no path between the vertex and at least one of
the target_vertices.
"""
try:
return np.min([
len(nx.shortest_path(self._env.graph, vertex, t))
for t in target_vertices
])
except:
#logging.error('there is no path between vertex %d and at least one of '
# 'the targets %r', vertex, target_vertices)
raise
def _compute_gt_value(self, vertex, target_vertices):
"""Computes groundtruth value of all the actions at the vertex.
The value of each action is the difference each action makes in the length
of the shortest path to the goal. If an action takes the agent one step
closer to the goal the value is 1. In case, it takes the agent one step away
from the goal it would be -1. If it leads to collision or if the agent uses
action stop before reaching to the goal it is -2. To avoid scale issues the
gt_values are multipled by 0.5.
Args:
vertex: integer, the index of current vertex.
target_vertices: list of the integer indexes of the target views.
Returns:
numpy array with shape (action_size,) and each element is the groundtruth
value of each action based on the progress each action makes.
"""
action_size = self._config.output.shape[1]
output_value = np.ones((action_size), dtype=np.float32) * -2
my_distance = self._compute_shortest_path_length(vertex, target_vertices)
for adj in self._env.graph[vertex]:
adj_distance = self._compute_shortest_path_length(adj, target_vertices)
if adj_distance is None:
continue
action_index = self._env.action(
self._env.vertex_to_pose(vertex), self._env.vertex_to_pose(adj))
assert action_index is not None, ('{} is not adjacent to {}. There might '
'be a problem in environment graph '
'connectivity because there is no '
'direct edge between the given '
'vertices').format(
self._env.vertex_to_pose(vertex),
self._env.vertex_to_pose(adj))
output_value[action_index] = my_distance - adj_distance
return output_value * 0.5
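# Illustrative walk-through of the computation above (hypothetical numbers): if
# the current vertex is 3 shortest-path steps from the closest goal, a neighbor
# at distance 2 yields my_distance - adj_distance = 1, a neighbor at distance 4
# yields -1, and any action without a corresponding adjacent vertex keeps the
# initialized value of -2. After the final * 0.5 scaling these become
# 0.5, -0.5 and -1.0 respectively.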
def episode(self):
"""Returns data needed to train and test a single episode.
Returns:
(inputs, None, output) where inputs is a dictionary of modality types to
numpy arrays. The second element is the query; since the goal is assumed
to be given as part of the observation, it is None for this task. The
output is a tuple of the ground truth action values with the shape of
(sequence_length x action_size), coming from config.output.shape, and a
numpy array with the shape of (sequence_length,) that is 1 if the
corresponding element of the input and output should be used in the
training optimization.
Raises:
ValueError: If the output values from env.random_step_sequence are not
valid.
ValueError: If the shape of observations coming from the env is not
consistent with the config.
ValueError: If there is a modality type specified in the config but the
environment does not return that.
"""
# Sequence length is the first dimension of any of the input tensors.
sequence_length = self._config.inputs.values()[0].shape[0]
modality_types = self._config.inputs.keys()
path, _, _, step_outputs = self._env.random_step_sequence(
max_len=sequence_length)
target_vertices = [self._env.pose_to_vertex(x) for x in self._env.targets()]
if len(path) != len(step_outputs):
raise ValueError('path and step_outputs should have equal length'
' {}!={}'.format(len(path), len(step_outputs)))
# Building up observations. observations will be an OrderedDict of
# modality types. The values are numpy arrays that follow the given shape
# in the input config for each modality type.
observations = collections.OrderedDict([k, []] for k in modality_types)
for step_output in step_outputs:
obs_dict = step_output[0]
# Only going over the modality types that are specified in the input
# config.
for modality_type in modality_types:
if modality_type not in obs_dict:
raise ValueError('modality type is not returned from the environment. '
'{} not in {}'.format(modality_type,
obs_dict.keys()))
obs = obs_dict[modality_type]
if np.any(
obs.shape != tuple(self._config.inputs[modality_type].shape[1:])):
raise ValueError(
'The observations should have the same size as specified in '
'config for modality type {}. {} != {}'.format(
modality_type, obs.shape,
self._config.inputs[modality_type].shape[1:]))
observations[modality_type].append(obs)
gt_value = [self._compute_gt_value(v, target_vertices) for v in path]
# pylint: disable=unbalanced-tuple-unpacking
gt_value, _, value_mask = _pad_or_clip_array(
np.array(gt_value),
sequence_length,
is_front_clip=False,
output_mask=True,
)
for modality_type, obs in observations.iteritems():
observations[modality_type], _, mask = _pad_or_clip_array(
np.array(obs), sequence_length, is_front_clip=False, output_mask=True)
assert np.all(mask == value_mask)
return observations, None, (gt_value, value_mask)
def reset(self, observation):
"""Called after the environment is reset."""
pass
def target_loss(self, true_targets, targets, weights=None):
"""A loss for training a task model.
This loss measures the discrepancy between the task outputs, the true and
predicted ones.
Args:
true_targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
weights: tf.Tensor of tf.bool with the shape of
(batch_size x sequence_length).
Raises:
ValueError: if the shapes of the input tensors are not consistent.
Returns:
L2 loss between the predicted action values and true action values.
"""
targets_shape = targets.get_shape().as_list()
true_targets_shape = true_targets.get_shape().as_list()
if len(targets_shape) != 3 or len(true_targets_shape) != 3:
raise ValueError('invalid shape for targets or true_targets_shape')
if np.any(targets_shape != true_targets_shape):
raise ValueError('the shapes of targets and true_targets are not the same '
'{} != {}'.format(targets_shape, true_targets_shape))
if weights is not None:
# Filtering targets and true_targets using weights.
weights_shape = weights.get_shape().as_list()
if np.any(weights_shape != targets_shape[0:2]):
raise ValueError('The first two elements of weights shape should match '
'target. {} != {}'.format(weights_shape,
targets_shape))
true_targets = tf.boolean_mask(true_targets, weights)
targets = tf.boolean_mask(targets, weights)
return tf.losses.mean_squared_error(tf.reshape(targets, [-1]),
tf.reshape(true_targets, [-1]))
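# A minimal sketch of how the masking above behaves, assuming a toy setup (the
# shapes and values are hypothetical): with batch_size=1, sequence_length=3 and
# action_size=2, a weights tensor of [[True, True, False]] selects only the
# first two timesteps, so tf.boolean_mask yields tensors of shape (2, 2) and
# the mean squared error is averaged over those 4 remaining values only; padded
# timesteps contribute nothing to the loss.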
def reward(self, obs, done, info):
raise NotImplementedError('reward is not implemented for this task')
################################################################################
class NewTask(UnrolledTask):
def __init__(self, env, *args, **kwargs):
super(NewTask, self).__init__(*args, **kwargs)
self._env = env
def _compute_shortest_path_length(self, vertex, target_vertices):
"""Computes length of the shortest path from vertex to any target vertexes.
Args:
vertex: integer, index of the vertex in the environment graph.
target_vertices: list of the target vertices.
Returns:
integer, minimum distance from the vertex to any of the target_vertices.
Raises:
ValueError: if there is no path between the vertex and at least one of
the target_vertices.
"""
try:
return np.min([
len(nx.shortest_path(self._env.graph, vertex, t))
for t in target_vertices
])
except:
logging.error('there is no path between vertex %d and at least one of '
'the targets %r', vertex, target_vertices)
raise
def _compute_gt_value(self, vertex, target_vertices):
"""Computes groundtruth value of all the actions at the vertex.
The value of each action is the difference each action makes in the length
of the shortest path to the goal. If an action takes the agent one step
closer to the goal, the value is 1. If it takes the agent one step away
from the goal, the value is -1. If it leads to a collision, or if the agent
uses the stop action before reaching the goal, the value is -2. To avoid
scale issues the gt_values are multiplied by 0.5.
Args:
vertex: integer, the index of current vertex.
target_vertices: list of the integer indexes of the target views.
Returns:
numpy array with shape (action_size,) and each element is the groundtruth
value of each action based on the progress each action makes.
"""
action_size = self._config.output.shape[1]
output_value = np.ones((action_size), dtype=np.float32) * -2
# Uses this class's own _compute_shortest_path_length, which returns a float.
my_distance = self._compute_shortest_path_length(vertex, target_vertices)
for adj in self._env.graph[vertex]:
adj_distance = self._compute_shortest_path_length(adj, target_vertices)
if adj_distance is None:
continue
action_index = self._env.action(
self._env.vertex_to_pose(vertex), self._env.vertex_to_pose(adj))
assert action_index is not None, ('{} is not adjacent to {}. There might '
'be a problem in environment graph '
'connectivity because there is no '
'direct edge between the given '
'vertices').format(
self._env.vertex_to_pose(vertex),
self._env.vertex_to_pose(adj))
output_value[action_index] = my_distance - adj_distance
return output_value * 0.5
def episode(self):
"""Returns data needed to train and test a single episode.
Returns:
(inputs, None, output) where inputs is a dictionary of modality types to
numpy arrays. The second element is the query; since the goal is assumed
to be given as part of the observation, it is None for this task. The
output is a tuple of the ground truth action values with the shape of
(sequence_length x action_size), coming from config.output.shape, and a
numpy array with the shape of (sequence_length,) that is 1 if the
corresponding element of the input and output should be used in the
training optimization.
Raises:
ValueError: If the output values from env.random_step_sequence are not
valid.
ValueError: If the shape of observations coming from the env is not
consistent with the config.
ValueError: If there is a modality type specified in the config but the
environment does not return that.
"""
# Sequence length is the first dimension of any of the input tensors.
sequence_length = self._config.inputs.values()[0].shape[0]
modality_types = self._config.inputs.keys()
path, _, _, step_outputs = self._env.random_step_sequence(
max_len=sequence_length)
target_vertices = [self._env.pose_to_vertex(x) for x in self._env.targets()]
if len(path) != len(step_outputs):
raise ValueError('path and step_outputs should have equal length'
' {}!={}'.format(len(path), len(step_outputs)))
# Building up observations. observations will be an OrderedDict of
# modality types. The values are numpy arrays that follow the given shape
# in the input config for each modality type.
observations = collections.OrderedDict([k, []] for k in modality_types)
for step_output in step_outputs:
obs_dict = step_output[0]
# Only going over the modality types that are specified in the input
# config.
for modality_type in modality_types:
if modality_type not in obs_dict:
raise ValueError('modality type is not returned from the environment. '
'{} not in {}'.format(modality_type,
obs_dict.keys()))
obs = obs_dict[modality_type]
if np.any(
obs.shape != tuple(self._config.inputs[modality_type].shape[1:])):
raise ValueError(
'The observations should have the same size as specified in '
'config for modality type {}. {} != {}'.format(
modality_type, obs.shape,
self._config.inputs[modality_type].shape[1:]))
observations[modality_type].append(obs)
gt_value = [self._compute_gt_value(v, target_vertices) for v in path]
# pylint: disable=unbalanced-tuple-unpacking
gt_value, _, value_mask = _pad_or_clip_array(
np.array(gt_value),
sequence_length,
is_front_clip=False,
output_mask=True,
)
for modality_type, obs in observations.iteritems():
observations[modality_type], _, mask = _pad_or_clip_array(
np.array(obs), sequence_length, is_front_clip=False, output_mask=True)
assert np.all(mask == value_mask)
return observations, None, (gt_value, value_mask)
def reset(self, observation):
"""Called after the environment is reset."""
pass
def target_loss(self, true_targets, targets, weights=None):
"""A loss for training a task model.
This loss measures the discrepancy between the task outputs, the true and
predicted ones.
Args:
true_targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
targets: tf.Tensor of tf.float32 with the shape of
(batch_size x sequence_length x action_size).
weights: tf.Tensor of tf.bool with the shape of
(batch_size x sequence_length).
Raises:
ValueError: if the shapes of the input tensors are not consistent.
Returns:
L2 loss between the predicted action values and true action values.
"""
targets_shape = targets.get_shape().as_list()
true_targets_shape = true_targets.get_shape().as_list()
if len(targets_shape) != 3 or len(true_targets_shape) != 3:
raise ValueError('invalid shape for targets or true_targets_shape')
if np.any(targets_shape != true_targets_shape):
raise ValueError('the shapes of targets and true_targets are not the same '
'{} != {}'.format(targets_shape, true_targets_shape))
if weights is not None:
# Filtering targets and true_targets using weights.
weights_shape = weights.get_shape().as_list()
if np.any(weights_shape != targets_shape[0:2]):
raise ValueError('The first two elements of weights shape should match '
'target. {} != {}'.format(weights_shape,
targets_shape))
true_targets = tf.boolean_mask(true_targets, weights)
targets = tf.boolean_mask(targets, weights)
return tf.losses.mean_squared_error(tf.reshape(targets, [-1]),
tf.reshape(true_targets, [-1]))
def reward(self, obs, done, info):
raise NotImplementedError('reward is not implemented for this task')
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=line-too-long
# pyformat: disable
"""Train and eval for supervised navigation training.
For training:
python train_supervised_active_vision.py \
--mode='train' \
--logdir=$logdir/checkin_log_det/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="$datadir"' \
--logtostderr
For testing:
python train_supervised_active_vision.py \
--mode='eval' \
--logdir=$logdir/checkin_log_det/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="$datadir"' \
--logtostderr
"""
import collections
import os
import time
from absl import app
from absl import flags
from absl import logging
import networkx as nx
import numpy as np
import tensorflow as tf
import gin
import embedders
import policies
import tasks
from envs import active_vision_dataset_env
from envs import task_env
slim = tf.contrib.slim
flags.DEFINE_string('logdir', '',
'Path to a directory to write summaries and checkpoints')
# Parameters controlling the training setup. In general one would not need to
# modify them.
flags.DEFINE_string('master', 'local',
'BNS name of the TensorFlow master, or local.')
flags.DEFINE_integer('task_id', 0,
'Task id of the replica running the training.')
flags.DEFINE_integer('ps_tasks', 0,
'Number of tasks in the ps job. If 0 no ps job is used.')
flags.DEFINE_integer('decay_steps', 1000,
'Number of steps for exponential decay.')
flags.DEFINE_float('learning_rate', 0.0001, 'Learning rate.')
flags.DEFINE_integer('batch_size', 8, 'Batch size.')
flags.DEFINE_integer('sequence_length', 20, 'sequence length')
flags.DEFINE_integer('train_iters', 200000, 'number of training iterations.')
flags.DEFINE_integer('save_summaries_secs', 300,
'number of seconds between saving summaries')
flags.DEFINE_integer('save_interval_secs', 300,
'number of seconds between saving variables')
flags.DEFINE_integer('log_every_n_steps', 20, 'number of steps between logging')
flags.DEFINE_string('modality_types', '',
'modality names in _ separated format')
flags.DEFINE_string('conv_window_sizes', '8_4_3',
'conv window sizes separated by _')
flags.DEFINE_string('conv_strides', '4_2_1', '')
flags.DEFINE_string('conv_channels', '8_16_16', '')
flags.DEFINE_integer('embedding_fc_size', 128,
'size of embedding for each modality')
flags.DEFINE_integer('obs_resolution', 64,
'resolution of the input observations')
flags.DEFINE_integer('lstm_cell_size', 2048, 'size of the lstm cell')
flags.DEFINE_integer('policy_fc_size', 2048,
'size of fully connected layers for policy part')
flags.DEFINE_float('weight_decay', 0.0002, 'weight decay')
flags.DEFINE_integer('goal_category_count', 5, 'number of goal categories')
flags.DEFINE_integer('action_size', 7, 'number of possible actions')
flags.DEFINE_integer('max_eval_episode_length', 100,
'maximum sequence length for evaluation.')
flags.DEFINE_enum('mode', 'train', ['train', 'eval'],
'indicates whether it is in training or evaluation')
flags.DEFINE_integer('test_iters', 194,
'number of iterations that the eval needs to be run')
flags.DEFINE_multi_string('gin_config', [],
'List of paths to a gin config files for the env.')
flags.DEFINE_multi_string('gin_params', [],
'Newline separated list of Gin parameter bindings.')
flags.DEFINE_string(
'resnet50_path', './resnet_v2_50_checkpoint/resnet_v2_50.ckpt', 'path to resnet50 '
'checkpoint')
flags.DEFINE_bool('freeze_resnet_weights', True, '')
flags.DEFINE_string(
'eval_init_points_file_name', '',
'Name of the file that contains the initial locations and '
'worlds for each evaluation point')
FLAGS = flags.FLAGS
TRAIN_WORLDS = [
'Home_001_1', 'Home_001_2', 'Home_002_1', 'Home_003_1', 'Home_003_2',
'Home_004_1', 'Home_004_2', 'Home_005_1', 'Home_005_2', 'Home_006_1',
'Home_010_1'
]
TEST_WORLDS = ['Home_011_1', 'Home_013_1', 'Home_016_1']
def create_modality_types():
"""Parses the modality_types and returns a list of task_env.ModalityType."""
if not FLAGS.modality_types:
raise ValueError('there needs to be at least one modality type')
modality_types = FLAGS.modality_types.split('_')
for x in modality_types:
if x not in ['image', 'sseg', 'det', 'depth']:
raise ValueError('invalid modality type: {}'.format(x))
conversion_dict = {
'image': task_env.ModalityTypes.IMAGE,
'sseg': task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
'depth': task_env.ModalityTypes.DEPTH,
'det': task_env.ModalityTypes.OBJECT_DETECTION,
}
return [conversion_dict[k] for k in modality_types]
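# Example usage (a sketch; assumes the flags have already been parsed):
#   FLAGS.modality_types = 'det_depth'
#   create_modality_types()
#   # -> [task_env.ModalityTypes.OBJECT_DETECTION, task_env.ModalityTypes.DEPTH]
# Any token outside {'image', 'sseg', 'det', 'depth'} raises a ValueError.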
def create_task_io_config(
modality_types,
goal_category_count,
action_size,
sequence_length,
):
"""Generates task io config."""
shape_prefix = [sequence_length, FLAGS.obs_resolution, FLAGS.obs_resolution]
shapes = {
task_env.ModalityTypes.IMAGE: [sequence_length, 224, 224, 3],
task_env.ModalityTypes.DEPTH: shape_prefix + [
2,
],
task_env.ModalityTypes.SEMANTIC_SEGMENTATION: shape_prefix + [
1,
],
task_env.ModalityTypes.OBJECT_DETECTION: shape_prefix + [
90,
]
}
types = {k: tf.float32 for k in shapes}
types[task_env.ModalityTypes.IMAGE] = tf.uint8
inputs = collections.OrderedDict(
[[mtype, (types[mtype], shapes[mtype])] for mtype in modality_types])
inputs[task_env.ModalityTypes.GOAL] = (tf.float32,
[sequence_length, goal_category_count])
inputs[task_env.ModalityTypes.PREV_ACTION] = (tf.float32, [
sequence_length, action_size + 1
])
print inputs
return tasks.UnrolledTaskIOConfig(
inputs=inputs,
output=(tf.float32, [sequence_length, action_size]),
query=None)
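# Sketch of the resulting config for a 'det'-only run with the default flags
# (sequence_length=20, obs_resolution=64, goal_category_count=5, action_size=7;
# the shapes below are implied by the code above, not measured):
#   inputs[OBJECT_DETECTION] : (tf.float32, [20, 64, 64, 90])
#   inputs[GOAL]             : (tf.float32, [20, 5])
#   inputs[PREV_ACTION]      : (tf.float32, [20, 8])   # action_size + 1
#   output                   : (tf.float32, [20, 7])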
def map_to_embedder(modality_type):
"""Maps modality_type to its corresponding embedder."""
if modality_type == task_env.ModalityTypes.PREV_ACTION:
return None
if modality_type == task_env.ModalityTypes.GOAL:
return embedders.IdentityEmbedder()
if modality_type == task_env.ModalityTypes.IMAGE:
return embedders.ResNet50Embedder()
conv_window_sizes = [int(x) for x in FLAGS.conv_window_sizes.split('_')]
conv_channels = [int(x) for x in FLAGS.conv_channels.split('_')]
conv_strides = [int(x) for x in FLAGS.conv_strides.split('_')]
params = tf.contrib.training.HParams(
to_one_hot=modality_type == task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
one_hot_length=10,
conv_sizes=conv_window_sizes,
conv_strides=conv_strides,
conv_channels=conv_channels,
embedding_size=FLAGS.embedding_fc_size,
weight_decay_rate=FLAGS.weight_decay,
)
return embedders.SmallNetworkEmbedder(params)
def create_train_and_init_ops(policy, task):
"""Creates training ops given the arguments.
Args:
policy: the policy for the task.
task: the task instance.
Returns:
train_op: the op that needs to be run at each step.
summaries_op: the summary op that is executed.
init_fn: the op that initializes the variables if there is no previous
checkpoint. If Resnet50 is not used in the model it is None, otherwise
it reads the weights from FLAGS.resnet50_path and sets the init_fn
to the op that initializes the ResNet50 with the pre-trained weights.
"""
assert isinstance(task, tasks.GotoStaticXNoExplorationTask)
assert isinstance(policy, policies.Policy)
inputs, _, gt_outputs, masks = task.tf_episode_batch(FLAGS.batch_size)
outputs, _ = policy.build(inputs, None)
loss = task.target_loss(gt_outputs, outputs, masks)
init_fn = None
# If resnet is added to the graph, init_fn should initialize resnet weights
# if there is no previous checkpoint.
variables_assign_dict = {}
vars_list = []
for v in slim.get_model_variables():
if v.name.find('resnet') >= 0:
if not FLAGS.freeze_resnet_weights:
vars_list.append(v)
variables_assign_dict[v.name[v.name.find('resnet'):-2]] = v
else:
vars_list.append(v)
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(
FLAGS.learning_rate,
global_step,
decay_steps=FLAGS.decay_steps,
decay_rate=0.98,
staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = slim.learning.create_train_op(
loss,
optimizer,
global_step=global_step,
variables_to_train=vars_list,
)
if variables_assign_dict:
init_fn = slim.assign_from_checkpoint_fn(
FLAGS.resnet50_path,
variables_assign_dict,
ignore_missing_vars=False)
scalar_summaries = {}
scalar_summaries['LR'] = learning_rate
scalar_summaries['loss'] = loss
for name, summary in scalar_summaries.iteritems():
tf.summary.scalar(name, summary)
return train_op, init_fn
def create_eval_ops(policy, config, possible_targets):
"""Creates the necessary ops for evaluation."""
inputs_feed = collections.OrderedDict([[
mtype,
tf.placeholder(config.inputs[mtype].type,
[1] + config.inputs[mtype].shape)
] for mtype in config.inputs])
inputs_feed[task_env.ModalityTypes.PREV_ACTION] = tf.placeholder(
tf.float32, [1, 1] + [
config.output.shape[-1] + 1,
])
prev_state_feed = [
tf.placeholder(
tf.float32, [1, FLAGS.lstm_cell_size], name='prev_state_{}'.format(i))
for i in range(2)
]
policy_outputs = policy.build(inputs_feed, prev_state_feed)
summary_feed = {}
for c in possible_targets + ['mean']:
summary_feed[c] = tf.placeholder(
tf.float32, [], name='eval_in_range_{}_input'.format(c))
tf.summary.scalar('eval_in_range_{}'.format(c), summary_feed[c])
return inputs_feed, prev_state_feed, policy_outputs, (tf.summary.merge_all(),
summary_feed)
def unroll_policy_for_eval(
sess,
env,
inputs_feed,
prev_state_feed,
policy_outputs,
number_of_steps,
output_folder,
):
"""unrolls the policy for testing.
Args:
sess: tf.Session
env: The environment.
inputs_feed: dictionary of placeholder for the input modalities.
prev_state_feed: placeholder for the input to the prev_state of the model.
policy_outputs: tensor that contains outputs of the policy.
number_of_steps: maximum number of unrolling steps.
output_folder: folder where the function writes a dictionary of detailed
information about the path. The dictionary keys are 'states' and
'distance'. The value for 'states' is the list of states that the agent
visits along the path. The value for 'distance' contains the length of the
shortest path to the goal at each step.
Returns:
states: list of states along the path.
distance: list of distances along the path.
"""
prev_state = [
np.zeros((1, FLAGS.lstm_cell_size), dtype=np.float32) for _ in range(2)
]
prev_action = np.zeros((1, 1, FLAGS.action_size + 1), dtype=np.float32)
obs = env.reset()
distances_to_goal = []
states = []
unique_id = '{}_{}'.format(env.cur_image_id(), env.goal_string)
for _ in range(number_of_steps):
distances_to_goal.append(
np.min([
len(
nx.shortest_path(env.graph, env.pose_to_vertex(env.state()),
env.pose_to_vertex(target_view)))
for target_view in env.targets()
]))
states.append(env.state())
feed_dict = {inputs_feed[mtype]: [[obs[mtype]]] for mtype in inputs_feed}
feed_dict[prev_state_feed[0]] = prev_state[0]
feed_dict[prev_state_feed[1]] = prev_state[1]
action_values, prev_state = sess.run(policy_outputs, feed_dict=feed_dict)
chosen_action = np.argmax(action_values[0])
obs, _, done, info = env.step(np.int32(chosen_action))
prev_action[0][0][chosen_action] = 1.
prev_action[0][0][-1] = float(info['success'])
# If the agent chooses the stop action or the number of steps exceeds
# env._episode_length.
if done:
break
output_path = os.path.join(output_folder, unique_id + '.npy')
with tf.gfile.Open(output_path, 'w') as f:
print 'saving path information to {}'.format(output_path)
np.save(f, {'states': states, 'distance': distances_to_goal})
return states, distances_to_goal
def init(sequence_length, eval_init_points_file_name, worlds):
"""Initializes the common operations between train and test."""
modality_types = create_modality_types()
logging.info('modality types: %r', modality_types)
# A negative reward_goal_range prevents the env from terminating early when
# the agent is close to the goal. The policy should keep the agent going until
# the end of the 100 steps, either by choosing the stop action or by
# oscillating around the target.
env = active_vision_dataset_env.ActiveVisionDatasetEnv(
modality_types=modality_types +
[task_env.ModalityTypes.GOAL, task_env.ModalityTypes.PREV_ACTION],
reward_goal_range=-1,
eval_init_points_file_name=eval_init_points_file_name,
worlds=worlds,
output_size=FLAGS.obs_resolution,
)
config = create_task_io_config(
modality_types=modality_types,
goal_category_count=FLAGS.goal_category_count,
action_size=FLAGS.action_size,
sequence_length=sequence_length,
)
task = tasks.GotoStaticXNoExplorationTask(env=env, config=config)
embedders_dict = {mtype: map_to_embedder(mtype) for mtype in config.inputs}
policy_params = tf.contrib.training.HParams(
lstm_state_size=FLAGS.lstm_cell_size,
fc_channels=FLAGS.policy_fc_size,
weight_decay=FLAGS.weight_decay,
target_embedding_size=FLAGS.embedding_fc_size,
)
policy = policies.LSTMPolicy(
modality_names=config.inputs.keys(),
embedders_dict=embedders_dict,
action_size=FLAGS.action_size,
params=policy_params,
max_episode_length=sequence_length)
return env, config, task, policy
def test():
"""Contains all the operations for testing policies."""
env, config, _, policy = init(1, 'all_init_configs', TEST_WORLDS)
inputs_feed, prev_state_feed, policy_outputs, summary_op = create_eval_ops(
policy, config, env.possible_targets)
sv = tf.train.Supervisor(logdir=FLAGS.logdir)
prev_checkpoint = None
with sv.managed_session(
start_standard_services=False,
config=tf.ConfigProto(allow_soft_placement=True)) as sess:
while not sv.should_stop():
while True:
new_checkpoint = tf.train.latest_checkpoint(FLAGS.logdir)
print 'new_checkpoint ', new_checkpoint
if not new_checkpoint:
time.sleep(1)
continue
if prev_checkpoint is None:
prev_checkpoint = new_checkpoint
break
if prev_checkpoint != new_checkpoint:
prev_checkpoint = new_checkpoint
break
else: # if prev_checkpoint == new_checkpoint, we have to wait more.
time.sleep(1)
checkpoint_step = int(new_checkpoint[new_checkpoint.rfind('-') + 1:])
sv.saver.restore(sess, new_checkpoint)
print '--------------------'
print 'evaluating checkpoint {}'.format(new_checkpoint)
folder_path = os.path.join(FLAGS.logdir, 'evals', str(checkpoint_step))
if not tf.gfile.Exists(folder_path):
tf.gfile.MakeDirs(folder_path)
eval_stats = {c: [] for c in env.possible_targets}
for test_iter in range(FLAGS.test_iters):
print 'evaluating {} of {}'.format(test_iter, FLAGS.test_iters)
_, distance_to_goal = unroll_policy_for_eval(
sess,
env,
inputs_feed,
prev_state_feed,
policy_outputs,
FLAGS.max_eval_episode_length,
folder_path,
)
print 'goal = {}'.format(env.goal_string)
eval_stats[env.goal_string].append(float(distance_to_goal[-1] <= 7))
eval_stats = {k: np.mean(v) for k, v in eval_stats.iteritems()}
eval_stats['mean'] = np.mean(eval_stats.values())
print eval_stats
feed_dict = {summary_op[1][c]: eval_stats[c] for c in eval_stats}
summary_str = sess.run(summary_op[0], feed_dict=feed_dict)
writer = sv.summary_writer
writer.add_summary(summary_str, checkpoint_step)
writer.flush()
def train():
_, _, task, policy = init(FLAGS.sequence_length, None, TRAIN_WORLDS)
print(FLAGS.save_summaries_secs)
print(FLAGS.save_interval_secs)
print(FLAGS.logdir)
with tf.device(
tf.train.replica_device_setter(ps_tasks=FLAGS.ps_tasks, merge_devices=True)):
train_op, init_fn = create_train_and_init_ops(policy=policy, task=task)
print(FLAGS.logdir)
slim.learning.train(
train_op=train_op,
init_fn=init_fn,
logdir=FLAGS.logdir,
is_chief=FLAGS.task_id == 0,
number_of_steps=FLAGS.train_iters,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs,
session_config=tf.ConfigProto(allow_soft_placement=True),
)
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_params)
if FLAGS.mode == 'train':
train()
else:
test()
if __name__ == '__main__':
app.run(main)
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# blaze build -c opt train_supervised_active_vision
# bazel build -c opt --config=cuda --copt=-mavx train_supervised_active_vision && \
bazel-bin/research/cognitive_planning/train_supervised_active_vision \
--mode='train' \
--logdir=/usr/local/google/home/kosecka/local_avd_train/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="/cns/jn-d/home/kosecka/AVD_Minimal/"' \
--logtostderr
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A set of functions that are used for visualization.
These functions often receive an image and perform some visualization on it.
The functions do not return a value; instead they modify the image itself.
"""
import collections
import functools
# Set headless-friendly backend.
import matplotlib; matplotlib.use('Agg') # pylint: disable=multiple-statements
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import PIL.Image as Image
import PIL.ImageColor as ImageColor
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
import six
import tensorflow as tf
import standard_fields as fields
_TITLE_LEFT_MARGIN = 10
_TITLE_TOP_MARGIN = 10
STANDARD_COLORS = [
'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
'WhiteSmoke', 'Yellow', 'YellowGreen'
]
def save_image_array_as_png(image, output_path):
"""Saves an image (represented as a numpy array) to PNG.
Args:
image: a numpy array with shape [height, width, 3].
output_path: path to which image should be written.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
with tf.gfile.Open(output_path, 'w') as fid:
image_pil.save(fid, 'PNG')
def encode_image_array_as_png_str(image):
"""Encodes a numpy array into a PNG string.
Args:
image: a numpy array with shape [height, width, 3].
Returns:
PNG encoded image string.
"""
image_pil = Image.fromarray(np.uint8(image))
output = six.BytesIO()
image_pil.save(output, format='PNG')
png_string = output.getvalue()
output.close()
return png_string
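# Example usage (a minimal sketch with a synthetic image; the array contents
# are arbitrary):
#   img = np.zeros((8, 8, 3), dtype=np.uint8)
#   png_bytes = encode_image_array_as_png_str(img)
#   # png_bytes starts with the PNG signature b'\x89PNG'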
def draw_bounding_box_on_image_array(image,
ymin,
xmin,
ymax,
xmax,
color='red',
thickness=4,
display_str_list=(),
use_normalized_coordinates=True):
"""Adds a bounding box to an image (numpy array).
Bounding box coordinates can be specified in either absolute (pixel) or
normalized coordinates by setting the use_normalized_coordinates argument.
Args:
image: a numpy array with shape [height, width, 3].
ymin: ymin of bounding box.
xmin: xmin of bounding box.
ymax: ymax of bounding box.
xmax: xmax of bounding box.
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list: list of strings to display in box
(each to be shown on its own line).
use_normalized_coordinates: If True (default), treat coordinates
ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
coordinates as absolute.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
draw_bounding_box_on_image(image_pil, ymin, xmin, ymax, xmax, color,
thickness, display_str_list,
use_normalized_coordinates)
np.copyto(image, np.array(image_pil))
def draw_bounding_box_on_image(image,
ymin,
xmin,
ymax,
xmax,
color='red',
thickness=4,
display_str_list=(),
use_normalized_coordinates=True):
"""Adds a bounding box to an image.
Bounding box coordinates can be specified in either absolute (pixel) or
normalized coordinates by setting the use_normalized_coordinates argument.
Each string in display_str_list is displayed on a separate line above the
bounding box in black text on a rectangle filled with the input 'color'.
If the top of the bounding box extends to the edge of the image, the strings
are displayed below the bounding box.
Args:
image: a PIL.Image object.
ymin: ymin of bounding box.
xmin: xmin of bounding box.
ymax: ymax of bounding box.
xmax: xmax of bounding box.
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list: list of strings to display in box
(each to be shown on its own line).
use_normalized_coordinates: If True (default), treat coordinates
ymin, xmin, ymax, xmax as relative to the image. Otherwise treat
coordinates as absolute.
"""
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
if use_normalized_coordinates:
(left, right, top, bottom) = (xmin * im_width, xmax * im_width,
ymin * im_height, ymax * im_height)
else:
(left, right, top, bottom) = (xmin, xmax, ymin, ymax)
draw.line([(left, top), (left, bottom), (right, bottom),
(right, top), (left, top)], width=thickness, fill=color)
try:
font = ImageFont.truetype('arial.ttf', 24)
except IOError:
font = ImageFont.load_default()
# If the total height of the display strings added to the top of the bounding
# box exceeds the top of the image, stack the strings below the bounding box
# instead of above.
display_str_heights = [font.getsize(ds)[1] for ds in display_str_list]
# Each display_str has a top and bottom margin of 0.05x.
total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
if top > total_display_str_height:
text_bottom = top
else:
text_bottom = bottom + total_display_str_height
# Reverse list and print from bottom to top.
for display_str in display_str_list[::-1]:
text_width, text_height = font.getsize(display_str)
margin = np.ceil(0.05 * text_height)
draw.rectangle(
[(left, text_bottom - text_height - 2 * margin), (left + text_width,
text_bottom)],
fill=color)
draw.text(
(left + margin, text_bottom - text_height - margin),
display_str,
fill='black',
font=font)
text_bottom -= text_height - 2 * margin
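# Example usage (a hedged sketch on a blank PIL image; the coordinates and
# label are made up):
#   img = Image.new('RGB', (200, 100))
#   draw_bounding_box_on_image(img, ymin=0.1, xmin=0.1, ymax=0.9, xmax=0.5,
#                              color='LimeGreen', thickness=2,
#                              display_str_list=['person: 87%'])
# With use_normalized_coordinates=True (the default) the box spans 10%-90% of
# the image height and 10%-50% of the image width.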
def draw_bounding_boxes_on_image_array(image,
boxes,
color='red',
thickness=4,
display_str_list_list=()):
"""Draws bounding boxes on image (numpy array).
Args:
image: a numpy array object.
boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
The coordinates are in normalized format between [0, 1].
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list_list: list of list of strings.
a list of strings for each bounding box.
The reason to pass a list of strings for a
bounding box is that it might contain
multiple labels.
Raises:
ValueError: if boxes is not a [N, 4] array
"""
image_pil = Image.fromarray(image)
draw_bounding_boxes_on_image(image_pil, boxes, color, thickness,
display_str_list_list)
np.copyto(image, np.array(image_pil))
def draw_bounding_boxes_on_image(image,
boxes,
color='red',
thickness=4,
display_str_list_list=()):
"""Draws bounding boxes on image.
Args:
image: a PIL.Image object.
boxes: a 2 dimensional numpy array of [N, 4]: (ymin, xmin, ymax, xmax).
The coordinates are in normalized format between [0, 1].
color: color to draw bounding box. Default is red.
thickness: line thickness. Default value is 4.
display_str_list_list: list of list of strings.
a list of strings for each bounding box.
The reason to pass a list of strings for a
bounding box is that it might contain
multiple labels.
Raises:
ValueError: if boxes is not a [N, 4] array
"""
boxes_shape = boxes.shape
if not boxes_shape:
return
if len(boxes_shape) != 2 or boxes_shape[1] != 4:
raise ValueError('Input must be of size [N, 4]')
for i in range(boxes_shape[0]):
display_str_list = ()
if display_str_list_list:
display_str_list = display_str_list_list[i]
draw_bounding_box_on_image(image, boxes[i, 0], boxes[i, 1], boxes[i, 2],
boxes[i, 3], color, thickness, display_str_list)
def _visualize_boxes(image, boxes, classes, scores, category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image, boxes, classes, scores, category_index=category_index, **kwargs)
def _visualize_boxes_and_masks(image, boxes, classes, scores, masks,
category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
instance_masks=masks,
**kwargs)
def _visualize_boxes_and_keypoints(image, boxes, classes, scores, keypoints,
category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
keypoints=keypoints,
**kwargs)
def _visualize_boxes_and_masks_and_keypoints(
image, boxes, classes, scores, masks, keypoints, category_index, **kwargs):
return visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index=category_index,
instance_masks=masks,
keypoints=keypoints,
**kwargs)
def draw_bounding_boxes_on_image_tensors(images,
boxes,
classes,
scores,
category_index,
instance_masks=None,
keypoints=None,
max_boxes_to_draw=20,
min_score_thresh=0.2,
use_normalized_coordinates=True):
"""Draws bounding boxes, masks, and keypoints on batch of image tensors.
Args:
images: A 4D uint8 image tensor of shape [N, H, W, C]. If C > 3, additional
channels will be ignored.
boxes: [N, max_detections, 4] float32 tensor of detection boxes.
classes: [N, max_detections] int tensor of detection classes. Note that
classes are 1-indexed.
scores: [N, max_detections] float32 tensor of detection scores.
category_index: a dict that maps integer ids to category dicts. e.g.
{1: {1: 'dog'}, 2: {2: 'cat'}, ...}
instance_masks: A 4D uint8 tensor of shape [N, max_detection, H, W] with
instance masks.
keypoints: A 4D float32 tensor of shape [N, max_detection, num_keypoints, 2]
with keypoints.
max_boxes_to_draw: Maximum number of boxes to draw on an image. Default 20.
min_score_thresh: Minimum score threshold for visualization. Default 0.2.
use_normalized_coordinates: Whether to assume boxes and keypoints are in
normalized coordinates (as opposed to absolute coordinates).
Default is True.
Returns:
4D image tensor of type uint8, with boxes drawn on top.
"""
# Additional channels are being ignored.
images = images[:, :, :, 0:3]
visualization_keyword_args = {
'use_normalized_coordinates': use_normalized_coordinates,
'max_boxes_to_draw': max_boxes_to_draw,
'min_score_thresh': min_score_thresh,
'agnostic_mode': False,
'line_thickness': 4
}
if instance_masks is not None and keypoints is None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores, instance_masks]
elif instance_masks is None and keypoints is not None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_keypoints,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores, keypoints]
elif instance_masks is not None and keypoints is not None:
visualize_boxes_fn = functools.partial(
_visualize_boxes_and_masks_and_keypoints,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores, instance_masks, keypoints]
else:
visualize_boxes_fn = functools.partial(
_visualize_boxes,
category_index=category_index,
**visualization_keyword_args)
elems = [images, boxes, classes, scores]
def draw_boxes(image_and_detections):
"""Draws boxes on image."""
image_with_boxes = tf.py_func(visualize_boxes_fn, image_and_detections,
tf.uint8)
return image_with_boxes
images = tf.map_fn(draw_boxes, elems, dtype=tf.uint8, back_prop=False)
return images
def draw_side_by_side_evaluation_image(eval_dict,
category_index,
max_boxes_to_draw=20,
min_score_thresh=0.2,
use_normalized_coordinates=True):
"""Creates a side-by-side image with detections and groundtruth.
Bounding boxes (and instance masks, if available) are visualized on both
subimages.
Args:
eval_dict: The evaluation dictionary returned by
eval_util.result_dict_for_single_example().
category_index: A category index (dictionary) produced from a labelmap.
max_boxes_to_draw: The maximum number of boxes to draw for detections.
min_score_thresh: The minimum score threshold for showing detections.
use_normalized_coordinates: Whether to assume boxes and keypoints are in
normalized coordinates (as opposed to absolute coordinates).
Default is True.
Returns:
A [1, H, 2 * W, C] uint8 tensor. The subimage on the left corresponds to
detections, while the subimage on the right corresponds to groundtruth.
"""
detection_fields = fields.DetectionResultFields()
input_data_fields = fields.InputDataFields()
instance_masks = None
if detection_fields.detection_masks in eval_dict:
instance_masks = tf.cast(
tf.expand_dims(eval_dict[detection_fields.detection_masks], axis=0),
tf.uint8)
keypoints = None
if detection_fields.detection_keypoints in eval_dict:
keypoints = tf.expand_dims(
eval_dict[detection_fields.detection_keypoints], axis=0)
groundtruth_instance_masks = None
if input_data_fields.groundtruth_instance_masks in eval_dict:
groundtruth_instance_masks = tf.cast(
tf.expand_dims(
eval_dict[input_data_fields.groundtruth_instance_masks], axis=0),
tf.uint8)
images_with_detections = draw_bounding_boxes_on_image_tensors(
eval_dict[input_data_fields.original_image],
tf.expand_dims(eval_dict[detection_fields.detection_boxes], axis=0),
tf.expand_dims(eval_dict[detection_fields.detection_classes], axis=0),
tf.expand_dims(eval_dict[detection_fields.detection_scores], axis=0),
category_index,
instance_masks=instance_masks,
keypoints=keypoints,
max_boxes_to_draw=max_boxes_to_draw,
min_score_thresh=min_score_thresh,
use_normalized_coordinates=use_normalized_coordinates)
images_with_groundtruth = draw_bounding_boxes_on_image_tensors(
eval_dict[input_data_fields.original_image],
tf.expand_dims(eval_dict[input_data_fields.groundtruth_boxes], axis=0),
tf.expand_dims(eval_dict[input_data_fields.groundtruth_classes], axis=0),
tf.expand_dims(
tf.ones_like(
eval_dict[input_data_fields.groundtruth_classes],
dtype=tf.float32),
axis=0),
category_index,
instance_masks=groundtruth_instance_masks,
keypoints=None,
max_boxes_to_draw=None,
min_score_thresh=0.0,
use_normalized_coordinates=use_normalized_coordinates)
return tf.concat([images_with_detections, images_with_groundtruth], axis=2)
def draw_keypoints_on_image_array(image,
keypoints,
color='red',
radius=2,
use_normalized_coordinates=True):
"""Draws keypoints on an image (numpy array).
Args:
image: a numpy array with shape [height, width, 3].
keypoints: a numpy array with shape [num_keypoints, 2].
color: color to draw the keypoints with. Default is red.
radius: keypoint radius. Default value is 2.
use_normalized_coordinates: if True (default), treat keypoint values as
relative to the image. Otherwise treat them as absolute.
"""
image_pil = Image.fromarray(np.uint8(image)).convert('RGB')
draw_keypoints_on_image(image_pil, keypoints, color, radius,
use_normalized_coordinates)
np.copyto(image, np.array(image_pil))
def draw_keypoints_on_image(image,
keypoints,
color='red',
radius=2,
use_normalized_coordinates=True):
"""Draws keypoints on an image.
Args:
image: a PIL.Image object.
keypoints: a numpy array with shape [num_keypoints, 2].
color: color to draw the keypoints with. Default is red.
radius: keypoint radius. Default value is 2.
use_normalized_coordinates: if True (default), treat keypoint values as
relative to the image. Otherwise treat them as absolute.
"""
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
keypoints_x = [k[1] for k in keypoints]
keypoints_y = [k[0] for k in keypoints]
if use_normalized_coordinates:
keypoints_x = tuple([im_width * x for x in keypoints_x])
keypoints_y = tuple([im_height * y for y in keypoints_y])
for keypoint_x, keypoint_y in zip(keypoints_x, keypoints_y):
draw.ellipse([(keypoint_x - radius, keypoint_y - radius),
(keypoint_x + radius, keypoint_y + radius)],
outline=color, fill=color)
def draw_mask_on_image_array(image, mask, color='red', alpha=0.4):
"""Draws mask on an image.
Args:
image: uint8 numpy array with shape (img_height, img_width, 3)
mask: a uint8 numpy array of shape (img_height, img_width) with
values of either 0 or 1.
color: color to draw the mask with. Default is red.
alpha: transparency value between 0 and 1. (default: 0.4)
Raises:
ValueError: On incorrect data type for image or masks.
"""
if image.dtype != np.uint8:
raise ValueError('`image` not of type np.uint8')
if mask.dtype != np.uint8:
raise ValueError('`mask` not of type np.uint8')
if np.any(np.logical_and(mask != 1, mask != 0)):
raise ValueError('`mask` elements should be in [0, 1]')
if image.shape[:2] != mask.shape:
raise ValueError('The image has spatial dimensions %s but the mask has '
'dimensions %s' % (image.shape[:2], mask.shape))
rgb = ImageColor.getrgb(color)
pil_image = Image.fromarray(image)
solid_color = np.expand_dims(
np.ones_like(mask), axis=2) * np.reshape(list(rgb), [1, 1, 3])
pil_solid_color = Image.fromarray(np.uint8(solid_color)).convert('RGBA')
pil_mask = Image.fromarray(np.uint8(255.0*alpha*mask)).convert('L')
pil_image = Image.composite(pil_solid_color, pil_image, pil_mask)
np.copyto(image, np.array(pil_image.convert('RGB')))
def visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes,
scores,
category_index,
instance_masks=None,
instance_boundaries=None,
keypoints=None,
use_normalized_coordinates=False,
max_boxes_to_draw=20,
min_score_thresh=.5,
agnostic_mode=False,
line_thickness=4,
groundtruth_box_visualization_color='black',
skip_scores=False,
skip_labels=False):
"""Overlay labeled boxes on an image with formatted scores and label names.
This function groups boxes that correspond to the same location
and creates a display string for each detection and overlays these
on the image. Note that this function modifies the image in place, and returns
that same image.
Args:
image: uint8 numpy array with shape (img_height, img_width, 3)
boxes: a numpy array of shape [N, 4]
classes: a numpy array of shape [N]. Note that class indices are 1-based,
and match the keys in the label map.
scores: a numpy array of shape [N] or None. If scores=None, then
this function assumes that the boxes to be plotted are groundtruth
boxes and plot all boxes as black with no classes or scores.
category_index: a dict containing category dictionaries (each holding
category index `id` and category name `name`) keyed by category indices.
instance_masks: a numpy array of shape [N, image_height, image_width] with
values ranging between 0 and 1, can be None.
instance_boundaries: a numpy array of shape [N, image_height, image_width]
with values ranging between 0 and 1, can be None.
keypoints: a numpy array of shape [N, num_keypoints, 2], can
be None
use_normalized_coordinates: whether boxes is to be interpreted as
normalized coordinates or not.
max_boxes_to_draw: maximum number of boxes to visualize. If None, draw
all boxes.
min_score_thresh: minimum score threshold for a box to be visualized
agnostic_mode: boolean (default: False) controlling whether to evaluate in
class-agnostic mode or not. This mode will display scores but ignore
classes.
line_thickness: integer (default: 4) controlling line width of the boxes.
groundtruth_box_visualization_color: box color for visualizing groundtruth
boxes
skip_scores: whether to skip score when drawing a single detection
skip_labels: whether to skip label when drawing a single detection
Returns:
uint8 numpy array with shape (img_height, img_width, 3) with overlaid boxes.
"""
# Create a display string (and color) for every box location, group any boxes
# that correspond to the same location.
box_to_display_str_map = collections.defaultdict(list)
box_to_color_map = collections.defaultdict(str)
box_to_instance_masks_map = {}
box_to_instance_boundaries_map = {}
box_to_keypoints_map = collections.defaultdict(list)
if not max_boxes_to_draw:
max_boxes_to_draw = boxes.shape[0]
for i in range(min(max_boxes_to_draw, boxes.shape[0])):
if scores is None or scores[i] > min_score_thresh:
box = tuple(boxes[i].tolist())
if instance_masks is not None:
box_to_instance_masks_map[box] = instance_masks[i]
if instance_boundaries is not None:
box_to_instance_boundaries_map[box] = instance_boundaries[i]
if keypoints is not None:
box_to_keypoints_map[box].extend(keypoints[i])
if scores is None:
box_to_color_map[box] = groundtruth_box_visualization_color
else:
display_str = ''
if not skip_labels:
if not agnostic_mode:
if classes[i] in category_index.keys():
class_name = category_index[classes[i]]['name']
else:
class_name = 'N/A'
display_str = str(class_name)
if not skip_scores:
if not display_str:
display_str = '{}%'.format(int(100*scores[i]))
else:
display_str = '{}: {}%'.format(display_str, int(100*scores[i]))
box_to_display_str_map[box].append(display_str)
if agnostic_mode:
box_to_color_map[box] = 'DarkOrange'
else:
box_to_color_map[box] = STANDARD_COLORS[
classes[i] % len(STANDARD_COLORS)]
# Draw all boxes onto image.
for box, color in box_to_color_map.items():
ymin, xmin, ymax, xmax = box
if instance_masks is not None:
draw_mask_on_image_array(
image,
box_to_instance_masks_map[box],
color=color
)
if instance_boundaries is not None:
draw_mask_on_image_array(
image,
box_to_instance_boundaries_map[box],
color='red',
alpha=1.0
)
draw_bounding_box_on_image_array(
image,
ymin,
xmin,
ymax,
xmax,
color=color,
thickness=line_thickness,
display_str_list=box_to_display_str_map[box],
use_normalized_coordinates=use_normalized_coordinates)
if keypoints is not None:
draw_keypoints_on_image_array(
image,
box_to_keypoints_map[box],
color=color,
radius=line_thickness / 2,
use_normalized_coordinates=use_normalized_coordinates)
return image
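# Example usage (a sketch with synthetic detections; the category_index and
# scores below are hypothetical):
#   image = np.zeros((100, 200, 3), dtype=np.uint8)
#   boxes = np.array([[0.1, 0.1, 0.9, 0.5]])
#   classes = np.array([1])
#   scores = np.array([0.9])
#   category_index = {1: {'id': 1, 'name': 'person'}}
#   visualize_boxes_and_labels_on_image_array(
#       image, boxes, classes, scores, category_index,
#       use_normalized_coordinates=True)
# The call draws one labeled box on `image` in place and returns the same array.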
def add_cdf_image_summary(values, name):
"""Adds a tf.summary.image for a CDF plot of the values.
Normalizes `values` such that they sum to 1, plots the cumulative distribution
function and creates a tf image summary.
Args:
values: a 1-D float32 tensor containing the values.
name: name for the image summary.
"""
def cdf_plot(values):
"""Numpy function to plot CDF."""
normalized_values = values / np.sum(values)
sorted_values = np.sort(normalized_values)
cumulative_values = np.cumsum(sorted_values)
fraction_of_examples = (np.arange(cumulative_values.size, dtype=np.float32)
/ cumulative_values.size)
fig = plt.figure(frameon=False)
ax = fig.add_subplot('111')
ax.plot(fraction_of_examples, cumulative_values)
ax.set_ylabel('cumulative normalized values')
ax.set_xlabel('fraction of examples')
fig.canvas.draw()
width, height = fig.get_size_inches() * fig.get_dpi()
image = np.fromstring(fig.canvas.tostring_rgb(), dtype='uint8').reshape(
1, int(height), int(width), 3)
return image
cdf_plot = tf.py_func(cdf_plot, [values], tf.uint8)
tf.summary.image(name, cdf_plot)
def add_hist_image_summary(values, bins, name):
"""Adds a tf.summary.image for a histogram plot of the values.
Plots the histogram of values and creates a tf image summary.
Args:
values: a 1-D float32 tensor containing the values.
bins: bin edges which will be directly passed to np.histogram.
name: name for the image summary.
"""
def hist_plot(values, bins):
"""Numpy function to plot hist."""
fig = plt.figure(frameon=False)
ax = fig.add_subplot('111')
y, x = np.histogram(values, bins=bins)
ax.plot(x[:-1], y)
ax.set_ylabel('count')
ax.set_xlabel('value')
fig.canvas.draw()
width, height = fig.get_size_inches() * fig.get_dpi()
image = np.fromstring(
fig.canvas.tostring_rgb(), dtype='uint8').reshape(
1, int(height), int(width), 3)
return image
hist_plot = tf.py_func(hist_plot, [values, bins], tf.uint8)
tf.summary.image(name, hist_plot)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Initializes at random location and visualizes the optimal path.
Different modes of execution:
1) benchmark: It generates benchmark_iter sample trajectories to random goals
and plots the histogram of path lengths. It can also be used to see how fast
the environment runs.
2) vis: It visualizes the generated paths by image, semantic segmentation, and
so on.
3) human: allows the user to navigate through the environment using keyboard input.
python viz_active_vision_dataset_main -- \
--mode=benchmark --benchmark_iter=1000 --gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main -- \
--mode=vis \
--gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main -- \
--mode=human \
--gin_config=envs/configs/active_vision_config.gin
python viz_active_vision_dataset_main.py --mode=eval --eval_folder=/usr/local/google/home/$USER/checkin_log_det/evals/ --output_folder=/usr/local/google/home/$USER/test_imgs/ --gin_config=envs/configs/active_vision_config.gin
"""
import matplotlib
# pylint: disable=g-import-not-at-top
# Need Tk for interactive plots.
matplotlib.use('TkAgg')
import tensorflow as tf
from matplotlib import pyplot as plt
import numpy as np
import os
from pyglib import app
from pyglib import flags
import gin
import cv2
from envs import active_vision_dataset_env
from envs import task_env
VIS_MODE = 'vis'
HUMAN_MODE = 'human'
BENCHMARK_MODE = 'benchmark'
GRAPH_MODE = 'graph'
EVAL_MODE = 'eval'
flags.DEFINE_enum('mode', VIS_MODE,
[VIS_MODE, HUMAN_MODE, BENCHMARK_MODE, GRAPH_MODE, EVAL_MODE],
'mode of the execution')
flags.DEFINE_integer('benchmark_iter', 1000,
'number of iterations for benchmarking')
flags.DEFINE_string('eval_folder', '', 'the path to the eval folder')
flags.DEFINE_string('output_folder', '',
'the path to which the images and gifs are written')
flags.DEFINE_multi_string('gin_config', [],
'List of paths to gin config files for the env.')
flags.DEFINE_multi_string('gin_params', [],
'Newline separated list of Gin parameter bindings.')
mt = task_env.ModalityTypes
FLAGS = flags.FLAGS
def benchmark(env, targets):
"""Benchmarks the speed of sequence generation by env.
Args:
env: environment.
targets: list of target classes.
"""
episode_lengths = {}
all_init_configs = {}
all_actions = dict([(a, 0.) for a in env.actions])
for i in range(FLAGS.benchmark_iter):
path, actions, _, _ = env.random_step_sequence()
selected_actions = np.argmax(actions, axis=-1)
new_actions = dict([(a, 0.) for a in env.actions])
for a in selected_actions:
new_actions[env.actions[a]] += 1. / selected_actions.shape[0]
for a in new_actions:
all_actions[a] += new_actions[a] / FLAGS.benchmark_iter
start_image_id, world, goal = env.get_init_config(path)
print world
if world not in all_init_configs:
all_init_configs[world] = set()
all_init_configs[world].add((start_image_id, goal, len(actions)))
if env.goal_index not in episode_lengths:
episode_lengths[env.goal_index] = []
episode_lengths[env.goal_index].append(len(actions))
for i, cls in enumerate(episode_lengths):
plt.subplot(231 + i)
plt.hist(episode_lengths[cls])
plt.title(targets[cls])
plt.show()
def human(env, targets):
"""Lets user play around the env manually."""
string_key_map = {
'a': 'left',
'd': 'right',
'w': 'forward',
's': 'backward',
'j': 'rotate_ccw',
'l': 'rotate_cw',
'n': 'stop'
}
integer_key_map = {
'a': env.actions.index('left'),
'd': env.actions.index('right'),
'w': env.actions.index('forward'),
's': env.actions.index('backward'),
'j': env.actions.index('rotate_ccw'),
'l': env.actions.index('rotate_cw'),
'n': env.actions.index('stop')
}
for k in integer_key_map:
integer_key_map[k] = np.int32(integer_key_map[k])
plt.ion()
for _ in range(20):
obs = env.reset()
steps = -1
action = None
while True:
print 'distance = ', obs[task_env.ModalityTypes.DISTANCE]
steps += 1
depth_value = obs[task_env.ModalityTypes.DEPTH][:, :, 0]
depth_mask = obs[task_env.ModalityTypes.DEPTH][:, :, 1]
seg_mask = np.squeeze(obs[task_env.ModalityTypes.SEMANTIC_SEGMENTATION])
det_mask = np.argmax(
obs[task_env.ModalityTypes.OBJECT_DETECTION], axis=-1)
img = obs[task_env.ModalityTypes.IMAGE]
plt.subplot(231)
plt.title('steps = {}'.format(steps))
plt.imshow(img.astype(np.uint8))
plt.subplot(232)
plt.imshow(depth_value)
plt.title('depth value')
plt.subplot(233)
plt.imshow(depth_mask)
plt.title('depth mask')
plt.subplot(234)
plt.imshow(seg_mask)
plt.title('seg')
plt.subplot(235)
plt.imshow(det_mask)
plt.title('det')
plt.subplot(236)
plt.title('goal={}'.format(targets[env.goal_index]))
plt.draw()
while True:
s = raw_input('key = ')
if np.random.rand() > 0.5:
key_map = string_key_map
else:
key_map = integer_key_map
if s in key_map:
action = key_map[s]
break
else:
print 'invalid action'
print 'action = {}'.format(action)
if action == 'stop':
print 'dist to goal: {}'.format(len(env.path_to_goal()) - 2)
break
obs, reward, done, info = env.step(action)
print 'reward = {}, done = {}, success = {}'.format(
reward, done, info['success'])
def visualize_random_step_sequence(env):
"""Visualizes random sequence of steps."""
plt.ion()
for _ in range(20):
path, actions, _, step_outputs = env.random_step_sequence(max_len=30)
print 'path = {}'.format(path)
for action, step_output in zip(actions, step_outputs):
obs, _, done, _ = step_output
depth_value = obs[task_env.ModalityTypes.DEPTH][:, :, 0]
depth_mask = obs[task_env.ModalityTypes.DEPTH][:, :, 1]
seg_mask = np.squeeze(obs[task_env.ModalityTypes.SEMANTIC_SEGMENTATION])
det_mask = np.argmax(
obs[task_env.ModalityTypes.OBJECT_DETECTION], axis=-1)
img = obs[task_env.ModalityTypes.IMAGE]
plt.subplot(231)
plt.imshow(img.astype(np.uint8))
plt.subplot(232)
plt.imshow(depth_value)
plt.title('depth value')
plt.subplot(233)
plt.imshow(depth_mask)
plt.title('depth mask')
plt.subplot(234)
plt.imshow(seg_mask)
plt.title('seg')
plt.subplot(235)
plt.imshow(det_mask)
plt.title('det')
plt.subplot(236)
print 'action = {}'.format(action)
print 'done = {}'.format(done)
plt.draw()
if raw_input('press \'n\' to go to the next random sequence. Otherwise, '
'press any key to continue...') == 'n':
break
def visualize(env, input_folder, output_root_folder):
"""visualizes images for sequence of steps from the evals folder."""
def which_env(file_name):
img_name = file_name.split('_')[0][2:5]
env_dict = {'161': 'Home_016_1', '131': 'Home_013_1', '111': 'Home_011_1'}
if img_name in env_dict:
return env_dict[img_name]
else:
raise ValueError('could not resolve env: {} {}'.format(
img_name, file_name))
def which_goal(file_name):
return file_name[file_name.find('_')+1:]
output_images_folder = os.path.join(output_root_folder, 'images')
output_gifs_folder = os.path.join(output_root_folder, 'gifs')
if not tf.gfile.IsDirectory(output_images_folder):
tf.gfile.MakeDirs(output_images_folder)
if not tf.gfile.IsDirectory(output_gifs_folder):
tf.gfile.MakeDirs(output_gifs_folder)
npy_files = [
os.path.join(input_folder, name)
for name in tf.gfile.ListDirectory(input_folder)
if name.find('npy') >= 0
]
for i, npy_file in enumerate(npy_files):
print 'saving images {}/{}'.format(i, len(npy_files))
pure_name = npy_file[npy_file.rfind('/') + 1:-4]
output_folder = os.path.join(output_images_folder, pure_name)
if not tf.gfile.IsDirectory(output_folder):
tf.gfile.MakeDirs(output_folder)
print '*******'
print pure_name[0:pure_name.find('_')]
env.reset_for_eval(which_env(pure_name),
which_goal(pure_name),
pure_name[0:pure_name.find('_')],
)
with tf.gfile.Open(npy_file) as h:
states = np.load(h).item()['states']
images = [
env.observation(state)[mt.IMAGE] for state in states
]
for j, img in enumerate(images):
cv2.imwrite(os.path.join(output_folder, '{0:03d}'.format(j) + '.jpg'),
img[:, :, ::-1])
print 'converting to gif'
os.system(
'convert -set delay 20 -colors 256 -dispose 1 {}/*.jpg {}.gif'.format(
output_folder,
os.path.join(output_gifs_folder, pure_name + '.gif')
)
)
def evaluate_folder(env, folder_path):
"""Evaluates the performance from the evals folder."""
targets = ['fridge', 'dining_table', 'microwave', 'tv', 'couch']
def compute_acc(npy_file):
with tf.gfile.Open(npy_file) as h:
data = np.load(h).item()
if npy_file.find('dining_table') >= 0:
category = 'dining_table'
else:
category = npy_file[npy_file.rfind('_') + 1:-4]
return category, data['distance'][-1] - 2
def evaluate_iteration(folder):
"""Evaluates the data from the folder of certain eval iteration."""
print folder
npy_files = [
os.path.join(folder, name)
for name in tf.gfile.ListDirectory(folder)
if name.find('npy') >= 0
]
eval_stats = {c: [] for c in targets}
for npy_file in npy_files:
try:
category, dist = compute_acc(npy_file)
except: # pylint: disable=bare-except
continue
eval_stats[category].append(float(dist <= 5))
for c in eval_stats:
if not eval_stats[c]:
print 'incomplete eval {}: empty class {}'.format(folder_path, c)
return None
eval_stats[c] = np.mean(eval_stats[c])
eval_stats['mean'] = np.mean(eval_stats.values())
return eval_stats
checkpoint_folders = [
folder_path + x
for x in tf.gfile.ListDirectory(folder_path)
if tf.gfile.IsDirectory(folder_path + x)
]
print '{} folders found'.format(len(checkpoint_folders))
print '------------------------'
all_iters = []
all_accs = []
for i, folder in enumerate(checkpoint_folders):
print 'processing {}/{}'.format(i, len(checkpoint_folders))
eval_stats = evaluate_iteration(folder)
if eval_stats is None:
continue
else:
iter_no = int(folder[folder.rfind('/') + 1:])
print 'result ', iter_no, eval_stats['mean']
all_accs.append(eval_stats['mean'])
all_iters.append(iter_no)
all_accs = np.asarray(all_accs)
all_iters = np.asarray(all_iters)
idx = np.argmax(all_accs)
print 'best result at iteration {} was {}'.format(all_iters[idx],
all_accs[idx])
order = np.argsort(all_iters)
all_iters = all_iters[order]
all_accs = all_accs[order]
#plt.plot(all_iters, all_accs)
#plt.show()
#print 'done plotting'
best_iteration_folder = os.path.join(folder_path, str(all_iters[idx]))
print 'generating gifs and images for {}'.format(best_iteration_folder)
visualize(env, best_iteration_folder, FLAGS.output_folder)
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_config, FLAGS.gin_params)
print('********')
print(FLAGS.mode)
print(FLAGS.gin_config)
print(FLAGS.gin_params)
env = active_vision_dataset_env.ActiveVisionDatasetEnv(modality_types=[
task_env.ModalityTypes.IMAGE,
task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
task_env.ModalityTypes.OBJECT_DETECTION, task_env.ModalityTypes.DEPTH,
task_env.ModalityTypes.DISTANCE
])
if FLAGS.mode == BENCHMARK_MODE:
benchmark(env, env.possible_targets)
elif FLAGS.mode == GRAPH_MODE:
for loc in env.worlds:
env.check_scene_graph(loc, 'fridge')
elif FLAGS.mode == HUMAN_MODE:
human(env, env.possible_targets)
elif FLAGS.mode == VIS_MODE:
visualize_random_step_sequence(env)
elif FLAGS.mode == EVAL_MODE:
evaluate_folder(env, FLAGS.eval_folder)
if __name__ == '__main__':
app.run(main)
# Cross-View Training
This repository contains code for *Semi-Supervised Sequence Modeling with Cross-View Training*. Currently sequence tagging and dependency parsing tasks are supported.
## Requirements
* [Tensorflow](https://www.tensorflow.org/)
* [Numpy](http://www.numpy.org/)
This code has been run with TensorFlow 1.10.1 and Numpy 1.14.5; other versions may work, but have not been tested.
## Fetching and Preprocessing Data
Run `fetch_data.sh` to download and extract pretrained [GloVe](https://nlp.stanford.edu/projects/glove/) vectors, the [1 Billion Word Language Model Benchmark](http://www.statmt.org/lm-benchmark/) corpus of unlabeled data, and the CoNLL-2000 [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/) dataset. Unfortunately the other datasets from our paper are not freely available and so can't be included in this repository.
To apply CVT to other datasets, the data should be placed in `data/raw_data/<task_name>/(train|dev|test).txt`. For sequence tagging data, each line should contain a word followed by a space followed by that word's tag. Sentences should be separated by empty lines. For dependency parsing, each tag should be of the form ``<index_of_head>-<relation>`` (e.g., `0-root`).
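For illustration only, here is a hypothetical sequence tagging file in this format (the path `data/raw_data/chunking/train.txt` and the chunk tags below are assumptions, not data shipped with the repository); each line is a word, a single space, and the word's tag, with sentences separated by blank lines:
```
The B-NP
cat I-NP
sat B-VP
. O

Dogs B-NP
bark B-VP
. O
```
For dependency parsing, the tag column would instead contain head-relation entries such as `3-nsubj`, following the ``<index_of_head>-<relation>`` form described above.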
After all of the data has been downloaded, run `preprocessing.py`.
## Training a Model
Run `python cvt.py --mode=train --model_name=chunking_model`. By default this trains a model on the chunking data downloaded with `fetch_data.sh`. To change which task(s) are trained on or model hyperparameters, modify [base/configure.py](base/configure.py). Models are automatically checkpointed every 1000 steps; training will continue from the latest checkpoint if training is interrupted and restarted. Model checkpoints and other data such as dev set accuracy over time are stored in `data/models/<model_name>`.
## Evaluating a Model
Run `python cvt.py --mode=eval --model_name=chunking_model`. A CVT model trained on the chunking data for 200k steps should get at least 97.1 F1 on the dev set and 96.6 F1 on the test set.
## Citation
If you use this code for your publication, please cite the original paper:
```
@inproceedings{clark2018semi,
title = {Semi-Supervised Sequence Modeling with Cross-View Training},
author = {Kevin Clark and Minh-Thang Luong and Christopher D. Manning and Quoc V. Le},
booktitle = {ACL},
year = {2018}
}
```
## Contact
* [Kevin Clark](https://cs.stanford.edu/~kevclark/) (@clarkkev).
* [Thang Luong](https://nlp.stanford.edu/~lmthang/) (@lmthang).