"torchvision/transforms/_functional_tensor.py" did not exist on "1a04d3c265679e1a508e7cd627006aaa9ef1ccfb"
Commit 27b4acd4 authored by Aman Gupta's avatar Aman Gupta
Browse files

Merge remote-tracking branch 'upstream/master'

parents 5133522f d4e1f97f
...@@ -64,7 +64,7 @@ flags.DEFINE_integer( ...@@ -64,7 +64,7 @@ flags.DEFINE_integer(
flags.DEFINE_boolean( flags.DEFINE_boolean(
'train_vggish', True, 'train_vggish', True,
'If Frue, allow VGGish parameters to change during training, thus ' 'If True, allow VGGish parameters to change during training, thus '
'fine-tuning VGGish. If False, VGGish parameters are fixed, thus using ' 'fine-tuning VGGish. If False, VGGish parameters are fixed, thus using '
'VGGish as a fixed feature extractor.') 'VGGish as a fixed feature extractor.')
......
<font size=4><b>Train Wide-ResNet, Shake-Shake and ShakeDrop models on the CIFAR-10
and CIFAR-100 datasets with AutoAugment.</b></font>
The CIFAR-10/CIFAR-100 data can be downloaded from:
https://www.cs.toronto.edu/~kriz/cifar.html.
The code replicates the results from Tables 1 and 2 of the AutoAugment paper on CIFAR-10/100 with the
following models: Wide-ResNet-28-10, Shake-Shake (26 2x32d), Shake-Shake (26
2x96d) and PyramidNet+ShakeDrop.
<b>Related papers:</b>
AutoAugment: Learning Augmentation Policies from Data
https://arxiv.org/abs/1805.09501
Wide Residual Networks
https://arxiv.org/abs/1605.07146
Shake-Shake regularization
https://arxiv.org/abs/1705.07485
ShakeDrop regularization
https://arxiv.org/abs/1802.02375
<b>Settings:</b>
CIFAR-10 Model | Learning Rate | Weight Decay | Num. Epochs | Batch Size
---------------------- | ------------- | ------------ | ----------- | ----------
Wide-ResNet-28-10 | 0.1 | 5e-4 | 200 | 128
Shake-Shake (26 2x32d) | 0.01 | 1e-3 | 1800 | 128
Shake-Shake (26 2x96d) | 0.01 | 1e-3 | 1800 | 128
PyramidNet + ShakeDrop | 0.05 | 5e-5 | 1800 | 64
<b>Prerequisite:</b>
1. Install TensorFlow.
2. Download CIFAR-10/CIFAR-100 dataset.
```shell
curl -o cifar-10-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz
```
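3. Extract the downloaded archives into the directory that will be passed to the
data_path flag. A minimal sketch (the /tmp/data target is an assumption; each
archive expands into its own subdirectory, so point data_path at the directory
that contains the extracted batch files):
```shell
mkdir -p /tmp/data
tar -xzf cifar-10-binary.tar.gz -C /tmp/data
tar -xzf cifar-100-binary.tar.gz -C /tmp/data
```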
<b>How to run:</b>
```shell
# cd to your workspace.
# Specify the directory where the dataset is located using the data_path flag.
# Note: you can split samples from the training set into the eval set by changing train_size and validation_size.
# For example, to train the Wide-ResNet-28-10 model on a GPU.
python train_cifar.py --model_name=wrn \
--checkpoint_dir=/tmp/training \
--data_path=/tmp/data \
--dataset='cifar10' \
--use_cpu=0
```
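To train on CIFAR-100 instead, the same entry point is used and only the dataset
flag changes (a sketch, assuming the CIFAR-100 files live under the same
data_path):
```shell
python train_cifar.py --model_name=wrn \
                      --checkpoint_dir=/tmp/training \
                      --data_path=/tmp/data \
                      --dataset='cifar100' \
                      --use_cpu=0
```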
## Contact for Issues
* Barret Zoph, @barretzoph <barretzoph@google.com>
* Ekin Dogus Cubuk, <cubuk@google.com>
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transforms used in the Augmentation Policies."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import numpy as np
# pylint:disable=g-multiple-import
from PIL import ImageOps, ImageEnhance, ImageFilter, Image
# pylint:enable=g-multiple-import
IMAGE_SIZE = 32
# Per-channel mean and std of the images in the CIFAR training set.
MEANS = [0.49139968, 0.48215841, 0.44653091]
STDS = [0.24703223, 0.24348513, 0.26158784]
PARAMETER_MAX = 10  # Maximum magnitude ('level') that a transform can take.
def random_flip(x):
"""Flip the input x horizontally with 50% probability."""
if np.random.rand(1)[0] > 0.5:
return np.fliplr(x)
return x
def zero_pad_and_crop(img, amount=4):
"""Zero pad by `amount` zero pixels on each side then take a random crop.
Args:
img: numpy image that will be zero padded and cropped.
amount: amount of zeros to pad `img` with horizontally and vertically.
Returns:
The cropped zero padded img. The returned numpy array will be of the same
shape as `img`.
"""
padded_img = np.zeros((img.shape[0] + amount * 2, img.shape[1] + amount * 2,
img.shape[2]))
padded_img[amount:img.shape[0] + amount, amount:
img.shape[1] + amount, :] = img
top = np.random.randint(low=0, high=2 * amount)
left = np.random.randint(low=0, high=2 * amount)
new_img = padded_img[top:top + img.shape[0], left:left + img.shape[1], :]
return new_img
def create_cutout_mask(img_height, img_width, num_channels, size):
"""Creates a zero mask used for cutout of shape `img_height` x `img_width`.
Args:
img_height: Height of image cutout mask will be applied to.
img_width: Width of image cutout mask will be applied to.
num_channels: Number of channels in the image.
size: Size of the zeros mask.
Returns:
A mask of shape `img_height` x `img_width` with all ones except for a
square of zeros of shape `size` x `size`. This mask is meant to be
elementwise multiplied with the original image. Additionally returns
the `upper_coord` and `lower_coord` which specify where the cutout mask
will be applied.
"""
assert img_height == img_width
# Sample center where cutout mask will be applied
height_loc = np.random.randint(low=0, high=img_height)
width_loc = np.random.randint(low=0, high=img_width)
# Determine upper left and lower right corners of the patch
upper_coord = (max(0, height_loc - size // 2), max(0, width_loc - size // 2))
lower_coord = (min(img_height, height_loc + size // 2),
min(img_width, width_loc + size // 2))
mask_height = lower_coord[0] - upper_coord[0]
mask_width = lower_coord[1] - upper_coord[1]
assert mask_height > 0
assert mask_width > 0
mask = np.ones((img_height, img_width, num_channels))
zeros = np.zeros((mask_height, mask_width, num_channels))
mask[upper_coord[0]:lower_coord[0], upper_coord[1]:lower_coord[1], :] = (
zeros)
return mask, upper_coord, lower_coord
def cutout_numpy(img, size=16):
"""Apply cutout with mask of shape `size` x `size` to `img`.
The cutout operation is from the paper https://arxiv.org/abs/1708.04552.
This operation applies a `size`x`size` mask of zeros to a random location
within `img`.
Args:
img: Numpy image that cutout will be applied to.
size: Height/width of the cutout mask that will be applied to `img`.
Returns:
A numpy tensor that is the result of applying the cutout mask to `img`.
"""
img_height, img_width, num_channels = (img.shape[0], img.shape[1],
img.shape[2])
assert len(img.shape) == 3
mask, _, _ = create_cutout_mask(img_height, img_width, num_channels, size)
return img * mask
def float_parameter(level, maxval):
"""Helper function to scale `val` between 0 and maxval .
Args:
level: Level of the operation that will be between [0, `PARAMETER_MAX`].
maxval: Maximum value that the operation can have. This will be scaled
to level/PARAMETER_MAX.
Returns:
A float that results from scaling `maxval` according to `level`.
"""
return float(level) * maxval / PARAMETER_MAX
def int_parameter(level, maxval):
"""Helper function to scale `val` between 0 and maxval .
Args:
level: Level of the operation that will be between [0, `PARAMETER_MAX`].
maxval: Maximum value that the operation can have. This will be scaled
to level/PARAMETER_MAX.
Returns:
An int that results from scaling `maxval` according to `level`.
"""
return int(level * maxval / PARAMETER_MAX)
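# Illustrative sanity check (not part of the original module): with
# PARAMETER_MAX = 10, a level of 5 maps to half of maxval.
#   float_parameter(5, 0.3)  -> 0.15
#   int_parameter(5, 30)     -> 15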
def pil_wrap(img):
"""Convert the `img` numpy tensor to a PIL Image."""
return Image.fromarray(
np.uint8((img * STDS + MEANS) * 255.0)).convert('RGBA')
def pil_unwrap(pil_img):
"""Converts the PIL img to a numpy array."""
pic_array = (np.array(pil_img.getdata()).reshape((32, 32, 4)) / 255.0)
i1, i2 = np.where(pic_array[:, :, 3] == 0)
pic_array = (pic_array[:, :, :3] - MEANS) / STDS
pic_array[i1, i2] = [0, 0, 0]
return pic_array
def apply_policy(policy, img):
"""Apply the `policy` to the numpy `img`.
Args:
policy: A list of tuples with the form (name, probability, level) where
`name` is the name of the augmentation operation to apply, `probability`
is the probability of applying the operation and `level` is what strength
the operation to apply.
img: Numpy image that will have `policy` applied to it.
Returns:
The result of applying `policy` to `img`.
"""
pil_img = pil_wrap(img)
for xform in policy:
assert len(xform) == 3
name, probability, level = xform
xform_fn = NAME_TO_TRANSFORM[name].pil_transformer(probability, level)
pil_img = xform_fn(pil_img)
return pil_unwrap(pil_img)
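# Example usage (illustrative only; assumes a normalized 32x32x3 numpy image,
# e.g. one CIFAR training image after the (x - MEANS) / STDS step):
#   policy = [('Rotate', 0.8, 6), ('Equalize', 0.5, 3)]
#   augmented = apply_policy(policy, img)  # same shape and normalization as img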
class TransformFunction(object):
"""Wraps the Transform function for pretty printing options."""
def __init__(self, func, name):
self.f = func
self.name = name
def __repr__(self):
return '<' + self.name + '>'
def __call__(self, pil_img):
return self.f(pil_img)
class TransformT(object):
"""Each instance of this class represents a specific transform."""
def __init__(self, name, xform_fn):
self.name = name
self.xform = xform_fn
def pil_transformer(self, probability, level):
def return_function(im):
if random.random() < probability:
im = self.xform(im, level)
return im
name = self.name + '({:.1f},{})'.format(probability, level)
return TransformFunction(return_function, name)
def do_transform(self, image, level):
f = self.pil_transformer(PARAMETER_MAX, level)
return pil_unwrap(f(pil_wrap(image)))
################## Transform Functions ##################
identity = TransformT('identity', lambda pil_img, level: pil_img)
flip_lr = TransformT(
'FlipLR',
lambda pil_img, level: pil_img.transpose(Image.FLIP_LEFT_RIGHT))
flip_ud = TransformT(
'FlipUD',
lambda pil_img, level: pil_img.transpose(Image.FLIP_TOP_BOTTOM))
# pylint:disable=g-long-lambda
auto_contrast = TransformT(
'AutoContrast',
lambda pil_img, level: ImageOps.autocontrast(
pil_img.convert('RGB')).convert('RGBA'))
equalize = TransformT(
'Equalize',
lambda pil_img, level: ImageOps.equalize(
pil_img.convert('RGB')).convert('RGBA'))
invert = TransformT(
'Invert',
lambda pil_img, level: ImageOps.invert(
pil_img.convert('RGB')).convert('RGBA'))
# pylint:enable=g-long-lambda
blur = TransformT(
'Blur', lambda pil_img, level: pil_img.filter(ImageFilter.BLUR))
smooth = TransformT(
'Smooth',
lambda pil_img, level: pil_img.filter(ImageFilter.SMOOTH))
def _rotate_impl(pil_img, level):
"""Rotates `pil_img` from -30 to 30 degrees depending on `level`."""
degrees = int_parameter(level, 30)
if random.random() > 0.5:
degrees = -degrees
return pil_img.rotate(degrees)
rotate = TransformT('Rotate', _rotate_impl)
def _posterize_impl(pil_img, level):
"""Applies PIL Posterize to `pil_img`."""
level = int_parameter(level, 4)
return ImageOps.posterize(pil_img.convert('RGB'), 4 - level).convert('RGBA')
posterize = TransformT('Posterize', _posterize_impl)
def _shear_x_impl(pil_img, level):
"""Applies PIL ShearX to `pil_img`.
The ShearX operation shears the image along the horizontal axis with `level`
magnitude.
Args:
pil_img: Image in PIL object.
level: Strength of the operation specified as an Integer from
[0, `PARAMETER_MAX`].
Returns:
A PIL Image that has had ShearX applied to it.
"""
level = float_parameter(level, 0.3)
if random.random() > 0.5:
level = -level
return pil_img.transform((32, 32), Image.AFFINE, (1, level, 0, 0, 1, 0))
shear_x = TransformT('ShearX', _shear_x_impl)
def _shear_y_impl(pil_img, level):
"""Applies PIL ShearY to `pil_img`.
The ShearY operation shears the image along the vertical axis with `level`
magnitude.
Args:
pil_img: Image in PIL object.
level: Strength of the operation specified as an Integer from
[0, `PARAMETER_MAX`].
Returns:
A PIL Image that has had ShearY applied to it.
"""
level = float_parameter(level, 0.3)
if random.random() > 0.5:
level = -level
return pil_img.transform((32, 32), Image.AFFINE, (1, 0, 0, level, 1, 0))
shear_y = TransformT('ShearY', _shear_y_impl)
def _translate_x_impl(pil_img, level):
"""Applies PIL TranslateX to `pil_img`.
Translate the image in the horizontal direction by `level`
number of pixels.
Args:
pil_img: Image in PIL object.
level: Strength of the operation specified as an Integer from
[0, `PARAMETER_MAX`].
Returns:
A PIL Image that has had TranslateX applied to it.
"""
level = int_parameter(level, 10)
if random.random() > 0.5:
level = -level
return pil_img.transform((32, 32), Image.AFFINE, (1, 0, level, 0, 1, 0))
translate_x = TransformT('TranslateX', _translate_x_impl)
def _translate_y_impl(pil_img, level):
"""Applies PIL TranslateY to `pil_img`.
Translate the image in the vertical direction by `level`
number of pixels.
Args:
pil_img: Image in PIL object.
level: Strength of the operation specified as an Integer from
[0, `PARAMETER_MAX`].
Returns:
A PIL Image that has had TranslateY applied to it.
"""
level = int_parameter(level, 10)
if random.random() > 0.5:
level = -level
return pil_img.transform((32, 32), Image.AFFINE, (1, 0, 0, 0, 1, level))
translate_y = TransformT('TranslateY', _translate_y_impl)
def _crop_impl(pil_img, level, interpolation=Image.BILINEAR):
"""Applies a crop to `pil_img` with the size depending on the `level`."""
cropped = pil_img.crop((level, level, IMAGE_SIZE - level, IMAGE_SIZE - level))
resized = cropped.resize((IMAGE_SIZE, IMAGE_SIZE), interpolation)
return resized
crop_bilinear = TransformT('CropBilinear', _crop_impl)
def _solarize_impl(pil_img, level):
"""Applies PIL Solarize to `pil_img`.
The Solarize operation inverts all pixel values above a threshold; the
threshold decreases as `level` increases.
Args:
pil_img: Image in PIL object.
level: Strength of the operation specified as an Integer from
[0, `PARAMETER_MAX`].
Returns:
A PIL Image that has had Solarize applied to it.
"""
level = int_parameter(level, 256)
return ImageOps.solarize(pil_img.convert('RGB'), 256 - level).convert('RGBA')
solarize = TransformT('Solarize', _solarize_impl)
def _cutout_pil_impl(pil_img, level):
"""Apply cutout to pil_img at the specified level."""
size = int_parameter(level, 20)
if size <= 0:
return pil_img
img_height, img_width, num_channels = (32, 32, 3)
_, upper_coord, lower_coord = (
create_cutout_mask(img_height, img_width, num_channels, size))
pixels = pil_img.load() # create the pixel map
for i in range(upper_coord[0], lower_coord[0]): # for every col:
for j in range(upper_coord[1], lower_coord[1]): # For every row
pixels[i, j] = (125, 122, 113, 0) # set the colour accordingly
return pil_img
cutout = TransformT('Cutout', _cutout_pil_impl)
def _enhancer_impl(enhancer):
"""Sets level to be between 0.1 and 1.8 for ImageEnhance transforms of PIL."""
def impl(pil_img, level):
v = float_parameter(level, 1.8) + .1 # going to 0 just destroys it
return enhancer(pil_img).enhance(v)
return impl
color = TransformT('Color', _enhancer_impl(ImageEnhance.Color))
contrast = TransformT('Contrast', _enhancer_impl(ImageEnhance.Contrast))
brightness = TransformT('Brightness', _enhancer_impl(
ImageEnhance.Brightness))
sharpness = TransformT('Sharpness', _enhancer_impl(ImageEnhance.Sharpness))
ALL_TRANSFORMS = [
flip_lr,
flip_ud,
auto_contrast,
equalize,
invert,
rotate,
posterize,
crop_bilinear,
solarize,
color,
contrast,
brightness,
sharpness,
shear_x,
shear_y,
translate_x,
translate_y,
cutout,
blur,
smooth
]
NAME_TO_TRANSFORM = {t.name: t for t in ALL_TRANSFORMS}
TRANSFORM_NAMES = NAME_TO_TRANSFORM.keys()
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains convenience wrappers for typical Neural Network TensorFlow layers.
Ops that have different behavior during training or eval have an is_training
parameter.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
arg_scope = tf.contrib.framework.arg_scope
def variable(name, shape, dtype, initializer, trainable):
"""Returns a TF variable with the passed in specifications."""
var = tf.get_variable(
name,
shape=shape,
dtype=dtype,
initializer=initializer,
trainable=trainable)
return var
def global_avg_pool(x, scope=None):
"""Average pools away spatial height and width dimension of 4D tensor."""
assert x.get_shape().ndims == 4
with tf.name_scope(scope, 'global_avg_pool', [x]):
kernel_size = (1, int(x.shape[1]), int(x.shape[2]), 1)
squeeze_dims = (1, 2)
result = tf.nn.avg_pool(
x,
ksize=kernel_size,
strides=(1, 1, 1, 1),
padding='VALID',
data_format='NHWC')
return tf.squeeze(result, squeeze_dims)
def zero_pad(inputs, in_filter, out_filter):
"""Zero pads `input` tensor to have `out_filter` number of filters."""
outputs = tf.pad(inputs, [[0, 0], [0, 0], [0, 0],
[(out_filter - in_filter) // 2,
(out_filter - in_filter) // 2]])
return outputs
@tf.contrib.framework.add_arg_scope
def batch_norm(inputs,
decay=0.999,
center=True,
scale=False,
epsilon=0.001,
is_training=True,
reuse=None,
scope=None):
"""Small wrapper around tf.contrib.layers.batch_norm."""
return tf.contrib.layers.batch_norm(
inputs,
decay=decay,
center=center,
scale=scale,
epsilon=epsilon,
activation_fn=None,
param_initializers=None,
updates_collections=tf.GraphKeys.UPDATE_OPS,
is_training=is_training,
reuse=reuse,
trainable=True,
fused=True,
data_format='NHWC',
zero_debias_moving_mean=False,
scope=scope)
def stride_arr(stride_h, stride_w):
return [1, stride_h, stride_w, 1]
@tf.contrib.framework.add_arg_scope
def conv2d(inputs,
num_filters_out,
kernel_size,
stride=1,
scope=None,
reuse=None):
"""Adds a 2D convolution.
conv2d creates a variable called 'weights', representing the convolutional
kernel, that is convolved with the input.
Args:
inputs: a 4D tensor in NHWC format.
num_filters_out: the number of output filters.
kernel_size: an int specifying the kernel height and width size.
stride: an int specifying the height and width stride.
scope: Optional scope for variable_scope.
reuse: whether or not the layer and its variables should be reused.
Returns:
a tensor that is the result of a convolution being applied to `inputs`.
"""
with tf.variable_scope(scope, 'Conv', [inputs], reuse=reuse):
num_filters_in = int(inputs.shape[3])
weights_shape = [kernel_size, kernel_size, num_filters_in, num_filters_out]
# Initialization
n = int(weights_shape[0] * weights_shape[1] * weights_shape[3])
weights_initializer = tf.random_normal_initializer(
stddev=np.sqrt(2.0 / n))
weights = variable(
name='weights',
shape=weights_shape,
dtype=tf.float32,
initializer=weights_initializer,
trainable=True)
strides = stride_arr(stride, stride)
outputs = tf.nn.conv2d(
inputs, weights, strides, padding='SAME', data_format='NHWC')
return outputs
@tf.contrib.framework.add_arg_scope
def fc(inputs,
num_units_out,
scope=None,
reuse=None):
"""Creates a fully connected layer applied to `inputs`.
Args:
inputs: a tensor that the fully connected layer will be applied to. It
will be reshaped if it is not 2D.
num_units_out: the number of output units in the layer.
scope: Optional scope for variable_scope.
reuse: whether or not the layer and its variables should be reused.
Returns:
a tensor that is the result of applying a linear matrix to `inputs`.
"""
if len(inputs.shape) > 2:
inputs = tf.reshape(inputs, [int(inputs.shape[0]), -1])
with tf.variable_scope(scope, 'FC', [inputs], reuse=reuse):
num_units_in = inputs.shape[1]
weights_shape = [num_units_in, num_units_out]
unif_init_range = 1.0 / (num_units_out)**(0.5)
weights_initializer = tf.random_uniform_initializer(
-unif_init_range, unif_init_range)
weights = variable(
name='weights',
shape=weights_shape,
dtype=tf.float32,
initializer=weights_initializer,
trainable=True)
bias_initializer = tf.constant_initializer(0.0)
biases = variable(
name='biases',
shape=[num_units_out,],
dtype=tf.float32,
initializer=bias_initializer,
trainable=True)
outputs = tf.nn.xw_plus_b(inputs, weights, biases)
return outputs
@tf.contrib.framework.add_arg_scope
def avg_pool(inputs, kernel_size, stride=2, padding='VALID', scope=None):
"""Wrapper around tf.nn.avg_pool."""
with tf.name_scope(scope, 'AvgPool', [inputs]):
kernel = stride_arr(kernel_size, kernel_size)
strides = stride_arr(stride, stride)
return tf.nn.avg_pool(
inputs,
ksize=kernel,
strides=strides,
padding=padding,
data_format='NHWC')
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data utils for CIFAR-10 and CIFAR-100."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import cPickle
import os
import augmentation_transforms
import numpy as np
import policies as found_policies
import tensorflow as tf
# pylint:disable=logging-format-interpolation
class DataSet(object):
"""Dataset object that produces augmented training and eval data."""
def __init__(self, hparams):
self.hparams = hparams
self.epochs = 0
self.curr_train_index = 0
all_labels = []
self.good_policies = found_policies.good_policies()
# Determine how many data batches to load
num_data_batches_to_load = 5
total_batches_to_load = num_data_batches_to_load
train_batches_to_load = total_batches_to_load
assert hparams.train_size + hparams.validation_size <= 50000
if hparams.eval_test:
total_batches_to_load += 1
# Determine how many images we have loaded
total_dataset_size = 10000 * num_data_batches_to_load
train_dataset_size = total_dataset_size
if hparams.eval_test:
total_dataset_size += 10000
if hparams.dataset == 'cifar10':
all_data = np.empty((total_batches_to_load, 10000, 3072), dtype=np.uint8)
elif hparams.dataset == 'cifar100':
assert num_data_batches_to_load == 5
all_data = np.empty((1, 50000, 3072), dtype=np.uint8)
if hparams.eval_test:
test_data = np.empty((1, 10000, 3072), dtype=np.uint8)
if hparams.dataset == 'cifar10':
tf.logging.info('Cifar10')
datafiles = [
'data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4',
'data_batch_5']
datafiles = datafiles[:train_batches_to_load]
if hparams.eval_test:
datafiles.append('test_batch')
num_classes = 10
elif hparams.dataset == 'cifar100':
datafiles = ['train']
if hparams.eval_test:
datafiles.append('test')
num_classes = 100
else:
raise NotImplementedError('Unimplemented dataset: ', hparams.dataset)
if hparams.dataset != 'test':
for file_num, f in enumerate(datafiles):
d = unpickle(os.path.join(hparams.data_path, f))
if f == 'test':
test_data[0] = copy.deepcopy(d['data'])
all_data = np.concatenate([all_data, test_data], axis=1)
else:
all_data[file_num] = copy.deepcopy(d['data'])
if hparams.dataset == 'cifar10':
labels = np.array(d['labels'])
else:
labels = np.array(d['fine_labels'])
nsamples = len(labels)
for idx in range(nsamples):
all_labels.append(labels[idx])
all_data = all_data.reshape(total_dataset_size, 3072)
all_data = all_data.reshape(-1, 3, 32, 32)
all_data = all_data.transpose(0, 2, 3, 1).copy()
all_data = all_data / 255.0
mean = augmentation_transforms.MEANS
std = augmentation_transforms.STDS
tf.logging.info('mean:{} std: {}'.format(mean, std))
all_data = (all_data - mean) / std
all_labels = np.eye(num_classes)[np.array(all_labels, dtype=np.int32)]
assert len(all_data) == len(all_labels)
tf.logging.info(
'In CIFAR10 loader, number of images: {}'.format(len(all_data)))
# Break off test data
if hparams.eval_test:
self.test_images = all_data[train_dataset_size:]
self.test_labels = all_labels[train_dataset_size:]
# Shuffle the rest of the data
all_data = all_data[:train_dataset_size]
all_labels = all_labels[:train_dataset_size]
np.random.seed(0)
perm = np.arange(len(all_data))
np.random.shuffle(perm)
all_data = all_data[perm]
all_labels = all_labels[perm]
# Break into train and val
train_size, val_size = hparams.train_size, hparams.validation_size
assert 50000 >= train_size + val_size
self.train_images = all_data[:train_size]
self.train_labels = all_labels[:train_size]
self.val_images = all_data[train_size:train_size + val_size]
self.val_labels = all_labels[train_size:train_size + val_size]
self.num_train = self.train_images.shape[0]
def next_batch(self):
"""Return the next minibatch of augmented data."""
next_train_index = self.curr_train_index + self.hparams.batch_size
if next_train_index > self.num_train:
# Increase epoch number
epoch = self.epochs + 1
self.reset()
self.epochs = epoch
batched_data = (
self.train_images[self.curr_train_index:
self.curr_train_index + self.hparams.batch_size],
self.train_labels[self.curr_train_index:
self.curr_train_index + self.hparams.batch_size])
final_imgs = []
images, labels = batched_data
for data in images:
epoch_policy = self.good_policies[np.random.choice(
len(self.good_policies))]
final_img = augmentation_transforms.apply_policy(
epoch_policy, data)
final_img = augmentation_transforms.random_flip(
augmentation_transforms.zero_pad_and_crop(final_img, 4))
# Apply cutout
final_img = augmentation_transforms.cutout_numpy(final_img)
final_imgs.append(final_img)
batched_data = (np.array(final_imgs, np.float32), labels)
self.curr_train_index += self.hparams.batch_size
return batched_data
def reset(self):
"""Reset training data and index into the training data."""
self.epochs = 0
# Shuffle the training data
perm = np.arange(self.num_train)
np.random.shuffle(perm)
assert self.num_train == self.train_images.shape[
0], 'Error incorrect shuffling mask'
self.train_images = self.train_images[perm]
self.train_labels = self.train_labels[perm]
self.curr_train_index = 0
def unpickle(f):
tf.logging.info('loading file: {}'.format(f))
fo = tf.gfile.Open(f, 'r')
d = cPickle.load(fo)
fo.close()
return d
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions used for training AutoAugment models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
def setup_loss(logits, labels):
"""Returns the cross entropy for the given `logits` and `labels`."""
predictions = tf.nn.softmax(logits)
cost = tf.losses.softmax_cross_entropy(onehot_labels=labels,
logits=logits)
return predictions, cost
def decay_weights(cost, weight_decay_rate):
"""Calculates the loss for l2 weight decay and adds it to `cost`."""
costs = []
for var in tf.trainable_variables():
costs.append(tf.nn.l2_loss(var))
cost += tf.multiply(weight_decay_rate, tf.add_n(costs))
return cost
def eval_child_model(session, model, data_loader, mode):
"""Evaluates `model` on held out data depending on `mode`.
Args:
session: TensorFlow session the model will be run with.
model: TensorFlow model that will be evaluated.
data_loader: DataSet object that contains data that `model` will
evaluate.
mode: Whether `model` will be evaluated on validation or test data.
Returns:
Accuracy of `model` when evaluated on the specified dataset.
Raises:
ValueError: if invalid dataset `mode` is specified.
"""
if mode == 'val':
images = data_loader.val_images
labels = data_loader.val_labels
elif mode == 'test':
images = data_loader.test_images
labels = data_loader.test_labels
else:
raise ValueError('Not valid eval mode')
assert len(images) == len(labels)
tf.logging.info('model.batch_size is {}'.format(model.batch_size))
assert len(images) % model.batch_size == 0
eval_batches = int(len(images) / model.batch_size)
for i in range(eval_batches):
eval_images = images[i * model.batch_size:(i + 1) * model.batch_size]
eval_labels = labels[i * model.batch_size:(i + 1) * model.batch_size]
_ = session.run(
model.eval_op,
feed_dict={
model.images: eval_images,
model.labels: eval_labels,
})
return session.run(model.accuracy)
def cosine_lr(learning_rate, epoch, iteration, batches_per_epoch, total_epochs):
"""Cosine Learning rate.
Args:
learning_rate: Initial learning rate.
epoch: Current epoch we are on. This is one-based.
iteration: Current batch in this epoch.
batches_per_epoch: Batches per epoch.
total_epochs: Total epochs you are training for.
Returns:
The learning rate to be used for this current batch.
"""
t_total = total_epochs * batches_per_epoch
t_cur = float(epoch * batches_per_epoch + iteration)
return 0.5 * learning_rate * (1 + np.cos(np.pi * t_cur / t_total))
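# Illustrative values (not part of the original module), for
# learning_rate=0.1, batches_per_epoch=390 and total_epochs=200:
#   cosine_lr(0.1, 0, 0, 390, 200)    -> 0.1   (start of training)
#   cosine_lr(0.1, 100, 0, 390, 200)  -> 0.05  (halfway through training)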
def get_lr(curr_epoch, hparams, iteration=None):
"""Returns the learning rate during training based on the current epoch."""
assert iteration is not None
batches_per_epoch = int(hparams.train_size / hparams.batch_size)
lr = cosine_lr(hparams.lr, curr_epoch, iteration, batches_per_epoch,
hparams.num_epochs)
return lr
def run_epoch_training(session, model, data_loader, curr_epoch):
"""Runs one epoch of training for the model passed in.
Args:
session: TensorFlow session the model will be run with.
model: TensorFlow model that will be evaluated.
data_loader: DataSet object that contains data that `model` will
evaluate.
curr_epoch: How many epochs of training have been done so far.
Returns:
The accuracy of 'model' on the training set
"""
steps_per_epoch = int(model.hparams.train_size / model.hparams.batch_size)
tf.logging.info('steps per epoch: {}'.format(steps_per_epoch))
curr_step = session.run(model.global_step)
assert curr_step % steps_per_epoch == 0
# Get the current learning rate for the model based on the current epoch
curr_lr = get_lr(curr_epoch, model.hparams, iteration=0)
tf.logging.info('lr of {} for epoch {}'.format(curr_lr, curr_epoch))
for step in xrange(steps_per_epoch):
curr_lr = get_lr(curr_epoch, model.hparams, iteration=(step + 1))
# Update the lr rate variable to the current LR.
model.lr_rate_ph.load(curr_lr, session=session)
if step % 20 == 0:
tf.logging.info('Training {}/{}'.format(step, steps_per_epoch))
train_images, train_labels = data_loader.next_batch()
_, step, _ = session.run(
[model.train_op, model.global_step, model.eval_op],
feed_dict={
model.images: train_images,
model.labels: train_labels,
})
train_accuracy = session.run(model.accuracy)
tf.logging.info('Train accuracy: {}'.format(train_accuracy))
return train_accuracy
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def good_policies():
"""AutoAugment policies found on Cifar."""
exp0_0 = [
[('Invert', 0.1, 7), ('Contrast', 0.2, 6)],
[('Rotate', 0.7, 2), ('TranslateX', 0.3, 9)],
[('Sharpness', 0.8, 1), ('Sharpness', 0.9, 3)],
[('ShearY', 0.5, 8), ('TranslateY', 0.7, 9)],
[('AutoContrast', 0.5, 8), ('Equalize', 0.9, 2)]]
exp0_1 = [
[('Solarize', 0.4, 5), ('AutoContrast', 0.9, 3)],
[('TranslateY', 0.9, 9), ('TranslateY', 0.7, 9)],
[('AutoContrast', 0.9, 2), ('Solarize', 0.8, 3)],
[('Equalize', 0.8, 8), ('Invert', 0.1, 3)],
[('TranslateY', 0.7, 9), ('AutoContrast', 0.9, 1)]]
exp0_2 = [
[('Solarize', 0.4, 5), ('AutoContrast', 0.0, 2)],
[('TranslateY', 0.7, 9), ('TranslateY', 0.7, 9)],
[('AutoContrast', 0.9, 0), ('Solarize', 0.4, 3)],
[('Equalize', 0.7, 5), ('Invert', 0.1, 3)],
[('TranslateY', 0.7, 9), ('TranslateY', 0.7, 9)]]
exp0_3 = [
[('Solarize', 0.4, 5), ('AutoContrast', 0.9, 1)],
[('TranslateY', 0.8, 9), ('TranslateY', 0.9, 9)],
[('AutoContrast', 0.8, 0), ('TranslateY', 0.7, 9)],
[('TranslateY', 0.2, 7), ('Color', 0.9, 6)],
[('Equalize', 0.7, 6), ('Color', 0.4, 9)]]
exp1_0 = [
[('ShearY', 0.2, 7), ('Posterize', 0.3, 7)],
[('Color', 0.4, 3), ('Brightness', 0.6, 7)],
[('Sharpness', 0.3, 9), ('Brightness', 0.7, 9)],
[('Equalize', 0.6, 5), ('Equalize', 0.5, 1)],
[('Contrast', 0.6, 7), ('Sharpness', 0.6, 5)]]
exp1_1 = [
[('Brightness', 0.3, 7), ('AutoContrast', 0.5, 8)],
[('AutoContrast', 0.9, 4), ('AutoContrast', 0.5, 6)],
[('Solarize', 0.3, 5), ('Equalize', 0.6, 5)],
[('TranslateY', 0.2, 4), ('Sharpness', 0.3, 3)],
[('Brightness', 0.0, 8), ('Color', 0.8, 8)]]
exp1_2 = [
[('Solarize', 0.2, 6), ('Color', 0.8, 6)],
[('Solarize', 0.2, 6), ('AutoContrast', 0.8, 1)],
[('Solarize', 0.4, 1), ('Equalize', 0.6, 5)],
[('Brightness', 0.0, 0), ('Solarize', 0.5, 2)],
[('AutoContrast', 0.9, 5), ('Brightness', 0.5, 3)]]
exp1_3 = [
[('Contrast', 0.7, 5), ('Brightness', 0.0, 2)],
[('Solarize', 0.2, 8), ('Solarize', 0.1, 5)],
[('Contrast', 0.5, 1), ('TranslateY', 0.2, 9)],
[('AutoContrast', 0.6, 5), ('TranslateY', 0.0, 9)],
[('AutoContrast', 0.9, 4), ('Equalize', 0.8, 4)]]
exp1_4 = [
[('Brightness', 0.0, 7), ('Equalize', 0.4, 7)],
[('Solarize', 0.2, 5), ('Equalize', 0.7, 5)],
[('Equalize', 0.6, 8), ('Color', 0.6, 2)],
[('Color', 0.3, 7), ('Color', 0.2, 4)],
[('AutoContrast', 0.5, 2), ('Solarize', 0.7, 2)]]
exp1_5 = [
[('AutoContrast', 0.2, 0), ('Equalize', 0.1, 0)],
[('ShearY', 0.6, 5), ('Equalize', 0.6, 5)],
[('Brightness', 0.9, 3), ('AutoContrast', 0.4, 1)],
[('Equalize', 0.8, 8), ('Equalize', 0.7, 7)],
[('Equalize', 0.7, 7), ('Solarize', 0.5, 0)]]
exp1_6 = [
[('Equalize', 0.8, 4), ('TranslateY', 0.8, 9)],
[('TranslateY', 0.8, 9), ('TranslateY', 0.6, 9)],
[('TranslateY', 0.9, 0), ('TranslateY', 0.5, 9)],
[('AutoContrast', 0.5, 3), ('Solarize', 0.3, 4)],
[('Solarize', 0.5, 3), ('Equalize', 0.4, 4)]]
exp2_0 = [
[('Color', 0.7, 7), ('TranslateX', 0.5, 8)],
[('Equalize', 0.3, 7), ('AutoContrast', 0.4, 8)],
[('TranslateY', 0.4, 3), ('Sharpness', 0.2, 6)],
[('Brightness', 0.9, 6), ('Color', 0.2, 8)],
[('Solarize', 0.5, 2), ('Invert', 0.0, 3)]]
exp2_1 = [
[('AutoContrast', 0.1, 5), ('Brightness', 0.0, 0)],
[('Cutout', 0.2, 4), ('Equalize', 0.1, 1)],
[('Equalize', 0.7, 7), ('AutoContrast', 0.6, 4)],
[('Color', 0.1, 8), ('ShearY', 0.2, 3)],
[('ShearY', 0.4, 2), ('Rotate', 0.7, 0)]]
exp2_2 = [
[('ShearY', 0.1, 3), ('AutoContrast', 0.9, 5)],
[('TranslateY', 0.3, 6), ('Cutout', 0.3, 3)],
[('Equalize', 0.5, 0), ('Solarize', 0.6, 6)],
[('AutoContrast', 0.3, 5), ('Rotate', 0.2, 7)],
[('Equalize', 0.8, 2), ('Invert', 0.4, 0)]]
exp2_3 = [
[('Equalize', 0.9, 5), ('Color', 0.7, 0)],
[('Equalize', 0.1, 1), ('ShearY', 0.1, 3)],
[('AutoContrast', 0.7, 3), ('Equalize', 0.7, 0)],
[('Brightness', 0.5, 1), ('Contrast', 0.1, 7)],
[('Contrast', 0.1, 4), ('Solarize', 0.6, 5)]]
exp2_4 = [
[('Solarize', 0.2, 3), ('ShearX', 0.0, 0)],
[('TranslateX', 0.3, 0), ('TranslateX', 0.6, 0)],
[('Equalize', 0.5, 9), ('TranslateY', 0.6, 7)],
[('ShearX', 0.1, 0), ('Sharpness', 0.5, 1)],
[('Equalize', 0.8, 6), ('Invert', 0.3, 6)]]
exp2_5 = [
[('AutoContrast', 0.3, 9), ('Cutout', 0.5, 3)],
[('ShearX', 0.4, 4), ('AutoContrast', 0.9, 2)],
[('ShearX', 0.0, 3), ('Posterize', 0.0, 3)],
[('Solarize', 0.4, 3), ('Color', 0.2, 4)],
[('Equalize', 0.1, 4), ('Equalize', 0.7, 6)]]
exp2_6 = [
[('Equalize', 0.3, 8), ('AutoContrast', 0.4, 3)],
[('Solarize', 0.6, 4), ('AutoContrast', 0.7, 6)],
[('AutoContrast', 0.2, 9), ('Brightness', 0.4, 8)],
[('Equalize', 0.1, 0), ('Equalize', 0.0, 6)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 4)]]
exp2_7 = [
[('Equalize', 0.5, 5), ('AutoContrast', 0.1, 2)],
[('Solarize', 0.5, 5), ('AutoContrast', 0.9, 5)],
[('AutoContrast', 0.6, 1), ('AutoContrast', 0.7, 8)],
[('Equalize', 0.2, 0), ('AutoContrast', 0.1, 2)],
[('Equalize', 0.6, 9), ('Equalize', 0.4, 4)]]
exp0s = exp0_0 + exp0_1 + exp0_2 + exp0_3
exp1s = exp1_0 + exp1_1 + exp1_2 + exp1_3 + exp1_4 + exp1_5 + exp1_6
exp2s = exp2_0 + exp2_1 + exp2_2 + exp2_3 + exp2_4 + exp2_5 + exp2_6 + exp2_7
return exp0s + exp1s + exp2s
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Builds the Shake-Shake Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import custom_ops as ops
import tensorflow as tf
def round_int(x):
"""Rounds `x` and then converts to an int."""
return int(math.floor(x + 0.5))
def shortcut(x, output_filters, stride):
"""Applies strided avg pool or zero padding to make output_filters match x."""
num_filters = int(x.shape[3])
if stride == 2:
x = ops.avg_pool(x, 2, stride=stride, padding='SAME')
if num_filters != output_filters:
diff = output_filters - num_filters
assert diff > 0
# Zero pad with `diff` zeros along the channel dimension
padding = [[0, 0], [0, 0], [0, 0], [0, diff]]
x = tf.pad(x, padding)
return x
def calc_prob(curr_layer, total_layers, p_l):
"""Calculates drop prob depending on the current layer."""
return 1 - (float(curr_layer) / total_layers) * p_l
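# Illustrative values (not part of the original module): with p_l = 0.5 and
# total_layers = 90, the first layer keeps prob ~= 0.994 and the last layer
# keeps prob = 0.5, i.e. the drop probability grows linearly with depth.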
def bottleneck_layer(x, n, stride, prob, is_training, alpha, beta):
"""Bottleneck layer for shake drop model."""
assert alpha[1] > alpha[0]
assert beta[1] > beta[0]
with tf.variable_scope('bottleneck_{}'.format(prob)):
input_layer = x
x = ops.batch_norm(x, scope='bn_1_pre')
x = ops.conv2d(x, n, 1, scope='1x1_conv_contract')
x = ops.batch_norm(x, scope='bn_1_post')
x = tf.nn.relu(x)
x = ops.conv2d(x, n, 3, stride=stride, scope='3x3')
x = ops.batch_norm(x, scope='bn_2')
x = tf.nn.relu(x)
x = ops.conv2d(x, n * 4, 1, scope='1x1_conv_expand')
x = ops.batch_norm(x, scope='bn_3')
# Apply regularization here
# Sample bernoulli with prob
if is_training:
batch_size = tf.shape(x)[0]
bern_shape = [batch_size, 1, 1, 1]
random_tensor = prob
random_tensor += tf.random_uniform(bern_shape, dtype=tf.float32)
binary_tensor = tf.floor(random_tensor)
alpha_values = tf.random_uniform(
[batch_size, 1, 1, 1], minval=alpha[0], maxval=alpha[1],
dtype=tf.float32)
beta_values = tf.random_uniform(
[batch_size, 1, 1, 1], minval=beta[0], maxval=beta[1],
dtype=tf.float32)
rand_forward = (
binary_tensor + alpha_values - binary_tensor * alpha_values)
rand_backward = (
binary_tensor + beta_values - binary_tensor * beta_values)
x = x * rand_backward + tf.stop_gradient(x * rand_forward -
x * rand_backward)
else:
expected_alpha = (alpha[1] + alpha[0])/2
# prob is the expectation of the bernoulli variable
x = (prob + expected_alpha - prob * expected_alpha) * x
res = shortcut(input_layer, n * 4, stride)
return x + res
def build_shake_drop_model(images, num_classes, is_training):
"""Builds the PyramidNet Shake-Drop model.
Build the PyramidNet Shake-Drop model from https://arxiv.org/abs/1802.02375.
Args:
images: Tensor of images that will be fed into the model.
num_classes: Number of classes that the model needs to predict.
is_training: Is the model training or not.
Returns:
The logits of the PyramidNet Shake-Drop model.
"""
# ShakeDrop Hparams
p_l = 0.5
alpha_shake = [-1, 1]
beta_shake = [0, 1]
# PyramidNet Hparams
alpha = 200
depth = 272
# This is for the bottleneck architecture specifically
n = int((depth - 2) / 9)
start_channel = 16
add_channel = alpha / (3 * n)
# Building the models
x = images
x = ops.conv2d(x, 16, 3, scope='init_conv')
x = ops.batch_norm(x, scope='init_bn')
layer_num = 1
total_layers = n * 3
start_channel += add_channel
prob = calc_prob(layer_num, total_layers, p_l)
x = bottleneck_layer(
x, round_int(start_channel), 1, prob, is_training, alpha_shake,
beta_shake)
layer_num += 1
for _ in range(1, n):
start_channel += add_channel
prob = calc_prob(layer_num, total_layers, p_l)
x = bottleneck_layer(
x, round_int(start_channel), 1, prob, is_training, alpha_shake,
beta_shake)
layer_num += 1
start_channel += add_channel
prob = calc_prob(layer_num, total_layers, p_l)
x = bottleneck_layer(
x, round_int(start_channel), 2, prob, is_training, alpha_shake,
beta_shake)
layer_num += 1
for _ in range(1, n):
start_channel += add_channel
prob = calc_prob(layer_num, total_layers, p_l)
x = bottleneck_layer(
x, round_int(start_channel), 1, prob, is_training, alpha_shake,
beta_shake)
layer_num += 1
start_channel += add_channel
prob = calc_prob(layer_num, total_layers, p_l)
x = bottleneck_layer(
x, round_int(start_channel), 2, prob, is_training, alpha_shake,
beta_shake)
layer_num += 1
for _ in range(1, n):
start_channel += add_channel
prob = calc_prob(layer_num, total_layers, p_l)
x = bottleneck_layer(
x, round_int(start_channel), 1, prob, is_training, alpha_shake,
beta_shake)
layer_num += 1
assert layer_num - 1 == total_layers
x = ops.batch_norm(x, scope='final_bn')
x = tf.nn.relu(x)
x = ops.global_avg_pool(x)
# Fully connected
logits = ops.fc(x, num_classes)
return logits
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Builds the Shake-Shake Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import custom_ops as ops
import tensorflow as tf
def _shake_shake_skip_connection(x, output_filters, stride):
"""Adds a residual connection to the filter x for the shake-shake model."""
curr_filters = int(x.shape[3])
if curr_filters == output_filters:
return x
stride_spec = ops.stride_arr(stride, stride)
# Skip path 1
path1 = tf.nn.avg_pool(
x, [1, 1, 1, 1], stride_spec, 'VALID', data_format='NHWC')
path1 = ops.conv2d(path1, int(output_filters / 2), 1, scope='path1_conv')
# Skip path 2
# First pad with 0's then crop
pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]]
path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :]
concat_axis = 3
path2 = tf.nn.avg_pool(
path2, [1, 1, 1, 1], stride_spec, 'VALID', data_format='NHWC')
path2 = ops.conv2d(path2, int(output_filters / 2), 1, scope='path2_conv')
# Concat and apply BN
final_path = tf.concat(values=[path1, path2], axis=concat_axis)
final_path = ops.batch_norm(final_path, scope='final_path_bn')
return final_path
def _shake_shake_branch(x, output_filters, stride, rand_forward, rand_backward,
is_training):
"""Building a 2 branching convnet."""
x = tf.nn.relu(x)
x = ops.conv2d(x, output_filters, 3, stride=stride, scope='conv1')
x = ops.batch_norm(x, scope='bn1')
x = tf.nn.relu(x)
x = ops.conv2d(x, output_filters, 3, scope='conv2')
x = ops.batch_norm(x, scope='bn2')
if is_training:
x = x * rand_backward + tf.stop_gradient(x * rand_forward -
x * rand_backward)
else:
x *= 1.0 / 2
return x
def _shake_shake_block(x, output_filters, stride, is_training):
"""Builds a full shake-shake sub layer."""
batch_size = tf.shape(x)[0]
# Generate random numbers for scaling the branches
rand_forward = [
tf.random_uniform(
[batch_size, 1, 1, 1], minval=0, maxval=1, dtype=tf.float32)
for _ in range(2)
]
rand_backward = [
tf.random_uniform(
[batch_size, 1, 1, 1], minval=0, maxval=1, dtype=tf.float32)
for _ in range(2)
]
# Normalize so that all sum to 1
total_forward = tf.add_n(rand_forward)
total_backward = tf.add_n(rand_backward)
rand_forward = [samp / total_forward for samp in rand_forward]
rand_backward = [samp / total_backward for samp in rand_backward]
zipped_rand = zip(rand_forward, rand_backward)
branches = []
for branch, (r_forward, r_backward) in enumerate(zipped_rand):
with tf.variable_scope('branch_{}'.format(branch)):
b = _shake_shake_branch(x, output_filters, stride, r_forward, r_backward,
is_training)
branches.append(b)
res = _shake_shake_skip_connection(x, output_filters, stride)
return res + tf.add_n(branches)
def _shake_shake_layer(x, output_filters, num_blocks, stride,
is_training):
"""Builds many sub layers into one full layer."""
for block_num in range(num_blocks):
curr_stride = stride if (block_num == 0) else 1
with tf.variable_scope('layer_{}'.format(block_num)):
x = _shake_shake_block(x, output_filters, curr_stride,
is_training)
return x
def build_shake_shake_model(images, num_classes, hparams, is_training):
"""Builds the Shake-Shake model.
Build the Shake-Shake model from https://arxiv.org/abs/1705.07485.
Args:
images: Tensor of images that will be fed into the model.
num_classes: Number of classes that the model needs to predict.
hparams: tf.HParams object that contains additional hparams needed to
construct the model. In this case it is the `shake_shake_widen_factor`
that is used to determine how many filters the model has.
is_training: Is the model training or not.
Returns:
The logits of the Shake-Shake model.
"""
depth = 26
k = hparams.shake_shake_widen_factor # The widen factor
n = int((depth - 2) / 6)
x = images
x = ops.conv2d(x, 16, 3, scope='init_conv')
x = ops.batch_norm(x, scope='init_bn')
with tf.variable_scope('L1'):
x = _shake_shake_layer(x, 16 * k, n, 1, is_training)
with tf.variable_scope('L2'):
x = _shake_shake_layer(x, 32 * k, n, 2, is_training)
with tf.variable_scope('L3'):
x = _shake_shake_layer(x, 64 * k, n, 2, is_training)
x = tf.nn.relu(x)
x = ops.global_avg_pool(x)
# Fully connected
logits = ops.fc(x, num_classes)
return logits
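# Note (not part of the original module): with the hparams set in
# train_cifar.py, shake_shake_widen_factor=2 gives 16*2=32 base filters
# (the "26 2x32d" model) and shake_shake_widen_factor=6 gives 16*6=96 base
# filters (the "26 2x96d" model).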
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""AutoAugment Train/Eval module.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import os
import time
import custom_ops as ops
import data_utils
import helper_utils
import numpy as np
from shake_drop import build_shake_drop_model
from shake_shake import build_shake_shake_model
import tensorflow as tf
from wrn import build_wrn_model
tf.flags.DEFINE_string('model_name', 'wrn',
'wrn, shake_shake_32, shake_shake_96, shake_shake_112, '
'pyramid_net')
tf.flags.DEFINE_string('checkpoint_dir', '/tmp/training', 'Training Directory.')
tf.flags.DEFINE_string('data_path', '/tmp/data',
'Directory where dataset is located.')
tf.flags.DEFINE_string('dataset', 'cifar10',
'Dataset to train with. Either cifar10 or cifar100')
tf.flags.DEFINE_integer('use_cpu', 1, '1 if use CPU, else GPU.')
FLAGS = tf.flags.FLAGS
arg_scope = tf.contrib.framework.arg_scope
def setup_arg_scopes(is_training):
"""Sets up the argscopes that will be used when building an image model.
Args:
is_training: Is the model training or not.
Returns:
Arg scopes to be put around the model being constructed.
"""
batch_norm_decay = 0.9
batch_norm_epsilon = 1e-5
batch_norm_params = {
# Decay for the moving averages.
'decay': batch_norm_decay,
# epsilon to prevent 0s in variance.
'epsilon': batch_norm_epsilon,
'scale': True,
# collection containing the moving mean and moving variance.
'is_training': is_training,
}
scopes = []
scopes.append(arg_scope([ops.batch_norm], **batch_norm_params))
return scopes
def build_model(inputs, num_classes, is_training, hparams):
"""Constructs the vision model being trained/evaled.
Args:
inputs: input features/images being fed to the image model being built.
num_classes: number of output classes being predicted.
is_training: is the model training or not.
hparams: additional hyperparameters associated with the image model.
Returns:
The logits of the image model.
"""
scopes = setup_arg_scopes(is_training)
with contextlib.nested(*scopes):
if hparams.model_name == 'pyramid_net':
logits = build_shake_drop_model(
inputs, num_classes, is_training)
elif hparams.model_name == 'wrn':
logits = build_wrn_model(
inputs, num_classes, hparams.wrn_size)
elif hparams.model_name == 'shake_shake':
logits = build_shake_shake_model(
inputs, num_classes, hparams, is_training)
return logits
class CifarModel(object):
"""Builds an image model for Cifar10/Cifar100."""
def __init__(self, hparams):
self.hparams = hparams
def build(self, mode):
"""Construct the cifar model."""
assert mode in ['train', 'eval']
self.mode = mode
self._setup_misc(mode)
self._setup_images_and_labels()
self._build_graph(self.images, self.labels, mode)
self.init = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
def _setup_misc(self, mode):
"""Sets up miscellaneous in the cifar model constructor."""
self.lr_rate_ph = tf.Variable(0.0, name='lrn_rate', trainable=False)
self.reuse = None if (mode == 'train') else True
self.batch_size = self.hparams.batch_size
if mode == 'eval':
self.batch_size = 25
def _setup_images_and_labels(self):
"""Sets up image and label placeholders for the cifar model."""
if FLAGS.dataset == 'cifar10':
self.num_classes = 10
else:
self.num_classes = 100
self.images = tf.placeholder(tf.float32, [self.batch_size, 32, 32, 3])
self.labels = tf.placeholder(tf.float32,
[self.batch_size, self.num_classes])
def assign_epoch(self, session, epoch_value):
session.run(self._epoch_update, feed_dict={self._new_epoch: epoch_value})
def _build_graph(self, images, labels, mode):
"""Constructs the TF graph for the cifar model.
Args:
images: A 4-D image Tensor
labels: A 2-D labels Tensor.
mode: string indicating training mode (e.g., 'train', 'valid', 'test').
"""
is_training = 'train' in mode
if is_training:
self.global_step = tf.train.get_or_create_global_step()
logits = build_model(
images,
self.num_classes,
is_training,
self.hparams)
self.predictions, self.cost = helper_utils.setup_loss(
logits, labels)
self.accuracy, self.eval_op = tf.metrics.accuracy(
tf.argmax(labels, 1), tf.argmax(self.predictions, 1))
self._calc_num_trainable_params()
# Adds L2 weight decay to the cost
self.cost = helper_utils.decay_weights(self.cost,
self.hparams.weight_decay_rate)
if is_training:
self._build_train_op()
# Setup checkpointing for this child model
# Keep 2 or more checkpoints around during training.
with tf.device('/cpu:0'):
self.saver = tf.train.Saver(max_to_keep=2)
self.init = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
def _calc_num_trainable_params(self):
self.num_trainable_params = np.sum([
np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()
])
tf.logging.info('number of trainable params: {}'.format(
self.num_trainable_params))
def _build_train_op(self):
"""Builds the train op for the cifar model."""
hparams = self.hparams
tvars = tf.trainable_variables()
grads = tf.gradients(self.cost, tvars)
if hparams.gradient_clipping_by_global_norm > 0.0:
grads, norm = tf.clip_by_global_norm(
grads, hparams.gradient_clipping_by_global_norm)
tf.summary.scalar('grad_norm', norm)
# Setup the initial learning rate
initial_lr = self.lr_rate_ph
optimizer = tf.train.MomentumOptimizer(
initial_lr,
0.9,
use_nesterov=True)
self.optimizer = optimizer
apply_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=self.global_step, name='train_step')
train_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies([apply_op]):
self.train_op = tf.group(*train_ops)
class CifarModelTrainer(object):
"""Trains an instance of the CifarModel class."""
def __init__(self, hparams):
self._session = None
self.hparams = hparams
self.model_dir = os.path.join(FLAGS.checkpoint_dir, 'model')
self.log_dir = os.path.join(FLAGS.checkpoint_dir, 'log')
# Set the random seed to be sure the same validation set
# is used for each model
np.random.seed(0)
self.data_loader = data_utils.DataSet(hparams)
np.random.seed() # Put the random seed back to random
self.data_loader.reset()
def save_model(self, step=None):
"""Dumps model into the backup_dir.
Args:
step: If provided, creates a checkpoint with the given step
number, instead of overwriting the existing checkpoints.
"""
model_save_name = os.path.join(self.model_dir, 'model.ckpt')
if not tf.gfile.IsDirectory(self.model_dir):
tf.gfile.MakeDirs(self.model_dir)
self.saver.save(self.session, model_save_name, global_step=step)
tf.logging.info('Saved child model')
def extract_model_spec(self):
"""Loads a checkpoint with the architecture structure stored in the name."""
checkpoint_path = tf.train.latest_checkpoint(self.model_dir)
if checkpoint_path is not None:
self.saver.restore(self.session, checkpoint_path)
tf.logging.info('Loaded child model checkpoint from %s',
checkpoint_path)
else:
self.save_model(step=0)
def eval_child_model(self, model, data_loader, mode):
"""Evaluate the child model.
Args:
model: image model that will be evaluated.
data_loader: dataset object to extract eval data from.
mode: will the model be evaluated on train, val or test.
Returns:
Accuracy of the model on the specified dataset.
"""
tf.logging.info('Evaluating child model in mode %s', mode)
while True:
try:
with self._new_session(model):
accuracy = helper_utils.eval_child_model(
self.session,
model,
data_loader,
mode)
tf.logging.info('Eval child model accuracy: {}'.format(accuracy))
# If epoch trained without raising the below errors, break
# from loop.
break
except (tf.errors.AbortedError, tf.errors.UnavailableError) as e:
tf.logging.info('Retryable error caught: %s. Retrying.', e)
return accuracy
@contextlib.contextmanager
def _new_session(self, m):
"""Creates a new session for model m."""
# Create a new session for this model, initialize
# variables, and save / restore from
# checkpoint.
self._session = tf.Session(
'',
config=tf.ConfigProto(
allow_soft_placement=True, log_device_placement=False))
self.session.run(m.init)
# Load in a previous checkpoint, or save this one
self.extract_model_spec()
try:
yield
finally:
tf.Session.reset('')
self._session = None
def _build_models(self):
"""Builds the image models for train and eval."""
# Determine if we should build the train and eval model. When using
# distributed training we only want to build one or the other and not both.
with tf.variable_scope('model', use_resource=False):
m = CifarModel(self.hparams)
m.build('train')
self._num_trainable_params = m.num_trainable_params
self._saver = m.saver
with tf.variable_scope('model', reuse=True, use_resource=False):
meval = CifarModel(self.hparams)
meval.build('eval')
return m, meval
def _calc_starting_epoch(self, m):
"""Calculates the starting epoch for model m based on global step."""
hparams = self.hparams
batch_size = hparams.batch_size
steps_per_epoch = int(hparams.train_size / batch_size)
with self._new_session(m):
curr_step = self.session.run(m.global_step)
total_steps = steps_per_epoch * hparams.num_epochs
epochs_left = (total_steps - curr_step) // steps_per_epoch
starting_epoch = hparams.num_epochs - epochs_left
return starting_epoch
def _run_training_loop(self, m, curr_epoch):
"""Trains the cifar model `m` for one epoch."""
start_time = time.time()
while True:
try:
with self._new_session(m):
train_accuracy = helper_utils.run_epoch_training(
self.session, m, self.data_loader, curr_epoch)
tf.logging.info('Saving model after epoch')
self.save_model(step=curr_epoch)
break
except (tf.errors.AbortedError, tf.errors.UnavailableError) as e:
tf.logging.info('Retryable error caught: %s. Retrying.', e)
tf.logging.info('Finished epoch: {}'.format(curr_epoch))
tf.logging.info('Epoch time(min): {}'.format(
(time.time() - start_time) / 60.0))
return train_accuracy
def _compute_final_accuracies(self, meval):
"""Run once training is finished to compute final val/test accuracies."""
valid_accuracy = self.eval_child_model(meval, self.data_loader, 'val')
if self.hparams.eval_test:
test_accuracy = self.eval_child_model(meval, self.data_loader, 'test')
else:
test_accuracy = 0
tf.logging.info('Test Accuracy: {}'.format(test_accuracy))
return valid_accuracy, test_accuracy
def run_model(self):
"""Trains and evalutes the image model."""
hparams = self.hparams
# Build the child graph
with tf.Graph().as_default(), tf.device(
'/cpu:0' if FLAGS.use_cpu else '/gpu:0'):
m, meval = self._build_models()
# Figure out what epoch we are on
starting_epoch = self._calc_starting_epoch(m)
# Run the validation error right at the beginning
valid_accuracy = self.eval_child_model(
meval, self.data_loader, 'val')
tf.logging.info('Before Training Epoch: {} Val Acc: {}'.format(
starting_epoch, valid_accuracy))
training_accuracy = None
for curr_epoch in xrange(starting_epoch, hparams.num_epochs):
# Run one training epoch
training_accuracy = self._run_training_loop(m, curr_epoch)
valid_accuracy = self.eval_child_model(
meval, self.data_loader, 'val')
tf.logging.info('Epoch: {} Valid Acc: {}'.format(
curr_epoch, valid_accuracy))
valid_accuracy, test_accuracy = self._compute_final_accuracies(
meval)
tf.logging.info(
'Train Acc: {} Valid Acc: {} Test Acc: {}'.format(
training_accuracy, valid_accuracy, test_accuracy))
@property
def saver(self):
return self._saver
@property
def session(self):
return self._session
@property
def num_trainable_params(self):
return self._num_trainable_params
def main(_):
if FLAGS.dataset not in ['cifar10', 'cifar100']:
raise ValueError('Invalid dataset: %s' % FLAGS.dataset)
hparams = tf.contrib.training.HParams(
train_size=50000,
validation_size=0,
eval_test=1,
dataset=FLAGS.dataset,
data_path=FLAGS.data_path,
batch_size=128,
gradient_clipping_by_global_norm=5.0)
if FLAGS.model_name == 'wrn':
hparams.add_hparam('model_name', 'wrn')
hparams.add_hparam('num_epochs', 200)
hparams.add_hparam('wrn_size', 160)
hparams.add_hparam('lr', 0.1)
hparams.add_hparam('weight_decay_rate', 5e-4)
elif FLAGS.model_name == 'shake_shake_32':
hparams.add_hparam('model_name', 'shake_shake')
hparams.add_hparam('num_epochs', 1800)
hparams.add_hparam('shake_shake_widen_factor', 2)
hparams.add_hparam('lr', 0.01)
hparams.add_hparam('weight_decay_rate', 0.001)
elif FLAGS.model_name == 'shake_shake_96':
hparams.add_hparam('model_name', 'shake_shake')
hparams.add_hparam('num_epochs', 1800)
hparams.add_hparam('shake_shake_widen_factor', 6)
hparams.add_hparam('lr', 0.01)
hparams.add_hparam('weight_decay_rate', 0.001)
elif FLAGS.model_name == 'shake_shake_112':
hparams.add_hparam('model_name', 'shake_shake')
hparams.add_hparam('num_epochs', 1800)
hparams.add_hparam('shake_shake_widen_factor', 7)
hparams.add_hparam('lr', 0.01)
hparams.add_hparam('weight_decay_rate', 0.001)
elif FLAGS.model_name == 'pyramid_net':
hparams.add_hparam('model_name', 'pyramid_net')
hparams.add_hparam('num_epochs', 1800)
hparams.add_hparam('lr', 0.05)
hparams.add_hparam('weight_decay_rate', 5e-5)
hparams.batch_size = 64
else:
    raise ValueError('Invalid model name: %s' % FLAGS.model_name)
cifar_trainer = CifarModelTrainer(hparams)
cifar_trainer.run_model()
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run()
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Builds the Wide-ResNet Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import custom_ops as ops
import numpy as np
import tensorflow as tf
def residual_block(
x, in_filter, out_filter, stride, activate_before_residual=False):
"""Adds residual connection to `x` in addition to applying BN->ReLU->3x3 Conv.
Args:
x: Tensor that is the output of the previous layer in the model.
in_filter: Number of filters `x` has.
out_filter: Number of filters that the output of this layer will have.
    stride: Integer that specifies what stride should be applied to `x`.
activate_before_residual: Boolean on whether a BN->ReLU should be applied
to x before the convolution is applied.
Returns:
A Tensor that is the result of applying two sequences of BN->ReLU->3x3 Conv
and then adding that Tensor to `x`.
"""
if activate_before_residual: # Pass up RELU and BN activation for resnet
with tf.variable_scope('shared_activation'):
x = ops.batch_norm(x, scope='init_bn')
x = tf.nn.relu(x)
orig_x = x
else:
orig_x = x
block_x = x
if not activate_before_residual:
with tf.variable_scope('residual_only_activation'):
block_x = ops.batch_norm(block_x, scope='init_bn')
block_x = tf.nn.relu(block_x)
with tf.variable_scope('sub1'):
block_x = ops.conv2d(
block_x, out_filter, 3, stride=stride, scope='conv1')
with tf.variable_scope('sub2'):
block_x = ops.batch_norm(block_x, scope='bn2')
block_x = tf.nn.relu(block_x)
block_x = ops.conv2d(
block_x, out_filter, 3, stride=1, scope='conv2')
with tf.variable_scope(
      'sub_add'):  # If the numbers of filters do not agree then zero-pad them
if in_filter != out_filter:
orig_x = ops.avg_pool(orig_x, stride, stride)
orig_x = ops.zero_pad(orig_x, in_filter, out_filter)
x = orig_x + block_x
return x
def _res_add(in_filter, out_filter, stride, x, orig_x):
"""Adds `x` with `orig_x`, both of which are layers in the model.
Args:
in_filter: Number of filters in `orig_x`.
out_filter: Number of filters in `x`.
    stride: Integer specifying the stride that should be applied to `orig_x`.
x: Tensor that is the output of the previous layer.
orig_x: Tensor that is the output of an earlier layer in the network.
Returns:
A Tensor that is the result of `x` and `orig_x` being added after
zero padding and striding are applied to `orig_x` to get the shapes
to match.
"""
if in_filter != out_filter:
orig_x = ops.avg_pool(orig_x, stride, stride)
orig_x = ops.zero_pad(orig_x, in_filter, out_filter)
x = x + orig_x
orig_x = x
return x, orig_x
def build_wrn_model(images, num_classes, wrn_size):
"""Builds the WRN model.
Build the Wide ResNet model from https://arxiv.org/abs/1605.07146.
Args:
images: Tensor of images that will be fed into the Wide ResNet Model.
    num_classes: Number of classes that the model needs to predict.
wrn_size: Parameter that scales the number of filters in the Wide ResNet
model.
Returns:
The logits of the Wide ResNet model.
"""
kernel_size = wrn_size
filter_size = 3
num_blocks_per_resnet = 4
filters = [
min(kernel_size, 16), kernel_size, kernel_size * 2, kernel_size * 4
]
strides = [1, 2, 2] # stride for each resblock
# Run the first conv
with tf.variable_scope('init'):
x = images
output_filters = filters[0]
x = ops.conv2d(x, output_filters, filter_size, scope='init_conv')
first_x = x # Res from the beginning
orig_x = x # Res from previous block
for block_num in range(1, 4):
with tf.variable_scope('unit_{}_0'.format(block_num)):
activate_before_residual = True if block_num == 1 else False
x = residual_block(
x,
filters[block_num - 1],
filters[block_num],
strides[block_num - 1],
activate_before_residual=activate_before_residual)
for i in range(1, num_blocks_per_resnet):
with tf.variable_scope('unit_{}_{}'.format(block_num, i)):
x = residual_block(
x,
filters[block_num],
filters[block_num],
1,
activate_before_residual=False)
x, orig_x = _res_add(filters[block_num - 1], filters[block_num],
strides[block_num - 1], x, orig_x)
final_stride_val = np.prod(strides)
x, _ = _res_add(filters[0], filters[3], final_stride_val, x, first_x)
with tf.variable_scope('unit_last'):
x = ops.batch_norm(x, scope='final_bn')
x = tf.nn.relu(x)
x = ops.global_avg_pool(x)
logits = ops.fc(x, num_classes)
return logits
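# A minimal usage sketch (not part of the original file): building the graph
# for CIFAR-10 with the wrn_size used by train_cifar.py, assuming `images` is
# a [batch, 32, 32, 3] float tensor on the default graph:
#
#   images = tf.placeholder(tf.float32, [128, 32, 32, 3])
#   logits = build_wrn_model(images, num_classes=10, wrn_size=160)
#   # logits has shape [128, 10]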
package(default_visibility = [":internal"])
licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
package_group(
name = "internal",
packages = [
"//cognitive_planning/...",
],
)
py_binary(
name = "train_supervised_active_vision",
srcs = [
"train_supervised_active_vision.py",
],
)
# cognitive_planning
**Visual Representations for Semantic Target Driven Navigation**
Arsalan Mousavian, Alexander Toshev, Marek Fiser, Jana Kosecka, James Davidson
This is the implementation of semantic target driven navigation training and evaluation on
the Active Vision Dataset.
ECCV Workshop on Visual Learning and Embodied Agents in Simulation Environments,
2018.
<div align="center">
<table style="width:100%" border="0">
<tr>
<td align="center"><img src='https://cs.gmu.edu/~amousavi/gifs/smaller_fridge_2.gif'></td>
<td align="center"><img src='https://cs.gmu.edu/~amousavi/gifs/smaller_tv_1.gif'></td>
</tr>
<tr>
<td align="center">Target: Fridge</td>
<td align="center">Target: Television</td>
</tr>
<tr>
<td align="center"><img src='https://cs.gmu.edu/~amousavi/gifs/smaller_microwave_1.gif'></td>
<td align="center"><img src='https://cs.gmu.edu/~amousavi/gifs/smaller_couch_1.gif'></td>
</tr>
<tr>
<td align="center">Target: Microwave</td>
<td align="center">Target: Couch</td>
</tr>
</table>
</div>
Paper: [https://arxiv.org/abs/1805.06066](https://arxiv.org/abs/1805.06066)
## 1. Installation
### Requirements
#### Python Packages
```shell
networkx
gin-config
```
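Both packages are available on PyPI and can be installed with pip, for example:
```shell
pip install networkx gin-config
```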
### Download cognitive_planning
```shell
git clone --depth 1 https://github.com/tensorflow/models.git
```
## 2. Datasets
### Download ActiveVision Dataset
We used the Active Vision Dataset (AVD), which can be downloaded from [here](http://cs.unc.edu/~ammirato/active_vision_dataset_website/). To make our code faster and reduce its memory footprint, we created the AVD Minimal dataset. AVD Minimal consists of low-resolution images from the original AVD dataset. In addition, we added annotations for target views, predicted object detections from an object detector pre-trained on the MS-COCO dataset, and predicted semantic segmentations from a model pre-trained on the NYU-v2 dataset. AVD Minimal can be downloaded from [here](https://storage.googleapis.com/active-vision-dataset/AVD_Minimal.zip). Set `$AVD_DIR` to the path of the downloaded AVD Minimal.
### TODO: SUNCG Dataset
The current version of the code does not support the SUNCG dataset. Support can be
added by implementing the necessary functions of `envs/task_env.py` using
publicly released SUNCG environments such as
[House3D](https://github.com/facebookresearch/House3D) and
[MINOS](https://github.com/minosworld/minos).
### ActiveVisionDataset Demo
If you wish to navigate the environment to see what the AVD looks like, you can use the following command:
```shell
python viz_active_vision_dataset_main.py \
--mode=human \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root=$AVD_DIR'
```
## 3. Training
Right now, the released version only supports training and inference using the real data from the Active Vision Dataset.
When the RGB image modality is used, the ResNet embeddings are initialized from a pre-trained checkpoint. Before starting training, download the pre-trained ResNet50 checkpoint and place it in the working directory at ./resnet_v2_50_checkpoint/resnet_v2_50.ckpt:
```shell
wget http://download.tensorflow.org/models/resnet_v2_50_2017_04_14.tar.gz
```
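The archive contains the `resnet_v2_50.ckpt` checkpoint; one way to place it where the training script expects it (a sketch, assuming the standard TF-Slim tarball layout):
```shell
mkdir -p resnet_v2_50_checkpoint
tar -xzf resnet_v2_50_2017_04_14.tar.gz -C resnet_v2_50_checkpoint
```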
### Run training
Use the following command for training:
```shell
# Train
python train_supervised_active_vision.py \
--mode='train' \
--logdir=$CHECKPOINT_DIR \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root=$AVD_DIR' \
--logtostderr
```
The training can be run for different modalities and modality combinations, including semantic segmentation, object detections, RGB images, and depth images. Low-resolution images, the outputs of detectors pre-trained on the COCO dataset, and semantic segmentations from a model pre-trained on the NYU dataset are provided as part of this distribution and can be found in the Meta directory of AVD_Minimal.
Additional details are described in the comments of the code and in the paper.
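For instance, training on a different single modality should only require changing the `--modality_types` flag in the training command above (a sketch; the modality names are taken from the list in `envs/active_vision_dataset_env.py`, and the exact flag syntax is assumed to match the command above):
```shell
# Same command as above, with the modality flag changed, e.g.:
#   --modality_types='sseg'    # semantic segmentation
#   --modality_types='depth'   # depth images
#   --modality_types='image'   # RGB images (uses the ResNet50 checkpoint above)
```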
### Run Evaluation
Use the following command to unroll the policy on the eval environments. The inference code periodically checks the checkpoint folder for new checkpoints and uses each new checkpoint to unroll the policy on the eval environments. After each evaluation, it creates a folder $CHECKPOINT_DIR/evals/$ITER, where $ITER is the iteration number at which the checkpoint was stored.
```shell
# Eval
python train_supervised_active_vision.py \
--mode='eval' \
--logdir=$CHECKPOINT_DIR \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root=$AVD_DIR' \
--logtostderr
```
At any point, you can run the following command to compute statistics, such as the success rate, over all the evaluations so far. It also generates GIF images of the best policy's unrolled episodes.
```shell
# Visualize and Compute Stats
python viz_active_vision_dataset_main.py \
--mode=eval \
--eval_folder=$CHECKPOINT_DIR/evals/ \
--output_folder=$OUTPUT_GIFS_FOLDER \
--gin_config=envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root=$AVD_DIR'
```
## Contact
To ask questions or report issues please open an issue on the tensorflow/models
[issues tracker](https://github.com/tensorflow/models/issues).
Please assign issues to @arsalan-mousavian.
## Reference
The details of the training and experiments can be found in the following paper. If you find our work useful in your research, please consider citing our paper:
```
@inproceedings{MousavianECCVW18,
author = {A. Mousavian and A. Toshev and M. Fiser and J. Kosecka and J. Davidson},
title = {Visual Representations for Semantic Target Driven Navigation},
booktitle = {ECCV Workshop on Visual Learning and Embodied Agents in Simulation Environments},
year = {2018},
}
```
python train_supervised_active_vision.py \
--mode='train' \
--logdir=/usr/local/google/home/kosecka/checkin_log_det/ \
--modality_types='det' \
--batch_size=8 \
--train_iters=200000 \
--lstm_cell_size=2048 \
--policy_fc_size=2048 \
--sequence_length=20 \
--max_eval_episode_length=100 \
--test_iters=194 \
--gin_config=robotics/cognitive_planning/envs/configs/active_vision_config.gin \
--gin_params='ActiveVisionDatasetEnv.dataset_root="/usr/local/google/home/kosecka/AVD_minimal/"' \
--logtostderr
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Interface for different embedders for modalities."""
import abc
import numpy as np
import tensorflow as tf
import preprocessing
from tensorflow.contrib.slim.nets import resnet_v2
slim = tf.contrib.slim
class Embedder(object):
"""Represents the embedder for different modalities.
Modalities can be semantic segmentation, depth channel, object detection and
so on, which require specific embedder for them.
"""
__metaclass__ = abc.ABCMeta
@abc.abstractmethod
def build(self, observation):
"""Builds the model to embed the observation modality.
Args:
observation: tensor that contains the raw observation from modality.
Returns:
Embedding tensor for the given observation tensor.
"""
raise NotImplementedError(
'Needs to be implemented as part of Embedder Interface')
class DetectionBoxEmbedder(Embedder):
"""Represents the model that encodes the detection boxes from images."""
def __init__(self, rnn_state_size, scope=None):
self._rnn_state_size = rnn_state_size
self._scope = scope
def build(self, observations):
"""Builds the model to embed object detection observations.
Args:
observations: a tuple of (dets, det_num).
dets is a tensor of BxTxLxE that has the detection boxes in all the
images of the batch. B is the batch size, T is the maximum length of
episode, L is the maximum number of detections per image in the batch
and E is the size of each detection embedding.
        det_num is a tensor of BxT that contains the number of detected boxes
          in each image of each sequence in the batch.
Returns:
For each image in the batch, returns the accumulative embedding of all the
detection boxes in that image.
"""
with tf.variable_scope(self._scope, default_name=''):
shape = observations[0].shape
dets = tf.reshape(observations[0], [-1, shape[-2], shape[-1]])
det_num = tf.reshape(observations[1], [-1])
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._rnn_state_size)
batch_size = tf.shape(dets)[0]
lstm_outputs, _ = tf.nn.dynamic_rnn(
cell=lstm_cell,
inputs=dets,
sequence_length=det_num,
initial_state=lstm_cell.zero_state(batch_size, dtype=tf.float32),
dtype=tf.float32)
# Gathering the last state of each sequence in the batch.
batch_range = tf.range(batch_size)
indices = tf.stack([batch_range, det_num - 1], axis=1)
last_lstm_outputs = tf.gather_nd(lstm_outputs, indices)
last_lstm_outputs = tf.reshape(last_lstm_outputs,
[-1, shape[1], self._rnn_state_size])
return last_lstm_outputs
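# A minimal usage sketch (not part of the original file), assuming `dets` is a
# float tensor of shape [batch, time, max_boxes, embed_size] with zero padding
# and `det_num` is an int32 tensor of shape [batch, time] with the box counts:
#
#   embedder = DetectionBoxEmbedder(rnn_state_size=256, scope='det_embedder')
#   det_embedding = embedder.build((dets, det_num))
#   # det_embedding has shape [batch, time, 256]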
class ResNet(Embedder):
"""Residual net embedder for image data."""
def __init__(self, params, *args, **kwargs):
super(ResNet, self).__init__(*args, **kwargs)
self._params = params
self._extra_train_ops = []
def build(self, images):
shape = images.get_shape().as_list()
if len(shape) == 5:
images = tf.reshape(images,
[shape[0] * shape[1], shape[2], shape[3], shape[4]])
embedding = self._build_model(images)
if len(shape) == 5:
embedding = tf.reshape(embedding, [shape[0], shape[1], -1])
return embedding
@property
def extra_train_ops(self):
return self._extra_train_ops
def _build_model(self, images):
"""Builds the model."""
# Convert images to floats and normalize them.
images = tf.to_float(images)
bs = images.get_shape().as_list()[0]
images = [
tf.image.per_image_standardization(tf.squeeze(i))
for i in tf.split(images, bs)
]
images = tf.concat([tf.expand_dims(i, axis=0) for i in images], axis=0)
with tf.variable_scope('init'):
x = self._conv('init_conv', images, 3, 3, 16, self._stride_arr(1))
strides = [1, 2, 2]
activate_before_residual = [True, False, False]
if self._params.use_bottleneck:
res_func = self._bottleneck_residual
filters = [16, 64, 128, 256]
else:
res_func = self._residual
filters = [16, 16, 32, 128]
with tf.variable_scope('unit_1_0'):
x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
activate_before_residual[0])
for i in xrange(1, self._params.num_residual_units):
with tf.variable_scope('unit_1_%d' % i):
x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)
with tf.variable_scope('unit_2_0'):
x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
activate_before_residual[1])
for i in xrange(1, self._params.num_residual_units):
with tf.variable_scope('unit_2_%d' % i):
x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)
with tf.variable_scope('unit_3_0'):
x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
activate_before_residual[2])
for i in xrange(1, self._params.num_residual_units):
with tf.variable_scope('unit_3_%d' % i):
x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)
with tf.variable_scope('unit_last'):
x = self._batch_norm('final_bn', x)
x = self._relu(x, self._params.relu_leakiness)
with tf.variable_scope('pool_logit'):
x = self._global_avg_pooling(x)
return x
def _stride_arr(self, stride):
return [1, stride, stride, 1]
def _batch_norm(self, name, x):
"""batch norm implementation."""
with tf.variable_scope(name):
params_shape = [x.shape[-1]]
beta = tf.get_variable(
'beta',
params_shape,
tf.float32,
initializer=tf.constant_initializer(0.0, tf.float32))
gamma = tf.get_variable(
'gamma',
params_shape,
tf.float32,
initializer=tf.constant_initializer(1.0, tf.float32))
if self._params.is_train:
mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')
moving_mean = tf.get_variable(
'moving_mean',
params_shape,
tf.float32,
initializer=tf.constant_initializer(0.0, tf.float32),
trainable=False)
moving_variance = tf.get_variable(
'moving_variance',
params_shape,
tf.float32,
initializer=tf.constant_initializer(1.0, tf.float32),
trainable=False)
self._extra_train_ops.append(
tf.assign_moving_average(moving_mean, mean, 0.9))
self._extra_train_ops.append(
tf.assign_moving_average(moving_variance, variance, 0.9))
else:
mean = tf.get_variable(
'moving_mean',
params_shape,
tf.float32,
initializer=tf.constant_initializer(0.0, tf.float32),
trainable=False)
variance = tf.get_variable(
'moving_variance',
params_shape,
tf.float32,
initializer=tf.constant_initializer(1.0, tf.float32),
trainable=False)
tf.summary.histogram(mean.op.name, mean)
tf.summary.histogram(variance.op.name, variance)
      # epsilon used to be 1e-5. Maybe 0.001 solves the NaN problem in deeper nets.
y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
y.set_shape(x.shape)
return y
def _residual(self,
x,
in_filter,
out_filter,
stride,
activate_before_residual=False):
"""Residual unit with 2 sub layers."""
if activate_before_residual:
with tf.variable_scope('shared_activation'):
x = self._batch_norm('init_bn', x)
x = self._relu(x, self._params.relu_leakiness)
orig_x = x
else:
with tf.variable_scope('residual_only_activation'):
orig_x = x
x = self._batch_norm('init_bn', x)
x = self._relu(x, self._params.relu_leakiness)
with tf.variable_scope('sub1'):
x = self._conv('conv1', x, 3, in_filter, out_filter, stride)
with tf.variable_scope('sub2'):
x = self._batch_norm('bn2', x)
x = self._relu(x, self._params.relu_leakiness)
x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])
with tf.variable_scope('sub_add'):
if in_filter != out_filter:
orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
orig_x = tf.pad(
orig_x, [[0, 0], [0, 0], [0, 0], [(out_filter - in_filter) // 2,
(out_filter - in_filter) // 2]])
x += orig_x
return x
def _bottleneck_residual(self,
x,
in_filter,
out_filter,
stride,
activate_before_residual=False):
"""A residual convolutional layer with a bottleneck.
The layer is a composite of three convolutional layers with a ReLU non-
linearity and batch normalization after each linear convolution. The depth
    of the second and third layers is out_filter / 4 (hence it is a bottleneck).
Args:
      x: a rank-4 float Tensor representing the input to the layer.
in_filter: a python integer representing depth of the input.
out_filter: a python integer representing depth of the output.
stride: a python integer denoting the stride of the layer applied before
the first convolution.
activate_before_residual: a python boolean. If True, then a ReLU is
applied as a first operation on the input x before everything else.
Returns:
      A rank-4 Tensor with batch size equal to that of the input, width and
      height equal to width / stride and height / stride of the input, and
      depth equal to out_filter.
"""
if activate_before_residual:
with tf.variable_scope('common_bn_relu'):
x = self._batch_norm('init_bn', x)
x = self._relu(x, self._params.relu_leakiness)
orig_x = x
else:
with tf.variable_scope('residual_bn_relu'):
orig_x = x
x = self._batch_norm('init_bn', x)
x = self._relu(x, self._params.relu_leakiness)
with tf.variable_scope('sub1'):
x = self._conv('conv1', x, 1, in_filter, out_filter / 4, stride)
with tf.variable_scope('sub2'):
x = self._batch_norm('bn2', x)
x = self._relu(x, self._params.relu_leakiness)
x = self._conv('conv2', x, 3, out_filter / 4, out_filter / 4,
[1, 1, 1, 1])
with tf.variable_scope('sub3'):
x = self._batch_norm('bn3', x)
x = self._relu(x, self._params.relu_leakiness)
x = self._conv('conv3', x, 1, out_filter / 4, out_filter, [1, 1, 1, 1])
with tf.variable_scope('sub_add'):
if in_filter != out_filter:
orig_x = self._conv('project', orig_x, 1, in_filter, out_filter, stride)
x += orig_x
return x
def _decay(self):
costs = []
for var in tf.trainable_variables():
if var.op.name.find(r'DW') > 0:
costs.append(tf.nn.l2_loss(var))
    return tf.multiply(self._params.weight_decay_rate, tf.add_n(costs))
def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
"""Convolution."""
with tf.variable_scope(name):
n = filter_size * filter_size * out_filters
kernel = tf.get_variable(
'DW', [filter_size, filter_size, in_filters, out_filters],
tf.float32,
initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / n)))
return tf.nn.conv2d(x, kernel, strides, padding='SAME')
def _relu(self, x, leakiness=0.0):
return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
def _fully_connected(self, x, out_dim):
x = tf.reshape(x, [self._params.batch_size, -1])
w = tf.get_variable(
'DW', [x.get_shape()[1], out_dim],
initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
b = tf.get_variable(
'biases', [out_dim], initializer=tf.constant_initializer())
return tf.nn.xw_plus_b(x, w, b)
def _global_avg_pooling(self, x):
assert x.get_shape().ndims == 4
return tf.reduce_mean(x, [1, 2])
class MLPEmbedder(Embedder):
"""Embedder of vectorial data.
The net is a multi-layer perceptron, with ReLU nonlinearities in all layers
except the last one.
"""
def __init__(self, layers, *args, **kwargs):
"""Constructs MLPEmbedder.
Args:
layers: a list of python integers representing layer sizes.
*args: arguments for super constructor.
**kwargs: keyed arguments for super constructor.
"""
super(MLPEmbedder, self).__init__(*args, **kwargs)
self._layers = layers
def build(self, features):
shape = features.get_shape().as_list()
if len(shape) == 3:
features = tf.reshape(features, [shape[0] * shape[1], shape[2]])
x = features
for i, dim in enumerate(self._layers):
with tf.variable_scope('layer_%i' % i):
x = self._fully_connected(x, dim)
if i < len(self._layers) - 1:
x = self._relu(x)
if len(shape) == 3:
x = tf.reshape(x, shape[:-1] + [self._layers[-1]])
return x
def _fully_connected(self, x, out_dim):
w = tf.get_variable(
'DW', [x.get_shape()[1], out_dim],
initializer=tf.variance_scaling_initializer(distribution='uniform'))
b = tf.get_variable(
'biases', [out_dim], initializer=tf.constant_initializer())
return tf.nn.xw_plus_b(x, w, b)
def _relu(self, x, leakiness=0.0):
return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
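# A minimal usage sketch (not part of the original file): a two-layer MLP that
# maps [batch, time, feature] (or [batch, feature]) inputs to 128-d embeddings,
# with a ReLU after every layer except the last:
#
#   mlp = MLPEmbedder(layers=[256, 128])
#   embedding = mlp.build(features)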
class SmallNetworkEmbedder(Embedder):
"""Embedder for image like observations.
The network is comprised of multiple conv layers and a fully connected layer
at the end. The number of conv layers and the parameters are configured from
params.
"""
def __init__(self, params, *args, **kwargs):
"""Constructs the small network.
Args:
params: params should be tf.hparams type. params need to have a list of
conv_sizes, conv_strides, conv_channels. The length of these lists
should be equal to each other and to the number of conv layers in the
network. Plus, it also needs to have boolean variable named to_one_hot
which indicates whether the input should be converted to one hot or not.
The size of the fully connected layer is specified by
params.embedding_size.
*args: The rest of the parameters.
      **kwargs: the rest of the parameters.
Raises:
ValueError: If the length of params.conv_strides, params.conv_sizes, and
params.conv_channels are not equal.
"""
super(SmallNetworkEmbedder, self).__init__(*args, **kwargs)
self._params = params
if len(self._params.conv_sizes) != len(self._params.conv_strides):
raise ValueError(
'Conv sizes and strides should have the same length: {} != {}'.format(
len(self._params.conv_sizes), len(self._params.conv_strides)))
if len(self._params.conv_sizes) != len(self._params.conv_channels):
raise ValueError(
'Conv sizes and channels should have the same length: {} != {}'.
format(len(self._params.conv_sizes), len(self._params.conv_channels)))
def build(self, images):
"""Builds the embedder with the given speicifcation.
Args:
images: a tensor that contains the input images which has the shape of
NxTxHxWxC where N is the batch size, T is the maximum length of the
sequence, H and W are the height and width of the images and C is the
number of channels.
Returns:
A tensor that is the embedding of the images.
"""
shape = images.get_shape().as_list()
images = tf.reshape(images,
[shape[0] * shape[1], shape[2], shape[3], shape[4]])
with slim.arg_scope(
[slim.conv2d, slim.fully_connected],
activation_fn=tf.nn.relu,
weights_regularizer=slim.l2_regularizer(self._params.weight_decay_rate),
biases_initializer=tf.zeros_initializer()):
with slim.arg_scope([slim.conv2d], padding='SAME'):
# convert the image to one hot if needed.
if self._params.to_one_hot:
net = tf.one_hot(
tf.squeeze(tf.to_int32(images), axis=[-1]),
self._params.one_hot_length)
else:
net = images
p = self._params
# Adding conv layers with the specified configurations.
for conv_id, kernel_stride_channel in enumerate(
zip(p.conv_sizes, p.conv_strides, p.conv_channels)):
kernel_size, stride, channels = kernel_stride_channel
net = slim.conv2d(
net,
channels, [kernel_size, kernel_size],
stride,
scope='conv_{}'.format(conv_id + 1))
net = slim.flatten(net)
net = slim.fully_connected(net, self._params.embedding_size, scope='fc')
output = tf.reshape(net, [shape[0], shape[1], -1])
return output
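# A minimal construction sketch (not part of the original file), with the
# params passed as tf.contrib.training.HParams as described in the docstring:
#
#   params = tf.contrib.training.HParams(
#       conv_sizes=[3, 3], conv_strides=[2, 2], conv_channels=[16, 32],
#       to_one_hot=False, one_hot_length=0, embedding_size=128,
#       weight_decay_rate=1e-4)
#   embedder = SmallNetworkEmbedder(params)
#   # embedder.build(images) takes [N, T, H, W, C] and returns [N, T, 128].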
class ResNet50Embedder(Embedder):
"""Uses ResNet50 to embed input images."""
def build(self, images):
"""Builds a ResNet50 embedder for the input images.
It assumes that the range of the pixel values in the images tensor is
[0,255] and should be castable to tf.uint8.
Args:
images: a tensor that contains the input images which has the shape of
NxTxHxWx3 where N is the batch size, T is the maximum length of the
sequence, H and W are the height and width of the images and C is the
number of channels.
Returns:
The embedding of the input image with the shape of NxTxL where L is the
embedding size of the output.
Raises:
ValueError: if the shape of the input does not agree with the expected
shape explained in the Args section.
"""
shape = images.get_shape().as_list()
if len(shape) != 5:
raise ValueError(
'The tensor shape should have 5 elements, {} is provided'.format(
len(shape)))
if shape[4] != 3:
raise ValueError('Three channels are expected for the input image')
images = tf.cast(images, tf.uint8)
images = tf.reshape(images,
[shape[0] * shape[1], shape[2], shape[3], shape[4]])
with slim.arg_scope(resnet_v2.resnet_arg_scope()):
def preprocess_fn(x):
x = tf.expand_dims(x, 0)
x = tf.image.resize_bilinear(x, [299, 299],
align_corners=False)
return(tf.squeeze(x, [0]))
images = tf.map_fn(preprocess_fn, images, dtype=tf.float32)
net, _ = resnet_v2.resnet_v2_50(
images, is_training=False, global_pool=True)
output = tf.reshape(net, [shape[0], shape[1], -1])
return output
class IdentityEmbedder(Embedder):
"""This embedder just returns the input as the output.
  Used for modalities where the embedding of the modality is the same as the
  modality itself. For example, it can be used for the one_hot goal.
"""
def build(self, images):
return images
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gym environment for the ActiveVision Dataset.
The dataset is captured with a robot moving around and taking pictures in
multiple directions. The actions are moving in four directions, and rotating
clockwise or counterclockwise. The observations are the output of vision
pipelines such as object detectors. The goal is to find objects of interest
in each environment. For more details, refer:
http://cs.unc.edu/~ammirato/active_vision_dataset_website/.
"""
import tensorflow as tf
import collections
import copy
import json
import os
from StringIO import StringIO
import time
import gym
from gym.envs.registration import register
import gym.spaces
import networkx as nx
import numpy as np
import scipy.io as sio
from absl import logging
import gin
import cv2
import label_map_util
import visualization_utils as vis_util
from envs import task_env
register(
id='active-vision-env-v0',
entry_point=
'cognitive_planning.envs.active_vision_dataset_env:ActiveVisionDatasetEnv', # pylint: disable=line-too-long
)
_MAX_DEPTH_VALUE = 12102
SUPPORTED_ACTIONS = [
'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop'
]
SUPPORTED_MODALITIES = [
task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
task_env.ModalityTypes.DEPTH,
task_env.ModalityTypes.OBJECT_DETECTION,
task_env.ModalityTypes.IMAGE,
task_env.ModalityTypes.GOAL,
task_env.ModalityTypes.PREV_ACTION,
task_env.ModalityTypes.DISTANCE,
]
# Data structure for storing the information related to the graph of the world.
_Graph = collections.namedtuple('_Graph', [
'graph', 'id_to_index', 'index_to_id', 'target_indexes', 'distance_to_goal'
])
def _init_category_index(label_map_path):
"""Creates category index from class indexes to name of the classes.
Args:
label_map_path: path to the mapping.
Returns:
A map for mapping int keys to string categories.
"""
label_map = label_map_util.load_labelmap(label_map_path)
num_classes = np.max(x.id for x in label_map.item)
categories = label_map_util.convert_label_map_to_categories(
label_map, max_num_classes=num_classes, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
return category_index
def _draw_detections(image_np, detections, category_index):
"""Draws detections on to the image.
Args:
image_np: Image in the form of uint8 numpy array.
detections: a dictionary that contains the detection outputs.
category_index: contains the mapping between indexes and the category names.
Returns:
Does not return anything but draws the boxes on the
"""
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
detections['detection_boxes'],
detections['detection_classes'],
detections['detection_scores'],
category_index,
use_normalized_coordinates=True,
max_boxes_to_draw=1000,
min_score_thresh=.0,
agnostic_mode=False)
def generate_detection_image(detections,
image_size,
category_map,
num_classes,
is_binary=True):
"""Generates one_hot vector of the image using the detection boxes.
Args:
detections: 2D object detections from the image. It's a dictionary that
contains detection_boxes, detection_classes, and detection_scores with
dimensions of nx4, nx1, nx1 where n is the number of detections.
image_size: The resolution of the output image.
category_map: dictionary that maps label names to index.
num_classes: Number of classes.
is_binary: If true, it sets the corresponding channels to 0 and 1.
Otherwise, sets the score in the corresponding channel.
Returns:
Returns image_size x image_size x num_classes image for the detection boxes.
"""
res = np.zeros((image_size, image_size, num_classes), dtype=np.float32)
boxes = detections['detection_boxes']
labels = detections['detection_classes']
scores = detections['detection_scores']
for box, label, score in zip(boxes, labels, scores):
transformed_boxes = [int(round(t)) for t in box * image_size]
y1, x1, y2, x2 = transformed_boxes
    # The detector returns a fixed number of detections. Boxes with zero area
    # do not correspond to any real detection, so we skip them.
if (y2 - y1) * (x2 - x1) == 0:
continue
assert category_map[label] < num_classes, 'label = {}'.format(label)
value = score
if is_binary:
value = 1
res[y1:y2, x1:x2, category_map[label]] = value
return res
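# A small worked sketch (not part of the original file): one high-scoring
# detection of class 1 covering the top-left quadrant of a 10x10 output, with
# a hypothetical category_map of {1: 0, 2: 1}:
#
#   dets = {'detection_boxes': np.array([[0.0, 0.0, 0.5, 0.5]]),
#           'detection_classes': np.array([1]),
#           'detection_scores': np.array([0.9])}
#   out = generate_detection_image(dets, 10, {1: 0, 2: 1}, num_classes=2)
#   # out[:5, :5, 0] is all ones; every other entry is zero.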
def _get_detection_path(root, detection_folder_name, world):
return os.path.join(root, 'Meta', detection_folder_name, world + '.npy')
def _get_image_folder(root, world):
return os.path.join(root, world, 'jpg_rgb')
def _get_json_path(root, world):
return os.path.join(root, world, 'annotations.json')
def _get_image_path(root, world, image_id):
return os.path.join(_get_image_folder(root, world), image_id + '.jpg')
def _get_image_list(path, worlds):
"""Builds a dictionary for all the worlds.
Args:
path: the path to the dataset on cns.
worlds: list of the worlds.
Returns:
    dictionary where the keys are the world names and the values
    are the image_ids of that world.
"""
world_id_dict = {}
for loc in worlds:
files = [t[:-4] for t in tf.gfile.ListDir(_get_image_folder(path, loc))]
world_id_dict[loc] = files
return world_id_dict
def read_all_poses(dataset_root, world):
"""Reads all the poses for each world.
Args:
dataset_root: the path to the root of the dataset.
world: string, name of the world.
Returns:
    Dictionary of poses for all the images in each world. The key is the image
    id of each view and the value is a tuple of (x, z, R, scale), where x and z
    are the first and third coordinates of the translation, R is the 3x3
    rotation matrix, and scale is a float scalar by which x and z need to be
    multiplied in order to get the real-world coordinates.
Raises:
    ValueError: if the number of images does not match the number of poses read.
"""
path = os.path.join(dataset_root, world, 'image_structs.mat')
with tf.gfile.Open(path) as f:
data = sio.loadmat(f)
xyz = data['image_structs']['world_pos']
image_names = data['image_structs']['image_name'][0]
rot = data['image_structs']['R'][0]
scale = data['scale'][0][0]
n = xyz.shape[1]
x = [xyz[0][i][0][0] for i in range(n)]
z = [xyz[0][i][2][0] for i in range(n)]
names = [name[0][:-4] for name in image_names]
if len(names) != len(x):
raise ValueError('number of image names are not equal to the number of '
'poses {} != {}'.format(len(names), len(x)))
output = {}
for i in range(n):
if rot[i].shape[0] != 0:
assert rot[i].shape[0] == 3
assert rot[i].shape[1] == 3
output[names[i]] = (x[i], z[i], rot[i], scale)
else:
output[names[i]] = (x[i], z[i], None, scale)
return output
def read_cached_data(should_load_images, dataset_root, segmentation_file_name,
targets_file_name, output_size):
"""Reads all the necessary cached data.
Args:
should_load_images: whether to load the images or not.
dataset_root: path to the root of the dataset.
segmentation_file_name: The name of the file that contains semantic
segmentation annotations.
    targets_file_name: The name of the file that contains targets annotated for
each world.
output_size: Size of the output images. This is used for pre-processing the
loaded images.
Returns:
Dictionary of all the cached data.
"""
load_start = time.time()
result_data = {}
annotated_target_path = os.path.join(dataset_root, 'Meta',
targets_file_name + '.npy')
logging.info('loading targets: %s', annotated_target_path)
with tf.gfile.Open(annotated_target_path) as f:
result_data['targets'] = np.load(f).item()
depth_image_path = os.path.join(dataset_root, 'Meta/depth_imgs.npy')
logging.info('loading depth: %s', depth_image_path)
with tf.gfile.Open(depth_image_path) as f:
depth_data = np.load(f).item()
logging.info('processing depth')
for home_id in depth_data:
images = depth_data[home_id]
for image_id in images:
depth = images[image_id]
depth = cv2.resize(
depth / _MAX_DEPTH_VALUE, (output_size, output_size),
interpolation=cv2.INTER_NEAREST)
depth_mask = (depth > 0).astype(np.float32)
depth = np.dstack((depth, depth_mask))
images[image_id] = depth
result_data[task_env.ModalityTypes.DEPTH] = depth_data
sseg_path = os.path.join(dataset_root, 'Meta',
segmentation_file_name + '.npy')
logging.info('loading sseg: %s', sseg_path)
with tf.gfile.Open(sseg_path) as f:
sseg_data = np.load(f).item()
logging.info('processing sseg')
for home_id in sseg_data:
images = sseg_data[home_id]
for image_id in images:
sseg = images[image_id]
sseg = cv2.resize(
sseg, (output_size, output_size), interpolation=cv2.INTER_NEAREST)
images[image_id] = np.expand_dims(sseg, axis=-1).astype(np.float32)
result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = sseg_data
if should_load_images:
image_path = os.path.join(dataset_root, 'Meta/imgs.npy')
logging.info('loading imgs: %s', image_path)
with tf.gfile.Open(image_path) as f:
image_data = np.load(f).item()
result_data[task_env.ModalityTypes.IMAGE] = image_data
with tf.gfile.Open(os.path.join(dataset_root, 'Meta/world_id_dict.npy')) as f:
result_data['world_id_dict'] = np.load(f).item()
  logging.info('loading done in %f seconds', time.time() - load_start)
return result_data
@gin.configurable
def get_spec_dtype_map():
return {gym.spaces.Box: np.float32}
@gin.configurable
class ActiveVisionDatasetEnv(task_env.TaskEnv):
"""Simulates the environment from ActiveVisionDataset."""
cached_data = None
def __init__(
self,
episode_length,
modality_types,
confidence_threshold,
output_size,
worlds,
targets,
compute_distance,
should_draw_detections,
dataset_root,
labelmap_path,
reward_collision,
reward_goal_range,
num_detection_classes,
segmentation_file_name,
detection_folder_name,
actions,
targets_file_name,
eval_init_points_file_name=None,
shaped_reward=False,
):
"""Instantiates the environment for ActiveVision Dataset.
Args:
episode_length: the length of each episode.
modality_types: a list of the strings where each entry indicates the name
of the modalities to be loaded. Valid entries are "sseg", "det",
"depth", "image", "distance", and "prev_action". "distance" should be
used for computing metrics in tf agents.
confidence_threshold: Consider detections more than confidence_threshold
for potential targets.
output_size: Resolution of the output image.
worlds: List of the name of the worlds.
targets: List of the target names. Each entry is a string label of the
target category (e.g. 'fridge', 'microwave', so on).
compute_distance: If True, outputs the distance of the view to the goal.
should_draw_detections (bool): If True, the image returned for the
        observation will contain the bounding boxes.
dataset_root: the path to the root folder of the dataset.
labelmap_path: path to the dictionary that converts label strings to
indexes.
reward_collision: the reward the agents get after hitting an obstacle.
It should be a non-positive number.
reward_goal_range: the number of steps from goal, such that the agent is
considered to have reached the goal. If the agent's distance is less
        than the specified goal range, the episode also finishes by setting
done = True.
num_detection_classes: number of classes that detector outputs.
segmentation_file_name: the name of the file that contains the semantic
information. The file should be in the dataset_root/Meta/ folder.
detection_folder_name: Name of the folder that contains the detections
for each world. The folder should be under dataset_root/Meta/ folder.
actions: The list of the action names. Valid entries are listed in
SUPPORTED_ACTIONS.
targets_file_name: the name of the file that contains the annotated
        targets. The file should be in the dataset_root/Meta/ folder.
eval_init_points_file_name: The name of the file that contains the initial
points for evaluating the performance of the agent. If set to None,
episodes start at random locations. Should be only set for evaluation.
shaped_reward: Whether to add delta goal distance to the reward each step.
Raises:
      ValueError: If one of the targets is not available in the annotated
targets or the modality names are not from the domain specified above.
ValueError: If one of the actions is not in SUPPORTED_ACTIONS.
ValueError: If the reward_collision is a positive number.
ValueError: If there is no action other than stop provided.
"""
if reward_collision > 0:
raise ValueError('"reward" for collision should be non positive')
if reward_goal_range < 0:
      logging.warning('environment does not terminate the episode even if the '
                      'agent is close to the goal')
if not modality_types:
raise ValueError('modality names can not be empty')
for name in modality_types:
if name not in SUPPORTED_MODALITIES:
raise ValueError('invalid modality type: {}'.format(name))
actions_other_than_stop_found = False
for a in actions:
if a != 'stop':
actions_other_than_stop_found = True
if a not in SUPPORTED_ACTIONS:
raise ValueError('invalid action %s', a)
if not actions_other_than_stop_found:
raise ValueError('environment needs to have actions other than stop.')
super(ActiveVisionDatasetEnv, self).__init__()
self._episode_length = episode_length
self._modality_types = set(modality_types)
self._confidence_threshold = confidence_threshold
self._output_size = output_size
self._dataset_root = dataset_root
self._worlds = worlds
self._targets = targets
self._all_graph = {}
for world in self._worlds:
with tf.gfile.Open(_get_json_path(self._dataset_root, world), 'r') as f:
file_content = f.read()
file_content = file_content.replace('.jpg', '')
io = StringIO(file_content)
self._all_graph[world] = json.load(io)
self._cur_world = ''
self._cur_image_id = ''
self._cur_graph = None # Loaded by _update_graph
self._steps_taken = 0
self._last_action_success = True
self._category_index = _init_category_index(labelmap_path)
self._category_map = dict(
[(c, i) for i, c in enumerate(self._category_index)])
self._detection_cache = {}
if not ActiveVisionDatasetEnv.cached_data:
ActiveVisionDatasetEnv.cached_data = read_cached_data(
True, self._dataset_root, segmentation_file_name, targets_file_name,
self._output_size)
cached_data = ActiveVisionDatasetEnv.cached_data
self._world_id_dict = cached_data['world_id_dict']
self._depth_images = cached_data[task_env.ModalityTypes.DEPTH]
self._semantic_segmentations = cached_data[
task_env.ModalityTypes.SEMANTIC_SEGMENTATION]
self._annotated_targets = cached_data['targets']
self._cached_imgs = cached_data[task_env.ModalityTypes.IMAGE]
self._graph_cache = {}
self._compute_distance = compute_distance
self._should_draw_detections = should_draw_detections
self._reward_collision = reward_collision
self._reward_goal_range = reward_goal_range
self._num_detection_classes = num_detection_classes
self._actions = actions
self._detection_folder_name = detection_folder_name
self._shaped_reward = shaped_reward
self._eval_init_points = None
if eval_init_points_file_name is not None:
self._eval_init_index = 0
init_points_path = os.path.join(self._dataset_root, 'Meta',
eval_init_points_file_name + '.npy')
with tf.gfile.Open(init_points_path) as points_file:
data = np.load(points_file).item()
self._eval_init_points = []
for world in self._worlds:
for goal in self._targets:
if world in self._annotated_targets[goal]:
for image_id in data[world]:
self._eval_init_points.append((world, image_id[0], goal))
logging.info('loaded %d eval init points', len(self._eval_init_points))
self.action_space = gym.spaces.Discrete(len(self._actions))
obs_shapes = {}
if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
obs_shapes[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = gym.spaces.Box(
low=0, high=255, shape=(self._output_size, self._output_size, 1))
if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
obs_shapes[task_env.ModalityTypes.OBJECT_DETECTION] = gym.spaces.Box(
low=0,
high=255,
shape=(self._output_size, self._output_size,
self._num_detection_classes))
if task_env.ModalityTypes.DEPTH in self._modality_types:
obs_shapes[task_env.ModalityTypes.DEPTH] = gym.spaces.Box(
low=0,
high=_MAX_DEPTH_VALUE,
shape=(self._output_size, self._output_size, 2))
if task_env.ModalityTypes.IMAGE in self._modality_types:
obs_shapes[task_env.ModalityTypes.IMAGE] = gym.spaces.Box(
low=0, high=255, shape=(self._output_size, self._output_size, 3))
if task_env.ModalityTypes.GOAL in self._modality_types:
obs_shapes[task_env.ModalityTypes.GOAL] = gym.spaces.Box(
low=0, high=1., shape=(len(self._targets),))
if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
obs_shapes[task_env.ModalityTypes.PREV_ACTION] = gym.spaces.Box(
low=0, high=1., shape=(len(self._actions) + 1,))
if task_env.ModalityTypes.DISTANCE in self._modality_types:
obs_shapes[task_env.ModalityTypes.DISTANCE] = gym.spaces.Box(
low=0, high=255, shape=(1,))
self.observation_space = gym.spaces.Dict(obs_shapes)
self._prev_action = np.zeros((len(self._actions) + 1), dtype=np.float32)
# Loading all the poses.
all_poses = {}
for world in self._worlds:
all_poses[world] = read_all_poses(self._dataset_root, world)
self._cached_poses = all_poses
self._vertex_to_pose = {}
self._pose_to_vertex = {}
@property
def actions(self):
"""Returns list of actions for the env."""
return self._actions
def _next_image(self, image_id, action):
"""Given the action, returns the name of the image that agent ends up in.
Args:
image_id: The image id of the current view.
action: valid actions are ['right', 'rotate_cw', 'rotate_ccw',
'forward', 'left']. Each rotation is 30 degrees.
Returns:
The image name for the next location of the agent. If the action results
in collision or it is not possible for the agent to execute that action,
returns empty string.
"""
assert action in self._actions, 'invalid action : {}'.format(action)
assert self._cur_world in self._all_graph, 'invalid world {}'.format(
self._cur_world)
assert image_id in self._all_graph[
self._cur_world], 'image_id {} is not in {}'.format(
image_id, self._cur_world)
return self._all_graph[self._cur_world][image_id][action]
def _largest_detection_for_image(self, image_id, detections_dict):
"""Assigns area of the largest box for the view with given image id.
Args:
image_id: Image id of the view.
detections_dict: Detections for the view.
"""
for cls, box, score in zip(detections_dict['detection_classes'],
detections_dict['detection_boxes'],
detections_dict['detection_scores']):
if cls not in self._targets:
continue
if score < self._confidence_threshold:
continue
ymin, xmin, ymax, xmax = box
area = (ymax - ymin) * (xmax - xmin)
if abs(area) < 1e-5:
continue
if image_id not in self._detection_area:
self._detection_area[image_id] = area
else:
self._detection_area[image_id] = max(self._detection_area[image_id],
area)
def _compute_goal_indexes(self):
"""Computes the goal indexes for the environment.
Returns:
      The indexes of the goals that are closest to the target categories. A
      vertex is a goal vertex if the desired objects are detected in its image
      and the target categories are not seen by moving forward from that
      vertex.
"""
for image_id in self._world_id_dict[self._cur_world]:
detections_dict = self._detection_table[image_id]
self._largest_detection_for_image(image_id, detections_dict)
goal_indexes = []
for image_id in self._world_id_dict[self._cur_world]:
if image_id not in self._detection_area:
continue
      # Skip views whose largest detection box is not large enough.
if self._detection_area[image_id] < 0.01:
continue
ok = True
next_image_id = self._next_image(image_id, 'forward')
if next_image_id:
if next_image_id in self._detection_area:
ok = False
if ok:
goal_indexes.append(self._cur_graph.id_to_index[image_id])
return goal_indexes
def to_image_id(self, vid):
"""Converts vertex id to the image id.
Args:
vid: vertex id of the view.
Returns:
image id of the input vertex id.
"""
return self._cur_graph.index_to_id[vid]
def to_vertex(self, image_id):
return self._cur_graph.id_to_index[image_id]
def observation(self, view_pose):
"""Returns the observation at the given the vertex.
Args:
view_pose: pose of the view of interest.
Returns:
Observation at the given view point.
Raises:
ValueError: if the given view pose is not similar to any of the poses in
the current world.
"""
vertex = self.pose_to_vertex(view_pose)
if vertex is None:
      raise ValueError('The given view pose is not close enough to any of the '
                       'poses in the environment.')
image_id = self._cur_graph.index_to_id[vertex]
output = collections.OrderedDict()
if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
output[task_env.ModalityTypes.
SEMANTIC_SEGMENTATION] = self._semantic_segmentations[
self._cur_world][image_id]
detection = None
need_det = (
task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types or
(task_env.ModalityTypes.IMAGE in self._modality_types and
self._should_draw_detections))
if need_det:
detection = self._detection_table[image_id]
detection_image = generate_detection_image(
detection,
self._output_size,
self._category_map,
num_classes=self._num_detection_classes)
if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
output[task_env.ModalityTypes.OBJECT_DETECTION] = detection_image
if task_env.ModalityTypes.DEPTH in self._modality_types:
output[task_env.ModalityTypes.DEPTH] = self._depth_images[
self._cur_world][image_id]
if task_env.ModalityTypes.IMAGE in self._modality_types:
output_img = self._cached_imgs[self._cur_world][image_id]
if self._should_draw_detections:
output_img = output_img.copy()
_draw_detections(output_img, detection, self._category_index)
output[task_env.ModalityTypes.IMAGE] = output_img
if task_env.ModalityTypes.GOAL in self._modality_types:
goal = np.zeros((len(self._targets),), dtype=np.float32)
goal[self._targets.index(self._cur_goal)] = 1.
output[task_env.ModalityTypes.GOAL] = goal
if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
output[task_env.ModalityTypes.PREV_ACTION] = self._prev_action
if task_env.ModalityTypes.DISTANCE in self._modality_types:
output[task_env.ModalityTypes.DISTANCE] = np.asarray(
[self.gt_value(self._cur_goal, vertex)], dtype=np.float32)
return output
def _step_no_reward(self, action):
"""Performs a step in the environment with given action.
Args:
action: Action that is used to step in the environment. Action can be
string or integer. If the type is integer then it uses the ith element
from self._actions list. Otherwise, uses the string value as the action.
Returns:
observation, done, info
        observation: dictionary that contains all the observations specified in
modality_types.
observation[task_env.ModalityTypes.OBJECT_DETECTION]: contains the
detection of the current view.
observation[task_env.ModalityTypes.IMAGE]: contains the
image of the current view. Note that if using the images for training,
should_load_images should be set to false.
observation[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]: contains the
semantic segmentation of the current view.
observation[task_env.ModalityTypes.DEPTH]: If selected, returns the
depth map for the current view.
observation[task_env.ModalityTypes.PREV_ACTION]: If selected, returns
a numpy of (action_size + 1,). The first action_size elements indicate
the action and the last element indicates whether the previous action
was successful or not.
done: True after episode_length steps have been taken, False otherwise.
info: Empty dictionary.
Raises:
ValueError: for invalid actions.
"""
# Primarily used for gym interface.
if not isinstance(action, str):
if not self.action_space.contains(action):
        raise ValueError('Not a valid action: %d', action)
action = self._actions[action]
if action not in self._actions:
raise ValueError('Not a valid action: %s', action)
action_index = self._actions.index(action)
if action == 'stop':
next_image_id = self._cur_image_id
done = True
success = True
else:
next_image_id = self._next_image(self._cur_image_id, action)
self._steps_taken += 1
done = False
success = True
if not next_image_id:
success = False
else:
self._cur_image_id = next_image_id
if self._steps_taken >= self._episode_length:
done = True
cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
observation = self.observation(self.vertex_to_pose(cur_vertex))
# Concatenation of the one-hot previous action + a binary flag for the
# success of the previous action.
self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
self._prev_action[action_index] = 1.
self._prev_action[-1] = float(success)
distance_to_goal = self.gt_value(self._cur_goal, cur_vertex)
if success:
if distance_to_goal <= self._reward_goal_range:
done = True
return observation, done, {'success': success}
@property
def graph(self):
return self._cur_graph.graph
def state(self):
return self.vertex_to_pose(self.to_vertex(self._cur_image_id))
def gt_value(self, goal, v):
"""Computes the distance to the goal from vertex v.
Args:
goal: name of the goal.
v: vertex id.
Returns:
Minimum number of steps to the given goal.
"""
assert goal in self._cur_graph.distance_to_goal, 'goal: {}'.format(goal)
assert v in self._cur_graph.distance_to_goal[goal]
res = self._cur_graph.distance_to_goal[goal][v]
return res
def _update_graph(self):
"""Creates the graph for each environment and updates the _cur_graph."""
if self._cur_world not in self._graph_cache:
graph = nx.DiGraph()
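# Each node corresponds to a view (image id); directed edges are labeled
# with the action that moves the agent from one view to the adjacent one.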
id_to_index = {}
index_to_id = {}
image_list = self._world_id_dict[self._cur_world]
for i, image_id in enumerate(image_list):
id_to_index[image_id] = i
index_to_id[i] = image_id
graph.add_node(i)
for image_id in image_list:
for action in self._actions:
if action == 'stop':
continue
next_image = self._all_graph[self._cur_world][image_id][action]
if next_image:
graph.add_edge(
id_to_index[image_id], id_to_index[next_image], action=action)
target_indexes = {}
number_of_nodes_without_targets = graph.number_of_nodes()
distance_to_goal = {}
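# For each goal category, add a 'super source' node connected to every view
# annotated with that goal. Shortest-path distances to this node then give
# the distance to the nearest instance of the goal.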
for goal in self._targets:
if self._cur_world not in self._annotated_targets[goal]:
continue
goal_indexes = [
id_to_index[i]
for i in self._annotated_targets[goal][self._cur_world]
if i
]
super_source_index = graph.number_of_nodes()
target_indexes[goal] = super_source_index
graph.add_node(super_source_index)
index_to_id[super_source_index] = goal
id_to_index[goal] = super_source_index
for v in goal_indexes:
graph.add_edge(v, super_source_index, action='stop')
graph.add_edge(super_source_index, v, action='stop')
distance_to_goal[goal] = {}
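# The node list returned by shortest_path includes both v and the super
# source, so subtracting 2 leaves the number of action steps to the closest
# annotated goal view.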
for v in range(number_of_nodes_without_targets):
distance_to_goal[goal][v] = len(
nx.shortest_path(graph, v, super_source_index)) - 2
self._graph_cache[self._cur_world] = _Graph(
graph, id_to_index, index_to_id, target_indexes, distance_to_goal)
self._cur_graph = self._graph_cache[self._cur_world]
def reset_for_eval(self, new_world, new_goal, new_image_id):
"""Resets to the given goal and image_id."""
return self._reset_env(new_world=new_world, new_goal=new_goal, new_image_id=new_image_id)
def get_init_config(self, path):
"""Exposes the initial state of the agent for the given path.
Args:
path: sequences of the vertexes that the agent moves.
Returns:
image_id of the first view, world, and the goal.
"""
return self._cur_graph.index_to_id[path[0]], self._cur_world, self._cur_goal
def _reset_env(
self,
new_world=None,
new_goal=None,
new_image_id=None,
):
"""Resets the agent in a random world and random id.
Args:
new_world: If not None, sets the new world to new_world.
new_goal: If not None, sets the new goal to new_goal.
new_image_id: If not None, sets the first image id to new_image_id.
Returns:
observation: dictionary of the observations. Content of the observation
is similar to that of the step function.
Raises:
ValueError: if it can't find a world and annotated goal.
"""
self._steps_taken = 0
# The first prev_action is a special all-zero vector + success=1.
self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
self._prev_action[len(self._actions)] = 1.
if self._eval_init_points is not None:
if self._eval_init_index >= len(self._eval_init_points):
self._eval_init_index = 0
a = self._eval_init_points[self._eval_init_index]
self._cur_world, self._cur_image_id, self._cur_goal = a
self._eval_init_index += 1
elif not new_world:
attempts = 100
found = False
while attempts >= 0:
attempts -= 1
self._cur_goal = np.random.choice(self._targets)
available_worlds = list(
set(self._annotated_targets[self._cur_goal].keys()).intersection(
set(self._worlds)))
if available_worlds:
found = True
break
if not found:
raise ValueError('could not find a world that has a target annotated')
self._cur_world = np.random.choice(available_worlds)
else:
self._cur_world = new_world
self._cur_goal = new_goal
if new_world not in self._annotated_targets[new_goal]:
return None
self._cur_goal_index = self._targets.index(self._cur_goal)
if new_image_id:
self._cur_image_id = new_image_id
else:
self._cur_image_id = np.random.choice(
self._world_id_dict[self._cur_world])
if self._cur_world not in self._detection_cache:
with tf.gfile.Open(
_get_detection_path(self._dataset_root, self._detection_folder_name,
self._cur_world)) as f:
# Each file contains a dictionary with image ids as keys and detection
# dicts as values.
self._detection_cache[self._cur_world] = np.load(f).item()
self._detection_table = self._detection_cache[self._cur_world]
self._detection_area = {}
self._update_graph()
if self._cur_world not in self._vertex_to_pose:
# Add a fake pose for the super node of each target category.
self._vertex_to_pose[self._cur_world] = {
index: (-index,) for index in self._cur_graph.target_indexes.values()
}
# Calling vertex_to_pose for each vertex fills out the
# dictionaries that contain pose related data.
for image_id in self._world_id_dict[self._cur_world]:
self.vertex_to_pose(self.to_vertex(image_id))
# Filling out pose_to_vertex from vertex_to_pose.
self._pose_to_vertex[self._cur_world] = {
tuple(v): k
for k, v in self._vertex_to_pose[self._cur_world].iteritems()
}
cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
observation = self.observation(self.vertex_to_pose(cur_vertex))
return observation
def cur_vertex(self):
return self._cur_graph.id_to_index[self._cur_image_id]
def cur_image_id(self):
return self._cur_image_id
def path_to_goal(self, image_id=None):
"""Returns the path from image_id to the self._cur_goal.
Args:
image_id: If set to None, computes the path from the current view.
Otherwise, sets the current view to the given image_id.
Returns:
The path to the goal.
Raises:
Exception if there's no path from the view to the goal.
"""
if image_id is None:
image_id = self._cur_image_id
super_source = self._cur_graph.target_indexes[self._cur_goal]
try:
path = nx.shortest_path(self._cur_graph.graph,
self._cur_graph.id_to_index[image_id],
super_source)
except:
print 'path not found, world =', self._cur_world, 'image_id =', self._cur_image_id
raise
return path[:-1]
def targets(self):
return [self.vertex_to_pose(self._cur_graph.target_indexes[self._cur_goal])]
def vertex_to_pose(self, v):
"""Returns pose of the view for a given vertex.
Args:
v: integer, vertex index.
Returns:
(x, z, dir_x, dir_z) where x and z are the translation and dir_x, dir_z are
a vector giving direction of the view.
"""
if v in self._vertex_to_pose[self._cur_world]:
return np.copy(self._vertex_to_pose[self._cur_world][v])
x, z, rot, scale = self._cached_poses[self._cur_world][self.to_image_id(
v)]
if rot is None: # if rotation is not provided for the given vertex.
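# Fall back to a degenerate pose; the vertex index is appended, presumably
# to keep the pose unique so that pose_to_vertex can still invert it.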
self._vertex_to_pose[self._cur_world][v] = np.asarray(
[x * scale, z * scale, v])
return np.copy(self._vertex_to_pose[self._cur_world][v])
# Multiply rotation matrix by [0,0,1] to get a vector of length 1 in the
# direction of the ray.
direction = np.zeros((3, 1), dtype=np.float32)
direction[2][0] = 1
direction = np.matmul(np.transpose(rot), direction)
direction = [direction[0][0], direction[2][0]]
self._vertex_to_pose[self._cur_world][v] = np.asarray(
[x * scale, z * scale, direction[0], direction[1]])
return np.copy(self._vertex_to_pose[self._cur_world][v])
def pose_to_vertex(self, pose):
"""Returns the vertex id for the given pose."""
if tuple(pose) not in self._pose_to_vertex[self._cur_world]:
raise ValueError(
'The given pose is not present in the dictionary: {}'.format(
tuple(pose)))
return self._pose_to_vertex[self._cur_world][tuple(pose)]
def check_scene_graph(self, world, goal):
"""Checks the connectivity of the scene graph.
Goes over all the views and computes the shortest path to the goal. If the
shortest-path computation raises an exception, the graph is not connected;
otherwise, the env graph is fine.
Args:
world: the string name of the world.
goal: the string label for the goal.
Returns:
True if the goal is not annotated in the given world; None otherwise.
"""
obs = self._reset_env(new_world=world, new_goal=goal)
if not obs:
print '{} is not available in {}'.format(goal, world)
return True
for image_id in self._world_id_dict[self._cur_world]:
print 'check image_id = {}'.format(image_id)
self._cur_image_id = image_id
path = self.path_to_goal()
actions = []
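# Reconstructing the action sequence along the path raises a ValueError if
# consecutive views are not connected by a single action, which flags a
# broken scene graph.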
for i in range(len(path) - 2):
actions.append(self.action(path[i], path[i + 1]))
actions.append('stop')
@property
def goal_one_hot(self):
res = np.zeros((len(self._targets),), dtype=np.float32)
res[self._cur_goal_index] = 1.
return res
@property
def goal_index(self):
return self._cur_goal_index
@property
def goal_string(self):
return self._cur_goal
@property
def worlds(self):
return self._worlds
@property
def possible_targets(self):
return self._targets
def action(self, from_pose, to_pose):
"""Returns the action that takes source vertex to destination vertex.
Args:
from_pose: pose of the source.
to_pose: pose of the destination.
Returns:
Returns the index of the action.
Raises:
ValueError: If it is not possible to go from the first vertex to the second
vertex with one action.
"""
from_index = self.pose_to_vertex(from_pose)
to_index = self.pose_to_vertex(to_pose)
if to_index not in self.graph[from_index]:
from_image_id = self.to_image_id(from_index)
to_image_id = self.to_image_id(to_index)
raise ValueError('{},{} is not connected to {},{}'.format(
from_index, from_image_id, to_index, to_image_id))
return self._actions.index(self.graph[from_index][to_index]['action'])
def random_step_sequence(self, min_len=None, max_len=None):
"""Generates random step sequence that takes agent to the goal.
Args:
min_len: integer, minimum length of a step sequence. Not yet implemented.
max_len: integer, maximum number of steps to take; the returned path,
actions, states and step_outputs are capped at max_len entries.
Returns:
Tuple of (path, actions, states, step_outputs).
path: a random path from a random starting point and random environment.
actions: actions of the returned path.
states: viewpoints of all the states in between.
step_outputs: list of step() return tuples.
Raises:
ValueError: if max_len is None or less than 1, or if min_len is not None
(min_len is not yet implemented).
"""
if max_len is None:
raise ValueError('max_len cannot be None.')
if max_len < 1:
raise ValueError('max_len must be greater than or equal to 1.')
if min_len is not None:
raise ValueError('min_len is not yet implemented.')
path = []
actions = []
states = []
step_outputs = []
obs = self.reset()
last_obs_tuple = [obs, 0, False, {}]
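# Seed with a dummy step tuple so that step_outputs[i] holds the observation
# seen before actions[i] is taken.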
for _ in xrange(max_len):
action = np.random.choice(self._actions)
# We don't want to sample stop action because stop does not add new
# information.
while action == 'stop':
action = np.random.choice(self._actions)
path.append(self.to_vertex(self._cur_image_id))
onehot = np.zeros((len(self._actions),), dtype=np.float32)
onehot[self._actions.index(action)] = 1.
actions.append(onehot)
states.append(self.vertex_to_pose(path[-1]))
step_outputs.append(copy.deepcopy(last_obs_tuple))
last_obs_tuple = self.step(action)
return path, actions, states, step_outputs
#-*-Python-*-
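# Gin configuration bindings for ActiveVisionDatasetEnv (presumably parsed
# via gin, e.g. gin.parse_config_file; the exact entry point is an assumption).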
ActiveVisionDatasetEnv.episode_length = 200
ActiveVisionDatasetEnv.actions = [
'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop'
]
ActiveVisionDatasetEnv.confidence_threshold = 0.5
ActiveVisionDatasetEnv.output_size = 64
ActiveVisionDatasetEnv.worlds = [
'Home_001_1', 'Home_001_2', 'Home_002_1', 'Home_003_1', 'Home_003_2',
'Home_004_1', 'Home_004_2', 'Home_005_1', 'Home_005_2', 'Home_006_1',
'Home_007_1', 'Home_010_1', 'Home_011_1', 'Home_013_1', 'Home_014_1',
'Home_014_2', 'Home_015_1', 'Home_016_1'
]
ActiveVisionDatasetEnv.targets = [
'tv', 'dining_table', 'fridge', 'microwave', 'couch'
]
ActiveVisionDatasetEnv.compute_distance = False
ActiveVisionDatasetEnv.should_draw_detections = False
ActiveVisionDatasetEnv.dataset_root = '/usr/local/google/home/kosecka/AVD_Minimal/'
ActiveVisionDatasetEnv.labelmap_path = 'label_map.txt'
ActiveVisionDatasetEnv.reward_collision = 0
ActiveVisionDatasetEnv.reward_goal_range = 2
ActiveVisionDatasetEnv.num_detection_classes = 90
ActiveVisionDatasetEnv.segmentation_file_name='sseg_crf'
ActiveVisionDatasetEnv.detection_folder_name='Detections'
ActiveVisionDatasetEnv.targets_file_name='annotated_targets'
ActiveVisionDatasetEnv.shaped_reward=False
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""An interface representing the topology of an environment.
Allows for high level planning and high level instruction generation for
navigation tasks.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
import enum
import gym
import gin
@gin.config.constants_from_enum
class ModalityTypes(enum.Enum):
"""Types of the modalities that can be used."""
IMAGE = 0
SEMANTIC_SEGMENTATION = 1
OBJECT_DETECTION = 2
DEPTH = 3
GOAL = 4
PREV_ACTION = 5
PREV_SUCCESS = 6
STATE = 7
DISTANCE = 8
CAN_STEP = 9
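# An explicit ordering lets modality types be sorted deterministically (for
# example, when iterating over observation dictionaries in a fixed order).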
def __lt__(self, other):
if self.__class__ is other.__class__:
return self.value < other.value
return NotImplemented
class TaskEnvInterface(object):
"""Interface for an environment topology.
An environment can implement this interface if there is a topological graph
underlying this environment. All paths below are defined as paths in this
graph. Using path_to_actions function one can translate a topological path
to a geometric path in the environment.
"""
__metaclass__ = abc.ABCMeta
@abc.abstractmethod
def random_step_sequence(self, min_len=None, max_len=None):
"""Generates a random sequence of actions and executes them.
Args:
min_len: integer, minimum length of a step sequence.
max_len: integer, if set to non-None, the method returns only the first
max_len steps of a random sequence. If the environment is computationally
heavy, this argument should be set to speed up training and avoid
unnecessary computations by the environment.
Returns:
A path, defined as a list of vertex indices, a list of actions, a list of
states, and a list of step() return tuples.
"""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
@abc.abstractmethod
def targets(self):
"""A list of targets in the environment.
Returns:
A list of target locations.
"""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
@abc.abstractproperty
def state(self):
"""Returns the position for the current location of agent."""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
@abc.abstractproperty
def graph(self):
"""Returns a graph representing the environment topology.
Returns:
nx.Graph object.
"""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
@abc.abstractmethod
def vertex_to_pose(self, vertex_index):
"""Maps a vertex index to a pose in the environment.
Pose of the camera can be represented by (x,y,theta) or (x,y,z,theta).
Args:
vertex_index: index of a vertex in the topology graph.
Returns:
A np.array of floats of size 3 or 4 representing the pose of the vertex.
"""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
@abc.abstractmethod
def pose_to_vertex(self, pose):
"""Maps a coordinate in the maze to the closest vertex in topology graph.
Args:
pose: np.array of floats containing the pose of the view.
Returns:
index of a vertex.
"""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
@abc.abstractmethod
def observation(self, state):
"""Returns observation at location xy and orientation theta.
Args:
state: a np.array of floats containing coordinates of a location and
orientation.
Returns:
Dictionary of observations in the case of multiple observations. The keys
are the modality names and the values are the np.array of float
observations for the corresponding modality.
"""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
def action(self, init_state, final_state):
"""Computes the transition action from state1 to state2.
If the environment is discrete and the views are not adjacent in the
environment. i.e. it is not possible to move from the first view to the
second view with one action it should return None. In the continuous case,
it will be the continuous difference of first view and second view.
Args:
init_state: numpy array, the initial view of the agent.
final_state: numpy array, the final view of the agent.
"""
raise NotImplementedError(
'Needs implementation as part of EnvTopology interface.')
@gin.configurable
class TaskEnv(gym.Env, TaskEnvInterface):
"""An environment which uses a Task to compute reward.
The environment implements a gym interface, as well as EnvTopology. The
former makes sure it can be used within an RL training, while the latter
makes sure it can be used by a Task.
This environment requires _step_no_reward to be implemented, which steps
through it but does not return reward. Instead, the reward calculation is
delegated to the Task object, which in return can access needed properties
of the environment. These properties are exposed via the EnvTopology
interface.
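Example usage (a sketch, assuming a hypothetical concrete subclass
`MyTaskEnv` and a hypothetical `my_task` object implementing the task's
reward interface):
  env = MyTaskEnv(task=my_task)
  obs = env.reset()
  obs, reward, done, info = env.step(action)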
"""
def __init__(self, task=None):
self._task = task
def set_task(self, task):
self._task = task
@abc.abstractmethod
def _step_no_reward(self, action):
"""Same as _step without returning reward.
Args:
action: see _step.
Returns:
state, done, info as defined in _step.
"""
raise NotImplementedError('Implement step.')
@abc.abstractmethod
def _reset_env(self):
"""Resets the environment. Returns initial observation."""
raise NotImplementedError('Implement _reset. Must call super!')
def step(self, action):
obs, done, info = self._step_no_reward(action)
reward = 0.0
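# Without a task attached the environment is reward-free; otherwise the task
# computes the reward and may post-process the observation, done flag and info.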
if self._task is not None:
obs, reward, done, info = self._task.reward(obs, done, info)
return obs, reward, done, info
def reset(self):
"""Resets the environment. Gym API."""
obs = self._reset_env()
if self._task is not None:
self._task.reset(obs)
return obs