
# Deep Bayesian Bandits Library
This library corresponds to the *[Deep Bayesian Bandits Showdown: An Empirical
Comparison of Bayesian Deep Networks for Thompson
Sampling](https://arxiv.org/abs/1802.09127)* paper, published in
[ICLR](https://iclr.cc/) 2018. We provide a benchmark to test decision-making
algorithms for contextual bandits. In particular, the current library implements
algorithms for contextual-bandits. In particular, the current library implements
a variety of algorithms (many of them based on approximate Bayesian Neural
Networks and Thompson sampling), and a number of real and synthetic data
problems exhibiting a diverse set of properties.
It is a Python library that uses [TensorFlow](https://www.tensorflow.org/).
We encourage contributors to add new approximate Bayesian Neural Networks or,
more generally, contextual bandit algorithms to the library. Also, we would
like to extend the data sources over time, so we warmly encourage contributions
on this front too!
Please use the following when citing the code or the paper:
```
@article{riquelme2018deep,
  title={Deep Bayesian Bandits Showdown: An Empirical Comparison of Bayesian Deep Networks for Thompson Sampling},
  author={Riquelme, Carlos and Tucker, George and Snoek, Jasper},
  journal={International Conference on Learning Representations, ICLR},
  year={2018}
}
```
**Contact**. This repository is maintained by [Carlos Riquelme](http://rikel.me) ([rikel](https://github.com/rikel)). Feel free to reach out directly at [rikel@google.com](mailto:rikel@google.com) with any questions or comments.
We first briefly introduce contextual bandits and Thompson sampling, then
enumerate the implemented algorithms and the available data sources. Finally,
we provide a simple, complete example illustrating how to use the library.
## Contextual Bandits
Contextual bandits are a rich decision-making framework where an algorithm has
to choose among a set of *k* actions at every time step *t*, after observing
a context (or side-information) denoted by *X<sub>t</sub>*. The general pseudocode for
the process when we use algorithm **A** is as follows:
```
At time t = 1, ..., T:
1. Observe new context: X_t
2. Choose action: a_t = A.action(X_t)
3. Observe reward: r_t
4. Update internal state of the algorithm: A.update((X_t, a_t, r_t))
```
The goal is to maximize the total sum of rewards: ∑<sub>t</sub> r<sub>t</sub>.
For example, each *X<sub>t</sub>* could encode the properties of a specific user (and
the time or day), and we may have to choose an ad, discount coupon, treatment,
hyper-parameters, or version of a website to show or provide to the user.
Hopefully, over time, we will learn how to match each type of user to the most
beneficial personalized action under some metric (the reward).
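The loop above translates directly into code. Here is a minimal Python sketch;
the `action`/`update` interface mirrors the pseudocode and is purely
illustrative, not necessarily the exact API of the algorithms in this library:
```
def run_contextual_bandit(algo, contexts, reward_fn):
    """Generic contextual bandit loop.

    algo: any object exposing .action(context) and .update(context, action, reward).
    contexts: sequence of T contexts X_1, ..., X_T.
    reward_fn: maps (t, action) to the reward observed at step t.
    """
    total_reward = 0.0
    for t, x_t in enumerate(contexts):
        a_t = algo.action(x_t)        # 2. choose an action
        r_t = reward_fn(t, a_t)       # 3. observe the reward
        algo.update(x_t, a_t, r_t)    # 4. update the algorithm's state
        total_reward += r_t           # objective: maximize the sum of rewards
    return total_reward
```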
## Thompson Sampling
Thompson Sampling is a meta-algorithm that chooses an action for the contextual
bandit in a statistically efficient manner, simultaneously finding the best arm
while attempting to incur low cost. Informally speaking, we assume the expected
reward of each action given the context is governed by some unknown function.
Thompson Sampling maintains a posterior distribution over plausible such
functions; at each time step it samples one function from the posterior, and
plays the action that is best according to the sampled function.
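As a concrete (non-contextual) illustration of the idea, here is a toy NumPy
sketch of Thompson Sampling for Bernoulli rewards, where the posterior over
each arm's mean reward is a Beta distribution. This example is ours, for
intuition only, and is not part of the library:
```
import numpy as np

def bernoulli_thompson_sampling(arm_probs, num_steps, seed=0):
    """Toy Thompson Sampling with a Beta(1, 1) prior on each arm's mean reward."""
    rng = np.random.default_rng(seed)
    k = len(arm_probs)
    alpha = np.ones(k)  # Beta posterior parameters: 1 + observed successes
    beta = np.ones(k)   # Beta posterior parameters: 1 + observed failures
    total_reward = 0.0
    for _ in range(num_steps):
        # Sample one plausible mean reward per arm from the current posterior...
        sampled_means = rng.beta(alpha, beta)
        # ...and play the arm that looks best under the sample.
        a = int(np.argmax(sampled_means))
        r = float(rng.random() < arm_probs[a])  # Bernoulli reward
        alpha[a] += r
        beta[a] += 1.0 - r
        total_reward += r
    return total_reward

# Arms with unknown success probabilities; the sampler should find the best one.
print(bernoulli_thompson_sampling([0.1, 0.5, 0.7], num_steps=1000))
```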
8.  **Bootstrapped Networks**. This algorithm trains **q** neural networks
    simultaneously and in parallel, based on different datasets D<sub>1</sub>, ..., D<sub>q</sub>.
    These datasets are collected by adding each new datapoint
    (X<sub>t</sub>, a<sub>t</sub>, r<sub>t</sub>) to each dataset *D<sub>i</sub>*
    independently with probability p ∈ (0, 1]. Therefore, the main
    hyperparameters of the algorithm are **(q, p)**. In order to choose an
    action for a new context, one of the **q** networks is first selected with
    uniform probability (i.e., *1/q*). Then, the best action according to the
    *selected* network is played.

    See [Deep Exploration via Bootstrapped
    DQN](https://arxiv.org/abs/1602.04621).

    The algorithm is implemented in *bootstrapped_bnn_sampling.py*, and we
    instantiate it as sketched below, where *my_hparams* contains both **q**
    and **p**.
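    The import path and class name in this sketch are assumed from the file's
    naming; check *bootstrapped_bnn_sampling.py* for the exact constructor:

    ```
    from bandits.algorithms.bootstrapped_bnn_sampling import BootstrappedBNNSampling

    # my_hparams must define q (number of networks) and p (the probability
    # that each new datapoint is added to each of the q datasets).
    bootstrapped_nn = BootstrappedBNNSampling('MyBootstrappedNN', my_hparams)
    ```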
5.  **Adult data**. The Adult Dataset
    comprises personal information from the US Census Bureau database, and the
    standard prediction task is to determine whether a person makes over 50K a
    year or not. However, we consider the *k = 14* different occupations as
    feasible actions, based on *d = 94* covariates (many of them binarized).
    As in previous datasets, the agent obtains a reward of 1 for making the
    right prediction, and 0 otherwise. The total number of contexts is *n =
    45222*. Data is available [here](https://storage.googleapis.com/bandits_datasets/adult.full) or alternatively
    [here](https://archive.ics.uci.edu/ml/datasets/adult); use the *adult.data*
    file.
6.  **Census data**. The US Census (1990) Dataset (Asuncion & Newman, 2007)
    contains a number of personal features (age, native language, education...)
    which we summarize in *d = 389* covariates, including binary dummy
    variables for categorical features. Our goal again is to predict the
    occupation of the individual among *k = 9* classes. The agent obtains
    reward 1 for making the right prediction, and 0 otherwise. Data is available
    [here](https://storage.googleapis.com/bandits_datasets/USCensus1990.data.txt) or alternatively [here](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+\(1990\)); use the
    *USCensus1990.data.txt* file.
7.  **Covertype data**. The Covertype Dataset (Asuncion & Newman, 2007)
    classifies the cover type of northern Colorado forest areas in *k = 7*
    classes, based on *d = 54* features, including elevation, slope, aspect,
    and soil type. Again, the agent obtains reward 1 if the correct class is
    selected, and 0 otherwise. Data is available [here](https://storage.googleapis.com/bandits_datasets/covtype.data) or alternatively
    [here](https://archive.ics.uci.edu/ml/datasets/covertype); use the
    *covtype.data* file.
In datasets 4-7, each feature is normalized first.
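As a rough illustration of this preprocessing (our own sketch, not the
library's code; the file name matches the Covertype download above, and the
exact preprocessing in the library may differ):
```
import numpy as np
import pandas as pd

# Covertype: 54 features followed by the class label in the last column.
data = pd.read_csv('covtype.data', header=None)
contexts = data.iloc[:, :-1].to_numpy(dtype=np.float64)
labels = data.iloc[:, -1].to_numpy()

# Normalize each feature (column) to zero mean and unit variance.
mean, std = contexts.mean(axis=0), contexts.std(axis=0)
std[std == 0.0] = 1.0  # guard against constant columns
contexts = (contexts - mean) / std
```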
## Usage: Basic Example
This library requires TensorFlow, NumPy, and Pandas.
The file *example_main.py* provides a complete example of how to use the
library. We run the code:
```
python example_main.py
```
**Do not forget to** configure the paths to the data files at the top of *example_main.py*.
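As a rough illustration, that configuration looks something like the following
(the variable names here are an assumption, not necessarily those in
*example_main.py*; adapt them to wherever you stored the downloaded files):
```
import os

# Hypothetical illustration: point the library at your local copies of the
# datasets downloaded from the links above.
base_route = os.getcwd()
data_route = 'contextual_bandits/datasets'
```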
For example, we can run the Mushroom bandit for 2000 contexts on a few