Added new model.

5a3c97b9 · Carlos Riquelme · ae8e0f53 · 5a3c97b9 · 5a3c97b9 · 5a3c97b9
Commit 5a3c97b9 authored Jul 23, 2018 by Carlos Riquelme
20 changed files
--- a/research/deep_contextual_bandits/bandits/algorithms/multitask_gp.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/multitask_gp.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A Multitask Gaussian process."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+from absl import logging
+
+import numpy as np
+import tensorflow as tf
+from bandits.core.bayesian_nn import BayesianNN
+
+FLAGS = flags.FLAGS
+tfd = tf.contrib.distributions
+
+class MultitaskGP(BayesianNN):
+  """Implements a Gaussian process with multi-task outputs.
+
+  Optimizes the hyperparameters over the log marginal likelihood.
+  Uses a Matern 3/2 + linear covariance and returns
+  sampled predictions for test inputs.  The outputs are optionally
+  correlated where the correlation structure is learned through latent
+  embeddings of the tasks.
+  """
+
+  def __init__(self, hparams):
+    self.name = "MultiTaskGP"
+    self.hparams = hparams
+
+    self.n_in = self.hparams.context_dim
+    self.n_out = self.hparams.num_outputs
+    self.keep_fixed_after_max_obs = self.hparams.keep_fixed_after_max_obs
+
+    self._show_training = self.hparams.show_training
+    self._freq_summary = self.hparams.freq_summary
+
+    # Dimensionality of the latent task vectors
+    self.task_latent_dim = self.hparams.task_latent_dim
+
+    # Maximum number of observations to include
+    self.max_num_points = self.hparams.max_num_points
+
+    if self.hparams.learn_embeddings:
+      self.learn_embeddings = self.hparams.learn_embeddings
+    else:
+      self.learn_embeddings = False
+
+    # create the graph corresponding to the BNN instance
+    self.graph = tf.Graph()
+    with self.graph.as_default():
+      # store a new session for the graph
+      self.sess = tf.Session()
+
+      with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
+        self.n = tf.placeholder(shape=[], dtype=tf.float64)
+        self.x = tf.placeholder(shape=[None, self.n_in], dtype=tf.float64)
+        self.x_in = tf.placeholder(shape=[None, self.n_in], dtype=tf.float64)
+        self.y = tf.placeholder(shape=[None, self.n_out], dtype=tf.float64)
+        self.weights = tf.placeholder(shape=[None, self.n_out],
+                                      dtype=tf.float64)
+
+        self.build_model()
+      self.sess.run(tf.global_variables_initializer())
+
+  def atleast_2d(self, x, dims):
+    return tf.reshape(tf.expand_dims(x, axis=0), (-1, dims))
+
+  def sq_dist(self, x, x2):
+    a2 = tf.reduce_sum(tf.square(x), 1)
+    b2 = tf.reduce_sum(tf.square(x2), 1)
+    sqdists = tf.expand_dims(a2, 1) + b2 - 2.0 * tf.matmul(x, tf.transpose(x2))
+    return sqdists
+
+  # Covariance between outputs
+  def task_cov(self, x, x2):
+    """Squared Exponential Covariance Kernel over latent task embeddings."""
+    # Index into latent task vectors
+    x_vecs = tf.gather(self.task_vectors, tf.argmax(x, axis=1), axis=0)
+    x2_vecs = tf.gather(self.task_vectors, tf.argmax(x2, axis=1), axis=0)
+    r = self.sq_dist(self.atleast_2d(x_vecs, self.task_latent_dim),
+                     self.atleast_2d(x2_vecs, self.task_latent_dim))
+    return tf.exp(-r)
+
+  def cov(self, x, x2):
+    """Matern 3/2 + Linear Gaussian Process Covariance Function."""
+    ls = tf.clip_by_value(self.length_scales, -5.0, 5.0)
+    ls_lin = tf.clip_by_value(self.length_scales_lin, -5.0, 5.0)
+    r = self.sq_dist(self.atleast_2d(x, self.n_in)/tf.nn.softplus(ls),
+                     self.atleast_2d(x2, self.n_in)/tf.nn.softplus(ls))
+    r = tf.clip_by_value(r, 0, 1e8)
+
+    # Matern 3/2 Covariance
+    matern = (1.0 + tf.sqrt(3.0*r + 1e-16)) * tf.exp(-tf.sqrt(3.0*r + 1e-16))
+    # Linear Covariance
+    lin = tf.matmul(x / tf.nn.softplus(ls_lin),
+                    x2 / tf.nn.softplus(ls_lin), transpose_b=True)
+    return (tf.nn.softplus(self.amplitude) * matern +
+            tf.nn.softplus(self.amplitude_linear) * lin)
+
+  def build_model(self):
+    """Defines the GP model.
+
+    The loss is computed for partial feedback settings (bandits), so only
+    the observed outcome is backpropagated (see weighted loss).
+    Selects the optimizer and, finally, it also initializes the graph.
+    """
+
+    logging.info("Initializing model %s.", self.name)
+    self.global_step = tf.train.get_or_create_global_step()
+
+    # Define state for the model (inputs, etc.)
+    self.x_train = tf.get_variable(
+        "training_data",
+        initializer=tf.ones(
+            [self.hparams.batch_size, self.n_in], dtype=tf.float64),
+        validate_shape=False,
+        trainable=False)
+    self.y_train = tf.get_variable(
+        "training_labels",
+        initializer=tf.zeros([self.hparams.batch_size, 1], dtype=tf.float64),
+        validate_shape=False,
+        trainable=False)
+    self.weights_train = tf.get_variable(
+        "weights_train",
+        initializer=tf.ones(
+            [self.hparams.batch_size, self.n_out], dtype=tf.float64),
+        validate_shape=False,
+        trainable=False)
+    self.input_op = tf.assign(self.x_train, self.x_in, validate_shape=False)
+    self.input_w_op = tf.assign(
+        self.weights_train, self.weights, validate_shape=False)
+
+    self.input_std = tf.get_variable(
+        "data_standard_deviation",
+        initializer=tf.ones([1, self.n_out], dtype=tf.float64),
+        dtype=tf.float64,
+        trainable=False)
+    self.input_mean = tf.get_variable(
+        "data_mean",
+        initializer=tf.zeros([1, self.n_out], dtype=tf.float64),
+        dtype=tf.float64,
+        trainable=True)
+
+    # GP Hyperparameters
+    self.noise = tf.get_variable(
+        "noise", initializer=tf.cast(0.0, dtype=tf.float64))
+    self.amplitude = tf.get_variable(
+        "amplitude", initializer=tf.cast(1.0, dtype=tf.float64))
+    self.amplitude_linear = tf.get_variable(
+        "linear_amplitude", initializer=tf.cast(1.0, dtype=tf.float64))
+    self.length_scales = tf.get_variable(
+        "length_scales", initializer=tf.zeros([1, self.n_in], dtype=tf.float64))
+    self.length_scales_lin = tf.get_variable(
+        "length_scales_linear",
+        initializer=tf.zeros([1, self.n_in], dtype=tf.float64))
+
+    # Latent embeddings of the different outputs for task covariance
+    self.task_vectors = tf.get_variable(
+        "latent_task_vectors",
+        initializer=tf.random_normal(
+            [self.n_out, self.task_latent_dim], dtype=tf.float64))
+
+    # Normalize outputs across each dimension
+    # Since we have different numbers of observations across each task, we
+    # normalize by their respective counts.
+    index_counts = self.atleast_2d(tf.reduce_sum(self.weights, axis=0),
+                                   self.n_out)
+    index_counts = tf.where(index_counts > 0, index_counts,
+                            tf.ones(tf.shape(index_counts), dtype=tf.float64))
+    self.mean_op = tf.assign(self.input_mean,
+                             tf.reduce_sum(self.y, axis=0) / index_counts)
+    self.var_op = tf.assign(
+        self.input_std, tf.sqrt(1e-4 + tf.reduce_sum(tf.square(
+            self.y - tf.reduce_sum(self.y, axis=0) / index_counts), axis=0)
+                                / index_counts))
+
+    with tf.control_dependencies([self.var_op]):
+      y_normed = self.atleast_2d(
+          (self.y - self.input_mean) / self.input_std, self.n_out)
+      y_normed = self.atleast_2d(tf.boolean_mask(y_normed, self.weights > 0), 1)
+    self.out_op = tf.assign(self.y_train, y_normed, validate_shape=False)
+
+    # Observation noise
+    alpha = tf.nn.softplus(self.noise) + 1e-6
+
+    # Covariance
+    with tf.control_dependencies([self.input_op, self.input_w_op, self.out_op]):
+      self.self_cov = (self.cov(self.x_in, self.x_in) *
+                       self.task_cov(self.weights, self.weights) +
+                       tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64) * alpha)
+
+    self.chol = tf.cholesky(self.self_cov)
+    self.kinv = tf.cholesky_solve(self.chol, tf.eye(tf.shape(self.x_in)[0],
+                                                    dtype=tf.float64))
+
+    self.input_inv = tf.Variable(
+        tf.eye(self.hparams.batch_size, dtype=tf.float64),
+        validate_shape=False,
+        trainable=False)
+    self.input_cov_op = tf.assign(self.input_inv, self.kinv,
+                                  validate_shape=False)
+
+    # Log determinant by taking the singular values along the diagonal
+    # of self.chol
+    with tf.control_dependencies([self.input_cov_op]):
+      logdet = 2.0 * tf.reduce_sum(tf.log(tf.diag_part(self.chol) + 1e-16))
+
+    # Log Marginal likelihood
+    self.marginal_ll = -tf.reduce_sum(-0.5 * tf.matmul(
+        tf.transpose(y_normed), tf.matmul(self.kinv, y_normed)) - 0.5 * logdet -
+                                      0.5 * self.n * np.log(2 * np.pi))
+
+    zero = tf.cast(0., dtype=tf.float64)
+    one = tf.cast(1., dtype=tf.float64)
+    standard_normal = tfd.Normal(loc=zero, scale=one)
+
+    # Loss is marginal likelihood and priors
+    self.loss = tf.reduce_sum(
+        self.marginal_ll -
+        (standard_normal.log_prob(self.amplitude) +
+         standard_normal.log_prob(tf.exp(self.noise)) +
+         standard_normal.log_prob(self.amplitude_linear) +
+         tfd.Normal(loc=zero, scale=one * 10.).log_prob(
+             self.task_vectors))
+    )
+
+    # Optimizer for hyperparameters
+    optimizer = tf.train.AdamOptimizer(learning_rate=self.hparams.lr)
+    vars_to_optimize = [
+        self.amplitude, self.length_scales, self.length_scales_lin,
+        self.amplitude_linear, self.noise, self.input_mean
+    ]
+
+    if self.learn_embeddings:
+      vars_to_optimize.append(self.task_vectors)
+    grads = optimizer.compute_gradients(self.loss, vars_to_optimize)
+    self.train_op = optimizer.apply_gradients(grads,
+                                              global_step=self.global_step)
+
+    # Predictions for test data
+    self.y_mean, self.y_pred = self.posterior_mean_and_sample(self.x)
+
+    # create tensorboard metrics
+    self.create_summaries()
+    self.summary_writer = tf.summary.FileWriter("{}/graph_{}".format(
+        FLAGS.logdir, self.name), self.sess.graph)
+    self.check = tf.add_check_numerics_ops()
+
+  def posterior_mean_and_sample(self, candidates):
+    """Draw samples for test predictions.
+
+    Given a Tensor of 'candidates' inputs, returns samples from the posterior
+    and the posterior mean prediction for those inputs.
+
+    Args:
+      candidates: A (num-examples x num-dims) Tensor containing the inputs for
+      which to return predictions.
+    Returns:
+      y_mean: The posterior mean prediction given these inputs
+      y_sample: A sample from the posterior of the outputs given these inputs
+    """
+    # Cross-covariance for test predictions
+    w = tf.identity(self.weights_train)
+    inds = tf.squeeze(
+        tf.reshape(
+            tf.tile(
+                tf.reshape(tf.range(self.n_out), (self.n_out, 1)),
+                (1, tf.shape(candidates)[0])), (-1, 1)))
+
+    cross_cov = self.cov(tf.tile(candidates, [self.n_out, 1]), self.x_train)
+    cross_task_cov = self.task_cov(tf.one_hot(inds, self.n_out), w)
+    cross_cov *= cross_task_cov
+
+    # Test mean prediction
+    y_mean = tf.matmul(cross_cov, tf.matmul(self.input_inv, self.y_train))
+
+    # Test sample predictions
+    # Note this can be done much more efficiently using Kronecker products
+    # if all tasks are fully observed (which we won't assume)
+    test_cov = (
+        self.cov(tf.tile(candidates, [self.n_out, 1]),
+                 tf.tile(candidates, [self.n_out, 1])) *
+        self.task_cov(tf.one_hot(inds, self.n_out),
+                      tf.one_hot(inds, self.n_out)) -
+        tf.matmul(cross_cov,
+                  tf.matmul(self.input_inv,
+                            tf.transpose(cross_cov))))
+
+    # Get the matrix square root through an SVD for drawing samples
+    # This seems more numerically stable than the Cholesky
+    s, _, v = tf.svd(test_cov, full_matrices=True)
+    test_sqrt = tf.matmul(v, tf.matmul(tf.diag(s), tf.transpose(v)))
+
+    y_sample = (
+        tf.matmul(
+            test_sqrt,
+            tf.random_normal([tf.shape(test_sqrt)[0], 1], dtype=tf.float64)) +
+        y_mean)
+
+    y_sample = (
+        tf.transpose(tf.reshape(y_sample,
+                                (self.n_out, -1))) * self.input_std +
+        self.input_mean)
+
+    return y_mean, y_sample
+
+  def create_summaries(self):
+    with self.graph.as_default():
+      tf.summary.scalar("loss", self.loss)
+      tf.summary.scalar("log_noise", self.noise)
+      tf.summary.scalar("log_amp", self.amplitude)
+      tf.summary.scalar("log_amp_lin", self.amplitude_linear)
+      tf.summary.histogram("length_scales", self.length_scales)
+      tf.summary.histogram("length_scales_lin", self.length_scales_lin)
+      self.summary_op = tf.summary.merge_all()
+
+  def train(self, data, num_steps):
+    """Trains the GP for num_steps, using the data in 'data'.
+
+    Args:
+      data: ContextualDataset object that provides the data.
+      num_steps: Number of minibatches to train the network for.
+    """
+
+    logging.info("Training %s for %d steps...", self.name, num_steps)
+    for step in range(num_steps):
+      numpts = min(data.num_points(None), self.max_num_points)
+      if numpts >= self.max_num_points and self.keep_fixed_after_max_obs:
+        x = data.contexts[:numpts, :]
+        y = data.rewards[:numpts, :]
+        weights = np.zeros((x.shape[0], self.n_out))
+        for i, val in enumerate(data.actions[:numpts]):
+          weights[i, val] = 1.0
+      else:
+        x, y, weights = data.get_batch_with_weights(numpts)
+
+      ops = [
+          self.global_step, self.summary_op, self.loss, self.noise,
+          self.amplitude, self.amplitude_linear, self.length_scales,
+          self.length_scales_lin, self.input_cov_op, self.input_op, self.var_op,
+          self.input_w_op, self.out_op, self.train_op
+      ]
+
+      res = self.sess.run(ops,
+                          feed_dict={self.x: x,
+                                     self.x_in: x,
+                                     self.y: y,
+                                     self.weights: weights,
+                                     self.n: numpts,
+                                    })
+
+      if step % self._freq_summary == 0:
+        if self._show_training:
+          logging.info("step: %d, loss: %g noise: %f amp: %f amp_lin: %f",
+                       step, res[2], res[3], res[4], res[5])
+      summary = res[1]
+      global_step = res[0]
+      self.summary_writer.add_summary(summary, global_step=global_step)
--- a/research/deep_contextual_bandits/bandits/algorithms/neural_bandit_model.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/neural_bandit_model.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Define a family of neural network architectures for bandits.
+
+The network accepts different type of optimizers that could lead to different
+approximations of the posterior distribution or simply to point estimates.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from absl import flags
+from bandits.core.bayesian_nn import BayesianNN
+
+FLAGS = flags.FLAGS
+
+
+class NeuralBanditModel(BayesianNN):
+  """Implements a neural network for bandit problems."""
+
+  def __init__(self, optimizer, hparams, name):
+    """Saves hyper-params and builds the Tensorflow graph."""
+
+    self.opt_name = optimizer
+    self.name = name
+    self.hparams = hparams
+    self.verbose = getattr(self.hparams, "verbose", True)
+    self.times_trained = 0
+    self.build_model()
+
+  def build_layer(self, x, num_units):
+    """Builds a layer with input x; dropout and layer norm if specified."""
+
+    init_s = self.hparams.init_scale
+
+    layer_n = getattr(self.hparams, "layer_norm", False)
+    dropout = getattr(self.hparams, "use_dropout", False)
+
+    nn = tf.contrib.layers.fully_connected(
+        x,
+        num_units,
+        activation_fn=self.hparams.activation,
+        normalizer_fn=None if not layer_n else tf.contrib.layers.layer_norm,
+        normalizer_params={},
+        weights_initializer=tf.random_uniform_initializer(-init_s, init_s)
+    )
+
+    if dropout:
+      nn = tf.nn.dropout(nn, self.hparams.keep_prob)
+
+    return nn
+
+  def forward_pass(self):
+
+    init_s = self.hparams.init_scale
+
+    scope_name = "prediction_{}".format(self.name)
+    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
+      nn = self.x
+      for num_units in self.hparams.layer_sizes:
+        if num_units > 0:
+          nn = self.build_layer(nn, num_units)
+
+      y_pred = tf.layers.dense(
+          nn,
+          self.hparams.num_actions,
+          kernel_initializer=tf.random_uniform_initializer(-init_s, init_s))
+
+    return nn, y_pred
+
+  def build_model(self):
+    """Defines the actual NN model with fully connected layers.
+
+    The loss is computed for partial feedback settings (bandits), so only
+    the observed outcome is backpropagated (see weighted loss).
+    Selects the optimizer and, finally, it also initializes the graph.
+    """
+
+    # create and store the graph corresponding to the BNN instance
+    self.graph = tf.Graph()
+
+    with self.graph.as_default():
+
+      # create and store a new session for the graph
+      self.sess = tf.Session()
+
+      with tf.name_scope(self.name):
+
+        self.global_step = tf.train.get_or_create_global_step()
+
+        # context
+        self.x = tf.placeholder(
+            shape=[None, self.hparams.context_dim],
+            dtype=tf.float32,
+            name="{}_x".format(self.name))
+
+        # reward vector
+        self.y = tf.placeholder(
+            shape=[None, self.hparams.num_actions],
+            dtype=tf.float32,
+            name="{}_y".format(self.name))
+
+        # weights (1 for selected action, 0 otherwise)
+        self.weights = tf.placeholder(
+            shape=[None, self.hparams.num_actions],
+            dtype=tf.float32,
+            name="{}_w".format(self.name))
+
+        # with tf.variable_scope("prediction_{}".format(self.name)):
+        self.nn, self.y_pred = self.forward_pass()
+        self.loss = tf.squared_difference(self.y_pred, self.y)
+        self.weighted_loss = tf.multiply(self.weights, self.loss)
+        self.cost = tf.reduce_sum(self.weighted_loss) / self.hparams.batch_size
+
+        if self.hparams.activate_decay:
+          self.lr = tf.train.inverse_time_decay(
+              self.hparams.initial_lr, self.global_step,
+              1, self.hparams.lr_decay_rate)
+        else:
+          self.lr = tf.Variable(self.hparams.initial_lr, trainable=False)
+
+        # create tensorboard metrics
+        self.create_summaries()
+        self.summary_writer = tf.summary.FileWriter(
+            "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)
+
+        tvars = tf.trainable_variables()
+        grads, _ = tf.clip_by_global_norm(
+            tf.gradients(self.cost, tvars), self.hparams.max_grad_norm)
+
+        self.optimizer = self.select_optimizer()
+
+        self.train_op = self.optimizer.apply_gradients(
+            zip(grads, tvars), global_step=self.global_step)
+
+        self.init = tf.global_variables_initializer()
+
+        self.initialize_graph()
+
+  def initialize_graph(self):
+    """Initializes all variables."""
+
+    with self.graph.as_default():
+      if self.verbose:
+        print("Initializing model {}.".format(self.name))
+      self.sess.run(self.init)
+
+  def assign_lr(self):
+    """Resets the learning rate in dynamic schedules for subsequent trainings.
+
+    In bandits settings, we do expand our dataset over time. Then, we need to
+    re-train the network with the new data. The algorithms that do not keep
+    the step constant, can reset it at the start of each *training* process.
+    """
+
+    decay_steps = 1
+    if self.hparams.activate_decay:
+      current_gs = self.sess.run(self.global_step)
+      with self.graph.as_default():
+        self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
+                                              self.global_step - current_gs,
+                                              decay_steps,
+                                              self.hparams.lr_decay_rate)
+
+  def select_optimizer(self):
+    """Selects optimizer. To be extended (SGLD, KFAC, etc)."""
+    return tf.train.RMSPropOptimizer(self.lr)
+
+  def create_summaries(self):
+    """Defines summaries including mean loss, learning rate, and global step."""
+
+    with self.graph.as_default():
+      with tf.name_scope(self.name + "_summaries"):
+        tf.summary.scalar("cost", self.cost)
+        tf.summary.scalar("lr", self.lr)
+        tf.summary.scalar("global_step", self.global_step)
+        self.summary_op = tf.summary.merge_all()
+
+  def train(self, data, num_steps):
+    """Trains the network for num_steps, using the provided data.
+
+    Args:
+      data: ContextualDataset object that provides the data.
+      num_steps: Number of minibatches to train the network for.
+    """
+
+    if self.verbose:
+      print("Training {} for {} steps...".format(self.name, num_steps))
+
+    with self.graph.as_default():
+
+      for step in range(num_steps):
+        x, y, w = data.get_batch_with_weights(self.hparams.batch_size)
+        _, cost, summary, lr = self.sess.run(
+            [self.train_op, self.cost, self.summary_op, self.lr],
+            feed_dict={self.x: x, self.y: y, self.weights: w})
+
+        if step % self.hparams.freq_summary == 0:
+          if self.hparams.show_training:
+            print("{} | step: {}, lr: {}, loss: {}".format(
+                self.name, step, lr, cost))
+          self.summary_writer.add_summary(summary, step)
+
+      self.times_trained += 1
--- a/research/deep_contextual_bandits/bandits/algorithms/neural_linear_sampling.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/neural_linear_sampling.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Thompson Sampling with linear posterior over a learnt deep representation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy.stats import invgamma
+
+from bandits.core.bandit_algorithm import BanditAlgorithm
+from bandits.core.contextual_dataset import ContextualDataset
+from bandits.algorithms.neural_bandit_model import NeuralBanditModel
+
+
+class NeuralLinearPosteriorSampling(BanditAlgorithm):
+  """Full Bayesian linear regression on the last layer of a deep neural net."""
+
+  def __init__(self, name, hparams, optimizer='RMS'):
+
+    self.name = name
+    self.hparams = hparams
+    self.latent_dim = self.hparams.layer_sizes[-1]
+
+    # Gaussian prior for each beta_i
+    self._lambda_prior = self.hparams.lambda_prior
+
+    self.mu = [
+        np.zeros(self.latent_dim)
+        for _ in range(self.hparams.num_actions)
+    ]
+
+    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
+                for _ in range(self.hparams.num_actions)]
+
+    self.precision = [
+        self.lambda_prior * np.eye(self.latent_dim)
+        for _ in range(self.hparams.num_actions)
+    ]
+
+    # Inverse Gamma prior for each sigma2_i
+    self._a0 = self.hparams.a0
+    self._b0 = self.hparams.b0
+
+    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
+    self.b = [self._b0 for _ in range(self.hparams.num_actions)]
+
+    # Regression and NN Update Frequency
+    self.update_freq_lr = hparams.training_freq
+    self.update_freq_nn = hparams.training_freq_network
+
+    self.t = 0
+    self.optimizer_n = optimizer
+
+    self.num_epochs = hparams.training_epochs
+    self.data_h = ContextualDataset(hparams.context_dim,
+                                    hparams.num_actions,
+                                    intercept=False)
+    self.latent_h = ContextualDataset(self.latent_dim,
+                                      hparams.num_actions,
+                                      intercept=False)
+    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))
+
+  def action(self, context):
+    """Samples beta's from posterior, and chooses best action accordingly."""
+
+    # Round robin until each action has been selected "initial_pulls" times
+    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
+      return self.t % self.hparams.num_actions
+
+    # Sample sigma2, and beta conditional on sigma2
+    sigma2_s = [
+        self.b[i] * invgamma.rvs(self.a[i])
+        for i in range(self.hparams.num_actions)
+    ]
+
+    try:
+      beta_s = [
+          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
+          for i in range(self.hparams.num_actions)
+      ]
+    except np.linalg.LinAlgError as e:
+      # Sampling could fail if covariance is not positive definite
+      print('Exception when sampling for {}.'.format(self.name))
+      print('Details: {} | {}.'.format(e.message, e.args))
+      d = self.latent_dim
+      beta_s = [
+          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
+          for i in range(self.hparams.num_actions)
+      ]
+
+    # Compute last-layer representation for the current context
+    with self.bnn.graph.as_default():
+      c = context.reshape((1, self.hparams.context_dim))
+      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
+
+    # Apply Thompson Sampling to last-layer representation
+    vals = [
+        np.dot(beta_s[i], z_context.T) for i in range(self.hparams.num_actions)
+    ]
+    return np.argmax(vals)
+
+  def update(self, context, action, reward):
+    """Updates the posterior using linear bayesian regression formula."""
+
+    self.t += 1
+    self.data_h.add(context, action, reward)
+    c = context.reshape((1, self.hparams.context_dim))
+    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
+    self.latent_h.add(z_context, action, reward)
+
+    # Retrain the network on the original data (data_h)
+    if self.t % self.update_freq_nn == 0:
+
+      if self.hparams.reset_lr:
+        self.bnn.assign_lr()
+      self.bnn.train(self.data_h, self.num_epochs)
+
+      # Update the latent representation of every datapoint collected so far
+      new_z = self.bnn.sess.run(self.bnn.nn,
+                                feed_dict={self.bnn.x: self.data_h.contexts})
+      self.latent_h.replace_data(contexts=new_z)
+
+    # Update the Bayesian Linear Regression
+    if self.t % self.update_freq_lr == 0:
+
+      # Find all the actions to update
+      actions_to_update = self.latent_h.actions[:-self.update_freq_lr]
+
+      for action_v in np.unique(actions_to_update):
+
+        # Update action posterior with formulas: \beta | z,y ~ N(mu_q, cov_q)
+        z, y = self.latent_h.get_data(action_v)
+
+        # The algorithm could be improved with sequential formulas (cheaper)
+        s = np.dot(z.T, z)
+
+        # Some terms are removed as we assume prior mu_0 = 0.
+        precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
+        cov_a = np.linalg.inv(precision_a)
+        mu_a = np.dot(cov_a, np.dot(z.T, y))
+
+        # Inverse Gamma posterior update
+        a_post = self.a0 + z.shape[0] / 2.0
+        b_upd = 0.5 * np.dot(y.T, y)
+        b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
+        b_post = self.b0 + b_upd
+
+        # Store new posterior distributions
+        self.mu[action_v] = mu_a
+        self.cov[action_v] = cov_a
+        self.precision[action_v] = precision_a
+        self.a[action_v] = a_post
+        self.b[action_v] = b_post
+
+  @property
+  def a0(self):
+    return self._a0
+
+  @property
+  def b0(self):
+    return self._b0
+
+  @property
+  def lambda_prior(self):
+    return self._lambda_prior
--- a/research/deep_contextual_bandits/bandits/algorithms/parameter_noise_sampling.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/parameter_noise_sampling.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Contextual algorithm based on Thompson Sampling + direct noise injection."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy.special import logsumexp
+import tensorflow as tf
+
+from absl import flags
+
+from bandits.core.bandit_algorithm import BanditAlgorithm
+from bandits.core.contextual_dataset import ContextualDataset
+from bandits.algorithms.neural_bandit_model import NeuralBanditModel
+
+FLAGS = flags.FLAGS
+
+
+class ParameterNoiseSampling(BanditAlgorithm):
+  """Parameter Noise Sampling algorithm based on adding noise to net params.
+
+  Described in https://arxiv.org/abs/1706.01905
+  """
+
+  def __init__(self, name, hparams):
+    """Creates the algorithm, and sets up the adaptive Gaussian noise."""
+
+    self.name = name
+    self.hparams = hparams
+    self.verbose = getattr(self.hparams, 'verbose', True)
+    self.noise_std = getattr(self.hparams, 'noise_std', 0.005)
+    self.eps = getattr(self.hparams, 'eps', 0.05)
+    self.d_samples = getattr(self.hparams, 'd_samples', 300)
+    self.optimizer = getattr(self.hparams, 'optimizer', 'RMS')
+
+    # keep track of noise heuristic statistics
+    self.std_h = [self.noise_std]
+    self.eps_h = [self.eps]
+    self.kl_h = []
+    self.t = 0
+
+    self.freq_update = hparams.training_freq
+    self.num_epochs = hparams.training_epochs
+
+    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
+                                    hparams.buffer_s)
+    self.bnn = NeuralBanditModel(self.optimizer, hparams, '{}-bnn'.format(name))
+
+    with self.bnn.graph.as_default():
+
+      # noise-injection std placeholder
+      self.bnn.noise_std_ph = tf.placeholder(tf.float32, shape=())
+
+      # create noise corruption op; adds noise to all weights
+      tvars = tf.trainable_variables()
+      self.bnn.noisy_grads = [
+          tf.random_normal(v.get_shape(), 0, self.bnn.noise_std_ph)
+          for v in tvars
+      ]
+
+      # add noise to all params, then compute prediction, then subtract.
+      with tf.control_dependencies(self.bnn.noisy_grads):
+        self.bnn.noise_add_ops = [
+            tvars[i].assign_add(n) for i, n in enumerate(self.bnn.noisy_grads)
+        ]
+        with tf.control_dependencies(self.bnn.noise_add_ops):
+          # we force the prediction for 'y' to be recomputed after adding noise
+          self.bnn.noisy_nn, self.bnn.noisy_pred_val = self.bnn.forward_pass()
+
+          self.bnn.noisy_pred = tf.identity(self.bnn.noisy_pred_val)
+          with tf.control_dependencies([tf.identity(self.bnn.noisy_pred)]):
+            self.bnn.noise_sub_ops = [
+                tvars[i].assign_add(-n)
+                for i, n in enumerate(self.bnn.noisy_grads)
+            ]
+
+  def action(self, context):
+    """Selects action based on Thompson Sampling *after* adding noise."""
+
+    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
+      # round robin until each action has been taken "initial_pulls" times
+      return self.t % self.hparams.num_actions
+
+    with self.bnn.graph.as_default():
+      # run noise prediction op to choose action, and subtract noise op after.
+      c = context.reshape((1, self.hparams.context_dim))
+      output, _ = self.bnn.sess.run(
+          [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
+          feed_dict={self.bnn.x: c,
+                     self.bnn.noise_std_ph: self.noise_std})
+      return np.argmax(output)
+
+  def update(self, context, action, reward):
+    """Updates the data buffer, and re-trains the BNN and noise level."""
+
+    self.t += 1
+    self.data_h.add(context, action, reward)
+
+    if self.t % self.freq_update == 0:
+      self.bnn.train(self.data_h, self.num_epochs)
+      self.update_noise()
+
+  def update_noise(self):
+    """Increase noise if distance btw original and corrupted distrib small."""
+
+    kl = self.compute_distance()
+    delta = -np.log1p(- self.eps + self.eps / self.hparams.num_actions)
+
+    if kl < delta:
+      self.noise_std *= 1.01
+    else:
+      self.noise_std /= 1.01
+
+    self.eps *= 0.99
+
+    if self.verbose:
+      print('Update eps={} | kl={} | std={} | delta={} | increase={}.'.format(
+          self.eps, kl, self.noise_std, delta, kl < delta))
+
+    # store noise-injection statistics for inspection: std, KL, eps.
+    self.std_h.append(self.noise_std)
+    self.kl_h.append(kl)
+    self.eps_h.append(self.eps)
+
+  def compute_distance(self):
+    """Computes empirical KL for original and corrupted output distributions."""
+
+    random_inputs, _ = self.data_h.get_batch(self.d_samples)
+    y_model = self.bnn.sess.run(
+        self.bnn.y_pred,
+        feed_dict={
+            self.bnn.x: random_inputs,
+            self.bnn.noise_std_ph: self.noise_std
+        })
+    y_noisy, _ = self.bnn.sess.run(
+        [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
+        feed_dict={
+            self.bnn.x: random_inputs,
+            self.bnn.noise_std_ph: self.noise_std
+        })
+
+    if self.verbose:
+      # display how often original & perturbed models propose different actions
+      s = np.sum([np.argmax(y_model[i, :]) == np.argmax(y_noisy[i, :])
+                  for i in range(y_model.shape[0])])
+      print('{} | % of agreement btw original / corrupted actions: {}.'.format(
+          self.name, s / self.d_samples))
+
+    kl = self.compute_kl_with_logits(y_model, y_noisy)
+    return kl
+
+  def compute_kl_with_logits(self, logits1, logits2):
+    """Computes KL from logits samples from two distributions."""
+
+    def exp_times_diff(a, b):
+      return np.multiply(np.exp(a), a - b)
+
+    logsumexp1 = logsumexp(logits1, axis=1)
+    logsumexp2 = logsumexp(logits2, axis=1)
+    logsumexp_diff = logsumexp2 - logsumexp1
+
+    exp_diff = exp_times_diff(logits1, logits2)
+    exp_diff = np.sum(exp_diff, axis=1)
+
+    inv_exp_sum = np.sum(np.exp(logits1), axis=1)
+    term1 = np.divide(exp_diff, inv_exp_sum)
+
+    kl = term1 + logsumexp_diff
+    kl = np.maximum(kl, 0.0)
+    kl = np.nan_to_num(kl)
+    return np.mean(kl)
--- a/research/deep_contextual_bandits/bandits/algorithms/posterior_bnn_sampling.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/posterior_bnn_sampling.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Contextual bandit algorithm based on Thompson Sampling and a Bayesian NN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from bandits.core.bandit_algorithm import BanditAlgorithm
+from bandits.algorithms.bb_alpha_divergence_model import BBAlphaDivergence
+from bandits.algorithms.bf_variational_neural_bandit_model import BfVariationalNeuralBanditModel
+from bandits.core.contextual_dataset import ContextualDataset
+from bandits.algorithms.multitask_gp import MultitaskGP
+from bandits.algorithms.neural_bandit_model import NeuralBanditModel
+from bandits.algorithms.variational_neural_bandit_model import VariationalNeuralBanditModel
+
+
+class PosteriorBNNSampling(BanditAlgorithm):
+  """Posterior Sampling algorithm based on a Bayesian neural network."""
+
+  def __init__(self, name, hparams, bnn_model='RMSProp'):
+    """Creates a PosteriorBNNSampling object based on a specific optimizer.
+
+    The algorithm has two basic tools: an Approx BNN and a Contextual Dataset.
+    The Bayesian Network keeps the posterior based on the optimizer iterations.
+
+    Args:
+      name: Name of the algorithm.
+      hparams: Hyper-parameters of the algorithm.
+      bnn_model: Type of BNN. By default RMSProp (point estimate).
+    """
+
+    self.name = name
+    self.hparams = hparams
+    self.optimizer_n = hparams.optimizer
+
+    self.training_freq = hparams.training_freq
+    self.training_epochs = hparams.training_epochs
+    self.t = 0
+    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
+                                    hparams.buffer_s)
+
+    # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
+    bnn_name = '{}-bnn'.format(name)
+    if bnn_model == 'Variational':
+      self.bnn = VariationalNeuralBanditModel(hparams, bnn_name)
+    elif bnn_model == 'AlphaDiv':
+      self.bnn = BBAlphaDivergence(hparams, bnn_name)
+    elif bnn_model == 'Variational_BF':
+      self.bnn = BfVariationalNeuralBanditModel(hparams, bnn_name)
+    elif bnn_model == 'GP':
+      self.bnn = MultitaskGP(hparams)
+    else:
+      self.bnn = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)
+
+  def action(self, context):
+    """Selects action for context based on Thompson Sampling using the BNN."""
+
+    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
+      # round robin until each action has been taken "initial_pulls" times
+      return self.t % self.hparams.num_actions
+
+    with self.bnn.graph.as_default():
+      c = context.reshape((1, self.hparams.context_dim))
+      output = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
+      return np.argmax(output)
+
+  def update(self, context, action, reward):
+    """Updates data buffer, and re-trains the BNN every training_freq steps."""
+
+    self.t += 1
+    self.data_h.add(context, action, reward)
+
+    if self.t % self.training_freq == 0:
+      if self.hparams.reset_lr:
+        self.bnn.assign_lr()
+      self.bnn.train(self.data_h, self.training_epochs)
--- a/research/deep_contextual_bandits/bandits/algorithms/uniform_sampling.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/uniform_sampling.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Contextual bandit algorithm that selects an action uniformly at random."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from bandits.core.bandit_algorithm import BanditAlgorithm
+
+
+class UniformSampling(BanditAlgorithm):
+  """Defines a baseline; returns one action uniformly at random."""
+
+  def __init__(self, name, hparams):
+    """Creates a UniformSampling object.
+
+    Args:
+      name: Name of the algorithm.
+      hparams: Hyper-parameters, including the number of arms (num_actions).
+    """
+
+    self.name = name
+    self.hparams = hparams
+
+  def action(self, context):
+    """Selects an action uniformly at random."""
+    return np.random.choice(range(self.hparams.num_actions))
--- a/research/deep_contextual_bandits/bandits/algorithms/variational_neural_bandit_model.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/variational_neural_bandit_model.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Bayesian NN using factorized VI (Bayes By Backprop. Blundell et al. 2014).
+
+See https://arxiv.org/abs/1505.05424 for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+from absl import flags
+from bandits.core.bayesian_nn import BayesianNN
+
+FLAGS = flags.FLAGS
+
+
+def log_gaussian(x, mu, sigma, reduce_sum=True):
+  """Returns log Gaussian pdf."""
+  res = (-0.5 * np.log(2 * np.pi) - tf.log(sigma) - tf.square(x - mu) /
+         (2 * tf.square(sigma)))
+  if reduce_sum:
+    return tf.reduce_sum(res)
+  else:
+    return res
+
+
+def analytic_kl(mu_1, sigma_1, mu_2, sigma_2):
+  """KL for two Gaussian distributions with diagonal covariance matrix."""
+  sigma_1_sq = tf.square(sigma_1)
+  sigma_2_sq = tf.square(sigma_2)
+
+  t1 = tf.square(mu_1 - mu_2) / (2. * sigma_2_sq)
+  t2 = (sigma_1_sq/sigma_2_sq - 1. - tf.log(sigma_1_sq) + tf.log(sigma_2_sq))/2.
+  return tf.reduce_sum(t1 + t2)
+
+
+class VariationalNeuralBanditModel(BayesianNN):
+  """Implements an approximate Bayesian NN using Variational Inference."""
+
+  def __init__(self, hparams, name="BBBNN"):
+
+    self.name = name
+    self.hparams = hparams
+
+    self.n_in = self.hparams.context_dim
+    self.n_out = self.hparams.num_actions
+    self.layers = self.hparams.layer_sizes
+    self.init_scale = self.hparams.init_scale
+    self.f_num_points = None
+    if "f_num_points" in hparams:
+      self.f_num_points = self.hparams.f_num_points
+
+    self.cleared_times_trained = self.hparams.cleared_times_trained
+    self.initial_training_steps = self.hparams.initial_training_steps
+    self.training_schedule = np.linspace(self.initial_training_steps,
+                                         self.hparams.training_epochs,
+                                         self.cleared_times_trained)
+    self.verbose = getattr(self.hparams, "verbose", True)
+
+    self.weights_m = {}
+    self.weights_std = {}
+    self.biases_m = {}
+    self.biases_std = {}
+
+    self.times_trained = 0
+
+    if self.hparams.use_sigma_exp_transform:
+      self.sigma_transform = tf.exp
+      self.inverse_sigma_transform = np.log
+    else:
+      self.sigma_transform = tf.nn.softplus
+      self.inverse_sigma_transform = lambda y: y + np.log(1. - np.exp(-y))
+
+    # Whether to use the local reparameterization trick to compute the loss.
+    # See details in https://arxiv.org/abs/1506.02557
+    self.use_local_reparameterization = True
+
+    self.build_graph()
+
+  def build_mu_variable(self, shape):
+    """Returns a mean variable initialized as N(0, 0.05)."""
+    return tf.Variable(tf.random_normal(shape, 0.0, 0.05))
+
+  def build_sigma_variable(self, shape, init=-5.):
+    """Returns a sigma variable initialized as N(init, 0.05)."""
+    # Initialize sigma to be very small initially to encourage MAP opt first
+    return tf.Variable(tf.random_normal(shape, init, 0.05))
+
+  def build_layer(self, input_x, input_x_local, shape,
+                  layer_id, activation_fn=tf.nn.relu):
+    """Builds a variational layer, and computes KL term.
+
+    Args:
+      input_x: Input to the variational layer.
+      input_x_local: Input when the local reparameterization trick was applied.
+      shape: [number_inputs, number_outputs] for the layer.
+      layer_id: Number of layer in the architecture.
+      activation_fn: Activation function to apply.
+
+    Returns:
+      output_h: Output of the variational layer.
+      output_h_local: Output when local reparameterization trick was applied.
+      neg_kl: Negative KL term for the layer.
+    """
+
+    w_mu = self.build_mu_variable(shape)
+    w_sigma = self.sigma_transform(self.build_sigma_variable(shape))
+    w_noise = tf.random_normal(shape)
+    w = w_mu + w_sigma * w_noise
+
+    b_mu = self.build_mu_variable([1, shape[1]])
+    b_sigma = self.sigma_transform(self.build_sigma_variable([1, shape[1]]))
+    b = b_mu
+
+    # Store means and stds
+    self.weights_m[layer_id] = w_mu
+    self.weights_std[layer_id] = w_sigma
+    self.biases_m[layer_id] = b_mu
+    self.biases_std[layer_id] = b_sigma
+
+    # Create outputs
+    output_h = activation_fn(tf.matmul(input_x, w) + b)
+
+    if self.use_local_reparameterization:
+      # Use analytic KL divergence wrt the prior
+      neg_kl = -analytic_kl(w_mu, w_sigma,
+                            0., tf.to_float(np.sqrt(2./shape[0])))
+    else:
+      # Create empirical KL loss terms
+      log_p = log_gaussian(w, 0., tf.to_float(np.sqrt(2./shape[0])))
+      log_q = log_gaussian(w, tf.stop_gradient(w_mu), tf.stop_gradient(w_sigma))
+      neg_kl = log_p - log_q
+
+    # Apply local reparameterization trick: sample activations pre nonlinearity
+    m_h = tf.matmul(input_x_local, w_mu) + b
+    v_h = tf.matmul(tf.square(input_x_local), tf.square(w_sigma))
+    output_h_local = m_h + tf.sqrt(v_h + 1e-6) * tf.random_normal(tf.shape(v_h))
+    output_h_local = activation_fn(output_h_local)
+
+    return output_h, output_h_local, neg_kl
+
+  def build_action_noise(self):
+    """Defines a model for additive noise per action, and its KL term."""
+
+    # Define mean and std variables (log-normal dist) for each action.
+    noise_sigma_mu = (self.build_mu_variable([1, self.n_out])
+                      + self.inverse_sigma_transform(self.hparams.noise_sigma))
+    noise_sigma_sigma = self.sigma_transform(
+        self.build_sigma_variable([1, self.n_out]))
+
+    pre_noise_sigma = (noise_sigma_mu
+                       + tf.random_normal([1, self.n_out]) * noise_sigma_sigma)
+    self.noise_sigma = self.sigma_transform(pre_noise_sigma)
+
+    # Compute KL for additive noise sigma terms.
+    if getattr(self.hparams, "infer_noise_sigma", False):
+      neg_kl_term = log_gaussian(
+          pre_noise_sigma,
+          self.inverse_sigma_transform(self.hparams.noise_sigma),
+          self.hparams.prior_sigma
+      )
+      neg_kl_term -= log_gaussian(pre_noise_sigma,
+                                  noise_sigma_mu,
+                                  noise_sigma_sigma)
+    else:
+      neg_kl_term = 0.
+
+    return neg_kl_term
+
+  def build_model(self, activation_fn=tf.nn.relu):
+    """Defines the actual NN model with fully connected layers.
+
+    The loss is computed for partial feedback settings (bandits), so only
+    the observed outcome is backpropagated (see weighted loss).
+    Selects the optimizer and, finally, it also initializes the graph.
+
+    Args:
+      activation_fn: the activation function used in the nn layers.
+    """
+
+    if self.verbose:
+      print("Initializing model {}.".format(self.name))
+    neg_kl_term, l_number = 0, 0
+    use_local_reparameterization = self.use_local_reparameterization
+
+    # Compute model additive noise for each action with log-normal distribution
+    neg_kl_term += self.build_action_noise()
+
+    # Build network.
+    input_x = self.x
+    input_local = self.x
+    n_in = self.n_in
+
+    for l_number, n_nodes in enumerate(self.layers):
+      if n_nodes > 0:
+        h, h_local, neg_kl = self.build_layer(input_x, input_local,
+                                              [n_in, n_nodes], l_number)
+
+        neg_kl_term += neg_kl
+        input_x, input_local = h, h_local
+        n_in = n_nodes
+
+    # Create last linear layer
+    h, h_local, neg_kl = self.build_layer(input_x, input_local,
+                                          [n_in, self.n_out],
+                                          l_number + 1,
+                                          activation_fn=lambda x: x)
+    neg_kl_term += neg_kl
+
+    self.y_pred = h
+    self.y_pred_local = h_local
+
+    # Compute log likelihood (with learned or fixed noise level)
+    if getattr(self.hparams, "infer_noise_sigma", False):
+      log_likelihood = log_gaussian(
+          self.y, self.y_pred_local, self.noise_sigma, reduce_sum=False)
+    else:
+      y_hat = self.y_pred_local if use_local_reparameterization else self.y_pred
+      log_likelihood = log_gaussian(
+          self.y, y_hat, self.hparams.noise_sigma, reduce_sum=False)
+
+    # Only take into account observed outcomes (bandits setting)
+    batch_size = tf.to_float(tf.shape(self.x)[0])
+    weighted_log_likelihood = tf.reduce_sum(
+        log_likelihood * self.weights) / batch_size
+
+    # The objective is 1/n * (\sum_i log_like_i - KL); neg_kl_term estimates -KL
+    elbo = weighted_log_likelihood + (neg_kl_term / self.n)
+
+    self.loss = -elbo
+    self.global_step = tf.train.get_or_create_global_step()
+    self.train_op = tf.train.AdamOptimizer(self.hparams.initial_lr).minimize(
+        self.loss, global_step=self.global_step)
+
+    # Create tensorboard metrics
+    self.create_summaries()
+    self.summary_writer = tf.summary.FileWriter(
+        "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)
+
+  def build_graph(self):
+    """Defines graph, session, placeholders, and model.
+
+    Placeholders are: n (size of the dataset), x and y (context and observed
+    reward for each action), and weights (one-hot encoding of selected action
+    for each context, i.e., only possibly non-zero element in each y).
+    """
+
+    self.graph = tf.Graph()
+    with self.graph.as_default():
+
+      self.sess = tf.Session()
+
+      self.n = tf.placeholder(shape=[], dtype=tf.float32)
+
+      self.x = tf.placeholder(shape=[None, self.n_in], dtype=tf.float32)
+      self.y = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)
+      self.weights = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)
+
+      self.build_model()
+      self.sess.run(tf.global_variables_initializer())
+
+  def create_summaries(self):
+    """Defines summaries including mean loss, and global step."""
+
+    with self.graph.as_default():
+      with tf.name_scope(self.name + "_summaries"):
+        tf.summary.scalar("loss", self.loss)
+        tf.summary.scalar("global_step", self.global_step)
+        self.summary_op = tf.summary.merge_all()
+
+  def assign_lr(self):
+    """Resets the learning rate in dynamic schedules for subsequent trainings.
+
+    In bandits settings, we do expand our dataset over time. Then, we need to
+    re-train the network with the new data. The algorithms that do not keep
+    the step constant, can reset it at the start of each *training* process.
+    """
+
+    decay_steps = 1
+    if self.hparams.activate_decay:
+      current_gs = self.sess.run(self.global_step)
+      with self.graph.as_default():
+        self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
+                                              self.global_step - current_gs,
+                                              decay_steps,
+                                              self.hparams.lr_decay_rate)
+
+  def train(self, data, num_steps):
+    """Trains the BNN for num_steps, using the data in 'data'.
+
+    Args:
+      data: ContextualDataset object that provides the data.
+      num_steps: Number of minibatches to train the network for.
+
+    Returns:
+      losses: Loss history during training.
+    """
+
+    if self.times_trained < self.cleared_times_trained:
+      num_steps = int(self.training_schedule[self.times_trained])
+    self.times_trained += 1
+
+    losses = []
+
+    with self.graph.as_default():
+
+      if self.verbose:
+        print("Training {} for {} steps...".format(self.name, num_steps))
+
+      for step in range(num_steps):
+        x, y, weights = data.get_batch_with_weights(self.hparams.batch_size)
+        _, summary, global_step, loss = self.sess.run(
+            [self.train_op, self.summary_op, self.global_step, self.loss],
+            feed_dict={
+                self.x: x,
+                self.y: y,
+                self.weights: weights,
+                self.n: data.num_points(self.f_num_points),
+            })
+
+        losses.append(loss)
+
+        if step % self.hparams.freq_summary == 0:
+          if self.hparams.show_training:
+            print("{} | step: {}, loss: {}".format(
+                self.name, global_step, loss))
+          self.summary_writer.add_summary(summary, global_step)
+
+    return losses
--- a/research/deep_contextual_bandits/bandits/core/__pycache__/bandit_algorithm.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/core/__pycache__/bandit_algorithm.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/core/__pycache__/bayesian_nn.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/core/__pycache__/bayesian_nn.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/core/__pycache__/contextual_bandit.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/core/__pycache__/contextual_bandit.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/core/__pycache__/contextual_dataset.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/core/__pycache__/contextual_dataset.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/core/bandit_algorithm.py
+++ b/research/deep_contextual_bandits/bandits/core/bandit_algorithm.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Define the abstract class for contextual bandit algorithms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class BanditAlgorithm(object):
+  """A bandit algorithm must be able to do two basic operations.
+
+  1. Choose an action given a context.
+  2. Update its internal model given a triple (context, played action, reward).
+  """
+
+  def action(self, context):
+    pass
+
+  def update(self, context, action, reward):
+    pass
--- a/research/deep_contextual_bandits/bandits/core/bayesian_nn.py
+++ b/research/deep_contextual_bandits/bandits/core/bayesian_nn.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Define the abstract class for Bayesian Neural Networks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class BayesianNN(object):
+  """A Bayesian neural network keeps a distribution over neural nets."""
+
+  def __init__(self, optimizer):
+    pass
+
+  def build_model(self):
+    pass
+
+  def train(self, data):
+    pass
+
+  def sample(self, steps):
+    pass
--- a/research/deep_contextual_bandits/bandits/core/contextual_bandit.py
+++ b/research/deep_contextual_bandits/bandits/core/contextual_bandit.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Define a contextual bandit from which we can sample and compute rewards.
+
+We can feed the data, sample a context, its reward for a specific action, and
+also the optimal action for a given context.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def run_contextual_bandit(context_dim, num_actions, dataset, algos):
+  """Run a contextual bandit problem on a set of algorithms.
+
+  Args:
+    context_dim: Dimension of the context.
+    num_actions: Number of available actions.
+    dataset: Matrix where every row is a context + num_actions rewards.
+    algos: List of algorithms to use in the contextual bandit instance.
+
+  Returns:
+    h_actions: Matrix with actions: size (num_context, num_algorithms).
+    h_rewards: Matrix with rewards: size (num_context, num_algorithms).
+  """
+
+  num_contexts = dataset.shape[0]
+
+  # Create contextual bandit
+  cmab = ContextualBandit(context_dim, num_actions)
+  cmab.feed_data(dataset)
+
+  h_actions = np.empty((0, len(algos)), float)
+  h_rewards = np.empty((0, len(algos)), float)
+
+  # Run the contextual bandit process
+  for i in range(num_contexts):
+    context = cmab.context(i)
+    actions = [a.action(context) for a in algos]
+    rewards = [cmab.reward(i, action) for action in actions]
+
+    for j, a in enumerate(algos):
+      a.update(context, actions[j], rewards[j])
+
+    h_actions = np.vstack((h_actions, np.array(actions)))
+    h_rewards = np.vstack((h_rewards, np.array(rewards)))
+
+  return h_actions, h_rewards
+
+
+class ContextualBandit(object):
+  """Implements a Contextual Bandit with d-dimensional contexts and k arms."""
+
+  def __init__(self, context_dim, num_actions):
+    """Creates a contextual bandit object.
+
+    Args:
+      context_dim: Dimension of the contexts.
+      num_actions: Number of arms for the multi-armed bandit.
+    """
+
+    self._context_dim = context_dim
+    self._num_actions = num_actions
+
+  def feed_data(self, data):
+    """Feeds the data (contexts + rewards) to the bandit object.
+
+    Args:
+      data: Numpy array with shape [n, d+k], where n is the number of contexts,
+        d is the dimension of each context, and k the number of arms (rewards).
+
+    Raises:
+      ValueError: when data dimensions do not correspond to the object values.
+    """
+
+    if data.shape[1] != self.context_dim + self.num_actions:
+      raise ValueError('Data dimensions do not match.')
+
+    self._number_contexts = data.shape[0]
+    self.data = data
+    self.order = range(self.number_contexts)
+
+  def reset(self):
+    """Randomly shuffle the order of the contexts to deliver."""
+    self.order = np.random.permutation(self.number_contexts)
+
+  def context(self, number):
+    """Returns the number-th context."""
+    return self.data[self.order[number]][:self.context_dim]
+
+  def reward(self, number, action):
+    """Returns the reward for the number-th context and action."""
+    return self.data[self.order[number]][self.context_dim + action]
+
+  def optimal(self, number):
+    """Returns the optimal action (in hindsight) for the number-th context."""
+    return np.argmax(self.data[self.order[number]][self.context_dim:])
+
+  @property
+  def context_dim(self):
+    return self._context_dim
+
+  @property
+  def num_actions(self):
+    return self._num_actions
+
+  @property
+  def number_contexts(self):
+    return self._number_contexts
--- a/research/deep_contextual_bandits/bandits/core/contextual_dataset.py
+++ b/research/deep_contextual_bandits/bandits/core/contextual_dataset.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Define a data buffer for contextual bandit algorithms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+class ContextualDataset(object):
+  """The buffer is able to append new data, and sample random minibatches."""
+
+  def __init__(self, context_dim, num_actions, buffer_s=-1, intercept=False):
+    """Creates a ContextualDataset object.
+
+    The data is stored in attributes: contexts and rewards.
+    The sequence of taken actions are stored in attribute actions.
+
+    Args:
+      context_dim: Dimension of the contexts.
+      num_actions: Number of arms for the multi-armed bandit.
+      buffer_s: Size of buffer for training. Only last buffer_s will be
+        returned as minibatch. If buffer_s = -1, all data will be used.
+      intercept: If True, it adds a constant (1.0) dimension to each context X,
+        at the end.
+    """
+
+    self._context_dim = context_dim
+    self._num_actions = num_actions
+    self._contexts = None
+    self._rewards = None
+    self.actions = []
+    self.buffer_s = buffer_s
+    self.intercept = intercept
+
+  def add(self, context, action, reward):
+    """Adds a new triplet (context, action, reward) to the dataset.
+
+    The reward for the actions that weren't played is assumed to be zero.
+
+    Args:
+      context: A d-dimensional vector with the context.
+      action: Integer between 0 and k-1 representing the chosen arm.
+      reward: Real number representing the reward for the (context, action).
+    """
+
+    if self.intercept:
+      c = np.array(context[:])
+      c = np.append(c, 1.0).reshape((1, self.context_dim + 1))
+    else:
+      c = np.array(context[:]).reshape((1, self.context_dim))
+
+    if self.contexts is None:
+      self.contexts = c
+    else:
+      self.contexts = np.vstack((self.contexts, c))
+
+    r = np.zeros((1, self.num_actions))
+    r[0, action] = reward
+    if self.rewards is None:
+      self.rewards = r
+    else:
+      self.rewards = np.vstack((self.rewards, r))
+
+    self.actions.append(action)
+
+  def replace_data(self, contexts=None, actions=None, rewards=None):
+    if contexts is not None:
+      self.contexts = contexts
+    if actions is not None:
+      self.actions = actions
+    if rewards is not None:
+      self.rewards = rewards
+
+  def get_batch(self, batch_size):
+    """Returns a random minibatch of (contexts, rewards) with batch_size."""
+    n, _ = self.contexts.shape
+    if self.buffer_s == -1:
+      # use all the data
+      ind = np.random.choice(range(n), batch_size)
+    else:
+      # use only buffer (last buffer_s observations)
+      ind = np.random.choice(range(max(0, n - self.buffer_s), n), batch_size)
+    return self.contexts[ind, :], self.rewards[ind, :]
+
+  def get_data(self, action):
+    """Returns all (context, reward) where the action was played."""
+    n, _ = self.contexts.shape
+    ind = np.array([i for i in range(n) if self.actions[i] == action])
+    return self.contexts[ind, :], self.rewards[ind, action]
+
+  def get_data_with_weights(self):
+    """Returns all observations with one-hot weights for actions."""
+    weights = np.zeros((self.contexts.shape[0], self.num_actions))
+    a_ind = np.array([(i, val) for i, val in enumerate(self.actions)])
+    weights[a_ind[:, 0], a_ind[:, 1]] = 1.0
+    return self.contexts, self.rewards, weights
+
+  def get_batch_with_weights(self, batch_size):
+    """Returns a random mini-batch with one-hot weights for actions."""
+    n, _ = self.contexts.shape
+    if self.buffer_s == -1:
+      # use all the data
+      ind = np.random.choice(range(n), batch_size)
+    else:
+      # use only buffer (last buffer_s obs)
+      ind = np.random.choice(range(max(0, n - self.buffer_s), n), batch_size)
+
+    weights = np.zeros((batch_size, self.num_actions))
+    sampled_actions = np.array(self.actions)[ind]
+    a_ind = np.array([(i, val) for i, val in enumerate(sampled_actions)])
+    weights[a_ind[:, 0], a_ind[:, 1]] = 1.0
+    return self.contexts[ind, :], self.rewards[ind, :], weights
+
+  def num_points(self, f=None):
+    """Returns number of points in the buffer (after applying function f)."""
+    if f is not None:
+      return f(self.contexts.shape[0])
+    return self.contexts.shape[0]
+
+  @property
+  def context_dim(self):
+    return self._context_dim
+
+  @property
+  def num_actions(self):
+    return self._num_actions
+
+  @property
+  def contexts(self):
+    return self._contexts
+
+  @contexts.setter
+  def contexts(self, value):
+    self._contexts = value
+
+  @property
+  def actions(self):
+    return self._actions
+
+  @actions.setter
+  def actions(self, value):
+    self._actions = value
+
+  @property
+  def rewards(self):
+    return self._rewards
+
+  @rewards.setter
+  def rewards(self, value):
+    self._rewards = value
--- a/research/deep_contextual_bandits/bandits/data/__pycache__/data_sampler.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/data/__pycache__/data_sampler.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/data/__pycache__/synthetic_data_sampler.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/data/__pycache__/synthetic_data_sampler.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/data/data_sampler.py
+++ b/research/deep_contextual_bandits/bandits/data/data_sampler.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functions to create bandit problems from datasets."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+
+
+def one_hot(df, cols):
+  """Returns one-hot encoding of DataFrame df including columns in cols."""
+  for col in cols:
+    dummies = pd.get_dummies(df[col], prefix=col, drop_first=False)
+    df = pd.concat([df, dummies], axis=1)
+    df = df.drop(col, axis=1)
+  return df
+
+
+def sample_mushroom_data(file_name,
+                         num_contexts,
+                         r_noeat=0,
+                         r_eat_safe=5,
+                         r_eat_poison_bad=-35,
+                         r_eat_poison_good=5,
+                         prob_poison_bad=0.5):
+  """Samples bandit game from Mushroom UCI Dataset.
+
+  Args:
+    file_name: Route of file containing the original Mushroom UCI dataset.
+    num_contexts: Number of points to sample, i.e. (context, action rewards).
+    r_noeat: Reward for not eating a mushroom.
+    r_eat_safe: Reward for eating a non-poisonous mushroom.
+    r_eat_poison_bad: Reward for eating a poisonous mushroom if harmed.
+    r_eat_poison_good: Reward for eating a poisonous mushroom if not harmed.
+    prob_poison_bad: Probability of being harmed by eating a poisonous mushroom.
+
+  Returns:
+    dataset: Sampled matrix with n rows: (context, eat_reward, no_eat_reward).
+    opt_vals: Vector of expected optimal (reward, action) for each context.
+
+  We assume r_eat_safe > r_noeat, and r_eat_poison_good > r_eat_poison_bad.
+  """
+
+  # first two cols of df encode whether mushroom is edible or poisonous
+  df = pd.read_csv(file_name, header=None)
+  df = one_hot(df, df.columns)
+  ind = np.random.choice(range(df.shape[0]), num_contexts, replace=True)
+
+  contexts = df.iloc[ind, 2:]
+  no_eat_reward = r_noeat * np.ones((num_contexts, 1))
+  random_poison = np.random.choice(
+      [r_eat_poison_bad, r_eat_poison_good],
+      p=[prob_poison_bad, 1 - prob_poison_bad],
+      size=num_contexts)
+  eat_reward = r_eat_safe * df.iloc[ind, 0]
+  eat_reward += np.multiply(random_poison, df.iloc[ind, 1])
+  eat_reward = eat_reward.reshape((num_contexts, 1))
+
+  # compute optimal expected reward and optimal actions
+  exp_eat_poison_reward = r_eat_poison_bad * prob_poison_bad
+  exp_eat_poison_reward += r_eat_poison_good * (1 - prob_poison_bad)
+  opt_exp_reward = r_eat_safe * df.iloc[ind, 0] + max(
+      r_noeat, exp_eat_poison_reward) * df.iloc[ind, 1]
+
+  if r_noeat > exp_eat_poison_reward:
+    # actions: no eat = 0 ; eat = 1
+    opt_actions = df.iloc[ind, 0]  # indicator of edible
+  else:
+    # should always eat (higher expected reward)
+    opt_actions = np.ones((num_contexts, 1))
+
+  opt_vals = (opt_exp_reward.values, opt_actions.values)
+
+  return np.hstack((contexts, no_eat_reward, eat_reward)), opt_vals
+
+
+def sample_stock_data(file_name, context_dim, num_actions, num_contexts,
+                      sigma, shuffle_rows=True):
+  """Samples linear bandit game from stock prices dataset.
+
+  Args:
+    file_name: Route of file containing the stock prices dataset.
+    context_dim: Context dimension (i.e. vector with the price of each stock).
+    num_actions: Number of actions (different linear portfolio strategies).
+    num_contexts: Number of contexts to sample.
+    sigma: Vector with additive noise levels for each action.
+    shuffle_rows: If True, rows from original dataset are shuffled.
+
+  Returns:
+    dataset: Sampled matrix with rows: (context, reward_1, ..., reward_k).
+    opt_vals: Vector of expected optimal (reward, action) for each context.
+  """
+
+  with tf.gfile.Open(file_name, 'r') as f:
+    contexts = np.loadtxt(f, skiprows=1)
+
+  if shuffle_rows:
+    np.random.shuffle(contexts)
+  contexts = contexts[:num_contexts, :]
+
+  betas = np.random.uniform(-1, 1, (context_dim, num_actions))
+  betas /= np.linalg.norm(betas, axis=0)
+
+  mean_rewards = np.dot(contexts, betas)
+  noise = np.random.normal(scale=sigma, size=mean_rewards.shape)
+  rewards = mean_rewards + noise
+
+  opt_actions = np.argmax(mean_rewards, axis=1)
+  opt_rewards = [mean_rewards[i, a] for i, a in enumerate(opt_actions)]
+  return np.hstack((contexts, rewards)), (np.array(opt_rewards), opt_actions)
+
+
+def sample_jester_data(file_name, context_dim, num_actions, num_contexts,
+                       shuffle_rows=True, shuffle_cols=False):
+  """Samples bandit game from (user, joke) dense subset of Jester dataset.
+
+  Args:
+    file_name: Route of file containing the modified Jester dataset.
+    context_dim: Context dimension (i.e. vector with some ratings from a user).
+    num_actions: Number of actions (number of joke ratings to predict).
+    num_contexts: Number of contexts to sample.
+    shuffle_rows: If True, rows from original dataset are shuffled.
+    shuffle_cols: Whether or not context/action jokes are randomly shuffled.
+
+  Returns:
+    dataset: Sampled matrix with rows: (context, rating_1, ..., rating_k).
+    opt_vals: Vector of deterministic optimal (reward, action) for each context.
+  """
+
+  with tf.gfile.Open(file_name, 'rb') as f:
+    dataset = np.load(f)
+
+  if shuffle_cols:
+    dataset = dataset[:, np.random.permutation(dataset.shape[1])]
+  if shuffle_rows:
+    np.random.shuffle(dataset)
+  dataset = dataset[:num_contexts, :]
+
+  assert context_dim + num_actions == dataset.shape[1], 'Wrong data dimensions.'
+
+  opt_actions = np.argmax(dataset[:, context_dim:], axis=1)
+  opt_rewards = np.array([dataset[i, context_dim + a]
+                          for i, a in enumerate(opt_actions)])
+
+  return dataset, (opt_rewards, opt_actions)
+
+
+def sample_statlog_data(file_name, num_contexts, shuffle_rows=True,
+                        remove_underrepresented=False):
+  """Returns bandit problem dataset based on the UCI statlog data.
+
+  Args:
+    file_name: Route of file containing the Statlog dataset.
+    num_contexts: Number of contexts to sample.
+    shuffle_rows: If True, rows from original dataset are shuffled.
+    remove_underrepresented: If True, removes arms with very few rewards.
+
+  Returns:
+    dataset: Sampled matrix with rows: (context, action rewards).
+    opt_vals: Vector of deterministic optimal (reward, action) for each context.
+
+  https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)
+  """
+
+  with tf.gfile.Open(file_name, 'r') as f:
+    data = np.loadtxt(f)
+
+  num_actions = 7  # some of the actions are very rarely optimal.
+
+  # Shuffle data
+  if shuffle_rows:
+    np.random.shuffle(data)
+  data = data[:num_contexts, :]
+
+  # Last column is label, rest are features
+  contexts = data[:, :-1]
+  labels = data[:, -1].astype(int) - 1  # convert to 0 based index
+
+  if remove_underrepresented:
+    contexts, labels = remove_underrepresented_classes(contexts, labels)
+
+  return classification_to_bandit_problem(contexts, labels, num_actions)
+
+
+def sample_adult_data(file_name, num_contexts, shuffle_rows=True,
+                      remove_underrepresented=False):
+  """Returns bandit problem dataset based on the UCI adult data.
+
+  Args:
+    file_name: Route of file containing the Adult dataset.
+    num_contexts: Number of contexts to sample.
+    shuffle_rows: If True, rows from original dataset are shuffled.
+    remove_underrepresented: If True, removes arms with very few rewards.
+
+  Returns:
+    dataset: Sampled matrix with rows: (context, action rewards).
+    opt_vals: Vector of deterministic optimal (reward, action) for each context.
+
+  Preprocessing:
+    * drop rows with missing values
+    * convert categorical variables to 1 hot encoding
+
+  https://archive.ics.uci.edu/ml/datasets/census+income
+  """
+  with tf.gfile.Open(file_name, 'r') as f:
+    df = pd.read_csv(f, header=None,
+                     na_values=[' ?']).dropna()
+
+  num_actions = 14
+
+  if shuffle_rows:
+    df = df.sample(frac=1)
+  df = df.iloc[:num_contexts, :]
+
+  labels = df[6].astype('category').cat.codes.as_matrix()
+  df = df.drop([6], axis=1)
+
+  # Convert categorical variables to 1 hot encoding
+  cols_to_transform = [1, 3, 5, 7, 8, 9, 13, 14]
+  df = pd.get_dummies(df, columns=cols_to_transform)
+
+  if remove_underrepresented:
+    df, labels = remove_underrepresented_classes(df, labels)
+  contexts = df.as_matrix()
+
+  return classification_to_bandit_problem(contexts, labels, num_actions)
+
+
+def sample_census_data(file_name, num_contexts, shuffle_rows=True,
+                       remove_underrepresented=False):
+  """Returns bandit problem dataset based on the UCI census data.
+
+  Args:
+    file_name: Route of file containing the Census dataset.
+    num_contexts: Number of contexts to sample.
+    shuffle_rows: If True, rows from original dataset are shuffled.
+    remove_underrepresented: If True, removes arms with very few rewards.
+
+  Returns:
+    dataset: Sampled matrix with rows: (context, action rewards).
+    opt_vals: Vector of deterministic optimal (reward, action) for each context.
+
+  Preprocessing:
+    * drop rows with missing labels
+    * convert categorical variables to 1 hot encoding
+
+  Note: this is the processed (not the 'raw') dataset. It contains a subset
+  of the raw features and they've all been discretized.
+
+  https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29
+  """
+  # Note: this dataset is quite large. It will be slow to load and preprocess.
+  with tf.gfile.Open(file_name, 'r') as f:
+    df = (pd.read_csv(f, header=0, na_values=['?'])
+          .dropna())
+
+  num_actions = 9
+
+  if shuffle_rows:
+    df = df.sample(frac=1)
+  df = df.iloc[:num_contexts, :]
+
+  # Assuming what the paper calls response variable is the label?
+  labels = df['dOccup'].astype('category').cat.codes.as_matrix()
+  # In addition to label, also drop the (unique?) key.
+  df = df.drop(['dOccup', 'caseid'], axis=1)
+
+  # All columns are categorical. Convert to 1 hot encoding.
+  df = pd.get_dummies(df, columns=df.columns)
+
+  if remove_underrepresented:
+    df, labels = remove_underrepresented_classes(df, labels)
+  contexts = df.as_matrix()
+
+  return classification_to_bandit_problem(contexts, labels, num_actions)
+
+
+def sample_covertype_data(file_name, num_contexts, shuffle_rows=True,
+                          remove_underrepresented=False):
+  """Returns bandit problem dataset based on the UCI Cover_Type data.
+
+  Args:
+    file_name: Route of file containing the Covertype dataset.
+    num_contexts: Number of contexts to sample.
+    shuffle_rows: If True, rows from original dataset are shuffled.
+    remove_underrepresented: If True, removes arms with very few rewards.
+
+  Returns:
+    dataset: Sampled matrix with rows: (context, action rewards).
+    opt_vals: Vector of deterministic optimal (reward, action) for each context.
+
+  Preprocessing:
+    * drop rows with missing labels
+    * convert categorical variables to 1 hot encoding
+
+  https://archive.ics.uci.edu/ml/datasets/Covertype
+  """
+  with tf.gfile.Open(file_name, 'r') as f:
+    df = (pd.read_csv(f, header=0, na_values=['?'])
+          .dropna())
+
+  num_actions = 7
+
+  if shuffle_rows:
+    df = df.sample(frac=1)
+  df = df.iloc[:num_contexts, :]
+
+  # Assuming what the paper calls response variable is the label?
+  # Last column is label.
+  labels = df[df.columns[-1]].astype('category').cat.codes.as_matrix()
+  df = df.drop([df.columns[-1]], axis=1)
+
+  # All columns are either quantitative or already converted to 1 hot.
+  if remove_underrepresented:
+    df, labels = remove_underrepresented_classes(df, labels)
+  contexts = df.as_matrix()
+
+  return classification_to_bandit_problem(contexts, labels, num_actions)
+
+
+def classification_to_bandit_problem(contexts, labels, num_actions=None):
+  """Normalize contexts and encode deterministic rewards."""
+
+  if num_actions is None:
+    num_actions = np.max(labels) + 1
+  num_contexts = contexts.shape[0]
+
+  # Due to random subsampling in small problems, some features may be constant
+  sstd = safe_std(np.std(contexts, axis=0, keepdims=True)[0, :])
+
+  # Normalize features
+  contexts = ((contexts - np.mean(contexts, axis=0, keepdims=True)) / sstd)
+
+  # One hot encode labels as rewards
+  rewards = np.zeros((num_contexts, num_actions))
+  rewards[np.arange(num_contexts), labels] = 1.0
+
+  return contexts, rewards, (np.ones(num_contexts), labels)
+
+
+def safe_std(values):
+  """Remove zero std values for ones."""
+  return np.array([val if val != 0.0 else 1.0 for val in values])
+
+
+def remove_underrepresented_classes(features, labels, thresh=0.0005):
+  """Removes classes when number of datapoints fraction is below a threshold."""
+
+  # Threshold doesn't seem to agree with https://arxiv.org/pdf/1706.04687.pdf
+  # Example: for Covertype, they report 4 classes after filtering, we get 7?
+  total_count = labels.shape[0]
+  unique, counts = np.unique(labels, return_counts=True)
+  ratios = counts.astype('float') / total_count
+  vals_and_ratios = dict(zip(unique, ratios))
+  print('Unique classes and their ratio of total: %s' % vals_and_ratios)
+  keep = [vals_and_ratios[v] >= thresh for v in labels]
+  return features[keep], labels[np.array(keep)]
--- a/research/deep_contextual_bandits/bandits/data/synthetic_data_sampler.py
+++ b/research/deep_contextual_bandits/bandits/data/synthetic_data_sampler.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Several functions to sample contextual data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def sample_contextual_data(num_contexts, dim_context, num_actions, sigma):
+  """Samples independent Gaussian data.
+
+  There is nothing to learn here as the rewards do not depend on the context.
+
+  Args:
+    num_contexts: Number of contexts to sample.
+    dim_context: Dimension of the contexts.
+    num_actions: Number of arms for the multi-armed bandit.
+    sigma: Standard deviation of the independent Gaussian samples.
+
+  Returns:
+    data: A [num_contexts, dim_context + num_actions] numpy array with the data.
+  """
+  size_data = [num_contexts, dim_context + num_actions]
+  return np.random.normal(scale=sigma, size=size_data)
+
+
+def sample_linear_data(num_contexts, dim_context, num_actions, sigma=0.0):
+  """Samples data from linearly parameterized arms.
+
+  The reward for context X and arm j is given by X^T beta_j, for some latent
+  set of parameters {beta_j : j = 1, ..., k}. The beta's are sampled uniformly
+  at random, the contexts are Gaussian, and sigma-noise is added to the rewards.
+
+  Args:
+    num_contexts: Number of contexts to sample.
+    dim_context: Dimension of the contexts.
+    num_actions: Number of arms for the multi-armed bandit.
+    sigma: Standard deviation of the additive noise. Set to zero for no noise.
+
+  Returns:
+    data: A [n, d+k] numpy array with the data.
+    betas: Latent parameters that determine expected reward for each arm.
+    opt: (optimal_rewards, optimal_actions) for all contexts.
+  """
+
+  betas = np.random.uniform(-1, 1, (dim_context, num_actions))
+  betas /= np.linalg.norm(betas, axis=0)
+  contexts = np.random.normal(size=[num_contexts, dim_context])
+  rewards = np.dot(contexts, betas)
+  opt_actions = np.argmax(rewards, axis=1)
+  rewards += np.random.normal(scale=sigma, size=rewards.shape)
+  opt_rewards = np.array([rewards[i, act] for i, act in enumerate(opt_actions)])
+  return np.hstack((contexts, rewards)), betas, (opt_rewards, opt_actions)
+
+
+def sample_sparse_linear_data(num_contexts, dim_context, num_actions,
+                              sparse_dim, sigma=0.0):
+  """Samples data from sparse linearly parameterized arms.
+
+  The reward for context X and arm j is given by X^T beta_j, for some latent
+  set of parameters {beta_j : j = 1, ..., k}. The beta's are sampled uniformly
+  at random, the contexts are Gaussian, and sigma-noise is added to the rewards.
+  Only s components out of d are non-zero for each arm's beta.
+
+  Args:
+    num_contexts: Number of contexts to sample.
+    dim_context: Dimension of the contexts.
+    num_actions: Number of arms for the multi-armed bandit.
+    sparse_dim: Dimension of the latent subspace (sparsity pattern dimension).
+    sigma: Standard deviation of the additive noise. Set to zero for no noise.
+
+  Returns:
+    data: A [num_contexts, dim_context+num_actions] numpy array with the data.
+    betas: Latent parameters that determine expected reward for each arm.
+    opt: (optimal_rewards, optimal_actions) for all contexts.
+  """
+
+  flatten = lambda l: [item for sublist in l for item in sublist]
+  sparse_pattern = flatten(
+      [[(j, i) for j in np.random.choice(range(dim_context),
+                                         sparse_dim,
+                                         replace=False)]
+       for i in range(num_actions)])
+  betas = np.random.uniform(-1, 1, (dim_context, num_actions))
+  mask = np.zeros((dim_context, num_actions))
+  for elt in sparse_pattern:
+    mask[elt] = 1
+  betas = np.multiply(betas, mask)
+  betas /= np.linalg.norm(betas, axis=0)
+  contexts = np.random.normal(size=[num_contexts, dim_context])
+  rewards = np.dot(contexts, betas)
+  opt_actions = np.argmax(rewards, axis=1)
+  rewards += np.random.normal(scale=sigma, size=rewards.shape)
+  opt_rewards = np.array([rewards[i, act] for i, act in enumerate(opt_actions)])
+  return np.hstack((contexts, rewards)), betas, (opt_rewards, opt_actions)
+
+
+def sample_wheel_bandit_data(num_contexts, delta, mean_v, std_v,
+                             mu_large, std_large):
+  """Samples from Wheel bandit game (see https://arxiv.org/abs/1802.09127).
+
+  Args:
+    num_contexts: Number of points to sample, i.e. (context, action rewards).
+    delta: Exploration parameter: high reward in one region if norm above delta.
+    mean_v: Mean reward for each action if context norm is below delta.
+    std_v: Gaussian reward std for each action if context norm is below delta.
+    mu_large: Mean reward for optimal action if context norm is above delta.
+    std_large: Reward std for optimal action if context norm is above delta.
+
+  Returns:
+    dataset: Sampled matrix with n rows: (context, action rewards).
+    opt_vals: Vector of expected optimal (reward, action) for each context.
+  """
+
+  context_dim = 2
+  num_actions = 5
+
+  data = []
+  rewards = []
+  opt_actions = []
+  opt_rewards = []
+
+  # sample uniform contexts in unit ball
+  while len(data) < num_contexts:
+    raw_data = np.random.uniform(-1, 1, (int(num_contexts / 3), context_dim))
+
+    for i in range(raw_data.shape[0]):
+      if np.linalg.norm(raw_data[i, :]) <= 1:
+        data.append(raw_data[i, :])
+
+  contexts = np.stack(data)[:num_contexts, :]
+
+  # sample rewards
+  for i in range(num_contexts):
+    r = [np.random.normal(mean_v[j], std_v[j]) for j in range(num_actions)]
+    if np.linalg.norm(contexts[i, :]) >= delta:
+      # large reward in the right region for the context
+      r_big = np.random.normal(mu_large, std_large)
+      if contexts[i, 0] > 0:
+        if contexts[i, 1] > 0:
+          r[0] = r_big
+          opt_actions.append(0)
+        else:
+          r[1] = r_big
+          opt_actions.append(1)
+      else:
+        if contexts[i, 1] > 0:
+          r[2] = r_big
+          opt_actions.append(2)
+        else:
+          r[3] = r_big
+          opt_actions.append(3)
+    else:
+      opt_actions.append(np.argmax(mean_v))
+
+    opt_rewards.append(r[opt_actions[-1]])
+    rewards.append(r)
+
+  rewards = np.stack(rewards)
+  opt_rewards = np.array(opt_rewards)
+  opt_actions = np.array(opt_actions)
+
+  return np.hstack((contexts, rewards)), (opt_rewards, opt_actions)
--- a/research/deep_contextual_bandits/example_main.py
+++ b/research/deep_contextual_bandits/example_main.py