Merge pull request #4871 from rikel/master

Deep Contextual Bandits code for tensorflow/models

Merge pull request #4871 from rikel/master
Deep Contextual Bandits code for tensorflow/models
05ec6d87 · Lukasz Kaiser · GitHub · ae8e0f53 · 6b5d9233 · 05ec6d87
Unverified Commit 05ec6d87 authored Jul 23, 2018 by Lukasz Kaiser Committed by GitHub Jul 23, 2018
20 changed files
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -10,6 +10,7 @@
 /research/brain_coder/ @danabo
 /research/cognitive_mapping_and_planning/ @s-gupta
 /research/compression/ @nmjohn
+/research/deep_contextual_bandits/ @rikel
 /research/deeplab/ @aquariusjay @yknzhu @gpapan
 /research/delf/ @andrefaraujo
 /research/differential_privacy/ @ilyamironov @ananthr

--- a/research/README.md
+++ b/research/README.md
@@ -22,6 +22,7 @@ request.
    for visual navigation.
 -   [compression](compression): compressing and decompressing images using a
    pre-trained Residual GRU network.
+-   [deep_contextual_bandits](deep_contextual_bandits): code for a variety of contextual bandits algorithms using deep neural networks and Thompson sampling.
 -   [deep_speech](deep_speech): automatic speech recognition.
 -   [deeplab](deeplab): deep labeling for semantic image segmentation.
 -   [delf](delf): deep local features for image matching and retrieval.

--- a/research/deep_contextual_bandits/README.md
+++ b/research/deep_contextual_bandits/README.md
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/bb_alpha_divergence_model.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/bb_alpha_divergence_model.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/bf_variational_neural_bandit_model.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/bf_variational_neural_bandit_model.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/bootstrapped_bnn_sampling.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/bootstrapped_bnn_sampling.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/fixed_policy_sampling.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/fixed_policy_sampling.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/linear_full_posterior_sampling.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/linear_full_posterior_sampling.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/multitask_gp.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/multitask_gp.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/neural_bandit_model.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/neural_bandit_model.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/neural_linear_sampling.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/neural_linear_sampling.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/parameter_noise_sampling.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/parameter_noise_sampling.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/posterior_bnn_sampling.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/posterior_bnn_sampling.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/uniform_sampling.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/uniform_sampling.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/__pycache__/variational_neural_bandit_model.cpython-36.pyc
+++ b/research/deep_contextual_bandits/bandits/algorithms/__pycache__/variational_neural_bandit_model.cpython-36.pyc
--- a/research/deep_contextual_bandits/bandits/algorithms/bb_alpha_divergence_model.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/bb_alpha_divergence_model.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bayesian NN using expectation propagation (Black-Box Alpha-Divergence).
+See https://arxiv.org/abs/1511.03243 for details.
+All formulas used in this implementation are derived in:
+https://www.overleaf.com/12837696kwzjxkyhdytk#/49028744/.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+import numpy as np
+import tensorflow as tf
+from absl import flags
+from bandits.core.bayesian_nn import BayesianNN
+FLAGS = flags.FLAGS
+tfd = tf.contrib.distributions  # update to: tensorflow_probability.distributions
+def log_gaussian(x, mu, sigma, reduce_sum=True):
+  res = tfd.Normal(mu, sigma).log_prob(x)
+  if reduce_sum:
+    return tf.reduce_sum(res)
+  else:
+    return res
+class BBAlphaDivergence(BayesianNN):
+  """Implements an approximate Bayesian NN via Black-Box Alpha-Divergence."""
+  def __init__(self, hparams, name):
+    self.name = name
+    self.hparams = hparams
+    self.alpha = getattr(self.hparams, 'alpha', 1.0)
+    self.num_mc_nn_samples = getattr(self.hparams, 'num_mc_nn_samples', 10)
+    self.n_in = self.hparams.context_dim
+    self.n_out = self.hparams.num_actions
+    self.layers = self.hparams.layer_sizes
+    self.batch_size = self.hparams.batch_size
+    self.show_training = self.hparams.show_training
+    self.freq_summary = self.hparams.freq_summary
+    self.verbose = getattr(self.hparams, 'verbose', True)
+    self.cleared_times_trained = self.hparams.cleared_times_trained
+    self.initial_training_steps = self.hparams.initial_training_steps
+    self.training_schedule = np.linspace(self.initial_training_steps,
+                                         self.hparams.training_epochs,
+                                         self.cleared_times_trained)
+    self.times_trained = 0
+    self.initialize_model()
+  def initialize_model(self):
+    """Builds and initialize the model."""
+    self.num_w = 0
+    self.num_b = 0
+    self.weights_m = {}
+    self.weights_std = {}
+    self.biases_m = {}
+    self.biases_std = {}
+    self.h_max_var = []
+    if self.hparams.use_sigma_exp_transform:
+      self.sigma_transform = tfd.bijectors.Exp()
+    else:
+      self.sigma_transform = tfd.bijectors.Softplus()
+    # Build the graph corresponding to the Bayesian NN instance.
+    self.graph = tf.Graph()
+    with self.graph.as_default():
+      self.sess = tf.Session()
+      self.x = tf.placeholder(shape=[None, self.n_in],
+                              dtype=tf.float32, name='x')
+      self.y = tf.placeholder(shape=[None, self.n_out],
+                              dtype=tf.float32, name='y')
+      self.weights = tf.placeholder(shape=[None, self.n_out],
+                                    dtype=tf.float32, name='w')
+      self.data_size = tf.placeholder(tf.float32, shape=(), name='data_size')
+      self.prior_variance = self.hparams.prior_variance
+      if self.prior_variance < 0:
+        # if not fixed, we learn the prior.
+        self.prior_variance = self.sigma_transform.forward(
+            self.build_mu_variable([1, 1]))
+      self.build_model()
+      self.sess.run(tf.global_variables_initializer())
+  def build_mu_variable(self, shape):
+    """Returns a mean variable initialized as N(0, 0.05)."""
+    return tf.Variable(tf.random_normal(shape, 0.0, 0.05))
+  def build_sigma_variable(self, shape, init=-5.):
+    """Returns a sigma variable initialized as N(init, 0.05)."""
+    # Initialize sigma to be very small initially to encourage MAP opt first
+    return tf.Variable(tf.random_normal(shape, init, 0.05))
+  def build_layer(self, input_x, shape, layer_id, activation_fn=tf.nn.relu):
+    """Builds a layer with N(mean, std) for each weight, and samples from it."""
+    w_mu = self.build_mu_variable(shape)
+    w_sigma = self.sigma_transform.forward(self.build_sigma_variable(shape))
+    w_noise = tf.random_normal(shape)
+    w = w_mu + w_sigma * w_noise
+    b_mu = self.build_mu_variable([1, shape[1]])
+    b_sigma = self.sigma_transform.forward(
+        self.build_sigma_variable([1, shape[1]]))
+    b_noise = tf.random_normal([1, shape[1]])
+    b = b_mu + b_sigma * b_noise
+    # Create outputs
+    output_h = activation_fn(tf.matmul(input_x, w) + b)
+    # Store means and stds
+    self.weights_m[layer_id] = w_mu
+    self.weights_std[layer_id] = w_sigma
+    self.biases_m[layer_id] = b_mu
+    self.biases_std[layer_id] = b_sigma
+    return output_h
+  def sample_neural_network(self, activation_fn=tf.nn.relu):
+    """Samples a nn from posterior, computes data log lk and log f factor."""
+    with self.graph.as_default():
+      log_f = 0
+      n = self.data_size
+      input_x = self.x
+      for layer_id in range(self.total_layers):
+        # load mean and std of each weight
+        w_mu = self.weights_m[layer_id]
+        w_sigma = self.weights_std[layer_id]
+        b_mu = self.biases_m[layer_id]
+        b_sigma = self.biases_std[layer_id]
+        # sample weights from Gaussian distribution
+        shape = w_mu.shape
+        w_noise = tf.random_normal(shape)
+        b_noise = tf.random_normal([1, int(shape[1])])
+        w = w_mu + w_sigma * w_noise
+        b = b_mu + b_sigma * b_noise
+        # compute contribution to log_f
+        t1 = w * w_mu / (n * w_sigma ** 2)
+        t2 = (0.5 * w ** 2 / n) * (1 / self.prior_variance - 1 / w_sigma ** 2)
+        log_f += tf.reduce_sum(t1 + t2)
+        t1 = b * b_mu / (n * b_sigma ** 2)
+        t2 = (0.5 * b ** 2 / n) * (1 / self.prior_variance - 1 / b_sigma ** 2)
+        log_f += tf.reduce_sum(t1 + t2)
+        if layer_id < self.total_layers - 1:
+          output_h = activation_fn(tf.matmul(input_x, w) + b)
+        else:
+          output_h = tf.matmul(input_x, w) + b
+        input_x = output_h
+      # compute log likelihood of the observed reward under the sampled nn
+      log_likelihood = log_gaussian(
+          self.y, output_h, self.noise_sigma, reduce_sum=False)
+      weighted_log_likelihood = tf.reduce_sum(log_likelihood * self.weights, -1)
+    return log_f, weighted_log_likelihood
+  def log_z_q(self):
+    """Computes log-partition function of current posterior parameters."""
+    with self.graph.as_default():
+      log_z_q = 0
+      for layer_id in range(self.total_layers):
+        w_mu = self.weights_m[layer_id]
+        w_sigma = self.weights_std[layer_id]
+        b_mu = self.biases_m[layer_id]
+        b_sigma = self.biases_std[layer_id]
+        w_term = 0.5 * tf.reduce_sum(w_mu ** 2 / w_sigma ** 2)
+        w_term += 0.5 * tf.reduce_sum(tf.log(2 * np.pi) + 2 * tf.log(w_sigma))
+        b_term = 0.5 * tf.reduce_sum(b_mu ** 2 / b_sigma ** 2)
+        b_term += 0.5 * tf.reduce_sum(tf.log(2 * np.pi) + 2 * tf.log(b_sigma))
+        log_z_q += w_term + b_term
+      return log_z_q
+  def log_z_prior(self):
+    """Computes log-partition function of the prior parameters."""
+    num_params = self.num_w + self.num_b
+    return num_params * 0.5 * tf.log(2 * np.pi * self.prior_variance)
+  def log_alpha_likelihood_ratio(self, activation_fn=tf.nn.relu):
+    # each nn sample returns (log f, log likelihoods)
+    nn_samples = [
+        self.sample_neural_network(activation_fn)
+        for _ in range(self.num_mc_nn_samples)
+    ]
+    nn_log_f_samples = [elt[0] for elt in nn_samples]
+    nn_log_lk_samples = [elt[1] for elt in nn_samples]
+    # we stack the (log f, log likelihoods) from the k nn samples
+    nn_log_f_stack = tf.stack(nn_log_f_samples)      # k x 1
+    nn_log_lk_stack = tf.stack(nn_log_lk_samples)    # k x N
+    nn_f_tile = tf.tile(nn_log_f_stack, [self.batch_size])
+    nn_f_tile = tf.reshape(nn_f_tile,
+                           [self.num_mc_nn_samples, self.batch_size])
+    # now both the log f and log likelihood terms have shape: k x N
+    # apply formula in https://www.overleaf.com/12837696kwzjxkyhdytk#/49028744/
+    nn_log_ratio = nn_log_lk_stack - nn_f_tile
+    nn_log_ratio = self.alpha * tf.transpose(nn_log_ratio)
+    logsumexp_value = tf.reduce_logsumexp(nn_log_ratio, -1)
+    log_k_scalar = tf.log(tf.cast(self.num_mc_nn_samples, tf.float32))
+    log_k = log_k_scalar * tf.ones([self.batch_size])
+    return tf.reduce_sum(logsumexp_value - log_k, -1)
+  def build_model(self, activation_fn=tf.nn.relu):
+    """Defines the actual NN model with fully connected layers.
+    Args:
+      activation_fn: Activation function for the neural network.
+    The loss is computed for partial feedback settings (bandits), so only
+    the observed outcome is backpropagated (see weighted loss).
+    Selects the optimizer and, finally, it also initializes the graph.
+    """
+    print('Initializing model {}.'.format(self.name))
+    # Build terms for the noise sigma estimation for each action.
+    noise_sigma_mu = (self.build_mu_variable([1, self.n_out])
+                      + self.sigma_transform.inverse(self.hparams.noise_sigma))
+    noise_sigma_sigma = self.sigma_transform.forward(
+        self.build_sigma_variable([1, self.n_out]))
+    pre_noise_sigma = noise_sigma_mu + tf.random_normal(
+        [1, self.n_out]) * noise_sigma_sigma
+    self.noise_sigma = self.sigma_transform.forward(pre_noise_sigma)
+    # Build network
+    input_x = self.x
+    n_in = self.n_in
+    self.total_layers = len(self.layers) + 1
+    if self.layers[0] == 0:
+      self.total_layers = 1
+    for l_number, n_nodes in enumerate(self.layers):
+      if n_nodes > 0:
+        h = self.build_layer(input_x, [n_in, n_nodes], l_number)
+        input_x = h
+        n_in = n_nodes
+        self.num_w += n_in * n_nodes
+        self.num_b += n_nodes
+    self.y_pred = self.build_layer(input_x, [n_in, self.n_out],
+                                   self.total_layers - 1,
+                                   activation_fn=lambda x: x)
+    # Compute energy function based on sampled nn's
+    log_coeff = self.data_size / (self.batch_size * self.alpha)
+    log_ratio = log_coeff * self.log_alpha_likelihood_ratio(activation_fn)
+    logzprior = self.log_z_prior()
+    logzq = self.log_z_q()
+    energy = logzprior - logzq - log_ratio
+    self.loss = energy
+    self.global_step = tf.train.get_or_create_global_step()
+    self.train_op = tf.train.AdamOptimizer(self.hparams.initial_lr).minimize(
+        self.loss, global_step=self.global_step)
+    # Useful for debugging
+    sq_loss = tf.squared_difference(self.y_pred, self.y)
+    weighted_sq_loss = self.weights * sq_loss
+    self.cost = tf.reduce_sum(weighted_sq_loss) / self.batch_size
+    # Create tensorboard metrics
+    self.create_summaries()
+    self.summary_writer = tf.summary.FileWriter('{}/graph_{}'.format(
+        FLAGS.logdir, self.name), self.sess.graph)
+  def create_summaries(self):
+    tf.summary.scalar('loss', self.loss)
+    tf.summary.scalar('cost', self.cost)
+    self.summary_op = tf.summary.merge_all()
+  def assign_lr(self):
+    """Resets the learning rate in dynamic schedules for subsequent trainings.
+    In bandits settings, we do expand our dataset over time. Then, we need to
+    re-train the network with the new data. Those algorithms that do not keep
+    the step constant, can reset it at the start of each training process.
+    """
+    decay_steps = 1
+    if self.hparams.activate_decay:
+      current_gs = self.sess.run(self.global_step)
+      with self.graph.as_default():
+        self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
+                                              self.global_step - current_gs,
+                                              decay_steps,
+                                              self.hparams.lr_decay_rate)
+  def train(self, data, num_steps):
+    """Trains the BNN for num_steps, using the data in 'data'.
+    Args:
+      data: ContextualDataset object that provides the data.
+      num_steps: Number of minibatches to train the network for.
+    """
+    if self.times_trained < self.cleared_times_trained:
+      num_steps = int(self.training_schedule[self.times_trained])
+    self.times_trained += 1
+    if self.verbose:
+      print('Training {} for {} steps...'.format(self.name, num_steps))
+    with self.graph.as_default():
+      for step in range(num_steps):
+        x, y, w = data.get_batch_with_weights(self.hparams.batch_size)
+        _, summary, global_step, loss = self.sess.run(
+            [self.train_op, self.summary_op, self.global_step, self.loss],
+            feed_dict={self.x: x, self.y: y, self.weights: w,
+                       self.data_size: data.num_points()})
+        weights_l = self.sess.run(self.weights_std[0])
+        self.h_max_var.append(np.max(weights_l))
+        if step % self.freq_summary == 0:
+          if self.show_training:
+            print('step: {}, loss: {}'.format(step, loss))
+            sys.stdout.flush()
+          self.summary_writer.add_summary(summary, global_step)
--- a/research/deep_contextual_bandits/bandits/algorithms/bf_variational_neural_bandit_model.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/bf_variational_neural_bandit_model.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bayesian NN using factorized VI (Bayes By Backprop. Blundell et al. 2014).
+See https://arxiv.org/abs/1505.05424 for details.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import tensorflow as tf
+# import tensorflow_probability as tfp
+from absl import flags
+from bandits.core.bayesian_nn import BayesianNN
+FLAGS = flags.FLAGS
+# tfd = tfp.distributions
+tfd = tf.contrib.distributions
+tfl = tf.contrib.layers
+def log_gaussian(x, mu, sigma, reduce_sum=True):
+  """Returns log Gaussian pdf."""
+  res = tfd.Normal(mu, sigma).log_prob(x)
+  if reduce_sum:
+    return tf.reduce_sum(res)
+  else:
+    return res
+def analytic_kl(mu_1, sigma_1, mu_2, sigma_2):
+  """KL for two Gaussian distributions with diagonal covariance matrix."""
+  kl = tfd.kl_divergence(tfd.MVNDiag(mu_1, sigma_1), tfd.MVNDiag(mu_2, sigma_2))
+  return kl
+class BfVariationalNeuralBanditModel(BayesianNN):
+  """Implements an approximate Bayesian NN using Variational Inference."""
+  def __init__(self, hparams, name="BBBNN"):
+    self.name = name
+    self.hparams = hparams
+    self.n_in = self.hparams.context_dim
+    self.n_out = self.hparams.num_actions
+    self.layers = self.hparams.layer_sizes
+    self.init_scale = self.hparams.init_scale
+    self.f_num_points = None
+    if "f_num_points" in hparams:
+      self.f_num_points = self.hparams.f_num_points
+    self.cleared_times_trained = self.hparams.cleared_times_trained
+    self.initial_training_steps = self.hparams.initial_training_steps
+    self.training_schedule = np.linspace(self.initial_training_steps,
+                                         self.hparams.training_epochs,
+                                         self.cleared_times_trained)
+    self.verbose = getattr(self.hparams, "verbose", True)
+    self.weights_m = {}
+    self.weights_std = {}
+    self.biases_m = {}
+    self.biases_std = {}
+    self.times_trained = 0
+    if self.hparams.use_sigma_exp_transform:
+      self.sigma_transform = tf.exp
+      self.inverse_sigma_transform = np.log
+    else:
+      self.sigma_transform = tf.nn.softplus
+      self.inverse_sigma_transform = lambda y: y + np.log(1. - np.exp(-y))
+    # Whether to use the local reparameterization trick to compute the loss.
+    # See details in https://arxiv.org/abs/1506.02557
+    self.use_local_reparameterization = True
+    self.build_graph()
+  def build_mu_variable(self, shape):
+    """Returns a mean variable initialized as N(0, 0.05)."""
+    return tf.Variable(tf.random_normal(shape, 0.0, 0.05))
+  def build_sigma_variable(self, shape, init=-5.):
+    """Returns a sigma variable initialized as N(init, 0.05)."""
+    # Initialize sigma to be very small initially to encourage MAP opt first
+    return tf.Variable(tf.random_normal(shape, init, 0.05))
+  def build_layer(self, input_x, input_x_local, shape,
+                  layer_id, activation_fn=tf.nn.relu):
+    """Builds a variational layer, and computes KL term.
+    Args:
+      input_x: Input to the variational layer.
+      input_x_local: Input when the local reparameterization trick was applied.
+      shape: [number_inputs, number_outputs] for the layer.
+      layer_id: Number of layer in the architecture.
+      activation_fn: Activation function to apply.
+    Returns:
+      output_h: Output of the variational layer.
+      output_h_local: Output when local reparameterization trick was applied.
+      neg_kl: Negative KL term for the layer.
+    """
+    w_mu = self.build_mu_variable(shape)
+    w_sigma = self.sigma_transform(self.build_sigma_variable(shape))
+    w_noise = tf.random_normal(shape)
+    w = w_mu + w_sigma * w_noise
+    b_mu = self.build_mu_variable([1, shape[1]])
+    b_sigma = self.sigma_transform(self.build_sigma_variable([1, shape[1]]))
+    b = b_mu
+    # Store means and stds
+    self.weights_m[layer_id] = w_mu
+    self.weights_std[layer_id] = w_sigma
+    self.biases_m[layer_id] = b_mu
+    self.biases_std[layer_id] = b_sigma
+    # Create outputs
+    output_h = activation_fn(tf.matmul(input_x, w) + b)
+    if self.use_local_reparameterization:
+      # Use analytic KL divergence wrt the prior
+      neg_kl = -analytic_kl(w_mu, w_sigma,
+                            0., tf.to_float(np.sqrt(2./shape[0])))
+    else:
+      # Create empirical KL loss terms
+      log_p = log_gaussian(w, 0., tf.to_float(np.sqrt(2./shape[0])))
+      log_q = log_gaussian(w, tf.stop_gradient(w_mu), tf.stop_gradient(w_sigma))
+      neg_kl = log_p - log_q
+    # Apply local reparameterization trick: sample activations pre nonlinearity
+    m_h = tf.matmul(input_x_local, w_mu) + b
+    v_h = tf.matmul(tf.square(input_x_local), tf.square(w_sigma))
+    output_h_local = m_h + tf.sqrt(v_h + 1e-6) * tf.random_normal(tf.shape(v_h))
+    output_h_local = activation_fn(output_h_local)
+    return output_h, output_h_local, neg_kl
+  def build_action_noise(self):
+    """Defines a model for additive noise per action, and its KL term."""
+    # Define mean and std variables (log-normal dist) for each action.
+    noise_sigma_mu = (self.build_mu_variable([1, self.n_out])
+                      + self.inverse_sigma_transform(self.hparams.noise_sigma))
+    noise_sigma_sigma = self.sigma_transform(
+        self.build_sigma_variable([1, self.n_out]))
+    pre_noise_sigma = (noise_sigma_mu
+                       + tf.random_normal([1, self.n_out]) * noise_sigma_sigma)
+    self.noise_sigma = self.sigma_transform(pre_noise_sigma)
+    # Compute KL for additive noise sigma terms.
+    if getattr(self.hparams, "infer_noise_sigma", False):
+      neg_kl_term = log_gaussian(
+          pre_noise_sigma,
+          self.inverse_sigma_transform(self.hparams.noise_sigma),
+          self.hparams.prior_sigma
+      )
+      neg_kl_term -= log_gaussian(pre_noise_sigma,
+                                  noise_sigma_mu,
+                                  noise_sigma_sigma)
+    else:
+      neg_kl_term = 0.
+    return neg_kl_term
+  def build_model(self, activation_fn=tf.nn.relu):
+    """Defines the actual NN model with fully connected layers.
+    The loss is computed for partial feedback settings (bandits), so only
+    the observed outcome is backpropagated (see weighted loss).
+    Selects the optimizer and, finally, it also initializes the graph.
+    Args:
+      activation_fn: the activation function used in the nn layers.
+    """
+    def weight_prior(dtype, shape, c, d, e):
+      del c, d, e
+      return tfd.Independent(
+          tfd.Normal(loc=tf.zeros(shape, dtype),
+                     scale=tf.to_float(np.sqrt(2) / shape[0])),
+          reinterpreted_batch_ndims=tf.size(shape))
+    if self.verbose:
+      print("Initializing model {}.".format(self.name))
+    # Compute model additive noise for each action with log-normal distribution
+    neg_kl_term = self.build_action_noise()
+    # Build variational network using self.x as input.
+    input_x = self.x
+    # Create Keras model using DenseLocalReparameterization (prior N(0, 1)).
+    model_layers = [
+        tfl.DenseLocalReparameterization(
+            n_nodes,
+            activation=tf.nn.relu,
+            kernel_prior_fn=weight_prior
+        )
+        for n_nodes in self.layers if n_nodes > 0
+    ]
+    output_layer = tfl.DenseLocalReparameterization(
+        self.n_out,
+        activation=lambda x: x,
+        kernel_prior_fn=weight_prior
+    )
+    model_layers.append(output_layer)
+    model = tf.keras.Sequential(model_layers)
+    self.y_pred = model(input_x)
+    # Compute KL term
+    neg_kl_term -= tf.add_n(model.losses)
+    # Compute log likelihood (with learned or fixed noise level)
+    if getattr(self.hparams, "infer_noise_sigma", False):
+      log_likelihood = log_gaussian(
+          self.y, self.y_pred, self.noise_sigma, reduce_sum=False)
+    else:
+      log_likelihood = log_gaussian(
+          self.y, self.y_pred, self.hparams.noise_sigma, reduce_sum=False)
+    # Only take into account observed outcomes (bandits setting)
+    batch_size = tf.to_float(tf.shape(self.x)[0])
+    weighted_log_likelihood = tf.reduce_sum(
+        log_likelihood * self.weights) / batch_size
+    # The objective is 1/n * (\sum_i log_like_i - KL); neg_kl_term estimates -KL
+    elbo = weighted_log_likelihood + (neg_kl_term / self.n)
+    self.loss = -elbo
+    self.global_step = tf.train.get_or_create_global_step()
+    self.train_op = tf.train.AdamOptimizer(self.hparams.initial_lr).minimize(
+        self.loss, global_step=self.global_step)
+    # Create tensorboard metrics
+    self.create_summaries()
+    self.summary_writer = tf.summary.FileWriter(
+        "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)
+  def build_graph(self):
+    """Defines graph, session, placeholders, and model.
+    Placeholders are: n (size of the dataset), x and y (context and observed
+    reward for each action), and weights (one-hot encoding of selected action
+    for each context, i.e., only possibly non-zero element in each y).
+    """
+    self.graph = tf.Graph()
+    with self.graph.as_default():
+      self.sess = tf.Session()
+      self.n = tf.placeholder(shape=[], dtype=tf.float32)
+      self.x = tf.placeholder(shape=[None, self.n_in], dtype=tf.float32)
+      self.y = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)
+      self.weights = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)
+      self.build_model()
+      self.sess.run(tf.global_variables_initializer())
+  def create_summaries(self):
+    """Defines summaries including mean loss, and global step."""
+    with self.graph.as_default():
+      with tf.name_scope(self.name + "_summaries"):
+        tf.summary.scalar("loss", self.loss)
+        tf.summary.scalar("global_step", self.global_step)
+        self.summary_op = tf.summary.merge_all()
+  def assign_lr(self):
+    """Resets the learning rate in dynamic schedules for subsequent trainings.
+    In bandits settings, we do expand our dataset over time. Then, we need to
+    re-train the network with the new data. The algorithms that do not keep
+    the step constant, can reset it at the start of each *training* process.
+    """
+    decay_steps = 1
+    if self.hparams.activate_decay:
+      current_gs = self.sess.run(self.global_step)
+      with self.graph.as_default():
+        self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
+                                              self.global_step - current_gs,
+                                              decay_steps,
+                                              self.hparams.lr_decay_rate)
+  def train(self, data, num_steps):
+    """Trains the BNN for num_steps, using the data in 'data'.
+    Args:
+      data: ContextualDataset object that provides the data.
+      num_steps: Number of minibatches to train the network for.
+    Returns:
+      losses: Loss history during training.
+    """
+    if self.times_trained < self.cleared_times_trained:
+      num_steps = int(self.training_schedule[self.times_trained])
+    self.times_trained += 1
+    losses = []
+    with self.graph.as_default():
+      if self.verbose:
+        print("Training {} for {} steps...".format(self.name, num_steps))
+      for step in range(num_steps):
+        x, y, weights = data.get_batch_with_weights(self.hparams.batch_size)
+        _, summary, global_step, loss = self.sess.run(
+            [self.train_op, self.summary_op, self.global_step, self.loss],
+            feed_dict={
+                self.x: x,
+                self.y: y,
+                self.weights: weights,
+                self.n: data.num_points(self.f_num_points),
+            })
+        losses.append(loss)
+        if step % self.hparams.freq_summary == 0:
+          if self.hparams.show_training:
+            print("{} | step: {}, loss: {}".format(
+                self.name, global_step, loss))
+          self.summary_writer.add_summary(summary, global_step)
+    return losses
--- a/research/deep_contextual_bandits/bandits/algorithms/bootstrapped_bnn_sampling.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/bootstrapped_bnn_sampling.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contextual algorithm based on boostrapping neural networks."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from bandits.core.bandit_algorithm import BanditAlgorithm
+from bandits.core.contextual_dataset import ContextualDataset
+from bandits.algorithms.neural_bandit_model import NeuralBanditModel
+class BootstrappedBNNSampling(BanditAlgorithm):
+  """Thompson Sampling algorithm based on training several neural networks."""
+  def __init__(self, name, hparams, optimizer='RMS'):
+    """Creates a BootstrappedSGDSampling object based on a specific optimizer.
+      hparams.q: Number of models that are independently trained.
+      hparams.p: Prob of independently including each datapoint in each model.
+    Args:
+      name: Name given to the instance.
+      hparams: Hyperparameters for each individual model.
+      optimizer: Neural network optimization algorithm.
+    """
+    self.name = name
+    self.hparams = hparams
+    self.optimizer_n = optimizer
+    self.training_freq = hparams.training_freq
+    self.training_epochs = hparams.training_epochs
+    self.t = 0
+    self.q = hparams.q
+    self.p = hparams.p
+    self.datasets = [
+        ContextualDataset(hparams.context_dim,
+                          hparams.num_actions,
+                          hparams.buffer_s)
+        for _ in range(self.q)
+    ]
+    self.bnn_boot = [
+        NeuralBanditModel(optimizer, hparams, '{}-{}-bnn'.format(name, i))
+        for i in range(self.q)
+    ]
+  def action(self, context):
+    """Selects action for context based on Thompson Sampling using one BNN."""
+    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
+      # round robin until each action has been taken "initial_pulls" times
+      return self.t % self.hparams.num_actions
+    # choose model uniformly at random
+    model_index = np.random.randint(self.q)
+    with self.bnn_boot[model_index].graph.as_default():
+      c = context.reshape((1, self.hparams.context_dim))
+      output = self.bnn_boot[model_index].sess.run(
+          self.bnn_boot[model_index].y_pred,
+          feed_dict={self.bnn_boot[model_index].x: c})
+      return np.argmax(output)
+  def update(self, context, action, reward):
+    """Updates the data buffer, and re-trains the BNN every self.freq_update."""
+    self.t += 1
+    for i in range(self.q):
+      # include the data point with probability p independently in each dataset
+      if np.random.random() < self.p or self.t < 2:
+        self.datasets[i].add(context, action, reward)
+    if self.t % self.training_freq == 0:
+      # update all the models:
+      for i in range(self.q):
+        if self.hparams.reset_lr:
+          self.bnn_boot[i].assign_lr()
+        self.bnn_boot[i].train(self.datasets[i], self.training_epochs)
--- a/research/deep_contextual_bandits/bandits/algorithms/fixed_policy_sampling.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/fixed_policy_sampling.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contextual bandit algorithm that selects an action at random."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from bandits.core.bandit_algorithm import BanditAlgorithm
+class FixedPolicySampling(BanditAlgorithm):
+  """Defines a baseline; returns an action at random with probs given by p."""
+  def __init__(self, name, p, hparams):
+    """Creates a FixedPolicySampling object.
+    Args:
+      name: Name of the algorithm.
+      p: Vector of normalized probabilities corresponding to sampling each arm.
+      hparams: Hyper-parameters, including the number of arms (num_actions).
+    Raises:
+      ValueError: when p dimension does not match the number of actions.
+    """
+    self.name = name
+    self.p = p
+    self.hparams = hparams
+    if len(p) != self.hparams.num_actions:
+      raise ValueError('Policy needs k probabilities.')
+  def action(self, context):
+    """Selects an action at random according to distribution p."""
+    return np.random.choice(range(self.hparams.num_actions), p=self.p)
--- a/research/deep_contextual_bandits/bandits/algorithms/linear_full_posterior_sampling.py
+++ b/research/deep_contextual_bandits/bandits/algorithms/linear_full_posterior_sampling.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contextual algorithm that keeps a full linear posterior for each arm."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from scipy.stats import invgamma
+from bandits.core.bandit_algorithm import BanditAlgorithm
+from bandits.core.contextual_dataset import ContextualDataset
+class LinearFullPosteriorSampling(BanditAlgorithm):
+  """Thompson Sampling with independent linear models and unknown noise var."""
+  def __init__(self, name, hparams):
+    """Initialize posterior distributions and hyperparameters.
+    Assume a linear model for each action i: reward = context^T beta_i + noise
+    Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i (noise
+    level) has an inverse Gamma prior (a0, b0 parameters). Mean, covariance,
+    and precision matrices are initialized, and the ContextualDataset created.
+    Args:
+      name: Name of the algorithm.
+      hparams: Hyper-parameters of the algorithm.
+    """
+    self.name = name
+    self.hparams = hparams
+    # Gaussian prior for each beta_i
+    self._lambda_prior = self.hparams.lambda_prior
+    self.mu = [
+        np.zeros(self.hparams.context_dim + 1)
+        for _ in range(self.hparams.num_actions)
+    ]
+    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.hparams.context_dim + 1)
+                for _ in range(self.hparams.num_actions)]
+    self.precision = [
+        self.lambda_prior * np.eye(self.hparams.context_dim + 1)
+        for _ in range(self.hparams.num_actions)
+    ]
+    # Inverse Gamma prior for each sigma2_i
+    self._a0 = self.hparams.a0
+    self._b0 = self.hparams.b0
+    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
+    self.b = [self._b0 for _ in range(self.hparams.num_actions)]
+    self.t = 0
+    self.data_h = ContextualDataset(hparams.context_dim,
+                                    hparams.num_actions,
+                                    intercept=True)
+  def action(self, context):
+    """Samples beta's from posterior, and chooses best action accordingly.
+    Args:
+      context: Context for which the action need to be chosen.
+    Returns:
+      action: Selected action for the context.
+    """
+    # Round robin until each action has been selected "initial_pulls" times
+    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
+      return self.t % self.hparams.num_actions
+    # Sample sigma2, and beta conditional on sigma2
+    sigma2_s = [
+        self.b[i] * invgamma.rvs(self.a[i])
+        for i in range(self.hparams.num_actions)
+    ]
+    try:
+      beta_s = [
+          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
+          for i in range(self.hparams.num_actions)
+      ]
+    except np.linalg.LinAlgError as e:
+      # Sampling could fail if covariance is not positive definite
+      print('Exception when sampling from {}.'.format(self.name))
+      print('Details: {} | {}.'.format(e.message, e.args))
+      d = self.hparams.context_dim + 1
+      beta_s = [
+          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
+          for i in range(self.hparams.num_actions)
+      ]
+    # Compute sampled expected values, intercept is last component of beta
+    vals = [
+        np.dot(beta_s[i][:-1], context.T) + beta_s[i][-1]
+        for i in range(self.hparams.num_actions)
+    ]
+    return np.argmax(vals)
+  def update(self, context, action, reward):
+    """Updates action posterior using the linear Bayesian regression formula.
+    Args:
+      context: Last observed context.
+      action: Last observed action.
+      reward: Last observed reward.
+    """
+    self.t += 1
+    self.data_h.add(context, action, reward)
+    # Update posterior of action with formulas: \beta | x,y ~ N(mu_q, cov_q)
+    x, y = self.data_h.get_data(action)
+    # The algorithm could be improved with sequential update formulas (cheaper)
+    s = np.dot(x.T, x)
+    # Some terms are removed as we assume prior mu_0 = 0.
+    precision_a = s + self.lambda_prior * np.eye(self.hparams.context_dim + 1)
+    cov_a = np.linalg.inv(precision_a)
+    mu_a = np.dot(cov_a, np.dot(x.T, y))
+    # Inverse Gamma posterior update
+    a_post = self.a0 + x.shape[0] / 2.0
+    b_upd = 0.5 * (np.dot(y.T, y) - np.dot(mu_a.T, np.dot(precision_a, mu_a)))
+    b_post = self.b0 + b_upd
+    # Store new posterior distributions
+    self.mu[action] = mu_a
+    self.cov[action] = cov_a
+    self.precision[action] = precision_a
+    self.a[action] = a_post
+    self.b[action] = b_post
+  @property
+  def a0(self):
+    return self._a0
+  @property
+  def b0(self):
+    return self._b0
+  @property
+  def lambda_prior(self):
+    return self._lambda_prior