Commit 356c98bd authored by Kaushik Shivakumar's avatar Kaushik Shivakumar
Browse files

Merge remote-tracking branch 'upstream/master' into detr-push-3

parents d31aba8a b9785623
#!/usr/bin/python
#
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Python implementation of MS-SSIM.
Usage:
python msssim.py --original_image=original.png --compared_image=distorted.png
"""
import numpy as np
from scipy import signal
from scipy.ndimage.filters import convolve
import tensorflow as tf
tf.flags.DEFINE_string('original_image', None, 'Path to PNG image.')
tf.flags.DEFINE_string('compared_image', None, 'Path to PNG image.')
FLAGS = tf.flags.FLAGS
def _FSpecialGauss(size, sigma):
"""Function to mimic the 'fspecial' gaussian MATLAB function."""
radius = size // 2
offset = 0.0
start, stop = -radius, radius + 1
if size % 2 == 0:
offset = 0.5
stop -= 1
x, y = np.mgrid[offset + start:stop, offset + start:stop]
assert len(x) == size
g = np.exp(-((x**2 + y**2)/(2.0 * sigma**2)))
return g / g.sum()
def _SSIMForMultiScale(img1, img2, max_val=255, filter_size=11,
filter_sigma=1.5, k1=0.01, k2=0.03):
"""Return the Structural Similarity Map between `img1` and `img2`.
This function attempts to match the functionality of ssim_index_new.m by
Zhou Wang: http://www.cns.nyu.edu/~lcv/ssim/msssim.zip
Arguments:
img1: Numpy array holding the first RGB image batch.
img2: Numpy array holding the second RGB image batch.
max_val: the dynamic range of the images (i.e., the difference between the
maximum the and minimum allowed values).
filter_size: Size of blur kernel to use (will be reduced for small images).
filter_sigma: Standard deviation for Gaussian blur kernel (will be reduced
for small images).
k1: Constant used to maintain stability in the SSIM calculation (0.01 in
the original paper).
k2: Constant used to maintain stability in the SSIM calculation (0.03 in
the original paper).
Returns:
Pair containing the mean SSIM and contrast sensitivity between `img1` and
`img2`.
Raises:
RuntimeError: If input images don't have the same shape or don't have four
dimensions: [batch_size, height, width, depth].
"""
if img1.shape != img2.shape:
raise RuntimeError('Input images must have the same shape (%s vs. %s).',
img1.shape, img2.shape)
if img1.ndim != 4:
raise RuntimeError('Input images must have four dimensions, not %d',
img1.ndim)
img1 = img1.astype(np.float64)
img2 = img2.astype(np.float64)
_, height, width, _ = img1.shape
# Filter size can't be larger than height or width of images.
size = min(filter_size, height, width)
# Scale down sigma if a smaller filter size is used.
sigma = size * filter_sigma / filter_size if filter_size else 0
if filter_size:
window = np.reshape(_FSpecialGauss(size, sigma), (1, size, size, 1))
mu1 = signal.fftconvolve(img1, window, mode='valid')
mu2 = signal.fftconvolve(img2, window, mode='valid')
sigma11 = signal.fftconvolve(img1 * img1, window, mode='valid')
sigma22 = signal.fftconvolve(img2 * img2, window, mode='valid')
sigma12 = signal.fftconvolve(img1 * img2, window, mode='valid')
else:
# Empty blur kernel so no need to convolve.
mu1, mu2 = img1, img2
sigma11 = img1 * img1
sigma22 = img2 * img2
sigma12 = img1 * img2
mu11 = mu1 * mu1
mu22 = mu2 * mu2
mu12 = mu1 * mu2
sigma11 -= mu11
sigma22 -= mu22
sigma12 -= mu12
# Calculate intermediate values used by both ssim and cs_map.
c1 = (k1 * max_val) ** 2
c2 = (k2 * max_val) ** 2
v1 = 2.0 * sigma12 + c2
v2 = sigma11 + sigma22 + c2
ssim = np.mean((((2.0 * mu12 + c1) * v1) / ((mu11 + mu22 + c1) * v2)))
cs = np.mean(v1 / v2)
return ssim, cs
def MultiScaleSSIM(img1, img2, max_val=255, filter_size=11, filter_sigma=1.5,
                   k1=0.01, k2=0.03, weights=None):
  """Return the MS-SSIM score between `img1` and `img2`.

  This function implements Multi-Scale Structural Similarity (MS-SSIM) Image
  Quality Assessment according to Zhou Wang's paper, "Multi-scale structural
  similarity for image quality assessment" (2003).
  Link: https://ece.uwaterloo.ca/~z70wang/publications/msssim.pdf

  Author's MATLAB implementation:
  http://www.cns.nyu.edu/~lcv/ssim/msssim.zip

  Arguments:
    img1: Numpy array holding the first RGB image batch.
    img2: Numpy array holding the second RGB image batch.
    max_val: the dynamic range of the images (i.e., the difference between the
      maximum and minimum allowed values).
    filter_size: Size of blur kernel to use (will be reduced for small images).
    filter_sigma: Standard deviation for Gaussian blur kernel (will be reduced
      for small images).
    k1: Constant used to maintain stability in the SSIM calculation (0.01 in
      the original paper).
    k2: Constant used to maintain stability in the SSIM calculation (0.03 in
      the original paper).
    weights: List or array of weights for each level; if None, use five levels
      and the weights from the original paper.

  Returns:
    MS-SSIM score between `img1` and `img2`.

  Raises:
    RuntimeError: If input images don't have the same shape or don't have four
      dimensions: [batch_size, height, width, depth].
  """
  if img1.shape != img2.shape:
    # Format the message eagerly: RuntimeError does not do %-interpolation.
    raise RuntimeError('Input images must have the same shape (%s vs. %s).' %
                       (img1.shape, img2.shape))
  if img1.ndim != 4:
    raise RuntimeError('Input images must have four dimensions, not %d' %
                       img1.ndim)
  # Note: default weights don't sum to 1.0 but do match the paper / matlab code.
  # Compare against None explicitly: a numpy array has no unambiguous
  # truth value, so `weights if weights else ...` would raise ValueError.
  weights = np.array(weights if weights is not None else
                     [0.0448, 0.2856, 0.3001, 0.2363, 0.1333])
  levels = weights.size
  downsample_filter = np.ones((1, 2, 2, 1)) / 4.0
  im1, im2 = [x.astype(np.float64) for x in [img1, img2]]
  mssim = np.array([])
  mcs = np.array([])
  for _ in range(levels):
    ssim, cs = _SSIMForMultiScale(
        im1, im2, max_val=max_val, filter_size=filter_size,
        filter_sigma=filter_sigma, k1=k1, k2=k2)
    mssim = np.append(mssim, ssim)
    mcs = np.append(mcs, cs)
    # Blur with a 2x2 box filter, then subsample by 2 for the next level.
    filtered = [convolve(im, downsample_filter, mode='reflect')
                for im in [im1, im2]]
    im1, im2 = [x[:, ::2, ::2, :] for x in filtered]
  # Combine contrast sensitivity at coarse levels with SSIM at the finest.
  return (np.prod(mcs[0:levels-1] ** weights[0:levels-1]) *
          (mssim[levels-1] ** weights[levels-1]))
def main(_):
  """Loads the two PNGs named by flags, decodes them, and prints MS-SSIM."""
  if FLAGS.original_image is None or FLAGS.compared_image is None:
    print('\nUsage: python msssim.py --original_image=original.png '
          '--compared_image=distorted.png\n\n')
    return
  if not tf.gfile.Exists(FLAGS.original_image):
    print('\nCannot find --original_image.\n')
    return
  if not tf.gfile.Exists(FLAGS.compared_image):
    print('\nCannot find --compared_image.\n')
    return
  # PNG data is binary: open in 'rb' mode. Note read() takes an optional byte
  # count, not a mode string -- read('rb') would be an error.
  with tf.gfile.FastGFile(FLAGS.original_image, 'rb') as image_file:
    img1_str = image_file.read()
  with tf.gfile.FastGFile(FLAGS.compared_image, 'rb') as image_file:
    img2_str = image_file.read()
  input_img = tf.placeholder(tf.string)
  decoded_image = tf.expand_dims(tf.image.decode_png(input_img, channels=3), 0)
  with tf.Session() as sess:
    img1 = sess.run(decoded_image, feed_dict={input_img: img1_str})
    img2 = sess.run(decoded_image, feed_dict={input_img: img2_str})
  print((MultiScaleSSIM(img1, img2, max_val=255)))


if __name__ == '__main__':
  tf.app.run()
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Deep Bayesian Bandits Library
This library corresponds to the *[Deep Bayesian Bandits Showdown: An Empirical
Comparison of Bayesian Deep Networks for Thompson
Sampling](https://arxiv.org/abs/1802.09127)* paper, published in
[ICLR](https://iclr.cc/) 2018. We provide a benchmark to test decision-making
algorithms for contextual-bandits. In particular, the current library implements
a variety of algorithms (many of them based on approximate Bayesian Neural
Networks and Thompson sampling), and a number of real and synthetic data
problems exhibiting a diverse set of properties.
It is a Python library that uses [TensorFlow](https://www.tensorflow.org/).
We encourage contributors to add new approximate Bayesian Neural Networks or,
more generally, contextual bandits algorithms to the library. Also, we would
like to extend the data sources over time, so we warmly encourage contributions
in this front too!
Please, use the following when citing the code or the paper:
```
@article{riquelme2018deep, title={Deep Bayesian Bandits Showdown: An Empirical
Comparison of Bayesian Deep Networks for Thompson Sampling},
author={Riquelme, Carlos and Tucker, George and Snoek, Jasper},
journal={International Conference on Learning Representations, ICLR.}, year={2018}}
```
**Contact**. This repository is maintained by [Carlos Riquelme](http://rikel.me) ([rikel](https://github.com/rikel)). Feel free to reach out directly at [rikel@google.com](mailto:rikel@google.com) with any questions or comments.
We first briefly introduce contextual bandits, Thompson sampling, enumerate the
implemented algorithms, and the available data sources. Then, we provide a
simple complete example illustrating how to use the library.
## Contextual Bandits
Contextual bandits are a rich decision-making framework where an algorithm has
to choose among a set of *k* actions at every time step *t*, after observing
a context (or side-information) denoted by *X<sub>t</sub>*. The general pseudocode for
the process if we use algorithm **A** is as follows:
```
At time t = 1, ..., T:
1. Observe new context: X_t
2. Choose action: a_t = A.action(X_t)
3. Observe reward: r_t
4. Update internal state of the algorithm: A.update((X_t, a_t, r_t))
```
The goal is to maximize the total sum of rewards: &sum;<sub>t</sub> r<sub>t</sub>
For example, each *X<sub>t</sub>* could encode the properties of a specific user (and
the time or day), and we may have to choose an ad, discount coupon, treatment,
hyper-parameters, or version of a website to show or provide to the user.
Hopefully, over time, we will learn how to match each type of user to the most
beneficial personalized action under some metric (the reward).
## Thompson Sampling
Thompson Sampling is a meta-algorithm that chooses an action for the contextual
bandit in a statistically efficient manner, simultaneously finding the best arm
while attempting to incur low cost. Informally speaking, we assume the expected
reward is given by some function
**E**[r<sub>t</sub> | X<sub>t</sub>, a<sub>t</sub>] = f(X<sub>t</sub>, a<sub>t</sub>).
Unfortunately, function **f** is unknown, as otherwise we could just choose the
action with highest expected value:
a<sub>t</sub><sup>*</sup> = arg max<sub>i</sub> f(X<sub>t</sub>, a<sub>t</sub>).
The idea behind Thompson Sampling is based on keeping a posterior distribution
&pi;<sub>t</sub> over functions in some family f &isin; F after observing the first
*t-1* datapoints. Then, at time *t*, we sample one potential explanation of
the underlying process: f<sub>t</sub> &sim; &pi;<sub>t</sub>, and act optimally (i.e., greedily)
*according to f<sub>t</sub>*. In other words, we choose
a<sub>t</sub> = arg max<sub>i</sub> f<sub>t</sub>(X<sub>t</sub>, a<sub>i</sub>).
Finally, we update our posterior distribution with the new collected
datapoint (X<sub>t</sub>, a<sub>t</sub>, r<sub>t</sub>).
The main issue is that keeping an updated posterior &pi;<sub>t</sub> (or, even,
sampling from it) is often intractable for highly parameterized models like deep
neural networks. The algorithms we list in the next section provide tractable
*approximations* that can be used in combination with Thompson Sampling to solve
the contextual bandit problem.
## Algorithms
The Deep Bayesian Bandits library includes the following algorithms (see the
[paper](https://arxiv.org/abs/1802.09127) for further details):
1. **Linear Algorithms**. As a powerful baseline, we provide linear algorithms.
In particular, we focus on the exact Bayesian linear regression
implementation, while it is easy to derive the greedy OLS version (possibly,
with epsilon-greedy exploration). The algorithm is implemented in
*linear_full_posterior_sampling.py*, and it is instantiated as follows:
```
linear_full = LinearFullPosteriorSampling('MyLinearTS', my_hparams)
```
2. **Neural Linear**. We introduce an algorithm we call Neural Linear, which
operates by learning a neural network to map contexts to rewards for each
action, and ---simultaneously--- it updates a Bayesian linear regression in
the last layer (i.e., the one that maps the final representation **z** to
the rewards **r**). Thompson Sampling samples the linear parameters
&beta;<sub>i</sub> for each action *i*, but keeps the network that computes the
representation. Then, both parts (network and Bayesian linear regression)
are updated, possibly at different frequencies. The algorithm is implemented
in *neural_linear_sampling.py*, and we create an algorithm instance like
this:
```
neural_linear = NeuralLinearPosteriorSampling('MyNLinear', my_hparams)
```
3. **Neural Greedy**. Another standard benchmark is to train a neural network
that maps contexts to rewards, and at each time *t* just acts greedily
according to the current model. In particular, this approach does *not*
explicitly use Thompson Sampling. However, due to stochastic gradient
descent, there is still some randomness in its output. It is
straight-forward to add epsilon-greedy exploration to choose random
actions with probability &epsilon; &isin; (0, 1). The algorithm is
implemented in *neural_bandit_model.py*, and it is used together with
*PosteriorBNNSampling* (defined in *posterior_bnn_sampling.py*) by calling:
```
neural_greedy = PosteriorBNNSampling('MyNGreedy', my_hparams, 'RMSProp')
```
4. **Stochastic Variational Inference**, Bayes by Backpropagation. We implement
a Bayesian neural network by modeling each individual weight posterior as a
univariate Gaussian distribution: w<sub>ij</sub> &sim; N(&mu;<sub>ij</sub>, &sigma;<sub>ij</sub><sup>2</sup>).
Thompson sampling then samples a network at each time step
by sampling each weight independently. The variational approach consists in
maximizing a proxy for maximum likelihood of the observed data, the ELBO or
variational lower bound, to fit the values of &mu;<sub>ij</sub>, &sigma;<sub>ij</sub><sup>2</sup>
for every *i, j*.
See [Weight Uncertainty in Neural
Networks](https://arxiv.org/abs/1505.05424).
The BNN algorithm is implemented in *variational_neural_bandit_model.py*,
and it is used together with *PosteriorBNNSampling* (defined in
*posterior_bnn_sampling.py*) by calling:
```
bbb = PosteriorBNNSampling('myBBB', my_hparams, 'Variational')
```
5. **Expectation-Propagation**, Black-box alpha-divergence minimization.
The family of expectation-propagation algorithms is based on the message
passing framework. They iteratively approximate the posterior by updating a
single approximation factor (or site) at a time, which usually corresponds
to the likelihood of one data point. We focus on methods that directly
optimize the global EP objective via stochastic gradient descent, as, for
instance, Power EP. For further details see original paper below.
See [Black-box alpha-divergence
Minimization](https://arxiv.org/abs/1511.03243).
We create an instance of the algorithm like this:
```
bb_adiv = PosteriorBNNSampling('MyEP', my_hparams, 'AlphaDiv')
```
6. **Dropout**. Dropout is a training technique where the output of each neuron
is independently zeroed out with probability *p* at each forward pass.
Once the network has been trained, dropout can still be used to obtain a
distribution of predictions for a specific input. Following the best action
with respect to the random dropout prediction can be interpreted as an
implicit form of Thompson sampling. The code for dropout is the same as for
Neural Greedy (see above), but we need to set two hyper-parameters:
*use_dropout=True* and *keep_prob=p* where *p* takes the desired value in
(0, 1). Then:
```
dropout = PosteriorBNNSampling('MyDropout', my_hparams, 'RMSProp')
```
7. **Monte Carlo Methods**. To be added soon.
8. **Bootstrapped Networks**. This algorithm trains simultaneously and in
parallel **q** neural networks based on different datasets D<sub>1</sub>, ..., D<sub>q</sub>. The way those datasets are collected is by adding each new collected
datapoint (X<sub>t</sub>, a<sub>t</sub>, r<sub>t</sub>) to each dataset *D<sub>i</sub>* independently and with
probability p &isin; (0, 1]. Therefore, the main hyperparameters of the
algorithm are **(q, p)**. In order to choose an action for a new context,
one of the **q** networks is first selected with uniform probability (i.e.,
*1/q*). Then, the best action according to the *selected* network is
played.
See [Deep Exploration via Bootstrapped
DQN](https://arxiv.org/abs/1602.04621).
The algorithm is implemented in *bootstrapped_bnn_sampling.py*, and we
instantiate it as (where *my_hparams* contains both **q** and **p**):
```
bootstrap = BootstrappedBNNSampling('MyBoot', my_hparams)
```
9. **Parameter-Noise**. Another approach to approximate a distribution over
neural networks (or more generally, models) that map contexts to rewards,
consists in randomly perturbing a point estimate trained by Stochastic
Gradient Descent on the data. The Parameter-Noise algorithm uses a heuristic
to control the amount of noise &sigma;<sub>t</sub><sup>2</sup> it adds independently to the
parameters representing a neural network: &theta;<sub>t</sub><sup>'</sup> = &theta;<sub>t</sub> + &epsilon; where
&epsilon; &sim; N(0, &sigma;<sub>t</sub><sup>2</sup> Id).
After using &theta;<sub>t</sub><sup>'</sup> for decision making, the following SGD
training steps start again from &theta;<sub>t</sub>. The key hyperparameters to set
are those controlling the noise heuristic.
See [Parameter Space Noise for
Exploration](https://arxiv.org/abs/1706.01905).
The algorithm is implemented in *parameter_noise_sampling.py*, and we create
an instance by calling:
```
parameter_noise = ParameterNoiseSampling('MyParamNoise', my_hparams)
```
10. **Gaussian Processes**. Another standard benchmark is Gaussian Processes,
see *Gaussian Processes for Machine Learning* by Rasmussen and Williams for
an introduction. To model the expected reward of different actions, we fit a
multitask GP.
See [Multi-task Gaussian Process
Prediction](http://papers.nips.cc/paper/3189-multi-task-gaussian-process-prediction.pdf).
Our implementation is provided in *multitask_gp.py*, and it is instantiated
as follows:
```
gp = PosteriorBNNSampling('MyMultitaskGP', my_hparams, 'GP')
```
In the code snippet at the bottom, we show how to instantiate some of these
algorithms, and how to run the contextual bandit simulator, and display the
high-level results.
## Data
In the paper we use two types of contextual datasets: synthetic and based on
real-world data.
We provide functions that sample problems from those datasets. In the case of
real-world data, you first need to download the raw datasets, and pass the route
to the functions. Links for the datasets are provided below.
### Synthetic Datasets
Synthetic datasets are contained in the *synthetic_data_sampler.py* file. In
particular, it includes:
1. **Linear data**. Provides a number of linear arms, and Gaussian contexts.
2. **Sparse linear data**. Provides a number of sparse linear arms, and
Gaussian contexts.
3. **Wheel bandit data**. Provides sampled data from the wheel bandit data, see
[Section 5.4](https://arxiv.org/abs/1802.09127) in the paper.
### Real-World Datasets
Real-world data generating functions are contained in the *data_sampler.py*
file.
In particular, it includes:
1. **Mushroom data**. Each incoming context represents a different type of
mushroom, and the actions are eat or no-eat. Eating an edible mushroom
provides positive reward, while eating a poisonous one provides positive
reward with probability *p*, and a large negative reward with probability
*1-p*. All the rewards, and the value of *p* are customizable. The
[dataset](https://archive.ics.uci.edu/ml/datasets/mushroom) is part of the
UCI repository, and the bandit problem was proposed in Blundell et al.
(2015). Data is available [here](https://storage.googleapis.com/bandits_datasets/mushroom.data)
or alternatively [here](https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/),
use the *agaricus-lepiota.data* file.
2. **Stock data**. We created the Financial Dataset by pulling the stock prices
of *d = 21* publicly traded companies in NYSE and Nasdaq, for the last 14
years (*n = 3713*). For each day, the context was the price difference
between the beginning and end of the session for each stock. We
synthetically created the arms to be a linear combination of the contexts,
representing *k = 8* different potential portfolios. Data is available
[here](https://storage.googleapis.com/bandits_datasets/raw_stock_contexts).
3. **Jester data**. We create a recommendation system bandit problem as
follows. The Jester Dataset (Goldberg et al., 2001) provides continuous
ratings in *[-10, 10]* for 100 jokes from a total of 73421 users. We find
a *complete* subset of *n = 19181* users rating all 40 jokes. Following
Riquelme et al. (2017), we take *d = 32* of the ratings as the context of
the user, and *k = 8* as the arms. The agent recommends one joke, and
obtains the reward corresponding to the rating of the user for the selected
joke. Data is available [here](https://storage.googleapis.com/bandits_datasets/jester_data_40jokes_19181users.npy).
4. **Statlog data**. The Shuttle Statlog Dataset (Asuncion & Newman, 2007)
provides the value of *d = 9* indicators during a space shuttle flight,
and the goal is to predict the state of the radiator subsystem of the
shuttle. There are *k = 7* possible states, and if the agent selects the
right state, then reward 1 is generated. Otherwise, the agent obtains no
reward (*r = 0*). The most interesting aspect of the dataset is that one
action is the optimal one in 80% of the cases, and some algorithms may
commit to this action instead of further exploring. In this case, the number
of contexts is *n = 43500*. Data is available [here](https://storage.googleapis.com/bandits_datasets/shuttle.trn) or alternatively
[here](https://archive.ics.uci.edu/ml/datasets/Statlog+\(Shuttle\)), use
*shuttle.trn* file.
5. **Adult data**. The Adult Dataset (Kohavi, 1996; Asuncion & Newman, 2007)
comprises personal information from the US Census Bureau database, and the
standard prediction task is to determine if a person makes over 50K a year
or not. However, we consider the *k = 14* different occupations as
feasible actions, based on *d = 94* covariates (many of them binarized).
As in previous datasets, the agent obtains a reward of 1 for making the
right prediction, and 0 otherwise. The total number of contexts is *n =
45222*. Data is available [here](https://storage.googleapis.com/bandits_datasets/adult.full) or alternatively
[here](https://archive.ics.uci.edu/ml/datasets/adult), use *adult.data*
file.
6. **Census data**. The US Census (1990) Dataset (Asuncion & Newman, 2007)
contains a number of personal features (age, native language, education...)
which we summarize in *d = 389* covariates, including binary dummy
variables for categorical features. Our goal again is to predict the
occupation of the individual among *k = 9* classes. The agent obtains
reward 1 for making the right prediction, and 0 otherwise. Data is available
[here](https://storage.googleapis.com/bandits_datasets/USCensus1990.data.txt) or alternatively [here](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+\(1990\)), use
*USCensus1990.data.txt* file.
7. **Covertype data**. The Covertype Dataset (Asuncion & Newman, 2007)
classifies the cover type of northern Colorado forest areas in *k = 7*
classes, based on *d = 54* features, including elevation, slope, aspect,
and soil type. Again, the agent obtains reward 1 if the correct class is
selected, and 0 otherwise. Data is available [here](https://storage.googleapis.com/bandits_datasets/covtype.data) or alternatively
[here](https://archive.ics.uci.edu/ml/datasets/covertype), use
*covtype.data* file.
In datasets 4-7, each feature of the dataset is normalized first.
## Usage: Basic Example
This library requires Tensorflow, Numpy, and Pandas.
The file *example_main.py* provides a complete example on how to use the
library. We run the code:
```
python example_main.py
```
**Do not forget to** configure the routes to the data files at the top of *example_main.py*.
For example, we can run the Mushroom bandit for 2000 contexts on a few
algorithms as follows:
```
# Problem parameters
num_contexts = 2000
# Choose data source among:
# {linear, sparse_linear, mushroom, financial, jester,
# statlog, adult, covertype, census, wheel}
data_type = 'mushroom'
# Create dataset
sampled_vals = sample_data(data_type, num_contexts)
dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals
# Define hyperparameters and algorithms
hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
context_dim=context_dim,
a0=6,
b0=6,
lambda_prior=0.25,
initial_pulls=2)
hparams_dropout = tf.contrib.training.HParams(num_actions=num_actions,
context_dim=context_dim,
init_scale=0.3,
activation=tf.nn.relu,
layer_sizes=[50],
batch_size=512,
activate_decay=True,
initial_lr=0.1,
max_grad_norm=5.0,
show_training=False,
freq_summary=1000,
buffer_s=-1,
initial_pulls=2,
optimizer='RMS',
reset_lr=True,
lr_decay_rate=0.5,
training_freq=50,
training_epochs=100,
keep_prob=0.80,
use_dropout=True)
### Create hyper-parameter configurations for other algorithms
[...]
algos = [
UniformSampling('Uniform Sampling', hparams),
PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
LinearFullPosteriorSampling('LinFullPost', hparams_linear),
BootstrappedBNNSampling('BootRMS', hparams_boot),
ParameterNoiseSampling('ParamNoise', hparams_pnoise),
]
# Run contextual bandit problem
t_init = time.time()
results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
_, h_rewards = results
# Display results
display_results(algos, opt_rewards, opt_actions, h_rewards, t_init, data_type)
```
The previous code leads to final results that look like:
```
---------------------------------------------------
---------------------------------------------------
mushroom bandit completed after 69.8401839733 seconds.
---------------------------------------------------
0) LinFullPost | total reward = 4365.0.
1) NeuralLinear | total reward = 4110.0.
2) Dropout | total reward = 3430.0.
3) ParamNoise | total reward = 3270.0.
4) BootRMS | total reward = 3050.0.
5) BBB | total reward = 2505.0.
6) Uniform Sampling | total reward = -4930.0.
---------------------------------------------------
Optimal total reward = 5235.
Frequency of optimal actions (action, frequency):
[[0, 953], [1, 1047]]
---------------------------------------------------
---------------------------------------------------
```
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bayesian NN using expectation propagation (Black-Box Alpha-Divergence).
See https://arxiv.org/abs/1511.03243 for details.
All formulas used in this implementation are derived in:
https://www.overleaf.com/12837696kwzjxkyhdytk#/49028744/.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import numpy as np
import tensorflow as tf
from absl import flags
from bandits.core.bayesian_nn import BayesianNN
FLAGS = flags.FLAGS
tfd = tf.contrib.distributions # update to: tensorflow_probability.distributions
def log_gaussian(x, mu, sigma, reduce_sum=True):
  """Log-density of `x` under Normal(mu, sigma), optionally summed to a scalar."""
  log_prob = tfd.Normal(mu, sigma).log_prob(x)
  return tf.reduce_sum(log_prob) if reduce_sum else log_prob
class BBAlphaDivergence(BayesianNN):
"""Implements an approximate Bayesian NN via Black-Box Alpha-Divergence."""
def __init__(self, hparams, name):
  """Stores hyper-parameters, derives the training schedule, builds the graph.

  Args:
    hparams: Hyper-parameter object (tf.contrib.training.HParams-like).
    name: Identifier for this model instance.
  """
  self.name = name
  self.hparams = hparams
  # Optional hyper-parameters fall back to library defaults.
  self.alpha = getattr(hparams, 'alpha', 1.0)
  self.num_mc_nn_samples = getattr(hparams, 'num_mc_nn_samples', 10)
  # Network architecture.
  self.n_in = hparams.context_dim
  self.n_out = hparams.num_actions
  self.layers = hparams.layer_sizes
  # Training / logging configuration.
  self.batch_size = hparams.batch_size
  self.show_training = hparams.show_training
  self.freq_summary = hparams.freq_summary
  self.verbose = getattr(hparams, 'verbose', True)
  self.cleared_times_trained = hparams.cleared_times_trained
  self.initial_training_steps = hparams.initial_training_steps
  # Linearly interpolate how many steps each successive training round runs.
  self.training_schedule = np.linspace(self.initial_training_steps,
                                       hparams.training_epochs,
                                       self.cleared_times_trained)
  self.times_trained = 0
  self.initialize_model()
def initialize_model(self):
  """Creates the TF graph for the Bayesian NN and initializes its variables."""
  # Bookkeeping for variational parameters, keyed by layer id.
  self.num_w = 0
  self.num_b = 0
  self.weights_m = {}
  self.weights_std = {}
  self.biases_m = {}
  self.biases_std = {}
  self.h_max_var = []
  # Bijector mapping the unconstrained sigma parameter to the positive reals.
  self.sigma_transform = (
      tfd.bijectors.Exp() if self.hparams.use_sigma_exp_transform
      else tfd.bijectors.Softplus())
  # Build the graph corresponding to the Bayesian NN instance.
  self.graph = tf.Graph()
  with self.graph.as_default():
    self.sess = tf.Session()
    # Placeholders: contexts, observed rewards, and per-action weights.
    self.x = tf.placeholder(shape=[None, self.n_in],
                            dtype=tf.float32, name='x')
    self.y = tf.placeholder(shape=[None, self.n_out],
                            dtype=tf.float32, name='y')
    self.weights = tf.placeholder(shape=[None, self.n_out],
                                  dtype=tf.float32, name='w')
    self.data_size = tf.placeholder(tf.float32, shape=(), name='data_size')
    self.prior_variance = self.hparams.prior_variance
    if self.prior_variance < 0:
      # A negative setting signals that the prior variance should be learned.
      self.prior_variance = self.sigma_transform.forward(
          self.build_mu_variable([1, 1]))
    self.build_model()
    self.sess.run(tf.global_variables_initializer())
def build_mu_variable(self, shape):
  """Creates a trainable mean parameter, initialized from N(0, 0.05)."""
  return tf.Variable(tf.random_normal(shape, mean=0.0, stddev=0.05))
def build_sigma_variable(self, shape, init=-5.):
  """Creates a trainable pre-transform sigma parameter, drawn from N(init, 0.05).

  Sigma starts very small (large negative pre-transform value) so that
  optimization initially behaves like a MAP fit.
  """
  return tf.Variable(tf.random_normal(shape, mean=init, stddev=0.05))
def build_layer(self, input_x, shape, layer_id, activation_fn=tf.nn.relu):
  """Creates a mean-field Gaussian layer, samples it once, and applies it."""
  # Sample the weight matrix via the reparameterization w = mu + sigma * eps.
  w_mu = self.build_mu_variable(shape)
  w_sigma = self.sigma_transform.forward(self.build_sigma_variable(shape))
  w = w_mu + w_sigma * tf.random_normal(shape)
  # Sample the bias row vector the same way.
  b_shape = [1, shape[1]]
  b_mu = self.build_mu_variable(b_shape)
  b_sigma = self.sigma_transform.forward(self.build_sigma_variable(b_shape))
  b = b_mu + b_sigma * tf.random_normal(b_shape)
  # Record the variational parameters so other ops can re-sample this layer.
  self.weights_m[layer_id] = w_mu
  self.weights_std[layer_id] = w_sigma
  self.biases_m[layer_id] = b_mu
  self.biases_std[layer_id] = b_sigma
  # Apply the sampled affine map followed by the activation.
  return activation_fn(tf.matmul(input_x, w) + b)
  def sample_neural_network(self, activation_fn=tf.nn.relu):
    """Samples a nn from posterior, computes data log lk and log f factor.

    Args:
      activation_fn: Activation applied after every layer except the last.

    Returns:
      log_f: Scalar tensor, the sampled network's log f factor contribution.
      weighted_log_likelihood: Per-example log likelihood of the observed
        rewards under the sampled network, masked by self.weights so only
        the observed action's outcome counts (bandit feedback).
    """
    with self.graph.as_default():
      log_f = 0
      n = self.data_size
      input_x = self.x
      for layer_id in range(self.total_layers):
        # load mean and std of each weight
        w_mu = self.weights_m[layer_id]
        w_sigma = self.weights_std[layer_id]
        b_mu = self.biases_m[layer_id]
        b_sigma = self.biases_std[layer_id]
        # sample weights from Gaussian distribution
        shape = w_mu.shape
        w_noise = tf.random_normal(shape)
        b_noise = tf.random_normal([1, int(shape[1])])
        w = w_mu + w_sigma * w_noise
        b = b_mu + b_sigma * b_noise
        # compute contribution to log_f; both terms are scaled by 1/n so the
        # factor is amortized over the dataset size.
        t1 = w * w_mu / (n * w_sigma ** 2)
        t2 = (0.5 * w ** 2 / n) * (1 / self.prior_variance - 1 / w_sigma ** 2)
        log_f += tf.reduce_sum(t1 + t2)
        t1 = b * b_mu / (n * b_sigma ** 2)
        t2 = (0.5 * b ** 2 / n) * (1 / self.prior_variance - 1 / b_sigma ** 2)
        log_f += tf.reduce_sum(t1 + t2)
        # Last layer is linear (no activation); hidden layers use activation_fn.
        if layer_id < self.total_layers - 1:
          output_h = activation_fn(tf.matmul(input_x, w) + b)
        else:
          output_h = tf.matmul(input_x, w) + b
        input_x = output_h
      # compute log likelihood of the observed reward under the sampled nn
      log_likelihood = log_gaussian(
          self.y, output_h, self.noise_sigma, reduce_sum=False)
      weighted_log_likelihood = tf.reduce_sum(log_likelihood * self.weights, -1)
    return log_f, weighted_log_likelihood
  def log_z_q(self):
    """Computes log-partition function of current posterior parameters.

    Returns:
      Scalar tensor summing, over all layers, the Gaussian log-normalizer of
      each variational factor: 0.5 * (mu^2 / sigma^2 + log(2*pi) +
      2*log(sigma)) per parameter, for weights and biases.
    """
    with self.graph.as_default():
      log_z_q = 0
      for layer_id in range(self.total_layers):
        w_mu = self.weights_m[layer_id]
        w_sigma = self.weights_std[layer_id]
        b_mu = self.biases_m[layer_id]
        b_sigma = self.biases_std[layer_id]
        w_term = 0.5 * tf.reduce_sum(w_mu ** 2 / w_sigma ** 2)
        w_term += 0.5 * tf.reduce_sum(tf.log(2 * np.pi) + 2 * tf.log(w_sigma))
        b_term = 0.5 * tf.reduce_sum(b_mu ** 2 / b_sigma ** 2)
        b_term += 0.5 * tf.reduce_sum(tf.log(2 * np.pi) + 2 * tf.log(b_sigma))
        log_z_q += w_term + b_term
      return log_z_q
def log_z_prior(self):
"""Computes log-partition function of the prior parameters."""
num_params = self.num_w + self.num_b
return num_params * 0.5 * tf.log(2 * np.pi * self.prior_variance)
  def log_alpha_likelihood_ratio(self, activation_fn=tf.nn.relu):
    """Monte-Carlo estimate of the alpha-divergence likelihood-ratio term.

    Draws self.num_mc_nn_samples networks from the posterior, combines each
    sample's log likelihood with its log f factor, and log-mean-exps over
    samples per data point.

    Args:
      activation_fn: Activation function passed to sample_neural_network.

    Returns:
      Scalar tensor: sum over the batch of the per-example
      logsumexp(alpha * (log_lk - log_f)) minus log(k), k = #MC samples.
    """
    # each nn sample returns (log f, log likelihoods)
    nn_samples = [
        self.sample_neural_network(activation_fn)
        for _ in range(self.num_mc_nn_samples)
    ]
    nn_log_f_samples = [elt[0] for elt in nn_samples]
    nn_log_lk_samples = [elt[1] for elt in nn_samples]
    # we stack the (log f, log likelihoods) from the k nn samples
    nn_log_f_stack = tf.stack(nn_log_f_samples)      # k x 1
    nn_log_lk_stack = tf.stack(nn_log_lk_samples)    # k x N
    # Broadcast each sample's scalar log f across the batch dimension.
    nn_f_tile = tf.tile(nn_log_f_stack, [self.batch_size])
    nn_f_tile = tf.reshape(nn_f_tile,
                           [self.num_mc_nn_samples, self.batch_size])
    # now both the log f and log likelihood terms have shape: k x N
    # apply formula in https://www.overleaf.com/12837696kwzjxkyhdytk#/49028744/
    nn_log_ratio = nn_log_lk_stack - nn_f_tile
    nn_log_ratio = self.alpha * tf.transpose(nn_log_ratio)
    logsumexp_value = tf.reduce_logsumexp(nn_log_ratio, -1)
    # Subtracting log(k) turns logsumexp into a log-mean-exp.
    log_k_scalar = tf.log(tf.cast(self.num_mc_nn_samples, tf.float32))
    log_k = log_k_scalar * tf.ones([self.batch_size])
    return tf.reduce_sum(logsumexp_value - log_k, -1)
def build_model(self, activation_fn=tf.nn.relu):
"""Defines the actual NN model with fully connected layers.
Args:
activation_fn: Activation function for the neural network.
The loss is computed for partial feedback settings (bandits), so only
the observed outcome is backpropagated (see weighted loss).
Selects the optimizer and, finally, it also initializes the graph.
"""
print('Initializing model {}.'.format(self.name))
# Build terms for the noise sigma estimation for each action.
noise_sigma_mu = (self.build_mu_variable([1, self.n_out])
+ self.sigma_transform.inverse(self.hparams.noise_sigma))
noise_sigma_sigma = self.sigma_transform.forward(
self.build_sigma_variable([1, self.n_out]))
pre_noise_sigma = noise_sigma_mu + tf.random_normal(
[1, self.n_out]) * noise_sigma_sigma
self.noise_sigma = self.sigma_transform.forward(pre_noise_sigma)
# Build network
input_x = self.x
n_in = self.n_in
self.total_layers = len(self.layers) + 1
if self.layers[0] == 0:
self.total_layers = 1
for l_number, n_nodes in enumerate(self.layers):
if n_nodes > 0:
h = self.build_layer(input_x, [n_in, n_nodes], l_number)
input_x = h
n_in = n_nodes
self.num_w += n_in * n_nodes
self.num_b += n_nodes
self.y_pred = self.build_layer(input_x, [n_in, self.n_out],
self.total_layers - 1,
activation_fn=lambda x: x)
# Compute energy function based on sampled nn's
log_coeff = self.data_size / (self.batch_size * self.alpha)
log_ratio = log_coeff * self.log_alpha_likelihood_ratio(activation_fn)
logzprior = self.log_z_prior()
logzq = self.log_z_q()
energy = logzprior - logzq - log_ratio
self.loss = energy
self.global_step = tf.train.get_or_create_global_step()
self.train_op = tf.train.AdamOptimizer(self.hparams.initial_lr).minimize(
self.loss, global_step=self.global_step)
# Useful for debugging
sq_loss = tf.squared_difference(self.y_pred, self.y)
weighted_sq_loss = self.weights * sq_loss
self.cost = tf.reduce_sum(weighted_sq_loss) / self.batch_size
# Create tensorboard metrics
self.create_summaries()
self.summary_writer = tf.summary.FileWriter('{}/graph_{}'.format(
FLAGS.logdir, self.name), self.sess.graph)
  def create_summaries(self):
    """Registers TensorBoard scalars for the loss and debug cost."""
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('cost', self.cost)
    # Merge everything registered on the default graph into one op.
    self.summary_op = tf.summary.merge_all()
def assign_lr(self):
"""Resets the learning rate in dynamic schedules for subsequent trainings.
In bandits settings, we do expand our dataset over time. Then, we need to
re-train the network with the new data. Those algorithms that do not keep
the step constant, can reset it at the start of each training process.
"""
decay_steps = 1
if self.hparams.activate_decay:
current_gs = self.sess.run(self.global_step)
with self.graph.as_default():
self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
self.global_step - current_gs,
decay_steps,
self.hparams.lr_decay_rate)
  def train(self, data, num_steps):
    """Trains the BNN for num_steps, using the data in 'data'.

    Args:
      data: ContextualDataset object that provides the data.
      num_steps: Number of minibatches to train the network for.
    """
    # Early trainings follow the precomputed schedule rather than the
    # caller-supplied num_steps.
    if self.times_trained < self.cleared_times_trained:
      num_steps = int(self.training_schedule[self.times_trained])
    self.times_trained += 1

    if self.verbose:
      print('Training {} for {} steps...'.format(self.name, num_steps))

    with self.graph.as_default():
      for step in range(num_steps):
        x, y, w = data.get_batch_with_weights(self.hparams.batch_size)
        _, summary, global_step, loss = self.sess.run(
            [self.train_op, self.summary_op, self.global_step, self.loss],
            feed_dict={self.x: x, self.y: y, self.weights: w,
                       self.data_size: data.num_points()})

        # Track the largest posterior std of the first layer (debug signal).
        weights_l = self.sess.run(self.weights_std[0])
        self.h_max_var.append(np.max(weights_l))

        if step % self.freq_summary == 0:
          if self.show_training:
            print('step: {}, loss: {}'.format(step, loss))
            # NOTE(review): relies on `sys` being imported at the file top,
            # which is outside this chunk — confirm.
            sys.stdout.flush()
          self.summary_writer.add_summary(summary, global_step)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bayesian NN using factorized VI (Bayes By Backprop. Blundell et al. 2014).
See https://arxiv.org/abs/1505.05424 for details.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
# import tensorflow_probability as tfp
from absl import flags
from bandits.core.bayesian_nn import BayesianNN
FLAGS = flags.FLAGS
# tfd = tfp.distributions
tfd = tf.contrib.distributions
tfl = tf.contrib.layers
def log_gaussian(x, mu, sigma, reduce_sum=True):
  """Computes the Gaussian log-density of x under N(mu, sigma).

  Args:
    x: Tensor of observations.
    mu: Mean of the Gaussian.
    sigma: Standard deviation of the Gaussian.
    reduce_sum: If True, return the summed log-density; otherwise elementwise.

  Returns:
    Log pdf values, summed to a scalar when reduce_sum is True.
  """
  log_probs = tfd.Normal(mu, sigma).log_prob(x)
  if not reduce_sum:
    return log_probs
  return tf.reduce_sum(log_probs)
def analytic_kl(mu_1, sigma_1, mu_2, sigma_2):
  """KL for two Gaussian distributions with diagonal covariance matrix.

  NOTE(review): tf.contrib.distributions documents this class as
  `MultivariateNormalDiag`; `MVNDiag` may not resolve under that alias in
  all TF versions — verify against the targeted TF release.
  """
  kl = tfd.kl_divergence(tfd.MVNDiag(mu_1, sigma_1), tfd.MVNDiag(mu_2, sigma_2))
  return kl
class BfVariationalNeuralBanditModel(BayesianNN):
  """Implements an approximate Bayesian NN using Variational Inference."""

  def __init__(self, hparams, name="BBBNN"):
    """Stores hyperparameters, picks sigma transforms, and builds the graph.

    Args:
      hparams: Hyperparameters (context_dim, num_actions, layer_sizes, ...).
      name: Name used for variable scopes and logging.
    """
    self.name = name
    self.hparams = hparams

    # One input per context feature; one output (reward) per action.
    self.n_in = self.hparams.context_dim
    self.n_out = self.hparams.num_actions
    self.layers = self.hparams.layer_sizes
    self.init_scale = self.hparams.init_scale
    self.f_num_points = None
    if "f_num_points" in hparams:
      self.f_num_points = self.hparams.f_num_points

    # The first `cleared_times_trained` trainings use a step count
    # interpolated between initial_training_steps and training_epochs.
    self.cleared_times_trained = self.hparams.cleared_times_trained
    self.initial_training_steps = self.hparams.initial_training_steps
    self.training_schedule = np.linspace(self.initial_training_steps,
                                         self.hparams.training_epochs,
                                         self.cleared_times_trained)
    self.verbose = getattr(self.hparams, "verbose", True)

    # Per-layer variational parameters, populated by build_layer.
    self.weights_m = {}
    self.weights_std = {}
    self.biases_m = {}
    self.biases_std = {}

    self.times_trained = 0

    if self.hparams.use_sigma_exp_transform:
      self.sigma_transform = tf.exp
      self.inverse_sigma_transform = np.log
    else:
      self.sigma_transform = tf.nn.softplus
      # Inverse of softplus: log(e^y - 1), written as y + log(1 - e^{-y}).
      self.inverse_sigma_transform = lambda y: y + np.log(1. - np.exp(-y))

    # Whether to use the local reparameterization trick to compute the loss.
    # See details in https://arxiv.org/abs/1506.02557
    self.use_local_reparameterization = True

    self.build_graph()

  def build_mu_variable(self, shape):
    """Returns a mean variable initialized as N(0, 0.05)."""
    return tf.Variable(tf.random_normal(shape, 0.0, 0.05))

  def build_sigma_variable(self, shape, init=-5.):
    """Returns a sigma variable initialized as N(init, 0.05)."""
    # Initialize sigma to be very small initially to encourage MAP opt first
    return tf.Variable(tf.random_normal(shape, init, 0.05))

  def build_layer(self, input_x, input_x_local, shape,
                  layer_id, activation_fn=tf.nn.relu):
    """Builds a variational layer, and computes KL term.

    Args:
      input_x: Input to the variational layer.
      input_x_local: Input when the local reparameterization trick was applied.
      shape: [number_inputs, number_outputs] for the layer.
      layer_id: Number of layer in the architecture.
      activation_fn: Activation function to apply.

    Returns:
      output_h: Output of the variational layer.
      output_h_local: Output when local reparameterization trick was applied.
      neg_kl: Negative KL term for the layer.
    """
    w_mu = self.build_mu_variable(shape)
    w_sigma = self.sigma_transform(self.build_sigma_variable(shape))

    w_noise = tf.random_normal(shape)
    w = w_mu + w_sigma * w_noise

    b_mu = self.build_mu_variable([1, shape[1]])
    b_sigma = self.sigma_transform(self.build_sigma_variable([1, shape[1]]))
    # NOTE(review): the bias is used deterministically (mean only) — b_sigma
    # is created and stored but no bias noise is sampled here; confirm this
    # is intentional.
    b = b_mu

    # Store means and stds
    self.weights_m[layer_id] = w_mu
    self.weights_std[layer_id] = w_sigma
    self.biases_m[layer_id] = b_mu
    self.biases_std[layer_id] = b_sigma

    # Create outputs
    output_h = activation_fn(tf.matmul(input_x, w) + b)

    if self.use_local_reparameterization:
      # Use analytic KL divergence wrt the prior
      neg_kl = -analytic_kl(w_mu, w_sigma,
                            0., tf.to_float(np.sqrt(2./shape[0])))
    else:
      # Create empirical KL loss terms
      log_p = log_gaussian(w, 0., tf.to_float(np.sqrt(2./shape[0])))
      log_q = log_gaussian(w, tf.stop_gradient(w_mu), tf.stop_gradient(w_sigma))
      neg_kl = log_p - log_q

    # Apply local reparameterization trick: sample activations pre nonlinearity
    m_h = tf.matmul(input_x_local, w_mu) + b
    v_h = tf.matmul(tf.square(input_x_local), tf.square(w_sigma))
    # 1e-6 guards the sqrt against zero variance.
    output_h_local = m_h + tf.sqrt(v_h + 1e-6) * tf.random_normal(tf.shape(v_h))
    output_h_local = activation_fn(output_h_local)

    return output_h, output_h_local, neg_kl

  def build_action_noise(self):
    """Defines a model for additive noise per action, and its KL term.

    Returns:
      neg_kl_term: Negative KL of the noise-sigma posterior vs its prior
        (0. when noise sigma is not inferred).
    """
    # Define mean and std variables (log-normal dist) for each action.
    noise_sigma_mu = (self.build_mu_variable([1, self.n_out])
                      + self.inverse_sigma_transform(self.hparams.noise_sigma))
    noise_sigma_sigma = self.sigma_transform(
        self.build_sigma_variable([1, self.n_out]))

    pre_noise_sigma = (noise_sigma_mu
                       + tf.random_normal([1, self.n_out]) * noise_sigma_sigma)
    self.noise_sigma = self.sigma_transform(pre_noise_sigma)

    # Compute KL for additive noise sigma terms.
    if getattr(self.hparams, "infer_noise_sigma", False):
      neg_kl_term = log_gaussian(
          pre_noise_sigma,
          self.inverse_sigma_transform(self.hparams.noise_sigma),
          self.hparams.prior_sigma
      )
      neg_kl_term -= log_gaussian(pre_noise_sigma,
                                  noise_sigma_mu,
                                  noise_sigma_sigma)
    else:
      neg_kl_term = 0.

    return neg_kl_term

  def build_model(self, activation_fn=tf.nn.relu):
    """Defines the actual NN model with fully connected layers.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss).
    Selects the optimizer and, finally, it also initializes the graph.

    Args:
      activation_fn: the activation function used in the nn layers.
    """

    def weight_prior(dtype, shape, c, d, e):
      # Kernel prior N(0, sqrt(2)/fan_in); the extra callback args are unused.
      # NOTE(review): build_layer's prior uses sqrt(2/fan_in) — the two
      # scales differ; confirm which is intended.
      del c, d, e
      return tfd.Independent(
          tfd.Normal(loc=tf.zeros(shape, dtype),
                     scale=tf.to_float(np.sqrt(2) / shape[0])),
          reinterpreted_batch_ndims=tf.size(shape))

    if self.verbose:
      print("Initializing model {}.".format(self.name))

    # Compute model additive noise for each action with log-normal distribution
    neg_kl_term = self.build_action_noise()

    # Build variational network using self.x as input.
    input_x = self.x

    # Create Keras model using DenseLocalReparameterization (prior N(0, 1)).
    # NOTE(review): tfl is bound to tf.contrib.layers in this file, but
    # DenseLocalReparameterization is documented under tfp.layers — verify
    # it resolves in the targeted TF version.
    model_layers = [
        tfl.DenseLocalReparameterization(
            n_nodes,
            activation=tf.nn.relu,
            kernel_prior_fn=weight_prior
        )
        for n_nodes in self.layers if n_nodes > 0
    ]
    output_layer = tfl.DenseLocalReparameterization(
        self.n_out,
        activation=lambda x: x,
        kernel_prior_fn=weight_prior
    )
    model_layers.append(output_layer)
    model = tf.keras.Sequential(model_layers)

    self.y_pred = model(input_x)

    # Compute KL term: each layer contributes its KL through model.losses.
    neg_kl_term -= tf.add_n(model.losses)

    # Compute log likelihood (with learned or fixed noise level)
    if getattr(self.hparams, "infer_noise_sigma", False):
      log_likelihood = log_gaussian(
          self.y, self.y_pred, self.noise_sigma, reduce_sum=False)
    else:
      log_likelihood = log_gaussian(
          self.y, self.y_pred, self.hparams.noise_sigma, reduce_sum=False)

    # Only take into account observed outcomes (bandits setting)
    batch_size = tf.to_float(tf.shape(self.x)[0])
    weighted_log_likelihood = tf.reduce_sum(
        log_likelihood * self.weights) / batch_size

    # The objective is 1/n * (\sum_i log_like_i - KL); neg_kl_term estimates -KL
    elbo = weighted_log_likelihood + (neg_kl_term / self.n)

    self.loss = -elbo
    self.global_step = tf.train.get_or_create_global_step()
    self.train_op = tf.train.AdamOptimizer(self.hparams.initial_lr).minimize(
        self.loss, global_step=self.global_step)

    # Create tensorboard metrics
    self.create_summaries()
    self.summary_writer = tf.summary.FileWriter(
        "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)

  def build_graph(self):
    """Defines graph, session, placeholders, and model.

    Placeholders are: n (size of the dataset), x and y (context and observed
    reward for each action), and weights (one-hot encoding of selected action
    for each context, i.e., only possibly non-zero element in each y).
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
      self.sess = tf.Session()

      self.n = tf.placeholder(shape=[], dtype=tf.float32)
      self.x = tf.placeholder(shape=[None, self.n_in], dtype=tf.float32)
      self.y = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)
      self.weights = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)

      self.build_model()
      self.sess.run(tf.global_variables_initializer())

  def create_summaries(self):
    """Defines summaries including mean loss, and global step."""
    with self.graph.as_default():
      with tf.name_scope(self.name + "_summaries"):
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("global_step", self.global_step)
        self.summary_op = tf.summary.merge_all()

  def assign_lr(self):
    """Resets the learning rate in dynamic schedules for subsequent trainings.

    In bandits settings, we do expand our dataset over time. Then, we need to
    re-train the network with the new data. The algorithms that do not keep
    the step constant, can reset it at the start of each *training* process.
    """
    decay_steps = 1
    if self.hparams.activate_decay:
      current_gs = self.sess.run(self.global_step)
      with self.graph.as_default():
        # Offset by the current global step so decay restarts from zero.
        self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
                                              self.global_step - current_gs,
                                              decay_steps,
                                              self.hparams.lr_decay_rate)

  def train(self, data, num_steps):
    """Trains the BNN for num_steps, using the data in 'data'.

    Args:
      data: ContextualDataset object that provides the data.
      num_steps: Number of minibatches to train the network for.

    Returns:
      losses: Loss history during training.
    """
    # Early trainings follow the precomputed schedule rather than the
    # caller-supplied num_steps.
    if self.times_trained < self.cleared_times_trained:
      num_steps = int(self.training_schedule[self.times_trained])
    self.times_trained += 1

    losses = []
    with self.graph.as_default():
      if self.verbose:
        print("Training {} for {} steps...".format(self.name, num_steps))
      for step in range(num_steps):
        x, y, weights = data.get_batch_with_weights(self.hparams.batch_size)
        _, summary, global_step, loss = self.sess.run(
            [self.train_op, self.summary_op, self.global_step, self.loss],
            feed_dict={
                self.x: x,
                self.y: y,
                self.weights: weights,
                self.n: data.num_points(self.f_num_points),
            })
        losses.append(loss)

        if step % self.hparams.freq_summary == 0:
          if self.hparams.show_training:
            print("{} | step: {}, loss: {}".format(
                self.name, global_step, loss))
          self.summary_writer.add_summary(summary, global_step)

      return losses
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contextual algorithm based on boostrapping neural networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from bandits.core.bandit_algorithm import BanditAlgorithm
from bandits.core.contextual_dataset import ContextualDataset
from bandits.algorithms.neural_bandit_model import NeuralBanditModel
class BootstrappedBNNSampling(BanditAlgorithm):
  """Thompson Sampling algorithm based on training several neural networks."""

  def __init__(self, name, hparams, optimizer='RMS'):
    """Creates a BootstrappedSGDSampling object based on a specific optimizer.

    hparams.q: Number of models that are independently trained.
    hparams.p: Prob of independently including each datapoint in each model.

    Args:
      name: Name given to the instance.
      hparams: Hyperparameters for each individual model.
      optimizer: Neural network optimization algorithm.
    """
    self.name = name
    self.hparams = hparams
    self.optimizer_n = optimizer

    self.training_freq = hparams.training_freq
    self.training_epochs = hparams.training_epochs
    self.t = 0
    self.q = hparams.q
    self.p = hparams.p

    # One independent replay buffer and one network per bootstrap model.
    self.datasets = []
    self.bnn_boot = []
    for i in range(self.q):
      self.datasets.append(
          ContextualDataset(hparams.context_dim,
                            hparams.num_actions,
                            hparams.buffer_s))
      self.bnn_boot.append(
          NeuralBanditModel(optimizer, hparams, '{}-{}-bnn'.format(name, i)))

  def action(self, context):
    """Selects action for context based on Thompson Sampling using one BNN."""
    num_actions = self.hparams.num_actions
    if self.t < num_actions * self.hparams.initial_pulls:
      # Round robin until each action has been taken "initial_pulls" times.
      return self.t % num_actions

    # Pick one bootstrapped model uniformly at random; act greedily on it.
    chosen = self.bnn_boot[np.random.randint(self.q)]
    with chosen.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      output = chosen.sess.run(chosen.y_pred, feed_dict={chosen.x: c})
    return np.argmax(output)

  def update(self, context, action, reward):
    """Updates the data buffer, and re-trains the BNN every self.freq_update."""
    self.t += 1
    for dataset in self.datasets:
      # Each datapoint enters each buffer independently with probability p;
      # the first datapoint is always kept so buffers start non-empty.
      if np.random.random() < self.p or self.t < 2:
        dataset.add(context, action, reward)

    if self.t % self.training_freq == 0:
      # Re-train every model on its own buffer.
      for model, dataset in zip(self.bnn_boot, self.datasets):
        if self.hparams.reset_lr:
          model.assign_lr()
        model.train(dataset, self.training_epochs)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contextual bandit algorithm that selects an action at random."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from bandits.core.bandit_algorithm import BanditAlgorithm
class FixedPolicySampling(BanditAlgorithm):
  """Defines a baseline; returns an action at random with probs given by p."""

  def __init__(self, name, p, hparams):
    """Creates a FixedPolicySampling object.

    Args:
      name: Name of the algorithm.
      p: Vector of normalized probabilities corresponding to sampling each arm.
      hparams: Hyper-parameters, including the number of arms (num_actions).

    Raises:
      ValueError: when p dimension does not match the number of actions.
    """
    self.name = name
    self.p = p
    self.hparams = hparams
    if len(p) != self.hparams.num_actions:
      raise ValueError('Policy needs k probabilities.')

  def action(self, context):
    """Selects an action at random according to distribution p."""
    arms = range(self.hparams.num_actions)
    return np.random.choice(arms, p=self.p)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contextual algorithm that keeps a full linear posterior for each arm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from scipy.stats import invgamma
from bandits.core.bandit_algorithm import BanditAlgorithm
from bandits.core.contextual_dataset import ContextualDataset
class LinearFullPosteriorSampling(BanditAlgorithm):
  """Thompson Sampling with independent linear models and unknown noise var."""

  def __init__(self, name, hparams):
    """Initialize posterior distributions and hyperparameters.

    Assume a linear model for each action i: reward = context^T beta_i + noise
    Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i (noise
    level) has an inverse Gamma prior (a0, b0 parameters). Mean, covariance,
    and precision matrices are initialized, and the ContextualDataset created.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
    """

    self.name = name
    self.hparams = hparams

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    # The "+ 1" accounts for the intercept appended to each context.
    self.mu = [
        np.zeros(self.hparams.context_dim + 1)
        for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.hparams.context_dim + 1)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.hparams.context_dim + 1)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    self.t = 0
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=True)

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Context for which the action need to be chosen.

    Returns:
      action: Selected action for the context.
    """

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite.
      # BUG FIX: Python 3 exceptions have no `.message` attribute — format
      # the exception object itself instead of `e.message`.
      print('Exception when sampling from {}.'.format(self.name))
      print('Details: {} | {}.'.format(e, e.args))
      # Fall back to standard-normal samples so the algorithm keeps running.
      d = self.hparams.context_dim + 1
      beta_s = [
          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute sampled expected values, intercept is last component of beta
    vals = [
        np.dot(beta_s[i][:-1], context.T) + beta_s[i][-1]
        for i in range(self.hparams.num_actions)
    ]

    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates action posterior using the linear Bayesian regression formula.

    Args:
      context: Last observed context.
      action: Last observed action.
      reward: Last observed reward.
    """
    self.t += 1
    self.data_h.add(context, action, reward)

    # Update posterior of action with formulas: \beta | x,y ~ N(mu_q, cov_q)
    x, y = self.data_h.get_data(action)

    # The algorithm could be improved with sequential update formulas (cheaper)
    s = np.dot(x.T, x)

    # Some terms are removed as we assume prior mu_0 = 0.
    precision_a = s + self.lambda_prior * np.eye(self.hparams.context_dim + 1)
    cov_a = np.linalg.inv(precision_a)
    mu_a = np.dot(cov_a, np.dot(x.T, y))

    # Inverse Gamma posterior update
    a_post = self.a0 + x.shape[0] / 2.0
    b_upd = 0.5 * (np.dot(y.T, y) - np.dot(mu_a.T, np.dot(precision_a, mu_a)))
    b_post = self.b0 + b_upd

    # Store new posterior distributions
    self.mu[action] = mu_a
    self.cov[action] = cov_a
    self.precision[action] = precision_a
    self.a[action] = a_post
    self.b[action] = b_post

  @property
  def a0(self):
    return self._a0

  @property
  def b0(self):
    return self._b0

  @property
  def lambda_prior(self):
    return self._lambda_prior
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A Multitask Gaussian process."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
from bandits.core.bayesian_nn import BayesianNN
FLAGS = flags.FLAGS
tfd = tf.contrib.distributions
class MultitaskGP(BayesianNN):
"""Implements a Gaussian process with multi-task outputs.
Optimizes the hyperparameters over the log marginal likelihood.
Uses a Matern 3/2 + linear covariance and returns
sampled predictions for test inputs. The outputs are optionally
correlated where the correlation structure is learned through latent
embeddings of the tasks.
"""
  def __init__(self, hparams):
    """Stores hyperparameters and builds the GP graph and session.

    Args:
      hparams: Hyper-parameters (context_dim, num_outputs, task_latent_dim,
        max_num_points, learn_embeddings, ...).
    """
    self.name = "MultiTaskGP"
    self.hparams = hparams

    self.n_in = self.hparams.context_dim
    self.n_out = self.hparams.num_outputs
    self.keep_fixed_after_max_obs = self.hparams.keep_fixed_after_max_obs

    self._show_training = self.hparams.show_training
    self._freq_summary = self.hparams.freq_summary

    # Dimensionality of the latent task vectors
    self.task_latent_dim = self.hparams.task_latent_dim

    # Maximum number of observations to include
    self.max_num_points = self.hparams.max_num_points

    # NOTE(review): when hparams.learn_embeddings is falsy this branch also
    # yields False, so the if/else collapses to a plain assignment — kept
    # as-is here.
    if self.hparams.learn_embeddings:
      self.learn_embeddings = self.hparams.learn_embeddings
    else:
      self.learn_embeddings = False

    # create the graph corresponding to the BNN instance
    self.graph = tf.Graph()
    with self.graph.as_default():
      # store a new session for the graph
      self.sess = tf.Session()

      with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        # Placeholders: n (dataset size), x/x_in (contexts), y (rewards),
        # weights (one-hot mask of the selected action per context).
        self.n = tf.placeholder(shape=[], dtype=tf.float64)
        self.x = tf.placeholder(shape=[None, self.n_in], dtype=tf.float64)
        self.x_in = tf.placeholder(shape=[None, self.n_in], dtype=tf.float64)
        self.y = tf.placeholder(shape=[None, self.n_out], dtype=tf.float64)
        self.weights = tf.placeholder(shape=[None, self.n_out],
                                      dtype=tf.float64)

        self.build_model()
      self.sess.run(tf.global_variables_initializer())
def atleast_2d(self, x, dims):
return tf.reshape(tf.expand_dims(x, axis=0), (-1, dims))
def sq_dist(self, x, x2):
a2 = tf.reduce_sum(tf.square(x), 1)
b2 = tf.reduce_sum(tf.square(x2), 1)
sqdists = tf.expand_dims(a2, 1) + b2 - 2.0 * tf.matmul(x, tf.transpose(x2))
return sqdists
# Covariance between outputs
def task_cov(self, x, x2):
"""Squared Exponential Covariance Kernel over latent task embeddings."""
# Index into latent task vectors
x_vecs = tf.gather(self.task_vectors, tf.argmax(x, axis=1), axis=0)
x2_vecs = tf.gather(self.task_vectors, tf.argmax(x2, axis=1), axis=0)
r = self.sq_dist(self.atleast_2d(x_vecs, self.task_latent_dim),
self.atleast_2d(x2_vecs, self.task_latent_dim))
return tf.exp(-r)
  def cov(self, x, x2):
    """Matern 3/2 + Linear Gaussian Process Covariance Function.

    Args:
      x: First batch of inputs, rows are points.
      x2: Second batch of inputs.

    Returns:
      Covariance matrix combining an amplitude-scaled Matern 3/2 kernel and
      a linear kernel, each with their own (softplus-positive) length scales.
    """
    # Clip raw length scales to keep softplus outputs in a sane range.
    ls = tf.clip_by_value(self.length_scales, -5.0, 5.0)
    ls_lin = tf.clip_by_value(self.length_scales_lin, -5.0, 5.0)

    # Scaled squared distances; clipped below at 0 to absorb tiny negatives
    # from floating point cancellation in sq_dist.
    r = self.sq_dist(self.atleast_2d(x, self.n_in)/tf.nn.softplus(ls),
                     self.atleast_2d(x2, self.n_in)/tf.nn.softplus(ls))
    r = tf.clip_by_value(r, 0, 1e8)

    # Matern 3/2 Covariance; 1e-16 guards the sqrt gradient at r = 0.
    matern = (1.0 + tf.sqrt(3.0*r + 1e-16)) * tf.exp(-tf.sqrt(3.0*r + 1e-16))

    # Linear Covariance
    lin = tf.matmul(x / tf.nn.softplus(ls_lin),
                    x2 / tf.nn.softplus(ls_lin), transpose_b=True)
    return (tf.nn.softplus(self.amplitude) * matern +
            tf.nn.softplus(self.amplitude_linear) * lin)
  def build_model(self):
    """Defines the GP model.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss).
    Selects the optimizer and, finally, it also initializes the graph.
    """
    logging.info("Initializing model %s.", self.name)
    self.global_step = tf.train.get_or_create_global_step()
    # Define state for the model (inputs, etc.)
    # Training data is cached in non-trainable variables so predictions can
    # be made later without re-feeding it; shapes are not validated because
    # the number of stored observations grows over time.
    self.x_train = tf.get_variable(
        "training_data",
        initializer=tf.ones(
            [self.hparams.batch_size, self.n_in], dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.y_train = tf.get_variable(
        "training_labels",
        initializer=tf.zeros([self.hparams.batch_size, 1], dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.weights_train = tf.get_variable(
        "weights_train",
        initializer=tf.ones(
            [self.hparams.batch_size, self.n_out], dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    # Ops copying the fed placeholders into the cached variables.
    self.input_op = tf.assign(self.x_train, self.x_in, validate_shape=False)
    self.input_w_op = tf.assign(
        self.weights_train, self.weights, validate_shape=False)
    self.input_std = tf.get_variable(
        "data_standard_deviation",
        initializer=tf.ones([1, self.n_out], dtype=tf.float64),
        dtype=tf.float64,
        trainable=False)
    # NOTE(review): the data mean is trainable (it appears in
    # vars_to_optimize below), unlike the standard deviation — presumably
    # intentional; confirm.
    self.input_mean = tf.get_variable(
        "data_mean",
        initializer=tf.zeros([1, self.n_out], dtype=tf.float64),
        dtype=tf.float64,
        trainable=True)
    # GP Hyperparameters, stored unconstrained; softplus maps them to
    # positive values where needed (see cov() and alpha below).
    self.noise = tf.get_variable(
        "noise", initializer=tf.cast(0.0, dtype=tf.float64))
    self.amplitude = tf.get_variable(
        "amplitude", initializer=tf.cast(1.0, dtype=tf.float64))
    self.amplitude_linear = tf.get_variable(
        "linear_amplitude", initializer=tf.cast(1.0, dtype=tf.float64))
    self.length_scales = tf.get_variable(
        "length_scales", initializer=tf.zeros([1, self.n_in], dtype=tf.float64))
    self.length_scales_lin = tf.get_variable(
        "length_scales_linear",
        initializer=tf.zeros([1, self.n_in], dtype=tf.float64))
    # Latent embeddings of the different outputs for task covariance
    self.task_vectors = tf.get_variable(
        "latent_task_vectors",
        initializer=tf.random_normal(
            [self.n_out, self.task_latent_dim], dtype=tf.float64))
    # Normalize outputs across each dimension
    # Since we have different numbers of observations across each task, we
    # normalize by their respective counts.
    index_counts = self.atleast_2d(tf.reduce_sum(self.weights, axis=0),
                                   self.n_out)
    # Guard against division by zero for tasks with no observations yet.
    index_counts = tf.where(index_counts > 0, index_counts,
                            tf.ones(tf.shape(index_counts), dtype=tf.float64))
    self.mean_op = tf.assign(self.input_mean,
                             tf.reduce_sum(self.y, axis=0) / index_counts)
    # Per-task standard deviation; 1e-4 jitter avoids zero std.
    self.var_op = tf.assign(
        self.input_std, tf.sqrt(1e-4 + tf.reduce_sum(tf.square(
            self.y - tf.reduce_sum(self.y, axis=0) / index_counts), axis=0)
                                / index_counts))
    with tf.control_dependencies([self.var_op]):
      y_normed = self.atleast_2d(
          (self.y - self.input_mean) / self.input_std, self.n_out)
      # Keep only the observed outcomes, flattened into one column.
      y_normed = self.atleast_2d(tf.boolean_mask(y_normed, self.weights > 0), 1)
      self.out_op = tf.assign(self.y_train, y_normed, validate_shape=False)
    # Observation noise
    alpha = tf.nn.softplus(self.noise) + 1e-6
    # Covariance
    with tf.control_dependencies([self.input_op, self.input_w_op, self.out_op]):
      # Input covariance modulated by the task covariance, plus noise on the
      # diagonal to keep the Cholesky factorization stable.
      self.self_cov = (self.cov(self.x_in, self.x_in) *
                       self.task_cov(self.weights, self.weights) +
                       tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64) * alpha)
      self.chol = tf.cholesky(self.self_cov)
      # Inverse kernel matrix, obtained via the Cholesky factor.
      self.kinv = tf.cholesky_solve(self.chol, tf.eye(tf.shape(self.x_in)[0],
                                                      dtype=tf.float64))
    # Cached copy of the inverse kernel matrix, used later for predictions.
    self.input_inv = tf.Variable(
        tf.eye(self.hparams.batch_size, dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.input_cov_op = tf.assign(self.input_inv, self.kinv,
                                  validate_shape=False)
    # Log determinant by taking the singular values along the diagonal
    # of self.chol
    with tf.control_dependencies([self.input_cov_op]):
      logdet = 2.0 * tf.reduce_sum(tf.log(tf.diag_part(self.chol) + 1e-16))
      # Log Marginal likelihood
      # Note the leading minus: marginal_ll is the *negative* log marginal
      # likelihood, so minimizing the loss maximizes the evidence.
      self.marginal_ll = -tf.reduce_sum(-0.5 * tf.matmul(
          tf.transpose(y_normed), tf.matmul(self.kinv, y_normed)) - 0.5 * logdet -
          0.5 * self.n * np.log(2 * np.pi))
    zero = tf.cast(0., dtype=tf.float64)
    one = tf.cast(1., dtype=tf.float64)
    standard_normal = tfd.Normal(loc=zero, scale=one)
    # Loss is marginal likelihood and priors
    # Standard normal priors on the amplitudes and on exp(noise); a wider
    # N(0, 10) prior on the latent task vectors.
    self.loss = tf.reduce_sum(
        self.marginal_ll -
        (standard_normal.log_prob(self.amplitude) +
         standard_normal.log_prob(tf.exp(self.noise)) +
         standard_normal.log_prob(self.amplitude_linear) +
         tfd.Normal(loc=zero, scale=one * 10.).log_prob(
             self.task_vectors))
    )
    # Optimizer for hyperparameters
    optimizer = tf.train.AdamOptimizer(learning_rate=self.hparams.lr)
    vars_to_optimize = [
        self.amplitude, self.length_scales, self.length_scales_lin,
        self.amplitude_linear, self.noise, self.input_mean
    ]
    if self.learn_embeddings:
      vars_to_optimize.append(self.task_vectors)
    grads = optimizer.compute_gradients(self.loss, vars_to_optimize)
    self.train_op = optimizer.apply_gradients(grads,
                                              global_step=self.global_step)
    # Predictions for test data
    self.y_mean, self.y_pred = self.posterior_mean_and_sample(self.x)
    # create tensorboard metrics
    self.create_summaries()
    self.summary_writer = tf.summary.FileWriter("{}/graph_{}".format(
        FLAGS.logdir, self.name), self.sess.graph)
    # Fails fast on NaNs/Infs in the graph whenever self.check is run.
    self.check = tf.add_check_numerics_ops()
def posterior_mean_and_sample(self, candidates):
"""Draw samples for test predictions.
Given a Tensor of 'candidates' inputs, returns samples from the posterior
and the posterior mean prediction for those inputs.
Args:
candidates: A (num-examples x num-dims) Tensor containing the inputs for
which to return predictions.
Returns:
y_mean: The posterior mean prediction given these inputs
y_sample: A sample from the posterior of the outputs given these inputs
"""
# Cross-covariance for test predictions
w = tf.identity(self.weights_train)
inds = tf.squeeze(
tf.reshape(
tf.tile(
tf.reshape(tf.range(self.n_out), (self.n_out, 1)),
(1, tf.shape(candidates)[0])), (-1, 1)))
cross_cov = self.cov(tf.tile(candidates, [self.n_out, 1]), self.x_train)
cross_task_cov = self.task_cov(tf.one_hot(inds, self.n_out), w)
cross_cov *= cross_task_cov
# Test mean prediction
y_mean = tf.matmul(cross_cov, tf.matmul(self.input_inv, self.y_train))
# Test sample predictions
# Note this can be done much more efficiently using Kronecker products
# if all tasks are fully observed (which we won't assume)
test_cov = (
self.cov(tf.tile(candidates, [self.n_out, 1]),
tf.tile(candidates, [self.n_out, 1])) *
self.task_cov(tf.one_hot(inds, self.n_out),
tf.one_hot(inds, self.n_out)) -
tf.matmul(cross_cov,
tf.matmul(self.input_inv,
tf.transpose(cross_cov))))
# Get the matrix square root through an SVD for drawing samples
# This seems more numerically stable than the Cholesky
s, _, v = tf.svd(test_cov, full_matrices=True)
test_sqrt = tf.matmul(v, tf.matmul(tf.diag(s), tf.transpose(v)))
y_sample = (
tf.matmul(
test_sqrt,
tf.random_normal([tf.shape(test_sqrt)[0], 1], dtype=tf.float64)) +
y_mean)
y_sample = (
tf.transpose(tf.reshape(y_sample,
(self.n_out, -1))) * self.input_std +
self.input_mean)
return y_mean, y_sample
def create_summaries(self):
with self.graph.as_default():
tf.summary.scalar("loss", self.loss)
tf.summary.scalar("log_noise", self.noise)
tf.summary.scalar("log_amp", self.amplitude)
tf.summary.scalar("log_amp_lin", self.amplitude_linear)
tf.summary.histogram("length_scales", self.length_scales)
tf.summary.histogram("length_scales_lin", self.length_scales_lin)
self.summary_op = tf.summary.merge_all()
def train(self, data, num_steps):
"""Trains the GP for num_steps, using the data in 'data'.
Args:
data: ContextualDataset object that provides the data.
num_steps: Number of minibatches to train the network for.
"""
logging.info("Training %s for %d steps...", self.name, num_steps)
for step in range(num_steps):
numpts = min(data.num_points(None), self.max_num_points)
if numpts >= self.max_num_points and self.keep_fixed_after_max_obs:
x = data.contexts[:numpts, :]
y = data.rewards[:numpts, :]
weights = np.zeros((x.shape[0], self.n_out))
for i, val in enumerate(data.actions[:numpts]):
weights[i, val] = 1.0
else:
x, y, weights = data.get_batch_with_weights(numpts)
ops = [
self.global_step, self.summary_op, self.loss, self.noise,
self.amplitude, self.amplitude_linear, self.length_scales,
self.length_scales_lin, self.input_cov_op, self.input_op, self.var_op,
self.input_w_op, self.out_op, self.train_op
]
res = self.sess.run(ops,
feed_dict={self.x: x,
self.x_in: x,
self.y: y,
self.weights: weights,
self.n: numpts,
})
if step % self._freq_summary == 0:
if self._show_training:
logging.info("step: %d, loss: %g noise: %f amp: %f amp_lin: %f",
step, res[2], res[3], res[4], res[5])
summary = res[1]
global_step = res[0]
self.summary_writer.add_summary(summary, global_step=global_step)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Define a family of neural network architectures for bandits.
The network accepts different type of optimizers that could lead to different
approximations of the posterior distribution or simply to point estimates.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from absl import flags
from bandits.core.bayesian_nn import BayesianNN
FLAGS = flags.FLAGS
class NeuralBanditModel(BayesianNN):
  """Implements a neural network for bandit problems."""
  def __init__(self, optimizer, hparams, name):
    """Saves hyper-params and builds the Tensorflow graph.

    Args:
      optimizer: Name of the optimizer used to train the network
        (see select_optimizer).
      hparams: Hyper-parameters; this class reads layer_sizes, context_dim,
        num_actions, init_scale, batch_size, initial_lr, activate_decay,
        lr_decay_rate, max_grad_norm, freq_summary and show_training.
      name: Name of the model; used for scoping ops and summaries.
    """
    self.opt_name = optimizer
    self.name = name
    self.hparams = hparams
    self.verbose = getattr(self.hparams, "verbose", True)
    # Number of completed calls to train(); bookkeeping for callers.
    self.times_trained = 0
    self.build_model()
  def build_layer(self, x, num_units):
    """Builds a layer with input x; dropout and layer norm if specified.

    Args:
      x: Input tensor to the fully connected layer.
      num_units: Number of output units of the layer.

    Returns:
      The layer's output tensor.
    """
    init_s = self.hparams.init_scale
    # Both options default to off when absent from the hyper-parameters.
    layer_n = getattr(self.hparams, "layer_norm", False)
    dropout = getattr(self.hparams, "use_dropout", False)
    nn = tf.contrib.layers.fully_connected(
        x,
        num_units,
        activation_fn=self.hparams.activation,
        normalizer_fn=None if not layer_n else tf.contrib.layers.layer_norm,
        normalizer_params={},
        weights_initializer=tf.random_uniform_initializer(-init_s, init_s)
    )
    if dropout:
      nn = tf.nn.dropout(nn, self.hparams.keep_prob)
    return nn
  def forward_pass(self):
    """Builds the network; returns (last hidden layer, action predictions)."""
    init_s = self.hparams.init_scale
    scope_name = "prediction_{}".format(self.name)
    # AUTO_REUSE allows forward_pass to be called again later (e.g. to
    # recompute predictions after a weight perturbation) sharing weights.
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
      nn = self.x
      # Non-positive sizes are skipped, so layers can be disabled via hparams.
      for num_units in self.hparams.layer_sizes:
        if num_units > 0:
          nn = self.build_layer(nn, num_units)
      y_pred = tf.layers.dense(
          nn,
          self.hparams.num_actions,
          kernel_initializer=tf.random_uniform_initializer(-init_s, init_s))
    return nn, y_pred
  def build_model(self):
    """Defines the actual NN model with fully connected layers.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss).
    Selects the optimizer and, finally, it also initializes the graph.
    """
    # create and store the graph corresponding to the BNN instance
    self.graph = tf.Graph()
    with self.graph.as_default():
      # create and store a new session for the graph
      self.sess = tf.Session()
      with tf.name_scope(self.name):
        self.global_step = tf.train.get_or_create_global_step()
        # context
        self.x = tf.placeholder(
            shape=[None, self.hparams.context_dim],
            dtype=tf.float32,
            name="{}_x".format(self.name))
        # reward vector
        self.y = tf.placeholder(
            shape=[None, self.hparams.num_actions],
            dtype=tf.float32,
            name="{}_y".format(self.name))
        # weights (1 for selected action, 0 otherwise)
        self.weights = tf.placeholder(
            shape=[None, self.hparams.num_actions],
            dtype=tf.float32,
            name="{}_w".format(self.name))
        # with tf.variable_scope("prediction_{}".format(self.name)):
        self.nn, self.y_pred = self.forward_pass()
        # Masking by self.weights ensures only the observed action's outcome
        # contributes to the cost (partial feedback).
        self.loss = tf.squared_difference(self.y_pred, self.y)
        self.weighted_loss = tf.multiply(self.weights, self.loss)
        self.cost = tf.reduce_sum(self.weighted_loss) / self.hparams.batch_size
        if self.hparams.activate_decay:
          self.lr = tf.train.inverse_time_decay(
              self.hparams.initial_lr, self.global_step,
              1, self.hparams.lr_decay_rate)
        else:
          self.lr = tf.Variable(self.hparams.initial_lr, trainable=False)
        # create tensorboard metrics
        self.create_summaries()
        self.summary_writer = tf.summary.FileWriter(
            "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)
        # Gradients are clipped by global norm before being applied.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars), self.hparams.max_grad_norm)
        self.optimizer = self.select_optimizer()
        self.train_op = self.optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step)
        self.init = tf.global_variables_initializer()
        self.initialize_graph()
  def initialize_graph(self):
    """Initializes all variables."""
    with self.graph.as_default():
      if self.verbose:
        print("Initializing model {}.".format(self.name))
      self.sess.run(self.init)
  def assign_lr(self):
    """Resets the learning rate in dynamic schedules for subsequent trainings.

    In bandits settings, we do expand our dataset over time. Then, we need to
    re-train the network with the new data. The algorithms that do not keep
    the step constant, can reset it at the start of each *training* process.
    """
    decay_steps = 1
    if self.hparams.activate_decay:
      current_gs = self.sess.run(self.global_step)
      with self.graph.as_default():
        # Offsetting by the current global step restarts the decay schedule
        # from its initial learning rate.
        self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
                                              self.global_step - current_gs,
                                              decay_steps,
                                              self.hparams.lr_decay_rate)
  def select_optimizer(self):
    """Selects optimizer. To be extended (SGLD, KFAC, etc)."""
    return tf.train.RMSPropOptimizer(self.lr)
  def create_summaries(self):
    """Defines summaries including mean loss, learning rate, and global step."""
    with self.graph.as_default():
      with tf.name_scope(self.name + "_summaries"):
        tf.summary.scalar("cost", self.cost)
        tf.summary.scalar("lr", self.lr)
        tf.summary.scalar("global_step", self.global_step)
        self.summary_op = tf.summary.merge_all()
  def train(self, data, num_steps):
    """Trains the network for num_steps, using the provided data.

    Args:
      data: ContextualDataset object that provides the data.
      num_steps: Number of minibatches to train the network for.
    """
    if self.verbose:
      print("Training {} for {} steps...".format(self.name, num_steps))
    with self.graph.as_default():
      for step in range(num_steps):
        x, y, w = data.get_batch_with_weights(self.hparams.batch_size)
        _, cost, summary, lr = self.sess.run(
            [self.train_op, self.cost, self.summary_op, self.lr],
            feed_dict={self.x: x, self.y: y, self.weights: w})
        if step % self.hparams.freq_summary == 0:
          if self.hparams.show_training:
            print("{} | step: {}, lr: {}, loss: {}".format(
                self.name, step, lr, cost))
          # NOTE(review): summaries are logged against the minibatch step,
          # not the global step, so repeated trainings overwrite earlier
          # points on the TensorBoard x-axis — confirm if intended.
          self.summary_writer.add_summary(summary, step)
    self.times_trained += 1
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Thompson Sampling with linear posterior over a learnt deep representation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from scipy.stats import invgamma
from bandits.core.bandit_algorithm import BanditAlgorithm
from bandits.core.contextual_dataset import ContextualDataset
from bandits.algorithms.neural_bandit_model import NeuralBanditModel
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net."""

  def __init__(self, name, hparams, optimizer='RMS'):
    """Sets the priors, the datasets, and the underlying neural network.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters; this class reads num_actions, layer_sizes,
        lambda_prior, a0, b0, training_freq, training_freq_network,
        training_epochs, context_dim, initial_pulls and reset_lr.
      optimizer: Name of the optimizer used to train the network.
    """
    self.name = name
    self.hparams = hparams
    # The linear regression operates on the last hidden layer's activations.
    self.latent_dim = self.hparams.layer_sizes[-1]

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    # data_h holds raw contexts; latent_h holds the corresponding last-layer
    # representations (refreshed after every network re-training).
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Context vector for which to choose an action.

    Returns:
      Index of the selected action.
    """
    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]
    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite.
      # BUG FIX: Python 3 exceptions have no `.message` attribute (accessing
      # it raised AttributeError here); use str(e) instead.
      print('Exception when sampling for {}.'.format(self.name))
      print('Details: {} | {}.'.format(str(e), e.args))
      # Fall back to sampling each beta from the standard Gaussian prior.
      d = self.latent_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T) for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the posterior using linear bayesian regression formula.

    Args:
      context: Context observed at this step.
      action: Action taken at this step.
      reward: Reward obtained for (context, action).
    """
    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)
      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

    # Update the Bayesian Linear Regression
    if self.t % self.update_freq_lr == 0:
      # Find all the actions to update
      # NOTE(review): this slice takes all actions *except* the most recent
      # update_freq_lr ones; if the intent is to refresh the arms played
      # since the previous update, it should be [-self.update_freq_lr:] —
      # confirm before changing.
      actions_to_update = self.latent_h.actions[:-self.update_freq_lr]
      for action_v in np.unique(actions_to_update):
        # Update action posterior with formulas: \beta | z,y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)
        # The algorithm could be improved with sequential formulas (cheaper)
        s = np.dot(z.T, z)
        # Some terms are removed as we assume prior mu_0 = 0.
        precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, np.dot(z.T, y))
        # Inverse Gamma posterior update
        a_post = self.a0 + z.shape[0] / 2.0
        b_upd = 0.5 * np.dot(y.T, y)
        b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
        b_post = self.b0 + b_upd
        # Store new posterior distributions
        self.mu[action_v] = mu_a
        self.cov[action_v] = cov_a
        self.precision[action_v] = precision_a
        self.a[action_v] = a_post
        self.b[action_v] = b_post

  @property
  def a0(self):
    # Inverse Gamma prior shape parameter.
    return self._a0

  @property
  def b0(self):
    # Inverse Gamma prior scale parameter.
    return self._b0

  @property
  def lambda_prior(self):
    # Precision of the Gaussian prior over each beta.
    return self._lambda_prior
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contextual algorithm based on Thompson Sampling + direct noise injection."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from scipy.special import logsumexp
import tensorflow as tf
from absl import flags
from bandits.core.bandit_algorithm import BanditAlgorithm
from bandits.core.contextual_dataset import ContextualDataset
from bandits.algorithms.neural_bandit_model import NeuralBanditModel
FLAGS = flags.FLAGS
class ParameterNoiseSampling(BanditAlgorithm):
  """Parameter Noise Sampling algorithm based on adding noise to net params.

  Described in https://arxiv.org/abs/1706.01905
  """
  def __init__(self, name, hparams):
    """Creates the algorithm, and sets up the adaptive Gaussian noise.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters; this class reads num_actions, context_dim,
        buffer_s, training_freq, training_epochs and initial_pulls.
    """
    self.name = name
    self.hparams = hparams
    self.verbose = getattr(self.hparams, 'verbose', True)
    # Initial std of injected noise; adapted over time in update_noise().
    self.noise_std = getattr(self.hparams, 'noise_std', 0.005)
    # Exploration parameter driving the adaptive-noise threshold delta.
    self.eps = getattr(self.hparams, 'eps', 0.05)
    # Number of datapoints used to estimate the KL in compute_distance().
    self.d_samples = getattr(self.hparams, 'd_samples', 300)
    self.optimizer = getattr(self.hparams, 'optimizer', 'RMS')
    # keep track of noise heuristic statistics
    self.std_h = [self.noise_std]
    self.eps_h = [self.eps]
    self.kl_h = []
    self.t = 0
    self.freq_update = hparams.training_freq
    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
                                    hparams.buffer_s)
    self.bnn = NeuralBanditModel(self.optimizer, hparams, '{}-bnn'.format(name))
    with self.bnn.graph.as_default():
      # noise-injection std placeholder
      self.bnn.noise_std_ph = tf.placeholder(tf.float32, shape=())
      # create noise corruption op; adds noise to all weights
      tvars = tf.trainable_variables()
      self.bnn.noisy_grads = [
          tf.random_normal(v.get_shape(), 0, self.bnn.noise_std_ph)
          for v in tvars
      ]
      # add noise to all params, then compute prediction, then subtract.
      # The nested control dependencies below enforce the
      # add -> predict -> subtract ordering within a single session run, so
      # the same noise sample is added and later removed.
      with tf.control_dependencies(self.bnn.noisy_grads):
        self.bnn.noise_add_ops = [
            tvars[i].assign_add(n) for i, n in enumerate(self.bnn.noisy_grads)
        ]
        with tf.control_dependencies(self.bnn.noise_add_ops):
          # we force the prediction for 'y' to be recomputed after adding noise
          self.bnn.noisy_nn, self.bnn.noisy_pred_val = self.bnn.forward_pass()
          self.bnn.noisy_pred = tf.identity(self.bnn.noisy_pred_val)
          with tf.control_dependencies([tf.identity(self.bnn.noisy_pred)]):
            self.bnn.noise_sub_ops = [
                tvars[i].assign_add(-n)
                for i, n in enumerate(self.bnn.noisy_grads)
            ]
  def action(self, context):
    """Selects action based on Thompson Sampling *after* adding noise.

    Args:
      context: Context vector for which to choose an action.

    Returns:
      Index of the selected action.
    """
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      # round robin until each action has been taken "initial_pulls" times
      return self.t % self.hparams.num_actions
    with self.bnn.graph.as_default():
      # run noise prediction op to choose action, and subtract noise op after.
      c = context.reshape((1, self.hparams.context_dim))
      output, _ = self.bnn.sess.run(
          [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
          feed_dict={self.bnn.x: c,
                     self.bnn.noise_std_ph: self.noise_std})
      return np.argmax(output)
  def update(self, context, action, reward):
    """Updates the data buffer, and re-trains the BNN and noise level.

    Args:
      context: Context observed at this step.
      action: Action taken at this step.
      reward: Reward obtained for (context, action).
    """
    self.t += 1
    self.data_h.add(context, action, reward)
    if self.t % self.freq_update == 0:
      self.bnn.train(self.data_h, self.num_epochs)
      self.update_noise()
  def update_noise(self):
    """Increase noise if distance btw original and corrupted distrib small."""
    kl = self.compute_distance()
    # KL threshold derived from eps; see https://arxiv.org/abs/1706.01905.
    delta = -np.log1p(- self.eps + self.eps / self.hparams.num_actions)
    # Multiplicative adaptation: widen noise while the perturbed policy
    # stays too close to the original, shrink it otherwise.
    if kl < delta:
      self.noise_std *= 1.01
    else:
      self.noise_std /= 1.01
    # Anneal eps so the targeted amount of exploration shrinks over time.
    self.eps *= 0.99
    if self.verbose:
      print('Update eps={} | kl={} | std={} | delta={} | increase={}.'.format(
          self.eps, kl, self.noise_std, delta, kl < delta))
    # store noise-injection statistics for inspection: std, KL, eps.
    self.std_h.append(self.noise_std)
    self.kl_h.append(kl)
    self.eps_h.append(self.eps)
  def compute_distance(self):
    """Computes empirical KL for original and corrupted output distributions.

    Returns:
      Mean KL between the softmax distributions induced by the clean and the
      noise-perturbed network logits, estimated on d_samples datapoints.
    """
    random_inputs, _ = self.data_h.get_batch(self.d_samples)
    # Clean logits: no noise ops are fetched here, so noise_std_ph is fed
    # but has no effect on y_pred.
    y_model = self.bnn.sess.run(
        self.bnn.y_pred,
        feed_dict={
            self.bnn.x: random_inputs,
            self.bnn.noise_std_ph: self.noise_std
        })
    # Perturbed logits; fetching noise_sub_ops restores the clean weights.
    y_noisy, _ = self.bnn.sess.run(
        [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
        feed_dict={
            self.bnn.x: random_inputs,
            self.bnn.noise_std_ph: self.noise_std
        })
    if self.verbose:
      # display how often original & perturbed models propose different actions
      s = np.sum([np.argmax(y_model[i, :]) == np.argmax(y_noisy[i, :])
                  for i in range(y_model.shape[0])])
      print('{} | % of agreement btw original / corrupted actions: {}.'.format(
          self.name, s / self.d_samples))
    kl = self.compute_kl_with_logits(y_model, y_noisy)
    return kl
  def compute_kl_with_logits(self, logits1, logits2):
    """Computes KL from logits samples from two distributions.

    Implements mean_i KL(softmax(logits1[i]) || softmax(logits2[i])) via
    KL = sum_j p_j (a_j - b_j) + logsumexp(b) - logsumexp(a).

    Args:
      logits1: Logits of the base distribution, shape (batch, num_actions).
      logits2: Logits of the perturbed distribution, same shape.

    Returns:
      Scalar mean KL over the batch, clipped at 0 with NaNs mapped to 0.
    """
    def exp_times_diff(a, b):
      return np.multiply(np.exp(a), a - b)
    logsumexp1 = logsumexp(logits1, axis=1)
    logsumexp2 = logsumexp(logits2, axis=1)
    logsumexp_diff = logsumexp2 - logsumexp1
    exp_diff = exp_times_diff(logits1, logits2)
    exp_diff = np.sum(exp_diff, axis=1)
    inv_exp_sum = np.sum(np.exp(logits1), axis=1)
    term1 = np.divide(exp_diff, inv_exp_sum)
    kl = term1 + logsumexp_diff
    # Numerical safety: exact KL is non-negative; overflow may produce NaN.
    kl = np.maximum(kl, 0.0)
    kl = np.nan_to_num(kl)
    return np.mean(kl)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contextual bandit algorithm based on Thompson Sampling and a Bayesian NN."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from bandits.core.bandit_algorithm import BanditAlgorithm
from bandits.algorithms.bb_alpha_divergence_model import BBAlphaDivergence
from bandits.algorithms.bf_variational_neural_bandit_model import BfVariationalNeuralBanditModel
from bandits.core.contextual_dataset import ContextualDataset
from bandits.algorithms.multitask_gp import MultitaskGP
from bandits.algorithms.neural_bandit_model import NeuralBanditModel
from bandits.algorithms.variational_neural_bandit_model import VariationalNeuralBanditModel
class PosteriorBNNSampling(BanditAlgorithm):
  """Posterior Sampling algorithm based on a Bayesian neural network."""

  def __init__(self, name, hparams, bnn_model='RMSProp'):
    """Creates a PosteriorBNNSampling object based on a specific optimizer.

    The algorithm has two basic tools: an Approx BNN and a Contextual Dataset.
    The Bayesian Network keeps the posterior based on the optimizer iterations.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
      bnn_model: Type of BNN. By default RMSProp (point estimate).
    """
    self.name = name
    self.hparams = hparams
    self.optimizer_n = hparams.optimizer
    self.training_freq = hparams.training_freq
    self.training_epochs = hparams.training_epochs
    self.t = 0
    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
                                    hparams.buffer_s)
    # Dispatch table of lazily-evaluated constructors, so only the requested
    # BNN type is instantiated. To be extended with more BNNs
    # (BB alpha-div, GPs, SGFS, constSGD...).
    bnn_name = '{}-bnn'.format(name)
    builders = {
        'Variational': lambda: VariationalNeuralBanditModel(hparams, bnn_name),
        'AlphaDiv': lambda: BBAlphaDivergence(hparams, bnn_name),
        'Variational_BF':
            lambda: BfVariationalNeuralBanditModel(hparams, bnn_name),
        'GP': lambda: MultitaskGP(hparams),
    }
    # Any unknown model type falls back to a point-estimate network.
    default_builder = lambda: NeuralBanditModel(
        self.optimizer_n, hparams, bnn_name)
    self.bnn = builders.get(bnn_model, default_builder)()

  def action(self, context):
    """Selects action for context based on Thompson Sampling using the BNN."""
    num_actions = self.hparams.num_actions
    if self.t < num_actions * self.hparams.initial_pulls:
      # Round robin until each action has been taken "initial_pulls" times.
      return self.t % num_actions
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      output = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
    return np.argmax(output)

  def update(self, context, action, reward):
    """Updates data buffer, and re-trains the BNN every training_freq steps."""
    self.t += 1
    self.data_h.add(context, action, reward)
    if self.t % self.training_freq == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.training_epochs)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contextual bandit algorithm that selects an action uniformly at random."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from bandits.core.bandit_algorithm import BanditAlgorithm
class UniformSampling(BanditAlgorithm):
  """Baseline policy that returns one action uniformly at random."""

  def __init__(self, name, hparams):
    """Stores the algorithm's name and hyper-parameters.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters, including the number of arms (num_actions).
    """
    self.name = name
    self.hparams = hparams

  def action(self, context):
    """Returns an arm index drawn uniformly at random; context is ignored."""
    arms = range(self.hparams.num_actions)
    return np.random.choice(arms)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bayesian NN using factorized VI (Bayes By Backprop. Blundell et al. 2014).
See https://arxiv.org/abs/1505.05424 for details.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from absl import flags
from bandits.core.bayesian_nn import BayesianNN
FLAGS = flags.FLAGS
def log_gaussian(x, mu, sigma, reduce_sum=True):
  """Returns log Gaussian pdf of x under N(mu, sigma^2), elementwise.

  Args:
    x: Tensor of observed values.
    mu: Tensor of means (broadcastable against x).
    sigma: Tensor of standard deviations (broadcastable against x).
    reduce_sum: If True, sums the elementwise log-densities into a scalar.

  Returns:
    A scalar (if reduce_sum) or a tensor of elementwise log-densities.
  """
  log_density = (-0.5 * np.log(2 * np.pi) - tf.log(sigma)
                 - tf.square(x - mu) / (2 * tf.square(sigma)))
  return tf.reduce_sum(log_density) if reduce_sum else log_density
def analytic_kl(mu_1, sigma_1, mu_2, sigma_2):
  """Closed-form KL(N1 || N2) for Gaussians with diagonal covariance.

  Args:
    mu_1, sigma_1: Mean and std of the first (posterior) Gaussian.
    mu_2, sigma_2: Mean and std of the second (prior) Gaussian.

  Returns:
    Scalar tensor: the KL divergence summed over all dimensions.
  """
  var_1 = tf.square(sigma_1)
  var_2 = tf.square(sigma_2)
  mean_term = tf.square(mu_1 - mu_2) / (2. * var_2)
  variance_term = (var_1 / var_2 - 1. - tf.log(var_1) + tf.log(var_2)) / 2.
  return tf.reduce_sum(mean_term + variance_term)
class VariationalNeuralBanditModel(BayesianNN):
  """Implements an approximate Bayesian NN using Variational Inference."""

  def __init__(self, hparams, name="BBBNN"):
    """Stores hyper-parameters, picks the sigma transform, and builds the graph.

    Args:
      hparams: Hyper-parameters, including context_dim, num_actions,
        layer_sizes, init_scale, use_sigma_exp_transform, and the training
        schedule settings (cleared_times_trained, initial_training_steps,
        training_epochs).
      name: Name prefix for the model and its TensorBoard summaries.
    """
    self.name = name
    self.hparams = hparams
    self.n_in = self.hparams.context_dim
    self.n_out = self.hparams.num_actions
    self.layers = self.hparams.layer_sizes
    self.init_scale = self.hparams.init_scale
    # Optional transform applied to the dataset size when feeding self.n;
    # presumably used to rescale the KL weight — TODO confirm with callers.
    self.f_num_points = None
    if "f_num_points" in hparams:
      self.f_num_points = self.hparams.f_num_points
    self.cleared_times_trained = self.hparams.cleared_times_trained
    self.initial_training_steps = self.hparams.initial_training_steps
    # Linearly increasing number of SGD steps for the first
    # `cleared_times_trained` calls to train() (see train()).
    self.training_schedule = np.linspace(self.initial_training_steps,
                                         self.hparams.training_epochs,
                                         self.cleared_times_trained)
    self.verbose = getattr(self.hparams, "verbose", True)
    # Variational parameters (means and stds) per layer, keyed by layer id.
    self.weights_m = {}
    self.weights_std = {}
    self.biases_m = {}
    self.biases_std = {}
    self.times_trained = 0
    # sigma_transform maps the unconstrained variable to a positive std;
    # inverse_sigma_transform maps a positive std back to the raw space.
    if self.hparams.use_sigma_exp_transform:
      self.sigma_transform = tf.exp
      self.inverse_sigma_transform = np.log
    else:
      self.sigma_transform = tf.nn.softplus
      # Inverse of softplus: log(exp(y) - 1), written in a numerically
      # stable form.
      self.inverse_sigma_transform = lambda y: y + np.log(1. - np.exp(-y))
    # Whether to use the local reparameterization trick to compute the loss.
    # See details in https://arxiv.org/abs/1506.02557
    self.use_local_reparameterization = True
    self.build_graph()

  def build_mu_variable(self, shape):
    """Returns a mean variable initialized as N(0, 0.05)."""
    return tf.Variable(tf.random_normal(shape, 0.0, 0.05))

  def build_sigma_variable(self, shape, init=-5.):
    """Returns a sigma variable initialized as N(init, 0.05)."""
    # Initialize sigma to be very small initially to encourage MAP opt first
    return tf.Variable(tf.random_normal(shape, init, 0.05))

  def build_layer(self, input_x, input_x_local, shape,
                  layer_id, activation_fn=tf.nn.relu):
    """Builds a variational layer, and computes KL term.

    Args:
      input_x: Input to the variational layer.
      input_x_local: Input when the local reparameterization trick was applied.
      shape: [number_inputs, number_outputs] for the layer.
      layer_id: Number of layer in the architecture.
      activation_fn: Activation function to apply.

    Returns:
      output_h: Output of the variational layer.
      output_h_local: Output when local reparameterization trick was applied.
      neg_kl: Negative KL term for the layer.
    """
    # Sample the weights with the standard reparameterization trick:
    # w = mu + sigma * eps, eps ~ N(0, I).
    w_mu = self.build_mu_variable(shape)
    w_sigma = self.sigma_transform(self.build_sigma_variable(shape))
    w_noise = tf.random_normal(shape)
    w = w_mu + w_sigma * w_noise
    b_mu = self.build_mu_variable([1, shape[1]])
    b_sigma = self.sigma_transform(self.build_sigma_variable([1, shape[1]]))
    # Note: the bias is not sampled; only its mean is used in the forward pass.
    b = b_mu
    # Store means and stds
    self.weights_m[layer_id] = w_mu
    self.weights_std[layer_id] = w_sigma
    self.biases_m[layer_id] = b_mu
    self.biases_std[layer_id] = b_sigma
    # Create outputs
    output_h = activation_fn(tf.matmul(input_x, w) + b)
    if self.use_local_reparameterization:
      # Use analytic KL divergence wrt the prior
      # (prior is N(0, 2/fan_in), a scaled He-style prior).
      neg_kl = -analytic_kl(w_mu, w_sigma,
                            0., tf.to_float(np.sqrt(2./shape[0])))
    else:
      # Create empirical KL loss terms
      log_p = log_gaussian(w, 0., tf.to_float(np.sqrt(2./shape[0])))
      log_q = log_gaussian(w, tf.stop_gradient(w_mu), tf.stop_gradient(w_sigma))
      neg_kl = log_p - log_q
    # Apply local reparameterization trick: sample activations pre nonlinearity
    m_h = tf.matmul(input_x_local, w_mu) + b
    v_h = tf.matmul(tf.square(input_x_local), tf.square(w_sigma))
    # 1e-6 guards the sqrt against a zero variance.
    output_h_local = m_h + tf.sqrt(v_h + 1e-6) * tf.random_normal(tf.shape(v_h))
    output_h_local = activation_fn(output_h_local)
    return output_h, output_h_local, neg_kl

  def build_action_noise(self):
    """Defines a model for additive noise per action, and its KL term."""
    # Define mean and std variables (log-normal dist) for each action.
    noise_sigma_mu = (self.build_mu_variable([1, self.n_out])
                      + self.inverse_sigma_transform(self.hparams.noise_sigma))
    noise_sigma_sigma = self.sigma_transform(
        self.build_sigma_variable([1, self.n_out]))
    # Sample the (pre-transform) noise level with the reparameterization trick.
    pre_noise_sigma = (noise_sigma_mu
                       + tf.random_normal([1, self.n_out]) * noise_sigma_sigma)
    self.noise_sigma = self.sigma_transform(pre_noise_sigma)
    # Compute KL for additive noise sigma terms.
    if getattr(self.hparams, "infer_noise_sigma", False):
      # Monte-Carlo estimate of -KL(q || prior) for the noise parameters.
      neg_kl_term = log_gaussian(
          pre_noise_sigma,
          self.inverse_sigma_transform(self.hparams.noise_sigma),
          self.hparams.prior_sigma
      )
      neg_kl_term -= log_gaussian(pre_noise_sigma,
                                  noise_sigma_mu,
                                  noise_sigma_sigma)
    else:
      # Fixed noise level: no variational parameters, hence no KL.
      neg_kl_term = 0.
    return neg_kl_term

  def build_model(self, activation_fn=tf.nn.relu):
    """Defines the actual NN model with fully connected layers.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss).
    Selects the optimizer and, finally, it also initializes the graph.

    Args:
      activation_fn: the activation function used in the nn layers.
    """
    if self.verbose:
      print("Initializing model {}.".format(self.name))
    neg_kl_term, l_number = 0, 0
    use_local_reparameterization = self.use_local_reparameterization
    # Compute model additive noise for each action with log-normal distribution
    neg_kl_term += self.build_action_noise()
    # Build network.
    input_x = self.x
    input_local = self.x
    n_in = self.n_in
    # Hidden layers (a size of 0 in layer_sizes skips that layer).
    for l_number, n_nodes in enumerate(self.layers):
      if n_nodes > 0:
        h, h_local, neg_kl = self.build_layer(input_x, input_local,
                                              [n_in, n_nodes], l_number)
        neg_kl_term += neg_kl
        input_x, input_local = h, h_local
        n_in = n_nodes
    # Create last linear layer
    h, h_local, neg_kl = self.build_layer(input_x, input_local,
                                          [n_in, self.n_out],
                                          l_number + 1,
                                          activation_fn=lambda x: x)
    neg_kl_term += neg_kl
    self.y_pred = h
    self.y_pred_local = h_local
    # Compute log likelihood (with learned or fixed noise level)
    if getattr(self.hparams, "infer_noise_sigma", False):
      log_likelihood = log_gaussian(
          self.y, self.y_pred_local, self.noise_sigma, reduce_sum=False)
    else:
      y_hat = self.y_pred_local if use_local_reparameterization else self.y_pred
      log_likelihood = log_gaussian(
          self.y, y_hat, self.hparams.noise_sigma, reduce_sum=False)
    # Only take into account observed outcomes (bandits setting)
    batch_size = tf.to_float(tf.shape(self.x)[0])
    weighted_log_likelihood = tf.reduce_sum(
        log_likelihood * self.weights) / batch_size
    # The objective is 1/n * (\sum_i log_like_i - KL); neg_kl_term estimates -KL
    elbo = weighted_log_likelihood + (neg_kl_term / self.n)
    self.loss = -elbo
    self.global_step = tf.train.get_or_create_global_step()
    self.train_op = tf.train.AdamOptimizer(self.hparams.initial_lr).minimize(
        self.loss, global_step=self.global_step)
    # Create tensorboard metrics
    self.create_summaries()
    self.summary_writer = tf.summary.FileWriter(
        "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)

  def build_graph(self):
    """Defines graph, session, placeholders, and model.

    Placeholders are: n (size of the dataset), x and y (context and observed
    reward for each action), and weights (one-hot encoding of selected action
    for each context, i.e., only possibly non-zero element in each y).
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
      self.sess = tf.Session()
      self.n = tf.placeholder(shape=[], dtype=tf.float32)
      self.x = tf.placeholder(shape=[None, self.n_in], dtype=tf.float32)
      self.y = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)
      self.weights = tf.placeholder(shape=[None, self.n_out], dtype=tf.float32)
      self.build_model()
      self.sess.run(tf.global_variables_initializer())

  def create_summaries(self):
    """Defines summaries including mean loss, and global step."""
    with self.graph.as_default():
      with tf.name_scope(self.name + "_summaries"):
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("global_step", self.global_step)
        self.summary_op = tf.summary.merge_all()

  def assign_lr(self):
    """Resets the learning rate in dynamic schedules for subsequent trainings.

    In bandits settings, we do expand our dataset over time. Then, we need to
    re-train the network with the new data. The algorithms that do not keep
    the step constant, can reset it at the start of each *training* process.
    """
    decay_steps = 1
    if self.hparams.activate_decay:
      current_gs = self.sess.run(self.global_step)
      # Decay is restarted from the current global step by offsetting it.
      with self.graph.as_default():
        self.lr = tf.train.inverse_time_decay(self.hparams.initial_lr,
                                              self.global_step - current_gs,
                                              decay_steps,
                                              self.hparams.lr_decay_rate)

  def train(self, data, num_steps):
    """Trains the BNN for num_steps, using the data in 'data'.

    Args:
      data: ContextualDataset object that provides the data.
      num_steps: Number of minibatches to train the network for.

    Returns:
      losses: Loss history during training.
    """
    # For the first `cleared_times_trained` calls, override num_steps with
    # the (linearly growing) training schedule computed in __init__.
    if self.times_trained < self.cleared_times_trained:
      num_steps = int(self.training_schedule[self.times_trained])
    self.times_trained += 1
    losses = []
    with self.graph.as_default():
      if self.verbose:
        print("Training {} for {} steps...".format(self.name, num_steps))
      for step in range(num_steps):
        x, y, weights = data.get_batch_with_weights(self.hparams.batch_size)
        _, summary, global_step, loss = self.sess.run(
            [self.train_op, self.summary_op, self.global_step, self.loss],
            feed_dict={
                self.x: x,
                self.y: y,
                self.weights: weights,
                self.n: data.num_points(self.f_num_points),
            })
        losses.append(loss)
        if step % self.hparams.freq_summary == 0:
          if self.hparams.show_training:
            print("{} | step: {}, loss: {}".format(
                self.name, global_step, loss))
          self.summary_writer.add_summary(summary, global_step)
    return losses
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Define the abstract class for contextual bandit algorithms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class BanditAlgorithm(object):
  """A bandit algorithm must be able to do two basic operations.

  1. Choose an action given a context.
  2. Update its internal model given a triple (context, played action, reward).
  """

  def action(self, context):
    """Returns an action for the given context; the base class does nothing."""

  def update(self, context, action, reward):
    """Updates the internal model; the base class does nothing."""
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Define the abstract class for Bayesian Neural Networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class BayesianNN(object):
  """A Bayesian neural network keeps a distribution over neural nets."""

  def __init__(self, optimizer):
    """No-op initializer; subclasses set up their own graph and optimizer."""

  def build_model(self):
    """Builds the network; the base class does nothing."""

  def train(self, data):
    """Trains on the given data; the base class does nothing."""

  def sample(self, steps):
    """Samples from the posterior; the base class does nothing."""
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Define a contextual bandit from which we can sample and compute rewards.
We can feed the data, sample a context, its reward for a specific action, and
also the optimal action for a given context.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def run_contextual_bandit(context_dim, num_actions, dataset, algos):
  """Run a contextual bandit problem on a set of algorithms.

  Args:
    context_dim: Dimension of the context.
    num_actions: Number of available actions.
    dataset: Matrix where every row is a context + num_actions rewards.
    algos: List of algorithms to use in the contextual bandit instance.

  Returns:
    h_actions: Matrix with actions: size (num_context, num_algorithms).
    h_rewards: Matrix with rewards: size (num_context, num_algorithms).
  """
  num_contexts = dataset.shape[0]
  # Create contextual bandit
  cmab = ContextualBandit(context_dim, num_actions)
  cmab.feed_data(dataset)
  # Accumulate per-step results in Python lists and stack once at the end;
  # calling np.vstack inside the loop is quadratic in num_contexts.
  h_actions = []
  h_rewards = []
  # Run the contextual bandit process
  for i in range(num_contexts):
    context = cmab.context(i)
    actions = [a.action(context) for a in algos]
    rewards = [cmab.reward(i, action) for action in actions]
    for j, a in enumerate(algos):
      a.update(context, actions[j], rewards[j])
    h_actions.append(actions)
    h_rewards.append(rewards)
  if not h_actions:
    # Preserve the (0, num_algorithms) float shape for an empty dataset.
    empty = np.empty((0, len(algos)), float)
    return empty, empty.copy()
  return np.array(h_actions, dtype=float), np.array(h_rewards, dtype=float)
class ContextualBandit(object):
  """Implements a Contextual Bandit with d-dimensional contexts and k arms."""

  def __init__(self, context_dim, num_actions):
    """Creates a contextual bandit object.

    Args:
      context_dim: Dimension of the contexts.
      num_actions: Number of arms for the multi-armed bandit.
    """
    self._context_dim = context_dim
    self._num_actions = num_actions

  def feed_data(self, data):
    """Feeds the data (contexts + rewards) to the bandit object.

    Args:
      data: Numpy array with shape [n, d+k], where n is the number of contexts,
        d is the dimension of each context, and k the number of arms (rewards).

    Raises:
      ValueError: when data dimensions do not correspond to the object values.
    """
    expected_cols = self.context_dim + self.num_actions
    if data.shape[1] != expected_cols:
      raise ValueError('Data dimensions do not match.')
    self._number_contexts = data.shape[0]
    self.data = data
    # Delivery order starts out sequential; reset() shuffles it.
    self.order = range(self.number_contexts)

  def reset(self):
    """Randomly shuffle the order of the contexts to deliver."""
    self.order = np.random.permutation(self.number_contexts)

  def context(self, number):
    """Returns the number-th context."""
    row = self.data[self.order[number]]
    return row[:self.context_dim]

  def reward(self, number, action):
    """Returns the reward for the number-th context and action."""
    row = self.data[self.order[number]]
    return row[self.context_dim + action]

  def optimal(self, number):
    """Returns the optimal action (in hindsight) for the number-th context."""
    row = self.data[self.order[number]]
    return np.argmax(row[self.context_dim:])

  @property
  def context_dim(self):
    return self._context_dim

  @property
  def num_actions(self):
    return self._num_actions

  @property
  def number_contexts(self):
    return self._number_contexts
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Define a data buffer for contextual bandit algorithms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class ContextualDataset(object):
  """The buffer is able to append new data, and sample random minibatches."""

  def __init__(self, context_dim, num_actions, buffer_s=-1, intercept=False):
    """Creates a ContextualDataset object.

    The data is stored in attributes: contexts and rewards.
    The sequence of taken actions are stored in attribute actions.

    Args:
      context_dim: Dimension of the contexts.
      num_actions: Number of arms for the multi-armed bandit.
      buffer_s: Size of buffer for training. Only last buffer_s will be
        returned as minibatch. If buffer_s = -1, all data will be used.
      intercept: If True, it adds a constant (1.0) dimension to each context X,
        at the end.
    """
    self._context_dim = context_dim
    self._num_actions = num_actions
    self._contexts = None
    self._rewards = None
    self.actions = []
    self.buffer_s = buffer_s
    self.intercept = intercept

  def add(self, context, action, reward):
    """Adds a new triplet (context, action, reward) to the dataset.

    The reward for the actions that weren't played is assumed to be zero.

    Args:
      context: A d-dimensional vector with the context.
      action: Integer between 0 and k-1 representing the chosen arm.
      reward: Real number representing the reward for the (context, action).
    """
    if self.intercept:
      row = np.append(np.array(context[:]), 1.0)
      row = row.reshape((1, self.context_dim + 1))
    else:
      row = np.array(context[:]).reshape((1, self.context_dim))
    self.contexts = (row if self.contexts is None
                     else np.vstack((self.contexts, row)))
    # One-hot reward row: only the played arm carries the observed reward.
    reward_row = np.zeros((1, self.num_actions))
    reward_row[0, action] = reward
    self.rewards = (reward_row if self.rewards is None
                    else np.vstack((self.rewards, reward_row)))
    self.actions.append(action)

  def replace_data(self, contexts=None, actions=None, rewards=None):
    """Overwrites any of the stored arrays that is passed as non-None."""
    if contexts is not None:
      self.contexts = contexts
    if actions is not None:
      self.actions = actions
    if rewards is not None:
      self.rewards = rewards

  def get_batch(self, batch_size):
    """Returns a random minibatch of (contexts, rewards) with batch_size."""
    n = self.contexts.shape[0]
    # buffer_s == -1 means "sample from the whole dataset".
    low = 0 if self.buffer_s == -1 else max(0, n - self.buffer_s)
    ind = np.random.choice(range(low, n), batch_size)
    return self.contexts[ind, :], self.rewards[ind, :]

  def get_data(self, action):
    """Returns all (context, reward) where the action was played."""
    n = self.contexts.shape[0]
    ind = np.array([i for i in range(n) if self.actions[i] == action])
    return self.contexts[ind, :], self.rewards[ind, action]

  def get_data_with_weights(self):
    """Returns all observations with one-hot weights for actions."""
    weights = np.zeros((self.contexts.shape[0], self.num_actions))
    played = np.array(self.actions)
    weights[np.arange(played.size), played] = 1.0
    return self.contexts, self.rewards, weights

  def get_batch_with_weights(self, batch_size):
    """Returns a random mini-batch with one-hot weights for actions."""
    n = self.contexts.shape[0]
    # buffer_s == -1 means "sample from the whole dataset".
    low = 0 if self.buffer_s == -1 else max(0, n - self.buffer_s)
    ind = np.random.choice(range(low, n), batch_size)
    weights = np.zeros((batch_size, self.num_actions))
    sampled_actions = np.array(self.actions)[ind]
    weights[np.arange(batch_size), sampled_actions] = 1.0
    return self.contexts[ind, :], self.rewards[ind, :], weights

  def num_points(self, f=None):
    """Returns number of points in the buffer (after applying function f)."""
    count = self.contexts.shape[0]
    return count if f is None else f(count)

  @property
  def context_dim(self):
    return self._context_dim

  @property
  def num_actions(self):
    return self._num_actions

  @property
  def contexts(self):
    return self._contexts

  @contexts.setter
  def contexts(self, value):
    self._contexts = value

  @property
  def actions(self):
    return self._actions

  @actions.setter
  def actions(self, value):
    self._actions = value

  @property
  def rewards(self):
    return self._rewards

  @rewards.setter
  def rewards(self, value):
    self._rewards = value
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to create bandit problems from datasets."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import tensorflow as tf
def one_hot(df, cols):
  """Returns one-hot encoding of DataFrame df including columns in cols."""
  for column in cols:
    dummies = pd.get_dummies(df[column], prefix=column, drop_first=False)
    # Replace the original column with its dummy columns (appended at the end).
    df = df.drop(column, axis=1)
    df = pd.concat([df, dummies], axis=1)
  return df
def sample_mushroom_data(file_name,
                         num_contexts,
                         r_noeat=0,
                         r_eat_safe=5,
                         r_eat_poison_bad=-35,
                         r_eat_poison_good=5,
                         prob_poison_bad=0.5):
  """Samples bandit game from Mushroom UCI Dataset.

  Args:
    file_name: Route of file containing the original Mushroom UCI dataset.
    num_contexts: Number of points to sample, i.e. (context, action rewards).
    r_noeat: Reward for not eating a mushroom.
    r_eat_safe: Reward for eating a non-poisonous mushroom.
    r_eat_poison_bad: Reward for eating a poisonous mushroom if harmed.
    r_eat_poison_good: Reward for eating a poisonous mushroom if not harmed.
    prob_poison_bad: Probability of being harmed by eating a poisonous mushroom.

  Returns:
    dataset: Sampled matrix with n rows: (context, eat_reward, no_eat_reward).
    opt_vals: Vector of expected optimal (reward, action) for each context.

  We assume r_eat_safe > r_noeat, and r_eat_poison_good > r_eat_poison_bad.
  """
  # first two cols of df encode whether mushroom is edible or poisonous
  df = pd.read_csv(file_name, header=None)
  df = one_hot(df, df.columns)
  ind = np.random.choice(range(df.shape[0]), num_contexts, replace=True)
  contexts = df.iloc[ind, 2:]
  no_eat_reward = r_noeat * np.ones((num_contexts, 1))
  random_poison = np.random.choice(
      [r_eat_poison_bad, r_eat_poison_good],
      p=[prob_poison_bad, 1 - prob_poison_bad],
      size=num_contexts)
  eat_reward = r_eat_safe * df.iloc[ind, 0]
  eat_reward += np.multiply(random_poison, df.iloc[ind, 1])
  eat_reward = eat_reward.values.reshape((num_contexts, 1))
  # compute optimal expected reward and optimal actions
  exp_eat_poison_reward = r_eat_poison_bad * prob_poison_bad
  exp_eat_poison_reward += r_eat_poison_good * (1 - prob_poison_bad)
  opt_exp_reward = r_eat_safe * df.iloc[ind, 0] + max(
      r_noeat, exp_eat_poison_reward) * df.iloc[ind, 1]
  if r_noeat > exp_eat_poison_reward:
    # actions: no eat = 0 ; eat = 1
    opt_actions = df.iloc[ind, 0].values  # indicator of edible
  else:
    # should always eat (higher expected reward)
    # BUG FIX: this branch previously built a numpy array and then called
    # `.values` on it below, which raised AttributeError. Return a plain
    # 1-D numpy array, consistent with the branch above.
    opt_actions = np.ones(num_contexts)
  opt_vals = (opt_exp_reward.values, opt_actions)
  return np.hstack((contexts, no_eat_reward, eat_reward)), opt_vals
def sample_stock_data(file_name, context_dim, num_actions, num_contexts,
                      sigma, shuffle_rows=True):
  """Samples linear bandit game from stock prices dataset.

  Args:
    file_name: Route of file containing the stock prices dataset.
    context_dim: Context dimension (i.e. vector with the price of each stock).
    num_actions: Number of actions (different linear portfolio strategies).
    num_contexts: Number of contexts to sample.
    sigma: Vector with additive noise levels for each action.
    shuffle_rows: If True, rows from original dataset are shuffled.

  Returns:
    dataset: Sampled matrix with rows: (context, reward_1, ..., reward_k).
    opt_vals: Vector of expected optimal (reward, action) for each context.
  """
  with tf.gfile.Open(file_name, 'r') as f:
    contexts = np.loadtxt(f, skiprows=1)
  if shuffle_rows:
    np.random.shuffle(contexts)
  contexts = contexts[:num_contexts, :]
  # Random linear reward model: one unit-norm coefficient column per action.
  betas = np.random.uniform(-1, 1, (context_dim, num_actions))
  betas /= np.linalg.norm(betas, axis=0)
  mean_rewards = contexts.dot(betas)
  rewards = mean_rewards + np.random.normal(scale=sigma,
                                            size=mean_rewards.shape)
  opt_actions = np.argmax(mean_rewards, axis=1)
  opt_rewards = np.array([mean_rewards[row, arm]
                          for row, arm in enumerate(opt_actions)])
  return np.hstack((contexts, rewards)), (opt_rewards, opt_actions)
def sample_jester_data(file_name, context_dim, num_actions, num_contexts,
                       shuffle_rows=True, shuffle_cols=False):
  """Samples bandit game from (user, joke) dense subset of Jester dataset.

  Args:
    file_name: Route of file containing the modified Jester dataset.
    context_dim: Context dimension (i.e. vector with some ratings from a user).
    num_actions: Number of actions (number of joke ratings to predict).
    num_contexts: Number of contexts to sample.
    shuffle_rows: If True, rows from original dataset are shuffled.
    shuffle_cols: Whether or not context/action jokes are randomly shuffled.

  Returns:
    dataset: Sampled matrix with rows: (context, rating_1, ..., rating_k).
    opt_vals: Vector of deterministic optimal (reward, action) for each context.
  """
  with tf.gfile.Open(file_name, 'rb') as f:
    dataset = np.load(f)
  if shuffle_cols:
    dataset = dataset[:, np.random.permutation(dataset.shape[1])]
  if shuffle_rows:
    np.random.shuffle(dataset)
  dataset = dataset[:num_contexts, :]
  assert context_dim + num_actions == dataset.shape[1], 'Wrong data dimensions.'
  # The last num_actions columns are the (deterministic) arm rewards.
  opt_actions = np.argmax(dataset[:, context_dim:], axis=1)
  opt_rewards = np.array([dataset[row, context_dim + arm]
                          for row, arm in enumerate(opt_actions)])
  return dataset, (opt_rewards, opt_actions)
def sample_statlog_data(file_name, num_contexts, shuffle_rows=True,
                        remove_underrepresented=False):
  """Returns bandit problem dataset based on the UCI statlog data.

  Args:
    file_name: Route of file containing the Statlog dataset.
    num_contexts: Number of contexts to sample.
    shuffle_rows: If True, rows from original dataset are shuffled.
    remove_underrepresented: If True, removes arms with very few rewards.

  Returns:
    dataset: Sampled matrix with rows: (context, action rewards).
    opt_vals: Vector of deterministic optimal (reward, action) for each context.

  https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)
  """
  with tf.gfile.Open(file_name, 'r') as f:
    data = np.loadtxt(f)
  num_actions = 7  # some of the actions are very rarely optimal.
  if shuffle_rows:
    np.random.shuffle(data)
  data = data[:num_contexts, :]
  # Last column is label, rest are features; labels become 0-based indices.
  contexts = data[:, :-1]
  labels = data[:, -1].astype(int) - 1
  if remove_underrepresented:
    contexts, labels = remove_underrepresented_classes(contexts, labels)
  return classification_to_bandit_problem(contexts, labels, num_actions)
def sample_adult_data(file_name, num_contexts, shuffle_rows=True,
                      remove_underrepresented=False):
  """Returns bandit problem dataset based on the UCI adult data.

  Args:
    file_name: Route of file containing the Adult dataset.
    num_contexts: Number of contexts to sample.
    shuffle_rows: If True, rows from original dataset are shuffled.
    remove_underrepresented: If True, removes arms with very few rewards.

  Returns:
    dataset: Sampled matrix with rows: (context, action rewards).
    opt_vals: Vector of deterministic optimal (reward, action) for each context.

  Preprocessing:
    * drop rows with missing values
    * convert categorical variables to 1 hot encoding

  https://archive.ics.uci.edu/ml/datasets/census+income
  """
  with tf.gfile.Open(file_name, 'r') as f:
    df = pd.read_csv(f, header=None,
                     na_values=[' ?']).dropna()
  num_actions = 14
  if shuffle_rows:
    df = df.sample(frac=1)
  df = df.iloc[:num_contexts, :]
  # `.as_matrix()` was removed from pandas; `.to_numpy()` is the equivalent.
  labels = df[6].astype('category').cat.codes.to_numpy()
  df = df.drop([6], axis=1)
  # Convert categorical variables to 1 hot encoding
  cols_to_transform = [1, 3, 5, 7, 8, 9, 13, 14]
  df = pd.get_dummies(df, columns=cols_to_transform)
  if remove_underrepresented:
    df, labels = remove_underrepresented_classes(df, labels)
  contexts = df.to_numpy()
  return classification_to_bandit_problem(contexts, labels, num_actions)
def sample_census_data(file_name, num_contexts, shuffle_rows=True,
                       remove_underrepresented=False):
  """Returns bandit problem dataset based on the UCI census data.

  Args:
    file_name: Route of file containing the Census dataset.
    num_contexts: Number of contexts to sample.
    shuffle_rows: If True, rows from original dataset are shuffled.
    remove_underrepresented: If True, removes arms with very few rewards.

  Returns:
    dataset: Sampled matrix with rows: (context, action rewards).
    opt_vals: Vector of deterministic optimal (reward, action) for each context.

  Preprocessing:
    * drop rows with missing labels
    * convert categorical variables to 1 hot encoding

  Note: this is the processed (not the 'raw') dataset. It contains a subset
  of the raw features and they've all been discretized.

  https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29
  """
  # Note: this dataset is quite large. It will be slow to load and preprocess.
  with tf.gfile.Open(file_name, 'r') as f:
    df = (pd.read_csv(f, header=0, na_values=['?'])
          .dropna())
  num_actions = 9
  if shuffle_rows:
    df = df.sample(frac=1)
  df = df.iloc[:num_contexts, :]
  # Assuming what the paper calls response variable is the label?
  # `.as_matrix()` was removed from pandas; `.to_numpy()` is the equivalent.
  labels = df['dOccup'].astype('category').cat.codes.to_numpy()
  # In addition to label, also drop the (unique?) key.
  df = df.drop(['dOccup', 'caseid'], axis=1)
  # All columns are categorical. Convert to 1 hot encoding.
  df = pd.get_dummies(df, columns=df.columns)
  if remove_underrepresented:
    df, labels = remove_underrepresented_classes(df, labels)
  contexts = df.to_numpy()
  return classification_to_bandit_problem(contexts, labels, num_actions)
def sample_covertype_data(file_name, num_contexts, shuffle_rows=True,
                          remove_underrepresented=False):
  """Returns bandit problem dataset based on the UCI Cover_Type data.

  Args:
    file_name: Route of file containing the Covertype dataset.
    num_contexts: Number of contexts to sample.
    shuffle_rows: If True, rows from original dataset are shuffled.
    remove_underrepresented: If True, removes arms with very few rewards.

  Returns:
    dataset: Sampled matrix with rows: (context, action rewards).
    opt_vals: Vector of deterministic optimal (reward, action) for each context.

  Preprocessing:
    * drop rows with missing labels
    * convert categorical variables to 1 hot encoding

  https://archive.ics.uci.edu/ml/datasets/Covertype
  """
  with tf.gfile.Open(file_name, 'r') as f:
    df = (pd.read_csv(f, header=0, na_values=['?'])
          .dropna())
  num_actions = 7
  if shuffle_rows:
    df = df.sample(frac=1)
  df = df.iloc[:num_contexts, :]
  # Assuming what the paper calls response variable is the label?
  # Last column is label.
  # `.as_matrix()` was removed from pandas; `.to_numpy()` is the equivalent.
  labels = df[df.columns[-1]].astype('category').cat.codes.to_numpy()
  df = df.drop([df.columns[-1]], axis=1)
  # All columns are either quantitative or already converted to 1 hot.
  if remove_underrepresented:
    df, labels = remove_underrepresented_classes(df, labels)
  contexts = df.to_numpy()
  return classification_to_bandit_problem(contexts, labels, num_actions)
def classification_to_bandit_problem(contexts, labels, num_actions=None):
  """Turns a classification dataset into a bandit problem.

  Features are mean/std normalized; each row's reward vector is the one-hot
  encoding of its class label (reward 1 on the correct arm, 0 elsewhere).
  """
  if num_actions is None:
    num_actions = np.max(labels) + 1
  n = contexts.shape[0]

  # Random subsampling of small datasets can leave constant features;
  # safe_std replaces zero stds so the division below is well defined.
  stds = safe_std(np.std(contexts, axis=0, keepdims=True)[0, :])
  means = np.mean(contexts, axis=0, keepdims=True)
  contexts = (contexts - means) / stds

  # Deterministic rewards: 1 for the true class arm, 0 otherwise.
  rewards = np.zeros((n, num_actions))
  rewards[np.arange(n), labels] = 1.0

  return contexts, rewards, (np.ones(n), labels)
def safe_std(values):
  """Returns the std values with exact zeros replaced by ones.

  Prevents divide-by-zero when normalizing constant features.
  """
  sanitized = [1.0 if v == 0.0 else v for v in values]
  return np.array(sanitized)
def remove_underrepresented_classes(features, labels, thresh=0.0005):
  """Drops datapoints whose class frequency is below a threshold fraction.

  NOTE(review): threshold doesn't seem to agree with
  https://arxiv.org/pdf/1706.04687.pdf -- e.g. for Covertype they report 4
  classes after filtering, we get 7?
  """
  n = labels.shape[0]
  classes, counts = np.unique(labels, return_counts=True)
  vals_and_ratios = dict(zip(classes, counts.astype('float') / n))
  print('Unique classes and their ratio of total: %s' % vals_and_ratios)

  # Keep only rows whose class meets the frequency threshold.
  mask = [vals_and_ratios[lab] >= thresh for lab in labels]
  return features[mask], labels[np.array(mask)]
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Several functions to sample contextual data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def sample_contextual_data(num_contexts, dim_context, num_actions, sigma):
  """Samples independent Gaussian data.

  There is nothing to learn here as the rewards do not depend on the context.

  Args:
    num_contexts: Number of contexts to sample.
    dim_context: Dimension of the contexts.
    num_actions: Number of arms for the multi-armed bandit.
    sigma: Standard deviation of the independent Gaussian samples.

  Returns:
    data: A [num_contexts, dim_context + num_actions] numpy array with the data.
  """
  shape = (num_contexts, dim_context + num_actions)
  return np.random.normal(scale=sigma, size=shape)
def sample_linear_data(num_contexts, dim_context, num_actions, sigma=0.0):
  """Samples data from linearly parameterized arms.

  The reward for context X and arm j is given by X^T beta_j, for some latent
  set of parameters {beta_j : j = 1, ..., k}. The beta's are sampled uniformly
  at random, the contexts are Gaussian, and sigma-noise is added to the rewards.

  Args:
    num_contexts: Number of contexts to sample.
    dim_context: Dimension of the contexts.
    num_actions: Number of arms for the multi-armed bandit.
    sigma: Standard deviation of the additive noise. Set to zero for no noise.

  Returns:
    data: A [n, d+k] numpy array with the data.
    betas: Latent parameters that determine expected reward for each arm.
    opt: (optimal_rewards, optimal_actions) for all contexts.
  """
  # Arm parameters, column-normalized to unit Euclidean norm.
  betas = np.random.uniform(-1, 1, (dim_context, num_actions))
  betas = betas / np.linalg.norm(betas, axis=0)

  contexts = np.random.normal(size=(num_contexts, dim_context))
  mean_rewards = contexts.dot(betas)

  # The optimal action is defined on the *noiseless* expected rewards.
  opt_actions = np.argmax(mean_rewards, axis=1)
  noisy_rewards = mean_rewards + np.random.normal(scale=sigma,
                                                  size=mean_rewards.shape)
  rows = np.arange(num_contexts)
  opt_rewards = noisy_rewards[rows, opt_actions]

  return np.hstack((contexts, noisy_rewards)), betas, (opt_rewards, opt_actions)
def sample_sparse_linear_data(num_contexts, dim_context, num_actions,
                              sparse_dim, sigma=0.0):
  """Samples data from sparse linearly parameterized arms.

  The reward for context X and arm j is given by X^T beta_j, for some latent
  set of parameters {beta_j : j = 1, ..., k}. The beta's are sampled uniformly
  at random, the contexts are Gaussian, and sigma-noise is added to the rewards.
  Only s components out of d are non-zero for each arm's beta.

  Args:
    num_contexts: Number of contexts to sample.
    dim_context: Dimension of the contexts.
    num_actions: Number of arms for the multi-armed bandit.
    sparse_dim: Dimension of the latent subspace (sparsity pattern dimension).
    sigma: Standard deviation of the additive noise. Set to zero for no noise.

  Returns:
    data: A [num_contexts, dim_context+num_actions] numpy array with the data.
    betas: Latent parameters that determine expected reward for each arm.
    opt: (optimal_rewards, optimal_actions) for all contexts.
  """
  # Sparsity mask: each arm keeps `sparse_dim` randomly chosen coordinates.
  mask = np.zeros((dim_context, num_actions))
  for action in range(num_actions):
    active = np.random.choice(range(dim_context), sparse_dim, replace=False)
    mask[active, action] = 1

  betas = np.random.uniform(-1, 1, (dim_context, num_actions)) * mask
  betas /= np.linalg.norm(betas, axis=0)

  contexts = np.random.normal(size=(num_contexts, dim_context))
  rewards = contexts.dot(betas)

  # Optimal actions are defined on the noiseless expected rewards.
  opt_actions = np.argmax(rewards, axis=1)
  rewards = rewards + np.random.normal(scale=sigma, size=rewards.shape)
  opt_rewards = rewards[np.arange(num_contexts), opt_actions]

  return np.hstack((contexts, rewards)), betas, (opt_rewards, opt_actions)
def sample_wheel_bandit_data(num_contexts, delta, mean_v, std_v,
                             mu_large, std_large):
  """Samples from Wheel bandit game (see https://arxiv.org/abs/1802.09127).

  Args:
    num_contexts: Number of points to sample, i.e. (context, action rewards).
    delta: Exploration parameter: high reward in one region if norm above delta.
    mean_v: Mean reward for each action if context norm is below delta.
    std_v: Gaussian reward std for each action if context norm is below delta.
    mu_large: Mean reward for optimal action if context norm is above delta.
    std_large: Reward std for optimal action if context norm is above delta.

  Returns:
    dataset: Sampled matrix with n rows: (context, action rewards).
    opt_vals: Vector of expected optimal (reward, action) for each context.
  """
  context_dim = 2
  num_actions = 5

  data = []
  rewards = []
  opt_actions = []
  opt_rewards = []

  # Sample uniform contexts in the unit ball via rejection sampling.
  # Batch size is clamped to >= 1: for num_contexts < 3 the original
  # int(num_contexts / 3) was 0, which made this loop run forever.
  batch_size = max(int(num_contexts / 3), 1)
  while len(data) < num_contexts:
    raw_data = np.random.uniform(-1, 1, (batch_size, context_dim))
    for i in range(raw_data.shape[0]):
      if np.linalg.norm(raw_data[i, :]) <= 1:
        data.append(raw_data[i, :])
  contexts = np.stack(data)[:num_contexts, :]

  # Sample rewards for every context.
  for i in range(num_contexts):
    r = [np.random.normal(mean_v[j], std_v[j]) for j in range(num_actions)]
    if np.linalg.norm(contexts[i, :]) >= delta:
      # Outside the inner disk: the quadrant determines which arm pays big.
      r_big = np.random.normal(mu_large, std_large)
      if contexts[i, 0] > 0:
        if contexts[i, 1] > 0:
          r[0] = r_big
          opt_actions.append(0)
        else:
          r[1] = r_big
          opt_actions.append(1)
      else:
        if contexts[i, 1] > 0:
          r[2] = r_big
          opt_actions.append(2)
        else:
          r[3] = r_big
          opt_actions.append(3)
    else:
      # Inside the inner disk: the arm with the highest mean is optimal.
      opt_actions.append(np.argmax(mean_v))
    opt_rewards.append(r[opt_actions[-1]])
    rewards.append(r)

  rewards = np.stack(rewards)
  opt_rewards = np.array(opt_rewards)
  opt_actions = np.array(opt_actions)

  return np.hstack((contexts, rewards)), (opt_rewards, opt_actions)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment