Commit 748eceae authored by Marianne Linhares Monteiro's avatar Marianne Linhares Monteiro Committed by GitHub
Browse files

Merge branch 'master' into cifar10_experiment

parents 40e906d2 ed65b632
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download MNIST, Omniglot datasets for Rebar."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import urllib
import gzip
import os
import config
import struct
import numpy as np
import cPickle as pickle
import datasets
# Dataset source URLs (placeholders; see the README for the real locations).
MNIST_URL = 'see README'
MNIST_BINARIZED_URL = 'see README'
OMNIGLOT_URL = 'see README'
# Filename of the raw MNIST training images (IDX3-ubyte format).
MNIST_FLOAT_TRAIN = 'train-images-idx3-ubyte'
def load_mnist_float(local_filename):
  """Load MNIST images from an IDX3-ubyte file as float32 scaled to [0, 1].

  Args:
    local_filename: path to an uncompressed MNIST image file
      (4-byte magic number, three big-endian int32 counts, raw ubyte pixels).

  Returns:
    A float32 numpy array of shape (nimages, rows*cols) with values in [0, 1].

  Raises:
    ValueError: if the file contains fewer pixels than the header declares
      (e.g. a truncated download).
  """
  with open(local_filename, 'rb') as f:
    f.seek(4)  # Skip the 4-byte magic number.
    nimages, rows, cols = struct.unpack('>iii', f.read(12))
    dim = rows * cols
    # Read exactly the advertised number of pixels so a truncated file is
    # detected instead of silently producing a bad reshape below.
    images = np.fromfile(f, dtype=np.dtype(np.ubyte), count=nimages * dim)
    if images.size != nimages * dim:
      raise ValueError('Expected %d pixels but found %d in %s'
                       % (nimages * dim, images.size, local_filename))
    return (images / 255.0).astype('float32').reshape((nimages, dim))
if __name__ == '__main__':
  # NOTE(review): this script uses Python 2 APIs (urllib.urlretrieve, list-
  # returning map, cPickle); it must run under Python 2 as written.
  if not os.path.exists(config.DATA_DIR):
    os.makedirs(config.DATA_DIR)

  # Get MNIST and convert to npy file
  local_filename = os.path.join(config.DATA_DIR, MNIST_FLOAT_TRAIN)
  if not os.path.exists(local_filename):
    # Download the gzipped IDX file, decompress it, then delete the archive.
    urllib.urlretrieve("%s/%s.gz" % (MNIST_URL, MNIST_FLOAT_TRAIN), local_filename+'.gz')
    with gzip.open(local_filename+'.gz', 'rb') as f:
      file_content = f.read()
    with open(local_filename, 'wb') as f:
      f.write(file_content)
    os.remove(local_filename+'.gz')
  # Drop the final 10000 images (presumably reserved as a validation
  # split -- confirm against the training code).
  mnist_float_train = load_mnist_float(local_filename)[:-10000]
  # save in a nice format
  np.save(os.path.join(config.DATA_DIR, config.MNIST_FLOAT), mnist_float_train)

  # Get binarized MNIST
  splits = ['train', 'valid', 'test']
  mnist_binarized = []
  for split in splits:
    filename = 'binarized_mnist_%s.amat' % split
    url = '%s/binarized_mnist_%s.amat' % (MNIST_BINARIZED_URL, split)
    local_filename = os.path.join(config.DATA_DIR, filename)
    if not os.path.exists(local_filename):
      urllib.urlretrieve(url, local_filename)
    # Each .amat line is a whitespace-separated row of 0/1 pixel values;
    # store (data, None) pairs per split.
    with open(local_filename, 'rb') as f:
      mnist_binarized.append((np.array([map(int, line.split()) for line in f.readlines()]).astype('float32'), None))
  # save in a nice format
  with open(os.path.join(config.DATA_DIR, config.MNIST_BINARIZED), 'w') as out:
    pickle.dump(mnist_binarized, out)

  # Get Omniglot
  local_filename = os.path.join(config.DATA_DIR, config.OMNIGLOT)
  if not os.path.exists(local_filename):
    urllib.urlretrieve(OMNIGLOT_URL,
                       local_filename)
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Logger for REBAR"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class Logger:
  """Stub logger for REBAR: accepts measurements and discards them.

  Swap in a real implementation to record (key, value) pairs.
  """

  def __init__(self):
    """No state to initialize."""

  def log(self, key, value):
    """Discard the (key, value) measurement."""

  def flush(self):
    """Nothing is buffered, so there is nothing to flush."""
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
import numpy as np
from scipy.misc import logsumexp
import tensorflow.contrib.slim as slim
from tensorflow.python.ops import init_ops
import utils as U
# Command-line flags (flag definitions live elsewhere in the project).
FLAGS = tf.flags.FLAGS

# Graph collections used to separate recognition-network (Q) variables
# from generative-network (P) variables.
Q_COLLECTION = "q_collection"
P_COLLECTION = "p_collection"
class SBN(object):  # REINFORCE
  """Sigmoid belief network trained with score-function (REINFORCE) gradients.

  Subclasses override _create_loss/_create_network to swap in other
  estimators (NVIL, MuProp, REBAR, ...).
  """

  def __init__(self,
               hparams,
               activation_func=tf.nn.sigmoid,
               mean_xs = None,
               eval_mode=False):
    self.eval_mode = eval_mode
    self.hparams = hparams
    self.mean_xs = mean_xs
    # Output bias of the generator: logit of the (clipped) mean pixel value.
    self.train_bias= -np.log(1./np.clip(mean_xs, 0.001, 0.999)-1.).astype(np.float32)
    self.activation_func = activation_func

    # Placeholders: samples drawn per input, and the input batch.
    self.n_samples = tf.placeholder('int32')
    self.x = tf.placeholder('float', [None, self.hparams.n_input])
    # Replicate each input n_samples times (multi-sample estimates).
    self._x = tf.tile(self.x, [self.n_samples, 1])
    self.batch_size = tf.shape(self._x)[0]

    # Per-layer uniform noise u (and tied noise v, filled lazily).
    self.uniform_samples = dict()
    self.uniform_samples_v = dict()
    self.prior = tf.Variable(tf.zeros([self.hparams.n_hidden],
                                      dtype=tf.float32),
                             name='p_prior',
                             collections=[tf.GraphKeys.GLOBAL_VARIABLES, P_COLLECTION])
    # Track whether each subnetwork has been built, to drive variable reuse.
    self.run_recognition_network = False
    self.run_generator_network = False

    # Initialize temperature (stored as a log so exp() keeps it positive).
    self.pre_temperature_variable = tf.Variable(
        np.log(self.hparams.temperature),
        trainable=False,
        dtype=tf.float32)
    self.temperature_variable = tf.exp(self.pre_temperature_variable)

    self.global_step = tf.Variable(0, trainable=False)
    self.baseline_loss = []
    self.ema = tf.train.ExponentialMovingAverage(decay=0.999)
    self.maintain_ema_ops = []
    self.optimizer_class = tf.train.AdamOptimizer(
        learning_rate=1*self.hparams.learning_rate,
        beta2=self.hparams.beta2)

    self._generate_randomness()
    self._create_network()
  def initialize(self, sess):
    """Store the session used by partial_fit/partial_grad/partial_eval."""
    self.sess = sess

  def _create_eta(self, shape=[], collection='CV'):
    """Create a learned scaling in (0, 2); sigmoid(0)*2 initializes it to 1."""
    return 2 * tf.sigmoid(tf.Variable(tf.zeros(shape), trainable=False,
                                      collections=[collection, tf.GraphKeys.GLOBAL_VARIABLES, Q_COLLECTION]))

  def _create_baseline(self, n_output=1, n_hidden=100,
                       is_zero_init=False,
                       collection='BASELINE'):
    """Build an input-dependent baseline MLP on the (centered) input.

    Variables are created trainable=False; they are trained separately via
    the baseline loss collected in self.baseline_loss.
    """
    # center input
    h = self._x
    if self.mean_xs is not None:
      h -= self.mean_xs

    if is_zero_init:
      initializer = init_ops.zeros_initializer()
    else:
      initializer = slim.variance_scaling_initializer()

    with slim.arg_scope([slim.fully_connected],
                        variables_collections=[collection, Q_COLLECTION],
                        trainable=False,
                        weights_initializer=initializer):
      h = slim.fully_connected(h, n_hidden, activation_fn=tf.nn.tanh)
      baseline = slim.fully_connected(h, n_output, activation_fn=None)

      if n_output == 1:
        baseline = tf.reshape(baseline, [-1])  # very important to reshape

    return baseline
  def _create_transformation(self, input, n_output, reuse, scope_prefix):
    """Create the deterministic transformation between stochastic layers.

    If self.hparam.nonlinear:
      2 x tanh layers
    Else:
      1 x linear layer

    Returns the layer's pre-activation logits.
    """
    if self.hparams.nonlinear:
      h = slim.fully_connected(input,
                               self.hparams.n_hidden,
                               reuse=reuse,
                               activation_fn=tf.nn.tanh,
                               scope='%s_nonlinear_1' % scope_prefix)
      h = slim.fully_connected(h,
                               self.hparams.n_hidden,
                               reuse=reuse,
                               activation_fn=tf.nn.tanh,
                               scope='%s_nonlinear_2' % scope_prefix)
      h = slim.fully_connected(h,
                               n_output,
                               reuse=reuse,
                               activation_fn=None,
                               scope='%s' % scope_prefix)
    else:
      h = slim.fully_connected(input,
                               n_output,
                               reuse=reuse,
                               activation_fn=None,
                               scope='%s' % scope_prefix)
    return h
  def _recognition_network(self, sampler=None, log_likelihood_func=None):
    """x values -> samples from Q and return log Q(h|x).

    Args:
      sampler: callable (logits, uniform_noise, layer) -> sample dict;
        defaults to the hard Bernoulli sampler.
      log_likelihood_func: callable (sample, logits) -> log-likelihood;
        defaults to binary log-likelihood.

    Returns:
      (logQ list of per-layer log-likelihoods, dict of per-layer samples).
    """
    samples = {}
    # Reuse variables after the first time this network has been built.
    reuse = None if not self.run_recognition_network else True

    # Set defaults
    if sampler is None:
      sampler = self._random_sample

    if log_likelihood_func is None:
      log_likelihood_func = lambda sample, log_params: (
          U.binary_log_likelihood(sample['activation'], log_params))

    logQ = []

    if self.hparams.task in ['sbn', 'omni']:
      # Initialize the edge case
      samples[-1] = {'activation': self._x}
      if self.mean_xs is not None:
        samples[-1]['activation'] -= self.mean_xs  # center the input
      samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[Q_COLLECTION]):
        for i in xrange(self.hparams.n_layer):
          # Set up the input to the layer
          input = 2.0*samples[i-1]['activation'] - 1.0

          # Create the conditional distribution (output is the logits)
          h = self._create_transformation(input,
                                          n_output=self.hparams.n_hidden,
                                          reuse=reuse,
                                          scope_prefix='q_%d' % i)

          samples[i] = sampler(h, self.uniform_samples[i], i)
          logQ.append(log_likelihood_func(samples[i], h))

      self.run_recognition_network = True
      return logQ, samples
    elif self.hparams.task == 'sp':
      # Initialize the edge case
      samples[-1] = {'activation': tf.split(self._x,
                                            num_or_size_splits=2,
                                            axis=1)[0]}  # top half of digit
      if self.mean_xs is not None:
        samples[-1]['activation'] -= np.split(self.mean_xs, 2, 0)[0]  # center the input
      samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[Q_COLLECTION]):
        for i in xrange(self.hparams.n_layer):
          # Set up the input to the layer
          input = 2.0*samples[i-1]['activation'] - 1.0

          # Create the conditional distribution (output is the logits)
          h = self._create_transformation(input,
                                          n_output=self.hparams.n_hidden,
                                          reuse=reuse,
                                          scope_prefix='q_%d' % i)

          samples[i] = sampler(h, self.uniform_samples[i], i)
          logQ.append(log_likelihood_func(samples[i], h))

      self.run_recognition_network = True
      return logQ, samples
  def _generator_network(self, samples, logQ, log_likelihood_func=None):
    '''Returns learning signal and function.

    This is the implementation for SBNs for the ELBO.

    Args:
      samples: dictionary of sampled latent variables
      logQ: list of log q(h_i) terms
      log_likelihood_func: function used to compute log probs for the latent
        variables

    Returns:
      learning_signal: the "reward" function
      function_term: part of the function that depends on the parameters
        and needs to have the gradient taken through
    '''
    reuse=None if not self.run_generator_network else True

    if self.hparams.task in ['sbn', 'omni']:
      if log_likelihood_func is None:
        log_likelihood_func = lambda sample, log_params: (
            U.binary_log_likelihood(sample['activation'], log_params))

      # Prior term for the top stochastic layer.
      logPPrior = log_likelihood_func(
          samples[self.hparams.n_layer-1],
          tf.expand_dims(self.prior, 0))

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[P_COLLECTION]):
        # Walk the generative chain top-down.
        for i in reversed(xrange(self.hparams.n_layer)):
          if i == 0:
            n_output = self.hparams.n_input
          else:
            n_output = self.hparams.n_hidden
          input = 2.0*samples[i]['activation']-1.0

          h = self._create_transformation(input,
                                          n_output,
                                          reuse=reuse,
                                          scope_prefix='p_%d' % i)

          if i == 0:
            # Assume output is binary
            logP = U.binary_log_likelihood(self._x, h + self.train_bias)
          else:
            logPPrior += log_likelihood_func(samples[i-1], h)

      self.run_generator_network = True
      # ELBO = logP + logPPrior - logQ; the second return keeps only the
      # terms that depend on generator parameters.
      return logP + logPPrior - tf.add_n(logQ), logP + logPPrior
    elif self.hparams.task == 'sp':
      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[P_COLLECTION]):
        n_output = int(self.hparams.n_input/2)
        i = self.hparams.n_layer - 1  # use the last layer
        input = 2.0*samples[i]['activation']-1.0

        h = self._create_transformation(input,
                                        n_output,
                                        reuse=reuse,
                                        scope_prefix='p_%d' % i)

        # Predict on the lower half of the image
        logP = U.binary_log_likelihood(tf.split(self._x,
                                                num_or_size_splits=2,
                                                axis=1)[1],
                                       h + np.split(self.train_bias, 2, 0)[1])

      self.run_generator_network = True
      return logP, logP
  def _create_loss(self):
    """Build the plain REINFORCE surrogate loss; returns the ELBO tensor."""
    # Hard loss
    logQHard, samples = self._recognition_network()
    reinforce_learning_signal, reinforce_model_grad = self._generator_network(samples, logQHard)
    logQHard = tf.add_n(logQHard)

    # REINFORCE
    # NOTE(review): `center` is not defined anywhere in this chunk --
    # presumably a module-level helper that centers the learning signal;
    # verify it exists in the full file.
    learning_signal = tf.stop_gradient(center(reinforce_learning_signal))
    self.optimizerLoss = -(learning_signal*logQHard +
                           reinforce_model_grad)
    self.lHat = map(tf.reduce_mean, [
        reinforce_learning_signal,
        U.rms(learning_signal),
    ])

    return reinforce_learning_signal

  def _reshape(self, t):
    """Reshape a flat [n_samples*batch] tensor to [batch, n_samples]."""
    return tf.transpose(tf.reshape(t,
                                   [self.n_samples, -1]))

  def compute_tensor_variance(self, t):
    """Compute the mean per component variance.

    Use a moving average to estimate the required moments.
    """
    t_sq = tf.reduce_mean(tf.square(t))
    self.maintain_ema_ops.append(self.ema.apply([t, t_sq]))

    # mean per component variance: E[t^2] - mean(E[t]^2)
    variance_estimator = (self.ema.average(t_sq) -
                          tf.reduce_mean(
                              tf.square(self.ema.average(t))))
    return variance_estimator
def _create_train_op(self, grads_and_vars, extra_grads_and_vars=[]):
'''
Args:
grads_and_vars: gradients to apply and compute running average variance
extra_grads_and_vars: gradients to apply (not used to compute average variance)
'''
# Variance summaries
first_moment = U.vectorize(grads_and_vars, skip_none=True)
second_moment = tf.square(first_moment)
self.maintain_ema_ops.append(self.ema.apply([first_moment, second_moment]))
# Add baseline losses
if len(self.baseline_loss) > 0:
mean_baseline_loss = tf.reduce_mean(tf.add_n(self.baseline_loss))
extra_grads_and_vars += self.optimizer_class.compute_gradients(
mean_baseline_loss,
var_list=tf.get_collection('BASELINE'))
# Ensure that all required tensors are computed before updates are executed
extra_optimizer = tf.train.AdamOptimizer(
learning_rate=10*self.hparams.learning_rate,
beta2=self.hparams.beta2)
with tf.control_dependencies(
[tf.group(*[g for g, _ in (grads_and_vars + extra_grads_and_vars) if g is not None])]):
# Filter out the P_COLLECTION variables if we're in eval mode
if self.eval_mode:
grads_and_vars = [(g, v) for g, v in grads_and_vars
if v not in tf.get_collection(P_COLLECTION)]
train_op = self.optimizer_class.apply_gradients(grads_and_vars,
global_step=self.global_step)
if len(extra_grads_and_vars) > 0:
extra_train_op = extra_optimizer.apply_gradients(extra_grads_and_vars)
else:
extra_train_op = tf.no_op()
self.optimizer = tf.group(train_op, extra_train_op, *self.maintain_ema_ops)
# per parameter variance
variance_estimator = (self.ema.average(second_moment) -
tf.square(self.ema.average(first_moment)))
self.grad_variance = tf.reduce_mean(variance_estimator)
  def _create_network(self):
    """Assemble the loss, the training op, and the IWAE evaluation bound."""
    logF = self._create_loss()
    self.optimizerLoss = tf.reduce_mean(self.optimizerLoss)

    # Setup optimizer
    grads_and_vars = self.optimizer_class.compute_gradients(self.optimizerLoss)
    self._create_train_op(grads_and_vars)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
  def partial_fit(self, X, n_samples=1):
    """Run one optimizer step on batch X.

    Returns (lHat values, gradient variance, global step, temperature).
    """
    # Some subclasses expose a list of variances; fall back to the scalar.
    if hasattr(self, 'grad_variances'):
      grad_variance_field_to_return = self.grad_variances
    else:
      grad_variance_field_to_return = self.grad_variance
    _, res, grad_variance, step, temperature = self.sess.run(
        (self.optimizer, self.lHat, grad_variance_field_to_return, self.global_step, self.temperature_variable),
        feed_dict={self.x: X, self.n_samples: n_samples})
    return res, grad_variance, step, temperature

  def partial_grad(self, X, n_samples=1):
    """Evaluate control-variate gradients without applying an update."""
    control_variate_grads, step = self.sess.run(
        (self.control_variate_grads, self.global_step),
        feed_dict={self.x: X, self.n_samples: n_samples})
    return control_variate_grads, step

  def partial_eval(self, X, n_samples=5):
    """Evaluate the model on X; large n_samples is chunked to avoid OOM."""
    if n_samples < 1000:
      res, iwae = self.sess.run(
          (self.lHat, self.iwae),
          feed_dict={self.x: X, self.n_samples: n_samples})
      res = [iwae] + res
    else:  # special case to handle OOM
      assert n_samples % 100 == 0, "When using large # of samples, it must be divisble by 100"
      res = []
      for i in xrange(int(n_samples/100)):
        logF, = self.sess.run(
            (self.logF,),
            feed_dict={self.x: X, self.n_samples: 100})
        res.append(logsumexp(logF, axis=1))
      # Combine the per-chunk log-sum-exps into a single IWAE estimate.
      res = [np.mean(logsumexp(res, axis=0) - np.log(n_samples))]
    return res
  # Random samplers
  def _mean_sample(self, log_alpha, _, layer):
    """Returns mean of random variables parameterized by log_alpha."""
    mu = tf.nn.sigmoid(log_alpha)
    return {
        'preactivation': mu,
        'activation': mu,
        'log_param': log_alpha,
    }

  def _generate_randomness(self):
    """Draw one uniform noise tensor per layer (gradients stopped)."""
    for i in xrange(self.hparams.n_layer):
      self.uniform_samples[i] = tf.stop_gradient(tf.random_uniform(
          [self.batch_size, self.hparams.n_hidden]))

  def _u_to_v(self, log_alpha, u, eps = 1e-8):
    """Convert u to tied randomness in v."""
    u_prime = tf.nn.sigmoid(-log_alpha)  # g(u') = 0

    # Noise conditioned on the hard sample being 1.
    v_1 = (u - u_prime) / tf.clip_by_value(1 - u_prime, eps, 1)
    v_1 = tf.clip_by_value(v_1, 0, 1)
    v_1 = tf.stop_gradient(v_1)
    v_1 = v_1*(1 - u_prime) + u_prime
    # Noise conditioned on the hard sample being 0.
    v_0 = u / tf.clip_by_value(u_prime, eps, 1)
    v_0 = tf.clip_by_value(v_0, 0, 1)
    v_0 = tf.stop_gradient(v_0)
    v_0 = v_0 * u_prime

    v = tf.where(u > u_prime, v_1, v_0)
    v = tf.check_numerics(v, 'v sampling is not numerically stable.')
    v = v + tf.stop_gradient(-v + u)  # v and u are the same up to numerical errors
    return v

  def _random_sample(self, log_alpha, u, layer):
    """Returns sampled random variables parameterized by log_alpha."""
    # Generate tied randomness for later
    if layer not in self.uniform_samples_v:
      self.uniform_samples_v[layer] = self._u_to_v(log_alpha, u)

    # Sample random variable underlying softmax/argmax
    x = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
    samples = tf.stop_gradient(tf.to_float(x > 0))

    return {
        'preactivation': x,
        'activation': samples,
        'log_param': log_alpha,
    }
  def _random_sample_soft(self, log_alpha, u, layer, temperature=None):
    """Returns sampled random variables parameterized by log_alpha.

    Relaxed (concrete/Gumbel-Softmax style) sample controlled by
    `temperature`; defaults to the hparams temperature.
    """
    if temperature is None:
      temperature = self.hparams.temperature

    # Sample random variable underlying softmax/argmax
    x = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
    x /= tf.expand_dims(temperature, -1)

    if self.hparams.muprop_relaxation:
      y = tf.nn.sigmoid(x + log_alpha * tf.expand_dims(temperature/(temperature + 1), -1))
    else:
      y = tf.nn.sigmoid(x)

    return {
        'preactivation': x,
        'activation': y,
        'log_param': log_alpha
    }

  def _random_sample_soft_v(self, log_alpha, _, layer, temperature=None):
    """Returns sampled random variables parameterized by log_alpha.

    Same as _random_sample_soft but reuses the tied noise v generated by
    the matching hard sample for this layer.
    """
    v = self.uniform_samples_v[layer]
    return self._random_sample_soft(log_alpha, v, layer, temperature)
def get_gumbel_gradient(self):
logQ, softSamples = self._recognition_network(sampler=self._random_sample_soft)
logQ = tf.add_n(logQ)
logPPrior, logP = self._generator_network(softSamples)
softELBO = logPPrior + logP - logQ
gumbel_gradient = (self.optimizer_class.
compute_gradients(softELBO))
debug = {
'softELBO': softELBO,
}
return gumbel_gradient, debug
  # samplers used for quadratic version
  def _random_sample_switch(self, log_alpha, u, layer, switch_layer, temperature=None):
    """Run partial discrete, then continuous path.

    Args:
      switch_layer: this layer and beyond will be continuous
    """
    if layer < switch_layer:
      return self._random_sample(log_alpha, u, layer)
    else:
      return self._random_sample_soft(log_alpha, u, layer, temperature)

  def _random_sample_switch_v(self, log_alpha, u, layer, switch_layer, temperature=None):
    """Run partial discrete, then continuous path.

    Uses the tied randomness v on the continuous portion.

    Args:
      switch_layer: this layer and beyond will be continuous
    """
    if layer < switch_layer:
      return self._random_sample(log_alpha, u, layer)
    else:
      return self._random_sample_soft_v(log_alpha, u, layer, temperature)
  # #####
  # Gradient computation
  # #####
  def get_nvil_gradient(self):
    """Compute the NVIL gradient.

    Returns (grads_and_vars, debug dict).
    """
    # Hard loss
    logQHard, samples = self._recognition_network()
    ELBO, reinforce_model_grad = self._generator_network(samples, logQHard)
    logQHard = tf.add_n(logQHard)

    # Add baselines (no variance normalization)
    learning_signal = tf.stop_gradient(ELBO) - self._create_baseline()

    # Set up losses
    self.baseline_loss.append(tf.square(learning_signal))
    optimizerLoss = -(tf.stop_gradient(learning_signal)*logQHard +
                      reinforce_model_grad)
    optimizerLoss = tf.reduce_mean(optimizerLoss)

    nvil_gradient = self.optimizer_class.compute_gradients(optimizerLoss)
    debug = {
        'ELBO': ELBO,
        'RMS of centered learning signal': U.rms(learning_signal),
    }
    return nvil_gradient, debug

  def get_simple_muprop_gradient(self):
    """ Computes the simple muprop gradient.

    This muprop control variate does not include the linear term.
    """
    # Hard loss
    logQHard, hardSamples = self._recognition_network()
    hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)

    # Soft loss: deterministic mean-field pass used as the control variate.
    logQ, muSamples = self._recognition_network(sampler=self._mean_sample)
    muELBO, _ = self._generator_network(muSamples, logQ)

    scaling_baseline = self._create_eta(collection='BASELINE')
    learning_signal = (hardELBO
                       - scaling_baseline * muELBO
                       - self._create_baseline())
    self.baseline_loss.append(tf.square(learning_signal))

    optimizerLoss = -(tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
                      + reinforce_model_grad)
    optimizerLoss = tf.reduce_mean(optimizerLoss)

    simple_muprop_gradient = (self.optimizer_class.
                              compute_gradients(optimizerLoss))
    debug = {
        'ELBO': hardELBO,
        'muELBO': muELBO,
        'RMS': U.rms(learning_signal),
    }
    return simple_muprop_gradient, debug
  def get_muprop_gradient(self):
    """Compute the full MuProp gradient (with first-order Taylor term).

    random sample function that actually returns mean
    new forward pass that returns logQ as a list
    can get x_i from samples
    """
    # Hard loss
    logQHard, hardSamples = self._recognition_network()
    hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)

    # Soft loss: deterministic mean-field pass.
    logQ, muSamples = self._recognition_network(sampler=self._mean_sample)
    muELBO, _ = self._generator_network(muSamples, logQ)

    # Compute gradients of the mean-field ELBO w.r.t. each layer's activation.
    muELBOGrads = tf.gradients(tf.reduce_sum(muELBO),
                               [ muSamples[i]['activation'] for
                                 i in xrange(self.hparams.n_layer) ])

    # Compute MuProp gradient estimates
    learning_signal = hardELBO
    optimizerLoss = 0.0
    learning_signals = []
    for i in xrange(self.hparams.n_layer):
      # First-order Taylor correction around the mean-field activations.
      dfDiff = tf.reduce_sum(
          muELBOGrads[i] * (hardSamples[i]['activation'] -
                            muSamples[i]['activation']),
          axis=1)
      dfMu = tf.reduce_sum(
          tf.stop_gradient(muELBOGrads[i]) *
          tf.nn.sigmoid(hardSamples[i]['log_param']),
          axis=1)

      scaling_baseline_0 = self._create_eta(collection='BASELINE')
      scaling_baseline_1 = self._create_eta(collection='BASELINE')
      learning_signals.append(learning_signal - scaling_baseline_0 * muELBO - scaling_baseline_1 * dfDiff - self._create_baseline())
      self.baseline_loss.append(tf.square(learning_signals[i]))
      optimizerLoss += (
          logQHard[i] * tf.stop_gradient(learning_signals[i]) +
          tf.stop_gradient(scaling_baseline_1) * dfMu)
    optimizerLoss += reinforce_model_grad
    optimizerLoss *= -1

    optimizerLoss = tf.reduce_mean(optimizerLoss)

    muprop_gradient = self.optimizer_class.compute_gradients(optimizerLoss)
    debug = {
        'ELBO': hardELBO,
        'muELBO': muELBO,
    }

    debug.update(dict([
        ('RMS learning signal layer %d' % i, U.rms(learning_signal))
        for (i, learning_signal) in enumerate(learning_signals)]))
    return muprop_gradient, debug
  # REBAR gradient helper functions
  def _create_gumbel_control_variate(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.

    Returns (control variate h, extra = (softELBO_v, reparam term)).
    '''
    if temperature is None:
      temperature = self.hparams.temperature

    logQ, softSamples = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft, temperature=temperature))
    softELBO, _ = self._generator_network(softSamples, logQ)
    logQ = tf.add_n(logQ)

    # Generate the softELBO_v (should be the same value but different grads)
    logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft_v, temperature=temperature))
    softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)
    logQ_v = tf.add_n(logQ_v)

    # Compute losses
    learning_signal = tf.stop_gradient(softELBO_v)

    # Control variate
    h = (tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
         - softELBO + softELBO_v)

    extra = (softELBO_v, -softELBO + softELBO_v)

    return h, extra

  def _create_gumbel_control_variate_quadratic(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.

    Quadratic variant: one control-variate term per layer, switching from
    the discrete to the relaxed path at that layer.
    '''
    if temperature is None:
      temperature = self.hparams.temperature

    h = 0
    extra = []
    for layer in xrange(self.hparams.n_layer):
      logQ, softSamples = self._recognition_network(sampler=functools.partial(
          self._random_sample_switch, switch_layer=layer, temperature=temperature))
      softELBO, _ = self._generator_network(softSamples, logQ)

      # Generate the softELBO_v (should be the same value but different grads)
      logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
          self._random_sample_switch_v, switch_layer=layer, temperature=temperature))
      softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)

      # Compute losses
      learning_signal = tf.stop_gradient(softELBO_v)

      # Control variate
      h += (tf.stop_gradient(learning_signal) * logQHard[layer]
            - softELBO + softELBO_v)

      extra.append((softELBO_v, -softELBO + softELBO_v))

    return h, extra
  def _create_hard_elbo(self):
    """Build the discrete ELBO and its NVIL-style surrogate gradient term."""
    logQHard, hardSamples = self._recognition_network()
    hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)
    reinforce_learning_signal = tf.stop_gradient(hardELBO)

    # Center learning signal
    baseline = self._create_baseline(collection='CV')
    reinforce_learning_signal = tf.stop_gradient(reinforce_learning_signal) - baseline

    nvil_gradient = (tf.stop_gradient(hardELBO) - baseline) * tf.add_n(logQHard) + reinforce_model_grad

    return hardELBO, nvil_gradient, logQHard

  def multiply_by_eta(self, h_grads, eta):
    """Scale all gradients by one shared eta (created lazily)."""
    # Modifies eta
    res = []
    eta_statistics = []
    for (g, v) in h_grads:
      if g is None:
        res.append((g, v))
      else:
        if 'network' not in eta:
          eta['network'] = self._create_eta()
        res.append((g*eta['network'], v))
        eta_statistics.append(eta['network'])
    return res, eta_statistics

  def multiply_by_eta_per_layer(self, h_grads, eta):
    """Scale each gradient by a per-variable eta (created lazily)."""
    # Modifies eta
    res = []
    eta_statistics = []
    for (g, v) in h_grads:
      if g is None:
        res.append((g, v))
      else:
        if v not in eta:
          eta[v] = self._create_eta()
        res.append((g*eta[v], v))
        eta_statistics.append(eta[v])
    return res, eta_statistics
def multiply_by_eta_per_unit(self, h_grads, eta):
# Modifies eta
res = []
eta_statistics = []
for (g, v) in h_grads:
if g is None:
res.append((g, v))
else:
if v not in eta:
g_shape = g.shape_as_list()
assert len(g_shape) <= 2, 'Gradient has too many dimensions'
if len(g_shape) == 1:
eta[v] = self._create_eta(g_shape)
else:
eta[v] = self._create_eta([1, g_shape[1]])
h_grads.append((g*eta[v], v))
eta_statistics.extend(tf.nn.moments(tf.squeeze(eta[v]), axes=[0]))
return res, eta_statistics
  def get_dynamic_rebar_gradient(self):
    """Get the dynamic rebar gradient (t, eta optimized)."""
    # One copy of the pre-temperature per sample so per-sample temperature
    # gradients can be taken below.
    tiled_pre_temperature = tf.tile([self.pre_temperature_variable],
                                    [self.batch_size])
    temperature = tf.exp(tiled_pre_temperature)

    hardELBO, nvil_gradient, logQHard = self._create_hard_elbo()
    if self.hparams.quadratic:
      gumbel_cv, extra = self._create_gumbel_control_variate_quadratic(logQHard, temperature=temperature)
    else:
      gumbel_cv, extra = self._create_gumbel_control_variate(logQHard, temperature=temperature)

    f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient))

    eta = {}
    h_grads, eta_statistics = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)),
        eta)

    model_grads = U.add_grads_and_vars(f_grads, h_grads)
    total_grads = model_grads

    # Construct the variance objective
    g = U.vectorize(model_grads, set_none_to_zero=True)
    self.maintain_ema_ops.append(self.ema.apply([g]))
    gbar = 0  # tf.stop_gradient(self.ema.average(g))
    variance_objective = tf.reduce_mean(tf.square(g - gbar))

    reinf_g_t = 0
    if self.hparams.quadratic:
      # Per-layer REINFORCE terms for the temperature gradient.
      for layer in xrange(self.hparams.n_layer):
        gumbel_learning_signal, _ = extra[layer]
        df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
        reinf_g_t_i, _ = self.multiply_by_eta_per_layer(
            self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * logQHard[layer])),
            eta)
        reinf_g_t += U.vectorize(reinf_g_t_i, set_none_to_zero=True)

      reparam = tf.add_n([reparam_i for _, reparam_i in extra])
    else:
      gumbel_learning_signal, reparam = extra
      df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
      reinf_g_t, _ = self.multiply_by_eta_per_layer(
          self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * tf.add_n(logQHard))),
          eta)
      reinf_g_t = U.vectorize(reinf_g_t, set_none_to_zero=True)

    reparam_g, _ = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(reparam)),
        eta)
    reparam_g = U.vectorize(reparam_g, set_none_to_zero=True)
    reparam_g_t = tf.gradients(tf.reduce_mean(2*tf.stop_gradient(g - gbar)*reparam_g), self.pre_temperature_variable)[0]

    # Gradient of the variance objective w.r.t. the pre-temperature.
    variance_objective_grad = tf.reduce_mean(2*(g - gbar)*reinf_g_t) + reparam_g_t

    debug = { 'ELBO': hardELBO,
              'etas': eta_statistics,
              'variance_objective': variance_objective,
            }
    return total_grads, debug, variance_objective, variance_objective_grad
  def get_rebar_gradient(self):
    """Get the rebar gradient.

    Returns (grads_and_vars, debug dict, variance objective tensor).
    """
    hardELBO, nvil_gradient, logQHard = self._create_hard_elbo()
    if self.hparams.quadratic:
      gumbel_cv, _ = self._create_gumbel_control_variate_quadratic(logQHard)
    else:
      gumbel_cv, _ = self._create_gumbel_control_variate(logQHard)

    f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient))

    eta = {}
    h_grads, eta_statistics = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)),
        eta)

    model_grads = U.add_grads_and_vars(f_grads, h_grads)
    total_grads = model_grads

    # Construct the variance objective
    variance_objective = tf.reduce_mean(tf.square(U.vectorize(model_grads, set_none_to_zero=True)))

    debug = { 'ELBO': hardELBO,
              'etas': eta_statistics,
              'variance_objective': variance_objective,
            }
    return total_grads, debug, variance_objective
###
# Create variants
###
class SBNSimpleMuProp(SBN):
  """SBN trained with the simple MuProp gradient estimator."""

  def _create_loss(self):
    """Return (ELBO tensor, precomputed grads_and_vars)."""
    simple_muprop_gradient, debug = self.get_simple_muprop_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
        debug['muELBO'],
    ])

    return debug['ELBO'], simple_muprop_gradient

  def _create_network(self):
    logF, loss_grads = self._create_loss()
    self._create_train_op(loss_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNMuProp(SBN):
  """SBN trained with the full MuProp gradient estimator."""

  def _create_loss(self):
    """Return (ELBO tensor, precomputed grads_and_vars)."""
    muprop_gradient, debug = self.get_muprop_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
        debug['muELBO'],
    ])

    return debug['ELBO'], muprop_gradient

  def _create_network(self):
    logF, loss_grads = self._create_loss()
    self._create_train_op(loss_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNNVIL(SBN):
  """SBN trained with the NVIL gradient estimator."""

  def _create_loss(self):
    """Return (ELBO tensor, precomputed grads_and_vars)."""
    nvil_gradient, debug = self.get_nvil_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
    ])

    return debug['ELBO'], nvil_gradient

  def _create_network(self):
    logF, loss_grads = self._create_loss()
    self._create_train_op(loss_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNRebar(SBN):
  """SBN trained with the REBAR estimator (fixed temperature)."""

  def _create_loss(self):
    """Return (ELBO tensor, grads_and_vars, variance objective)."""
    rebar_gradient, debug, variance_objective = self.get_rebar_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
    ])
    self.lHat.extend(map(tf.reduce_mean, debug['etas']))

    return debug['ELBO'], rebar_gradient, variance_objective

  def _create_network(self):
    logF, loss_grads, variance_objective = self._create_loss()

    # Create additional updates for control variates and temperature
    eta_grads = (self.optimizer_class.compute_gradients(variance_objective,
                                                        var_list=tf.get_collection('CV')))
    self._create_train_op(loss_grads, eta_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNDynamicRebar(SBN):
  """SBN trained with REBAR where the temperature is adapted online."""

  def _create_loss(self):
    (gradient_estimate, diagnostics, variance_objective,
     variance_objective_grad) = self.get_dynamic_rebar_gradient()
    tracked = [diagnostics['ELBO'], self.temperature_variable]
    self.lHat = [tf.reduce_mean(t) for t in tracked]
    # NOTE: etas are appended un-reduced here (unlike SBNRebar).
    self.lHat.extend(diagnostics['etas'])
    return (diagnostics['ELBO'], gradient_estimate, variance_objective,
            variance_objective_grad)

  def _create_network(self):
    (log_f, gradient_estimate, variance_objective,
     variance_objective_grad) = self._create_loss()
    # Control-variate updates plus a manually supplied gradient entry for
    # the pre-sigmoid temperature parameter.
    eta_grads = self.optimizer_class.compute_gradients(
        variance_objective, var_list=tf.get_collection('CV'))
    eta_grads = eta_grads + [(variance_objective_grad,
                              self.pre_temperature_variable)]
    self._create_train_op(gradient_estimate, eta_grads)
    # IWAE multi-sample lower bound, used for evaluation only.
    self.logF = self._reshape(log_f)
    log_n = tf.log(tf.to_float(self.n_samples))
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) - log_n)
class SBNTrackGradVariances(SBN):
  """Follow NVIL, compute gradient variances for NVIL, MuProp and REBAR."""

  def compute_gradient_moments(self, grads_and_vars):
    # Flatten the gradient into a single vector (None grads become zeros)
    # and maintain EMAs of its first and second moments.
    first_moment = U.vectorize(grads_and_vars, set_none_to_zero=True)
    second_moment = tf.square(first_moment)
    self.maintain_ema_ops.append(self.ema.apply([first_moment, second_moment]))
    return self.ema.average(first_moment), self.ema.average(second_moment)

  def _create_loss(self):
    # (name, gradient-builder) pairs; the order here determines the order of
    # entries in grad_variances reported during training.
    self.losses = [
        ('NVIL', self.get_nvil_gradient),
        ('SimpleMuProp', self.get_simple_muprop_gradient),
        ('MuProp', self.get_muprop_gradient),
    ]
    moments = []
    for k, v in self.losses:
      print(k)
      # Calling v() builds the estimator's ops in the graph; order matters.
      gradient, debug = v()
      if k == 'SimpleMuProp':
        # SimpleMuProp's gradient is the one actually used for training.
        ELBO = debug['ELBO']
        gradient_to_follow = gradient
      moments.append(self.compute_gradient_moments(
          gradient))
    # The REBAR variants return extra outputs, so they are handled outside
    # the loop above.
    self.losses.append(('DynamicREBAR', self.get_dynamic_rebar_gradient))
    dynamic_rebar_gradient, _, variance_objective, variance_objective_grad = self.get_dynamic_rebar_gradient()
    moments.append(self.compute_gradient_moments(dynamic_rebar_gradient))
    self.losses.append(('REBAR', self.get_rebar_gradient))
    rebar_gradient, _, variance_objective2 = self.get_rebar_gradient()
    moments.append(self.compute_gradient_moments(rebar_gradient))
    # Mean of the estimators' first moments; used as the reference point for
    # the per-estimator variance/deviation diagnostics below.
    mu = tf.reduce_mean(tf.stack([f for f, _ in moments]), axis=0)
    self.grad_variances = []
    deviations = []
    for f, s in moments:
      self.grad_variances.append(tf.reduce_mean(s - tf.square(mu)))
      deviations.append(tf.reduce_mean(tf.square(f - mu)))
    self.lHat = map(tf.reduce_mean, [
        ELBO,
        self.temperature_variable,
        variance_objective_grad,
        variance_objective_grad*variance_objective_grad,
    ])
    self.lHat.extend(deviations)
    self.lHat.append(tf.log(tf.reduce_mean(mu*mu)))
#    self.lHat.extend(map(tf.log, grad_variances))
    # Both REBAR variance objectives are minimized jointly.
    return ELBO, gradient_to_follow, variance_objective + variance_objective2, variance_objective_grad

  def _create_network(self):
    logF, loss_grads, variance_objective, variance_objective_grad = self._create_loss()
    # Control-variate updates plus a manual gradient for the temperature.
    eta_grads = (self.optimizer_class.compute_gradients(variance_objective,
                                                        var_list=tf.get_collection('CV'))
                 + [(variance_objective_grad, self.pre_temperature_variable)])
    self._create_train_op(loss_grads, eta_grads)
    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNGumbel(SBN):
  """SBN trained through a Gumbel-Softmax (Concrete) relaxation."""

  def _random_sample_soft(self, log_alpha, u, layer, temperature=None):
    """Returns sampled random variables parameterized by log_alpha."""
    temperature = (self.hparams.temperature if temperature is None
                   else temperature)
    # Logistic noise added to the logits, then annealed by the temperature.
    logits = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
    logits = logits / temperature
    if self.hparams.muprop_relaxation:
      logits = logits + temperature/(temperature + 1)*log_alpha
    return {
        'preactivation': logits,
        'activation': tf.nn.sigmoid(logits),
        'log_param': log_alpha
    }

  def _create_loss(self):
    # Discrete (hard) pass: the quantity that is reported and returned.
    hard_log_q, hard_samples = self._recognition_network()
    hard_elbo, _ = self._generator_network(hard_samples, hard_log_q)
    # Relaxed (soft) pass: the differentiable surrogate that is optimized.
    soft_log_q, soft_samples = self._recognition_network(
        sampler=self._random_sample_soft)
    soft_elbo, _ = self._generator_network(soft_samples, soft_log_q)
    self.optimizerLoss = -soft_elbo
    self.lHat = [tf.reduce_mean(t) for t in [hard_elbo, soft_elbo]]
    return hard_elbo
# Default hyperparameters; individual values can be overridden with a
# comma-separated 'name=value' string parsed by HParams.parse().
default_hparams = tf.contrib.training.HParams(model='SBNGumbel',  # class name looked up in this module
                                              n_hidden=200,
                                              n_input=784,  # 28*28 (MNIST-sized) -- TODO confirm
                                              n_layer=1,
                                              nonlinear=False,
                                              learning_rate=0.001,
                                              temperature=0.5,  # used by SBNGumbel's soft sampler
                                              n_samples=1,
                                              batch_size=24,
                                              trial=1,
                                              muprop_relaxation=True,  # adds MuProp term in the relaxation
                                              dynamic_b=False, # dynamic binarization
                                              quadratic=True,
                                              beta2=0.99999,
                                              task='sbn',
                                              )
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import random
import sys
import os
import numpy as np
import tensorflow as tf
import rebar
import datasets
import logger as L
# Shorthand for TensorFlow's file I/O module.
gfile = tf.gfile

# Command-line flags, read below through FLAGS.
tf.app.flags.DEFINE_string("working_dir", "/tmp/rebar",
                           """Directory where to save data, write logs, etc.""")
tf.app.flags.DEFINE_string('hparams', '',
                           '''Comma separated list of name=value pairs.''')
tf.app.flags.DEFINE_integer('eval_freq', 20,
                            '''How often to run the evaluation step.''')
FLAGS = tf.flags.FLAGS
def manual_scalar_summary(name, value):
  """Builds a tf.Summary proto holding a single scalar value."""
  scalar = tf.Summary.Value(tag=name, simple_value=value)
  return tf.Summary(value=[scalar])
def eval(sbn, eval_xs, n_samples=100, batch_size=5):
  """Evaluates the model on eval_xs in batches.

  NOTE(review): shadows the builtin `eval`; kept for compatibility with
  existing callers in this file.

  Args:
    sbn: model exposing partial_eval(batch, n_samples).
    eval_xs: array of examples, first axis is the example index.
    n_samples: number of samples passed through to partial_eval.
    batch_size: number of examples per partial_eval call.

  Returns:
    Element-wise mean of the per-batch partial_eval results.
  """
  total = eval_xs.shape[0]
  partial_results = []
  for start in range(0, total, batch_size):
    chunk = eval_xs[start:start + batch_size]
    partial_results.append(sbn.partial_eval(chunk, n_samples))
  return np.mean(partial_results, axis=0)
def train(sbn, train_xs, valid_xs, test_xs, training_steps, debug=False):
  """Trains `sbn`, logging metrics and periodically evaluating.

  Args:
    sbn: model exposing hparams, global_step, initialize(), partial_fit().
    train_xs: training examples, first axis is the example index.
    valid_xs: validation examples.
    test_xs: test examples.
    training_steps: stop once the global step exceeds this value.
    debug: if True, print per-batch progress and return after ~100 examples.

  Returns:
    List of per-epoch mean metric vectors (None if stopped via debug).
  """
  # Encode all hparams into a single string used as run directory/row key,
  # e.g. "batch_size_24.beta2_0.99999....".
  hparams = sorted(sbn.hparams.values().items())
  hparams = (map(str, x) for x in hparams)
  hparams = ('_'.join(x) for x in hparams)
  hparams_str = '.'.join(hparams)
  logger = L.Logger()
  # Create the experiment name from the hparams
  # ('~' separator marks nonlinear models, '-' linear ones).
  experiment_name = ([str(sbn.hparams.n_hidden) for i in xrange(sbn.hparams.n_layer)] +
                     [str(sbn.hparams.n_input)])
  if sbn.hparams.nonlinear:
    experiment_name = '~'.join(experiment_name)
  else:
    experiment_name = '-'.join(experiment_name)
  experiment_name = 'SBN_%s' % experiment_name
  rowkey = {'experiment': experiment_name,
            'model': hparams_str}
  # Create summary writer
  summ_dir = os.path.join(FLAGS.working_dir, hparams_str)
  summary_writer = tf.summary.FileWriter(
      summ_dir, flush_secs=15, max_queue=100)
  # Supervisor handles checkpointing/recovery; summaries are written manually.
  sv = tf.train.Supervisor(logdir=os.path.join(
      FLAGS.working_dir, hparams_str),
                           save_summaries_secs=0,
                           save_model_secs=1200,
                           summary_op=None,
                           recovery_wait_secs=30,
                           global_step=sbn.global_step)
  with sv.managed_session() as sess:
    # Dump hparams to file
    with gfile.Open(os.path.join(FLAGS.working_dir,
                                 hparams_str,
                                 'hparams.json'),
                    'w') as out:
      json.dump(sbn.hparams.values(), out)
    sbn.initialize(sess)
    batch_size = sbn.hparams.batch_size
    scores = []
    n = train_xs.shape[0]
    index = range(n)
    # Outer loop: one iteration per epoch (shuffled pass over train_xs).
    while not sv.should_stop():
      lHats = []
      grad_variances = []
      temperatures = []
      random.shuffle(index)
      i = 0
      while i < n:
        batch_index = index[i:min(i+batch_size, n)]
        batch_xs = train_xs[batch_index, :]
        if sbn.hparams.dynamic_b:
          # Dynamically binarize the batch data
          batch_xs = (np.random.rand(*batch_xs.shape) < batch_xs).astype(float)
        lHat, grad_variance, step, temperature = sbn.partial_fit(batch_xs,
                                                                 sbn.hparams.n_samples)
        if debug:
          print(i, lHat)
          if i > 100:
            return
        lHats.append(lHat)
        grad_variances.append(grad_variance)
        temperatures.append(temperature)
        i += batch_size
      # Mean per-estimator gradient variance over the epoch, in log space.
      grad_variances = np.log(np.mean(grad_variances, axis=0)).tolist()
      summary_strings = []
      # A list means the model tracks several estimators
      # (SBNTrackGradVariances); pair each value with its estimator name.
      if isinstance(grad_variances, list):
        grad_variances = dict(zip([k for (k, v) in sbn.losses], map(float, grad_variances)))
        rowkey['step'] = step
        logger.log(rowkey, {'step': step,
                            'train': np.mean(lHats, axis=0)[0],
                            'grad_variances': grad_variances,
                            'temperature': np.mean(temperatures), })
        grad_variances = '\n'.join(map(str, sorted(grad_variances.iteritems())))
      else:
        rowkey['step'] = step
        logger.log(rowkey, {'step': step,
                            'train': np.mean(lHats, axis=0)[0],
                            'grad_variance': grad_variances,
                            'temperature': np.mean(temperatures), })
        summary_strings.append(manual_scalar_summary("log grad variance", grad_variances))
      print('Step %d: %s\n%s' % (step, str(np.mean(lHats, axis=0)), str(grad_variances)))
      # Every few epochs compute test and validation scores
      epoch = int(step / (train_xs.shape[0] / sbn.hparams.batch_size))
      if epoch % FLAGS.eval_freq == 0:
        valid_res = eval(sbn, valid_xs)
        test_res= eval(sbn, test_xs)
        print('\nValid %d: %s' % (step, str(valid_res)))
        print('Test %d: %s\n' % (step, str(test_res)))
        logger.log(rowkey, {'step': step,
                            'valid': valid_res[0],
                            'test': test_res[0]})
        logger.flush()  # Flush infrequently
      # Create summaries
      summary_strings.extend([
          manual_scalar_summary("Train ELBO", np.mean(lHats, axis=0)[0]),
          manual_scalar_summary("Temperature", np.mean(temperatures)),
      ])
      for summ_str in summary_strings:
        summary_writer.add_summary(summ_str, global_step=step)
      summary_writer.flush()
      sys.stdout.flush()
      scores.append(np.mean(lHats, axis=0))
      if step > training_steps:
        break
  return scores
def main():
  """Parses hparams, loads data, builds the chosen SBN model, and trains it."""
  # Parse hyperparams
  hparams = rebar.default_hparams
  hparams.parse(FLAGS.hparams)
  print(hparams.values())
  train_xs, valid_xs, test_xs = datasets.load_data(hparams)
  mean_xs = np.mean(train_xs, axis=0)  # Compute mean centering on training
  training_steps = 2000000
  # hparams.model names one of the SBN classes defined in the rebar module.
  model = getattr(rebar, hparams.model)
  sbn = model(hparams, mean_xs=mean_xs)
  # NOTE(review): the returned scores are currently unused.
  scores = train(sbn, train_xs, valid_xs, test_xs,
                 training_steps=training_steps, debug=False)

if __name__ == '__main__':
  main()
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic data management and plotting utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import cPickle as pickle
import getpass
import numpy as np
import gc
import tensorflow as tf
#
# Python utilities
#
def exp_moving_average(x, alpha=0.9):
  """Bias-corrected exponential moving average of a sequence.

  Args:
    x: iterable of numbers.
    alpha: smoothing factor in (0, 1); larger is smoother.

  Returns:
    np.array with one running-average entry per input element.
  """
  averages = []
  running = 0
  correction = 1
  for value in x:
    running += (1 - alpha)*(value - running)
    # Divide by (1 - alpha^t) so early entries are not biased toward zero.
    correction *= alpha
    averages.append(running/(1 - correction))
  return np.array(averages)
def sanitize(s):
  """Replaces '.' with '_' so the string is safe to embed in names/keys."""
  return '_'.join(s.split('.'))
#
# Tensorflow utilities
#
def softplus(x):
  '''
  Numerically stable softplus, log(1 + exp(x)).

  Let m = max(0, x); then
    softplus(x) = log(e^0 + e^x) = m + log(e^(-m) + e^(x - m)),
  and the argument of the log is guaranteed to be between 1 and 2.
  '''
  pivot = tf.maximum(tf.zeros_like(x), x)
  stabilized = tf.exp(-pivot) + tf.exp(x - pivot)
  return pivot + tf.log(stabilized)
def safe_log_prob(x, eps=1e-8):
  """log(x) with x clipped to [eps, 1.0] to avoid log(0)."""
  clipped = tf.clip_by_value(x, eps, 1.0)
  return tf.log(clipped)
def rms(x):
  """Root mean square of the elements of x."""
  mean_square = tf.reduce_mean(tf.square(x))
  return tf.sqrt(mean_square)
def center(x):
  """Centers each element by the mean of all *other* elements (leave-one-out)."""
  n_minus_one = tf.to_float(tf.shape(x)[0] - 1)
  leave_one_out_mean = (tf.reduce_sum(x) - x)/n_minus_one
  return x - leave_one_out_mean
def vectorize(grads_and_vars, set_none_to_zero=False, skip_none=False):
  """Flattens a grads_and_vars list into a single 1-D tensor.

  Args:
    grads_and_vars: list of (gradient, variable) pairs.
    set_none_to_zero: replace None gradients with zeros shaped like the var.
    skip_none: drop entries whose gradient is None.

  Returns:
    1-D tensor of all gradient values concatenated.
  """
  if set_none_to_zero:
    flat = [tf.reshape(g, [-1]) if g is not None else
            tf.reshape(tf.zeros_like(v), [-1]) for g, v in grads_and_vars]
  elif skip_none:
    flat = [tf.reshape(g, [-1]) for g, v in grads_and_vars if g is not None]
  else:
    flat = [tf.reshape(g, [-1]) for g, v in grads_and_vars]
  return tf.concat(flat, 0)
def add_grads_and_vars(a, b):
  '''Add grads_and_vars from two calls to tf.compute_gradients.

  Entries are matched positionally; the variables must agree pairwise.
  A None gradient on either side is treated as absent (the other side wins).
  '''
  combined = []
  for (grad_a, var_a), (grad_b, var_b) in zip(a, b):
    assert var_a == var_b
    if grad_a is None:
      combined.append((grad_b, var_b))
    elif grad_b is None:
      combined.append((grad_a, var_a))
    else:
      combined.append((grad_a + grad_b, var_a))
  return combined
def binary_log_likelihood(y, log_y_hat):
  """Computes binary log likelihood.

  Args:
    y: observed data
    log_y_hat: parameters of the binary variables

  Returns:
    log_likelihood, summed over axis 1
  """
  # log sigmoid(z) = -softplus(-z); log(1 - sigmoid(z)) = -z - softplus(-z).
  neg_softplus = -softplus(-log_y_hat)
  per_dim = y*neg_softplus + (1 - y)*(-log_y_hat + neg_softplus)
  return tf.reduce_sum(per_dim, 1)
def cov(a, b):
  """Compute the sample covariance between two vectors."""
  mean_a = tf.reduce_mean(a)
  mean_b = tf.reduce_mean(b)
  count = tf.to_float(tf.shape(a)[0])
  # Unbiased estimate: divide by n - 1.
  return tf.reduce_sum((a - mean_a)*(b - mean_b))/(count - 1.0)
def corr(a, b):
  """Sample correlation coefficient between two vectors."""
  covariance = cov(a, b)
  inv_std_a = tf.rsqrt(cov(a, a))
  inv_std_b = tf.rsqrt(cov(b, b))
  return covariance*inv_std_a*inv_std_b
def logSumExp(t, axis=0, keep_dims = False):
  '''Computes log(sum(exp(t))) in a numerically stable way.

  Args:
    t: input tensor
    axis: which axis to sum over
    keep_dims: whether to keep the reduced dim or not

  Returns:
    tensor with result
  '''
  # Shift by the max so the exponentials cannot overflow.
  max_t = tf.reduce_max(t, [axis])
  shifted = tf.exp(t - tf.expand_dims(max_t, axis))
  res = max_t + tf.log(tf.reduce_sum(shifted, [axis]))
  return tf.expand_dims(res, axis) if keep_dims else res
if __name__ == '__main__':
  # Fix: `app` was undefined here (NameError when run as a script); this
  # module only imports tensorflow, so the intended entry point is tf.app.run.
  tf.app.run()
......@@ -58,7 +58,7 @@ def build_input(dataset, data_path, batch_size, mode):
record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
# Convert from string to [depth * height * width] to [depth, height, width].
depth_major = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]),
depth_major = tf.reshape(tf.slice(record, [label_offset + label_bytes], [image_bytes]),
[depth, image_size, image_size])
# Convert from [depth, height, width] to [height, width, depth].
image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
......
......@@ -2,8 +2,7 @@
# Contains files for loading, training and evaluating TF-Slim-based models.
package(default_visibility = [
":internal",
"//domain_adaptation:__subpackages__",
"//visibility:public",
])
licenses(["notice"]) # Apache 2.0
......
......@@ -256,6 +256,17 @@ and/or multiple CPUs, either synchrononously or asynchronously.
See [model_deploy](https://github.com/tensorflow/models/blob/master/slim/deployment/model_deploy.py)
for details.
### TensorBoard
To visualize the losses and other metrics during training, you can use
[TensorBoard](https://github.com/tensorflow/tensorboard)
by running the command below.
```shell
tensorboard --logdir=${TRAIN_DIR}
```
Once TensorBoard is running, navigate your web browser to http://localhost:6006.
# Fine-tuning a model from an existing checkpoint
<a id='Tuning'></a>
......@@ -392,8 +403,7 @@ bazel-bin/tensorflow/examples/label_image/label_image \
--graph=/tmp/frozen_inception_v3.pb \
--labels=/tmp/imagenet_slim_labels.txt \
--input_mean=0 \
--input_std=255 \
--logtostderr
--input_std=255
```
......
......@@ -67,7 +67,7 @@ def main(_):
download_and_convert_mnist.run(FLAGS.dataset_dir)
else:
raise ValueError(
'dataset_name [%s] was not recognized.' % FLAGS.dataset_dir)
'dataset_name [%s] was not recognized.' % FLAGS.dataset_name)
if __name__ == '__main__':
tf.app.run()
......
......@@ -48,8 +48,7 @@ bazel-bin/tensorflow/examples/label_image/label_image \
--graph=/tmp/frozen_inception_v3.pb \
--labels=/tmp/imagenet_slim_labels.txt \
--input_mean=0 \
--input_std=255 \
--logtostderr
--input_std=255
"""
......@@ -63,7 +62,6 @@ from tensorflow.python.platform import gfile
from datasets import dataset_factory
from nets import nets_factory
slim = tf.contrib.slim
tf.app.flags.DEFINE_string(
......@@ -74,8 +72,8 @@ tf.app.flags.DEFINE_boolean(
'Whether to save out a training-focused version of the model.')
tf.app.flags.DEFINE_integer(
'default_image_size', 224,
'The image size to use if the model does not define it.')
'image_size', None,
'The image size to use, otherwise use the model default_image_size.')
tf.app.flags.DEFINE_string('dataset_name', 'imagenet',
'The name of the dataset to use with the model.')
......@@ -100,16 +98,13 @@ def main(_):
raise ValueError('You must supply the path to save to with --output_file')
tf.logging.set_verbosity(tf.logging.INFO)
with tf.Graph().as_default() as graph:
dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'validation',
dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'train',
FLAGS.dataset_dir)
network_fn = nets_factory.get_network_fn(
FLAGS.model_name,
num_classes=(dataset.num_classes - FLAGS.labels_offset),
is_training=FLAGS.is_training)
if hasattr(network_fn, 'default_image_size'):
image_size = network_fn.default_image_size
else:
image_size = FLAGS.default_image_size
image_size = FLAGS.image_size or network_fn.default_image_size
placeholder = tf.placeholder(name='input', dtype=tf.float32,
shape=[1, image_size, image_size, 3])
network_fn(placeholder)
......
......@@ -25,7 +25,7 @@ import os
import tensorflow as tf
from tensorflow.python.platform import gfile
from google3.third_party.tensorflow_models.slim import export_inference_graph
import export_inference_graph
class ExportInferenceGraphTest(tf.test.TestCase):
......
......@@ -425,6 +425,7 @@ def inception_v3(inputs,
prediction_fn=slim.softmax,
spatial_squeeze=True,
reuse=None,
create_aux_logits=True,
scope='InceptionV3'):
"""Inception model from http://arxiv.org/abs/1512.00567.
......@@ -457,6 +458,7 @@ def inception_v3(inputs,
of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
create_aux_logits: Whether to create the auxiliary logits.
scope: Optional variable_scope.
Returns:
......@@ -481,30 +483,31 @@ def inception_v3(inputs,
depth_multiplier=depth_multiplier)
# Auxiliary Head logits
with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
stride=1, padding='SAME'):
aux_logits = end_points['Mixed_6e']
with tf.variable_scope('AuxLogits'):
aux_logits = slim.avg_pool2d(
aux_logits, [5, 5], stride=3, padding='VALID',
scope='AvgPool_1a_5x5')
aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
scope='Conv2d_1b_1x1')
# Shape of feature map before the final layer.
kernel_size = _reduced_kernel_size_for_small_input(
aux_logits, [5, 5])
aux_logits = slim.conv2d(
aux_logits, depth(768), kernel_size,
weights_initializer=trunc_normal(0.01),
padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size))
aux_logits = slim.conv2d(
aux_logits, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, weights_initializer=trunc_normal(0.001),
scope='Conv2d_2b_1x1')
if spatial_squeeze:
aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze')
end_points['AuxLogits'] = aux_logits
if create_aux_logits:
with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
stride=1, padding='SAME'):
aux_logits = end_points['Mixed_6e']
with tf.variable_scope('AuxLogits'):
aux_logits = slim.avg_pool2d(
aux_logits, [5, 5], stride=3, padding='VALID',
scope='AvgPool_1a_5x5')
aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
scope='Conv2d_1b_1x1')
# Shape of feature map before the final layer.
kernel_size = _reduced_kernel_size_for_small_input(
aux_logits, [5, 5])
aux_logits = slim.conv2d(
aux_logits, depth(768), kernel_size,
weights_initializer=trunc_normal(0.01),
padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size))
aux_logits = slim.conv2d(
aux_logits, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, weights_initializer=trunc_normal(0.001),
scope='Conv2d_2b_1x1')
if spatial_squeeze:
aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze')
end_points['AuxLogits'] = aux_logits
# Final pooling and prediction
with tf.variable_scope('Logits'):
......
......@@ -27,6 +27,8 @@ As described in https://arxiv.org/abs/1704.04861.
100% Mobilenet V1 (base) with input size 224x224:
See mobilenet_v1()
Layer params macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D: 864 10,838,016
......@@ -62,6 +64,8 @@ Total: 3,185,088 567,716,352
75% Mobilenet V1 (base) with input size 128x128:
See mobilenet_v1_075()
Layer params macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D: 648 2,654,208
......@@ -102,6 +106,7 @@ from __future__ import division
from __future__ import print_function
from collections import namedtuple
import functools
import tensorflow as tf
......@@ -335,6 +340,17 @@ def mobilenet_v1(inputs,
mobilenet_v1.default_image_size = 224
def wrapped_partial(func, *args, **kwargs):
partial_func = functools.partial(func, *args, **kwargs)
functools.update_wrapper(partial_func, func)
return partial_func
mobilenet_v1_075 = wrapped_partial(mobilenet_v1, depth_multiplier=0.75)
mobilenet_v1_050 = wrapped_partial(mobilenet_v1, depth_multiplier=0.50)
mobilenet_v1_025 = wrapped_partial(mobilenet_v1, depth_multiplier=0.25)
def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
"""Define kernel size which is automatically reduced for small input.
......
......@@ -54,6 +54,9 @@ networks_map = {'alexnet_v2': alexnet.alexnet_v2,
'resnet_v2_152': resnet_v2.resnet_v2_152,
'resnet_v2_200': resnet_v2.resnet_v2_200,
'mobilenet_v1': mobilenet_v1.mobilenet_v1,
'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_075,
'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_050,
'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_025,
}
arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope,
......@@ -78,6 +81,9 @@ arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope,
'resnet_v2_152': resnet_v2.resnet_arg_scope,
'resnet_v2_200': resnet_v2.resnet_arg_scope,
'mobilenet_v1': mobilenet_v1.mobilenet_v1_arg_scope,
'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_arg_scope,
'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_arg_scope,
'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_arg_scope,
}
......
......@@ -199,7 +199,9 @@ def stack_blocks_dense(net, blocks, output_stride=None,
def resnet_arg_scope(weight_decay=0.0001,
batch_norm_decay=0.997,
batch_norm_epsilon=1e-5,
batch_norm_scale=True):
batch_norm_scale=True,
activation_fn=tf.nn.relu,
use_batch_norm=True):
"""Defines the default ResNet arg scope.
TODO(gpapan): The batch-normalization related default values above are
......@@ -215,6 +217,8 @@ def resnet_arg_scope(weight_decay=0.0001,
normalizing activations by their variance in batch normalization.
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
activations in the batch normalization layer.
activation_fn: The activation function which is used in ResNet.
use_batch_norm: Whether or not to use batch normalization.
Returns:
An `arg_scope` to use for the resnet models.
......@@ -230,8 +234,8 @@ def resnet_arg_scope(weight_decay=0.0001,
[slim.conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=slim.variance_scaling_initializer(),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
activation_fn=activation_fn,
normalizer_fn=slim.batch_norm if use_batch_norm else None,
normalizer_params=batch_norm_params):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
# The following implies padding='SAME' for pool1, which makes feature
......
......@@ -66,8 +66,14 @@ slim = tf.contrib.slim
@slim.add_arg_scope
def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
outputs_collections=None, scope=None):
def bottleneck(inputs,
depth,
depth_bottleneck,
stride,
rate=1,
outputs_collections=None,
scope=None,
use_bounded_activations=False):
"""Bottleneck residual unit variant with BN after convolutions.
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
......@@ -86,6 +92,8 @@ def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
rate: An integer, rate for atrous convolution.
outputs_collections: Collection to add the ResNet unit output.
scope: Optional variable_scope.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
Returns:
The ResNet unit's output.
......@@ -95,8 +103,12 @@ def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
if depth == depth_in:
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
else:
shortcut = slim.conv2d(inputs, depth, [1, 1], stride=stride,
activation_fn=None, scope='shortcut')
shortcut = slim.conv2d(
inputs,
depth, [1, 1],
stride=stride,
activation_fn=tf.nn.relu6 if use_bounded_activations else None,
scope='shortcut')
residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
scope='conv1')
......@@ -105,7 +117,12 @@ def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
residual = slim.conv2d(residual, depth, [1, 1], stride=1,
activation_fn=None, scope='conv3')
output = tf.nn.relu(shortcut + residual)
if use_bounded_activations:
# Use clip_by_value to simulate bandpass activation.
residual = tf.clip_by_value(residual, -6.0, 6.0)
output = tf.nn.relu6(shortcut + residual)
else:
output = tf.nn.relu(shortcut + residual)
return slim.utils.collect_named_outputs(outputs_collections,
sc.original_name_scope,
......@@ -119,7 +136,7 @@ def resnet_v1(inputs,
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope=None):
"""Generator for v1 ResNet models.
......
......@@ -251,6 +251,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_small'):
"""A shallow and thin ResNet v1 for faster tests."""
......@@ -266,6 +267,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=global_pool,
output_stride=output_stride,
include_root_block=include_root_block,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)
......@@ -276,6 +278,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
......@@ -307,6 +310,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -325,6 +329,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
include_root_block=False,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 64, 64, 4],
......@@ -345,6 +350,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
num_classes,
global_pool=global_pool,
output_stride=output_stride,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -391,6 +397,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, _ = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(),
......
......@@ -115,7 +115,7 @@ def resnet_v2(inputs,
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope=None):
"""Generator for v2 (preactivation) ResNet models.
......@@ -251,7 +251,7 @@ def resnet_v2_50(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_50'):
"""ResNet-50 model of [1]. See resnet_v2() for arg and return description."""
......@@ -273,7 +273,7 @@ def resnet_v2_101(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_101'):
"""ResNet-101 model of [1]. See resnet_v2() for arg and return description."""
......@@ -295,7 +295,7 @@ def resnet_v2_152(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_152'):
"""ResNet-152 model of [1]. See resnet_v2() for arg and return description."""
......@@ -317,7 +317,7 @@ def resnet_v2_200(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_200'):
"""ResNet-200 model of [2]. See resnet_v2() for arg and return description."""
......
......@@ -251,6 +251,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_small'):
"""A shallow and thin ResNet v2 for faster tests."""
......@@ -266,6 +267,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=global_pool,
output_stride=output_stride,
include_root_block=include_root_block,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)
......@@ -276,6 +278,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
......@@ -307,6 +310,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -325,6 +329,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
include_root_block=False,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 64, 64, 4],
......@@ -345,6 +350,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
num_classes,
global_pool=global_pool,
output_stride=output_stride,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -393,6 +399,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, _ = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(),
......
......@@ -87,8 +87,9 @@ def vgg_a(inputs,
fc_conv_padding: the type of padding to use for the fully connected layer
that is implemented as a convolutional layer. Use 'SAME' padding if you
are applying the network in a fully convolutional manner and want to
get a prediction map downsampled by a factor of 32 as an output. Otherwise,
the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
get a prediction map downsampled by a factor of 32 as an output.
Otherwise, the output prediction map will be (input / 32) - 6 in case of
'VALID' padding.
Returns:
the last op containing the log predictions and end_points dict.
......@@ -152,8 +153,9 @@ def vgg_16(inputs,
fc_conv_padding: the type of padding to use for the fully connected layer
that is implemented as a convolutional layer. Use 'SAME' padding if you
are applying the network in a fully convolutional manner and want to
get a prediction map downsampled by a factor of 32 as an output. Otherwise,
the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
get a prediction map downsampled by a factor of 32 as an output.
Otherwise, the output prediction map will be (input / 32) - 6 in case of
'VALID' padding.
Returns:
the last op containing the log predictions and end_points dict.
......@@ -217,8 +219,10 @@ def vgg_19(inputs,
fc_conv_padding: the type of padding to use for the fully connected layer
that is implemented as a convolutional layer. Use 'SAME' padding if you
are applying the network in a fully convolutional manner and want to
get a prediction map downsampled by a factor of 32 as an output. Otherwise,
the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
get a prediction map downsampled by a factor of 32 as an output.
Otherwise, the output prediction map will be (input / 32) - 6 in case of
'VALID' padding.
Returns:
the last op containing the log predictions and end_points dict.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment