Commit 748eceae authored by Marianne Linhares Monteiro's avatar Marianne Linhares Monteiro Committed by GitHub
Browse files

Merge branch 'master' into cifar10_experiment

parents 40e906d2 ed65b632
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download MNIST, Omniglot datasets for Rebar."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import urllib
import gzip
import os
import config
import struct
import numpy as np
import cPickle as pickle
import datasets
# Dataset source URLs (placeholders; see the README for the real locations).
MNIST_URL = 'see README'
MNIST_BINARIZED_URL = 'see README'
OMNIGLOT_URL = 'see README'
# Filename of the raw MNIST training images (IDX3-ubyte format).
MNIST_FLOAT_TRAIN = 'train-images-idx3-ubyte'
def load_mnist_float(local_filename):
  """Load MNIST images from an IDX3-ubyte file as float32 scaled to [0, 1].

  Args:
    local_filename: path to an uncompressed MNIST image file
      (4-byte magic number, three big-endian int32 counts, raw ubyte pixels).

  Returns:
    A float32 numpy array of shape (nimages, rows*cols) with values in [0, 1].

  Raises:
    ValueError: if the file contains fewer pixels than the header declares
      (e.g. a truncated download).
  """
  with open(local_filename, 'rb') as f:
    f.seek(4)  # Skip the 4-byte magic number.
    nimages, rows, cols = struct.unpack('>iii', f.read(12))
    dim = rows * cols
    # Read exactly the advertised number of pixels so a truncated file is
    # detected instead of silently producing a bad reshape below.
    images = np.fromfile(f, dtype=np.dtype(np.ubyte), count=nimages * dim)
    if images.size != nimages * dim:
      raise ValueError('Expected %d pixels but found %d in %s'
                       % (nimages * dim, images.size, local_filename))
    return (images / 255.0).astype('float32').reshape((nimages, dim))
if __name__ == '__main__':
  # NOTE(review): this script uses Python 2 APIs (urllib.urlretrieve, list-
  # returning map, cPickle); it must run under Python 2 as written.
  if not os.path.exists(config.DATA_DIR):
    os.makedirs(config.DATA_DIR)

  # Get MNIST and convert to npy file
  local_filename = os.path.join(config.DATA_DIR, MNIST_FLOAT_TRAIN)
  if not os.path.exists(local_filename):
    # Download the gzipped IDX file, decompress it, then delete the archive.
    urllib.urlretrieve("%s/%s.gz" % (MNIST_URL, MNIST_FLOAT_TRAIN), local_filename+'.gz')
    with gzip.open(local_filename+'.gz', 'rb') as f:
      file_content = f.read()
    with open(local_filename, 'wb') as f:
      f.write(file_content)
    os.remove(local_filename+'.gz')
  # Drop the final 10000 images (presumably reserved as a validation
  # split -- confirm against the training code).
  mnist_float_train = load_mnist_float(local_filename)[:-10000]
  # save in a nice format
  np.save(os.path.join(config.DATA_DIR, config.MNIST_FLOAT), mnist_float_train)

  # Get binarized MNIST
  splits = ['train', 'valid', 'test']
  mnist_binarized = []
  for split in splits:
    filename = 'binarized_mnist_%s.amat' % split
    url = '%s/binarized_mnist_%s.amat' % (MNIST_BINARIZED_URL, split)
    local_filename = os.path.join(config.DATA_DIR, filename)
    if not os.path.exists(local_filename):
      urllib.urlretrieve(url, local_filename)
    # Each .amat line is a whitespace-separated row of 0/1 pixel values;
    # store (data, None) pairs per split.
    with open(local_filename, 'rb') as f:
      mnist_binarized.append((np.array([map(int, line.split()) for line in f.readlines()]).astype('float32'), None))
  # save in a nice format
  with open(os.path.join(config.DATA_DIR, config.MNIST_BINARIZED), 'w') as out:
    pickle.dump(mnist_binarized, out)

  # Get Omniglot
  local_filename = os.path.join(config.DATA_DIR, config.OMNIGLOT)
  if not os.path.exists(local_filename):
    urllib.urlretrieve(OMNIGLOT_URL,
                       local_filename)
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Logger for REBAR"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class Logger:
  """Stub logger for REBAR: accepts measurements and discards them.

  Swap in a real implementation to record (key, value) pairs.
  """

  def __init__(self):
    """No state to initialize."""

  def log(self, key, value):
    """Discard the (key, value) measurement."""

  def flush(self):
    """Nothing is buffered, so there is nothing to flush."""
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
import numpy as np
from scipy.misc import logsumexp
import tensorflow.contrib.slim as slim
from tensorflow.python.ops import init_ops
import utils as U
# Command-line flags (flag definitions live elsewhere in the project).
FLAGS = tf.flags.FLAGS

# Graph collections used to separate recognition-network (Q) variables
# from generative-network (P) variables.
Q_COLLECTION = "q_collection"
P_COLLECTION = "p_collection"
class SBN(object):  # REINFORCE
  """Sigmoid belief network trained with score-function (REINFORCE) gradients.

  Subclasses override _create_loss/_create_network to swap in other
  estimators (NVIL, MuProp, REBAR, ...).
  """

  def __init__(self,
               hparams,
               activation_func=tf.nn.sigmoid,
               mean_xs = None,
               eval_mode=False):
    self.eval_mode = eval_mode
    self.hparams = hparams
    self.mean_xs = mean_xs
    # Output bias of the generator: logit of the (clipped) mean pixel value.
    self.train_bias= -np.log(1./np.clip(mean_xs, 0.001, 0.999)-1.).astype(np.float32)
    self.activation_func = activation_func

    # Placeholders: samples drawn per input, and the input batch.
    self.n_samples = tf.placeholder('int32')
    self.x = tf.placeholder('float', [None, self.hparams.n_input])
    # Replicate each input n_samples times (multi-sample estimates).
    self._x = tf.tile(self.x, [self.n_samples, 1])
    self.batch_size = tf.shape(self._x)[0]

    # Per-layer uniform noise u (and tied noise v, filled lazily).
    self.uniform_samples = dict()
    self.uniform_samples_v = dict()
    self.prior = tf.Variable(tf.zeros([self.hparams.n_hidden],
                                      dtype=tf.float32),
                             name='p_prior',
                             collections=[tf.GraphKeys.GLOBAL_VARIABLES, P_COLLECTION])
    # Track whether each subnetwork has been built, to drive variable reuse.
    self.run_recognition_network = False
    self.run_generator_network = False

    # Initialize temperature (stored as a log so exp() keeps it positive).
    self.pre_temperature_variable = tf.Variable(
        np.log(self.hparams.temperature),
        trainable=False,
        dtype=tf.float32)
    self.temperature_variable = tf.exp(self.pre_temperature_variable)

    self.global_step = tf.Variable(0, trainable=False)
    self.baseline_loss = []
    self.ema = tf.train.ExponentialMovingAverage(decay=0.999)
    self.maintain_ema_ops = []
    self.optimizer_class = tf.train.AdamOptimizer(
        learning_rate=1*self.hparams.learning_rate,
        beta2=self.hparams.beta2)

    self._generate_randomness()
    self._create_network()
  def initialize(self, sess):
    """Store the session used by partial_fit/partial_grad/partial_eval."""
    self.sess = sess

  def _create_eta(self, shape=[], collection='CV'):
    """Create a learned scaling in (0, 2); sigmoid(0)*2 initializes it to 1."""
    return 2 * tf.sigmoid(tf.Variable(tf.zeros(shape), trainable=False,
                                      collections=[collection, tf.GraphKeys.GLOBAL_VARIABLES, Q_COLLECTION]))

  def _create_baseline(self, n_output=1, n_hidden=100,
                       is_zero_init=False,
                       collection='BASELINE'):
    """Build an input-dependent baseline MLP on the (centered) input.

    Variables are created trainable=False; they are trained separately via
    the baseline loss collected in self.baseline_loss.
    """
    # center input
    h = self._x
    if self.mean_xs is not None:
      h -= self.mean_xs

    if is_zero_init:
      initializer = init_ops.zeros_initializer()
    else:
      initializer = slim.variance_scaling_initializer()

    with slim.arg_scope([slim.fully_connected],
                        variables_collections=[collection, Q_COLLECTION],
                        trainable=False,
                        weights_initializer=initializer):
      h = slim.fully_connected(h, n_hidden, activation_fn=tf.nn.tanh)
      baseline = slim.fully_connected(h, n_output, activation_fn=None)

      if n_output == 1:
        baseline = tf.reshape(baseline, [-1])  # very important to reshape

    return baseline
  def _create_transformation(self, input, n_output, reuse, scope_prefix):
    """Create the deterministic transformation between stochastic layers.

    If self.hparam.nonlinear:
      2 x tanh layers
    Else:
      1 x linear layer

    Returns the layer's pre-activation logits.
    """
    if self.hparams.nonlinear:
      h = slim.fully_connected(input,
                               self.hparams.n_hidden,
                               reuse=reuse,
                               activation_fn=tf.nn.tanh,
                               scope='%s_nonlinear_1' % scope_prefix)
      h = slim.fully_connected(h,
                               self.hparams.n_hidden,
                               reuse=reuse,
                               activation_fn=tf.nn.tanh,
                               scope='%s_nonlinear_2' % scope_prefix)
      h = slim.fully_connected(h,
                               n_output,
                               reuse=reuse,
                               activation_fn=None,
                               scope='%s' % scope_prefix)
    else:
      h = slim.fully_connected(input,
                               n_output,
                               reuse=reuse,
                               activation_fn=None,
                               scope='%s' % scope_prefix)
    return h
  def _recognition_network(self, sampler=None, log_likelihood_func=None):
    """x values -> samples from Q and return log Q(h|x).

    Args:
      sampler: callable (logits, uniform_noise, layer) -> sample dict;
        defaults to the hard Bernoulli sampler.
      log_likelihood_func: callable (sample, logits) -> log-likelihood;
        defaults to binary log-likelihood.

    Returns:
      (logQ list of per-layer log-likelihoods, dict of per-layer samples).
    """
    samples = {}
    # Reuse variables after the first time this network has been built.
    reuse = None if not self.run_recognition_network else True

    # Set defaults
    if sampler is None:
      sampler = self._random_sample

    if log_likelihood_func is None:
      log_likelihood_func = lambda sample, log_params: (
          U.binary_log_likelihood(sample['activation'], log_params))

    logQ = []

    if self.hparams.task in ['sbn', 'omni']:
      # Initialize the edge case
      samples[-1] = {'activation': self._x}
      if self.mean_xs is not None:
        samples[-1]['activation'] -= self.mean_xs  # center the input
      samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[Q_COLLECTION]):
        for i in xrange(self.hparams.n_layer):
          # Set up the input to the layer
          input = 2.0*samples[i-1]['activation'] - 1.0

          # Create the conditional distribution (output is the logits)
          h = self._create_transformation(input,
                                          n_output=self.hparams.n_hidden,
                                          reuse=reuse,
                                          scope_prefix='q_%d' % i)

          samples[i] = sampler(h, self.uniform_samples[i], i)
          logQ.append(log_likelihood_func(samples[i], h))

      self.run_recognition_network = True
      return logQ, samples
    elif self.hparams.task == 'sp':
      # Initialize the edge case
      samples[-1] = {'activation': tf.split(self._x,
                                            num_or_size_splits=2,
                                            axis=1)[0]}  # top half of digit
      if self.mean_xs is not None:
        samples[-1]['activation'] -= np.split(self.mean_xs, 2, 0)[0]  # center the input
      samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[Q_COLLECTION]):
        for i in xrange(self.hparams.n_layer):
          # Set up the input to the layer
          input = 2.0*samples[i-1]['activation'] - 1.0

          # Create the conditional distribution (output is the logits)
          h = self._create_transformation(input,
                                          n_output=self.hparams.n_hidden,
                                          reuse=reuse,
                                          scope_prefix='q_%d' % i)

          samples[i] = sampler(h, self.uniform_samples[i], i)
          logQ.append(log_likelihood_func(samples[i], h))

      self.run_recognition_network = True
      return logQ, samples
  def _generator_network(self, samples, logQ, log_likelihood_func=None):
    '''Returns learning signal and function.

    This is the implementation for SBNs for the ELBO.

    Args:
      samples: dictionary of sampled latent variables
      logQ: list of log q(h_i) terms
      log_likelihood_func: function used to compute log probs for the latent
        variables

    Returns:
      learning_signal: the "reward" function
      function_term: part of the function that depends on the parameters
        and needs to have the gradient taken through
    '''
    reuse=None if not self.run_generator_network else True

    if self.hparams.task in ['sbn', 'omni']:
      if log_likelihood_func is None:
        log_likelihood_func = lambda sample, log_params: (
            U.binary_log_likelihood(sample['activation'], log_params))

      # Prior term for the top stochastic layer.
      logPPrior = log_likelihood_func(
          samples[self.hparams.n_layer-1],
          tf.expand_dims(self.prior, 0))

      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[P_COLLECTION]):
        # Walk the generative chain top-down.
        for i in reversed(xrange(self.hparams.n_layer)):
          if i == 0:
            n_output = self.hparams.n_input
          else:
            n_output = self.hparams.n_hidden
          input = 2.0*samples[i]['activation']-1.0

          h = self._create_transformation(input,
                                          n_output,
                                          reuse=reuse,
                                          scope_prefix='p_%d' % i)

          if i == 0:
            # Assume output is binary
            logP = U.binary_log_likelihood(self._x, h + self.train_bias)
          else:
            logPPrior += log_likelihood_func(samples[i-1], h)

      self.run_generator_network = True
      # ELBO = logP + logPPrior - logQ; the second return keeps only the
      # terms that depend on generator parameters.
      return logP + logPPrior - tf.add_n(logQ), logP + logPPrior
    elif self.hparams.task == 'sp':
      with slim.arg_scope([slim.fully_connected],
                          weights_initializer=slim.variance_scaling_initializer(),
                          variables_collections=[P_COLLECTION]):
        n_output = int(self.hparams.n_input/2)
        i = self.hparams.n_layer - 1  # use the last layer
        input = 2.0*samples[i]['activation']-1.0

        h = self._create_transformation(input,
                                        n_output,
                                        reuse=reuse,
                                        scope_prefix='p_%d' % i)

        # Predict on the lower half of the image
        logP = U.binary_log_likelihood(tf.split(self._x,
                                                num_or_size_splits=2,
                                                axis=1)[1],
                                       h + np.split(self.train_bias, 2, 0)[1])

      self.run_generator_network = True
      return logP, logP
  def _create_loss(self):
    """Build the plain REINFORCE surrogate loss; returns the ELBO tensor."""
    # Hard loss
    logQHard, samples = self._recognition_network()
    reinforce_learning_signal, reinforce_model_grad = self._generator_network(samples, logQHard)
    logQHard = tf.add_n(logQHard)

    # REINFORCE
    # NOTE(review): `center` is not defined anywhere in this chunk --
    # presumably a module-level helper that centers the learning signal;
    # verify it exists in the full file.
    learning_signal = tf.stop_gradient(center(reinforce_learning_signal))
    self.optimizerLoss = -(learning_signal*logQHard +
                           reinforce_model_grad)
    self.lHat = map(tf.reduce_mean, [
        reinforce_learning_signal,
        U.rms(learning_signal),
    ])

    return reinforce_learning_signal

  def _reshape(self, t):
    """Reshape a flat [n_samples*batch] tensor to [batch, n_samples]."""
    return tf.transpose(tf.reshape(t,
                                   [self.n_samples, -1]))

  def compute_tensor_variance(self, t):
    """Compute the mean per component variance.

    Use a moving average to estimate the required moments.
    """
    t_sq = tf.reduce_mean(tf.square(t))
    self.maintain_ema_ops.append(self.ema.apply([t, t_sq]))

    # mean per component variance: E[t^2] - mean(E[t]^2)
    variance_estimator = (self.ema.average(t_sq) -
                          tf.reduce_mean(
                              tf.square(self.ema.average(t))))
    return variance_estimator
def _create_train_op(self, grads_and_vars, extra_grads_and_vars=[]):
'''
Args:
grads_and_vars: gradients to apply and compute running average variance
extra_grads_and_vars: gradients to apply (not used to compute average variance)
'''
# Variance summaries
first_moment = U.vectorize(grads_and_vars, skip_none=True)
second_moment = tf.square(first_moment)
self.maintain_ema_ops.append(self.ema.apply([first_moment, second_moment]))
# Add baseline losses
if len(self.baseline_loss) > 0:
mean_baseline_loss = tf.reduce_mean(tf.add_n(self.baseline_loss))
extra_grads_and_vars += self.optimizer_class.compute_gradients(
mean_baseline_loss,
var_list=tf.get_collection('BASELINE'))
# Ensure that all required tensors are computed before updates are executed
extra_optimizer = tf.train.AdamOptimizer(
learning_rate=10*self.hparams.learning_rate,
beta2=self.hparams.beta2)
with tf.control_dependencies(
[tf.group(*[g for g, _ in (grads_and_vars + extra_grads_and_vars) if g is not None])]):
# Filter out the P_COLLECTION variables if we're in eval mode
if self.eval_mode:
grads_and_vars = [(g, v) for g, v in grads_and_vars
if v not in tf.get_collection(P_COLLECTION)]
train_op = self.optimizer_class.apply_gradients(grads_and_vars,
global_step=self.global_step)
if len(extra_grads_and_vars) > 0:
extra_train_op = extra_optimizer.apply_gradients(extra_grads_and_vars)
else:
extra_train_op = tf.no_op()
self.optimizer = tf.group(train_op, extra_train_op, *self.maintain_ema_ops)
# per parameter variance
variance_estimator = (self.ema.average(second_moment) -
tf.square(self.ema.average(first_moment)))
self.grad_variance = tf.reduce_mean(variance_estimator)
  def _create_network(self):
    """Assemble the loss, the training op, and the IWAE evaluation bound."""
    logF = self._create_loss()
    self.optimizerLoss = tf.reduce_mean(self.optimizerLoss)

    # Setup optimizer
    grads_and_vars = self.optimizer_class.compute_gradients(self.optimizerLoss)
    self._create_train_op(grads_and_vars)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
  def partial_fit(self, X, n_samples=1):
    """Run one optimizer step on batch X.

    Returns (lHat values, gradient variance, global step, temperature).
    """
    # Some subclasses expose a list of variances; fall back to the scalar.
    if hasattr(self, 'grad_variances'):
      grad_variance_field_to_return = self.grad_variances
    else:
      grad_variance_field_to_return = self.grad_variance
    _, res, grad_variance, step, temperature = self.sess.run(
        (self.optimizer, self.lHat, grad_variance_field_to_return, self.global_step, self.temperature_variable),
        feed_dict={self.x: X, self.n_samples: n_samples})
    return res, grad_variance, step, temperature

  def partial_grad(self, X, n_samples=1):
    """Evaluate control-variate gradients without applying an update."""
    control_variate_grads, step = self.sess.run(
        (self.control_variate_grads, self.global_step),
        feed_dict={self.x: X, self.n_samples: n_samples})
    return control_variate_grads, step

  def partial_eval(self, X, n_samples=5):
    """Evaluate the model on X; large n_samples is chunked to avoid OOM."""
    if n_samples < 1000:
      res, iwae = self.sess.run(
          (self.lHat, self.iwae),
          feed_dict={self.x: X, self.n_samples: n_samples})
      res = [iwae] + res
    else:  # special case to handle OOM
      assert n_samples % 100 == 0, "When using large # of samples, it must be divisble by 100"
      res = []
      for i in xrange(int(n_samples/100)):
        logF, = self.sess.run(
            (self.logF,),
            feed_dict={self.x: X, self.n_samples: 100})
        res.append(logsumexp(logF, axis=1))
      # Combine the per-chunk log-sum-exps into a single IWAE estimate.
      res = [np.mean(logsumexp(res, axis=0) - np.log(n_samples))]
    return res
  # Random samplers
  def _mean_sample(self, log_alpha, _, layer):
    """Returns mean of random variables parameterized by log_alpha."""
    mu = tf.nn.sigmoid(log_alpha)
    return {
        'preactivation': mu,
        'activation': mu,
        'log_param': log_alpha,
    }

  def _generate_randomness(self):
    """Draw one uniform noise tensor per layer (gradients stopped)."""
    for i in xrange(self.hparams.n_layer):
      self.uniform_samples[i] = tf.stop_gradient(tf.random_uniform(
          [self.batch_size, self.hparams.n_hidden]))

  def _u_to_v(self, log_alpha, u, eps = 1e-8):
    """Convert u to tied randomness in v."""
    u_prime = tf.nn.sigmoid(-log_alpha)  # g(u') = 0

    # Noise conditioned on the hard sample being 1.
    v_1 = (u - u_prime) / tf.clip_by_value(1 - u_prime, eps, 1)
    v_1 = tf.clip_by_value(v_1, 0, 1)
    v_1 = tf.stop_gradient(v_1)
    v_1 = v_1*(1 - u_prime) + u_prime
    # Noise conditioned on the hard sample being 0.
    v_0 = u / tf.clip_by_value(u_prime, eps, 1)
    v_0 = tf.clip_by_value(v_0, 0, 1)
    v_0 = tf.stop_gradient(v_0)
    v_0 = v_0 * u_prime

    v = tf.where(u > u_prime, v_1, v_0)
    v = tf.check_numerics(v, 'v sampling is not numerically stable.')
    v = v + tf.stop_gradient(-v + u)  # v and u are the same up to numerical errors
    return v

  def _random_sample(self, log_alpha, u, layer):
    """Returns sampled random variables parameterized by log_alpha."""
    # Generate tied randomness for later
    if layer not in self.uniform_samples_v:
      self.uniform_samples_v[layer] = self._u_to_v(log_alpha, u)

    # Sample random variable underlying softmax/argmax
    x = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
    samples = tf.stop_gradient(tf.to_float(x > 0))

    return {
        'preactivation': x,
        'activation': samples,
        'log_param': log_alpha,
    }
  def _random_sample_soft(self, log_alpha, u, layer, temperature=None):
    """Returns sampled random variables parameterized by log_alpha.

    Relaxed (concrete/Gumbel-Softmax style) sample controlled by
    `temperature`; defaults to the hparams temperature.
    """
    if temperature is None:
      temperature = self.hparams.temperature

    # Sample random variable underlying softmax/argmax
    x = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
    x /= tf.expand_dims(temperature, -1)

    if self.hparams.muprop_relaxation:
      y = tf.nn.sigmoid(x + log_alpha * tf.expand_dims(temperature/(temperature + 1), -1))
    else:
      y = tf.nn.sigmoid(x)

    return {
        'preactivation': x,
        'activation': y,
        'log_param': log_alpha
    }

  def _random_sample_soft_v(self, log_alpha, _, layer, temperature=None):
    """Returns sampled random variables parameterized by log_alpha.

    Same as _random_sample_soft but reuses the tied noise v generated by
    the matching hard sample for this layer.
    """
    v = self.uniform_samples_v[layer]
    return self._random_sample_soft(log_alpha, v, layer, temperature)
def get_gumbel_gradient(self):
logQ, softSamples = self._recognition_network(sampler=self._random_sample_soft)
logQ = tf.add_n(logQ)
logPPrior, logP = self._generator_network(softSamples)
softELBO = logPPrior + logP - logQ
gumbel_gradient = (self.optimizer_class.
compute_gradients(softELBO))
debug = {
'softELBO': softELBO,
}
return gumbel_gradient, debug
  # samplers used for quadratic version
  def _random_sample_switch(self, log_alpha, u, layer, switch_layer, temperature=None):
    """Run partial discrete, then continuous path.

    Args:
      switch_layer: this layer and beyond will be continuous
    """
    if layer < switch_layer:
      return self._random_sample(log_alpha, u, layer)
    else:
      return self._random_sample_soft(log_alpha, u, layer, temperature)

  def _random_sample_switch_v(self, log_alpha, u, layer, switch_layer, temperature=None):
    """Run partial discrete, then continuous path.

    Uses the tied randomness v on the continuous portion.

    Args:
      switch_layer: this layer and beyond will be continuous
    """
    if layer < switch_layer:
      return self._random_sample(log_alpha, u, layer)
    else:
      return self._random_sample_soft_v(log_alpha, u, layer, temperature)
  # #####
  # Gradient computation
  # #####
  def get_nvil_gradient(self):
    """Compute the NVIL gradient.

    Returns (grads_and_vars, debug dict).
    """
    # Hard loss
    logQHard, samples = self._recognition_network()
    ELBO, reinforce_model_grad = self._generator_network(samples, logQHard)
    logQHard = tf.add_n(logQHard)

    # Add baselines (no variance normalization)
    learning_signal = tf.stop_gradient(ELBO) - self._create_baseline()

    # Set up losses
    self.baseline_loss.append(tf.square(learning_signal))
    optimizerLoss = -(tf.stop_gradient(learning_signal)*logQHard +
                      reinforce_model_grad)
    optimizerLoss = tf.reduce_mean(optimizerLoss)

    nvil_gradient = self.optimizer_class.compute_gradients(optimizerLoss)
    debug = {
        'ELBO': ELBO,
        'RMS of centered learning signal': U.rms(learning_signal),
    }
    return nvil_gradient, debug

  def get_simple_muprop_gradient(self):
    """ Computes the simple muprop gradient.

    This muprop control variate does not include the linear term.
    """
    # Hard loss
    logQHard, hardSamples = self._recognition_network()
    hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)

    # Soft loss: deterministic mean-field pass used as the control variate.
    logQ, muSamples = self._recognition_network(sampler=self._mean_sample)
    muELBO, _ = self._generator_network(muSamples, logQ)

    scaling_baseline = self._create_eta(collection='BASELINE')
    learning_signal = (hardELBO
                       - scaling_baseline * muELBO
                       - self._create_baseline())
    self.baseline_loss.append(tf.square(learning_signal))

    optimizerLoss = -(tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
                      + reinforce_model_grad)
    optimizerLoss = tf.reduce_mean(optimizerLoss)

    simple_muprop_gradient = (self.optimizer_class.
                              compute_gradients(optimizerLoss))
    debug = {
        'ELBO': hardELBO,
        'muELBO': muELBO,
        'RMS': U.rms(learning_signal),
    }
    return simple_muprop_gradient, debug
  def get_muprop_gradient(self):
    """Compute the full MuProp gradient (with first-order Taylor term).

    random sample function that actually returns mean
    new forward pass that returns logQ as a list
    can get x_i from samples
    """
    # Hard loss
    logQHard, hardSamples = self._recognition_network()
    hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)

    # Soft loss: deterministic mean-field pass.
    logQ, muSamples = self._recognition_network(sampler=self._mean_sample)
    muELBO, _ = self._generator_network(muSamples, logQ)

    # Compute gradients of the mean-field ELBO w.r.t. each layer's activation.
    muELBOGrads = tf.gradients(tf.reduce_sum(muELBO),
                               [ muSamples[i]['activation'] for
                                 i in xrange(self.hparams.n_layer) ])

    # Compute MuProp gradient estimates
    learning_signal = hardELBO
    optimizerLoss = 0.0
    learning_signals = []
    for i in xrange(self.hparams.n_layer):
      # First-order Taylor correction around the mean-field activations.
      dfDiff = tf.reduce_sum(
          muELBOGrads[i] * (hardSamples[i]['activation'] -
                            muSamples[i]['activation']),
          axis=1)
      dfMu = tf.reduce_sum(
          tf.stop_gradient(muELBOGrads[i]) *
          tf.nn.sigmoid(hardSamples[i]['log_param']),
          axis=1)

      scaling_baseline_0 = self._create_eta(collection='BASELINE')
      scaling_baseline_1 = self._create_eta(collection='BASELINE')
      learning_signals.append(learning_signal - scaling_baseline_0 * muELBO - scaling_baseline_1 * dfDiff - self._create_baseline())
      self.baseline_loss.append(tf.square(learning_signals[i]))
      optimizerLoss += (
          logQHard[i] * tf.stop_gradient(learning_signals[i]) +
          tf.stop_gradient(scaling_baseline_1) * dfMu)
    optimizerLoss += reinforce_model_grad
    optimizerLoss *= -1

    optimizerLoss = tf.reduce_mean(optimizerLoss)

    muprop_gradient = self.optimizer_class.compute_gradients(optimizerLoss)
    debug = {
        'ELBO': hardELBO,
        'muELBO': muELBO,
    }

    debug.update(dict([
        ('RMS learning signal layer %d' % i, U.rms(learning_signal))
        for (i, learning_signal) in enumerate(learning_signals)]))
    return muprop_gradient, debug
  # REBAR gradient helper functions
  def _create_gumbel_control_variate(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.

    Returns (control variate h, extra = (softELBO_v, reparam term)).
    '''
    if temperature is None:
      temperature = self.hparams.temperature

    logQ, softSamples = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft, temperature=temperature))
    softELBO, _ = self._generator_network(softSamples, logQ)
    logQ = tf.add_n(logQ)

    # Generate the softELBO_v (should be the same value but different grads)
    logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft_v, temperature=temperature))
    softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)
    logQ_v = tf.add_n(logQ_v)

    # Compute losses
    learning_signal = tf.stop_gradient(softELBO_v)

    # Control variate
    h = (tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
         - softELBO + softELBO_v)

    extra = (softELBO_v, -softELBO + softELBO_v)

    return h, extra

  def _create_gumbel_control_variate_quadratic(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.

    Quadratic variant: one control-variate term per layer, switching from
    the discrete to the relaxed path at that layer.
    '''
    if temperature is None:
      temperature = self.hparams.temperature

    h = 0
    extra = []
    for layer in xrange(self.hparams.n_layer):
      logQ, softSamples = self._recognition_network(sampler=functools.partial(
          self._random_sample_switch, switch_layer=layer, temperature=temperature))
      softELBO, _ = self._generator_network(softSamples, logQ)

      # Generate the softELBO_v (should be the same value but different grads)
      logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
          self._random_sample_switch_v, switch_layer=layer, temperature=temperature))
      softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)

      # Compute losses
      learning_signal = tf.stop_gradient(softELBO_v)

      # Control variate
      h += (tf.stop_gradient(learning_signal) * logQHard[layer]
            - softELBO + softELBO_v)

      extra.append((softELBO_v, -softELBO + softELBO_v))

    return h, extra
  def _create_hard_elbo(self):
    """Build the discrete ELBO and its NVIL-style surrogate gradient term."""
    logQHard, hardSamples = self._recognition_network()
    hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)
    reinforce_learning_signal = tf.stop_gradient(hardELBO)

    # Center learning signal
    baseline = self._create_baseline(collection='CV')
    reinforce_learning_signal = tf.stop_gradient(reinforce_learning_signal) - baseline

    nvil_gradient = (tf.stop_gradient(hardELBO) - baseline) * tf.add_n(logQHard) + reinforce_model_grad

    return hardELBO, nvil_gradient, logQHard

  def multiply_by_eta(self, h_grads, eta):
    """Scale all gradients by one shared eta (created lazily)."""
    # Modifies eta
    res = []
    eta_statistics = []
    for (g, v) in h_grads:
      if g is None:
        res.append((g, v))
      else:
        if 'network' not in eta:
          eta['network'] = self._create_eta()
        res.append((g*eta['network'], v))
        eta_statistics.append(eta['network'])
    return res, eta_statistics

  def multiply_by_eta_per_layer(self, h_grads, eta):
    """Scale each gradient by a per-variable eta (created lazily)."""
    # Modifies eta
    res = []
    eta_statistics = []
    for (g, v) in h_grads:
      if g is None:
        res.append((g, v))
      else:
        if v not in eta:
          eta[v] = self._create_eta()
        res.append((g*eta[v], v))
        eta_statistics.append(eta[v])
    return res, eta_statistics
def multiply_by_eta_per_unit(self, h_grads, eta):
# Modifies eta
res = []
eta_statistics = []
for (g, v) in h_grads:
if g is None:
res.append((g, v))
else:
if v not in eta:
g_shape = g.shape_as_list()
assert len(g_shape) <= 2, 'Gradient has too many dimensions'
if len(g_shape) == 1:
eta[v] = self._create_eta(g_shape)
else:
eta[v] = self._create_eta([1, g_shape[1]])
h_grads.append((g*eta[v], v))
eta_statistics.extend(tf.nn.moments(tf.squeeze(eta[v]), axes=[0]))
return res, eta_statistics
  def get_dynamic_rebar_gradient(self):
    """Get the dynamic rebar gradient (t, eta optimized)."""
    # One copy of the pre-temperature per sample so per-sample temperature
    # gradients can be taken below.
    tiled_pre_temperature = tf.tile([self.pre_temperature_variable],
                                    [self.batch_size])
    temperature = tf.exp(tiled_pre_temperature)

    hardELBO, nvil_gradient, logQHard = self._create_hard_elbo()
    if self.hparams.quadratic:
      gumbel_cv, extra = self._create_gumbel_control_variate_quadratic(logQHard, temperature=temperature)
    else:
      gumbel_cv, extra = self._create_gumbel_control_variate(logQHard, temperature=temperature)

    f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient))

    eta = {}
    h_grads, eta_statistics = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)),
        eta)

    model_grads = U.add_grads_and_vars(f_grads, h_grads)
    total_grads = model_grads

    # Construct the variance objective
    g = U.vectorize(model_grads, set_none_to_zero=True)
    self.maintain_ema_ops.append(self.ema.apply([g]))
    gbar = 0  # tf.stop_gradient(self.ema.average(g))
    variance_objective = tf.reduce_mean(tf.square(g - gbar))

    reinf_g_t = 0
    if self.hparams.quadratic:
      # Per-layer REINFORCE terms for the temperature gradient.
      for layer in xrange(self.hparams.n_layer):
        gumbel_learning_signal, _ = extra[layer]
        df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
        reinf_g_t_i, _ = self.multiply_by_eta_per_layer(
            self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * logQHard[layer])),
            eta)
        reinf_g_t += U.vectorize(reinf_g_t_i, set_none_to_zero=True)

      reparam = tf.add_n([reparam_i for _, reparam_i in extra])
    else:
      gumbel_learning_signal, reparam = extra
      df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
      reinf_g_t, _ = self.multiply_by_eta_per_layer(
          self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * tf.add_n(logQHard))),
          eta)
      reinf_g_t = U.vectorize(reinf_g_t, set_none_to_zero=True)

    reparam_g, _ = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(reparam)),
        eta)
    reparam_g = U.vectorize(reparam_g, set_none_to_zero=True)
    reparam_g_t = tf.gradients(tf.reduce_mean(2*tf.stop_gradient(g - gbar)*reparam_g), self.pre_temperature_variable)[0]

    # Gradient of the variance objective w.r.t. the pre-temperature.
    variance_objective_grad = tf.reduce_mean(2*(g - gbar)*reinf_g_t) + reparam_g_t

    debug = { 'ELBO': hardELBO,
              'etas': eta_statistics,
              'variance_objective': variance_objective,
            }
    return total_grads, debug, variance_objective, variance_objective_grad
  def get_rebar_gradient(self):
    """Get the rebar gradient.

    Returns (grads_and_vars, debug dict, variance objective tensor).
    """
    hardELBO, nvil_gradient, logQHard = self._create_hard_elbo()
    if self.hparams.quadratic:
      gumbel_cv, _ = self._create_gumbel_control_variate_quadratic(logQHard)
    else:
      gumbel_cv, _ = self._create_gumbel_control_variate(logQHard)

    f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient))

    eta = {}
    h_grads, eta_statistics = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)),
        eta)

    model_grads = U.add_grads_and_vars(f_grads, h_grads)
    total_grads = model_grads

    # Construct the variance objective
    variance_objective = tf.reduce_mean(tf.square(U.vectorize(model_grads, set_none_to_zero=True)))

    debug = { 'ELBO': hardELBO,
              'etas': eta_statistics,
              'variance_objective': variance_objective,
            }
    return total_grads, debug, variance_objective
###
# Create variants
###
class SBNSimpleMuProp(SBN):
  """SBN trained with the simple MuProp gradient estimator."""

  def _create_loss(self):
    """Return (ELBO tensor, precomputed grads_and_vars)."""
    simple_muprop_gradient, debug = self.get_simple_muprop_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
        debug['muELBO'],
    ])

    return debug['ELBO'], simple_muprop_gradient

  def _create_network(self):
    logF, loss_grads = self._create_loss()
    self._create_train_op(loss_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNMuProp(SBN):
  """SBN trained with the full MuProp gradient estimator."""

  def _create_loss(self):
    """Return (ELBO tensor, precomputed grads_and_vars)."""
    muprop_gradient, debug = self.get_muprop_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
        debug['muELBO'],
    ])

    return debug['ELBO'], muprop_gradient

  def _create_network(self):
    logF, loss_grads = self._create_loss()
    self._create_train_op(loss_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNNVIL(SBN):
  """SBN trained with the NVIL gradient estimator."""

  def _create_loss(self):
    """Return (ELBO tensor, precomputed grads_and_vars)."""
    nvil_gradient, debug = self.get_nvil_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
    ])

    return debug['ELBO'], nvil_gradient

  def _create_network(self):
    logF, loss_grads = self._create_loss()
    self._create_train_op(loss_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNRebar(SBN):
  """SBN trained with the REBAR estimator (fixed temperature)."""

  def _create_loss(self):
    """Return (ELBO tensor, grads_and_vars, variance objective)."""
    rebar_gradient, debug, variance_objective = self.get_rebar_gradient()
    self.lHat = map(tf.reduce_mean, [
        debug['ELBO'],
    ])
    self.lHat.extend(map(tf.reduce_mean, debug['etas']))

    return debug['ELBO'], rebar_gradient, variance_objective

  def _create_network(self):
    logF, loss_grads, variance_objective = self._create_loss()

    # Create additional updates for control variates and temperature
    eta_grads = (self.optimizer_class.compute_gradients(variance_objective,
                                                        var_list=tf.get_collection('CV')))
    self._create_train_op(loss_grads, eta_grads)

    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNDynamicRebar(SBN):
  """SBN trained with REBAR where the temperature is adapted online."""

  def _create_loss(self):
    (gradient_estimate, diagnostics, variance_objective,
     variance_objective_grad) = self.get_dynamic_rebar_gradient()
    tracked = [diagnostics['ELBO'], self.temperature_variable]
    self.lHat = [tf.reduce_mean(t) for t in tracked]
    # NOTE: etas are appended un-reduced here (unlike SBNRebar).
    self.lHat.extend(diagnostics['etas'])
    return (diagnostics['ELBO'], gradient_estimate, variance_objective,
            variance_objective_grad)

  def _create_network(self):
    (log_f, gradient_estimate, variance_objective,
     variance_objective_grad) = self._create_loss()
    # Control-variate updates plus a manually supplied gradient entry for
    # the pre-sigmoid temperature parameter.
    eta_grads = self.optimizer_class.compute_gradients(
        variance_objective, var_list=tf.get_collection('CV'))
    eta_grads = eta_grads + [(variance_objective_grad,
                              self.pre_temperature_variable)]
    self._create_train_op(gradient_estimate, eta_grads)
    # IWAE multi-sample lower bound, used for evaluation only.
    self.logF = self._reshape(log_f)
    log_n = tf.log(tf.to_float(self.n_samples))
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) - log_n)
class SBNTrackGradVariances(SBN):
  """Follow NVIL, compute gradient variances for NVIL, MuProp and REBAR."""

  def compute_gradient_moments(self, grads_and_vars):
    # Flatten the gradient into a single vector (None grads become zeros)
    # and maintain EMAs of its first and second moments.
    first_moment = U.vectorize(grads_and_vars, set_none_to_zero=True)
    second_moment = tf.square(first_moment)
    self.maintain_ema_ops.append(self.ema.apply([first_moment, second_moment]))
    return self.ema.average(first_moment), self.ema.average(second_moment)

  def _create_loss(self):
    # (name, gradient-builder) pairs; the order here determines the order of
    # entries in grad_variances reported during training.
    self.losses = [
        ('NVIL', self.get_nvil_gradient),
        ('SimpleMuProp', self.get_simple_muprop_gradient),
        ('MuProp', self.get_muprop_gradient),
    ]
    moments = []
    for k, v in self.losses:
      print(k)
      # Calling v() builds the estimator's ops in the graph; order matters.
      gradient, debug = v()
      if k == 'SimpleMuProp':
        # SimpleMuProp's gradient is the one actually used for training.
        ELBO = debug['ELBO']
        gradient_to_follow = gradient
      moments.append(self.compute_gradient_moments(
          gradient))
    # The REBAR variants return extra outputs, so they are handled outside
    # the loop above.
    self.losses.append(('DynamicREBAR', self.get_dynamic_rebar_gradient))
    dynamic_rebar_gradient, _, variance_objective, variance_objective_grad = self.get_dynamic_rebar_gradient()
    moments.append(self.compute_gradient_moments(dynamic_rebar_gradient))
    self.losses.append(('REBAR', self.get_rebar_gradient))
    rebar_gradient, _, variance_objective2 = self.get_rebar_gradient()
    moments.append(self.compute_gradient_moments(rebar_gradient))
    # Mean of the estimators' first moments; used as the reference point for
    # the per-estimator variance/deviation diagnostics below.
    mu = tf.reduce_mean(tf.stack([f for f, _ in moments]), axis=0)
    self.grad_variances = []
    deviations = []
    for f, s in moments:
      self.grad_variances.append(tf.reduce_mean(s - tf.square(mu)))
      deviations.append(tf.reduce_mean(tf.square(f - mu)))
    self.lHat = map(tf.reduce_mean, [
        ELBO,
        self.temperature_variable,
        variance_objective_grad,
        variance_objective_grad*variance_objective_grad,
    ])
    self.lHat.extend(deviations)
    self.lHat.append(tf.log(tf.reduce_mean(mu*mu)))
#    self.lHat.extend(map(tf.log, grad_variances))
    # Both REBAR variance objectives are minimized jointly.
    return ELBO, gradient_to_follow, variance_objective + variance_objective2, variance_objective_grad

  def _create_network(self):
    logF, loss_grads, variance_objective, variance_objective_grad = self._create_loss()
    # Control-variate updates plus a manual gradient for the temperature.
    eta_grads = (self.optimizer_class.compute_gradients(variance_objective,
                                                        var_list=tf.get_collection('CV'))
                 + [(variance_objective_grad, self.pre_temperature_variable)])
    self._create_train_op(loss_grads, eta_grads)
    # Create IWAE lower bound for evaluation
    self.logF = self._reshape(logF)
    self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
                               tf.log(tf.to_float(self.n_samples)))
class SBNGumbel(SBN):
  """SBN trained through a Gumbel-Softmax (Concrete) relaxation."""

  def _random_sample_soft(self, log_alpha, u, layer, temperature=None):
    """Returns sampled random variables parameterized by log_alpha."""
    temperature = (self.hparams.temperature if temperature is None
                   else temperature)
    # Logistic noise added to the logits, then annealed by the temperature.
    logits = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
    logits = logits / temperature
    if self.hparams.muprop_relaxation:
      logits = logits + temperature/(temperature + 1)*log_alpha
    return {
        'preactivation': logits,
        'activation': tf.nn.sigmoid(logits),
        'log_param': log_alpha
    }

  def _create_loss(self):
    # Discrete (hard) pass: the quantity that is reported and returned.
    hard_log_q, hard_samples = self._recognition_network()
    hard_elbo, _ = self._generator_network(hard_samples, hard_log_q)
    # Relaxed (soft) pass: the differentiable surrogate that is optimized.
    soft_log_q, soft_samples = self._recognition_network(
        sampler=self._random_sample_soft)
    soft_elbo, _ = self._generator_network(soft_samples, soft_log_q)
    self.optimizerLoss = -soft_elbo
    self.lHat = [tf.reduce_mean(t) for t in [hard_elbo, soft_elbo]]
    return hard_elbo
# Default hyperparameters; individual values can be overridden with a
# comma-separated 'name=value' string parsed by HParams.parse().
default_hparams = tf.contrib.training.HParams(model='SBNGumbel',  # class name looked up in this module
                                              n_hidden=200,
                                              n_input=784,  # 28*28 (MNIST-sized) -- TODO confirm
                                              n_layer=1,
                                              nonlinear=False,
                                              learning_rate=0.001,
                                              temperature=0.5,  # used by SBNGumbel's soft sampler
                                              n_samples=1,
                                              batch_size=24,
                                              trial=1,
                                              muprop_relaxation=True,  # adds MuProp term in the relaxation
                                              dynamic_b=False, # dynamic binarization
                                              quadratic=True,
                                              beta2=0.99999,
                                              task='sbn',
                                              )
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import random
import sys
import os
import numpy as np
import tensorflow as tf
import rebar
import datasets
import logger as L
# Shorthand for TensorFlow's file I/O module.
gfile = tf.gfile

# Command-line flags, read below through FLAGS.
tf.app.flags.DEFINE_string("working_dir", "/tmp/rebar",
                           """Directory where to save data, write logs, etc.""")
tf.app.flags.DEFINE_string('hparams', '',
                           '''Comma separated list of name=value pairs.''')
tf.app.flags.DEFINE_integer('eval_freq', 20,
                            '''How often to run the evaluation step.''')
FLAGS = tf.flags.FLAGS
def manual_scalar_summary(name, value):
  """Builds a tf.Summary proto holding a single scalar value."""
  scalar = tf.Summary.Value(tag=name, simple_value=value)
  return tf.Summary(value=[scalar])
def eval(sbn, eval_xs, n_samples=100, batch_size=5):
  """Evaluates the model on eval_xs in batches.

  NOTE(review): shadows the builtin `eval`; kept for compatibility with
  existing callers in this file.

  Args:
    sbn: model exposing partial_eval(batch, n_samples).
    eval_xs: array of examples, first axis is the example index.
    n_samples: number of samples passed through to partial_eval.
    batch_size: number of examples per partial_eval call.

  Returns:
    Element-wise mean of the per-batch partial_eval results.
  """
  total = eval_xs.shape[0]
  partial_results = []
  for start in range(0, total, batch_size):
    chunk = eval_xs[start:start + batch_size]
    partial_results.append(sbn.partial_eval(chunk, n_samples))
  return np.mean(partial_results, axis=0)
def train(sbn, train_xs, valid_xs, test_xs, training_steps, debug=False):
  """Trains `sbn`, logging metrics and periodically evaluating.

  Args:
    sbn: model exposing hparams, global_step, initialize(), partial_fit().
    train_xs: training examples, first axis is the example index.
    valid_xs: validation examples.
    test_xs: test examples.
    training_steps: stop once the global step exceeds this value.
    debug: if True, print per-batch progress and return after ~100 examples.

  Returns:
    List of per-epoch mean metric vectors (None if stopped via debug).
  """
  # Encode all hparams into a single string used as run directory/row key,
  # e.g. "batch_size_24.beta2_0.99999....".
  hparams = sorted(sbn.hparams.values().items())
  hparams = (map(str, x) for x in hparams)
  hparams = ('_'.join(x) for x in hparams)
  hparams_str = '.'.join(hparams)
  logger = L.Logger()
  # Create the experiment name from the hparams
  # ('~' separator marks nonlinear models, '-' linear ones).
  experiment_name = ([str(sbn.hparams.n_hidden) for i in xrange(sbn.hparams.n_layer)] +
                     [str(sbn.hparams.n_input)])
  if sbn.hparams.nonlinear:
    experiment_name = '~'.join(experiment_name)
  else:
    experiment_name = '-'.join(experiment_name)
  experiment_name = 'SBN_%s' % experiment_name
  rowkey = {'experiment': experiment_name,
            'model': hparams_str}
  # Create summary writer
  summ_dir = os.path.join(FLAGS.working_dir, hparams_str)
  summary_writer = tf.summary.FileWriter(
      summ_dir, flush_secs=15, max_queue=100)
  # Supervisor handles checkpointing/recovery; summaries are written manually.
  sv = tf.train.Supervisor(logdir=os.path.join(
      FLAGS.working_dir, hparams_str),
                           save_summaries_secs=0,
                           save_model_secs=1200,
                           summary_op=None,
                           recovery_wait_secs=30,
                           global_step=sbn.global_step)
  with sv.managed_session() as sess:
    # Dump hparams to file
    with gfile.Open(os.path.join(FLAGS.working_dir,
                                 hparams_str,
                                 'hparams.json'),
                    'w') as out:
      json.dump(sbn.hparams.values(), out)
    sbn.initialize(sess)
    batch_size = sbn.hparams.batch_size
    scores = []
    n = train_xs.shape[0]
    index = range(n)
    # Outer loop: one iteration per epoch (shuffled pass over train_xs).
    while not sv.should_stop():
      lHats = []
      grad_variances = []
      temperatures = []
      random.shuffle(index)
      i = 0
      while i < n:
        batch_index = index[i:min(i+batch_size, n)]
        batch_xs = train_xs[batch_index, :]
        if sbn.hparams.dynamic_b:
          # Dynamically binarize the batch data
          batch_xs = (np.random.rand(*batch_xs.shape) < batch_xs).astype(float)
        lHat, grad_variance, step, temperature = sbn.partial_fit(batch_xs,
                                                                 sbn.hparams.n_samples)
        if debug:
          print(i, lHat)
          if i > 100:
            return
        lHats.append(lHat)
        grad_variances.append(grad_variance)
        temperatures.append(temperature)
        i += batch_size
      # Mean per-estimator gradient variance over the epoch, in log space.
      grad_variances = np.log(np.mean(grad_variances, axis=0)).tolist()
      summary_strings = []
      # A list means the model tracks several estimators
      # (SBNTrackGradVariances); pair each value with its estimator name.
      if isinstance(grad_variances, list):
        grad_variances = dict(zip([k for (k, v) in sbn.losses], map(float, grad_variances)))
        rowkey['step'] = step
        logger.log(rowkey, {'step': step,
                            'train': np.mean(lHats, axis=0)[0],
                            'grad_variances': grad_variances,
                            'temperature': np.mean(temperatures), })
        grad_variances = '\n'.join(map(str, sorted(grad_variances.iteritems())))
      else:
        rowkey['step'] = step
        logger.log(rowkey, {'step': step,
                            'train': np.mean(lHats, axis=0)[0],
                            'grad_variance': grad_variances,
                            'temperature': np.mean(temperatures), })
        summary_strings.append(manual_scalar_summary("log grad variance", grad_variances))
      print('Step %d: %s\n%s' % (step, str(np.mean(lHats, axis=0)), str(grad_variances)))
      # Every few epochs compute test and validation scores
      epoch = int(step / (train_xs.shape[0] / sbn.hparams.batch_size))
      if epoch % FLAGS.eval_freq == 0:
        valid_res = eval(sbn, valid_xs)
        test_res= eval(sbn, test_xs)
        print('\nValid %d: %s' % (step, str(valid_res)))
        print('Test %d: %s\n' % (step, str(test_res)))
        logger.log(rowkey, {'step': step,
                            'valid': valid_res[0],
                            'test': test_res[0]})
        logger.flush()  # Flush infrequently
      # Create summaries
      summary_strings.extend([
          manual_scalar_summary("Train ELBO", np.mean(lHats, axis=0)[0]),
          manual_scalar_summary("Temperature", np.mean(temperatures)),
      ])
      for summ_str in summary_strings:
        summary_writer.add_summary(summ_str, global_step=step)
      summary_writer.flush()
      sys.stdout.flush()
      scores.append(np.mean(lHats, axis=0))
      if step > training_steps:
        break
  return scores
def main():
  """Parses hparams, loads data, builds the chosen SBN model, and trains it."""
  # Parse hyperparams
  hparams = rebar.default_hparams
  hparams.parse(FLAGS.hparams)
  print(hparams.values())
  train_xs, valid_xs, test_xs = datasets.load_data(hparams)
  mean_xs = np.mean(train_xs, axis=0)  # Compute mean centering on training
  training_steps = 2000000
  # hparams.model names one of the SBN classes defined in the rebar module.
  model = getattr(rebar, hparams.model)
  sbn = model(hparams, mean_xs=mean_xs)
  # NOTE(review): the returned scores are currently unused.
  scores = train(sbn, train_xs, valid_xs, test_xs,
                 training_steps=training_steps, debug=False)

if __name__ == '__main__':
  main()
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic data management and plotting utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import cPickle as pickle
import getpass
import numpy as np
import gc
import tensorflow as tf
#
# Python utilities
#
def exp_moving_average(x, alpha=0.9):
  """Bias-corrected exponential moving average of a sequence.

  Args:
    x: iterable of numbers.
    alpha: smoothing factor in (0, 1); larger is smoother.

  Returns:
    np.array with one running-average entry per input element.
  """
  averages = []
  running = 0
  correction = 1
  for value in x:
    running += (1 - alpha)*(value - running)
    # Divide by (1 - alpha^t) so early entries are not biased toward zero.
    correction *= alpha
    averages.append(running/(1 - correction))
  return np.array(averages)
def sanitize(s):
  """Replaces '.' with '_' so the string is safe to embed in names/keys."""
  return '_'.join(s.split('.'))
#
# Tensorflow utilities
#
def softplus(x):
  '''
  Numerically stable softplus, log(1 + exp(x)).

  Let m = max(0, x); then
    softplus(x) = log(e^0 + e^x) = m + log(e^(-m) + e^(x - m)),
  and the argument of the log is guaranteed to be between 1 and 2.
  '''
  pivot = tf.maximum(tf.zeros_like(x), x)
  stabilized = tf.exp(-pivot) + tf.exp(x - pivot)
  return pivot + tf.log(stabilized)
def safe_log_prob(x, eps=1e-8):
  """log(x) with x clipped to [eps, 1.0] to avoid log(0)."""
  clipped = tf.clip_by_value(x, eps, 1.0)
  return tf.log(clipped)
def rms(x):
  """Root mean square of the elements of x."""
  mean_square = tf.reduce_mean(tf.square(x))
  return tf.sqrt(mean_square)
def center(x):
  """Centers each element by the mean of all *other* elements (leave-one-out)."""
  n_minus_one = tf.to_float(tf.shape(x)[0] - 1)
  leave_one_out_mean = (tf.reduce_sum(x) - x)/n_minus_one
  return x - leave_one_out_mean
def vectorize(grads_and_vars, set_none_to_zero=False, skip_none=False):
  """Flattens a grads_and_vars list into a single 1-D tensor.

  Args:
    grads_and_vars: list of (gradient, variable) pairs.
    set_none_to_zero: replace None gradients with zeros shaped like the var.
    skip_none: drop entries whose gradient is None.

  Returns:
    1-D tensor of all gradient values concatenated.
  """
  if set_none_to_zero:
    flat = [tf.reshape(g, [-1]) if g is not None else
            tf.reshape(tf.zeros_like(v), [-1]) for g, v in grads_and_vars]
  elif skip_none:
    flat = [tf.reshape(g, [-1]) for g, v in grads_and_vars if g is not None]
  else:
    flat = [tf.reshape(g, [-1]) for g, v in grads_and_vars]
  return tf.concat(flat, 0)
def add_grads_and_vars(a, b):
  '''Add grads_and_vars from two calls to tf.compute_gradients.

  Entries are matched positionally; the variables must agree pairwise.
  A None gradient on either side is treated as absent (the other side wins).
  '''
  combined = []
  for (grad_a, var_a), (grad_b, var_b) in zip(a, b):
    assert var_a == var_b
    if grad_a is None:
      combined.append((grad_b, var_b))
    elif grad_b is None:
      combined.append((grad_a, var_a))
    else:
      combined.append((grad_a + grad_b, var_a))
  return combined
def binary_log_likelihood(y, log_y_hat):
  """Computes binary log likelihood.

  Args:
    y: observed data
    log_y_hat: parameters of the binary variables

  Returns:
    log_likelihood, summed over axis 1
  """
  # log sigmoid(z) = -softplus(-z); log(1 - sigmoid(z)) = -z - softplus(-z).
  neg_softplus = -softplus(-log_y_hat)
  per_dim = y*neg_softplus + (1 - y)*(-log_y_hat + neg_softplus)
  return tf.reduce_sum(per_dim, 1)
def cov(a, b):
  """Compute the sample covariance between two vectors."""
  mean_a = tf.reduce_mean(a)
  mean_b = tf.reduce_mean(b)
  count = tf.to_float(tf.shape(a)[0])
  # Unbiased estimate: divide by n - 1.
  return tf.reduce_sum((a - mean_a)*(b - mean_b))/(count - 1.0)
def corr(a, b):
  """Sample correlation coefficient between two vectors."""
  covariance = cov(a, b)
  inv_std_a = tf.rsqrt(cov(a, a))
  inv_std_b = tf.rsqrt(cov(b, b))
  return covariance*inv_std_a*inv_std_b
def logSumExp(t, axis=0, keep_dims = False):
  '''Computes log(sum(exp(t))) in a numerically stable way.

  Args:
    t: input tensor
    axis: which axis to sum over
    keep_dims: whether to keep the reduced dim or not

  Returns:
    tensor with result
  '''
  # Shift by the max so the exponentials cannot overflow.
  max_t = tf.reduce_max(t, [axis])
  shifted = tf.exp(t - tf.expand_dims(max_t, axis))
  res = max_t + tf.log(tf.reduce_sum(shifted, [axis]))
  return tf.expand_dims(res, axis) if keep_dims else res
if __name__ == '__main__':
  # Fix: `app` was undefined here (NameError when run as a script); this
  # module only imports tensorflow, so the intended entry point is tf.app.run.
  tf.app.run()
......@@ -58,7 +58,7 @@ def build_input(dataset, data_path, batch_size, mode):
record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
# Convert from string to [depth * height * width] to [depth, height, width].
depth_major = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]),
depth_major = tf.reshape(tf.slice(record, [label_offset + label_bytes], [image_bytes]),
[depth, image_size, image_size])
# Convert from [depth, height, width] to [height, width, depth].
image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
......
......@@ -2,8 +2,7 @@
# Contains files for loading, training and evaluating TF-Slim-based models.
package(default_visibility = [
":internal",
"//domain_adaptation:__subpackages__",
"//visibility:public",
])
licenses(["notice"]) # Apache 2.0
......
......@@ -256,6 +256,17 @@ and/or multiple CPUs, either synchrononously or asynchronously.
See [model_deploy](https://github.com/tensorflow/models/blob/master/slim/deployment/model_deploy.py)
for details.
### TensorBoard
To visualize the losses and other metrics during training, you can use
[TensorBoard](https://github.com/tensorflow/tensorboard)
by running the command below.
```shell
tensorboard --logdir=${TRAIN_DIR}
```
Once TensorBoard is running, navigate your web browser to http://localhost:6006.
# Fine-tuning a model from an existing checkpoint
<a id='Tuning'></a>
......@@ -392,8 +403,7 @@ bazel-bin/tensorflow/examples/label_image/label_image \
--graph=/tmp/frozen_inception_v3.pb \
--labels=/tmp/imagenet_slim_labels.txt \
--input_mean=0 \
--input_std=255 \
--logtostderr
--input_std=255
```
......
......@@ -67,7 +67,7 @@ def main(_):
download_and_convert_mnist.run(FLAGS.dataset_dir)
else:
raise ValueError(
'dataset_name [%s] was not recognized.' % FLAGS.dataset_dir)
'dataset_name [%s] was not recognized.' % FLAGS.dataset_name)
if __name__ == '__main__':
tf.app.run()
......
......@@ -48,8 +48,7 @@ bazel-bin/tensorflow/examples/label_image/label_image \
--graph=/tmp/frozen_inception_v3.pb \
--labels=/tmp/imagenet_slim_labels.txt \
--input_mean=0 \
--input_std=255 \
--logtostderr
--input_std=255
"""
......@@ -63,7 +62,6 @@ from tensorflow.python.platform import gfile
from datasets import dataset_factory
from nets import nets_factory
slim = tf.contrib.slim
tf.app.flags.DEFINE_string(
......@@ -74,8 +72,8 @@ tf.app.flags.DEFINE_boolean(
'Whether to save out a training-focused version of the model.')
tf.app.flags.DEFINE_integer(
'default_image_size', 224,
'The image size to use if the model does not define it.')
'image_size', None,
'The image size to use, otherwise use the model default_image_size.')
tf.app.flags.DEFINE_string('dataset_name', 'imagenet',
'The name of the dataset to use with the model.')
......@@ -100,16 +98,13 @@ def main(_):
raise ValueError('You must supply the path to save to with --output_file')
tf.logging.set_verbosity(tf.logging.INFO)
with tf.Graph().as_default() as graph:
dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'validation',
dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'train',
FLAGS.dataset_dir)
network_fn = nets_factory.get_network_fn(
FLAGS.model_name,
num_classes=(dataset.num_classes - FLAGS.labels_offset),
is_training=FLAGS.is_training)
if hasattr(network_fn, 'default_image_size'):
image_size = network_fn.default_image_size
else:
image_size = FLAGS.default_image_size
image_size = FLAGS.image_size or network_fn.default_image_size
placeholder = tf.placeholder(name='input', dtype=tf.float32,
shape=[1, image_size, image_size, 3])
network_fn(placeholder)
......
......@@ -25,7 +25,7 @@ import os
import tensorflow as tf
from tensorflow.python.platform import gfile
from google3.third_party.tensorflow_models.slim import export_inference_graph
import export_inference_graph
class ExportInferenceGraphTest(tf.test.TestCase):
......
......@@ -425,6 +425,7 @@ def inception_v3(inputs,
prediction_fn=slim.softmax,
spatial_squeeze=True,
reuse=None,
create_aux_logits=True,
scope='InceptionV3'):
"""Inception model from http://arxiv.org/abs/1512.00567.
......@@ -457,6 +458,7 @@ def inception_v3(inputs,
of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
create_aux_logits: Whether to create the auxiliary logits.
scope: Optional variable_scope.
Returns:
......@@ -481,30 +483,31 @@ def inception_v3(inputs,
depth_multiplier=depth_multiplier)
# Auxiliary Head logits
with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
stride=1, padding='SAME'):
aux_logits = end_points['Mixed_6e']
with tf.variable_scope('AuxLogits'):
aux_logits = slim.avg_pool2d(
aux_logits, [5, 5], stride=3, padding='VALID',
scope='AvgPool_1a_5x5')
aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
scope='Conv2d_1b_1x1')
# Shape of feature map before the final layer.
kernel_size = _reduced_kernel_size_for_small_input(
aux_logits, [5, 5])
aux_logits = slim.conv2d(
aux_logits, depth(768), kernel_size,
weights_initializer=trunc_normal(0.01),
padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size))
aux_logits = slim.conv2d(
aux_logits, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, weights_initializer=trunc_normal(0.001),
scope='Conv2d_2b_1x1')
if spatial_squeeze:
aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze')
end_points['AuxLogits'] = aux_logits
if create_aux_logits:
with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
stride=1, padding='SAME'):
aux_logits = end_points['Mixed_6e']
with tf.variable_scope('AuxLogits'):
aux_logits = slim.avg_pool2d(
aux_logits, [5, 5], stride=3, padding='VALID',
scope='AvgPool_1a_5x5')
aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
scope='Conv2d_1b_1x1')
# Shape of feature map before the final layer.
kernel_size = _reduced_kernel_size_for_small_input(
aux_logits, [5, 5])
aux_logits = slim.conv2d(
aux_logits, depth(768), kernel_size,
weights_initializer=trunc_normal(0.01),
padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size))
aux_logits = slim.conv2d(
aux_logits, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, weights_initializer=trunc_normal(0.001),
scope='Conv2d_2b_1x1')
if spatial_squeeze:
aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze')
end_points['AuxLogits'] = aux_logits
# Final pooling and prediction
with tf.variable_scope('Logits'):
......
......@@ -27,6 +27,8 @@ As described in https://arxiv.org/abs/1704.04861.
100% Mobilenet V1 (base) with input size 224x224:
See mobilenet_v1()
Layer params macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D: 864 10,838,016
......@@ -62,6 +64,8 @@ Total: 3,185,088 567,716,352
75% Mobilenet V1 (base) with input size 128x128:
See mobilenet_v1_075()
Layer params macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D: 648 2,654,208
......@@ -102,6 +106,7 @@ from __future__ import division
from __future__ import print_function
from collections import namedtuple
import functools
import tensorflow as tf
......@@ -335,6 +340,17 @@ def mobilenet_v1(inputs,
mobilenet_v1.default_image_size = 224
def wrapped_partial(func, *args, **kwargs):
partial_func = functools.partial(func, *args, **kwargs)
functools.update_wrapper(partial_func, func)
return partial_func
mobilenet_v1_075 = wrapped_partial(mobilenet_v1, depth_multiplier=0.75)
mobilenet_v1_050 = wrapped_partial(mobilenet_v1, depth_multiplier=0.50)
mobilenet_v1_025 = wrapped_partial(mobilenet_v1, depth_multiplier=0.25)
def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
"""Define kernel size which is automatically reduced for small input.
......
......@@ -54,6 +54,9 @@ networks_map = {'alexnet_v2': alexnet.alexnet_v2,
'resnet_v2_152': resnet_v2.resnet_v2_152,
'resnet_v2_200': resnet_v2.resnet_v2_200,
'mobilenet_v1': mobilenet_v1.mobilenet_v1,
'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_075,
'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_050,
'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_025,
}
arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope,
......@@ -78,6 +81,9 @@ arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope,
'resnet_v2_152': resnet_v2.resnet_arg_scope,
'resnet_v2_200': resnet_v2.resnet_arg_scope,
'mobilenet_v1': mobilenet_v1.mobilenet_v1_arg_scope,
'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_arg_scope,
'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_arg_scope,
'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_arg_scope,
}
......
......@@ -199,7 +199,9 @@ def stack_blocks_dense(net, blocks, output_stride=None,
def resnet_arg_scope(weight_decay=0.0001,
batch_norm_decay=0.997,
batch_norm_epsilon=1e-5,
batch_norm_scale=True):
batch_norm_scale=True,
activation_fn=tf.nn.relu,
use_batch_norm=True):
"""Defines the default ResNet arg scope.
TODO(gpapan): The batch-normalization related default values above are
......@@ -215,6 +217,8 @@ def resnet_arg_scope(weight_decay=0.0001,
normalizing activations by their variance in batch normalization.
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
activations in the batch normalization layer.
activation_fn: The activation function which is used in ResNet.
use_batch_norm: Whether or not to use batch normalization.
Returns:
An `arg_scope` to use for the resnet models.
......@@ -230,8 +234,8 @@ def resnet_arg_scope(weight_decay=0.0001,
[slim.conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=slim.variance_scaling_initializer(),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
activation_fn=activation_fn,
normalizer_fn=slim.batch_norm if use_batch_norm else None,
normalizer_params=batch_norm_params):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
# The following implies padding='SAME' for pool1, which makes feature
......
......@@ -66,8 +66,14 @@ slim = tf.contrib.slim
@slim.add_arg_scope
def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
outputs_collections=None, scope=None):
def bottleneck(inputs,
depth,
depth_bottleneck,
stride,
rate=1,
outputs_collections=None,
scope=None,
use_bounded_activations=False):
"""Bottleneck residual unit variant with BN after convolutions.
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
......@@ -86,6 +92,8 @@ def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
rate: An integer, rate for atrous convolution.
outputs_collections: Collection to add the ResNet unit output.
scope: Optional variable_scope.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
Returns:
The ResNet unit's output.
......@@ -95,8 +103,12 @@ def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
if depth == depth_in:
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
else:
shortcut = slim.conv2d(inputs, depth, [1, 1], stride=stride,
activation_fn=None, scope='shortcut')
shortcut = slim.conv2d(
inputs,
depth, [1, 1],
stride=stride,
activation_fn=tf.nn.relu6 if use_bounded_activations else None,
scope='shortcut')
residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
scope='conv1')
......@@ -105,7 +117,12 @@ def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
residual = slim.conv2d(residual, depth, [1, 1], stride=1,
activation_fn=None, scope='conv3')
output = tf.nn.relu(shortcut + residual)
if use_bounded_activations:
# Use clip_by_value to simulate bandpass activation.
residual = tf.clip_by_value(residual, -6.0, 6.0)
output = tf.nn.relu6(shortcut + residual)
else:
output = tf.nn.relu(shortcut + residual)
return slim.utils.collect_named_outputs(outputs_collections,
sc.original_name_scope,
......@@ -119,7 +136,7 @@ def resnet_v1(inputs,
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope=None):
"""Generator for v1 ResNet models.
......
......@@ -251,6 +251,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_small'):
"""A shallow and thin ResNet v1 for faster tests."""
......@@ -266,6 +267,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=global_pool,
output_stride=output_stride,
include_root_block=include_root_block,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)
......@@ -276,6 +278,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
......@@ -307,6 +310,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -325,6 +329,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
include_root_block=False,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 64, 64, 4],
......@@ -345,6 +350,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
num_classes,
global_pool=global_pool,
output_stride=output_stride,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -391,6 +397,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, _ = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(),
......
......@@ -115,7 +115,7 @@ def resnet_v2(inputs,
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope=None):
"""Generator for v2 (preactivation) ResNet models.
......@@ -251,7 +251,7 @@ def resnet_v2_50(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_50'):
"""ResNet-50 model of [1]. See resnet_v2() for arg and return description."""
......@@ -273,7 +273,7 @@ def resnet_v2_101(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_101'):
"""ResNet-101 model of [1]. See resnet_v2() for arg and return description."""
......@@ -295,7 +295,7 @@ def resnet_v2_152(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_152'):
"""ResNet-152 model of [1]. See resnet_v2() for arg and return description."""
......@@ -317,7 +317,7 @@ def resnet_v2_200(inputs,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=False,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_200'):
"""ResNet-200 model of [2]. See resnet_v2() for arg and return description."""
......
......@@ -251,6 +251,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=True,
reuse=None,
scope='resnet_v2_small'):
"""A shallow and thin ResNet v2 for faster tests."""
......@@ -266,6 +267,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
global_pool=global_pool,
output_stride=output_stride,
include_root_block=include_root_block,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)
......@@ -276,6 +278,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
......@@ -307,6 +310,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -325,6 +329,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
_, end_points = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
include_root_block=False,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 64, 64, 4],
......@@ -345,6 +350,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
num_classes,
global_pool=global_pool,
output_stride=output_stride,
spatial_squeeze=False,
scope='resnet')
endpoint_to_shape = {
'resnet/block1': [2, 41, 41, 4],
......@@ -393,6 +399,7 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
with slim.arg_scope(resnet_utils.resnet_arg_scope()):
logits, _ = self._resnet_small(inputs, num_classes,
global_pool=global_pool,
spatial_squeeze=False,
scope='resnet')
self.assertTrue(logits.op.name.startswith('resnet/logits'))
self.assertListEqual(logits.get_shape().as_list(),
......
......@@ -87,8 +87,9 @@ def vgg_a(inputs,
fc_conv_padding: the type of padding to use for the fully connected layer
that is implemented as a convolutional layer. Use 'SAME' padding if you
are applying the network in a fully convolutional manner and want to
get a prediction map downsampled by a factor of 32 as an output. Otherwise,
the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
get a prediction map downsampled by a factor of 32 as an output.
Otherwise, the output prediction map will be (input / 32) - 6 in case of
'VALID' padding.
Returns:
the last op containing the log predictions and end_points dict.
......@@ -152,8 +153,9 @@ def vgg_16(inputs,
fc_conv_padding: the type of padding to use for the fully connected layer
that is implemented as a convolutional layer. Use 'SAME' padding if you
are applying the network in a fully convolutional manner and want to
get a prediction map downsampled by a factor of 32 as an output. Otherwise,
the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
get a prediction map downsampled by a factor of 32 as an output.
Otherwise, the output prediction map will be (input / 32) - 6 in case of
'VALID' padding.
Returns:
the last op containing the log predictions and end_points dict.
......@@ -217,8 +219,10 @@ def vgg_19(inputs,
fc_conv_padding: the type of padding to use for the fully connected layer
that is implemented as a convolutional layer. Use 'SAME' padding if you
are applying the network in a fully convolutional manner and want to
get a prediction map downsampled by a factor of 32 as an output. Otherwise,
the output prediction map will be (input / 32) - 6 in case of 'VALID' padding.
get a prediction map downsampled by a factor of 32 as an output.
Otherwise, the output prediction map will be (input / 32) - 6 in case of
'VALID' padding.
Returns:
the last op containing the log predictions and end_points dict.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment