Commit dff0f0c1 authored by Alexander Gorban

Merge branch 'master' of github.com:tensorflow/models

parents da341f70 36203f09
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Configuration variables."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
DATA_DIR = 'data'
MNIST_BINARIZED = 'mnist_salakhutdinov_07-19-2017.pkl'
MNIST_FLOAT = 'mnist_train_xs_07-19-2017.npy'
OMNIGLOT = 'omniglot_07-19-2017.mat'
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library of datasets for REBAR."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import os
import scipy.io
import numpy as np
import cPickle as pickle
import tensorflow as tf
import config
gfile = tf.gfile
def load_data(hparams):
# Load data
if hparams.task in ['sbn', 'sp']:
reader = read_MNIST
elif hparams.task == 'omni':
reader = read_omniglot
x_train, x_valid, x_test = reader(binarize=not hparams.dynamic_b)
return x_train, x_valid, x_test
def read_MNIST(binarize=False):
"""Reads in MNIST images.
Args:
binarize: whether to use the fixed binarization
Returns:
x_train: 50k training images
x_valid: 10k validation images
x_test: 10k test images
"""
with gfile.FastGFile(os.path.join(config.DATA_DIR, config.MNIST_BINARIZED), 'r') as f:
(x_train, _), (x_valid, _), (x_test, _) = pickle.load(f)
if not binarize:
with gfile.FastGFile(os.path.join(config.DATA_DIR, config.MNIST_FLOAT), 'r') as f:
x_train = np.load(f).reshape(-1, 784)
return x_train, x_valid, x_test
def read_omniglot(binarize=False):
"""Reads in Omniglot images.
Args:
binarize: whether to use the fixed binarization
Returns:
x_train: training images
x_valid: validation images
x_test: test images
"""
n_validation=1345
def reshape_data(data):
return data.reshape((-1, 28, 28)).reshape((-1, 28*28), order='fortran')
omni_raw = scipy.io.loadmat(os.path.join(config.DATA_DIR, config.OMNIGLOT))
train_data = reshape_data(omni_raw['data'].T.astype('float32'))
test_data = reshape_data(omni_raw['testdata'].T.astype('float32'))
# Binarize the data with a fixed seed
if binarize:
np.random.seed(5)
train_data = (np.random.rand(*train_data.shape) < train_data).astype(float)
test_data = (np.random.rand(*test_data.shape) < test_data).astype(float)
shuffle_seed = 123
permutation = np.random.RandomState(seed=shuffle_seed).permutation(train_data.shape[0])
train_data = train_data[permutation]
x_train = train_data[:-n_validation]
x_valid = train_data[-n_validation:]
x_test = test_data
return x_train, x_valid, x_test
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Download MNIST, Omniglot datasets for Rebar."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import urllib
import gzip
import os
import config
import struct
import numpy as np
import cPickle as pickle
import datasets
MNIST_URL = 'see README'
MNIST_BINARIZED_URL = 'see README'
OMNIGLOT_URL = 'see README'
MNIST_FLOAT_TRAIN = 'train-images-idx3-ubyte'
def load_mnist_float(local_filename):
with open(local_filename, 'rb') as f:
f.seek(4)
nimages, rows, cols = struct.unpack('>iii', f.read(12))
dim = rows*cols
images = np.fromfile(f, dtype=np.dtype(np.ubyte))
images = (images/255.0).astype('float32').reshape((nimages, dim))
return images
if __name__ == '__main__':
if not os.path.exists(config.DATA_DIR):
os.makedirs(config.DATA_DIR)
# Get MNIST and convert to npy file
local_filename = os.path.join(config.DATA_DIR, MNIST_FLOAT_TRAIN)
if not os.path.exists(local_filename):
urllib.urlretrieve("%s/%s.gz" % (MNIST_URL, MNIST_FLOAT_TRAIN), local_filename+'.gz')
with gzip.open(local_filename+'.gz', 'rb') as f:
file_content = f.read()
with open(local_filename, 'wb') as f:
f.write(file_content)
os.remove(local_filename+'.gz')
mnist_float_train = load_mnist_float(local_filename)[:-10000]
# save in a nice format
np.save(os.path.join(config.DATA_DIR, config.MNIST_FLOAT), mnist_float_train)
# Get binarized MNIST
splits = ['train', 'valid', 'test']
mnist_binarized = []
for split in splits:
filename = 'binarized_mnist_%s.amat' % split
url = '%s/binarized_mnist_%s.amat' % (MNIST_BINARIZED_URL, split)
local_filename = os.path.join(config.DATA_DIR, filename)
if not os.path.exists(local_filename):
urllib.urlretrieve(url, local_filename)
with open(local_filename, 'rb') as f:
mnist_binarized.append((np.array([map(int, line.split()) for line in f.readlines()]).astype('float32'), None))
# save in a nice format
with open(os.path.join(config.DATA_DIR, config.MNIST_BINARIZED), 'w') as out:
pickle.dump(mnist_binarized, out)
# Get Omniglot
local_filename = os.path.join(config.DATA_DIR, config.OMNIGLOT)
if not os.path.exists(local_filename):
urllib.urlretrieve(OMNIGLOT_URL,
local_filename)
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Logger for REBAR"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class Logger:
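# No-op stub: log() and flush() intentionally do nothing; replace with a real
# logging backend to record metrics.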
def __init__(self):
pass
def log(self, key, value):
pass
def flush(self):
pass
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
import numpy as np
from scipy.misc import logsumexp
import tensorflow.contrib.slim as slim
from tensorflow.python.ops import init_ops
import utils as U
FLAGS = tf.flags.FLAGS
Q_COLLECTION = "q_collection"
P_COLLECTION = "p_collection"
class SBN(object): # REINFORCE
def __init__(self,
hparams,
activation_func=tf.nn.sigmoid,
mean_xs = None,
eval_mode=False):
self.eval_mode = eval_mode
self.hparams = hparams
self.mean_xs = mean_xs
self.train_bias= -np.log(1./np.clip(mean_xs, 0.001, 0.999)-1.).astype(np.float32)
self.activation_func = activation_func
self.n_samples = tf.placeholder('int32')
self.x = tf.placeholder('float', [None, self.hparams.n_input])
self._x = tf.tile(self.x, [self.n_samples, 1])
self.batch_size = tf.shape(self._x)[0]
self.uniform_samples = dict()
self.uniform_samples_v = dict()
self.prior = tf.Variable(tf.zeros([self.hparams.n_hidden],
dtype=tf.float32),
name='p_prior',
collections=[tf.GraphKeys.GLOBAL_VARIABLES, P_COLLECTION])
self.run_recognition_network = False
self.run_generator_network = False
# Initialize temperature
self.pre_temperature_variable = tf.Variable(
np.log(self.hparams.temperature),
trainable=False,
dtype=tf.float32)
self.temperature_variable = tf.exp(self.pre_temperature_variable)
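# The temperature is stored as its log and exponentiated so it stays positive
# if it is later adapted (see get_dynamic_rebar_gradient).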
self.global_step = tf.Variable(0, trainable=False)
self.baseline_loss = []
self.ema = tf.train.ExponentialMovingAverage(decay=0.999)
self.maintain_ema_ops = []
self.optimizer_class = tf.train.AdamOptimizer(
learning_rate=1*self.hparams.learning_rate,
beta2=self.hparams.beta2)
self._generate_randomness()
self._create_network()
def initialize(self, sess):
self.sess = sess
def _create_eta(self, shape=[], collection='CV'):
return 2 * tf.sigmoid(tf.Variable(tf.zeros(shape), trainable=False,
collections=[collection, tf.GraphKeys.GLOBAL_VARIABLES, Q_COLLECTION]))
def _create_baseline(self, n_output=1, n_hidden=100,
is_zero_init=False,
collection='BASELINE'):
# center input
h = self._x
if self.mean_xs is not None:
h -= self.mean_xs
if is_zero_init:
initializer = init_ops.zeros_initializer()
else:
initializer = slim.variance_scaling_initializer()
with slim.arg_scope([slim.fully_connected],
variables_collections=[collection, Q_COLLECTION],
trainable=False,
weights_initializer=initializer):
h = slim.fully_connected(h, n_hidden, activation_fn=tf.nn.tanh)
baseline = slim.fully_connected(h, n_output, activation_fn=None)
if n_output == 1:
baseline = tf.reshape(baseline, [-1]) # very important to reshape
return baseline
def _create_transformation(self, input, n_output, reuse, scope_prefix):
"""Create the deterministic transformation between stochastic layers.
If self.hparams.nonlinear:
2 x tanh layers
Else:
1 x linear layer
"""
if self.hparams.nonlinear:
h = slim.fully_connected(input,
self.hparams.n_hidden,
reuse=reuse,
activation_fn=tf.nn.tanh,
scope='%s_nonlinear_1' % scope_prefix)
h = slim.fully_connected(h,
self.hparams.n_hidden,
reuse=reuse,
activation_fn=tf.nn.tanh,
scope='%s_nonlinear_2' % scope_prefix)
h = slim.fully_connected(h,
n_output,
reuse=reuse,
activation_fn=None,
scope='%s' % scope_prefix)
else:
h = slim.fully_connected(input,
n_output,
reuse=reuse,
activation_fn=None,
scope='%s' % scope_prefix)
return h
def _recognition_network(self, sampler=None, log_likelihood_func=None):
"""x values -> samples from Q and return log Q(h|x)."""
samples = {}
reuse = None if not self.run_recognition_network else True
# Set defaults
if sampler is None:
sampler = self._random_sample
if log_likelihood_func is None:
log_likelihood_func = lambda sample, log_params: (
U.binary_log_likelihood(sample['activation'], log_params))
logQ = []
if self.hparams.task in ['sbn', 'omni']:
# Initialize the edge case
samples[-1] = {'activation': self._x}
if self.mean_xs is not None:
samples[-1]['activation'] -= self.mean_xs # center the input
samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0
with slim.arg_scope([slim.fully_connected],
weights_initializer=slim.variance_scaling_initializer(),
variables_collections=[Q_COLLECTION]):
for i in xrange(self.hparams.n_layer):
# Set up the input to the layer
input = 2.0*samples[i-1]['activation'] - 1.0
# Create the conditional distribution (output is the logits)
h = self._create_transformation(input,
n_output=self.hparams.n_hidden,
reuse=reuse,
scope_prefix='q_%d' % i)
samples[i] = sampler(h, self.uniform_samples[i], i)
logQ.append(log_likelihood_func(samples[i], h))
self.run_recognition_network = True
return logQ, samples
elif self.hparams.task == 'sp':
# Initialize the edge case
samples[-1] = {'activation': tf.split(self._x,
num_or_size_splits=2,
axis=1)[0]} # top half of digit
if self.mean_xs is not None:
samples[-1]['activation'] -= np.split(self.mean_xs, 2, 0)[0] # center the input
samples[-1]['activation'] = (samples[-1]['activation'] + 1)/2.0
with slim.arg_scope([slim.fully_connected],
weights_initializer=slim.variance_scaling_initializer(),
variables_collections=[Q_COLLECTION]):
for i in xrange(self.hparams.n_layer):
# Set up the input to the layer
input = 2.0*samples[i-1]['activation'] - 1.0
# Create the conditional distribution (output is the logits)
h = self._create_transformation(input,
n_output=self.hparams.n_hidden,
reuse=reuse,
scope_prefix='q_%d' % i)
samples[i] = sampler(h, self.uniform_samples[i], i)
logQ.append(log_likelihood_func(samples[i], h))
self.run_recognition_network = True
return logQ, samples
def _generator_network(self, samples, logQ, log_likelihood_func=None):
'''Returns learning signal and function.
This is the implementation for SBNs for the ELBO.
Args:
samples: dictionary of sampled latent variables
logQ: list of log q(h_i) terms
log_likelihood_func: function used to compute log probs for the latent
variables
Returns:
learning_signal: the "reward" function
function_term: part of the function that depends on the parameters
and needs to have the gradient taken through
'''
reuse=None if not self.run_generator_network else True
if self.hparams.task in ['sbn', 'omni']:
if log_likelihood_func is None:
log_likelihood_func = lambda sample, log_params: (
U.binary_log_likelihood(sample['activation'], log_params))
logPPrior = log_likelihood_func(
samples[self.hparams.n_layer-1],
tf.expand_dims(self.prior, 0))
with slim.arg_scope([slim.fully_connected],
weights_initializer=slim.variance_scaling_initializer(),
variables_collections=[P_COLLECTION]):
for i in reversed(xrange(self.hparams.n_layer)):
if i == 0:
n_output = self.hparams.n_input
else:
n_output = self.hparams.n_hidden
input = 2.0*samples[i]['activation']-1.0
h = self._create_transformation(input,
n_output,
reuse=reuse,
scope_prefix='p_%d' % i)
if i == 0:
# Assume output is binary
logP = U.binary_log_likelihood(self._x, h + self.train_bias)
else:
logPPrior += log_likelihood_func(samples[i-1], h)
self.run_generator_network = True
return logP + logPPrior - tf.add_n(logQ), logP + logPPrior
elif self.hparams.task == 'sp':
with slim.arg_scope([slim.fully_connected],
weights_initializer=slim.variance_scaling_initializer(),
variables_collections=[P_COLLECTION]):
n_output = int(self.hparams.n_input/2)
i = self.hparams.n_layer - 1 # use the last layer
input = 2.0*samples[i]['activation']-1.0
h = self._create_transformation(input,
n_output,
reuse=reuse,
scope_prefix='p_%d' % i)
# Predict on the lower half of the image
logP = U.binary_log_likelihood(tf.split(self._x,
num_or_size_splits=2,
axis=1)[1],
h + np.split(self.train_bias, 2, 0)[1])
self.run_generator_network = True
return logP, logP
def _create_loss(self):
# Hard loss
logQHard, samples = self._recognition_network()
reinforce_learning_signal, reinforce_model_grad = self._generator_network(samples, logQHard)
logQHard = tf.add_n(logQHard)
# REINFORCE
learning_signal = tf.stop_gradient(U.center(reinforce_learning_signal))
self.optimizerLoss = -(learning_signal*logQHard +
reinforce_model_grad)
self.lHat = map(tf.reduce_mean, [
reinforce_learning_signal,
U.rms(learning_signal),
])
return reinforce_learning_signal
def _reshape(self, t):
return tf.transpose(tf.reshape(t,
[self.n_samples, -1]))
def compute_tensor_variance(self, t):
"""Compute the mean per component variance.
Use a moving average to estimate the required moments.
"""
t_sq = tf.reduce_mean(tf.square(t))
self.maintain_ema_ops.append(self.ema.apply([t, t_sq]))
# mean per component variance
variance_estimator = (self.ema.average(t_sq) -
tf.reduce_mean(
tf.square(self.ema.average(t))))
return variance_estimator
def _create_train_op(self, grads_and_vars, extra_grads_and_vars=[]):
'''
Args:
grads_and_vars: gradients to apply and compute running average variance
extra_grads_and_vars: gradients to apply (not used to compute average variance)
'''
# Variance summaries
first_moment = U.vectorize(grads_and_vars, skip_none=True)
second_moment = tf.square(first_moment)
self.maintain_ema_ops.append(self.ema.apply([first_moment, second_moment]))
# Add baseline losses
if len(self.baseline_loss) > 0:
mean_baseline_loss = tf.reduce_mean(tf.add_n(self.baseline_loss))
extra_grads_and_vars += self.optimizer_class.compute_gradients(
mean_baseline_loss,
var_list=tf.get_collection('BASELINE'))
# Ensure that all required tensors are computed before updates are executed
extra_optimizer = tf.train.AdamOptimizer(
learning_rate=10*self.hparams.learning_rate,
beta2=self.hparams.beta2)
with tf.control_dependencies(
[tf.group(*[g for g, _ in (grads_and_vars + extra_grads_and_vars) if g is not None])]):
# Filter out the P_COLLECTION variables if we're in eval mode
if self.eval_mode:
grads_and_vars = [(g, v) for g, v in grads_and_vars
if v not in tf.get_collection(P_COLLECTION)]
train_op = self.optimizer_class.apply_gradients(grads_and_vars,
global_step=self.global_step)
if len(extra_grads_and_vars) > 0:
extra_train_op = extra_optimizer.apply_gradients(extra_grads_and_vars)
else:
extra_train_op = tf.no_op()
self.optimizer = tf.group(train_op, extra_train_op, *self.maintain_ema_ops)
# per parameter variance
variance_estimator = (self.ema.average(second_moment) -
tf.square(self.ema.average(first_moment)))
self.grad_variance = tf.reduce_mean(variance_estimator)
def _create_network(self):
logF = self._create_loss()
self.optimizerLoss = tf.reduce_mean(self.optimizerLoss)
# Setup optimizer
grads_and_vars = self.optimizer_class.compute_gradients(self.optimizerLoss)
self._create_train_op(grads_and_vars)
# Create IWAE lower bound for evaluation
self.logF = self._reshape(logF)
self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
tf.log(tf.to_float(self.n_samples)))
def partial_fit(self, X, n_samples=1):
if hasattr(self, 'grad_variances'):
grad_variance_field_to_return = self.grad_variances
else:
grad_variance_field_to_return = self.grad_variance
_, res, grad_variance, step, temperature = self.sess.run(
(self.optimizer, self.lHat, grad_variance_field_to_return, self.global_step, self.temperature_variable),
feed_dict={self.x: X, self.n_samples: n_samples})
return res, grad_variance, step, temperature
def partial_grad(self, X, n_samples=1):
control_variate_grads, step = self.sess.run(
(self.control_variate_grads, self.global_step),
feed_dict={self.x: X, self.n_samples: n_samples})
return control_variate_grads, step
def partial_eval(self, X, n_samples=5):
if n_samples < 1000:
res, iwae = self.sess.run(
(self.lHat, self.iwae),
feed_dict={self.x: X, self.n_samples: n_samples})
res = [iwae] + res
else: # special case to handle OOM
assert n_samples % 100 == 0, "When using a large number of samples, it must be divisible by 100"
res = []
for i in xrange(int(n_samples/100)):
logF, = self.sess.run(
(self.logF,),
feed_dict={self.x: X, self.n_samples: 100})
res.append(logsumexp(logF, axis=1))
res = [np.mean(logsumexp(res, axis=0) - np.log(n_samples))]
return res
# Random samplers
def _mean_sample(self, log_alpha, _, layer):
"""Returns mean of random variables parameterized by log_alpha."""
mu = tf.nn.sigmoid(log_alpha)
return {
'preactivation': mu,
'activation': mu,
'log_param': log_alpha,
}
def _generate_randomness(self):
for i in xrange(self.hparams.n_layer):
self.uniform_samples[i] = tf.stop_gradient(tf.random_uniform(
[self.batch_size, self.hparams.n_hidden]))
def _u_to_v(self, log_alpha, u, eps = 1e-8):
"""Convert u to tied randomness in v."""
u_prime = tf.nn.sigmoid(-log_alpha) # g(u') = 0
v_1 = (u - u_prime) / tf.clip_by_value(1 - u_prime, eps, 1)
v_1 = tf.clip_by_value(v_1, 0, 1)
v_1 = tf.stop_gradient(v_1)
v_1 = v_1*(1 - u_prime) + u_prime
v_0 = u / tf.clip_by_value(u_prime, eps, 1)
v_0 = tf.clip_by_value(v_0, 0, 1)
v_0 = tf.stop_gradient(v_0)
v_0 = v_0 * u_prime
v = tf.where(u > u_prime, v_1, v_0)
v = tf.check_numerics(v, 'v sampling is not numerically stable.')
v = v + tf.stop_gradient(-v + u) # v and u are the same up to numerical errors
return v
def _random_sample(self, log_alpha, u, layer):
"""Returns sampled random variables parameterized by log_alpha."""
# Generate tied randomness for later
if layer not in self.uniform_samples_v:
self.uniform_samples_v[layer] = self._u_to_v(log_alpha, u)
# Sample random variable underlying softmax/argmax
x = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
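# x is a Logistic(log_alpha, 1) sample, so thresholding at 0 yields a
# Bernoulli(sigmoid(log_alpha)) sample (the Gumbel-max trick for binary variables).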
samples = tf.stop_gradient(tf.to_float(x > 0))
return {
'preactivation': x,
'activation': samples,
'log_param': log_alpha,
}
def _random_sample_soft(self, log_alpha, u, layer, temperature=None):
"""Returns sampled random variables parameterized by log_alpha."""
if temperature is None:
temperature = self.hparams.temperature
# Sample random variable underlying softmax/argmax
x = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
x /= tf.expand_dims(temperature, -1)
if self.hparams.muprop_relaxation:
y = tf.nn.sigmoid(x + log_alpha * tf.expand_dims(temperature/(temperature + 1), -1))
else:
y = tf.nn.sigmoid(x)
return {
'preactivation': x,
'activation': y,
'log_param': log_alpha
}
def _random_sample_soft_v(self, log_alpha, _, layer, temperature=None):
"""Returns sampled random variables parameterized by log_alpha."""
v = self.uniform_samples_v[layer]
return self._random_sample_soft(log_alpha, v, layer, temperature)
def get_gumbel_gradient(self):
logQ, softSamples = self._recognition_network(sampler=self._random_sample_soft)
# For the SBN tasks, the first output of _generator_network already subtracts
# sum(logQ), so it is the relaxed (Gumbel-Softmax) ELBO.
softELBO, _ = self._generator_network(softSamples, logQ)
gumbel_gradient = (self.optimizer_class.
compute_gradients(softELBO))
debug = {
'softELBO': softELBO,
}
return gumbel_gradient, debug
# samplers used for quadratic version
def _random_sample_switch(self, log_alpha, u, layer, switch_layer, temperature=None):
"""Run partial discrete, then continuous path.
Args:
switch_layer: this layer and beyond will be continuous
"""
if layer < switch_layer:
return self._random_sample(log_alpha, u, layer)
else:
return self._random_sample_soft(log_alpha, u, layer, temperature)
def _random_sample_switch_v(self, log_alpha, u, layer, switch_layer, temperature=None):
"""Run partial discrete, then continuous path.
Args:
switch_layer: this layer and beyond will be continuous
"""
if layer < switch_layer:
return self._random_sample(log_alpha, u, layer)
else:
return self._random_sample_soft_v(log_alpha, u, layer, temperature)
# #####
# Gradient computation
# #####
def get_nvil_gradient(self):
"""Compute the NVIL gradient."""
# Hard loss
logQHard, samples = self._recognition_network()
ELBO, reinforce_model_grad = self._generator_network(samples, logQHard)
logQHard = tf.add_n(logQHard)
# Add baselines (no variance normalization)
learning_signal = tf.stop_gradient(ELBO) - self._create_baseline()
# Set up losses
self.baseline_loss.append(tf.square(learning_signal))
optimizerLoss = -(tf.stop_gradient(learning_signal)*logQHard +
reinforce_model_grad)
optimizerLoss = tf.reduce_mean(optimizerLoss)
nvil_gradient = self.optimizer_class.compute_gradients(optimizerLoss)
debug = {
'ELBO': ELBO,
'RMS of centered learning signal': U.rms(learning_signal),
}
return nvil_gradient, debug
def get_simple_muprop_gradient(self):
""" Computes the simple muprop gradient.
This muprop control variate does not include the linear term.
"""
# Hard loss
logQHard, hardSamples = self._recognition_network()
hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)
# Soft loss
logQ, muSamples = self._recognition_network(sampler=self._mean_sample)
muELBO, _ = self._generator_network(muSamples, logQ)
scaling_baseline = self._create_eta(collection='BASELINE')
learning_signal = (hardELBO
- scaling_baseline * muELBO
- self._create_baseline())
self.baseline_loss.append(tf.square(learning_signal))
optimizerLoss = -(tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
+ reinforce_model_grad)
optimizerLoss = tf.reduce_mean(optimizerLoss)
simple_muprop_gradient = (self.optimizer_class.
compute_gradients(optimizerLoss))
debug = {
'ELBO': hardELBO,
'muELBO': muELBO,
'RMS': U.rms(learning_signal),
}
return simple_muprop_gradient, debug
def get_muprop_gradient(self):
"""
random sample function that actually returns mean
new forward pass that returns logQ as a list
can get x_i from samples
"""
# Hard loss
logQHard, hardSamples = self._recognition_network()
hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)
# Soft loss
logQ, muSamples = self._recognition_network(sampler=self._mean_sample)
muELBO, _ = self._generator_network(muSamples, logQ)
# Compute gradients
muELBOGrads = tf.gradients(tf.reduce_sum(muELBO),
[ muSamples[i]['activation'] for
i in xrange(self.hparams.n_layer) ])
# Compute MuProp gradient estimates
learning_signal = hardELBO
optimizerLoss = 0.0
learning_signals = []
for i in xrange(self.hparams.n_layer):
dfDiff = tf.reduce_sum(
muELBOGrads[i] * (hardSamples[i]['activation'] -
muSamples[i]['activation']),
axis=1)
dfMu = tf.reduce_sum(
tf.stop_gradient(muELBOGrads[i]) *
tf.nn.sigmoid(hardSamples[i]['log_param']),
axis=1)
scaling_baseline_0 = self._create_eta(collection='BASELINE')
scaling_baseline_1 = self._create_eta(collection='BASELINE')
learning_signals.append(learning_signal - scaling_baseline_0 * muELBO - scaling_baseline_1 * dfDiff - self._create_baseline())
self.baseline_loss.append(tf.square(learning_signals[i]))
optimizerLoss += (
logQHard[i] * tf.stop_gradient(learning_signals[i]) +
tf.stop_gradient(scaling_baseline_1) * dfMu)
optimizerLoss += reinforce_model_grad
optimizerLoss *= -1
optimizerLoss = tf.reduce_mean(optimizerLoss)
muprop_gradient = self.optimizer_class.compute_gradients(optimizerLoss)
debug = {
'ELBO': hardELBO,
'muELBO': muELBO,
}
debug.update(dict([
('RMS learning signal layer %d' % i, U.rms(learning_signal))
for (i, learning_signal) in enumerate(learning_signals)]))
return muprop_gradient, debug
# REBAR gradient helper functions
def _create_gumbel_control_variate(self, logQHard, temperature=None):
'''Calculate gumbel control variate.
'''
if temperature is None:
temperature = self.hparams.temperature
logQ, softSamples = self._recognition_network(sampler=functools.partial(
self._random_sample_soft, temperature=temperature))
softELBO, _ = self._generator_network(softSamples, logQ)
logQ = tf.add_n(logQ)
# Generate the softELBO_v (should be the same value but different grads)
logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
self._random_sample_soft_v, temperature=temperature))
softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)
logQ_v = tf.add_n(logQ_v)
# Compute losses
learning_signal = tf.stop_gradient(softELBO_v)
# Control variate
h = (tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
- softELBO + softELBO_v)
extra = (softELBO_v, -softELBO + softELBO_v)
return h, extra
def _create_gumbel_control_variate_quadratic(self, logQHard, temperature=None):
'''Calculate gumbel control variate.
'''
if temperature is None:
temperature = self.hparams.temperature
h = 0
extra = []
for layer in xrange(self.hparams.n_layer):
logQ, softSamples = self._recognition_network(sampler=functools.partial(
self._random_sample_switch, switch_layer=layer, temperature=temperature))
softELBO, _ = self._generator_network(softSamples, logQ)
# Generate the softELBO_v (should be the same value but different grads)
logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
self._random_sample_switch_v, switch_layer=layer, temperature=temperature))
softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)
# Compute losses
learning_signal = tf.stop_gradient(softELBO_v)
# Control variate
h += (tf.stop_gradient(learning_signal) * logQHard[layer]
- softELBO + softELBO_v)
extra.append((softELBO_v, -softELBO + softELBO_v))
return h, extra
def _create_hard_elbo(self):
logQHard, hardSamples = self._recognition_network()
hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)
reinforce_learning_signal = tf.stop_gradient(hardELBO)
# Center learning signal
baseline = self._create_baseline(collection='CV')
reinforce_learning_signal = tf.stop_gradient(reinforce_learning_signal) - baseline
nvil_gradient = (tf.stop_gradient(hardELBO) - baseline) * tf.add_n(logQHard) + reinforce_model_grad
return hardELBO, nvil_gradient, logQHard
def multiply_by_eta(self, h_grads, eta):
# Modifies eta
res = []
eta_statistics = []
for (g, v) in h_grads:
if g is None:
res.append((g, v))
else:
if 'network' not in eta:
eta['network'] = self._create_eta()
res.append((g*eta['network'], v))
eta_statistics.append(eta['network'])
return res, eta_statistics
def multiply_by_eta_per_layer(self, h_grads, eta):
# Modifies eta
res = []
eta_statistics = []
for (g, v) in h_grads:
if g is None:
res.append((g, v))
else:
if v not in eta:
eta[v] = self._create_eta()
res.append((g*eta[v], v))
eta_statistics.append(eta[v])
return res, eta_statistics
def multiply_by_eta_per_unit(self, h_grads, eta):
# Modifies eta
res = []
eta_statistics = []
for (g, v) in h_grads:
if g is None:
res.append((g, v))
else:
if v not in eta:
g_shape = g.get_shape().as_list()
assert len(g_shape) <= 2, 'Gradient has too many dimensions'
if len(g_shape) == 1:
eta[v] = self._create_eta(g_shape)
else:
eta[v] = self._create_eta([1, g_shape[1]])
res.append((g*eta[v], v))
eta_statistics.extend(tf.nn.moments(tf.squeeze(eta[v]), axes=[0]))
return res, eta_statistics
def get_dynamic_rebar_gradient(self):
"""Get the dynamic rebar gradient (t, eta optimized)."""
tiled_pre_temperature = tf.tile([self.pre_temperature_variable],
[self.batch_size])
temperature = tf.exp(tiled_pre_temperature)
hardELBO, nvil_gradient, logQHard = self._create_hard_elbo()
if self.hparams.quadratic:
gumbel_cv, extra = self._create_gumbel_control_variate_quadratic(logQHard, temperature=temperature)
else:
gumbel_cv, extra = self._create_gumbel_control_variate(logQHard, temperature=temperature)
f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient))
eta = {}
h_grads, eta_statistics = self.multiply_by_eta_per_layer(
self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)),
eta)
model_grads = U.add_grads_and_vars(f_grads, h_grads)
total_grads = model_grads
# Construct the variance objective
g = U.vectorize(model_grads, set_none_to_zero=True)
self.maintain_ema_ops.append(self.ema.apply([g]))
gbar = 0 #tf.stop_gradient(self.ema.average(g))
variance_objective = tf.reduce_mean(tf.square(g - gbar))
reinf_g_t = 0
if self.hparams.quadratic:
for layer in xrange(self.hparams.n_layer):
gumbel_learning_signal, _ = extra[layer]
df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
reinf_g_t_i, _ = self.multiply_by_eta_per_layer(
self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * logQHard[layer])),
eta)
reinf_g_t += U.vectorize(reinf_g_t_i, set_none_to_zero=True)
reparam = tf.add_n([reparam_i for _, reparam_i in extra])
else:
gumbel_learning_signal, reparam = extra
df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
reinf_g_t, _ = self.multiply_by_eta_per_layer(
self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * tf.add_n(logQHard))),
eta)
reinf_g_t = U.vectorize(reinf_g_t, set_none_to_zero=True)
reparam_g, _ = self.multiply_by_eta_per_layer(
self.optimizer_class.compute_gradients(tf.reduce_mean(reparam)),
eta)
reparam_g = U.vectorize(reparam_g, set_none_to_zero=True)
reparam_g_t = tf.gradients(tf.reduce_mean(2*tf.stop_gradient(g - gbar)*reparam_g), self.pre_temperature_variable)[0]
variance_objective_grad = tf.reduce_mean(2*(g - gbar)*reinf_g_t) + reparam_g_t
debug = { 'ELBO': hardELBO,
'etas': eta_statistics,
'variance_objective': variance_objective,
}
return total_grads, debug, variance_objective, variance_objective_grad
def get_rebar_gradient(self):
"""Get the rebar gradient."""
hardELBO, nvil_gradient, logQHard = self._create_hard_elbo()
if self.hparams.quadratic:
gumbel_cv, _ = self._create_gumbel_control_variate_quadratic(logQHard)
else:
gumbel_cv, _ = self._create_gumbel_control_variate(logQHard)
f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient))
eta = {}
h_grads, eta_statistics = self.multiply_by_eta_per_layer(
self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)),
eta)
model_grads = U.add_grads_and_vars(f_grads, h_grads)
total_grads = model_grads
# Construct the variance objective
variance_objective = tf.reduce_mean(tf.square(U.vectorize(model_grads, set_none_to_zero=True)))
debug = { 'ELBO': hardELBO,
'etas': eta_statistics,
'variance_objective': variance_objective,
}
return total_grads, debug, variance_objective
###
# Create variants
###
class SBNSimpleMuProp(SBN):
def _create_loss(self):
simple_muprop_gradient, debug = self.get_simple_muprop_gradient()
self.lHat = map(tf.reduce_mean, [
debug['ELBO'],
debug['muELBO'],
])
return debug['ELBO'], simple_muprop_gradient
def _create_network(self):
logF, loss_grads = self._create_loss()
self._create_train_op(loss_grads)
# Create IWAE lower bound for evaluation
self.logF = self._reshape(logF)
self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
tf.log(tf.to_float(self.n_samples)))
class SBNMuProp(SBN):
def _create_loss(self):
muprop_gradient, debug = self.get_muprop_gradient()
self.lHat = map(tf.reduce_mean, [
debug['ELBO'],
debug['muELBO'],
])
return debug['ELBO'], muprop_gradient
def _create_network(self):
logF, loss_grads = self._create_loss()
self._create_train_op(loss_grads)
# Create IWAE lower bound for evaluation
self.logF = self._reshape(logF)
self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
tf.log(tf.to_float(self.n_samples)))
class SBNNVIL(SBN):
def _create_loss(self):
nvil_gradient, debug = self.get_nvil_gradient()
self.lHat = map(tf.reduce_mean, [
debug['ELBO'],
])
return debug['ELBO'], nvil_gradient
def _create_network(self):
logF, loss_grads = self._create_loss()
self._create_train_op(loss_grads)
# Create IWAE lower bound for evaluation
self.logF = self._reshape(logF)
self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
tf.log(tf.to_float(self.n_samples)))
class SBNRebar(SBN):
def _create_loss(self):
rebar_gradient, debug, variance_objective = self.get_rebar_gradient()
self.lHat = map(tf.reduce_mean, [
debug['ELBO'],
])
self.lHat.extend(map(tf.reduce_mean, debug['etas']))
return debug['ELBO'], rebar_gradient, variance_objective
def _create_network(self):
logF, loss_grads, variance_objective = self._create_loss()
# Create additional updates for control variates and temperature
eta_grads = (self.optimizer_class.compute_gradients(variance_objective,
var_list=tf.get_collection('CV')))
self._create_train_op(loss_grads, eta_grads)
# Create IWAE lower bound for evaluation
self.logF = self._reshape(logF)
self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
tf.log(tf.to_float(self.n_samples)))
class SBNDynamicRebar(SBN):
def _create_loss(self):
rebar_gradient, debug, variance_objective, variance_objective_grad = self.get_dynamic_rebar_gradient()
self.lHat = map(tf.reduce_mean, [
debug['ELBO'],
self.temperature_variable,
])
self.lHat.extend(debug['etas'])
return debug['ELBO'], rebar_gradient, variance_objective, variance_objective_grad
def _create_network(self):
logF, loss_grads, variance_objective, variance_objective_grad = self._create_loss()
# Create additional updates for control variates and temperature
eta_grads = (self.optimizer_class.compute_gradients(variance_objective,
var_list=tf.get_collection('CV'))
+ [(variance_objective_grad, self.pre_temperature_variable)])
self._create_train_op(loss_grads, eta_grads)
# Create IWAE lower bound for evaluation
self.logF = self._reshape(logF)
self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
tf.log(tf.to_float(self.n_samples)))
class SBNTrackGradVariances(SBN):
"""Follow NVIL, compute gradient variances for NVIL, MuProp and REBAR."""
def compute_gradient_moments(self, grads_and_vars):
first_moment = U.vectorize(grads_and_vars, set_none_to_zero=True)
second_moment = tf.square(first_moment)
self.maintain_ema_ops.append(self.ema.apply([first_moment, second_moment]))
return self.ema.average(first_moment), self.ema.average(second_moment)
def _create_loss(self):
self.losses = [
('NVIL', self.get_nvil_gradient),
('SimpleMuProp', self.get_simple_muprop_gradient),
('MuProp', self.get_muprop_gradient),
]
moments = []
for k, v in self.losses:
print(k)
gradient, debug = v()
if k == 'SimpleMuProp':
ELBO = debug['ELBO']
gradient_to_follow = gradient
moments.append(self.compute_gradient_moments(
gradient))
self.losses.append(('DynamicREBAR', self.get_dynamic_rebar_gradient))
dynamic_rebar_gradient, _, variance_objective, variance_objective_grad = self.get_dynamic_rebar_gradient()
moments.append(self.compute_gradient_moments(dynamic_rebar_gradient))
self.losses.append(('REBAR', self.get_rebar_gradient))
rebar_gradient, _, variance_objective2 = self.get_rebar_gradient()
moments.append(self.compute_gradient_moments(rebar_gradient))
mu = tf.reduce_mean(tf.stack([f for f, _ in moments]), axis=0)
self.grad_variances = []
deviations = []
for f, s in moments:
self.grad_variances.append(tf.reduce_mean(s - tf.square(mu)))
deviations.append(tf.reduce_mean(tf.square(f - mu)))
self.lHat = map(tf.reduce_mean, [
ELBO,
self.temperature_variable,
variance_objective_grad,
variance_objective_grad*variance_objective_grad,
])
self.lHat.extend(deviations)
self.lHat.append(tf.log(tf.reduce_mean(mu*mu)))
# self.lHat.extend(map(tf.log, grad_variances))
return ELBO, gradient_to_follow, variance_objective + variance_objective2, variance_objective_grad
def _create_network(self):
logF, loss_grads, variance_objective, variance_objective_grad = self._create_loss()
eta_grads = (self.optimizer_class.compute_gradients(variance_objective,
var_list=tf.get_collection('CV'))
+ [(variance_objective_grad, self.pre_temperature_variable)])
self._create_train_op(loss_grads, eta_grads)
# Create IWAE lower bound for evaluation
self.logF = self._reshape(logF)
self.iwae = tf.reduce_mean(U.logSumExp(self.logF, axis=1) -
tf.log(tf.to_float(self.n_samples)))
class SBNGumbel(SBN):
def _random_sample_soft(self, log_alpha, u, layer, temperature=None):
"""Returns sampled random variables parameterized by log_alpha."""
if temperature is None:
temperature = self.hparams.temperature
# Sample random variable underlying softmax/argmax
x = log_alpha + U.safe_log_prob(u) - U.safe_log_prob(1 - u)
x /= temperature
if self.hparams.muprop_relaxation:
x += temperature/(temperature + 1)*log_alpha
y = tf.nn.sigmoid(x)
return {
'preactivation': x,
'activation': y,
'log_param': log_alpha
}
def _create_loss(self):
# Hard loss
logQHard, hardSamples = self._recognition_network()
hardELBO, _ = self._generator_network(hardSamples, logQHard)
logQ, softSamples = self._recognition_network(sampler=self._random_sample_soft)
softELBO, _ = self._generator_network(softSamples, logQ)
self.optimizerLoss = -softELBO
self.lHat = map(tf.reduce_mean, [
hardELBO,
softELBO,
])
return hardELBO
default_hparams = tf.contrib.training.HParams(model='SBNGumbel',
n_hidden=200,
n_input=784,
n_layer=1,
nonlinear=False,
learning_rate=0.001,
temperature=0.5,
n_samples=1,
batch_size=24,
trial=1,
muprop_relaxation=True,
dynamic_b=False, # dynamic binarization
quadratic=True,
beta2=0.99999,
task='sbn',
)
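# A minimal usage sketch (assumes `train_xs` is an array of 784-dim binarized MNIST
# vectors and mirrors how rebar_train.py drives the model; names below are illustrative):
#
#   hparams = default_hparams
#   hparams.parse('model=SBNDynamicRebar,n_layer=2')
#   sbn = SBNDynamicRebar(hparams, mean_xs=np.mean(train_xs, axis=0))
#   with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     sbn.initialize(sess)
#     lHat, grad_var, step, temp = sbn.partial_fit(train_xs[:hparams.batch_size])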
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import random
import sys
import os
import numpy as np
import tensorflow as tf
import rebar
import datasets
import logger as L
gfile = tf.gfile
tf.app.flags.DEFINE_string("working_dir", "/tmp/rebar",
"""Directory where to save data, write logs, etc.""")
tf.app.flags.DEFINE_string('hparams', '',
'''Comma separated list of name=value pairs.''')
tf.app.flags.DEFINE_integer('eval_freq', 20,
'''How often to run the evaluation step.''')
FLAGS = tf.flags.FLAGS
def manual_scalar_summary(name, value):
value = tf.Summary.Value(tag=name, simple_value=value)
summary_str = tf.Summary(value=[value])
return summary_str
def eval(sbn, eval_xs, n_samples=100, batch_size=5):
n = eval_xs.shape[0]
i = 0
res = []
while i < n:
batch_xs = eval_xs[i:min(i+batch_size, n)]
res.append(sbn.partial_eval(batch_xs, n_samples))
i += batch_size
res = np.mean(res, axis=0)
return res
def train(sbn, train_xs, valid_xs, test_xs, training_steps, debug=False):
hparams = sorted(sbn.hparams.values().items())
hparams = (map(str, x) for x in hparams)
hparams = ('_'.join(x) for x in hparams)
hparams_str = '.'.join(hparams)
logger = L.Logger()
# Create the experiment name from the hparams
experiment_name = ([str(sbn.hparams.n_hidden) for i in xrange(sbn.hparams.n_layer)] +
[str(sbn.hparams.n_input)])
if sbn.hparams.nonlinear:
experiment_name = '~'.join(experiment_name)
else:
experiment_name = '-'.join(experiment_name)
experiment_name = 'SBN_%s' % experiment_name
rowkey = {'experiment': experiment_name,
'model': hparams_str}
# Create summary writer
summ_dir = os.path.join(FLAGS.working_dir, hparams_str)
summary_writer = tf.summary.FileWriter(
summ_dir, flush_secs=15, max_queue=100)
sv = tf.train.Supervisor(logdir=os.path.join(
FLAGS.working_dir, hparams_str),
save_summaries_secs=0,
save_model_secs=1200,
summary_op=None,
recovery_wait_secs=30,
global_step=sbn.global_step)
with sv.managed_session() as sess:
# Dump hparams to file
with gfile.Open(os.path.join(FLAGS.working_dir,
hparams_str,
'hparams.json'),
'w') as out:
json.dump(sbn.hparams.values(), out)
sbn.initialize(sess)
batch_size = sbn.hparams.batch_size
scores = []
n = train_xs.shape[0]
index = range(n)
while not sv.should_stop():
lHats = []
grad_variances = []
temperatures = []
random.shuffle(index)
i = 0
while i < n:
batch_index = index[i:min(i+batch_size, n)]
batch_xs = train_xs[batch_index, :]
if sbn.hparams.dynamic_b:
# Dynamically binarize the batch data
batch_xs = (np.random.rand(*batch_xs.shape) < batch_xs).astype(float)
lHat, grad_variance, step, temperature = sbn.partial_fit(batch_xs,
sbn.hparams.n_samples)
if debug:
print(i, lHat)
if i > 100:
return
lHats.append(lHat)
grad_variances.append(grad_variance)
temperatures.append(temperature)
i += batch_size
grad_variances = np.log(np.mean(grad_variances, axis=0)).tolist()
summary_strings = []
if isinstance(grad_variances, list):
grad_variances = dict(zip([k for (k, v) in sbn.losses], map(float, grad_variances)))
rowkey['step'] = step
logger.log(rowkey, {'step': step,
'train': np.mean(lHats, axis=0)[0],
'grad_variances': grad_variances,
'temperature': np.mean(temperatures), })
grad_variances = '\n'.join(map(str, sorted(grad_variances.iteritems())))
else:
rowkey['step'] = step
logger.log(rowkey, {'step': step,
'train': np.mean(lHats, axis=0)[0],
'grad_variance': grad_variances,
'temperature': np.mean(temperatures), })
summary_strings.append(manual_scalar_summary("log grad variance", grad_variances))
print('Step %d: %s\n%s' % (step, str(np.mean(lHats, axis=0)), str(grad_variances)))
# Every few epochs compute test and validation scores
epoch = int(step / (train_xs.shape[0] / sbn.hparams.batch_size))
if epoch % FLAGS.eval_freq == 0:
valid_res = eval(sbn, valid_xs)
test_res= eval(sbn, test_xs)
print('\nValid %d: %s' % (step, str(valid_res)))
print('Test %d: %s\n' % (step, str(test_res)))
logger.log(rowkey, {'step': step,
'valid': valid_res[0],
'test': test_res[0]})
logger.flush() # Flush infrequently
# Create summaries
summary_strings.extend([
manual_scalar_summary("Train ELBO", np.mean(lHats, axis=0)[0]),
manual_scalar_summary("Temperature", np.mean(temperatures)),
])
for summ_str in summary_strings:
summary_writer.add_summary(summ_str, global_step=step)
summary_writer.flush()
sys.stdout.flush()
scores.append(np.mean(lHats, axis=0))
if step > training_steps:
break
return scores
def main():
# Parse hyperparams
hparams = rebar.default_hparams
hparams.parse(FLAGS.hparams)
print(hparams.values())
train_xs, valid_xs, test_xs = datasets.load_data(hparams)
mean_xs = np.mean(train_xs, axis=0) # Compute mean centering on training
training_steps = 2000000
model = getattr(rebar, hparams.model)
sbn = model(hparams, mean_xs=mean_xs)
scores = train(sbn, train_xs, valid_xs, test_xs,
training_steps=training_steps, debug=False)
if __name__ == '__main__':
main()
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic data management and plotting utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import cPickle as pickle
import getpass
import numpy as np
import gc
import tensorflow as tf
#
# Python utilities
#
def exp_moving_average(x, alpha=0.9):
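# Exponential moving average with bias correction: dividing by (1 - alpha**t)
# removes the bias from initializing the average at zero (as in Adam's debiasing).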
res = []
mu = 0
alpha_factor = 1
for x_i in x:
mu += (1 - alpha)*(x_i - mu)
alpha_factor *= alpha
res.append(mu/(1 - alpha_factor))
return np.array(res)
def sanitize(s):
return s.replace('.', '_')
#
# Tensorflow utilities
#
def softplus(x):
'''
Numerically stable softplus. Let m = max(0, x). Then
softplus(x) = log(1 + e^x) = log(e^0 + e^x) = log(e^m * (e^-m + e^(x-m)))
            = m + log(e^-m + e^(x-m)),
and the term inside the log is guaranteed to be between 1 and 2.
'''
m = tf.maximum(tf.zeros_like(x), x)
return m + tf.log(tf.exp(-m) + tf.exp(x - m))
def safe_log_prob(x, eps=1e-8):
return tf.log(tf.clip_by_value(x, eps, 1.0))
def rms(x):
return tf.sqrt(tf.reduce_mean(tf.square(x)))
def center(x):
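# Leave-one-out mean: each entry is centered by the mean of the *other* entries,
# which keeps the REINFORCE baseline independent of that sample's own learning signal.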
mu = (tf.reduce_sum(x) - x)/tf.to_float(tf.shape(x)[0] - 1)
return x - mu
def vectorize(grads_and_vars, set_none_to_zero=False, skip_none=False):
if set_none_to_zero:
return tf.concat([tf.reshape(g, [-1]) if g is not None else
tf.reshape(tf.zeros_like(v), [-1]) for g, v in grads_and_vars], 0)
elif skip_none:
return tf.concat([tf.reshape(g, [-1]) for g, v in grads_and_vars if g is not None], 0)
else:
return tf.concat([tf.reshape(g, [-1]) for g, v in grads_and_vars], 0)
def add_grads_and_vars(a, b):
'''Add grads_and_vars from two calls to tf.compute_gradients.'''
res = []
for (g_a, v_a), (g_b, v_b) in zip(a, b):
assert v_a == v_b
if g_a is None:
res.append((g_b, v_b))
elif g_b is None:
res.append((g_a, v_a))
else:
res.append((g_a + g_b, v_a))
return res
def binary_log_likelihood(y, log_y_hat):
"""Computes binary log likelihood.
Args:
y: observed data
log_y_hat: parameters of the binary variables
Returns:
log_likelihood
"""
return tf.reduce_sum(y*(-softplus(-log_y_hat)) +
(1 - y)*(-log_y_hat-softplus(-log_y_hat)),
1)
def cov(a, b):
"""Compute the sample covariance between two vectors."""
mu_a = tf.reduce_mean(a)
mu_b = tf.reduce_mean(b)
n = tf.to_float(tf.shape(a)[0])
return tf.reduce_sum((a - mu_a)*(b - mu_b))/(n - 1.0)
def corr(a, b):
return cov(a, b)*tf.rsqrt(cov(a, a))*tf.rsqrt(cov(b, b))
def logSumExp(t, axis=0, keep_dims = False):
'''Computes log(sum(exp(t))) in a numerically stable way.
Args:
t: input tensor
axis: which axis to sum over
keep_dims: whether to keep the dim or not
Returns:
tensor with result
'''
m = tf.reduce_max(t, [axis])
res = m + tf.log(tf.reduce_sum(tf.exp(t - tf.expand_dims(m, axis)), [axis]))
if keep_dims:
return tf.expand_dims(res, axis)
else:
return res
if __name__ == '__main__':
tf.app.run()
@@ -58,7 +58,7 @@ def build_input(dataset, data_path, batch_size, mode):
 record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
 label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
 # Convert from string to [depth * height * width] to [depth, height, width].
-depth_major = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]),
+depth_major = tf.reshape(tf.slice(record, [label_offset + label_bytes], [image_bytes]),
 [depth, image_size, image_size])
 # Convert from [depth, height, width] to [height, width, depth].
 image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
@@ -2,8 +2,7 @@
 # Contains files for loading, training and evaluating TF-Slim-based models.
 package(default_visibility = [
-":internal",
-"//domain_adaptation:__subpackages__",
+"//visibility:public",
 ])
 licenses(["notice"])  # Apache 2.0
@@ -213,7 +213,7 @@ Model | TF-Slim File | Checkpoint | Top-1 Accuracy| Top-5 Accuracy |
 ^ ResNet V2 models use Inception pre-processing and input image size of 299 (use
 `--preprocessing_name inception --eval_image_size 299` when using
 `eval_image_classifier.py`). Performance numbers for ResNet V2 models are
-reported on ImageNet valdiation set.
+reported on the ImageNet validation set.
 All 16 MobileNet Models reported in the [MobileNet Paper](https://arxiv.org/abs/1704.04861) can be found [here](https://github.com/tensorflow/models/tree/master/slim/nets/mobilenet_v1.md).
@@ -256,6 +256,17 @@ and/or multiple CPUs, either synchrononously or asynchronously.
 See [model_deploy](https://github.com/tensorflow/models/blob/master/slim/deployment/model_deploy.py)
 for details.
+### TensorBoard
+To visualize the losses and other metrics during training, you can use
+[TensorBoard](https://github.com/tensorflow/tensorboard)
+by running the command below.
+```shell
+tensorboard --logdir=${TRAIN_DIR}
+```
+Once TensorBoard is running, navigate your web browser to http://localhost:6006.
 # Fine-tuning a model from an existing checkpoint
 <a id='Tuning'></a>
@@ -392,8 +403,7 @@ bazel-bin/tensorflow/examples/label_image/label_image \
 --graph=/tmp/frozen_inception_v3.pb \
 --labels=/tmp/imagenet_slim_labels.txt \
 --input_mean=0 \
---input_std=255 \
---logtostderr
+--input_std=255
 ```
@@ -67,7 +67,7 @@ def main(_):
 download_and_convert_mnist.run(FLAGS.dataset_dir)
 else:
 raise ValueError(
-'dataset_name [%s] was not recognized.' % FLAGS.dataset_dir)
+'dataset_name [%s] was not recognized.' % FLAGS.dataset_name)
 if __name__ == '__main__':
 tf.app.run()
@@ -48,8 +48,7 @@ bazel-bin/tensorflow/examples/label_image/label_image \
 --graph=/tmp/frozen_inception_v3.pb \
 --labels=/tmp/imagenet_slim_labels.txt \
 --input_mean=0 \
---input_std=255 \
---logtostderr
+--input_std=255
 """
@@ -63,7 +62,6 @@ from tensorflow.python.platform import gfile
 from datasets import dataset_factory
 from nets import nets_factory
 slim = tf.contrib.slim
 tf.app.flags.DEFINE_string(
@@ -74,8 +72,13 @@ tf.app.flags.DEFINE_boolean(
 'Whether to save out a training-focused version of the model.')
 tf.app.flags.DEFINE_integer(
-'default_image_size', 224,
-'The image size to use if the model does not define it.')
+'image_size', None,
+'The image size to use, otherwise use the model default_image_size.')
+tf.app.flags.DEFINE_integer(
+'batch_size', None,
+'Batch size for the exported model. Defaulted to "None" so batch size can '
+'be specified at model runtime.')
 tf.app.flags.DEFINE_string('dataset_name', 'imagenet',
 'The name of the dataset to use with the model.')
@@ -100,18 +103,16 @@ def main(_):
 raise ValueError('You must supply the path to save to with --output_file')
 tf.logging.set_verbosity(tf.logging.INFO)
 with tf.Graph().as_default() as graph:
-dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'validation',
+dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'train',
 FLAGS.dataset_dir)
 network_fn = nets_factory.get_network_fn(
 FLAGS.model_name,
 num_classes=(dataset.num_classes - FLAGS.labels_offset),
 is_training=FLAGS.is_training)
-if hasattr(network_fn, 'default_image_size'):
-image_size = network_fn.default_image_size
-else:
-image_size = FLAGS.default_image_size
+image_size = FLAGS.image_size or network_fn.default_image_size
 placeholder = tf.placeholder(name='input', dtype=tf.float32,
-shape=[1, image_size, image_size, 3])
+shape=[FLAGS.batch_size, image_size,
+image_size, 3])
 network_fn(placeholder)
 graph_def = graph.as_graph_def()
 with gfile.GFile(FLAGS.output_file, 'wb') as f:
@@ -25,7 +25,7 @@ import os
import tensorflow as tf

from tensorflow.python.platform import gfile

-from google3.third_party.tensorflow_models.slim import export_inference_graph
+import export_inference_graph


class ExportInferenceGraphTest(tf.test.TestCase):
@@ -331,7 +331,7 @@ inception_resnet_v2.default_image_size = 299
def inception_resnet_v2_arg_scope(weight_decay=0.00004,
                                  batch_norm_decay=0.9997,
                                  batch_norm_epsilon=0.001):
-  """Yields the scope with the default parameters for inception_resnet_v2.
+  """Returns the scope with the default parameters for inception_resnet_v2.

  Args:
    weight_decay: the weight decay for weights variables.
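The docstring fix reflects that the function returns an arg_scope rather than yielding one. A hedged usage sketch of how that scope is normally applied around the network constructor (input shape and class count are illustrative):

import tensorflow as tf
from nets import inception_resnet_v2

slim = tf.contrib.slim

images = tf.placeholder(tf.float32, [None, 299, 299, 3])
# The returned arg_scope sets the default regularizer and batch-norm params.
with slim.arg_scope(inception_resnet_v2.inception_resnet_v2_arg_scope()):
  logits, end_points = inception_resnet_v2.inception_resnet_v2(
      images, num_classes=1001, is_training=False)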
@@ -93,7 +93,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 32, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -110,7 +111,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -132,7 +134,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -149,7 +152,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -166,7 +170,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -183,7 +188,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -200,7 +206,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -222,7 +229,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points

@@ -239,7 +247,8 @@ def inception_v1_base(inputs,
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1')
-        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        net = tf.concat(
+            axis=3, values=[branch_0, branch_1, branch_2, branch_3])
        end_points[end_point] = net
        if final_endpoint == end_point: return net, end_points
  raise ValueError('Unknown final endpoint %s' % final_endpoint)
@@ -270,8 +279,8 @@ def inception_v1(inputs,
    is_training: whether is training or not.
    dropout_keep_prob: the percentage of activation values that are retained.
    prediction_fn: a function to get predictions out of logits.
-    spatial_squeeze: if True, logits is of shape [B, C], if false logits is
-      of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+    spatial_squeeze: if True, logits is of shape [B, C], if false logits is of
+      shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.
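The spatial_squeeze argument documented above only removes the two singleton spatial dimensions from the logits; the equivalent low-level operation is a minimal sketch like this (batch size and class count are illustrative):

import tensorflow as tf

logits_4d = tf.zeros([32, 1, 1, 1000])      # [B, 1, 1, C]
logits_2d = tf.squeeze(logits_4d, [1, 2])   # [B, C]
print(logits_2d.get_shape().as_list())      # [32, 1000]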
@@ -30,6 +30,8 @@ def inception_v2_base(inputs,
                      final_endpoint='Mixed_5c',
                      min_depth=16,
                      depth_multiplier=1.0,
+                      use_separable_conv=True,
+                      data_format='NHWC',
                      scope=None):
  """Inception v2 (6a2).

@@ -51,6 +53,9 @@ def inception_v2_base(inputs,
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
+    use_separable_conv: Use a separable convolution for the first layer
+      Conv2d_1a_7x7. If this is False, use a normal convolution instead.
+    data_format: Data format of the activations ('NHWC' or 'NCHW').
    scope: Optional variable_scope.

  Returns:
@@ -72,28 +77,52 @@ def inception_v2_base(inputs,
    raise ValueError('depth_multiplier is not greater than zero.')
  depth = lambda d: max(int(d * depth_multiplier), min_depth)

+  if data_format != 'NHWC' and data_format != 'NCHW':
+    raise ValueError('data_format must be either NHWC or NCHW.')
+
+  if data_format == 'NCHW' and use_separable_conv:
+    raise ValueError(
+        'separable convolution only supports NHWC layout. NCHW data format can'
+        ' only be used when use_separable_conv is False.'
+    )
+
+  concat_dim = 3 if data_format == 'NHWC' else 1
  with tf.variable_scope(scope, 'InceptionV2', [inputs]):
    with slim.arg_scope(
-        [slim.conv2d, slim.max_pool2d, slim.avg_pool2d, slim.separable_conv2d],
-        stride=1, padding='SAME'):
+        [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+        stride=1,
+        padding='SAME',
+        data_format=data_format):

      # Note that sizes in the comments below assume an input spatial size of
      # 224x224, however, the inputs can be of any size greater 32x32.

      # 224 x 224 x 3
      end_point = 'Conv2d_1a_7x7'
-      # depthwise_multiplier here is different from depth_multiplier.
-      # depthwise_multiplier determines the output channels of the initial
-      # depthwise conv (see docs for tf.nn.separable_conv2d), while
-      # depth_multiplier controls the # channels of the subsequent 1x1
-      # convolution. Must have
-      #   in_channels * depthwise_multipler <= out_channels
-      # so that the separable convolution is not overparameterized.
-      depthwise_multiplier = min(int(depth(64) / 3), 8)
-      net = slim.separable_conv2d(
-          inputs, depth(64), [7, 7], depth_multiplier=depthwise_multiplier,
-          stride=2, weights_initializer=trunc_normal(1.0),
-          scope=end_point)
+
+      if use_separable_conv:
+        # depthwise_multiplier here is different from depth_multiplier.
+        # depthwise_multiplier determines the output channels of the initial
+        # depthwise conv (see docs for tf.nn.separable_conv2d), while
+        # depth_multiplier controls the # channels of the subsequent 1x1
+        # convolution. Must have
+        #   in_channels * depthwise_multipler <= out_channels
+        # so that the separable convolution is not overparameterized.
+        depthwise_multiplier = min(int(depth(64) / 3), 8)
+        net = slim.separable_conv2d(
+            inputs, depth(64), [7, 7],
+            depth_multiplier=depthwise_multiplier,
+            stride=2,
+            padding='SAME',
+            weights_initializer=trunc_normal(1.0),
+            scope=end_point)
+      else:
+        # Use a normal convolution instead of a separable convolution.
+        net = slim.conv2d(
+            inputs,
+            depth(64), [7, 7],
+            stride=2,
+            weights_initializer=trunc_normal(1.0),
+            scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 112 x 112 x 64
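The use_separable_conv switch trades the factored 7x7 stem for a plain convolution, which is needed because the separable path only supports the NHWC layout here. A back-of-the-envelope comparison of the two options' weight counts under the defaults used above (pure arithmetic, ignoring batch-norm parameters and biases):

# Assumes depth_multiplier=1.0 so depth(64) == 64, as in the code above.
in_channels = 3
out_channels = 64
depthwise_multiplier = min(int(out_channels / 3), 8)  # == 8

separable = (7 * 7 * in_channels * depthwise_multiplier            # depthwise 7x7
             + in_channels * depthwise_multiplier * out_channels)  # pointwise 1x1
normal = 7 * 7 * in_channels * out_channels

print(separable, normal)  # 2712 vs 9408: the separable stem is ~3.5x smaller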
@@ -145,7 +174,8 @@ def inception_v2_base(inputs,
            branch_3, depth(32), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 28 x 28 x 256

@@ -175,7 +205,8 @@ def inception_v2_base(inputs,
            branch_3, depth(64), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 28 x 28 x 320

@@ -200,7 +231,7 @@ def inception_v2_base(inputs,
      with tf.variable_scope('Branch_2'):
        branch_2 = slim.max_pool2d(
            net, [3, 3], stride=2, scope='MaxPool_1a_3x3')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
+      net = tf.concat(axis=concat_dim, values=[branch_0, branch_1, branch_2])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 14 x 14 x 576
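These hunks replace the hard-coded axis=3 with concat_dim so the branch concatenation follows the data format: channels sit on axis 3 in NHWC and axis 1 in NCHW. A minimal sketch of why the axis must track the layout (branch channel counts and spatial sizes are illustrative):

import tensorflow as tf

nhwc_branches = [tf.zeros([5, 28, 28, c]) for c in (64, 64, 96, 32)]
nchw_branches = [tf.zeros([5, c, 28, 28]) for c in (64, 64, 96, 32)]

# Concatenating on the channel axis of each layout yields the same 256 channels.
print(tf.concat(nhwc_branches, axis=3).get_shape().as_list())  # [5, 28, 28, 256]
print(tf.concat(nchw_branches, axis=1).get_shape().as_list())  # [5, 256, 28, 28]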
@@ -230,7 +261,8 @@ def inception_v2_base(inputs,
            branch_3, depth(128), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 14 x 14 x 576

@@ -260,7 +292,8 @@ def inception_v2_base(inputs,
            branch_3, depth(128), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 14 x 14 x 576

@@ -290,10 +323,10 @@ def inception_v2_base(inputs,
            branch_3, depth(96), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 14 x 14 x 576
      end_point = 'Mixed_4e'
      with tf.variable_scope(end_point):

@@ -321,7 +354,8 @@ def inception_v2_base(inputs,
            branch_3, depth(96), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 14 x 14 x 576

@@ -346,7 +380,8 @@ def inception_v2_base(inputs,
      with tf.variable_scope('Branch_2'):
        branch_2 = slim.max_pool2d(net, [3, 3], stride=2,
                                   scope='MaxPool_1a_3x3')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 7 x 7 x 1024

@@ -376,10 +411,10 @@ def inception_v2_base(inputs,
            branch_3, depth(128), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 7 x 7 x 1024
      end_point = 'Mixed_5c'
      with tf.variable_scope(end_point):

@@ -407,7 +442,8 @@ def inception_v2_base(inputs,
            branch_3, depth(128), [1, 1],
            weights_initializer=trunc_normal(0.1),
            scope='Conv2d_0b_1x1')
-      net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      net = tf.concat(
+          axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
  raise ValueError('Unknown final endpoint %s' % final_endpoint)
@@ -443,8 +479,8 @@ def inception_v2(inputs,
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    prediction_fn: a function to get predictions out of logits.
-    spatial_squeeze: if True, logits is of shape [B, C], if false logits is
-      of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+    spatial_squeeze: if True, logits is of shape [B, C], if false logits is of
+      shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.
@@ -504,8 +540,8 @@ def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
    known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot
    handle tensors that define the kernel size.
      shape = tf.shape(input_tensor)
-      return = tf.pack([tf.minimum(shape[1], kernel_size[0]),
-                        tf.minimum(shape[2], kernel_size[1])])
+      return = tf.stack([tf.minimum(shape[1], kernel_size[0]),
+                         tf.minimum(shape[2], kernel_size[1])])
  """
  shape = input_tensor.get_shape().as_list()
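The docstring change only renames the deprecated tf.pack to tf.stack; the behaviour it describes is an element-wise minimum of the feature-map size and the requested kernel. A standalone sketch of that rule for statically known shapes (an illustration, not the function's exact code):

def reduced_kernel_size(input_height, input_width, kernel_size):
  # Clip the kernel to the spatial size of the input feature map.
  return [min(input_height, kernel_size[0]), min(input_width, kernel_size[1])]

print(reduced_kernel_size(8, 8, [7, 7]))  # [7, 7] -- large input, unchanged
print(reduced_kernel_size(3, 3, [7, 7]))  # [3, 3] -- small input, clipped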
@@ -164,6 +164,68 @@ class InceptionV2Test(tf.test.TestCase):
    with self.assertRaises(ValueError):
      _ = inception.inception_v2(inputs, num_classes, depth_multiplier=0.0)

+  def testBuildEndPointsWithUseSeparableConvolutionFalse(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random_uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v2_base(inputs)
+
+    endpoint_keys = [
+        key for key in end_points.keys()
+        if key.startswith('Mixed') or key.startswith('Conv')
+    ]
+
+    _, end_points_with_replacement = inception.inception_v2_base(
+        inputs, use_separable_conv=False)
+
+    # The endpoint shapes must be equal to the original shape even when the
+    # separable convolution is replaced with a normal convolution.
+    for key in endpoint_keys:
+      original_shape = end_points[key].get_shape().as_list()
+      self.assertTrue(key in end_points_with_replacement)
+      new_shape = end_points_with_replacement[key].get_shape().as_list()
+      self.assertListEqual(original_shape, new_shape)
+
+  def testBuildEndPointsNCHWDataFormat(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random_uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v2_base(inputs)
+
+    endpoint_keys = [
+        key for key in end_points.keys()
+        if key.startswith('Mixed') or key.startswith('Conv')
+    ]
+
+    inputs_in_nchw = tf.random_uniform((batch_size, 3, height, width))
+    _, end_points_with_replacement = inception.inception_v2_base(
+        inputs_in_nchw, use_separable_conv=False, data_format='NCHW')
+
+    # With the 'NCHW' data format, all endpoint activations have a transposed
+    # shape from the original shape with the 'NHWC' layout.
+    for key in endpoint_keys:
+      transposed_original_shape = tf.transpose(
+          end_points[key], [0, 3, 1, 2]).get_shape().as_list()
+      self.assertTrue(key in end_points_with_replacement)
+      new_shape = end_points_with_replacement[key].get_shape().as_list()
+      self.assertListEqual(transposed_original_shape, new_shape)
+
+  def testBuildErrorsForDataFormats(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random_uniform((batch_size, height, width, 3))
+
+    # 'NCWH' data format is not supported.
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v2_base(inputs, data_format='NCWH')
+
+    # 'NCHW' data format is not supported for separable convolution.
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v2_base(inputs, data_format='NCHW')
+
  def testHalfSizeImages(self):
    batch_size = 5
    height, width = 112, 112
@@ -425,6 +425,7 @@ def inception_v3(inputs,
                 prediction_fn=slim.softmax,
                 spatial_squeeze=True,
                 reuse=None,
+                 create_aux_logits=True,
                 scope='InceptionV3'):
  """Inception model from http://arxiv.org/abs/1512.00567.
@@ -453,10 +454,11 @@ def inception_v3(inputs,
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    prediction_fn: a function to get predictions out of logits.
-    spatial_squeeze: if True, logits is of shape [B, C], if false logits is
-      of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+    spatial_squeeze: if True, logits is of shape [B, C], if false logits is of
+      shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
+    create_aux_logits: Whether to create the auxiliary logits.
    scope: Optional variable_scope.

  Returns:
@@ -481,30 +483,31 @@ def inception_v3(inputs,
          depth_multiplier=depth_multiplier)

      # Auxiliary Head logits
-      with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
-                          stride=1, padding='SAME'):
-        aux_logits = end_points['Mixed_6e']
-        with tf.variable_scope('AuxLogits'):
-          aux_logits = slim.avg_pool2d(
-              aux_logits, [5, 5], stride=3, padding='VALID',
-              scope='AvgPool_1a_5x5')
-          aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
-                                   scope='Conv2d_1b_1x1')
-
-          # Shape of feature map before the final layer.
-          kernel_size = _reduced_kernel_size_for_small_input(
-              aux_logits, [5, 5])
-          aux_logits = slim.conv2d(
-              aux_logits, depth(768), kernel_size,
-              weights_initializer=trunc_normal(0.01),
-              padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size))
-          aux_logits = slim.conv2d(
-              aux_logits, num_classes, [1, 1], activation_fn=None,
-              normalizer_fn=None, weights_initializer=trunc_normal(0.001),
-              scope='Conv2d_2b_1x1')
-          if spatial_squeeze:
-            aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze')
-          end_points['AuxLogits'] = aux_logits
+      if create_aux_logits:
+        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                            stride=1, padding='SAME'):
+          aux_logits = end_points['Mixed_6e']
+          with tf.variable_scope('AuxLogits'):
+            aux_logits = slim.avg_pool2d(
+                aux_logits, [5, 5], stride=3, padding='VALID',
+                scope='AvgPool_1a_5x5')
+            aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
+                                     scope='Conv2d_1b_1x1')
+
+            # Shape of feature map before the final layer.
+            kernel_size = _reduced_kernel_size_for_small_input(
+                aux_logits, [5, 5])
+            aux_logits = slim.conv2d(
+                aux_logits, depth(768), kernel_size,
+                weights_initializer=trunc_normal(0.01),
+                padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size))
+            aux_logits = slim.conv2d(
+                aux_logits, num_classes, [1, 1], activation_fn=None,
+                normalizer_fn=None, weights_initializer=trunc_normal(0.001),
+                scope='Conv2d_2b_1x1')
+            if spatial_squeeze:
+              aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze')
+            end_points['AuxLogits'] = aux_logits

      # Final pooling and prediction
      with tf.variable_scope('Logits'):
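With the auxiliary head now guarded by create_aux_logits, inference-only graphs can skip it entirely. A hedged usage sketch of the new argument (input shape and class count are illustrative):

import tensorflow as tf
from nets import inception_v3

images = tf.placeholder(tf.float32, [None, 299, 299, 3])

# Skipping the auxiliary classifier is the common choice for inference graphs;
# the 'AuxLogits' endpoint is then never added to end_points.
logits, end_points = inception_v3.inception_v3(
    images, num_classes=1001, is_training=False, create_aux_logits=False)

assert 'AuxLogits' not in end_points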
@@ -544,8 +547,8 @@ def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
    known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot
    handle tensors that define the kernel size.
      shape = tf.shape(input_tensor)
-      return = tf.pack([tf.minimum(shape[1], kernel_size[0]),
-                        tf.minimum(shape[2], kernel_size[1])])
+      return = tf.stack([tf.minimum(shape[1], kernel_size[0]),
+                         tf.minimum(shape[2], kernel_size[1])])
  """
  shape = input_tensor.get_shape().as_list()
@@ -27,6 +27,8 @@ As described in https://arxiv.org/abs/1704.04861.
100% Mobilenet V1 (base) with input size 224x224:

+See mobilenet_v1()
+
Layer                                                     params           macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D:                                 864      10,838,016

@@ -62,6 +64,8 @@ Total: 3,185,088 567,716,352
75% Mobilenet V1 (base) with input size 128x128:

+See mobilenet_v1_075()
+
Layer                                                     params           macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D:                                 648       2,654,208
@@ -102,6 +106,7 @@ from __future__ import division
from __future__ import print_function

from collections import namedtuple
+import functools

import tensorflow as tf
@@ -335,6 +340,17 @@ def mobilenet_v1(inputs,
mobilenet_v1.default_image_size = 224


+def wrapped_partial(func, *args, **kwargs):
+  partial_func = functools.partial(func, *args, **kwargs)
+  functools.update_wrapper(partial_func, func)
+  return partial_func
+
+
+mobilenet_v1_075 = wrapped_partial(mobilenet_v1, depth_multiplier=0.75)
+mobilenet_v1_050 = wrapped_partial(mobilenet_v1, depth_multiplier=0.50)
+mobilenet_v1_025 = wrapped_partial(mobilenet_v1, depth_multiplier=0.25)
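wrapped_partial exists so the depth-multiplier variants keep behaving like the original function object: update_wrapper copies __name__, __doc__ and __dict__, so attributes such as default_image_size survive on the partial, which nets_factory relies on below. A small self-contained sketch of that behaviour (the toy model function is illustrative, not part of the commit):

import functools

def wrapped_partial(func, *args, **kwargs):
  partial_func = functools.partial(func, *args, **kwargs)
  functools.update_wrapper(partial_func, func)
  return partial_func

def model(inputs, depth_multiplier=1.0):
  return inputs, depth_multiplier

model.default_image_size = 224

model_075 = wrapped_partial(model, depth_multiplier=0.75)
# The copied metadata makes the partial look like the wrapped function.
print(model_075.__name__, model_075.default_image_size)  # model 224
print(model_075('x'))                                     # ('x', 0.75)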
def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
  """Define kernel size which is automatically reduced for small input.
@@ -54,6 +54,9 @@ networks_map = {'alexnet_v2': alexnet.alexnet_v2,
                'resnet_v2_152': resnet_v2.resnet_v2_152,
                'resnet_v2_200': resnet_v2.resnet_v2_200,
                'mobilenet_v1': mobilenet_v1.mobilenet_v1,
+                'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_075,
+                'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_050,
+                'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_025,
               }

arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope,
@@ -78,6 +81,9 @@ arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope,
                  'resnet_v2_152': resnet_v2.resnet_arg_scope,
                  'resnet_v2_200': resnet_v2.resnet_arg_scope,
                  'mobilenet_v1': mobilenet_v1.mobilenet_v1_arg_scope,
+                  'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_arg_scope,
+                  'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_arg_scope,
+                  'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_arg_scope,
                 }
@@ -100,10 +106,10 @@ def get_network_fn(name, num_classes, weight_decay=0.0, is_training=False):
  """
  if name not in networks_map:
    raise ValueError('Name of network unknown %s' % name)
-  arg_scope = arg_scopes_map[name](weight_decay=weight_decay)
  func = networks_map[name]
  @functools.wraps(func)
  def network_fn(images):
+    arg_scope = arg_scopes_map[name](weight_decay=weight_decay)
    with slim.arg_scope(arg_scope):
      return func(images, num_classes, is_training=is_training)
  if hasattr(func, 'default_image_size'):
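Moving the arg_scope construction inside network_fn means the scope is built lazily, when the network is actually instantiated, and the new map entries make the reduced-width MobileNets selectable by name. A hedged usage sketch (num_classes is illustrative; default_image_size comes through the wrapped_partial shown earlier):

import tensorflow as tf
from nets import nets_factory

# Select the 0.75-width MobileNet variant added in this change.
network_fn = nets_factory.get_network_fn(
    'mobilenet_v1_075', num_classes=1001, is_training=False)

images = tf.placeholder(
    tf.float32, [None, network_fn.default_image_size,
                 network_fn.default_image_size, 3])
logits, end_points = network_fn(images)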