First commit of learning_unsupervised_learning

7148c1f5 · Luke Metz · d640ab9c · 7148c1f5 · 7148c1f5 · 7148c1f5
Commit 7148c1f5 authored Mar 31, 2018 by Luke Metz
20 changed files
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -17,6 +17,7 @@
 /research/inception/ @shlens @vincentvanhoucke
 /research/learned_optimizer/ @olganw @nirum
 /research/learning_to_remember_rare_events/ @lukaszkaiser @ofirnachum
+/research/learning_unsupervised_learning/ @lukemetz @nirum
 /research/lexnet_nc/ @vered1986 @waterson
 /research/lfads/ @jazcollins @susillo
 /research/lm_1b/ @oriolvinyals @panyx0718

--- a/research/README.md
+++ b/research/README.md
@@ -36,6 +36,8 @@ installation](https://www.tensorflow.org/install).
 -   [inception](inception): deep convolutional networks for computer vision.
 -   [learning_to_remember_rare_events](learning_to_remember_rare_events): a
    large-scale life-long memory module for use in deep learning.
+-   [learning_unsupervised_learning](learning_unsupervised_learning): a
+    meta-learned unsupervised learning update rule.
 -   [lexnet_nc](lexnet_nc): a distributed model for noun compound relationship
    classification.
 -   [lfads](lfads): sequential variational autoencoder for analyzing

--- a/research/learning_unsupervised_learning/.gitignore
+++ b/research/learning_unsupervised_learning/.gitignore
+*.pyc
--- a/research/learning_unsupervised_learning/README.md
+++ b/research/learning_unsupervised_learning/README.md
+# Learning Unsupervised Learning Rules
+This repository contains code and weights for the learned update rule
+presented in "Learning Unsupervised Learning Rules." At this time, this
+code can not meta-train the update rule.
+### Structure
+`run_eval.py` contains the main training loop. This constructs an op
+that runs one iteration of the learned update rule and assigns the
+results to variables. Additionally, it loads the weights from our
+pre-trained model.
+The base model and the update rule architecture definition can be found in
+`architectures/more_local_weight_update.py`. For a complete description
+of the model, see our [paper](https://arxiv.org/abs/1804.00222).
+### Dependencies
+[absl]([https://github.com/abseil/abseil-py), [tensorflow](https://tensorflow.org), [sonnet](https://github.com/deepmind/sonnet)
+### Usage
+First, download the [pre-trained optimizer model weights](https://storage.googleapis.com/learning_unsupervised_learning/200_tf_graph.zip) and extract it.
+```bash
+# move to the folder above this folder
+cd path_to/research/learning_unsupervised_learning/../
+# launch the eval script
+python -m learning_unsupervised_learning.run_eval \
+--train_log_dir="/tmp/learning_unsupervised_learning" \
+--checkpoint_dir="/path/to/downloaded/model/tf_graph_data.ckpt"
+```
+### Contact
+Luke Metz, Niru Maheswaranathan, Github: @lukemetz, @nirum. Email: {lmetz, nirum}@google.com
--- a/research/learning_unsupervised_learning/__init__.py
+++ b/research/learning_unsupervised_learning/__init__.py
--- a/research/learning_unsupervised_learning/architectures/__init__.py
+++ b/research/learning_unsupervised_learning/architectures/__init__.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import more_local_weight_update
--- a/research/learning_unsupervised_learning/architectures/common.py
+++ b/research/learning_unsupervised_learning/architectures/common.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sonnet as snt
+import tensorflow as tf
+import numpy as np
+import collections
+from learning_unsupervised_learning import utils
+from tensorflow.python.util import nest
+from learning_unsupervised_learning import variable_replace
+class LinearBatchNorm(snt.AbstractModule):
+  """Module that does a Linear layer then a BatchNorm followed by an activation fn"""
+  def __init__(self, size, activation_fn=tf.nn.relu, name="LinearBatchNorm"):
+    self.size = size
+    self.activation_fn = activation_fn
+    super(LinearBatchNorm, self).__init__(name=name)
+  def _build(self, x):
+    x = tf.to_float(x)
+    initializers={"w": tf.truncated_normal_initializer(stddev=0.01)}
+    lin = snt.Linear(self.size, use_bias=False, initializers=initializers)
+    z = lin(x)
+    scale = tf.constant(1., dtype=tf.float32)
+    offset = tf.get_variable(
+        "b",
+        shape=[1, z.shape.as_list()[1]],
+        initializer=tf.truncated_normal_initializer(stddev=0.1),
+        dtype=tf.float32
+    )
+    mean, var = tf.nn.moments(z, [0], keep_dims=True)
+    z = ((z - mean) * tf.rsqrt(var + 1e-6)) * scale + offset
+    x_p = self.activation_fn(z)
+    return z, x_p
+  # This needs to work by string name sadly due to how the variable replace
+  # works and would also work even if the custom getter approuch was used.
+  # This is verbose, but it should atleast be clear as to what is going on.
+  # TODO(lmetz) a better way to do this (the next 3 functions:
+  #    _raw_name, w(), b() )
+  def _raw_name(self, var_name):
+    """Return just the name of the variable, not the scopes."""
+    return var_name.split("/")[-1].split(":")[0]
+  @property
+  def w(self):
+    var_list = snt.get_variables_in_module(self)
+    w = [x for x in var_list if self._raw_name(x.name) == "w"]
+    assert len(w) == 1
+    return w[0]
+  @property
+  def b(self):
+    var_list = snt.get_variables_in_module(self)
+    b = [x for x in var_list if self._raw_name(x.name) == "b"]
+    assert len(b) == 1
+    return b[0]
+class Linear(snt.AbstractModule):
+  def __init__(self, size, use_bias=True, init_const_mag=True):
+    self.size = size
+    self.use_bias = use_bias
+    self.init_const_mag = init_const_mag
+    super(Linear, self).__init__(name="commonLinear")
+  def _build(self, x):
+    if self.init_const_mag:
+      initializers={"w": tf.truncated_normal_initializer(stddev=0.01)}
+    else:
+      initializers={}
+    lin = snt.Linear(self.size, use_bias=self.use_bias, initializers=initializers)
+    z = lin(x)
+    return z
+  # This needs to work by string name sadly due to how the variable replace
+  # works and would also work even if the custom getter approuch was used.
+  # This is verbose, but it should atleast be clear as to what is going on.
+  # TODO(lmetz) a better way to do this (the next 3 functions:
+  #    _raw_name, w(), b() )
+  def _raw_name(self, var_name):
+    """Return just the name of the variable, not the scopes."""
+    return var_name.split("/")[-1].split(":")[0]
+  @property
+  def w(self):
+    var_list = snt.get_variables_in_module(self)
+    if self.use_bias:
+      assert len(var_list) == 2, "Found not 2 but %d" % len(var_list)
+    else:
+      assert len(var_list) == 1, "Found not 1 but %d" % len(var_list)
+    w = [x for x in var_list if self._raw_name(x.name) == "w"]
+    assert len(w) == 1
+    return w[0]
+  @property
+  def b(self):
+    var_list = snt.get_variables_in_module(self)
+    assert len(var_list) == 2, "Found not 2 but %d" % len(var_list)
+    b = [x for x in var_list if self._raw_name(x.name) == "b"]
+    assert len(b) == 1
+    return b[0]
+def transformer_at_state(base_model, new_variables):
+  """Get the base_model that has been transformed to use the variables
+  in final_state.
+  Args:
+    base_model: snt.Module
+      Goes from batch to features
+    new_variables: list
+      New list of variables to use
+  Returns:
+    func: callable of same api as base_model.
+  """
+  assert not variable_replace.in_variable_replace_scope()
+  def _feature_transformer(input_data):
+    """Feature transformer at the end of training."""
+    initial_variables = base_model.get_variables()
+    replacement = collections.OrderedDict(
+        utils.eqzip(initial_variables, new_variables))
+    with variable_replace.variable_replace(replacement):
+      features = base_model(input_data)
+    return features
+  return _feature_transformer
--- a/research/learning_unsupervised_learning/architectures/more_local_weight_update.py
+++ b/research/learning_unsupervised_learning/architectures/more_local_weight_update.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import numpy as np
+import sonnet as snt
+import tensorflow as tf
+from learning_unsupervised_learning.architectures import common
+from learning_unsupervised_learning import optimizers
+from learning_unsupervised_learning import utils
+from learning_unsupervised_learning import summary_utils
+OptState = collections.namedtuple('OptState',
+                                  ['variables', 'opt_state', 'index'])
+BaseModelOutputs = collections.namedtuple(
+    'BaseModelOutputs', ['xs', 'zs', 'mods', 'batch', 'backward_mods'])
+class GradChannelReadout(snt.AbstractModule):
+  """Perform a linear readout and reshape from input 3 tensor."""
+  def __init__(self,
+               num_grad_channels,
+               device,
+               perm=(2, 0, 1),
+               name='GradChannelReadout'):
+    """Args:
+      num_grad_channels: int
+        number of channels to readout to.
+      device: str or callable
+        devicwe to place weights.
+      perm: list or tuple
+        transpose applied.
+    """
+    self.num_grad_channels = num_grad_channels
+    self.device = device
+    self.perm = perm
+    super(GradChannelReadout, self).__init__(name=name)
+  def _build(self, h):
+    with tf.device(self.device):
+      mod = snt.Linear(self.num_grad_channels)
+      ret = snt.BatchApply(mod)(h)
+      # return as [num_grad_channels] x [bs] x [num units]
+      return tf.transpose(ret, perm=self.perm)
+def get_weight_stats(x, axis):
+  """ Compute weight statistics over the given axis.
+  Args:
+    x: tf.Tensor
+      a batch of activations.
+    axis: int
+      axis to perform statistics over.
+  Returns:
+    tf.Tensor
+      a 3-D tensor with statistics.
+  """
+  if x is None:
+    return []
+  stats = []
+  l1 = tf.reduce_mean(tf.abs(x), axis=axis)
+  l2 = tf.sqrt(tf.reduce_mean(x**2, axis=axis) + 1e-6)
+  mean, var = tf.nn.moments(x, [axis])
+  stats.extend([l1, l2, mean, tf.sqrt(var + 1e-8)])
+  stats = [tf.reshape(s, [-1, 1, 1]) for s in stats]
+  return stats
+class AddUnitBatchStatistics(snt.AbstractModule):
+  """Compute some number of statistics over units and concat them on."""
+  def __init__(self, name='AddUnitBatchStatistics'):
+    super(AddUnitBatchStatistics, self).__init__(name=name)
+  def _build(self, x):
+    # [channel, bs, 1]
+    output = x
+    for d in [0, 1]:
+      stats = []
+      l1 = tf.reduce_mean(tf.abs(x), axis=d, keepdims=True)
+      l2 = tf.sqrt(tf.reduce_mean(x**2, axis=d, keepdims=True) + 1e-6)
+      mean, var = tf.nn.moments(x, [d], keepdims=True)
+      stats.extend([l1, l2, mean, tf.sqrt(var + 1e-8)])
+      to_add = tf.concat(stats, axis=2)  # [channels/1, units/1, stats]
+      output += snt.BatchApply(snt.Linear(x.shape.as_list()[2]))(to_add)
+    return output
+class ConcatUnitConv(snt.AbstractModule):
+  """Do a small number of convolutions over units and concat / add them on."""
+  def __init__(self, add=True):
+    self.add = add
+    super(ConcatUnitConv, self).__init__(name='ConcatUnitConv')
+  def _build(self, x):
+    # x is [units, bs, 1]
+    net = tf.transpose(x, [1, 0, 2])  # now [bs x units x 1]
+    channels = x.shape.as_list()[2]
+    mod = snt.Conv1D(output_channels=channels, kernel_shape=[3])
+    net = mod(net)
+    net = snt.BatchNorm(axis=[0, 1])(net, is_training=False)
+    net = tf.nn.relu(net)
+    mod = snt.Conv1D(output_channels=channels, kernel_shape=[3])
+    net = mod(net)
+    net = snt.BatchNorm(axis=[0, 1])(net, is_training=False)
+    net = tf.nn.relu(net)
+    to_concat = tf.transpose(net, [1, 0, 2])
+    if self.add:
+      return x + to_concat
+    else:
+      return tf.concat([x, to_concat], 2)
+class MoreLocalWeightUpdateProcess(snt.AbstractModule):
+  def __init__(
+      self,
+      remote_device,
+      local_device,
+      top_delta_size=64,
+      top_delta_layers=2,
+      compute_h_size=64,
+      compute_h_layers=1,
+      delta_dim=32,
+      num_grad_channels=4,
+      normalize_epsilon=1.,
+  ):
+    self.local_device = local_device
+    self.remote_device = remote_device
+    self.top_delta_size = top_delta_size
+    self.top_delta_layers = top_delta_layers
+    self.compute_h_size = compute_h_size
+    self.compute_h_layers = compute_h_layers
+    self.delta_dim = delta_dim
+    self.num_grad_channels = num_grad_channels
+    self.normalize_epsilon = normalize_epsilon,
+    with tf.device(local_device):
+      self.opt = optimizers.UnrollableGradientDescentRollingOptimizer(
+          learning_rate=1e-4)
+    # lazily initialized for readouts
+    self.readout_mods = {}
+    super(MoreLocalWeightUpdateProcess,
+          self).__init__(name='MoreLocalWeightUpdateProcess')
+    with tf.device(remote_device):
+      self()
+  def normalize(self, change_w, normalize_epsilon=None):
+    if normalize_epsilon is None:
+      normalize_epsilon = self.normalize_epsilon
+    # normalize the weights per receptive-field, rather than per-matrix
+    var = tf.reduce_mean(tf.square(change_w), axis=0, keepdims=True)
+    change_w = (change_w) / tf.sqrt(normalize_epsilon + var)
+    return change_w
+  def _build(self):
+    pass
+  @snt.reuse_variables
+  def compute_top_delta(self, z):
+    """ parameterization of topD. This converts the top level activation
+    to an error signal.
+    Args:
+      z: tf.Tensor
+        batch of final layer post activations
+    Returns
+      delta: tf.Tensor
+        the error signal
+    """
+    s_idx = 0
+    with tf.variable_scope('compute_top_delta'), tf.device(self.remote_device):
+      # typically this takes [BS, length, input_channels],
+      # We are applying this such that we convolve over the batch dimension.
+      act = tf.expand_dims(tf.transpose(z, [1, 0]), 2)  # [channels, BS, 1]
+      mod = snt.Conv1D(output_channels=self.top_delta_size, kernel_shape=[5])
+      act = mod(act)
+      act = snt.BatchNorm(axis=[0, 1])(act, is_training=False)
+      act = tf.nn.relu(act)
+      bs = act.shape.as_list()[0]
+      act = tf.transpose(act, [2, 1, 0])
+      act = snt.Conv1D(output_channels=bs, kernel_shape=[3])(act)
+      act = snt.BatchNorm(axis=[0, 1])(act, is_training=False)
+      act = tf.nn.relu(act)
+      act = snt.Conv1D(output_channels=bs, kernel_shape=[3])(act)
+      act = snt.BatchNorm(axis=[0, 1])(act, is_training=False)
+      act = tf.nn.relu(act)
+      act = tf.transpose(act, [2, 1, 0])
+      prev_act = act
+      for i in range(self.top_delta_layers):
+        mod = snt.Conv1D(output_channels=self.top_delta_size, kernel_shape=[3])
+        act = mod(act)
+        act = snt.BatchNorm(axis=[0, 1])(act, is_training=False)
+        act = tf.nn.relu(act)
+        prev_act = act
+      mod = snt.Conv1D(output_channels=self.delta_dim, kernel_shape=[3])
+      act = mod(act)
+      # [bs, feature_channels, delta_channels]
+      act = tf.transpose(act, [1, 0, 2])
+      return act
+  @snt.reuse_variables
+  def compute_h(self,
+                x,
+                z,
+                d,
+                bias,
+                W_bot,
+                W_top,
+                compute_perc=1.0,
+                compute_units=None):
+    """z = [BS, n_units] a = [BS, n_units] b = [BS, n_units] d = [BS, n_units, delta_channels]
+    """
+    s_idx = 0
+    if compute_perc != 1.0:
+      assert compute_units is None
+    with tf.device(self.remote_device):
+      inp_feat = [x, z]
+      inp_feat = [tf.transpose(f, [1, 0]) for f in inp_feat]
+      units = x.shape.as_list()[1]
+      bs = x.shape.as_list()[0]
+      # add unit ID, to help the network differentiate units
+      id_theta = tf.linspace(0., (4) * np.pi, units)
+      assert bs is not None
+      id_theta_bs = tf.reshape(id_theta, [-1, 1]) * tf.ones([1, bs])
+      inp_feat += [tf.sin(id_theta_bs), tf.cos(id_theta_bs)]
+      # list of [units, BS, 1]
+      inp_feat = [tf.expand_dims(f, 2) for f in inp_feat]
+      d_trans = tf.transpose(d, [1, 0, 2])
+      if compute_perc != 1.0:
+        compute_units = int(compute_perc * inp_feat.shape.as_list()[0])
+      # add weight matrix statistics, both from above and below
+      w_stats_bot = get_weight_stats(W_bot, 0)
+      w_stats_top = get_weight_stats(W_top, 1)
+      w_stats = w_stats_bot + w_stats_top
+      if W_bot is None or W_top is None:
+        # if it's an edge layer (top or bottom), just duplicate the stats for
+        # the weight matrix that does exist
+        w_stats = w_stats + w_stats
+      w_stats = [tf.ones([1, x.shape[0], 1]) * ww for ww in w_stats]
+      # w_stats is a list, with entries with shape UNITS x 1 x channels
+      if compute_units is None:
+        inp_feat_in = inp_feat
+        d_trans_in = d_trans
+        w_stats_in = w_stats
+        bias_in = tf.transpose(bias)
+      else:
+        # only run on a subset of the activations.
+        mask = tf.random_uniform(
+            minval=0,
+            maxval=1,
+            dtype=tf.float32,
+            shape=inp_feat[0].shape.as_list()[0:1])
+        _, ind = tf.nn.top_k(mask, k=compute_units)
+        ind = tf.reshape(ind, [-1, 1])
+        inp_feat_in = [tf.gather_nd(xx, ind) for xx in inp_feat]
+        w_stats_in = [tf.gather_nd(xx, ind) for xx in w_stats]
+        d_trans_in = tf.gather_nd(d_trans, ind)
+        bias_in = tf.gather_nd(tf.transpose(bias), ind)
+      w_stats_in = tf.concat(w_stats_in, 2)
+      w_stats_in_norm = w_stats_in * tf.rsqrt(
+          tf.reduce_mean(w_stats_in**2) + 1e-6)
+      act = tf.concat(inp_feat_in + [d_trans_in], 2)
+      act = snt.BatchNorm(axis=[0, 1])(act, is_training=True)
+      bias_dense = tf.reshape(bias_in, [-1, 1, 1]) * tf.ones([1, bs, 1])
+      act = tf.concat([w_stats_in_norm, bias_dense, act], 2)
+      mod = snt.Conv1D(output_channels=self.compute_h_size, kernel_shape=[3])
+      act = mod(act)
+      act = snt.BatchNorm(axis=[0, 1])(act, is_training=True)
+      act = tf.nn.relu(act)
+      act2 = ConcatUnitConv()(act)
+      act = act2
+      prev_act = act
+      for i in range(self.compute_h_layers):
+        mod = snt.Conv1D(output_channels=self.compute_h_size, kernel_shape=[3])
+        act = mod(act)
+        act = snt.BatchNorm(axis=[0, 1])(act, is_training=True)
+        act = tf.nn.relu(act)
+        act = ConcatUnitConv()(act)
+        prev_act = act
+      h = act
+      if compute_units is not None:
+        shape = inp_feat[0].shape.as_list()[:1] + h.shape.as_list()[1:]
+        h = tf.scatter_nd(ind, h, shape=shape)
+      h = tf.transpose(h, [1, 0, 2])  # [bs, units, channels]
+      return h
+  ## wrappers to allow forward and backward to have different variables
+  @snt.reuse_variables
+  def merge_change_w_forward(self, change_w_terms, global_prefix='', prefix=''):
+    return self.merge_change_w(
+        change_w_terms, global_prefix=global_prefix, prefix=prefix)
+  @snt.reuse_variables
+  def merge_change_w_backward(self, change_w_terms, global_prefix='',
+                              prefix=''):
+    return self.merge_change_w(
+        change_w_terms, global_prefix=global_prefix, prefix=prefix)
+  def merge_change_w(self, change_w_terms, global_prefix='', prefix=''):
+    with tf.device(
+        self.remote_device), tf.name_scope(global_prefix + '_merge_change_w'):
+      w_base = change_w_terms['w_base']
+      for kk in sorted(change_w_terms.keys()):
+        name = global_prefix + 'change_w_plane_%s' % kk
+        delta_w = change_w_terms[kk]
+        mean, var = tf.nn.moments(delta_w, [0, 1])
+        root_mean_square = tf.sqrt(tf.reduce_mean(delta_w**2) + 1e-6)
+      for kk in sorted(change_w_terms.keys()):
+        change_w_terms[kk] = self.normalize(change_w_terms[kk])
+      initializers = {
+          'w': tf.constant_initializer(0.1),
+          'b': tf.zeros_initializer()
+      }
+      mod = snt.Linear(
+          1,
+          name=global_prefix + '_weight_readout_coeffs',
+          initializers=initializers)
+      change_w_terms_list = [
+          change_w_terms[kk] for kk in sorted(change_w_terms.keys())
+      ]
+      stack_terms = tf.stack(change_w_terms_list, axis=-1)
+      change_w = tf.squeeze(
+          snt.BatchApply(mod)(stack_terms), axis=-1) / len(change_w_terms)
+      # only allow perpendicular updates, or updates which grow length. don't
+      # allow length to decay towards zero.
+      ip = tf.reduce_mean(change_w * w_base)
+      # zero out any updates that shrink length
+      ip = tf.nn.relu(ip)
+      change_w -= w_base * ip
+      change_w /= tf.sqrt(len(change_w_terms) * 1.)
+      change_w = self.normalize(change_w)
+      # encourage the receptive field to not collapse to 0
+      change_w -= w_base / 7.  # This is an arbitrary scale choice
+      return tf.identity(change_w)
+  @snt.reuse_variables
+  def bias_readout(self, h):
+    with tf.device(self.remote_device):
+      mod = snt.Linear(1, name='bias_readout')
+      ret = snt.BatchApply(mod)(h)
+      return tf.squeeze(ret, 2)
+  @snt.reuse_variables
+  def next_delta(self, z, h, d):
+    with tf.device(self.remote_device):
+      return d * tf.expand_dims(tf.nn.sigmoid(z), 2) + self.to_delta_size(h)
+  @utils.create_variables_in_class_scope
+  def get_readout_mod(self, name):
+    if name not in self.readout_mods:
+      self.readout_mods[name] = GradChannelReadout(
+          self.num_grad_channels, device=self.remote_device, name=name)
+    return self.readout_mods[name]
+  @utils.create_variables_in_class_scope
+  def low_rank_readout(self, name, h1, h2, psd=False):
+    BS = h1.shape.as_list()[0]
+    r_t = self.get_readout_mod(name + '_top')(h1)
+    if psd:
+      r_b = r_t
+    else:
+      r_b = self.get_readout_mod(name + '_bottom')(h2)
+    return tf.reduce_mean(tf.matmul(r_b, r_t, transpose_a=True), axis=0) / BS
+  @snt.reuse_variables
+  def to_delta_size(self, h):
+    with tf.device(self.remote_device):
+      mod = snt.Linear(self.delta_dim)
+      return snt.BatchApply(mod)(h)
+  @snt.reuse_variables
+  def initial_state(self, variables):
+    """The inner optimization state.
+    Args:
+      variables: list of tf.Variable
+        list of variables to get the initial state of.
+    Returns:
+      opt_state: OptState
+    """
+    with tf.device(self.local_device):
+      initial_opt_state = self.opt.get_state(variables)
+    return OptState(
+        variables=variables, opt_state=initial_opt_state, index=tf.constant(0))
+  @snt.reuse_variables
+  def compute_next_state(self, grads, learning_rate, cur_state,
+                         cur_transformer):
+    summaries = []
+    with tf.device(self.local_device):
+      with tf.control_dependencies(summaries):
+        new_vars, new_state = self.opt.compute_updates(
+            cur_state.variables, grads, learning_rate, cur_state.opt_state)
+        pass
+    return OptState(
+        variables=tuple(new_vars),
+        opt_state=new_state,
+        index=cur_state.index + 1)
+  def assign_state(self, base_model, next_state):
+    var_ups = [
+        v.assign(nv) for v, nv in utils.eqzip(base_model.get_variables(),
+                                              next_state.variables)
+    ]
+    opt_ups = self.opt.assign_state(next_state.opt_state)
+    return tf.group(opt_ups, *var_ups)
+  def local_variables(self):
+    return list(self.opt.get_variables())
+  def remote_variables(self):
+    train = list(
+        snt.get_variables_in_module(self, tf.GraphKeys.TRAINABLE_VARIABLES))
+    train += list(
+        snt.get_variables_in_module(self,
+                                    tf.GraphKeys.MOVING_AVERAGE_VARIABLES))
+    return train
+class MoreLocalWeightUpdateWLearner(snt.AbstractModule):
+  """The BaseModel that the UnsupervisedUpdateRule acts on.
+  """
+  def __init__(self,
+               remote_device,
+               local_device,
+               inner_size=128,
+               output_size=32,
+               n_layers=4,
+               shuffle_input=True,
+               activation_fn=tf.nn.relu,
+               identical_updates=True,
+               **kwargs):
+    self.local_device = local_device
+    self.remote_device = remote_device
+    self.inner_size = inner_size
+    self.n_layers = n_layers
+    self.shuffle_input = shuffle_input
+    self.activation_fn = activation_fn
+    self.identical_updates = identical_updates
+    self.output_size = output_size
+    if output_size == None:
+      self.output_size = inner_size
+    self.shuffle_ind = None
+    super(MoreLocalWeightUpdateWLearner, self).__init__(
+        name='LocalWeightUpdateWLearner', **kwargs)
+  @snt.reuse_variables
+  def get_shuffle_ind(self, size):
+    if self.shuffle_ind is None:
+      # put the shuffle in tf memory to make the eval jobs
+      # re-entrant.
+      shuffle_ind_val = np.random.permutation(size)
+      shuffle_ind = tf.get_variable(
+          name='shuffle_ind', dtype=tf.int64, initializer=shuffle_ind_val)
+      unshuffle_ind = tf.scatter_nd(
+          tf.reshape(shuffle_ind, [-1, 1]), tf.range(size), [size])
+    return shuffle_ind, unshuffle_ind
+  def _build(self, batch):
+    image = batch.image
+    x0 = snt.BatchFlatten()(image)
+    if self.shuffle_input:
+      size = x0.shape.as_list()[1]
+      shuffle_ind, unshuffle_ind = self.get_shuffle_ind(size)
+      x0 = tf.gather(x0, shuffle_ind, axis=1)
+    xs = [x0]
+    mods = []
+    zs = []
+    init = {}
+    for i in range(self.n_layers):
+      mod = common.LinearBatchNorm(
+          self.inner_size, activation_fn=self.activation_fn)
+      z, x = mod(xs[i])
+      xs.append(x)
+      zs.append(z)
+      mods.append(mod)
+    mod = common.LinearBatchNorm(
+        self.output_size, activation_fn=self.activation_fn)
+    z, x = mod(xs[-1])
+    mods.append(mod)
+    xs.append(x)
+    zs.append(z)
+    embedding_x = xs[-1]
+    # make a random set of backward mods
+    backward_mods = []
+    for i, (x, x_p1) in enumerate(zip(xs[0:-1], xs[1:])):
+      m = common.LinearBatchNorm(
+          x_p1.shape.as_list()[1], activation_fn=tf.identity)
+      _ = m(x)
+      backward_mods.append(m)
+    shape = image.shape.as_list()[1:4]
+    for mods_p, prefix in [(mods, 'forward'), (backward_mods, 'backward')]:
+      if self.shuffle_input:
+        unshuf_w = tf.gather(mods_p[0].w, unshuffle_ind, axis=0)
+      else:
+        unshuf_w = mods_p[0].w
+      img = summary_utils.first_layer_weight_image(unshuf_w, shape)
+      tf.summary.image(prefix + '_w0_receptive_field', img)
+      for i, m in enumerate(mods_p[0:]):
+        img = summary_utils.inner_layer_weight_image(m.w)
+        tf.summary.image(prefix + '_w%d' % (i + 1), img)
+    img = summary_utils.sorted_images(image, batch.label_onehot)
+    tf.summary.image('inputs', img)
+    # log out pre-activations and activations
+    for all_vis, base_name in [(xs, 'x'), (zs, 'z')]:
+      for i, x_vis in enumerate(all_vis):
+        img = summary_utils.activation_image(x_vis, batch.label_onehot)
+        tf.summary.image('%s%d' % (base_name, i), img)
+    embedding_x = tf.identity(embedding_x)
+    outputs = BaseModelOutputs(
+        xs=xs, zs=zs, mods=mods, batch=batch, backward_mods=backward_mods)
+    return embedding_x, outputs
+  def compute_next_h_d(self, meta_opt, w_bot, w_top, bias, x, z, d, backward_w):
+    """ Propogate error back down the network while computing hidden state.
+    """
+    if z is None:
+      z = x
+    h = meta_opt.compute_h(x, z, d, bias, w_bot,
+                           w_top)  # [bs x 60 x h_channels]
+    # compute the next d
+    delta = meta_opt.next_delta(z, h, d)
+    if backward_w is not None:
+      def delta_matmul(w, delta):
+        d = tf.transpose(delta, [0, 2, 1])  # [bs x delta_channels x n_units)
+        d = snt.BatchApply(lambda x: tf.matmul(x, w, transpose_b=True))(d)
+        d = tf.transpose(d, [0, 2, 1])
+        return d
+      # replace the "backward pass" with a random matrix.
+      d = delta_matmul(backward_w, delta)  # [bs x 60 x delta_channels]
+      var = tf.reduce_mean(tf.square(d), [2], keepdims=True)
+      d = d * tf.rsqrt(1e-6 + var)
+    return h, d
+  def weight_change_for_layer(self, meta_opt, l_idx, w_base, b_base, upper_h,
+                              lower_h, upper_x, lower_x, prefix, include_bias):
+    """Compute the change in weights for each layer.
+    This computes something roughly analagous to a gradient.
+    """
+    reduce_upper_h = upper_h
+    reduce_lower_h = lower_h
+    BS = lower_x.shape.as_list()[0]
+    change_w_terms = dict()
+    # initial weight value normalized
+    # normalize the weights per receptive-field, rather than per-matrix
+    weight_scale = tf.rsqrt(
+        tf.reduce_mean(w_base**2, axis=0, keepdims=True) + 1e-6)
+    w_base *= weight_scale
+    change_w_terms['w_base'] = w_base
+    # this will act to decay larger weights towards zero
+    change_w_terms['large_decay'] = w_base**2 * tf.sign(w_base)
+    # term based on activations
+    ux0 = upper_x - tf.reduce_mean(upper_x, axis=0, keepdims=True)
+    uxs0 = ux0 * tf.rsqrt(tf.reduce_mean(ux0**2, axis=0, keepdims=True) + 1e-6)
+    change_U = tf.matmul(uxs0, uxs0, transpose_a=True) / BS
+    change_U /= tf.sqrt(float(change_U.shape.as_list()[0]))
+    cw = tf.matmul(w_base, change_U)
+    cw_scale = tf.rsqrt(tf.reduce_mean(cw**2 + 1e-8))
+    cw *= cw_scale
+    change_w_terms['decorr_x'] = cw
+    # hebbian term
+    lx0 = lower_x - tf.reduce_mean(lower_x, axis=0, keepdims=True)
+    lxs0 = lx0 * tf.rsqrt(tf.reduce_mean(lx0**2, axis=0, keepdims=True) + 1e-6)
+    cw = tf.matmul(lxs0, uxs0, transpose_a=True) / BS
+    change_w_terms['hebb'] = -cw
+    # 0th order term
+    w_term = meta_opt.low_rank_readout(prefix + 'weight_readout_0', upper_h,
+                                       lower_h)
+    change_w_terms['0_order'] = w_term
+    # # rbf term (weight update scaled by distance from 0)
+    w_term = meta_opt.low_rank_readout(prefix + 'weight_readout_rbf',
+                                       reduce_upper_h, reduce_lower_h)
+    change_w_terms['rbf'] = tf.exp(-w_base**2) * w_term
+    # 1st order term (weight dependent update to weights)
+    w_term = meta_opt.low_rank_readout(prefix + 'weight_readout_1',
+                                       reduce_upper_h, reduce_lower_h)
+    change_w_terms['1_order'] = w_base * w_term
+    # more terms based on single layer readouts.
+    for update_type in ['lin', 'sqr']:
+      for h_source, h_source_name in [(reduce_upper_h, 'upper'),
+                                      (reduce_lower_h, 'lower')]:
+        structures = ['symm']
+        if update_type == 'lin' and h_source_name == 'upper':
+          structures += ['psd']
+        for structure in structures:
+          name = update_type + '_' + h_source_name + '_' + structure
+          if structure == 'symm':
+            change_U = meta_opt.low_rank_readout(prefix + name, h_source,
+                                                 h_source)
+            change_U = (change_U + tf.transpose(change_U)) / tf.sqrt(2.)
+            change_U = tf.matrix_set_diag(change_U,
+                                          tf.zeros(
+                                              [change_U.shape.as_list()[0]]))
+          elif structure == 'psd':
+            change_U = meta_opt.low_rank_readout(
+                prefix + name, h_source, None, psd=True)
+          else:
+            assert False
+          change_U /= tf.sqrt(float(change_U.shape.as_list()[0]))
+          if update_type == 'lin':
+            sign_multiplier = tf.ones_like(w_base)
+            w_base_l = w_base
+          elif update_type == 'sqr':
+            sign_multiplier = tf.sign(w_base)
+            w_base_l = tf.sqrt(1. + w_base**2) - 1.
+          if h_source_name == 'upper':
+            cw = tf.matmul(w_base_l, change_U)  # [N^l-1 x N^l]
+          elif h_source_name == 'lower':
+            cw = tf.matmul(change_U, w_base_l)
+          change_w_terms[name] = cw * sign_multiplier
+    if prefix == 'forward':
+      change_w = meta_opt.merge_change_w_forward(
+          change_w_terms, global_prefix=prefix, prefix='l%d' % l_idx)
+    elif prefix == 'backward':
+      change_w = meta_opt.merge_change_w_backward(
+          change_w_terms, global_prefix=prefix, prefix='l%d' % l_idx)
+    else:
+      assert (False)
+    if not include_bias:
+      return change_w
+    change_b = tf.reduce_mean(meta_opt.bias_readout(upper_h), [0])
+    # force nonlinearities to be exercised -- biases can't all be increased without bound
+    change_b_mean = tf.reduce_mean(change_b)
+    offset = -tf.nn.relu(-change_b_mean)
+    change_b -= offset
+    var = tf.reduce_mean(tf.square(change_b), [0], keepdims=True)
+    change_b = (change_b) / tf.sqrt(0.5 + var)
+    return change_w, change_b
+  def compute_next_state(self, outputs, meta_opt, previous_state):
+    zs = outputs.zs
+    xs = outputs.xs
+    batch = outputs.batch
+    mods = outputs.mods
+    backward_mods = outputs.backward_mods
+    variables = self.get_variables()
+    rev_mods = mods[::-1]
+    rev_backward_mods = backward_mods[::-1]
+    rev_xs = xs[::-1]
+    rev_zs = zs[::-1] + [None]
+    to_top = xs[-1]
+    # variables that change in the loop
+    hs = []
+    d = meta_opt.compute_top_delta(to_top)  # [bs x 32 x delta_channels]
+    iterator = utils.eqzip(rev_backward_mods + [None], rev_mods + [None],
+                           [None] + rev_mods, rev_xs, rev_zs)
+    for (backward_mod, lower_mod, upper_mod, x, z) in iterator:
+      w_bot = None
+      if not lower_mod is None:
+        w_bot = previous_state.variables[variables.index(lower_mod.w)]
+      w_top = None
+      if not upper_mod is None:
+        w_top = previous_state.variables[variables.index(upper_mod.w)]
+      backward_w = None
+      if backward_mod is not None:
+        backward_w = previous_state.variables[variables.index(backward_mod.w)]
+      if lower_mod is not None:
+        bias = previous_state.variables[variables.index(lower_mod.b)]
+      else:
+        bias = tf.zeros([x.shape[1]])
+      h, d = self.compute_next_h_d(
+          meta_opt=meta_opt,
+          w_bot=w_bot,
+          w_top=w_top,
+          bias=bias,
+          backward_w=backward_w,
+          x=x,
+          z=z,
+          d=d)
+      hs.append(h)
+    w_forward_var_idx = [variables.index(mod.w) for mod in rev_mods]
+    w_backward_var_idx = [variables.index(mod.w) for mod in rev_backward_mods]
+    b_var_idx = [variables.index(mod.b) for mod in rev_mods]
+    # storage location for outputs of below loop
+    grads = [None for _ in previous_state.variables]
+    # over-ride learning rate for perturbation variables
+    learning_rate = [None for _ in previous_state.variables]
+    # This is a map -- no state is shared cross loop
+    for l_idx, w_forward_idx, w_backward_idx, b_idx, upper_h, lower_h, lower_x, upper_x in utils.eqzip(
+        range(len(w_forward_var_idx)), w_forward_var_idx, w_backward_var_idx,
+        b_var_idx, hs[:-1], hs[1:], xs[::-1][1:], xs[::-1][:-1]):
+      b_base = previous_state.variables[b_idx]
+      change_w_forward, change_b = self.weight_change_for_layer(
+          meta_opt=meta_opt,
+          l_idx=l_idx,
+          w_base=previous_state.variables[w_forward_idx],
+          b_base=b_base,
+          upper_h=upper_h,
+          lower_h=lower_h,
+          upper_x=upper_x,
+          lower_x=lower_x,
+          prefix='forward',
+          include_bias=True)
+      if self.identical_updates:
+        change_w_backward = change_w_forward
+      else:
+        change_w_backward = self.weight_change_for_layer(
+            meta_opt=meta_opt,
+            l_idx=l_idx,
+            w_base=previous_state.variables[w_backward_idx],
+            b_base=b_base,
+            upper_h=upper_h,
+            lower_h=lower_h,
+            upper_x=upper_x,
+            lower_x=lower_x,
+            prefix='backward',
+            include_bias=False)
+      grads[w_forward_idx] = change_w_forward
+      grads[w_backward_idx] = change_w_backward
+      grads[b_idx] = change_b
+    cur_transformer = common.transformer_at_state(self,
+                                                  previous_state.variables)
+    next_state = meta_opt.compute_next_state(
+        grads,
+        learning_rate=learning_rate,
+        cur_state=previous_state,
+        cur_transformer=lambda x: cur_transformer(x)[0])
+    return next_state
+  def initial_state(self, meta_opt):
+    return meta_opt.initial_state(self.get_variables())
--- a/research/learning_unsupervised_learning/datasets/__init__.py
+++ b/research/learning_unsupervised_learning/datasets/__init__.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import mnist
--- a/research/learning_unsupervised_learning/datasets/common.py
+++ b/research/learning_unsupervised_learning/datasets/common.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import tensorflow as tf
+import numpy as np
+ImageLabelOnehot = collections.namedtuple('ImageLabelOnehot',
+                                          ['image', 'label', 'label_onehot'])
+ImageLabelOnehotRegression = collections.namedtuple(
+    "ImageLabelOnehotRegression",
+    ["image", "label", "label_onehot", "regression_target"])
--- a/research/learning_unsupervised_learning/datasets/mnist.py
+++ b/research/learning_unsupervised_learning/datasets/mnist.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import sonnet as snt
+import tensorflow as tf
+from tensorflow.python.keras.datasets import mnist
+from learning_unsupervised_learning.datasets import common
+class Mnist(snt.AbstractModule):
+  def __init__(self, device, batch_size=128, name="Mnist"):
+    self.device = device
+    self.batch_size = batch_size
+    self._make_dataset()
+    self.iterator = None
+    super(Mnist, self).__init__(name=name)
+  def _make_dataset(self):
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+    x_train = x_train.reshape(60000, 784)
+    x_test = x_test.reshape(10000, 784)
+    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+    dataset = dataset.repeat()
+    dataset = dataset.shuffle(self.batch_size * 3)
+    dataset = dataset.batch(self.batch_size)
+    def _map_fn(image, label):
+      image = tf.to_float(image) / 255.
+      label.set_shape([self.batch_size])
+      label = tf.cast(label, dtype=tf.int32)
+      label_onehot = tf.one_hot(label, 10)
+      image = tf.reshape(image, [self.batch_size, 28, 28, 1])
+      return common.ImageLabelOnehot(
+          image=image, label=label, label_onehot=label_onehot)
+    self.dataset = dataset.map(_map_fn)
+  def _build(self):
+    if self.iterator is None:
+      self.iterator = self.dataset.make_one_shot_iterator()
+    batch = self.iterator.get_next()
+    [b.set_shape([self.batch_size] + b.shape.as_list()[1:]) for b in batch]
+    return batch
+class TinyMnist(Mnist):
+  def __init__(self, *args, **kwargs):
+    kwargs.setdefault("name", "TinyMnist")
+    super(TinyMnist, self).__init__(*args, **kwargs)
+  def _make_dataset(self):
+    super(TinyMnist, self)._make_dataset()
+    def _map_fn(batch):
+      new_img = tf.image.resize_images(batch.image, [14, 14])
+      return common.ImageLabelOnehot(
+          image=new_img, label=batch.label, label_onehot=batch.label_onehot)
+    self.dataset = self.dataset.map(_map_fn)
--- a/research/learning_unsupervised_learning/evaluation.py
+++ b/research/learning_unsupervised_learning/evaluation.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Evaluation job.
+This sits on the side and performs evaluation on a saved model.
+This is a separate process for ease of use and stability of numbers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+from learning_unsupervised_learning import utils
+def construct_evaluation_graph(theta_process_fn=None,
+                               w_learner_fn=None,
+                               dataset_fn=None,
+                               meta_objectives=None,
+                              ):
+  """Construct the evaluation graph.
+  """
+  if meta_objectives is None:
+    meta_objectives = []
+  tf.train.create_global_step()
+  local_device = ""
+  remote_device = ""
+  meta_opt = theta_process_fn(
+      remote_device=remote_device, local_device=local_device)
+  base_model = w_learner_fn(
+      remote_device=remote_device, local_device=local_device)
+  train_dataset = dataset_fn(device=local_device)
+  # construct variables
+  x, outputs = base_model(train_dataset())
+  initial_state = base_model.initial_state(meta_opt, max_steps=10)
+  next_state = base_model.compute_next_state(outputs, meta_opt, initial_state)
+  with utils.state_barrier_context(next_state):
+    train_one_step_op = meta_opt.assign_state(base_model, next_state)
+  meta_objs = []
+  for meta_obj_fn in meta_objectives:
+    meta_obj = meta_obj_fn(local_device="", remote_device="")
+    meta_objs.append(meta_obj)
+    J = meta_obj(train_dataset, lambda x: base_model(x)[0])
+    tf.summary.scalar(str(meta_obj.__class__.__name__)+"_J", tf.reduce_mean(J))
+  # TODO(lmetz) this is kinda error prone.
+  # We should share the construction of the global variables across train and
+  # make sure both sets of savable variables are the same
+  checkpoint_vars = meta_opt.remote_variables() + [tf.train.get_global_step()]
+  for meta_obj in meta_objs:
+    checkpoint_vars.extend(meta_obj.remote_variables())
+  return checkpoint_vars, train_one_step_op, (base_model, train_dataset)
--- a/research/learning_unsupervised_learning/meta_objective/__init__.py
+++ b/research/learning_unsupervised_learning/meta_objective/__init__.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import sklearn
+import linear_regression
--- a/research/learning_unsupervised_learning/meta_objective/linear_regression.py
+++ b/research/learning_unsupervised_learning/meta_objective/linear_regression.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Closed form linear regression.
+Can be differentiated through.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import numpy as np
+import sonnet as snt
+import tensorflow as tf
+from learning_unsupervised_learning import utils
+from learning_unsupervised_learning import variable_replace
+def solve_ridge(x, y, ridge_factor):
+  with tf.name_scope("solve_ridge"):
+    # Added a column of ones to the end of the feature matrix for bias
+    A = tf.concat([x, tf.ones((x.shape.as_list()[0], 1))], axis=1)
+    # Analytic solution for the ridge regression loss
+    inv_target = tf.matmul(A, A, transpose_a=True)
+    np_diag_penalty = ridge_factor * np.ones(
+        A.shape.as_list()[1], dtype="float32")
+    # Remove penalty on bias component of weights
+    np_diag_penalty[-1] = 0.
+    diag_penalty = tf.constant(np_diag_penalty)
+    inv_target += tf.diag(diag_penalty)
+    inv = tf.matrix_inverse(inv_target)
+    w = tf.matmul(inv, tf.matmul(A, y, transpose_a=True))
+    return w
+class LinearRegressionMetaObjective(snt.AbstractModule):
+  """A meta objective based on training Ridge Regression with analytic solution.
+  This is used to evaluate the performance of a given feature set trained in
+  some other manner.
+  """
+  def __init__(self,
+               local_device=None,
+               remote_device=None,
+               zero_one_labels=True,
+               normalize_y_hat=True,
+               normalize_act=False,
+               averages=1,
+               ridge_factor=0.1,
+               center_y=True,
+               hinge_loss=False,
+               samples_per_class=10,
+               test_train_scalar=1.0,
+              ):
+    self._local_device = local_device
+    self._remote_device = remote_device
+    self.zero_one_labels = zero_one_labels
+    self.normalize_y_hat = normalize_y_hat
+    self.normalize_act = normalize_act
+    self.ridge_factor = ridge_factor
+    self.averages = averages
+    self.samples_per_class = samples_per_class
+    self.center_y=center_y
+    self.test_train_scalar=test_train_scalar
+    self.hinge_loss = hinge_loss
+    self.dataset_map = {}
+    super(LinearRegressionMetaObjective,
+          self).__init__(name="LinearRegressionMetaObjective")
+  def _build(self, dataset, feature_transformer):
+    if self.samples_per_class is not None:
+      if dataset not in self.dataset_map:
+        # datasets are outside of frames from while loops
+        with tf.control_dependencies(None):
+          self.dataset_map[dataset] = utils.sample_n_per_class(
+              dataset, self.samples_per_class)
+      dataset = self.dataset_map[dataset]
+    stats = collections.defaultdict(list)
+    losses = []
+    # TODO(lmetz) move this to ingraph control flow?
+    for _ in xrange(self.averages):
+      loss, stat = self._build_once(dataset, feature_transformer)
+      losses.append(loss)
+      for k, v in stat.items():
+        stats[k].append(v)
+    stats = {k: tf.add_n(v) / float(len(v)) for k, v in stats.items()}
+    summary_updates = []
+    for k, v in stats.items():
+      tf.summary.scalar(k, v)
+    with tf.control_dependencies(summary_updates):
+      return tf.add_n(losses) / float(len(losses))
+  def _build_once(self, dataset, feature_transformer):
+    with tf.device(self._local_device):
+      batch = dataset()
+      num_classes = batch.label_onehot.shape.as_list()[1]
+      regression_mod = snt.Linear(num_classes)
+      if self.normalize_act:
+        def normalize_transformer(x):
+          unnorm_x = feature_transformer(x)
+          return tf.nn.l2_normalize(unnorm_x, 0)
+        feature_transformer_wrap = normalize_transformer
+      else:
+        feature_transformer_wrap = feature_transformer
+      # construct the variables of the right shape in the sonnet module by
+      # calling a forward pass through the regressor.
+      with utils.assert_no_new_variables():
+        dummy_features = feature_transformer_wrap(batch)
+      regression_mod(dummy_features)
+      reg_w = regression_mod.w
+      reg_b = regression_mod.b
+      batch_test = dataset()
+      all_batch = utils.structure_map_multi(lambda x: tf.concat(x, 0), [batch, batch_test])
+      #all_batch = tf.concat([batch, batch_test], 0)
+      # Grab a new batch of data from the dataset.
+      features = feature_transformer_wrap(all_batch)
+      features, features_test = utils.structure_map_split(lambda x: tf.split(x, 2, axis=0), features)
+      def center_y(y):
+        y -= tf.reduce_mean(y)
+        y *= tf.rsqrt(tf.reduce_mean(tf.reduce_sum(y**2, axis=[1], keep_dims=True)))
+        return y
+      def get_y_vec(batch):
+        y_pieces = []
+        if hasattr(batch, "label_onehot"):
+          if self.zero_one_labels:
+            y_pieces += [batch.label_onehot]
+          else:
+            y_pieces += [2. * batch.label_onehot - 1.]
+        if hasattr(batch, "regression_target"):
+          y_pieces += [batch.regression_target]
+        y = tf.concat(y_pieces, 1)
+        if self.center_y:
+          y = center_y(y)
+        return y
+      y_train = get_y_vec(batch)
+      w = solve_ridge(features, y_train, self.ridge_factor)
+      # Generate features from another batch to evaluate loss on the validation
+      # set. This provide a less overfit signal to the learned optimizer.
+      y_test = get_y_vec(batch_test)
+      def compute_logit(features):
+        # We have updated the classifier mod in previous steps, we need to
+        # substitute out those variables to get new values.
+        replacement = collections.OrderedDict([(reg_w, w[:-1]), (reg_b, w[-1])])
+        with variable_replace.variable_replace(replacement):
+          logits = regression_mod(features)
+        return logits
+      batch_size = y_train.shape.as_list()[0]
+      logit_train = compute_logit(features)
+      logit_test_unnorm = compute_logit(features_test)
+      if self.normalize_y_hat:
+        logit_test = logit_test_unnorm / tf.sqrt(
+            tf.reduce_sum(logit_test_unnorm**2, axis=[1], keep_dims=True))
+      else:
+        logit_test = logit_test_unnorm
+      stats = {}
+      if self.hinge_loss:
+        # slightly closer to the true classification loss
+        # any distance smaller than 1 is guaranteed to map to the correct class
+        mse_test = tf.reduce_sum(tf.nn.relu(tf.reduce_sum(tf.square(logit_test - y_test), axis=1)-1.)) / batch_size
+      else:
+        mse_test = tf.reduce_sum(tf.square(logit_test - y_test)) / batch_size
+      stats["mse_test"] = mse_test
+      mse_train = tf.reduce_sum(tf.square(logit_train - y_train)) / batch_size
+      stats["mse_train"] = mse_train
+      is_correct_test = tf.equal(tf.argmax(logit_test, 1), tf.argmax(y_test, 1))
+      accuracy_test = tf.reduce_mean(tf.cast(is_correct_test, tf.float32))
+      stats["accuracy_test"] = accuracy_test
+      def test_confusion_fn():
+        test_confusion = tf.confusion_matrix(tf.argmax(y_test, 1), tf.argmax(logit_test, 1))
+        test_confusion = tf.to_float(test_confusion) / tf.constant((logit_test.shape.as_list()[0] / float(logit_test.shape.as_list()[1])), dtype=tf.float32)
+        test_confusion = tf.expand_dims(tf.expand_dims(test_confusion, 0), 3)
+        return test_confusion
+      tf.summary.image("test_confusion", test_confusion_fn())
+      def train_confusion_fn():
+        train_confusion = tf.confusion_matrix(tf.argmax(y_train, 1), tf.argmax(logit_train, 1))
+        train_confusion = tf.to_float(train_confusion) / tf.constant((logit_train.shape.as_list()[0] / float(logit_train.shape.as_list()[1])), dtype=tf.float32)
+        train_confusion = tf.expand_dims(tf.expand_dims(train_confusion, 0), 3)
+        return train_confusion
+      tf.summary.image("train_confusion", train_confusion_fn())
+      is_correct = tf.equal(tf.argmax(logit_train, 1), tf.argmax(y_train, 1))
+      accuracy_train = tf.reduce_mean(tf.cast(is_correct, tf.float32))
+      stats["accuracy_train"] = accuracy_train
+      reg = self.ridge_factor * tf.reduce_sum(tf.square(w[:-1])) / batch_size
+      stats["ridge_component"] = reg
+      stats["total_loss"] = mse_test + reg
+      loss_to_train_at = (reg+ mse_test) * self.test_train_scalar + (mse_train + reg)*(1 - self.test_train_scalar)
+      loss_to_train_at = tf.identity(loss_to_train_at)
+      # Minimizing the test loss should not require regurization because the
+      # metaobjective is solved for the training loss
+      return loss_to_train_at, stats
+  def local_variables(self):
+    """List of variables that need to be updated for each evaluation.
+    These variables should not be stored on a parameter server and
+    should be reset every computation of a meta_objective loss.
+    Returns:
+      vars: list of tf.Variable
+    """
+    return list(
+        snt.get_variables_in_module(self, tf.GraphKeys.TRAINABLE_VARIABLES))
+  def remote_variables(self):
+    return []
--- a/research/learning_unsupervised_learning/meta_objective/sklearn.py
+++ b/research/learning_unsupervised_learning/meta_objective/sklearn.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Can NOT be differentiated through.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import numpy as np
+import sonnet as snt
+import tensorflow as tf
+from tensorflow.python.framework import function
+from learning_unsupervised_learning import utils
+from learning_unsupervised_learning.meta_objective import utils as meta_obj_utils
+from sklearn import svm
+from sklearn import linear_model
+def build_fit(device, model_fn, num_classes, probs=True):
+  def _py_fit_predict(trX, trY, teX):
+    assert len(np.unique(trY)) == num_classes
+    model = model_fn()
+    model.fit(trX, trY)
+    trP = model.predict(trX)
+    teP = model.predict(teX)
+    if probs:
+      teP_probs = model.predict_log_proba(teX)
+      return trP.astype(np.int64), teP.astype(np.int64), teP_probs.astype(
+          np.float32)
+    else:
+      teP = model.predict(teX)
+      return trP.astype(np.int64), teP.astype(np.int64)
+  def return_fn(trX, trY, teX):
+    with tf.device(device):
+      with tf.device("/cpu:0"):
+        if probs:
+          return tf.py_func(
+              _py_fit_predict,
+              [tf.identity(trX),
+               tf.identity(trY),
+               tf.identity(teX)], [tf.int64, tf.int64, tf.float32])
+        else:
+          return tf.py_func(
+              _py_fit_predict,
+              [tf.identity(trX),
+               tf.identity(trY),
+               tf.identity(teX)], [tf.int64, tf.int64])
+  return return_fn
+class SKLearn(meta_obj_utils.MultiTrialMetaObjective):
+  def __init__(
+      self,
+      local_device=None,
+      remote_device=None,
+      averages=1,
+      samples_per_class=10,
+      probs=False,
+      stddev=0.01,
+      n_samples=10,
+      name="SKLearn",
+  ):
+    self._local_device = local_device
+    self._remote_device = remote_device
+    self.name = name
+    self.probs = probs
+    self.n_samples = n_samples
+    self.stddev = stddev
+    super(SKLearn, self).__init__(
+        name=name, samples_per_class=samples_per_class, averages=averages)
+  def _get_model(self):
+    raise NotImplemented()
+  def _build_once(self, dataset, feature_transformer):
+    with tf.device(self._local_device):
+      tr_batch = dataset()
+      te_batch = dataset()
+      num_classes = tr_batch.label_onehot.shape.as_list()[1]
+      all_batch = utils.structure_map_multi(lambda x: tf.concat(x, 0),
+                                            [tr_batch, te_batch])
+      features = feature_transformer(all_batch)
+      trX, teX = utils.structure_map_split(lambda x: tf.split(x, 2, axis=0),
+                                           features)
+      trY = tf.to_int64(tr_batch.label)
+      trY_onehot = tf.to_int32(tr_batch.label_onehot)
+      teY = tf.to_int64(te_batch.label)
+      teY_shape = teY.shape.as_list()
+      def blackbox((trX, trY, teX, teY)):
+        trY = tf.to_int32(tf.rint(trY))
+        teY = tf.to_int32(tf.rint(teY))
+        tf_fn = build_fit(
+            self._local_device,
+            self._get_model,
+            num_classes=num_classes,
+            probs=self.probs)
+        if self.probs:
+          trP, teP, teP_probs = tf_fn(trX, trY, teX)
+        else:
+          trP, teP = tf_fn(trX, trY, teX)
+        teY.set_shape(teY_shape)
+        if self.probs:
+          onehot = tf.one_hot(teY, num_classes)
+          crossent = -tf.reduce_sum(onehot * teP_probs, [1])
+          return tf.reduce_mean(crossent)
+        else:
+          # use error rate as the loss if no surrogate is avalible.
+          return 1 - tf.reduce_mean(
+              tf.to_float(tf.equal(teY, tf.to_int32(teP))))
+      test_loss = blackbox((trX, tf.to_float(trY), teX, tf.to_float(teY)))
+      stats = {}
+      tf_fn = build_fit(
+          self._local_device,
+          self._get_model,
+          num_classes=num_classes,
+          probs=self.probs)
+      if self.probs:
+        trP, teP, teP_probs = tf_fn(trX, trY, teX)
+      else:
+        trP, teP = tf_fn(trX, trY, teX)
+      stats["%s/accuracy_train" % self.name] = tf.reduce_mean(
+          tf.to_float(tf.equal(tf.to_int32(trY), tf.to_int32(trP))))
+      stats["%s/accuracy_test" % self.name] = tf.reduce_mean(
+          tf.to_float(tf.equal(tf.to_int32(teY), tf.to_int32(teP))))
+      stats["%s/test_loss" % self.name] = test_loss
+      return test_loss, stats
+class LogisticRegression(SKLearn):
+  def __init__(self, C=1.0, name="LogisticRegression", probs=True, **kwargs):
+    self.C = C
+    super(LogisticRegression, self).__init__(name=name, probs=probs, **kwargs)
+  def _get_model(self):
+    return linear_model.LogisticRegression(C=self.C)
--- a/research/learning_unsupervised_learning/meta_objective/utils.py
+++ b/research/learning_unsupervised_learning/meta_objective/utils.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import numpy as np
+import sonnet as snt
+import tensorflow as tf
+from learning_unsupervised_learning import optimizers
+from learning_unsupervised_learning import utils
+from learning_unsupervised_learning import summary_utils
+from learning_unsupervised_learning import variable_replace
+class MultiTrialMetaObjective(snt.AbstractModule):
+  def __init__(self, samples_per_class, averages, **kwargs):
+    self.samples_per_class = samples_per_class
+    self.averages = averages
+    self.dataset_map = {}
+    super(MultiTrialMetaObjective,
+          self).__init__(**kwargs)
+  def _build(self, dataset, feature_transformer):
+    if self.samples_per_class is not None:
+      if dataset not in self.dataset_map:
+        # datasets are outside of frames from while loops
+        with tf.control_dependencies(None):
+          self.dataset_map[dataset] = utils.sample_n_per_class(
+              dataset, self.samples_per_class)
+      dataset = self.dataset_map[dataset]
+    stats = collections.defaultdict(list)
+    losses = []
+    # TODO(lmetz) move this to ingraph control flow?
+    for _ in xrange(self.averages):
+      loss, stat = self._build_once(dataset, feature_transformer)
+      losses.append(loss)
+      for k, v in stat.items():
+        stats[k].append(v)
+    stats = {k: tf.add_n(v) / float(len(v)) for k, v in stats.items()}
+    for k, v in stats.items():
+      tf.summary.scalar(k, v)
+    return tf.add_n(losses) / float(len(losses))
+  def local_variables(self):
+    """List of variables that need to be updated for each evaluation.
+    These variables should not be stored on a parameter server and
+    should be reset every computation of a meta_objective loss.
+    Returns:
+      vars: list of tf.Variable
+    """
+    return list(
+        snt.get_variables_in_module(self, tf.GraphKeys.TRAINABLE_VARIABLES))
+  def remote_variables(self):
+    return []
--- a/research/learning_unsupervised_learning/optimizers.py
+++ b/research/learning_unsupervised_learning/optimizers.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Optimizers for use in unrolled optimization.
+These optimizers contain a compute_updates function and its own ability to keep
+track of internal state.
+These functions can be used with a tf.while_loop to perform multiple training
+steps per sess.run.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import abc
+import collections
+import tensorflow as tf
+import sonnet as snt
+from learning_unsupervised_learning import utils
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+class UnrollableOptimizer(snt.AbstractModule):
+  """Interface for optimizers that can be used in unrolled computation.
+  apply_gradients is derrived from compute_update and assign_state.
+  """
+  def __init__(self, *args, **kwargs):
+    super(UnrollableOptimizer, self).__init__(*args, **kwargs)
+    self()
+  @abc.abstractmethod
+  def compute_updates(self, xs, gs, state=None):
+    """Compute next step updates for a given variable list and state.
+    Args:
+      xs: list of tensors
+        The "variables" to perform an update on.
+        Note these must match the same order for which get_state was originally
+        called.
+      gs: list of tensors
+        Gradients of `xs` with respect to some loss.
+      state: Any
+        Optimizer specific state to keep track of accumulators such as momentum
+        terms
+    """
+    raise NotImplementedError()
+  def _build(self):
+    pass
+  @abc.abstractmethod
+  def get_state(self, var_list):
+    """Get the state value associated with a list of tf.Variables.
+    This state is commonly going to be a NamedTuple that contains some
+    mapping between variables and the state associated with those variables.
+    This state could be a moving momentum variable tracked by the optimizer.
+    Args:
+        var_list: list of tf.Variable
+    Returns:
+      state: Any
+        Optimizer specific state
+    """
+    raise NotImplementedError()
+  def assign_state(self, state):
+    """Assigns the state to the optimizers internal variables.
+    Args:
+      state: Any
+    Returns:
+      op: tf.Operation
+        The operation that performs the assignment.
+    """
+    raise NotImplementedError()
+  def apply_gradients(self, grad_vars):
+    gradients, variables = zip(*grad_vars)
+    state = self.get_state(variables)
+    new_vars, new_state = self.compute_updates(variables, gradients, state)
+    assign_op = self.assign_state(new_state)
+    op = utils.assign_variables(variables, new_vars)
+    return tf.group(assign_op, op, name="apply_gradients")
+class UnrollableGradientDescentRollingOptimizer(UnrollableOptimizer):
+  def __init__(self,
+               learning_rate,
+               name="UnrollableGradientDescentRollingOptimizer"):
+    self.learning_rate = learning_rate
+    super(UnrollableGradientDescentRollingOptimizer, self).__init__(name=name)
+  def compute_updates(self, xs, gs, learning_rates, state):
+    new_vars = []
+    for x, g, lr in utils.eqzip(xs, gs, learning_rates):
+      if lr is None:
+        lr = self.learning_rate
+      if g is not None:
+        new_vars.append((x * (1 - lr) - g * lr))
+      else:
+        new_vars.append(x)
+    return new_vars, state
+  def get_state(self, var_list):
+    return tf.constant(0.0)
+  def assign_state(self, state, var_list=None):
+    return tf.no_op()
--- a/research/learning_unsupervised_learning/run_eval.py
+++ b/research/learning_unsupervised_learning/run_eval.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+""" Script that iteratively applies the unsupervised update rule and evaluates the
+meta-objective performance.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl import flags
+from absl import app
+from learning_unsupervised_learning import evaluation
+from learning_unsupervised_learning import datasets
+from learning_unsupervised_learning import architectures
+from learning_unsupervised_learning import summary_utils
+from learning_unsupervised_learning import meta_objective
+import tensorflow as tf
+import sonnet as snt
+from tensorflow.contrib.framework.python.framework import checkpoint_utils
+flags.DEFINE_string("checkpoint", None, "Dir to load pretrained update rule from")
+flags.DEFINE_string("train_log_dir", None, "Training log directory")
+FLAGS = flags.FLAGS
+def train(train_log_dir, checkpoint, eval_every_n_steps=10, num_steps=3000):
+  dataset_fn = datasets.mnist.TinyMnist
+  w_learner_fn = architectures.more_local_weight_update.MoreLocalWeightUpdateWLearner
+  theta_process_fn = architectures.more_local_weight_update.MoreLocalWeightUpdateProcess
+  meta_objectives = []
+  meta_objectives.append(
+      meta_objective.linear_regression.LinearRegressionMetaObjective)
+  meta_objectives.append(meta_objective.sklearn.LogisticRegression)
+  checkpoint_vars, train_one_step_op, (
+      base_model, dataset) = evaluation.construct_evaluation_graph(
+          theta_process_fn=theta_process_fn,
+          w_learner_fn=w_learner_fn,
+          dataset_fn=dataset_fn,
+          meta_objectives=meta_objectives)
+  batch = dataset()
+  pre_logit, outputs = base_model(batch)
+  global_step = tf.train.get_or_create_global_step()
+  var_list = list(
+      snt.get_variables_in_module(base_model, tf.GraphKeys.TRAINABLE_VARIABLES))
+  tf.logging.info("all vars")
+  for v in tf.all_variables():
+    tf.logging.info("   %s" % str(v))
+  global_step = tf.train.get_global_step()
+  accumulate_global_step = global_step.assign_add(1)
+  reset_global_step = global_step.assign(0)
+  train_op = tf.group(
+      train_one_step_op, accumulate_global_step, name="train_op")
+  summary_op = tf.summary.merge_all()
+  file_writer = summary_utils.LoggingFileWriter(train_log_dir, regexes=[".*"])
+  if checkpoint:
+    str_var_list = checkpoint_utils.list_variables(checkpoint)
+    name_to_v_map = {v.op.name: v for v in tf.all_variables()}
+    var_list = [
+        name_to_v_map[vn] for vn, _ in str_var_list if vn in name_to_v_map
+    ]
+    saver = tf.train.Saver(var_list)
+    missed_variables = [
+        v.op.name for v in set(
+            snt.get_variables_in_scope("LocalWeightUpdateProcess",
+                                       tf.GraphKeys.GLOBAL_VARIABLES)) -
+        set(var_list)
+    ]
+    assert len(missed_variables) == 0, "Missed a theta variable."
+  hooks = []
+  with tf.train.SingularMonitoredSession(master="", hooks=hooks) as sess:
+    # global step should be restored from the evals job checkpoint or zero for fresh.
+    step = sess.run(global_step)
+    if step == 0 and checkpoint:
+      tf.logging.info("force restore")
+      saver.restore(sess, checkpoint)
+      tf.logging.info("force restore done")
+      sess.run(reset_global_step)
+      step = sess.run(global_step)
+    while step < num_steps:
+      if step % eval_every_n_steps == 0:
+        s, _, step = sess.run([summary_op, train_op, global_step])
+        file_writer.add_summary(s, step)
+      else:
+        _, step = sess.run([train_op, global_step])
+def main(argv):
+  train(FLAGS.train_log_dir, FLAGS.checkpoint)
+if __name__ == "__main__":
+  app.run(main)
--- a/research/learning_unsupervised_learning/summary_utils.py
+++ b/research/learning_unsupervised_learning/summary_utils.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import collections
+import functools
+import threading
+import tensorflow as tf
+import matplotlib
+import numpy as np
+import time
+import re
+import math
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import scipy.signal
+from tensorflow.python.util import tf_should_use
+from tensorflow.contrib.summary import summary_ops
+from tensorflow.python.ops import summary_op_util
+from tensorflow.contrib.summary import gen_summary_ops
+_DEBUG_DISABLE_SUMMARIES=False
+class LoggingFileWriter(tf.summary.FileWriter):
+  """A FileWriter that also logs things out.
+  This is entirely for ease of debugging / not having to open up Tensorboard
+  a lot.
+  """
+  def __init__(self, logdir, regexes=[], **kwargs):
+    self.regexes = regexes
+    super(LoggingFileWriter, self).__init__(logdir, **kwargs)
+  def add_summary(self, summary, global_step):
+    if type(summary) != tf.Summary:
+      summary_p = tf.Summary()
+      summary_p.ParseFromString(summary)
+      summary = summary_p
+    for s in summary.value:
+      for exists in [re.match(p, s.tag) for p in self.regexes]:
+        if exists is not None:
+          tf.logging.info("%d ] %s : %f", global_step, s.tag, s.simple_value)
+          break
+    super(LoggingFileWriter, self).add_summary(summary, global_step)
+def image_grid(images, max_grid_size=4, border=1):
+  """Given images and N, return first N^2 images as an NxN image grid.
+  Args:
+    images: a `Tensor` of size [batch_size, height, width, channels]
+    max_grid_size: Maximum image grid height/width
+  Returns:
+    Single image batch, of dim [1, h*n, w*n, c]
+  """
+  batch_size = images.shape.as_list()[0]
+  to_pad = int((np.ceil(np.sqrt(batch_size)))**2 - batch_size)
+  images = tf.pad(images, [[0, to_pad], [0, border], [0, border], [0, 0]])
+  batch_size = images.shape.as_list()[0]
+  grid_size = min(int(np.sqrt(batch_size)), max_grid_size)
+  assert images.shape.as_list()[0] >= grid_size * grid_size
+  # If we have a depth channel
+  if images.shape.as_list()[-1] == 4:
+    images = images[:grid_size * grid_size, :, :, 0:3]
+    depth = tf.image.grayscale_to_rgb(images[:grid_size * grid_size, :, :, 3:4])
+    images = tf.reshape(images, [-1, images.shape.as_list()[2], 3])
+    split = tf.split(images, grid_size, axis=0)
+    depth = tf.reshape(depth, [-1, images.shape.as_list()[2], 3])
+    depth_split = tf.split(depth, grid_size, axis=0)
+    grid = tf.concat(split + depth_split, 1)
+    return tf.expand_dims(grid, 0)
+  else:
+    images = images[:grid_size * grid_size, :, :, :]
+    images = tf.reshape(
+        images, [-1, images.shape.as_list()[2],
+                 images.shape.as_list()[3]])
+    split = tf.split(value=images, num_or_size_splits=grid_size, axis=0)
+    grid = tf.concat(split, 1)
+    return tf.expand_dims(grid, 0)
+def first_layer_weight_image(weight, shape):
+  weight_image = tf.reshape(weight,
+                            shape + [tf.identity(weight).shape.as_list()[1]])
+  # [winx, winy, wout]
+  mean, var = tf.nn.moments(weight_image, [0,1,2], keep_dims=True)
+  #mean, var = tf.nn.moments(weight_image, [0,1], keep_dims=True)
+  weight_image = (weight_image - mean) / tf.sqrt(var + 1e-5)
+  weight_image = (weight_image + 1.0) / 2.0
+  weight_image = tf.clip_by_value(weight_image, 0, 1)
+  weight_image = tf.transpose(weight_image, (3, 0, 1, 2))
+  grid = image_grid(weight_image, max_grid_size=10)
+  return grid
+def inner_layer_weight_image(weight):
+  """Visualize a weight matrix of an inner layer.
+  Add padding to make it square, then visualize as a gray scale image
+  """
+  weight = tf.identity(weight) # turn into a tensor
+  weight = weight / (tf.reduce_max(tf.abs(weight), [0], keep_dims=True))
+  weight = tf.reshape(weight, [1]+weight.shape.as_list() + [1])
+  return weight
+def activation_image(activations, label_onehot):
+  """Make a row sorted by class for each activation. Put a black line around the activations."""
+  labels = tf.argmax(label_onehot, axis=1)
+  _, n_classes = label_onehot.shape.as_list()
+  mean, var = tf.nn.moments(activations, [0, 1])
+  activations = (activations - mean)/tf.sqrt(var+1e-5)
+  activations = tf.clip_by_value(activations, -1, 1)
+  activations = (activations + 1.0) / 2.0 # shift to [0, 1]
+  canvas = []
+  for i in xrange(n_classes):
+    inds = tf.where(tf.equal(labels, i))
+    def _gather():
+      return tf.squeeze(tf.gather(activations, inds), 1)
+    def _empty():
+      return tf.zeros([0, activations.shape.as_list()[1]], dtype=tf.float32)
+    assert inds.shape.as_list()[0] is None
+    x = tf.cond(tf.equal(tf.shape(inds)[0], 0), _empty, _gather)
+    canvas.append(x)
+    canvas.append(tf.zeros([1, activations.shape.as_list()[1]]))
+  canvas = tf.concat(canvas, 0)
+  canvas = tf.reshape(canvas, [1, activations.shape.as_list()[0]+n_classes, canvas.shape.as_list()[1], 1])
+  return canvas
+def sorted_images(images, label_onehot):
+  # images is [bs, x, y, c]
+  labels = tf.argmax(label_onehot, axis=1)
+  _, n_classes = label_onehot.shape.as_list()
+  to_stack = []
+  for i in xrange(n_classes):
+    inds = tf.where(tf.equal(labels, i))
+    def _gather():
+      return tf.squeeze(tf.gather(images, inds), 1)
+    def _empty():
+      return tf.zeros([0] + images.shape.as_list()[1:], dtype=tf.float32)
+    assert inds.shape.as_list()[0] is None
+    x = tf.cond(tf.equal(tf.shape(inds)[0], 0), _empty, _gather)
+    to_stack.append(x)
+  # pad / trim all up to 10.
+  padded = []
+  for t in to_stack:
+    n_found = tf.shape(t)[0]
+    pad = tf.pad(t[0:10], tf.stack([tf.stack([0,tf.maximum(0, 10-n_found)]), [0,0], [0,0], [0,0]]))
+    padded.append(pad)
+  xs = [tf.concat(tf.split(p, 10), axis=1) for p in padded]
+  ys = tf.concat(xs, axis=2)
+  ys = tf.cast(tf.clip_by_value(ys, 0., 1.) * 255., tf.uint8)
+  return ys
--- a/research/learning_unsupervised_learning/utils.py
+++ b/research/learning_unsupervised_learning/utils.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import contextlib
+import tensorflow as tf
+import sonnet as snt
+import itertools
+import functools
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import variable_scope as variable_scope_ops
+from sonnet.python.modules import util as snt_util
+from tensorflow.python.util import nest
+def eqzip(*args):
+  """Zip but raises error if lengths don't match.
+  Args:
+    *args: list of lists or tuples
+  Returns:
+    list: the result of zip
+  Raises:
+    ValueError: when the lengths don't match
+  """
+  sizes = [len(x) for x in args]
+  if not all([sizes[0] == x for x in sizes]):
+    raise ValueError("Lists are of different sizes. \n %s"%str(sizes))
+  return zip(*args)
+@contextlib.contextmanager
+def assert_no_new_variables():
+  """Ensure that no tf.Variables are constructed inside the context.
+  Yields:
+    None
+  Raises:
+    ValueError: if there is a variable created.
+  """
+  num_vars = len(tf.global_variables())
+  old_variables = tf.global_variables()
+  yield
+  if len(tf.global_variables()) != num_vars:
+    new_vars = set(tf.global_variables()) - set(old_variables)
+    tf.logging.error("NEW VARIABLES CREATED")
+    tf.logging.error(10*"=")
+    for v in new_vars:
+      tf.logging.error(v)
+    raise ValueError("Variables created inside an "
+                     "assert_no_new_variables context")
+  if old_variables != tf.global_variables():
+    raise ValueError("Variables somehow changed inside an "
+                     "assert_no_new_variables context."
+                     "This means something modified the tf.global_variables()")
+def get_variables_in_modules(module_list):
+  var_list = []
+  for m in module_list:
+    var_list.extend(snt.get_variables_in_module(m))
+  return var_list
+def state_barrier_context(state):
+  """Return a context manager that prevents interior ops from running
+  unless the whole state has been computed.
+  This is to prevent assign race conditions.
+  """
+  tensors = [x for x in nest.flatten(state) if type(x) == tf.Tensor]
+  tarray = [x.flow for x in nest.flatten(state) if hasattr(x, "flow")]
+  return tf.control_dependencies(tensors + tarray)
+def _identity_fn(tf_entity):
+  if hasattr(tf_entity, "identity"):
+    return tf_entity.identity()
+  else:
+    return tf.identity(tf_entity)
+def state_barrier_result(state):
+  """Return the same state, but with a control dependency to prevent it from
+  being partially computed
+  """
+  with state_barrier_context(state):
+    return nest.map_structure(_identity_fn, state)
+def train_iterator(num_iterations):
+  """Iterator that returns an index of the current step.
+  This iterator runs forever if num_iterations is None
+  otherwise it runs for some fixed amount of steps.
+  """
+  if num_iterations is None:
+    return itertools.count()
+  else:
+    return xrange(num_iterations)
+def print_op(op, msg):
+  """Print a string and return an op wrapped in a control dependency to make
+  sure it ran."""
+  print_op = tf.Print(tf.constant(0), [tf.constant(0)], msg)
+  return tf.group(op, print_op)
+class MultiQueueRunner(tf.train.QueueRunner):
+  """A QueueRunner with multiple queues """
+  def __init__(self, queues, enqueue_ops):
+    close_op = tf.group(* [q.close() for q in queues])
+    cancel_op = tf.group(
+        * [q.close(cancel_pending_enqueues=True) for q in queues])
+    queue_closed_exception_types = (errors.OutOfRangeError,)
+    enqueue_op = tf.group(*enqueue_ops, name="multi_enqueue")
+    super(MultiQueueRunner, self).__init__(
+        queues[0],
+        enqueue_ops=[enqueue_op],
+        close_op=close_op,
+        cancel_op=cancel_op,
+        queue_closed_exception_types=queue_closed_exception_types)
+# This function is not elegant, but I tried so many other ways to get this to
+# work and this is the only one that ended up not incuring significant overhead
+# or obscure tensorflow bugs.
+def sample_n_per_class(dataset, samples_per_class):
+  """Create a new callable / dataset object that returns batches of each with
+  samples_per_class per label.
+  Args:
+    dataset: fn
+    samples_per_class: int
+  Returns:
+    function, [] -> batch where batch is the same type as the return of
+    dataset().
+  """
+  with tf.control_dependencies(None), tf.name_scope(None):
+    with tf.name_scope("queue_runner/sample_n_per_class"):
+      batch = dataset()
+      num_classes = batch.label_onehot.shape.as_list()[1]
+      batch_size = num_classes * samples_per_class
+      flatten = nest.flatten(batch)
+      queues = []
+      enqueue_ops = []
+      capacity = samples_per_class * 20
+      for i in xrange(num_classes):
+        queue = tf.FIFOQueue(
+            capacity=capacity,
+            shapes=[f.shape.as_list()[1:] for f in flatten],
+            dtypes=[f.dtype for f in flatten])
+        queues.append(queue)
+        idx = tf.where(tf.equal(batch.label, i))
+        sub_batch = []
+        to_enqueue = []
+        for elem in batch:
+          new_e = tf.gather(elem, idx)
+          new_e = tf.squeeze(new_e, 1)
+          to_enqueue.append(new_e)
+        remaining = (capacity - queue.size())
+        to_add = tf.minimum(tf.shape(idx)[0], remaining)
+        def _enqueue():
+          return queue.enqueue_many([t[:to_add] for t in to_enqueue])
+        enqueue_op = tf.cond(
+            tf.equal(to_add, 0), tf.no_op, _enqueue)
+        enqueue_ops.append(enqueue_op)
+      # This has caused many deadlocks / issues. This is some logging to at least
+      # shed light to what is going on.
+      print_lam = lambda: tf.Print(tf.constant(0.0), [q.size() for q in queues], "MultiQueueRunner queues status. Has capacity %d"%capacity)
+      some_percent_of_time = tf.less(tf.random_uniform([]), 0.0005)
+      maybe_print = tf.cond(some_percent_of_time, print_lam, lambda: tf.constant(0.0))
+      with tf.control_dependencies([maybe_print]):
+        enqueue_ops = [tf.group(e) for e in enqueue_ops]
+      qr = MultiQueueRunner(queues=queues, enqueue_ops=enqueue_ops)
+      tf.train.add_queue_runner(qr)
+  def dequeue_batch():
+    with tf.name_scope("sample_n_per_batch/dequeue/"):
+      entries = []
+      for q in queues:
+        entries.append(q.dequeue_many(samples_per_class))
+      flat_batch = [tf.concat(x, 0) for x in zip(*entries)]
+      idx = tf.random_shuffle(tf.range(batch_size))
+      flat_batch = [tf.gather(f, idx, axis=0) for f in flat_batch]
+      return nest.pack_sequence_as(batch, flat_batch)
+  return dequeue_batch
+def structure_map_multi(func, values):
+  all_values = [nest.flatten(v) for v in values]
+  rets = []
+  for pair in zip(*all_values):
+    rets.append(func(pair))
+  return nest.pack_sequence_as(values[0], rets)
+def structure_map_split(func, value):
+  vv = nest.flatten(value)
+  rets = []
+  for v in vv:
+    rets.append(func(v))
+  return [nest.pack_sequence_as(value, r) for r in zip(*rets)]
+def assign_variables(targets, values):
+  return tf.group(*[t.assign(v) for t,v in eqzip(targets, values)],
+                  name="assign_variables")
+def create_variables_in_class_scope(method):
+  """Force the variables constructed in this class to live in the sonnet module.
+  Wraps a method on a sonnet module.
+  For example the following will create two different variables.
+  ```
+  class Mod(snt.AbstractModule):
+    @create_variables_in_class_scope
+    def dynamic_thing(self, input, name):
+      return snt.Linear(name)(input)
+  mod.dynamic_thing(x, name="module_nameA")
+  mod.dynamic_thing(x, name="module_nameB")
+  # reuse
+  mod.dynamic_thing(y, name="module_nameA")
+  ```
+  """
+  @functools.wraps(method)
+  def wrapper(obj, *args, **kwargs):
+    def default_context_manager(reuse=None):
+      variable_scope = obj.variable_scope
+      return tf.variable_scope(variable_scope, reuse=reuse)
+    variable_scope_context_manager = getattr(obj, "_enter_variable_scope",
+                                             default_context_manager)
+    graph = tf.get_default_graph()
+    # Temporarily enter the variable scope to capture it
+    with variable_scope_context_manager() as tmp_variable_scope:
+      variable_scope = tmp_variable_scope
+    with variable_scope_ops._pure_variable_scope(
+        variable_scope, reuse=tf.AUTO_REUSE) as pure_variable_scope:
+      name_scope = variable_scope.original_name_scope
+      if name_scope[-1] != "/":
+        name_scope += "/"
+      with tf.name_scope(name_scope):
+        sub_scope = snt_util.to_snake_case(method.__name__)
+        with tf.name_scope(sub_scope) as scope:
+          out_ops = method(obj, *args, **kwargs)
+          return out_ops
+  return wrapper