add flag for saving images to summary; strings moved to common.py'

b3247557 · Dheera Venkatraman · 75c931fd · 2041d5ca · b3247557 · b3247557
Commit b3247557 authored Apr 04, 2018 by Dheera Venkatraman
20 changed files
--- a/research/differential_privacy/pate/core_test.py
+++ b/research/differential_privacy/pate/core_test.py
+# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for the PATE privacy analysis functions in core.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import unittest
+import numpy as np
+
+import core as pate
+
+
+class PateTest(unittest.TestCase):
+  """Unit tests for rdp_gaussian and compute_eps_from_delta from core.
+
+  The underscore-prefixed helpers are not collected by unittest on their own.
+  They are invoked from the two test_* entry points at the bottom of the class
+  (except _test_compute_q0, which nothing in this class calls).
+  """
+
+  def _test_rdp_gaussian_value_errors(self):
+    # Test for ValueErrors.
+    # rdp_gaussian is called as (logq, sigma, orders). Presumably the rejected
+    # inputs are, in turn, a positive logq, a negative sigma and an order equal
+    # to 1 -- confirm against core.rdp_gaussian.
+    with self.assertRaises(ValueError):
+      pate.rdp_gaussian(1.0, 1.0, np.array([2, 3, 4]))
+    with self.assertRaises(ValueError):
+      pate.rdp_gaussian(np.log(0.5), -1.0, np.array([2, 3, 4]))
+    with self.assertRaises(ValueError):
+      pate.rdp_gaussian(np.log(0.5), 1.0, np.array([1, 3, 4]))
+
+  def _test_rdp_gaussian_as_function_of_q(self):
+    # Test for data-independent and data-dependent ranges over q.
+    # The following corresponds to orders 1.1, 2.5, 32, 250
+    # sigmas 1.5, 15, 1500, 15000.
+    # Hand calculated -log(q0)s arranged in a 'sigma major' ordering.
+    # (One entry per (sigma, order) pair, consumed in the order of the nested
+    # loops below; None marks a pair that is skipped.)
+    neglogq0s = [
+        2.8, 2.6, 427, None, 4.8, 4.0, 4.7, 275, 9.6, 8.8, 6.0, 4, 12, 11.2,
+        8.6, 6.4
+    ]
+    idx_neglogq0s = 0  # To iterate through neglogq0s.
+    orders = [1.1, 2.5, 32, 250]
+    sigmas = [1.5, 15, 1500, 15000]
+    for sigma in sigmas:
+      for order in orders:
+        curr_neglogq0 = neglogq0s[idx_neglogq0s]
+        idx_neglogq0s += 1
+        if curr_neglogq0 is None:  # sigma == 1.5 and order == 250:
+          continue
+
+        # Reference value at q0 itself, reused by the data-independent check.
+        rdp_at_q0 = pate.rdp_gaussian(-curr_neglogq0, sigma, order)
+
+        # Data-dependent range. (Successively halve the value of q.)
+        logq_dds = (-curr_neglogq0 - np.array(
+            [0, np.log(2), np.log(4), np.log(8)]))
+        # Check that in q_dds, rdp is decreasing.
+        for idx in range(len(logq_dds) - 1):
+          self.assertGreater(
+              pate.rdp_gaussian(logq_dds[idx], sigma, order),
+              pate.rdp_gaussian(logq_dds[idx + 1], sigma, order))
+
+        # Data-independent range.
+        # (q0 shifted up by fixed offsets, i.e. values of q larger than q0.)
+        q_dids = np.exp(-curr_neglogq0) + np.array([0.1, 0.2, 0.3, 0.4])
+        # Check that in q_dids, rdp is constant.
+        for q in q_dids:
+          self.assertEqual(rdp_at_q0, pate.rdp_gaussian(
+              np.log(q), sigma, order))
+
+  def _test_compute_eps_from_delta_value_error(self):
+    # Test for ValueError.
+    # Four orders are paired with only three RDP values; presumably the length
+    # mismatch is what must be rejected -- confirm against core.
+    with self.assertRaises(ValueError):
+      pate.compute_eps_from_delta([1.1, 2, 3, 4], [1, 2, 3], 0.001)
+
+  def _test_compute_eps_from_delta_monotonicity(self):
+    # Test for monotonicity with respect to delta.
+    orders = [1.1, 2.5, 250.0]
+    sigmas = [1e-3, 1.0, 1e5]
+    deltas = [1e-60, 1e-6, 0.1, 0.999]
+    for sigma in sigmas:
+      list_of_eps = []
+      # RDP curve order / (2 * sigma^2), evaluated at every order.
+      rdps_for_gaussian = np.array(orders) / (2 * sigma**2)
+      for delta in deltas:
+        # Only the first element of the returned value is compared.
+        list_of_eps.append(
+            pate.compute_eps_from_delta(orders, rdps_for_gaussian, delta)[0])
+
+      # Check that in list_of_eps, epsilons are decreasing (as delta increases).
+      # (Equality with the descending-sorted copy allows ties.)
+      sorted_list_of_eps = list(list_of_eps)
+      sorted_list_of_eps.sort(reverse=True)
+      self.assertEqual(list_of_eps, sorted_list_of_eps)
+
+  def _test_compute_q0(self):
+    # Stub code to search a logq space and figure out logq0 by eyeballing
+    # results. This code does not run with the tests. Remove underscore to run.
+    sigma = 15
+    order = 250
+    logqs = np.arange(-290, -270, 1)
+    count = 0
+    for logq in logqs:
+      count += 1
+      sys.stdout.write("\t%0.5g: %0.10g" %
+                       (logq, pate.rdp_gaussian(logq, sigma, order)))
+      sys.stdout.flush()
+      if count % 5 == 0:  # Line break after every fifth value.
+        print("")
+
+  def test_rdp_gaussian(self):
+    # Entry point collected by unittest; runs both rdp_gaussian helpers.
+    self._test_rdp_gaussian_value_errors()
+    self._test_rdp_gaussian_as_function_of_q()
+
+  def test_compute_eps_from_delta(self):
+    # Entry point collected by unittest; runs both compute_eps_from_delta
+    # helpers.
+    self._test_compute_eps_from_delta_value_error()
+    self._test_compute_eps_from_delta_monotonicity()
+
+
+# Run the test suite when this file is executed directly.
+if __name__ == "__main__":
+  unittest.main()
--- a/research/differential_privacy/pate/smooth_sensitivity.py
+++ b/research/differential_privacy/pate/smooth_sensitivity.py
+# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functions for smooth sensitivity analysis for PATE mechanisms.
+
+This library implements functionality for doing smooth sensitivity analysis
+for Gaussian Noise Max (GNMax), Threshold with Gaussian noise, and Gaussian
+Noise with Smooth Sensitivity (GNSS) mechanisms.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from absl import app
+import numpy as np
+import scipy
+import sympy as sp
+
+import core as pate
+
+################################
+# SMOOTH SENSITIVITY FOR GNMAX #
+################################
+
+# Memoization table mapping (sigma, order) to the logq0 value produced by
+# compute_logq0_gnmax.
+_logq0_cache = {}
+
+
+def _compute_logq0(sigma, order):
+  """Returns compute_logq0_gnmax(sigma, order), memoized in _logq0_cache."""
+  cache_key = (sigma, order)
+  try:
+    return _logq0_cache[cache_key]
+  except KeyError:
+    pass  # Not computed yet; fall through to the root-finding routine.
+
+  value = compute_logq0_gnmax(sigma, order)
+  _logq0_cache[cache_key] = value  # Remember it for subsequent calls.
+  return value
+
+
+def _compute_logq1(sigma, order, num_classes):
+  """Returns logq1 = log(_compute_bl_gnmax(q0, sigma, num_classes)).
+
+  q0 is exp(logq0), with logq0 taken from the memoized _compute_logq0. The
+  result is asserted to be no larger than logq0.
+  """
+  upper = _compute_logq0(sigma, order)  # Usually served from the cache.
+  q1 = _compute_bl_gnmax(math.exp(upper), sigma, num_classes)
+  lower = math.log(q1)
+  assert lower <= upper
+  return lower
+
+
+def _compute_mu1_mu2_gnmax(sigma, logq):
+  """Returns (mu1, mu2) with mu2 = sigma * sqrt(-logq) and mu1 = mu2 + 1.
+
+  Follows Proposition 10. logq must be non-positive, since its negation is
+  passed to math.sqrt.
+  """
+  second = sigma * math.sqrt(-logq)
+  return second + 1, second
+
+
+def _compute_data_dep_bound_gnmax(sigma, logq, order):
+  """Evaluates the data-dependent RDP bound for GNMax at a given logq.
+
+  Args:
+    sigma: std of the Gaussian noise.
+    logq: natural log of q. Must be negative: its negation is square-rooted to
+      obtain mu2, and the expression divides by mu2.
+    order: Renyi order. Must differ from 1, as the result is divided by
+      (order - 1).
+
+  Returns:
+    logaddexp(log(1 - q) + log_a, logq + log_b) / (order - 1), computed in the
+    log domain.
+  """
+  # Applies Theorem 6 in Appendix without checking that logq satisfies necessary
+  # constraints. The pre-conditions must be assured by comparing logq against
+  # logq0 by the caller.
+  variance = sigma**2
+  mu1, mu2 = _compute_mu1_mu2_gnmax(sigma, logq)
+  eps1 = mu1 / variance
+  eps2 = mu2 / variance
+
+  log1q = np.log1p(-math.exp(logq))  # log1q = log(1-q)
+  # log_a and log_b are the logs of the two terms raised to (order - 1).
+  log_a = (order - 1) * (
+      log1q - (np.log1p(-math.exp((logq + eps2) * (1 - 1 / mu2)))))
+  log_b = (order - 1) * (eps1 - logq / (mu1 - 1))
+
+  # Log of (1 - q) * A^(order - 1) + q * B^(order - 1), scaled by 1/(order - 1).
+  return np.logaddexp(log1q + log_a, logq + log_b) / (order - 1)
+
+
+def _compute_rdp_gnmax(sigma, logq, order):
+  """Returns the RDP bound for GNMax at logq.
+
+  The data-independent Gaussian bound is used when logq >= logq0; otherwise the
+  data-dependent bound is used.
+  """
+  threshold = _compute_logq0(sigma, order)
+  use_data_independent = logq >= threshold
+  if not use_data_independent:
+    return _compute_data_dep_bound_gnmax(sigma, logq, order)
+  return pate.rdp_data_independent_gaussian(sigma, order)
+
+
+def compute_logq0_gnmax(sigma, order):
+  """Computes the point where we start using data-independent bounds.
+
+  The value is found as a root of (data-dependent bound minus data-independent
+  bound) in logq, unless the data-dependent bound is already smaller at the
+  natural upper bound, in which case that upper bound is returned.
+
+  Args:
+    sigma: std of the Gaussian noise
+    order: Renyi order lambda
+
+  Returns:
+    logq0: the point above which the data-ind bound overtakes data-dependent
+    bound.
+
+  Raises:
+    AssertionError: if the validity conditions fail at the upper bound or at
+      the root, if the bracketing search goes below -10000, or if the root
+      finder does not converge.
+  """
+
+  def _check_validity_conditions(logq):
+    # Function returns true iff logq is in the range where data-dependent bound
+    # is valid. (Theorem 6 in Appendix.)
+    mu1, mu2 = _compute_mu1_mu2_gnmax(sigma, logq)
+    if mu1 < order:
+      return False
+    eps2 = mu2 / sigma**2
+    # Do computation in the log space. The condition below comes from Lemma 9
+    # from Appendix.
+    return (logq <= (mu2 - 1) * eps2 - mu2 * math.log(mu1 / (mu1 - 1) * mu2 /
+                                                      (mu2 - 1)))
+
+  def _compare_dep_vs_ind(logq):
+    # Positive when the data-dependent bound exceeds the data-independent one.
+    return (_compute_data_dep_bound_gnmax(sigma, logq, order) -
+            pate.rdp_data_independent_gaussian(sigma, order))
+
+  # Natural upper bounds on q0.
+  # (The smallest, i.e. most negative, of three negative candidates.)
+  logub = min(-(1 + 1. / sigma)**2, -((order - 1) / sigma)**2, -1 / sigma**2)
+  assert _check_validity_conditions(logub)
+
+  # If data-dependent bound is already better, we are done already.
+  if _compare_dep_vs_ind(logub) < 0:
+    return logub
+
+  # Identifying a reasonable lower bound to bracket logq0.
+  loglb = 2 * logub  # logub is negative, and thus loglb < logub.
+  # Push loglb further down geometrically until the difference changes sign.
+  while _compare_dep_vs_ind(loglb) > 0:
+    assert loglb > -10000, "The lower bound on q0 is way too low."
+    loglb *= 1.5
+
+  # With full_output=True brentq returns the root and a results object.
+  logq0, r = scipy.optimize.brentq(
+      _compare_dep_vs_ind, loglb, logub, full_output=True)
+  assert r.converged, "The root finding procedure failed to converge."
+  assert _check_validity_conditions(logq0)  # just in case.
+
+  return logq0
+
+
+def _compute_bl_gnmax(q, sigma, num_classes):
+  """Returns (m - 1) / 2 * erfc(1 / sigma + erfcinv(2 * q / (m - 1))).
+
+  Here m is num_classes. Shifting the erfc argument up by 1 / sigma makes the
+  result smaller than q.
+  """
+  others = num_classes - 1
+  shifted = 1 / sigma + scipy.special.erfcinv(2 * q / others)
+  return others / 2 * scipy.special.erfc(shifted)
+
+
+def _compute_bu_gnmax(q, sigma, num_classes):
+  """Returns min(1, (m - 1) / 2 * erfc(-1 / sigma + erfcinv(2 * q / (m - 1)))).
+
+  Here m is num_classes. Shifting the erfc argument down by 1 / sigma makes the
+  uncapped value larger than q; the result is capped at 1.
+  """
+  others = num_classes - 1
+  shifted = -1 / sigma + scipy.special.erfcinv(2 * q / others)
+  uncapped = others / 2 * scipy.special.erfc(shifted)
+  return min(1, uncapped)
+
+
+def _compute_local_sens_gnmax(logq, sigma, num_classes, order):
+  """Upper-bounds the local sensitivity at logq (Algorithm 3).
+
+  Any logq inside [logq1, logq0] is first replaced by logq1. The result is the
+  larger of the RDP change when q moves to _compute_bu_gnmax(q) and the RDP
+  change when q moves to _compute_bl_gnmax(q).
+
+  (See Proposition 13 for proof of correctness.)
+  """
+  plateau_hi = _compute_logq0(sigma, order)
+  plateau_lo = _compute_logq1(sigma, order, num_classes)
+  if plateau_lo <= logq <= plateau_hi:
+    logq = plateau_lo
+
+  rdp_here = _compute_rdp_gnmax(sigma, logq, order)
+
+  q = math.exp(logq)
+  logq_up = math.log(_compute_bu_gnmax(q, sigma, num_classes))
+  rdp_up = _compute_rdp_gnmax(sigma, logq_up, order)
+  logq_down = math.log(_compute_bl_gnmax(q, sigma, num_classes))
+  rdp_down = _compute_rdp_gnmax(sigma, logq_down, order)
+
+  increase = rdp_up - rdp_here
+  decrease = rdp_here - rdp_down
+  return max(increase, decrease)
+
+
+def compute_local_sensitivity_bounds_gnmax(votes, num_teachers, sigma, order):
+  """Computes a list of max-LS-at-distance-d for the GNMax mechanism.
+
+  A more efficient implementation of Algorithms 4 and 5 working in time
+  O(teachers*classes). A naive implementation is O(teachers^2*classes) or worse.
+
+  Args:
+    votes: A numpy array of votes.
+    num_teachers: Total number of voting teachers.
+    sigma: Standard deviation of the Gaussian noise.
+    order: The Renyi order.
+
+  Returns:
+    A numpy array of length num_teachers whose entry d is the local sensitivity
+    bound at distance d. Entries not reached by the walk below keep the plateau
+    value.
+  """
+
+  num_classes = len(votes)  # Called m in the paper.
+
+  logq0 = _compute_logq0(sigma, order)
+  logq1 = _compute_logq1(sigma, order, num_classes)
+  logq = pate.compute_logq_gaussian(votes, sigma)
+  # Value used for every distance at which logq lies within [logq1, logq0].
+  plateau = _compute_local_sens_gnmax(logq1, sigma, num_classes, order)
+
+  res = np.full(num_teachers, plateau)
+
+  # Already on the plateau: every distance gets the plateau value.
+  if logq1 <= logq <= logq0:
+    return res
+
+  # Invariant: votes is sorted in the non-increasing order.
+  # (sorted() returns a new list, so the caller's array is not mutated below.)
+  votes = sorted(votes, reverse=True)
+
+  res[0] = _compute_local_sens_gnmax(logq, sigma, num_classes, order)
+  curr_d = 0
+
+  go_left = logq > logq0  # Otherwise logq < logq1 and we go right.
+
+  # Iterate while the following is true:
+  #    1. If we are going left, logq is still larger than logq0 and we may still
+  #       increase the gap between votes[0] and votes[1].
+  #    2. If we are going right, logq is still smaller than logq1.
+  # NOTE(review): res has num_teachers entries; this presumably relies on the
+  # walk ending before curr_d reaches num_teachers -- confirm.
+  while ((go_left and logq > logq0 and votes[1] > 0) or
+         (not go_left and logq < logq1)):
+    curr_d += 1
+    if go_left:  # Try decreasing logq.
+      votes[0] += 1
+      votes[1] -= 1
+      idx = 1
+      # Restore the invariant. (Can be implemented more efficiently by keeping
+      # track of the range of indices equal to votes[1]. Does not seem to matter
+      # for the overall running time.)
+      while idx < len(votes) - 1 and votes[idx] < votes[idx + 1]:
+        votes[idx], votes[idx + 1] = votes[idx + 1], votes[idx]
+        idx += 1
+    else:  # Go right, i.e., try increasing logq.
+      votes[0] -= 1
+      votes[1] += 1  # The invariant holds since otherwise logq >= logq1.
+
+    # Re-evaluate q for the modified votes and record the bound at this distance.
+    logq = pate.compute_logq_gaussian(votes, sigma)
+    res[curr_d] = _compute_local_sens_gnmax(logq, sigma, num_classes, order)
+
+  return res
+
+
+##################################################
+# SMOOTH SENSITIVITY FOR THE THRESHOLD MECHANISM #
+##################################################
+
+# Memoization table of per-vote-count RDP arrays for the threshold mechanism,
+# keyed by the 4-tuple (num_teachers, threshold, sigma, order).
+_rdp_thresholds = {}
+
+
+def _compute_rdp_list_threshold(num_teachers, threshold, sigma, order):
+  """Returns an array whose entry v is the threshold-mechanism RDP at v votes.
+
+  The array has num_teachers + 1 entries (v = 0..num_teachers) and is memoized
+  in _rdp_thresholds.
+  """
+  cache_key = (num_teachers, threshold, sigma, order)
+  cached = _rdp_thresholds.get(cache_key)
+  if cached is not None:
+    return cached
+
+  rdps = np.zeros(num_teachers + 1)
+  for vote_count in range(num_teachers + 1):
+    # Log-probability that Gaussian noise lifts vote_count above the threshold.
+    log_pass = scipy.stats.norm.logsf(threshold - vote_count, scale=sigma)
+    rdps[vote_count] = pate.compute_rdp_threshold(log_pass, sigma, order)
+
+  _rdp_thresholds[cache_key] = rdps
+  return rdps
+
+
+def compute_local_sensitivity_bounds_threshold(counts, num_teachers, threshold,
+                                               sigma, order):
+  """Computes a list of max-LS-at-distance-d for the threshold mechanism.
+
+  Args:
+    counts: Vote counts; only max(counts), rounded to the nearest integer, is
+      used.
+    num_teachers: Total number of voting teachers.
+    threshold: Threshold of the mechanism.
+    sigma: Standard deviation of the Gaussian noise.
+    order: The Renyi order.
+
+  Returns:
+    A numpy array of length num_teachers. Entry d is the largest one-step RDP
+    change over the vote counts cur_max + d and cur_max - d that fall within
+    [0, num_teachers]. Entries beyond max(cur_max, num_teachers - cur_max) - 1
+    stay zero.
+  """
+
+  def _compute_ls(v):
+    # Largest absolute RDP change when v moves by one vote, staying in range.
+    # Candidates are collected in a list instead of relying on
+    # max(x, None) == x, which only holds on Python 2 and raises TypeError on
+    # Python 3.
+    steps = []
+    if v > 0:
+      steps.append(abs(rdp_list[v - 1] - rdp_list[v]))
+    if v < num_teachers:
+      steps.append(abs(rdp_list[v + 1] - rdp_list[v]))
+    return max(steps)
+
+  cur_max = int(round(max(counts)))
+  rdp_list = _compute_rdp_list_threshold(num_teachers, threshold, sigma, order)
+
+  ls = np.zeros(num_teachers)
+  for d in range(max(cur_max, num_teachers - cur_max)):
+    # For every d in this range at least one of cur_max + d and cur_max - d
+    # lies within [0, num_teachers], so the candidate list is never empty.
+    candidates = []
+    if cur_max + d <= num_teachers:
+      candidates.append(_compute_ls(cur_max + d))
+    if cur_max - d >= 0:
+      candidates.append(_compute_ls(cur_max - d))
+    ls[d] = max(candidates)
+  return ls
+
+
+#############################################
+# PROCEDURES FOR SMOOTH SENSITIVITY RELEASE #
+#############################################
+
+# Cache of exponentially decaying weight arrays exp(-beta * k), keyed by beta.
+dict_beta_discount = {}
+
+
+def compute_discounted_max(beta, a):
+  """Returns max over k of a[k] * exp(-beta * k).
+
+  The weight array for each beta is cached in dict_beta_discount and rebuilt
+  only when a longer one is needed.
+  """
+  n = len(a)
+
+  weights = dict_beta_discount.get(beta)
+  if weights is None or len(weights) < n:
+    weights = np.exp(-beta * np.arange(n))
+    dict_beta_discount[beta] = weights
+
+  return max(a * weights[:n])
+
+
+def compute_smooth_sensitivity_gnmax(beta, counts, num_teachers, sigma, order):
+  """Computes smooth sensitivity of a single application of GNMax.
+
+  Args:
+    beta: Decay rate of the exponential discount applied per unit of distance.
+    counts: A numpy array of votes.
+    num_teachers: Total number of voting teachers.
+    sigma: Standard deviation of the Gaussian noise.
+    order: The Renyi order.
+
+  Returns:
+    The maximum over distances d of exp(-beta * d) times the local sensitivity
+    bound at distance d.
+  """
+  # Pass arguments by keyword: the callee's positional order is
+  # (votes, num_teachers, sigma, order), and passing them positionally as
+  # (counts, sigma, order, num_teachers) silently permutes the parameters.
+  ls = compute_local_sensitivity_bounds_gnmax(
+      votes=counts, num_teachers=num_teachers, sigma=sigma, order=order)
+  return compute_discounted_max(beta, ls)
+
+
+def compute_rdp_of_smooth_sensitivity_gaussian(beta, sigma, order):
+  """Computes the RDP curve for the GNSS mechanism.
+
+  Implements Theorem 23 (https://arxiv.org/pdf/1802.08908.pdf).
+
+  Args:
+    beta: Smoothness parameter. When positive, order must lie strictly inside
+      (1, 1 / (2 * beta)).
+    sigma: Standard deviation of the Gaussian noise.
+    order: The Renyi order.
+
+  Returns:
+    order * exp(2 * beta) / sigma^2 +
+    (-log(1 - 2 * order * beta) / 2 + beta * order) / (order - 1).
+
+  Raises:
+    ValueError: if beta > 0 and order is outside (1, 1 / (2 * beta)).
+  """
+  if beta > 0:
+    order_limit = 1 / (2 * beta)
+    if not 1 < order < order_limit:
+      raise ValueError("Order outside the (1, 1/(2*beta)) range.")
+
+  gaussian_term = order * math.exp(2 * beta) / sigma**2
+  smoothness_term = -.5 * math.log(1 - 2 * order * beta) + beta * order
+  return gaussian_term + smoothness_term / (order - 1)
+
+
+def compute_params_for_ss_release(eps, delta):
+  """Computes (alpha, beta) for additive Gaussian noise scaled by smooth sensitivity.
+
+  Presently not used. (We proceed via RDP analysis.)
+
+  Compute alpha, beta for applying Lemma 2.6 (full version of Nissim et al.) via
+  Lemma 2.10.
+
+  Args:
+    eps: Target epsilon.
+    delta: Target delta.
+
+  Returns:
+    A pair (alpha, beta).
+  """
+  # Rather than applying Lemma 2.10 directly, which would give suboptimal alpha,
+  # (see http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf),
+  # we extract a sufficient condition on alpha from its proof.
+  #
+  # Let a = rho_(delta/2)(Z_1). Then solve for alpha such that
+  # 2 alpha a + alpha^2 = eps/2.
+  a = scipy.special.ndtri(1 - delta / 2)
+  # Positive root of the quadratic alpha^2 + 2 * a * alpha - eps / 2 = 0.
+  alpha = math.sqrt(a**2 + eps / 2) - a
+
+  beta = eps / (2 * scipy.special.chdtri(1, delta / 2))
+
+  return alpha, beta
+
+
+#######################################################
+# SYMBOLIC-NUMERIC VERIFICATION OF CONDITIONS C5--C6. #
+#######################################################
+
+
+def _construct_symbolic_beta(q, sigma, order):
+  """Builds a sympy expression in q for the data-dependent RDP bound.
+
+  Mirrors the quantities of _compute_data_dep_bound_gnmax (mu1, mu2, eps1,
+  eps2 and the two terms a and b), but in terms of q rather than log q and
+  using sympy functions.
+
+  Args:
+    q: Symbolic variable (or expression) standing for q.
+    sigma: Standard deviation of the Gaussian noise.
+    order: The Renyi order.
+
+  Returns:
+    The expression log((1 - q) * a^(order - 1) + q * b^(order - 1)) /
+    (order - 1).
+  """
+  mu2 = sigma * sp.sqrt(sp.log(1 / q))
+  mu1 = mu2 + 1
+  eps1 = mu1 / sigma**2
+  eps2 = mu2 / sigma**2
+  a = (1 - q) / (1 - (q * sp.exp(eps2))**(1 - 1 / mu2))
+  b = sp.exp(eps1) / q**(1 / (mu1 - 1))
+  s = (1 - q) * a**(order - 1) + q * b**(order - 1)
+  return (1 / (order - 1)) * sp.log(s)
+
+
+def _construct_symbolic_bu(q, sigma, m):
+  """Builds the sympy counterpart of _compute_bu_gnmax, without the cap at 1.
+
+  Args:
+    q: Symbolic variable (or expression) standing for q.
+    sigma: Standard deviation of the Gaussian noise.
+    m: Number of classes.
+
+  Returns:
+    The expression (m - 1) / 2 * erfc(erfcinv(2 * q / (m - 1)) - 1 / sigma).
+  """
+  return (m - 1) / 2 * sp.erfc(sp.erfcinv(2 * q / (m - 1)) - 1 / sigma)
+
+
+def _is_non_decreasing(fn, q, bounds):
+  """Verifies whether the function is non-decreasing within a range.
+
+  The derivative of fn is computed symbolically, converted to a numeric
+  function, and minimized over the range; the function is reported as
+  non-decreasing iff the minimum found is non-negative.
+
+  Args:
+    fn: Symbolic function of a single variable.
+    q: The name of fn's variable.
+    bounds: Pair of (lower_bound, upper_bound) reals.
+
+  Returns:
+    True iff the function is non-decreasing in the range.
+
+  Raises:
+    AssertionError: if the minimizer does not report success.
+  """
+  diff_fn = sp.diff(fn, q)  # Symbolically compute the derivative.
+  # Evaluate with numpy, mapping erfc and erfcinv explicitly to scipy.special.
+  diff_fn_lambdified = sp.lambdify(
+      q,
+      diff_fn,
+      modules=[
+          "numpy", {
+              "erfc": scipy.special.erfc,
+              "erfcinv": scipy.special.erfcinv
+          }
+      ])
+  # NOTE(review): this is a numeric search for the minimum of the derivative,
+  # so the verdict is presumably numerical evidence rather than a proof.
+  r = scipy.optimize.minimize_scalar(
+      diff_fn_lambdified, bounds=bounds, method="bounded")
+  assert r.success, "Minimizer failed to converge."
+  return r.fun >= 0  # Check whether the derivative is non-negative.
+
+
+def check_conditions(sigma, m, order):
+  """Checks conditions C5 and C6 (Section B.4.2 in Appendix).
+
+  Args:
+    sigma: Standard deviation of the Gaussian noise.
+    m: Number of classes.
+    order: The Renyi order.
+
+  Returns:
+    A pair (cond5, cond6). cond5 tells whether beta(q) is non-decreasing on
+    (0, q0); cond6 tells whether beta(bu(q)) - beta(q) is non-decreasing on
+    (0, bl(q0)). cond6 is reported as False without being evaluated when cond5
+    does not hold.
+  """
+  q = sp.symbols("q", positive=True, real=True)
+
+  beta = _construct_symbolic_beta(q, sigma, order)
+  q0 = math.exp(compute_logq0_gnmax(sigma, order))
+
+  cond5 = _is_non_decreasing(beta, q, (0, q0))
+  if not cond5:
+    # Condition 5 already fails, so Condition 6 is not checked.
+    return (cond5, False)
+
+  upper_q = _compute_bl_gnmax(q0, sigma, m)
+  shifted_q = _construct_symbolic_bu(q, sigma, m)
+  beta_gain = beta.subs(q, shifted_q) - beta
+  cond6 = _is_non_decreasing(beta_gain, q, (0, upper_q))
+
+  return (cond5, cond6)
+
+
+def main(argv):
+  """Entry point handed to app.run; it only discards its argument."""
+  del argv  # Unused.
+
+
+# Hand control to absl when this file is executed directly.
+if __name__ == "__main__":
+  app.run(main)
--- a/research/differential_privacy/pate/smooth_sensitivity_test.py
+++ b/research/differential_privacy/pate/smooth_sensitivity_test.py
+# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for the smooth sensitivity analysis in smooth_sensitivity.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import smooth_sensitivity as pate_ss
+
+
+class PateSmoothSensitivityTest(unittest.TestCase):
+  """Unit tests for smooth_sensitivity, compared against stored reference values."""
+
+  def test_check_conditions(self):
+    # Arguments are (sigma, m, order); the result is the pair (cond5, cond6).
+    self.assertEqual(pate_ss.check_conditions(20, 10, 25.), (True, False))
+    self.assertEqual(pate_ss.check_conditions(30, 10, 25.), (True, True))
+
+  def _assert_all_close(self, x, y):
+    """Asserts that two numpy arrays are close."""
+    self.assertEqual(len(x), len(y))
+    # Purely relative tolerance (atol=0).
+    self.assertTrue(np.allclose(x, y, rtol=1e-8, atol=0))
+
+  def test_compute_local_sensitivity_bounds_gnmax(self):
+    counts1 = np.array([10, 0, 0])
+    sigma1 = .5
+    order1 = 1.5
+
+    # Expected output: one entry per distance, 10 in total (= num_teachers).
+    answer1 = np.array(
+        [3.13503646e-17, 1.60178280e-08, 5.90681786e-03] + [5.99981308e+00] * 7)
+
+    # Test for "going right" in the smooth sensitivity computation.
+    out1 = pate_ss.compute_local_sensitivity_bounds_gnmax(
+        counts1, 10, sigma1, order1)
+
+    self._assert_all_close(out1, answer1)
+
+    counts2 = np.array([1000, 500, 300, 200, 0])
+    sigma2 = 250.
+    order2 = 10.
+
+    # Test for "going left" in the smooth sensitivity computation.
+    out2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
+        counts2, 2000, sigma2, order2)
+
+    # Expected output: 298 + 2 + 1700 = 2000 entries (= num_teachers).
+    answer2 = np.array([0.] * 298 + [2.77693450548e-7, 2.10853979548e-6] +
+                       [2.73113623988e-6] * 1700)
+    self._assert_all_close(out2, answer2)
+
+  def test_compute_local_sensitivity_bounds_threshold(self):
+    counts1_3 = np.array([20, 10, 0])
+    num_teachers = sum(counts1_3)  # 30; shared by all cases below.
+    t1 = 16  # high threshold
+    sigma = 2
+    order = 10
+
+    out1 = pate_ss.compute_local_sensitivity_bounds_threshold(
+        counts1_3, num_teachers, t1, sigma, order)
+    answer1 = np.array([0] * 3 + [
+        1.48454129e-04, 1.47826870e-02, 3.94153241e-02, 6.45775697e-02,
+        9.01543247e-02, 1.16054002e-01, 1.42180452e-01, 1.42180452e-01,
+        1.48454129e-04, 1.47826870e-02, 3.94153241e-02, 6.45775697e-02,
+        9.01543266e-02, 1.16054000e-01, 1.42180452e-01, 1.68302106e-01,
+        1.93127860e-01
+    ] + [0] * 10)
+    self._assert_all_close(out1, answer1)
+
+    t2 = 2  # low threshold
+
+    out2 = pate_ss.compute_local_sensitivity_bounds_threshold(
+        counts1_3, num_teachers, t2, sigma, order)
+    answer2 = np.array([
+        1.60212079e-01, 2.07021132e-01, 2.07021132e-01, 1.93127860e-01,
+        1.68302106e-01, 1.42180452e-01, 1.16054002e-01, 9.01543247e-02,
+        6.45775697e-02, 3.94153241e-02, 1.47826870e-02, 1.48454129e-04
+    ] + [0] * 18)
+    self._assert_all_close(out2, answer2)
+
+    t3 = 50  # very high threshold (larger than the number of teachers).
+
+    out3 = pate_ss.compute_local_sensitivity_bounds_threshold(
+        counts1_3, num_teachers, t3, sigma, order)
+
+    answer3 = np.array([
+        1.35750725752e-19, 1.88990500499e-17, 2.05403154065e-15,
+        1.74298153642e-13, 1.15489723995e-11, 5.97584949325e-10,
+        2.41486826748e-08, 7.62150641922e-07, 1.87846248741e-05,
+        0.000360973025976, 0.000360973025976, 2.76377015215e-50,
+        1.00904975276e-53, 2.87254164748e-57, 6.37583360761e-61,
+        1.10331620211e-64, 1.48844393335e-68, 1.56535552444e-72,
+        1.28328011060e-76, 8.20047697109e-81
+    ] + [0] * 10)
+
+    self._assert_all_close(out3, answer3)
+
+    # Fractional values.
+    # (Fractional and negative counts together with a fractional threshold;
+    # num_teachers is still the value computed from counts1_3.)
+    counts4 = np.array([19.5, -5.1, 0])
+    t4 = 10.1
+    out4 = pate_ss.compute_local_sensitivity_bounds_threshold(
+        counts4, num_teachers, t4, sigma, order)
+
+    answer4 = np.array([
+        0.0620410301, 0.0875807131, 0.113451958, 0.139561671, 0.1657074530,
+        0.1908244840, 0.2070270720, 0.207027072, 0.169718100, 0.0575152142,
+        0.00678695871
+    ] + [0] * 6 + [0.000536304908, 0.0172181073, 0.041909870] + [0] * 10)
+    self._assert_all_close(out4, answer4)
+
+
+# Run the test suite when this file is executed directly.
+if __name__ == "__main__":
+  unittest.main()
--- a/research/learning_unsupervised_learning/.gitignore
+++ b/research/learning_unsupervised_learning/.gitignore
+*.pyc
--- a/research/learning_unsupervised_learning/README.md
+++ b/research/learning_unsupervised_learning/README.md
+# Learning Unsupervised Learning Rules
+This repository contains code and weights for the learned update rule
+presented in "Learning Unsupervised Learning Rules." At this time, this
+code cannot meta-train the update rule.
+
+
+### Structure
+`run_eval.py` contains the main training loop. This constructs an op
+that runs one iteration of the learned update rule and assigns the
+results to variables. Additionally, it loads the weights from our
+pre-trained model.
+
+The base model and the update rule architecture definition can be found in
+`architectures/more_local_weight_update.py`. For a complete description
+of the model, see our [paper](https://arxiv.org/abs/1804.00222).
+
+### Dependencies
+[absl](https://github.com/abseil/abseil-py), [tensorflow](https://tensorflow.org), [sonnet](https://github.com/deepmind/sonnet)
+
+### Usage
+
+First, download the [pre-trained optimizer model weights](https://storage.googleapis.com/learning_unsupervised_learning/200_tf_graph.zip) and extract them.
+
+```bash
+# move to the folder above this folder
+cd path_to/research/learning_unsupervised_learning/../
+
+# launch the eval script
+python -m learning_unsupervised_learning.run_eval \
+--train_log_dir="/tmp/learning_unsupervised_learning" \
+--checkpoint_dir="/path/to/downloaded/model/tf_graph_data.ckpt"
+```
+
+### Contact
+Luke Metz, Niru Maheswaranathan, Github: @lukemetz, @nirum. Email: {lmetz, nirum}@google.com
+
+
--- a/research/learning_unsupervised_learning/__init__.py
+++ b/research/learning_unsupervised_learning/__init__.py
--- a/research/learning_unsupervised_learning/architectures/__init__.py
+++ b/research/learning_unsupervised_learning/architectures/__init__.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+import more_local_weight_update
--- a/research/learning_unsupervised_learning/architectures/common.py
+++ b/research/learning_unsupervised_learning/architectures/common.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sonnet as snt
+import tensorflow as tf
+import numpy as np
+import collections
+from learning_unsupervised_learning import utils
+
+from tensorflow.python.util import nest
+
+from learning_unsupervised_learning import variable_replace
+
+
+class LinearBatchNorm(snt.AbstractModule):
+  """Linear layer, then batch normalization, then an activation function.
+
+  The linear layer has no bias. Normalization uses the moments of the current
+  batch (axis 0) with a constant scale of 1 and a learned offset variable "b".
+  """
+  def __init__(self, size, activation_fn=tf.nn.relu, name="LinearBatchNorm"):
+    # size: output size of the linear layer.
+    # activation_fn: callable applied to the normalized pre-activations.
+    self.size = size
+    self.activation_fn = activation_fn
+    super(LinearBatchNorm, self).__init__(name=name)
+
+  def _build(self, x):
+    # Returns a pair (z, x_p): the normalized pre-activations and the result of
+    # applying activation_fn to them.
+    x = tf.to_float(x)
+    initializers={"w": tf.truncated_normal_initializer(stddev=0.01)}
+    lin = snt.Linear(self.size, use_bias=False, initializers=initializers)
+    z = lin(x)
+
+    scale = tf.constant(1., dtype=tf.float32)
+    # Learned offset with one entry per output unit (second dimension of z).
+    offset = tf.get_variable(
+        "b",
+        shape=[1, z.shape.as_list()[1]],
+        initializer=tf.truncated_normal_initializer(stddev=0.1),
+        dtype=tf.float32
+    )
+
+    # Normalize with the batch moments; 1e-6 guards against a zero variance.
+    mean, var = tf.nn.moments(z, [0], keep_dims=True)
+    z = ((z - mean) * tf.rsqrt(var + 1e-6)) * scale + offset
+
+    x_p = self.activation_fn(z)
+
+    return z, x_p
+
+  # This needs to work by string name sadly due to how the variable replace
+  # works and would also work even if the custom getter approach was used.
+  # This is verbose, but it should at least be clear as to what is going on.
+  # TODO(lmetz) a better way to do this (the next 3 functions:
+  #    _raw_name, w(), b() )
+  def _raw_name(self, var_name):
+    """Return just the name of the variable, not the scopes."""
+    return var_name.split("/")[-1].split(":")[0]
+
+
+  @property
+  def w(self):
+    # The single module variable whose raw name is "w".
+    var_list = snt.get_variables_in_module(self)
+    w = [x for x in var_list if self._raw_name(x.name) == "w"]
+    assert len(w) == 1
+    return w[0]
+
+  @property
+  def b(self):
+    # The single module variable whose raw name is "b".
+    var_list = snt.get_variables_in_module(self)
+    b = [x for x in var_list if self._raw_name(x.name) == "b"]
+    assert len(b) == 1
+    return b[0]
+
+
+
+class Linear(snt.AbstractModule):
+  """Wrapper around snt.Linear exposing its variables as properties w and b."""
+  def __init__(self, size, use_bias=True, init_const_mag=True):
+    # size: output size of the linear layer.
+    # use_bias: forwarded to snt.Linear.
+    # init_const_mag: if true, "w" is initialized from a truncated normal with
+    #   stddev 0.01; otherwise no initializers are passed to snt.Linear.
+    self.size = size
+    self.use_bias = use_bias
+    self.init_const_mag = init_const_mag
+    super(Linear, self).__init__(name="commonLinear")
+
+  def _build(self, x):
+    # Returns the output of the wrapped linear layer applied to x.
+    if self.init_const_mag:
+      initializers={"w": tf.truncated_normal_initializer(stddev=0.01)}
+    else:
+      initializers={}
+    lin = snt.Linear(self.size, use_bias=self.use_bias, initializers=initializers)
+    z = lin(x)
+    return z
+
+  # This needs to work by string name sadly due to how the variable replace
+  # works and would also work even if the custom getter approach was used.
+  # This is verbose, but it should at least be clear as to what is going on.
+  # TODO(lmetz) a better way to do this (the next 3 functions:
+  #    _raw_name, w(), b() )
+  def _raw_name(self, var_name):
+    """Return just the name of the variable, not the scopes."""
+    return var_name.split("/")[-1].split(":")[0]
+
+  @property
+  def w(self):
+    # The single module variable whose raw name is "w". The module must hold
+    # exactly two variables with a bias and exactly one without.
+    var_list = snt.get_variables_in_module(self)
+    if self.use_bias:
+      assert len(var_list) == 2, "Found not 2 but %d" % len(var_list)
+    else:
+      assert len(var_list) == 1, "Found not 1 but %d" % len(var_list)
+    w = [x for x in var_list if self._raw_name(x.name) == "w"]
+    assert len(w) == 1
+    return w[0]
+
+  @property
+  def b(self):
+    # The single module variable whose raw name is "b". The assertion requires
+    # exactly two module variables, so this fails when there is no bias.
+    var_list = snt.get_variables_in_module(self)
+    assert len(var_list) == 2, "Found not 2 but %d" % len(var_list)
+    b = [x for x in var_list if self._raw_name(x.name) == "b"]
+    assert len(b) == 1
+    return b[0]
+
+
+def transformer_at_state(base_model, new_variables):
+  """Wraps base_model so that it runs with new_variables in place of its own.
+
+  Must be called outside of any variable-replace scope.
+
+  Args:
+    base_model: snt.Module
+      Goes from batch to features
+    new_variables: list
+      Replacement variables, paired up in order with
+      base_model.get_variables().
+  Returns:
+    func: callable of same api as base_model.
+  """
+  assert not variable_replace.in_variable_replace_scope()
+
+  def _feature_transformer(input_data):
+    """Applies base_model to input_data under the variable replacement."""
+    # The mapping is rebuilt on every call from the model's current variables.
+    pairs = utils.eqzip(base_model.get_variables(), new_variables)
+    replacement = collections.OrderedDict(pairs)
+    with variable_replace.variable_replace(replacement):
+      return base_model(input_data)
+
+  return _feature_transformer
--- a/research/learning_unsupervised_learning/architectures/more_local_weight_update.py
+++ b/research/learning_unsupervised_learning/architectures/more_local_weight_update.py
--- a/research/learning_unsupervised_learning/datasets/__init__.py
+++ b/research/learning_unsupervised_learning/datasets/__init__.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import mnist
--- a/research/learning_unsupervised_learning/datasets/common.py
+++ b/research/learning_unsupervised_learning/datasets/common.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+import tensorflow as tf
+import numpy as np
+
+ImageLabelOnehot = collections.namedtuple('ImageLabelOnehot',
+                                          ['image', 'label', 'label_onehot'])
+ImageLabelOnehotRegression = collections.namedtuple(
+    "ImageLabelOnehotRegression",
+    ["image", "label", "label_onehot", "regression_target"])
--- a/research/learning_unsupervised_learning/datasets/mnist.py
+++ b/research/learning_unsupervised_learning/datasets/mnist.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+import sonnet as snt
+import tensorflow as tf
+from tensorflow.python.keras.datasets import mnist
+from learning_unsupervised_learning.datasets import common
+
+class Mnist(snt.AbstractModule):
+  def __init__(self, device, batch_size=128, name="Mnist"):
+    self.device = device
+    self.batch_size = batch_size
+
+    self._make_dataset()
+    self.iterator = None
+
+    super(Mnist, self).__init__(name=name)
+
+  def _make_dataset(self):
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+    x_train = x_train.reshape(60000, 784)
+    x_test = x_test.reshape(10000, 784)
+
+    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+    dataset = dataset.repeat()
+    dataset = dataset.shuffle(self.batch_size * 3)
+    dataset = dataset.batch(self.batch_size)
+    def _map_fn(image, label):
+      image = tf.to_float(image) / 255.
+      label.set_shape([self.batch_size])
+      label = tf.cast(label, dtype=tf.int32)
+      label_onehot = tf.one_hot(label, 10)
+      image = tf.reshape(image, [self.batch_size, 28, 28, 1])
+      return common.ImageLabelOnehot(
+          image=image, label=label, label_onehot=label_onehot)
+
+    self.dataset = dataset.map(_map_fn)
+
+  def _build(self):
+    if self.iterator is None:
+      self.iterator = self.dataset.make_one_shot_iterator()
+    batch = self.iterator.get_next()
+    [b.set_shape([self.batch_size] + b.shape.as_list()[1:]) for b in batch]
+    return batch
+
+
+class TinyMnist(Mnist):
+  def __init__(self, *args, **kwargs):
+    kwargs.setdefault("name", "TinyMnist")
+    super(TinyMnist, self).__init__(*args, **kwargs)
+
+  def _make_dataset(self):
+    super(TinyMnist, self)._make_dataset()
+
+    def _map_fn(batch):
+      new_img = tf.image.resize_images(batch.image, [14, 14])
+      return common.ImageLabelOnehot(
+          image=new_img, label=batch.label, label_onehot=batch.label_onehot)
+
+    self.dataset = self.dataset.map(_map_fn)
--- a/research/learning_unsupervised_learning/evaluation.py
+++ b/research/learning_unsupervised_learning/evaluation.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+"""Evaluation job.
+
+This sits on the side and performs evaluation on a saved model.
+This is a separate process for ease of use and stability of numbers.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from learning_unsupervised_learning import utils
+
+
+def construct_evaluation_graph(theta_process_fn=None,
+                               w_learner_fn=None,
+                               dataset_fn=None,
+                               meta_objectives=None,
+                              ):
+  """Construct the evaluation graph.
+
+  Although the first three arguments default to None, each of them is called
+  unconditionally, so all three must be supplied.
+
+  Args:
+    theta_process_fn: callable invoked with `remote_device` and `local_device`
+      keyword arguments; its result is used as the meta optimizer.
+    w_learner_fn: callable invoked with the same keyword arguments; its result
+      is used as the base model.
+    dataset_fn: callable invoked with a `device` keyword argument; its result
+      is called with no arguments to obtain a batch.
+    meta_objectives: optional list of callables, each invoked with
+      `local_device` and `remote_device` keyword arguments to build one meta
+      objective. Defaults to an empty list.
+
+  Returns:
+    A tuple `(checkpoint_vars, train_one_step_op, (base_model, train_dataset))`
+    where `checkpoint_vars` holds the remote variables of the meta optimizer
+    and of every meta objective plus the global step.
+  """
+  if meta_objectives is None:
+    meta_objectives = []
+
+  tf.train.create_global_step()
+
+  # Empty device strings are passed to every constructor below.
+  local_device = ""
+  remote_device = ""
+
+  meta_opt = theta_process_fn(
+      remote_device=remote_device, local_device=local_device)
+
+  base_model = w_learner_fn(
+      remote_device=remote_device, local_device=local_device)
+
+  train_dataset = dataset_fn(device=local_device)
+
+  # construct variables
+  # Only `outputs` is used afterwards; `x` is not read again in this function.
+  x, outputs = base_model(train_dataset())
+  initial_state = base_model.initial_state(meta_opt, max_steps=10)
+  next_state = base_model.compute_next_state(outputs, meta_opt, initial_state)
+  with utils.state_barrier_context(next_state):
+    train_one_step_op = meta_opt.assign_state(base_model, next_state)
+
+  # Build every meta objective and export the mean of its value as a scalar
+  # summary named "<ClassName>_J".
+  meta_objs = []
+  for meta_obj_fn in meta_objectives:
+    meta_obj = meta_obj_fn(local_device="", remote_device="")
+    meta_objs.append(meta_obj)
+    J = meta_obj(train_dataset, lambda x: base_model(x)[0])
+    tf.summary.scalar(str(meta_obj.__class__.__name__)+"_J", tf.reduce_mean(J))
+
+  # TODO(lmetz) this is kinda error prone.
+  # We should share the construction of the global variables across train and
+  # make sure both sets of savable variables are the same
+  checkpoint_vars = meta_opt.remote_variables() + [tf.train.get_global_step()]
+  for meta_obj in meta_objs:
+    checkpoint_vars.extend(meta_obj.remote_variables())
+
+  return checkpoint_vars, train_one_step_op, (base_model, train_dataset)
--- a/research/learning_unsupervised_learning/meta_objective/__init__.py
+++ b/research/learning_unsupervised_learning/meta_objective/__init__.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+import sklearn
+import linear_regression
--- a/research/learning_unsupervised_learning/meta_objective/linear_regression.py
+++ b/research/learning_unsupervised_learning/meta_objective/linear_regression.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+
+"""Closed form linear regression.
+
+Can be differentiated through.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import numpy as np
+import sonnet as snt
+import tensorflow as tf
+
+from learning_unsupervised_learning import utils
+from learning_unsupervised_learning import variable_replace
+
+
+def solve_ridge(x, y, ridge_factor):
+  with tf.name_scope("solve_ridge"):
+    # Added a column of ones to the end of the feature matrix for bias
+    A = tf.concat([x, tf.ones((x.shape.as_list()[0], 1))], axis=1)
+
+    # Analytic solution for the ridge regression loss
+    inv_target = tf.matmul(A, A, transpose_a=True)
+    np_diag_penalty = ridge_factor * np.ones(
+        A.shape.as_list()[1], dtype="float32")
+    # Remove penalty on bias component of weights
+    np_diag_penalty[-1] = 0.
+    diag_penalty = tf.constant(np_diag_penalty)
+    inv_target += tf.diag(diag_penalty)
+
+    inv = tf.matrix_inverse(inv_target)
+    w = tf.matmul(inv, tf.matmul(A, y, transpose_a=True))
+    return w
+
+
+class LinearRegressionMetaObjective(snt.AbstractModule):
+  """A meta objective based on training Ridge Regression with analytic solution.
+
+  This is used to evaluate the performance of a given feature set trained in
+  some other manner.
+  """
+
+  def __init__(self,
+               local_device=None,
+               remote_device=None,
+               zero_one_labels=True,
+               normalize_y_hat=True,
+               normalize_act=False,
+               averages=1,
+               ridge_factor=0.1,
+               center_y=True,
+               hinge_loss=False,
+               samples_per_class=10,
+               test_train_scalar=1.0,
+              ):
+    self._local_device = local_device
+    self._remote_device = remote_device
+    self.zero_one_labels = zero_one_labels
+    self.normalize_y_hat = normalize_y_hat
+    self.normalize_act = normalize_act
+    self.ridge_factor = ridge_factor
+    self.averages = averages
+    self.samples_per_class = samples_per_class
+    self.center_y=center_y
+    self.test_train_scalar=test_train_scalar
+    self.hinge_loss = hinge_loss
+
+    self.dataset_map = {}
+
+    super(LinearRegressionMetaObjective,
+          self).__init__(name="LinearRegressionMetaObjective")
+
+  def _build(self, dataset, feature_transformer):
+    if self.samples_per_class is not None:
+      if dataset not in self.dataset_map:
+        # datasets are outside of frames from while loops
+        with tf.control_dependencies(None):
+          self.dataset_map[dataset] = utils.sample_n_per_class(
+              dataset, self.samples_per_class)
+
+      dataset = self.dataset_map[dataset]
+
+    stats = collections.defaultdict(list)
+    losses = []
+    # TODO(lmetz) move this to ingraph control flow?
+    for _ in xrange(self.averages):
+      loss, stat = self._build_once(dataset, feature_transformer)
+      losses.append(loss)
+      for k, v in stat.items():
+        stats[k].append(v)
+    stats = {k: tf.add_n(v) / float(len(v)) for k, v in stats.items()}
+
+    summary_updates = []
+    for k, v in stats.items():
+      tf.summary.scalar(k, v)
+
+    with tf.control_dependencies(summary_updates):
+      return tf.add_n(losses) / float(len(losses))
+
+  def _build_once(self, dataset, feature_transformer):
+    with tf.device(self._local_device):
+      batch = dataset()
+      num_classes = batch.label_onehot.shape.as_list()[1]
+
+      regression_mod = snt.Linear(num_classes)
+
+      if self.normalize_act:
+
+        def normalize_transformer(x):
+          unnorm_x = feature_transformer(x)
+          return tf.nn.l2_normalize(unnorm_x, 0)
+
+        feature_transformer_wrap = normalize_transformer
+      else:
+        feature_transformer_wrap = feature_transformer
+
+      # construct the variables of the right shape in the sonnet module by
+      # calling a forward pass through the regressor.
+      with utils.assert_no_new_variables():
+        dummy_features = feature_transformer_wrap(batch)
+      regression_mod(dummy_features)
+      reg_w = regression_mod.w
+      reg_b = regression_mod.b
+
+      batch_test = dataset()
+      all_batch = utils.structure_map_multi(lambda x: tf.concat(x, 0), [batch, batch_test])
+      #all_batch = tf.concat([batch, batch_test], 0)
+      # Grab a new batch of data from the dataset.
+      features = feature_transformer_wrap(all_batch)
+      features, features_test = utils.structure_map_split(lambda x: tf.split(x, 2, axis=0), features)
+
+      def center_y(y):
+        y -= tf.reduce_mean(y)
+        y *= tf.rsqrt(tf.reduce_mean(tf.reduce_sum(y**2, axis=[1], keep_dims=True)))
+        return y
+      def get_y_vec(batch):
+        y_pieces = []
+        if hasattr(batch, "label_onehot"):
+          if self.zero_one_labels:
+            y_pieces += [batch.label_onehot]
+          else:
+            y_pieces += [2. * batch.label_onehot - 1.]
+        if hasattr(batch, "regression_target"):
+          y_pieces += [batch.regression_target]
+        y = tf.concat(y_pieces, 1)
+        if self.center_y:
+          y = center_y(y)
+        return y
+
+      y_train = get_y_vec(batch)
+
+      w = solve_ridge(features, y_train, self.ridge_factor)
+
+      # Generate features from another batch to evaluate loss on the validation
+      # set. This provide a less overfit signal to the learned optimizer.
+      y_test = get_y_vec(batch_test)
+
+      def compute_logit(features):
+        # We have updated the classifier mod in previous steps, we need to
+        # substitute out those variables to get new values.
+        replacement = collections.OrderedDict([(reg_w, w[:-1]), (reg_b, w[-1])])
+        with variable_replace.variable_replace(replacement):
+          logits = regression_mod(features)
+
+        return logits
+
+      batch_size = y_train.shape.as_list()[0]
+
+      logit_train = compute_logit(features)
+      logit_test_unnorm = compute_logit(features_test)
+      if self.normalize_y_hat:
+        logit_test = logit_test_unnorm / tf.sqrt(
+            tf.reduce_sum(logit_test_unnorm**2, axis=[1], keep_dims=True))
+      else:
+        logit_test = logit_test_unnorm
+
+      stats = {}
+
+      if self.hinge_loss:
+        # slightly closer to the true classification loss
+        # any distance smaller than 1 is guaranteed to map to the correct class
+        mse_test = tf.reduce_sum(tf.nn.relu(tf.reduce_sum(tf.square(logit_test - y_test), axis=1)-1.)) / batch_size
+      else:
+        mse_test = tf.reduce_sum(tf.square(logit_test - y_test)) / batch_size
+
+      stats["mse_test"] = mse_test
+
+      mse_train = tf.reduce_sum(tf.square(logit_train - y_train)) / batch_size
+      stats["mse_train"] = mse_train
+
+      is_correct_test = tf.equal(tf.argmax(logit_test, 1), tf.argmax(y_test, 1))
+      accuracy_test = tf.reduce_mean(tf.cast(is_correct_test, tf.float32))
+      stats["accuracy_test"] = accuracy_test
+
+      def test_confusion_fn():
+        test_confusion = tf.confusion_matrix(tf.argmax(y_test, 1), tf.argmax(logit_test, 1))
+        test_confusion = tf.to_float(test_confusion) / tf.constant((logit_test.shape.as_list()[0] / float(logit_test.shape.as_list()[1])), dtype=tf.float32)
+        test_confusion = tf.expand_dims(tf.expand_dims(test_confusion, 0), 3)
+        return test_confusion
+      tf.summary.image("test_confusion", test_confusion_fn())
+
+      def train_confusion_fn():
+        train_confusion = tf.confusion_matrix(tf.argmax(y_train, 1), tf.argmax(logit_train, 1))
+        train_confusion = tf.to_float(train_confusion) / tf.constant((logit_train.shape.as_list()[0] / float(logit_train.shape.as_list()[1])), dtype=tf.float32)
+        train_confusion = tf.expand_dims(tf.expand_dims(train_confusion, 0), 3)
+        return train_confusion
+      tf.summary.image("train_confusion", train_confusion_fn())
+
+      is_correct = tf.equal(tf.argmax(logit_train, 1), tf.argmax(y_train, 1))
+      accuracy_train = tf.reduce_mean(tf.cast(is_correct, tf.float32))
+      stats["accuracy_train"] = accuracy_train
+
+      reg = self.ridge_factor * tf.reduce_sum(tf.square(w[:-1])) / batch_size
+      stats["ridge_component"] = reg
+
+      stats["total_loss"] = mse_test + reg
+
+      loss_to_train_at = (reg+ mse_test) * self.test_train_scalar + (mse_train + reg)*(1 - self.test_train_scalar)
+
+      loss_to_train_at = tf.identity(loss_to_train_at)
+
+      # Minimizing the test loss should not require regurization because the
+      # metaobjective is solved for the training loss
+      return loss_to_train_at, stats
+
+  def local_variables(self):
+    """List of variables that need to be updated for each evaluation.
+
+    These variables should not be stored on a parameter server and
+    should be reset every computation of a meta_objective loss.
+
+    Returns:
+      vars: list of tf.Variable
+    """
+    return list(
+        snt.get_variables_in_module(self, tf.GraphKeys.TRAINABLE_VARIABLES))
+
+  def remote_variables(self):
+    return []
--- a/research/learning_unsupervised_learning/meta_objective/sklearn.py
+++ b/research/learning_unsupervised_learning/meta_objective/sklearn.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+"""Meta objectives that fit scikit-learn models on the learned features.
+
+Can NOT be differentiated through.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import numpy as np
+import sonnet as snt
+import tensorflow as tf
+from tensorflow.python.framework import function
+
+from learning_unsupervised_learning import utils
+
+from learning_unsupervised_learning.meta_objective import utils as meta_obj_utils
+
+from sklearn import svm
+from sklearn import linear_model
+
+
+def build_fit(device, model_fn, num_classes, probs=True):
+
+  def _py_fit_predict(trX, trY, teX):
+    assert len(np.unique(trY)) == num_classes
+    model = model_fn()
+    model.fit(trX, trY)
+    trP = model.predict(trX)
+    teP = model.predict(teX)
+    if probs:
+      teP_probs = model.predict_log_proba(teX)
+      return trP.astype(np.int64), teP.astype(np.int64), teP_probs.astype(
+          np.float32)
+    else:
+      teP = model.predict(teX)
+      return trP.astype(np.int64), teP.astype(np.int64)
+
+  def return_fn(trX, trY, teX):
+    with tf.device(device):
+      with tf.device("/cpu:0"):
+        if probs:
+          return tf.py_func(
+              _py_fit_predict,
+              [tf.identity(trX),
+               tf.identity(trY),
+               tf.identity(teX)], [tf.int64, tf.int64, tf.float32])
+        else:
+          return tf.py_func(
+              _py_fit_predict,
+              [tf.identity(trX),
+               tf.identity(trY),
+               tf.identity(teX)], [tf.int64, tf.int64])
+
+  return return_fn
+
+
+class SKLearn(meta_obj_utils.MultiTrialMetaObjective):
+
+  def __init__(
+      self,
+      local_device=None,
+      remote_device=None,
+      averages=1,
+      samples_per_class=10,
+      probs=False,
+      stddev=0.01,
+      n_samples=10,
+      name="SKLearn",
+  ):
+    self._local_device = local_device
+    self._remote_device = remote_device
+    self.name = name
+    self.probs = probs
+    self.n_samples = n_samples
+    self.stddev = stddev
+
+    super(SKLearn, self).__init__(
+        name=name, samples_per_class=samples_per_class, averages=averages)
+
+  def _get_model(self):
+    raise NotImplemented()
+
+  def _build_once(self, dataset, feature_transformer):
+    with tf.device(self._local_device):
+      tr_batch = dataset()
+      te_batch = dataset()
+      num_classes = tr_batch.label_onehot.shape.as_list()[1]
+      all_batch = utils.structure_map_multi(lambda x: tf.concat(x, 0),
+                                            [tr_batch, te_batch])
+      features = feature_transformer(all_batch)
+      trX, teX = utils.structure_map_split(lambda x: tf.split(x, 2, axis=0),
+                                           features)
+      trY = tf.to_int64(tr_batch.label)
+      trY_onehot = tf.to_int32(tr_batch.label_onehot)
+      teY = tf.to_int64(te_batch.label)
+      teY_shape = teY.shape.as_list()
+
+      def blackbox((trX, trY, teX, teY)):
+        trY = tf.to_int32(tf.rint(trY))
+        teY = tf.to_int32(tf.rint(teY))
+        tf_fn = build_fit(
+            self._local_device,
+            self._get_model,
+            num_classes=num_classes,
+            probs=self.probs)
+        if self.probs:
+          trP, teP, teP_probs = tf_fn(trX, trY, teX)
+        else:
+          trP, teP = tf_fn(trX, trY, teX)
+
+        teY.set_shape(teY_shape)
+        if self.probs:
+          onehot = tf.one_hot(teY, num_classes)
+          crossent = -tf.reduce_sum(onehot * teP_probs, [1])
+          return tf.reduce_mean(crossent)
+        else:
+          # use error rate as the loss if no surrogate is avalible.
+          return 1 - tf.reduce_mean(
+              tf.to_float(tf.equal(teY, tf.to_int32(teP))))
+
+      test_loss = blackbox((trX, tf.to_float(trY), teX, tf.to_float(teY)))
+
+      stats = {}
+
+      tf_fn = build_fit(
+          self._local_device,
+          self._get_model,
+          num_classes=num_classes,
+          probs=self.probs)
+      if self.probs:
+        trP, teP, teP_probs = tf_fn(trX, trY, teX)
+      else:
+        trP, teP = tf_fn(trX, trY, teX)
+      stats["%s/accuracy_train" % self.name] = tf.reduce_mean(
+          tf.to_float(tf.equal(tf.to_int32(trY), tf.to_int32(trP))))
+      stats["%s/accuracy_test" % self.name] = tf.reduce_mean(
+          tf.to_float(tf.equal(tf.to_int32(teY), tf.to_int32(teP))))
+      stats["%s/test_loss" % self.name] = test_loss
+      return test_loss, stats
+
+
+class LogisticRegression(SKLearn):
+
+  def __init__(self, C=1.0, name="LogisticRegression", probs=True, **kwargs):
+    self.C = C
+    super(LogisticRegression, self).__init__(name=name, probs=probs, **kwargs)
+
+  def _get_model(self):
+    return linear_model.LogisticRegression(C=self.C)
--- a/research/learning_unsupervised_learning/meta_objective/utils.py
+++ b/research/learning_unsupervised_learning/meta_objective/utils.py
+# Copyright 2018 Google, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import numpy as np
+import sonnet as snt
+import tensorflow as tf
+
+from learning_unsupervised_learning import optimizers
+from learning_unsupervised_learning import utils
+from learning_unsupervised_learning import summary_utils
+from learning_unsupervised_learning import variable_replace
+
+class MultiTrialMetaObjective(snt.AbstractModule):
+  def __init__(self, samples_per_class, averages, **kwargs):
+    self.samples_per_class = samples_per_class
+    self.averages = averages
+    self.dataset_map = {}
+
+    super(MultiTrialMetaObjective,
+          self).__init__(**kwargs)
+
+  def _build(self, dataset, feature_transformer):
+    if self.samples_per_class is not None:
+      if dataset not in self.dataset_map:
+        # datasets are outside of frames from while loops
+        with tf.control_dependencies(None):
+          self.dataset_map[dataset] = utils.sample_n_per_class(
+              dataset, self.samples_per_class)
+
+      dataset = self.dataset_map[dataset]
+
+    stats = collections.defaultdict(list)
+    losses = []
+    # TODO(lmetz) move this to ingraph control flow?
+    for _ in xrange(self.averages):
+      loss, stat = self._build_once(dataset, feature_transformer)
+      losses.append(loss)
+      for k, v in stat.items():
+        stats[k].append(v)
+    stats = {k: tf.add_n(v) / float(len(v)) for k, v in stats.items()}
+
+    for k, v in stats.items():
+      tf.summary.scalar(k, v)
+
+    return tf.add_n(losses) / float(len(losses))
+
+  def local_variables(self):
+    """List of variables that need to be updated for each evaluation.
+
+    These variables should not be stored on a parameter server and
+    should be reset every computation of a meta_objective loss.
+
+    Returns:
+      vars: list of tf.Variable
+    """
+    return list(
+        snt.get_variables_in_module(self, tf.GraphKeys.TRAINABLE_VARIABLES))
+
+  def remote_variables(self):
+    return []
--- a/research/learning_unsupervised_learning/optimizers.py
+++ b/research/learning_unsupervised_learning/optimizers.py
--- a/research/learning_unsupervised_learning/run_eval.py
+++ b/research/learning_unsupervised_learning/run_eval.py
--- a/research/learning_unsupervised_learning/summary_utils.py
+++ b/research/learning_unsupervised_learning/summary_utils.py