Commit ac0829fa authored by Xin Pan's avatar Xin Pan
Browse files

Consolidate privacy/ and differential_privacy/.

parent 8cca18b0
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from differential_privacy.multiple_teachers import deep_cnn
from differential_privacy.multiple_teachers import input
from differential_privacy.multiple_teachers import metrics
# Command-line flags controlling the dataset, model architecture, and
# training run for a single teacher in the PATE ensemble.
tf.flags.DEFINE_string('dataset', 'svhn', 'The name of the dataset to use')
tf.flags.DEFINE_integer('nb_labels', 10, 'Number of output classes')
# NOTE(review): both directories default under /tmp, so checkpoints may not
# survive a reboot -- point them elsewhere for persistent training runs.
tf.flags.DEFINE_string('data_dir', '/tmp', 'Temporary storage')
tf.flags.DEFINE_string('train_dir', '/tmp/train_dir',
                       'Where model ckpt are saved')
tf.flags.DEFINE_integer('max_steps', 3000, 'Number of training steps to run.')
tf.flags.DEFINE_integer('nb_teachers', 50, 'Teachers in the ensemble.')
tf.flags.DEFINE_integer('teacher_id', 0, 'ID of teacher being trained.')
tf.flags.DEFINE_boolean('deeper', False, 'Activate deeper CNN model')

# Parsed flag values, read throughout this module.
FLAGS = tf.flags.FLAGS
def train_teacher(dataset, nb_teachers, teacher_id):
  """
  Train one teacher model (identified by teacher_id) out of an ensemble of
  nb_teachers models, on its own disjoint partition of the training data.
  :param dataset: string corresponding to dataset (svhn, cifar10)
  :param nb_teachers: total number of teachers in the ensemble
  :param teacher_id: id of the teacher being trained
  :return: True if everything went well
  """
  # Make sure the data and checkpoint directories exist before training
  assert input.create_dir_if_needed(FLAGS.data_dir)
  assert input.create_dir_if_needed(FLAGS.train_dir)

  # Load the full dataset for the requested task
  if dataset == 'svhn':
    train_data, train_labels, test_data, test_labels = input.ld_svhn(extended=True)
  elif dataset == 'cifar10':
    train_data, train_labels, test_data, test_labels = input.ld_cifar10()
  elif dataset == 'mnist':
    train_data, train_labels, test_data, test_labels = input.ld_mnist()
  else:
    print("Check value of dataset flag")
    return False

  # Keep only the slice of the training set assigned to this teacher
  data, labels = input.partition_dataset(train_data,
                                         train_labels,
                                         nb_teachers,
                                         teacher_id)
  print("Length of training data: " + str(len(labels)))

  # Build the checkpoint filename for this teacher (deeper models get a
  # distinct suffix so both variants can coexist in train_dir)
  suffix = '_deep.ckpt' if FLAGS.deeper else '.ckpt'
  filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + suffix
  ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename

  # Train the teacher on its partition
  assert deep_cnn.train(data, labels, ckpt_path)

  # The saver appends the final global step to the checkpoint name
  ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

  # Evaluate the trained teacher on the held-out test set
  teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final)
  precision = metrics.accuracy(teacher_preds, test_labels)
  print('Precision of teacher after training: ' + str(precision))

  return True
def main(argv=None):  # pylint: disable=unused-argument
  # Train the single teacher selected by the command-line flags
  assert train_teacher(FLAGS.dataset, FLAGS.nb_teachers, FLAGS.teacher_id)


if __name__ == '__main__':
  tf.app.run()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
def batch_indices(batch_nb, data_length, batch_size):
  """
  Compute the (start, end) slice indices of one batch.
  :param batch_nb: the batch number
  :param data_length: the total length of the data being parsed by batches
  :param batch_size: the number of inputs in each batch
  :return: pair of (start, end) indices
  """
  start = int(batch_nb * batch_size)
  end = int((batch_nb + 1) * batch_size)

  # The last batch may run past the end of the data; slide the whole window
  # back so it stays full, reusing some inputs from the previous batch.
  overrun = end - data_length
  if overrun > 0:
    start -= overrun
    end -= overrun

  return start, end
# Visibility is restricted to the "internal" package_group defined below.
package(default_visibility = [":internal"])

licenses(["notice"])  # Apache 2.0

exports_files(["LICENSE"])

# Packages allowed to depend on targets in this package.
package_group(
    name = "internal",
    packages = [
        "//third_party/tensorflow_models/...",
    ],
)

# Standalone utility for computing log moments of the Gaussian mechanism.
py_binary(
    name = "gaussian_moments",
    srcs = [
        "gaussian_moments.py",
    ],
    deps = [
    ],
)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A standalone utility for computing the log moments.
The utility for computing the log moments. It consists of two methods.
compute_log_moment(q, sigma, T, lmbd) computes the log moment with sampling
probability q, noise sigma, order lmbd, and T steps. get_privacy_spent computes
delta (or eps) given log moments and eps (or delta).
Example use:
Suppose that we have run an algorithm with parameters, an array of
(q1, sigma1, T1) ... (qk, sigmak, Tk), and we wish to compute eps for a given
delta. The example code would be:
max_lmbd = 32
lmbds = range(1, max_lmbd + 1)
log_moments = []
for lmbd in lmbds:
log_moment = 0
for q, sigma, T in parameters:
log_moment += compute_log_moment(q, sigma, T, lmbd)
log_moments.append((lmbd, log_moment))
eps, delta = get_privacy_spent(log_moments, target_delta=delta)
To verify that the I1 >= I2 (see comments in GaussianMomentsAccountant in
accountant.py for the context), run the same loop above with verify=True
passed to compute_log_moment.
"""
import math
import sys

import numpy as np
import scipy.integrate as integrate
import scipy.special
import scipy.stats
from sympy.mpmath import mp
def _to_np_float64(v):
if math.isnan(v) or math.isinf(v):
return np.inf
return np.float64(v)
######################
# FLOAT64 ARITHMETIC #
######################
def pdf_gauss(x, sigma, mean=0):
  """Gaussian density at x with the given mean and standard deviation sigma."""
  return scipy.stats.norm(loc=mean, scale=sigma).pdf(x)
def cropped_ratio(a, b):
  # Treat 0/0 (both densities numerically zero, deep in the Gaussian tails)
  # as ratio 1 to avoid spurious inf/NaN in the integrands.
  return 1. if (a < 1E-50 and b < 1E-50) else a / b
def integral_inf(fn):
  """Integrate fn over the whole real line (quadrature error is discarded)."""
  value, _err = integrate.quad(fn, -np.inf, np.inf)
  return value
def integral_bounded(fn, lb, ub):
  """Integrate fn over the finite interval [lb, ub]."""
  value, _err = integrate.quad(fn, lb, ub)
  return value
def distributions(sigma, q):
  """Return (mu0, mu1, mu): Gaussians centered at 0 and 1 with std sigma,
  and their q-mixture, each as a callable density."""
  def mu0(y):
    return pdf_gauss(y, sigma=sigma, mean=0.0)

  def mu1(y):
    return pdf_gauss(y, sigma=sigma, mean=1.0)

  def mu(y):
    return (1 - q) * mu0(y) + q * mu1(y)

  return mu0, mu1, mu
def compute_a(sigma, q, lmbd, verbose=False):
  """Compute A_lambda (the I1 moment) exactly via binomial expansion.

  Args:
    sigma: the noise sigma of the Gaussian mechanism.
    q: the sampling ratio.
    lmbd: the moment order (rounded up to an integer).
    verbose: if True, print the terms of the expansion.

  Returns:
    A_lambda as np.float64 (np.inf if the computation overflowed).
  """
  lmbd_int = int(math.ceil(lmbd))
  if lmbd_int == 0:
    # The zeroth moment is trivially 1.
    return 1.0

  a_lambda_first_term_exact = 0
  a_lambda_second_term_exact = 0
  # Binomial expansion of E[(mu/mu0)^lmbd]; the exponentials are the exact
  # Gaussian moment terms. `range` replaces the Python-2-only `xrange`.
  for i in range(lmbd_int + 1):
    coef_i = scipy.special.binom(lmbd_int, i) * (q ** i)
    s1, s2 = 0, 0
    for j in range(i + 1):
      coef_j = scipy.special.binom(i, j) * (-1) ** (i - j)
      s1 += coef_j * np.exp((j * j - j) / (2.0 * (sigma ** 2)))
      s2 += coef_j * np.exp((j * j + j) / (2.0 * (sigma ** 2)))
    a_lambda_first_term_exact += coef_i * s1
    a_lambda_second_term_exact += coef_i * s2

  a_lambda_exact = ((1.0 - q) * a_lambda_first_term_exact +
                    q * a_lambda_second_term_exact)
  if verbose:
    # print() call for Python 3 compatibility: the original Python 2 print
    # statement fails to parse under Python 3.
    print("A: by binomial expansion {} = {} + {}".format(
        a_lambda_exact,
        (1.0 - q) * a_lambda_first_term_exact,
        q * a_lambda_second_term_exact))
  return _to_np_float64(a_lambda_exact)
def compute_b(sigma, q, lmbd, verbose=False):
  """Compute B_lambda (the I2 moment) by float64 numerical integration.

  Args:
    sigma: the noise sigma of the Gaussian mechanism.
    q: the sampling ratio.
    lmbd: the moment order.
    verbose: if True, print intermediate integration results.

  Returns:
    B_lambda as np.float64 (np.inf if the integration overflowed).
  """
  mu0, _, mu = distributions(sigma, q)

  b_lambda_fn = lambda z: mu0(z) * np.power(cropped_ratio(mu0(z), mu(z)), lmbd)
  b_lambda = integral_inf(b_lambda_fn)

  # Crossover point m beyond which the sign of the difference integrand is
  # known; used to bound B via the bounded integrals below.
  m = sigma ** 2 * (np.log((2. - q) / (1. - q)) + 1. / (2 * sigma ** 2))
  b_fn = lambda z: (np.power(mu0(z) / mu(z), lmbd) -
                    np.power(mu(-z) / mu0(z), lmbd))
  if verbose:
    # print() calls for Python 3 compatibility: the original Python 2 print
    # statements fail to parse under Python 3.
    print("M =", m)
    print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m)))
  assert b_fn(-m) < 0 and b_fn(m) < 0

  b_lambda_int1_fn = lambda z: (mu0(z) *
                                np.power(cropped_ratio(mu0(z), mu(z)), lmbd))
  b_lambda_int2_fn = lambda z: (mu0(z) *
                                np.power(cropped_ratio(mu(z), mu0(z)), lmbd))
  b_int1 = integral_bounded(b_lambda_int1_fn, -m, m)
  b_int2 = integral_bounded(b_lambda_int2_fn, -m, m)

  a_lambda_m1 = compute_a(sigma, q, lmbd - 1)
  b_bound = a_lambda_m1 + b_int1 - b_int2

  if verbose:
    print("B: by numerical integration", b_lambda)
    print("B must be no more than ", b_bound)
    print(b_lambda, b_bound)
  return _to_np_float64(b_lambda)
###########################
# MULTIPRECISION ROUTINES #
###########################
def pdf_gauss_mp(x, sigma, mean):
  """Gaussian density at x, evaluated in arbitrary precision via mpmath."""
  norm_const = mp.mpf(1.) / mp.sqrt(mp.mpf("2.") * sigma ** 2 * mp.pi)
  return norm_const * mp.exp(-(x - mean) ** 2 / (mp.mpf("2.") * sigma ** 2))
def integral_inf_mp(fn):
  """Integrate fn over the real line in arbitrary precision (error dropped)."""
  value, _err = mp.quad(fn, [-mp.inf, mp.inf], error=True)
  return value
def integral_bounded_mp(fn, lb, ub):
  """Integrate fn over [lb, ub] in arbitrary precision (error dropped)."""
  value, _err = mp.quad(fn, [lb, ub], error=True)
  return value
def distributions_mp(sigma, q):
  """Arbitrary-precision analogue of distributions(): returns (mu0, mu1, mu)."""
  def mu0(y):
    return pdf_gauss_mp(y, sigma=sigma, mean=mp.mpf(0))

  def mu1(y):
    return pdf_gauss_mp(y, sigma=sigma, mean=mp.mpf(1))

  def mu(y):
    return (1 - q) * mu0(y) + q * mu1(y)

  return mu0, mu1, mu
def compute_a_mp(sigma, q, lmbd, verbose=False):
  """Compute A_lambda (the I1 moment) by multi-precision integration.

  Args:
    sigma: the noise sigma of the Gaussian mechanism.
    q: the sampling ratio.
    lmbd: the moment order (rounded up to an integer).
    verbose: if True, print the integration results.

  Returns:
    A_lambda as np.float64 (np.inf if the result overflowed float64).
  """
  lmbd_int = int(math.ceil(lmbd))
  if lmbd_int == 0:
    # The zeroth moment is trivially 1.
    return 1.0
  mu0, mu1, mu = distributions_mp(sigma, q)
  a_lambda_fn = lambda z: mu(z) * (mu(z) / mu0(z)) ** lmbd_int
  a_lambda_first_term_fn = lambda z: mu0(z) * (mu(z) / mu0(z)) ** lmbd_int
  a_lambda_second_term_fn = lambda z: mu1(z) * (mu(z) / mu0(z)) ** lmbd_int
  a_lambda = integral_inf_mp(a_lambda_fn)
  a_lambda_first_term = integral_inf_mp(a_lambda_first_term_fn)
  a_lambda_second_term = integral_inf_mp(a_lambda_second_term_fn)
  if verbose:
    # print() call for Python 3 compatibility: the original Python 2 print
    # statement fails to parse under Python 3.
    print("A: by numerical integration {} = {} + {}".format(
        a_lambda,
        (1 - q) * a_lambda_first_term,
        q * a_lambda_second_term))
  return _to_np_float64(a_lambda)
def compute_b_mp(sigma, q, lmbd, verbose=False):
  """Compute B_lambda (the I2 moment) by multi-precision integration.

  Args:
    sigma: the noise sigma of the Gaussian mechanism.
    q: the sampling ratio.
    lmbd: the moment order (rounded up to an integer).
    verbose: if True, print intermediate integration results.

  Returns:
    B_lambda as np.float64 (np.inf if the result overflowed float64).
  """
  lmbd_int = int(math.ceil(lmbd))
  if lmbd_int == 0:
    # The zeroth moment is trivially 1.
    return 1.0
  mu0, _, mu = distributions_mp(sigma, q)
  b_lambda_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** lmbd_int
  b_lambda = integral_inf_mp(b_lambda_fn)
  # Crossover point m beyond which the sign of the difference integrand is
  # known; used to bound B via the bounded integrals below.
  m = sigma ** 2 * (mp.log((2 - q) / (1 - q)) + 1 / (2 * (sigma ** 2)))
  b_fn = lambda z: ((mu0(z) / mu(z)) ** lmbd_int -
                    (mu(-z) / mu0(z)) ** lmbd_int)
  if verbose:
    # print() calls for Python 3 compatibility: the original Python 2 print
    # statements fail to parse under Python 3.
    print("M =", m)
    print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m)))
  assert b_fn(-m) < 0 and b_fn(m) < 0
  b_lambda_int1_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** lmbd_int
  b_lambda_int2_fn = lambda z: mu0(z) * (mu(z) / mu0(z)) ** lmbd_int
  b_int1 = integral_bounded_mp(b_lambda_int1_fn, -m, m)
  b_int2 = integral_bounded_mp(b_lambda_int2_fn, -m, m)
  a_lambda_m1 = compute_a_mp(sigma, q, lmbd - 1)
  b_bound = a_lambda_m1 + b_int1 - b_int2
  if verbose:
    print("B by numerical integration", b_lambda)
    print("B must be no more than ", b_bound)
  # Sanity check: I2 (b_lambda) must not exceed its bound derived from I1.
  assert b_lambda < b_bound + 1e-5
  return _to_np_float64(b_lambda)
def _compute_delta(log_moments, eps):
"""Compute delta for given log_moments and eps.
Args:
log_moments: the log moments of privacy loss, in the form of pairs
of (moment_order, log_moment)
eps: the target epsilon.
Returns:
delta
"""
min_delta = 1.0
for moment_order, log_moment in log_moments:
if moment_order == 0:
continue
if math.isinf(log_moment) or math.isnan(log_moment):
sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order)
continue
if log_moment < moment_order * eps:
min_delta = min(min_delta,
math.exp(log_moment - moment_order * eps))
return min_delta
def _compute_eps(log_moments, delta):
"""Compute epsilon for given log_moments and delta.
Args:
log_moments: the log moments of privacy loss, in the form of pairs
of (moment_order, log_moment)
delta: the target delta.
Returns:
epsilon
"""
min_eps = float("inf")
for moment_order, log_moment in log_moments:
if moment_order == 0:
continue
if math.isinf(log_moment) or math.isnan(log_moment):
sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order)
continue
min_eps = min(min_eps, (log_moment - math.log(delta)) / moment_order)
return min_eps
def compute_log_moment(q, sigma, steps, lmbd, verify=False, verbose=False):
  """Compute the log moment of Gaussian mechanism for given parameters.

  Args:
    q: the sampling ratio.
    sigma: the noise sigma.
    steps: the number of steps.
    lmbd: the moment order.
    verify: if False, only compute the symbolic version. If True, computes
      both symbolic and numerical solutions and verifies the results match.
    verbose: if True, print out debug information.
  Returns:
    the log moment with type np.float64, could be np.inf.
  """
  moment = compute_a(sigma, q, lmbd, verbose=verbose)
  if verify:
    mp.dps = 50  # digits of precision for the mpmath cross-check
    moment_a_mp = compute_a_mp(sigma, q, lmbd, verbose=verbose)
    moment_b_mp = compute_b_mp(sigma, q, lmbd, verbose=verbose)
    # The exact binomial expansion and the numerical integration must agree.
    np.testing.assert_allclose(moment, moment_a_mp, rtol=1e-10)
    if not np.isinf(moment_a_mp):
      # The following test fails for (1, np.inf)!
      np.testing.assert_array_less(moment_b_mp, moment_a_mp)
  if np.isinf(moment):
    return np.inf
  # Moments compose over steps: the per-step log moment scales linearly.
  return steps * np.log(moment)
def get_privacy_spent(log_moments, target_eps=None, target_delta=None):
  """Compute delta (or eps) for given eps (or delta) from log moments.

  Args:
    log_moments: array of (moment_order, log_moment) pairs.
    target_eps: if not None, the epsilon for which we would like to compute
      corresponding delta value.
    target_delta: if not None, the delta for which we would like to compute
      corresponding epsilon value. Exactly one of target_eps and target_delta
      is None.
  Returns:
    eps, delta pair
  """
  # Exactly one of the two targets must be supplied.
  assert (target_eps is None) ^ (target_delta is None)
  assert not ((target_eps is None) and (target_delta is None))
  if target_eps is None:
    return (_compute_eps(log_moments, target_delta), target_delta)
  return (target_eps, _compute_delta(log_moments, target_eps))
# Visibility is restricted to the "internal" package_group defined below.
package(default_visibility = [":internal"])

licenses(["notice"])  # Apache 2.0

exports_files(["LICENSE"])

# Packages allowed to depend on targets in this package.
package_group(
    name = "internal",
    packages = [
        "//differential_privacy/...",
    ],
)

# Privacy accountant library (moments accountant and amortized accountant).
py_library(
    name = "accountant",
    srcs = [
        "accountant.py",
    ],
    deps = [
        "//differential_privacy/dp_sgd/dp_optimizer:utils",
    ],
)

# Unit tests for the accountant library.
py_test(
    name = "accountant_test",
    srcs = ["accountant_test.py"],
    deps = [
        ":accountant",
        "//differential_privacy/dp_sgd/dp_optimizer:utils",
    ],
)
......@@ -32,7 +32,7 @@ import sys
import numpy
import tensorflow as tf
from differential_privacy.dp_optimizer import utils
from differential_privacy.dp_sgd.dp_optimizer import utils
EpsDelta = collections.namedtuple("EpsDelta", ["spent_eps", "spent_delta"])
......@@ -312,9 +312,17 @@ class GaussianMomentsAccountant(MomentsAccountant):
k steps. Since we use direct estimate, the obtained privacy bound has tight
constant.
For GaussianMomentAccountant, it suffices to compute I1, as I1 >= I2
(TODO(liqzhang): make sure this is true.), which reduce to computing
E(P(x+s)/P(x+s-1) - 1)^i for s = 0 and 1.
For GaussianMomentAccountant, it suffices to compute I1, as I1 >= I2,
which reduce to computing E(P(x+s)/P(x+s-1) - 1)^i for s = 0 and 1. In the
companion gaussian_moments.py file, we supply procedure for computing both
I1 and I2 (the computation of I2 is through multi-precision integration
package). It can be verified that indeed I1 >= I2 for wide range of parameters
we have tried, though at the moment we are unable to prove this claim.
We recommend that when using this accountant, users independently verify
using gaussian_moments.py that for their parameters, I1 is indeed larger
than I2. This can be done by following the instructions in
gaussian_moments.py.
"""
def __init__(self, total_examples, moment_orders=32):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment