Unverified Commit 911a0d23 authored by Ilya Mironov's avatar Ilya Mironov Committed by GitHub
Browse files

Merge pull request #4197 from ilyamironov/master

Updates to differential_privacy/pate following the Banff workshop.
parents bc0edaf8 004dd361
......@@ -51,9 +51,11 @@ The output is written to the console.
* plot_partition.py — Script for producing partition.pdf, a detailed breakdown of privacy
costs for Confident-GNMax with smooth sensitivity analysis (takes ~50 hours).
* rdp_flow.py and plot_ls_q.py are currently not used.
* plots_for_slides.py — Script for producing several plots for the slide deck.
* download.py — Utility script for populating the data/ directory.
* plot_ls_q.py is not used.
All Python files take flags. Run script_name.py --help for help on flags.
......@@ -186,7 +186,7 @@ def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta):
ss=rdp_ss[order_idx],
delta=-math.log(delta) / (order_opt[i] - 1))
ss_std_opt[i] = ss_std[order_idx]
if i > 0 and (i + 1) % 10 == 0:
if i > 0 and (i + 1) % 1 == 0:
print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} '
'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, '
'step2 = {:.3f}, ss = {:.3f}'.format(
......@@ -292,6 +292,8 @@ def plot_partition(figures_dir, gnmax_conf, print_order):
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
fig.patch.set_alpha(0)
l1 = ax.plot(
x, y3, color='b', ls='-', label=r'Total privacy cost', linewidth=1).pop()
......@@ -311,8 +313,8 @@ def plot_partition(figures_dir, gnmax_conf, print_order):
plt.xlim([0, xlim])
ax.set_ylim([0, 3.])
ax.set_xlabel('Number of queries answered', fontsize=10)
ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=10)
ax.set_xlabel('Number of queries answered', fontsize=16)
ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
# Merging legends.
if print_order:
......@@ -321,11 +323,11 @@ def plot_partition(figures_dir, gnmax_conf, print_order):
x, y_right, 'r', ls='-', label=r'Optimal order', linewidth=5,
alpha=.5).pop()
ax2.grid(False)
ax2.set_ylabel(r'Optimal Renyi order', fontsize=16)
# ax2.set_ylabel(r'Optimal Renyi order', fontsize=16)
ax2.set_ylim([0, 200.])
ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=0, fontsize=13)
# ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=0, fontsize=13)
ax.tick_params(labelsize=10)
ax.tick_params(labelsize=14)
plot_filename = os.path.join(figures_dir, 'partition.pdf')
print('Saving the graph to ' + plot_filename)
fig.savefig(plot_filename, bbox_inches='tight', dpi=800)
......@@ -387,7 +389,7 @@ def main(argv):
figures_dir = os.path.expanduser(FLAGS.figures_dir)
plot_comparison(figures_dir, simple_ind, conf_ind, simple_dep, conf_dep)
plot_partition(figures_dir, conf_dep, False)
plot_partition(figures_dir, conf_dep, True)
plt.close('all')
......
......@@ -13,9 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Plots two graphs illustrating cost of privacy per answered query.
PRESENTLY NOT USED.
"""Plots graphs for the slide deck.
A script in support of the PATE2 paper. The input is a file containing a numpy
array of votes, one query per row, one class per column. Ex:
......@@ -23,7 +21,7 @@ array of votes, one query per row, one class per column. Ex:
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of two pdf files.
The output graphs are visualized using the TkAgg backend.
"""
from __future__ import absolute_import
from __future__ import division
......@@ -35,43 +33,51 @@ import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
import random
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_string('counts_file', '', 'Counts file.')
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
flags.DEFINE_boolean('transparent', False, 'Set background to transparent.')
flags.mark_flag_as_required('counts_file')
def plot_rdp_curve_per_example(votes, sigmas):
orders = np.linspace(1., 100., endpoint=True, num=1000)
orders[0] = 1.5
def setup_plot():
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
styles = [':', '-']
labels = ['ex1', 'ex2']
if FLAGS.transparent:
fig.patch.set_alpha(0)
return fig, ax
for i in xrange(votes.shape[0]):
print(sorted(votes[i,], reverse=True)[:10])
def plot_rdp_curve_per_example(votes, sigmas):
orders = np.linspace(1., 100., endpoint=True, num=1000)
orders[0] = 1.001
fig, ax = setup_plot()
for i in range(votes.shape[0]):
for sigma in sigmas:
logq = pate.compute_logq_gaussian(votes[i,], sigma)
rdp = pate.rdp_gaussian(logq, sigma, orders)
ax.plot(
orders,
rdp,
label=r'{} $\sigma$={}'.format(labels[i], int(sigma)),
linestyle=styles[i],
alpha=1.,
label=r'Data-dependent bound, $\sigma$={}'.format(int(sigma)),
linewidth=5)
for sigma in sigmas:
......@@ -79,21 +85,45 @@ def plot_rdp_curve_per_example(votes, sigmas):
orders,
pate.rdp_data_independent_gaussian(sigma, orders),
alpha=.3,
label=r'Data-ind bound $\sigma$={}'.format(int(sigma)),
label=r'Data-independent bound, $\sigma$={}'.format(int(sigma)),
linewidth=10)
plt.yticks([0, .01])
plt.xlabel(r'Order $\lambda$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\lambda$', fontsize=16)
plt.xlim(xmin=1, xmax=100)
plt.ylim(ymin=0)
plt.xticks([1, 20, 40, 60, 80, 100])
plt.yticks([0, .0025, .005, .0075, .01])
plt.xlabel(r'Order $\alpha$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'rdp_flow1.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.legend(loc=0, fontsize=13)
plt.show()
def plot_rdp_of_sigma(v, order):
sigmas = np.linspace(1., 1000., endpoint=True, num=1000)
fig, ax = setup_plot()
y = np.zeros(len(sigmas))
for i, sigma in enumerate(sigmas):
logq = pate.compute_logq_gaussian(v, sigma)
y[i] = pate.rdp_gaussian(logq, sigma, order)
ax.plot(sigmas, y, alpha=.8, linewidth=5)
plt.xlim(xmin=1, xmax=1000)
plt.ylim(ymin=0)
# plt.yticks([0, .0004, .0008, .0012])
ax.tick_params(labelleft='off')
plt.xlabel(r'Noise $\sigma$', fontsize=16)
plt.ylabel(r'RDP at order $\alpha={}$'.format(order), fontsize=16)
ax.tick_params(labelsize=14)
# plt.legend(loc=0, fontsize=13)
plt.show()
def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders,
target_answered):
rdp_cum = np.zeros(len(orders))
......@@ -115,46 +145,138 @@ def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders,
def plot_rdp_total(votes, sigmas):
orders = np.linspace(1., 100., endpoint=True, num=100)
orders[0] = 1.5
orders[0] = 1.1
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
fig, ax = setup_plot()
target_answered = 2000
for sigma in sigmas:
rdp = compute_rdp_curve(votes, 5000, 1000, sigma, orders, 2000)
rdp = compute_rdp_curve(votes, 5000, 1000, sigma, orders, target_answered)
ax.plot(
orders,
rdp,
alpha=.8,
label=r'$\sigma$={}'.format(int(sigma)),
label=r'Data-dependent bound, $\sigma$={}'.format(int(sigma)),
linewidth=5)
plt.xlabel(r'Order $\lambda$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\lambda$', fontsize=16)
# for sigma in sigmas:
# ax.plot(
# orders,
# target_answered * pate.rdp_data_independent_gaussian(sigma, orders),
# alpha=.3,
# label=r'Data-independent bound, $\sigma$={}'.format(int(sigma)),
# linewidth=10)
plt.xlim(xmin=1, xmax=100)
plt.ylim(ymin=0)
plt.xticks([1, 20, 40, 60, 80, 100])
plt.yticks([0, .0005, .001, .0015, .002])
plt.xlabel(r'Order $\alpha$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'rdp_flow2.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.legend(loc=0, fontsize=13)
plt.show()
def plot_data_ind_curve():
fig, ax = setup_plot()
orders = np.linspace(1., 10., endpoint=True, num=1000)
orders[0] = 1.01
ax.plot(
orders,
pate.rdp_data_independent_gaussian(1., orders),
alpha=.5,
color='gray',
linewidth=10)
# plt.yticks([])
plt.xlim(xmin=1, xmax=10)
plt.ylim(ymin=0)
plt.xticks([1, 3, 5, 7, 9])
ax.tick_params(labelsize=14)
plt.show()
def plot_two_data_ind_curves():
orders = np.linspace(1., 100., endpoint=True, num=1000)
orders[0] = 1.001
fig, ax = setup_plot()
for sigma in [100, 150]:
ax.plot(
orders,
pate.rdp_data_independent_gaussian(sigma, orders),
alpha=.3,
label=r'Data-independent bound, $\sigma$={}'.format(int(sigma)),
linewidth=10)
plt.xlim(xmin=1, xmax=100)
plt.ylim(ymin=0)
plt.xticks([1, 20, 40, 60, 80, 100])
plt.yticks([0, .0025, .005, .0075, .01])
plt.xlabel(r'Order $\alpha$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
ax.tick_params(labelsize=14)
plt.legend(loc=0, fontsize=13)
plt.show()
def scatter_plot(votes, threshold, sigma1, sigma2, order):
fig, ax = setup_plot()
x = []
y = []
for i, v in enumerate(votes):
if threshold is not None and sigma1 is not None:
q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
else:
q_step1 = 1.
if random.random() < q_step1:
logq_step2 = pate.compute_logq_gaussian(v, sigma2)
x.append(max(v))
y.append(pate.rdp_gaussian(logq_step2, sigma2, order))
print('Selected {} queries.'.format(len(x)))
# Plot the data-independent curve:
# data_ind = pate.rdp_data_independent_gaussian(sigma, order)
# plt.plot([0, 5000], [data_ind, data_ind], color='tab:blue', linestyle='-', linewidth=2)
ax.set_yscale('log')
plt.xlim(xmin=0, xmax=5000)
plt.ylim(ymin=1e-300, ymax=1)
plt.yticks([1, 1e-100, 1e-200, 1e-300])
plt.scatter(x, y, s=1, alpha=0.5)
plt.ylabel(r'RDP at $\alpha={}$'.format(order), fontsize=16)
plt.xlabel(r'max count', fontsize=16)
ax.tick_params(labelsize=14)
plt.show()
def main(argv):
del argv # Unused.
fin_name = os.path.expanduser(FLAGS.counts_file)
print('Reading raw votes from ' + fin_name)
sys.stdout.flush()
votes = np.load(fin_name)
votes = votes[:12000,] # truncate to 4000 samples
plot_data_ind_curve()
plot_two_data_ind_curves()
v1 = [2550, 2200, 250] # based on votes[2,]
v2 = [2600, 2200, 200] # based on votes[381,]
plot_rdp_curve_per_example(np.array([v1, v2]), (100., 150.))
# v2 = [2600, 2200, 200] # based on votes[381,]
plot_rdp_curve_per_example(np.array([v1]), (100., 150.))
plot_rdp_of_sigma(np.array(v1), 20.)
votes = np.load(fin_name)
plot_rdp_total(votes, (100., 150.))
plot_rdp_total(votes[:12000, ], (100., 150.))
scatter_plot(votes[:6000, ], None, None, 100, 20) # w/o thresholding
scatter_plot(votes[:6000, ], 3500, 1500, 100, 20) # with thresholding
if __name__ == '__main__':
......
......@@ -15,7 +15,7 @@ dataset.
The framework consists of _teachers_, the _student_ model, and the _aggregator_. The
teachers are models trained on disjoint subsets of the training datasets. The student
model has access to an insensitive (i.e., public) unlabelled dataset, which is labelled by
model has access to an insensitive (e.g., public) unlabelled dataset, which is labelled by
interacting with the ensemble of teachers via the _aggregator_. The aggregator tallies
outputs of the teacher models, and either forwards a (noisy) aggregate to the student, or
refuses to answer.
......@@ -57,13 +57,13 @@ $ python smooth_sensitivity_test.py
## Files in this directory
* core.py --- RDP privacy accountant for several vote aggregators (GNMax,
* core.py &mdash; RDP privacy accountant for several vote aggregators (GNMax,
Threshold, Laplace).
* smooth_sensitivity.py --- Smooth sensitivity analysis for GNMax and
* smooth_sensitivity.py &mdash; Smooth sensitivity analysis for GNMax and
Threshold mechanisms.
* core_test.py and smooth_sensitivity_test.py --- Unit tests for the
* core_test.py and smooth_sensitivity_test.py &mdash; Unit tests for the
files above.
## Contact information
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Tests for google3.experimental.brain.privacy.pate.pate."""
"""Tests for pate.core."""
from __future__ import absolute_import
from __future__ import division
......
......@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Tests for google3.experimental.brain.privacy.pate.pate_smooth_sensitivity."""
"""Tests for pate.smooth_sensitivity."""
from __future__ import absolute_import
from __future__ import division
......
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A standalone utility for computing the log moments.
The utility for computing the log moments. It consists of two methods.
compute_log_moment(q, sigma, T, lmbd) computes the log moment with sampling
probability q, noise sigma, order lmbd, and T steps. get_privacy_spent computes
delta (or eps) given log moments and eps (or delta).
Example use:
Suppose that we have run an algorithm with parameters, an array of
(q1, sigma1, T1) ... (qk, sigmak, Tk), and we wish to compute eps for a given
delta. The example code would be:
max_lmbd = 32
lmbds = xrange(1, max_lmbd + 1)
log_moments = []
for lmbd in lmbds:
log_moment = 0
for q, sigma, T in parameters:
log_moment += compute_log_moment(q, sigma, T, lmbd)
log_moments.append((lmbd, log_moment))
eps, delta = get_privacy_spent(log_moments, target_delta=delta)
To verify that the I1 >= I2 (see comments in GaussianMomentsAccountant in
accountant.py for the context), run the same loop above with verify=True
passed to compute_log_moment.
"""
from __future__ import print_function

import math
import sys

import numpy as np
import scipy.integrate as integrate
import scipy.special
import scipy.stats
from six.moves import xrange
from sympy.mpmath import mp
def _to_np_float64(v):
if math.isnan(v) or math.isinf(v):
return np.inf
return np.float64(v)
######################
# FLOAT64 ARITHMETIC #
######################
def pdf_gauss(x, sigma, mean=0):
  """Density of a Gaussian with the given mean and standard deviation sigma."""
  return scipy.stats.norm(loc=mean, scale=sigma).pdf(x)
def cropped_ratio(a, b):
  """Return a / b, treating the case where both are below 1e-50 as 1.

  Guards the integrands against 0/0 in the far tails of the densities.
  """
  if a < 1E-50 and b < 1E-50:
    return 1.
  return a / b
def integral_inf(fn):
  """Numerically integrate fn over the whole real line (value only)."""
  value, _err = integrate.quad(fn, -np.inf, np.inf)
  return value
def integral_bounded(fn, lb, ub):
  """Numerically integrate fn over the interval [lb, ub] (value only)."""
  value, _err = integrate.quad(fn, lb, ub)
  return value
def distributions(sigma, q):
  """Return (mu0, mu1, mu): the two unit-shifted Gaussians and their mixture.

  mu0 is centered at 0, mu1 at 1, and mu is the q-weighted mixture
  (1 - q) * mu0 + q * mu1 used by the sampled Gaussian mechanism.
  """
  def mu0(y):
    return pdf_gauss(y, sigma=sigma, mean=0.0)

  def mu1(y):
    return pdf_gauss(y, sigma=sigma, mean=1.0)

  def mu(y):
    return (1 - q) * mu0(y) + q * mu1(y)

  return mu0, mu1, mu
def compute_a(sigma, q, lmbd, verbose=False):
  """Compute the moment A_lambda exactly via binomial expansion (float64).

  Args:
    sigma: noise scale of the Gaussian mechanism.
    q: sampling probability, in [0, 1].
    lmbd: moment order; rounded up to the nearest integer.
    verbose: if True, print the two terms of the expansion.

  Returns:
    A_lambda as np.float64 (np.inf on overflow or NaN).
  """
  lmbd_int = int(math.ceil(lmbd))
  if lmbd_int == 0:
    # Zeroth moment of any distribution is 1.
    return 1.0

  a_lambda_first_term_exact = 0
  a_lambda_second_term_exact = 0
  for i in xrange(lmbd_int + 1):
    coef_i = scipy.special.binom(lmbd_int, i) * (q ** i)
    s1, s2 = 0, 0
    # Inner alternating binomial sums; the exponents come from Gaussian
    # moment-generating-function identities.
    for j in xrange(i + 1):
      coef_j = scipy.special.binom(i, j) * (-1) ** (i - j)
      s1 += coef_j * np.exp((j * j - j) / (2.0 * (sigma ** 2)))
      s2 += coef_j * np.exp((j * j + j) / (2.0 * (sigma ** 2)))
    a_lambda_first_term_exact += coef_i * s1
    a_lambda_second_term_exact += coef_i * s2

  # A_lambda is the q-weighted combination of the two expansions.
  a_lambda_exact = ((1.0 - q) * a_lambda_first_term_exact +
                    q * a_lambda_second_term_exact)
  if verbose:
    print("A: by binomial expansion {} = {} + {}".format(
        a_lambda_exact,
        (1.0 - q) * a_lambda_first_term_exact,
        q * a_lambda_second_term_exact))
  return _to_np_float64(a_lambda_exact)
def compute_b(sigma, q, lmbd, verbose=False):
  """Compute the moment B_lambda by numerical integration (float64).

  Also computes an upper bound b_bound on B (via A_{lambda-1} plus two
  bounded integrals) that B must not exceed; the bound is printed for
  inspection rather than asserted here (contrast compute_b_mp).

  Args:
    sigma: noise scale of the Gaussian mechanism.
    q: sampling probability, in [0, 1].
    lmbd: moment order.
    verbose: if True, print intermediate quantities.

  Returns:
    B_lambda as np.float64 (np.inf on overflow or NaN).
  """
  mu0, _, mu = distributions(sigma, q)

  b_lambda_fn = lambda z: mu0(z) * np.power(cropped_ratio(mu0(z), mu(z)), lmbd)
  b_lambda = integral_inf(b_lambda_fn)

  # m is the crossover point beyond which the integrand sign is fixed.
  m = sigma ** 2 * (np.log((2. - q) / (1. - q)) + 1. / (2 * sigma ** 2))
  b_fn = lambda z: (np.power(mu0(z) / mu(z), lmbd) -
                    np.power(mu(-z) / mu0(z), lmbd))
  if verbose:
    print("M =", m)
    print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m)))
  # Sanity check on the sign of the difference at the crossover points.
  assert b_fn(-m) < 0 and b_fn(m) < 0

  b_lambda_int1_fn = lambda z: (mu0(z) *
                                np.power(cropped_ratio(mu0(z), mu(z)), lmbd))
  b_lambda_int2_fn = lambda z: (mu0(z) *
                                np.power(cropped_ratio(mu(z), mu0(z)), lmbd))
  b_int1 = integral_bounded(b_lambda_int1_fn, -m, m)
  b_int2 = integral_bounded(b_lambda_int2_fn, -m, m)

  a_lambda_m1 = compute_a(sigma, q, lmbd - 1)
  b_bound = a_lambda_m1 + b_int1 - b_int2

  if verbose:
    print("B: by numerical integration", b_lambda)
    print("B must be no more than ", b_bound)
  # NOTE(review): this print appears unconditional (outside `verbose`) in the
  # original — presumably left in for debugging; confirm before relying on it.
  print(b_lambda, b_bound)
  return _to_np_float64(b_lambda)
###########################
# MULTIPRECISION ROUTINES #
###########################
def pdf_gauss_mp(x, sigma, mean):
  """Gaussian density at x, evaluated in multiprecision via mpmath's mp."""
  return mp.mpf(1.) / mp.sqrt(mp.mpf("2.") * sigma ** 2 * mp.pi) * mp.exp(
      - (x - mean) ** 2 / (mp.mpf("2.") * sigma ** 2))
def integral_inf_mp(fn):
  """Integrate fn over (-inf, inf) in multiprecision; discard the error term."""
  integral, _ = mp.quad(fn, [-mp.inf, mp.inf], error=True)
  return integral
def integral_bounded_mp(fn, lb, ub):
  """Integrate fn over [lb, ub] in multiprecision; discard the error term."""
  integral, _ = mp.quad(fn, [lb, ub], error=True)
  return integral
def distributions_mp(sigma, q):
  """Multiprecision analogue of distributions(): (mu0, mu1, q-mixture mu)."""
  mu0 = lambda y: pdf_gauss_mp(y, sigma=sigma, mean=mp.mpf(0))
  mu1 = lambda y: pdf_gauss_mp(y, sigma=sigma, mean=mp.mpf(1))
  mu = lambda y: (1 - q) * mu0(y) + q * mu1(y)
  return mu0, mu1, mu
def compute_a_mp(sigma, q, lmbd, verbose=False):
  """Compute A_lambda by multiprecision numerical integration.

  Serves as an independent check on compute_a's binomial expansion.
  Returns A_lambda coerced to np.float64 (np.inf on overflow).
  """
  lmbd_int = int(math.ceil(lmbd))
  if lmbd_int == 0:
    return 1.0

  mu0, mu1, mu = distributions_mp(sigma, q)
  # Integrands for A_lambda and for its two constituent terms.
  a_lambda_fn = lambda z: mu(z) * (mu(z) / mu0(z)) ** lmbd_int
  a_lambda_first_term_fn = lambda z: mu0(z) * (mu(z) / mu0(z)) ** lmbd_int
  a_lambda_second_term_fn = lambda z: mu1(z) * (mu(z) / mu0(z)) ** lmbd_int

  a_lambda = integral_inf_mp(a_lambda_fn)
  a_lambda_first_term = integral_inf_mp(a_lambda_first_term_fn)
  a_lambda_second_term = integral_inf_mp(a_lambda_second_term_fn)

  if verbose:
    print("A: by numerical integration {} = {} + {}".format(
        a_lambda,
        (1 - q) * a_lambda_first_term,
        q * a_lambda_second_term))

  return _to_np_float64(a_lambda)
def compute_b_mp(sigma, q, lmbd, verbose=False):
  """Compute B_lambda by multiprecision numerical integration.

  Also computes the upper bound b_bound = A_{lambda-1} + int1 - int2 and
  asserts B <= b_bound (up to 1e-5 slack) before returning.
  """
  lmbd_int = int(math.ceil(lmbd))
  if lmbd_int == 0:
    return 1.0

  mu0, _, mu = distributions_mp(sigma, q)

  b_lambda_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** lmbd_int
  b_lambda = integral_inf_mp(b_lambda_fn)

  # m is the crossover point beyond which the integrand sign is fixed.
  m = sigma ** 2 * (mp.log((2 - q) / (1 - q)) + 1 / (2 * (sigma ** 2)))
  b_fn = lambda z: ((mu0(z) / mu(z)) ** lmbd_int -
                    (mu(-z) / mu0(z)) ** lmbd_int)
  if verbose:
    print("M =", m)
    print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m)))
  # Sanity check on the sign of the difference at the crossover points.
  assert b_fn(-m) < 0 and b_fn(m) < 0

  b_lambda_int1_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** lmbd_int
  b_lambda_int2_fn = lambda z: mu0(z) * (mu(z) / mu0(z)) ** lmbd_int
  b_int1 = integral_bounded_mp(b_lambda_int1_fn, -m, m)
  b_int2 = integral_bounded_mp(b_lambda_int2_fn, -m, m)

  a_lambda_m1 = compute_a_mp(sigma, q, lmbd - 1)
  b_bound = a_lambda_m1 + b_int1 - b_int2

  if verbose:
    print("B by numerical integration", b_lambda)
    print("B must be no more than ", b_bound)

  # Unlike compute_b (float64), the bound is enforced here.
  assert b_lambda < b_bound + 1e-5
  return _to_np_float64(b_lambda)
def _compute_delta(log_moments, eps):
"""Compute delta for given log_moments and eps.
Args:
log_moments: the log moments of privacy loss, in the form of pairs
of (moment_order, log_moment)
eps: the target epsilon.
Returns:
delta
"""
min_delta = 1.0
for moment_order, log_moment in log_moments:
if moment_order == 0:
continue
if math.isinf(log_moment) or math.isnan(log_moment):
sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order)
continue
if log_moment < moment_order * eps:
min_delta = min(min_delta,
math.exp(log_moment - moment_order * eps))
return min_delta
def _compute_eps(log_moments, delta):
"""Compute epsilon for given log_moments and delta.
Args:
log_moments: the log moments of privacy loss, in the form of pairs
of (moment_order, log_moment)
delta: the target delta.
Returns:
epsilon
"""
min_eps = float("inf")
for moment_order, log_moment in log_moments:
if moment_order == 0:
continue
if math.isinf(log_moment) or math.isnan(log_moment):
sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order)
continue
min_eps = min(min_eps, (log_moment - math.log(delta)) / moment_order)
return min_eps
def compute_log_moment(q, sigma, steps, lmbd, verify=False, verbose=False):
  """Compute the log moment of Gaussian mechanism for given parameters.

  Args:
    q: the sampling ratio.
    sigma: the noise sigma.
    steps: the number of steps.
    lmbd: the moment order.
    verify: if False, only compute the symbolic version. If True, computes
      both symbolic and numerical solutions and verifies the results match.
    verbose: if True, print out debug information.

  Returns:
    the log moment with type np.float64, could be np.inf.
  """
  moment = compute_a(sigma, q, lmbd, verbose=verbose)
  if verify:
    # Cross-check the binomial expansion against multiprecision integration.
    mp.dps = 50
    moment_a_mp = compute_a_mp(sigma, q, lmbd, verbose=verbose)
    moment_b_mp = compute_b_mp(sigma, q, lmbd, verbose=verbose)
    np.testing.assert_allclose(moment, moment_a_mp, rtol=1e-10)
    if not np.isinf(moment_a_mp):
      # The following test fails for (1, np.inf)!
      np.testing.assert_array_less(moment_b_mp, moment_a_mp)
  if np.isinf(moment):
    return np.inf
  else:
    # The per-step moments compose additively over `steps` steps.
    return np.log(moment) * steps
def get_privacy_spent(log_moments, target_eps=None, target_delta=None):
  """Compute delta (or eps) for given eps (or delta) from log moments.

  Args:
    log_moments: array of (moment_order, log_moment) pairs.
    target_eps: if not None, the epsilon for which we would like to compute
      corresponding delta value.
    target_delta: if not None, the delta for which we would like to compute
      corresponding epsilon value. Exactly one of target_eps and target_delta
      is None.

  Returns:
    eps, delta pair
  """
  # Exactly one of the two targets must be supplied.
  assert (target_eps is None) ^ (target_delta is None)
  assert not ((target_eps is None) and (target_delta is None))
  if target_delta is None:
    return (target_eps, _compute_delta(log_moments, target_eps))
  return (_compute_eps(log_moments, target_delta), target_delta)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""RDP analysis of the Gaussian-with-sampling mechanism.
Functionality for computing Renyi differential privacy of an additive Gaussian
mechanism with sampling. Its public interface consists of two methods:
compute_rdp(q, sigma, T, order) computes RDP with sampling probability q,
noise sigma, T steps at a given order.
get_privacy_spent computes delta (or eps) given RDP and eps (or delta).
Example use:
Suppose that we have run an algorithm with parameters, an array of
(q1, sigma1, T1) ... (qk, sigma_k, Tk), and we wish to compute eps for a given
delta. The example code would be:
max_order = 32
orders = range(2, max_order + 1)
rdp = np.zeros_like(orders, dtype=float)
for q, sigma, T in parameters:
rdp += rdp_accountant.compute_rdp(q, sigma, T, orders)
eps, _, opt_order = rdp_accountant.get_privacy_spent(rdp, target_delta=delta)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import sys
from absl import app
from absl import flags
import numpy as np
from scipy import special
FLAGS = flags.FLAGS

flags.DEFINE_boolean("rdp_verbose", False,
                     "Output intermediate results for RDP computation.")

# Parse immediately so FLAGS.rdp_verbose is usable even when this module is
# imported as a library rather than run via app.run().
FLAGS(sys.argv)  # Load the flags (including on import)
########################
# LOG-SPACE ARITHMETIC #
########################
def _log_add(logx, logy):
"""Add two numbers in the log space."""
a, b = min(logx, logy), max(logx, logy)
if a == -np.inf: # adding 0
return b
# Use exp(a) + exp(b) = (exp(a - b) + 1) * exp(b)
return math.log1p(math.exp(a - b)) + b # log1p(x) = log(x + 1)
def _log_sub(logx, logy):
"""Subtract two numbers in the log space. Answer must be positive."""
if logy == -np.inf: # subtracting 0
return logx
assert logx > logy
try:
# Use exp(x) - exp(y) = (exp(x - y) - 1) * exp(y).
return math.log(math.expm1(logx - logy)) + logy # expm1(x) = exp(x) - 1
except OverflowError:
return logx
def _log_print(logx):
"""Pretty print."""
if logx < math.log(sys.float_info.max):
return "{}".format(math.exp(logx))
else:
return "exp({})".format(logx)
def _compute_log_a_int(q, sigma, alpha):
  """Compute log(A_alpha) for integer alpha via a log-space binomial sum.

  Args:
    q: the sampling ratio, in [0, 1].
    sigma: the noise sigma.
    alpha: the RDP order; must be a Python int.

  Returns:
    log(A_alpha) as a float.
  """
  # Bug fix: the original asserted isinstance(alpha, (int, long)), which
  # raises NameError on Python 3 where `long` no longer exists. All callers
  # (_compute_log_a) pass int(alpha).
  assert isinstance(alpha, int)

  # The first and second terms of A_alpha in the log space:
  log_a1, log_a2 = -np.inf, -np.inf

  for i in range(alpha + 1):
    # Compute in the log space. Extra care needed for q = 0 or 1.
    log_coef_i = math.log(special.binom(alpha, i))
    if q > 0:
      log_coef_i += i * math.log(q)
    elif i > 0:
      continue  # The term is 0, skip the rest.

    if q < 1.0:
      log_coef_i += (alpha - i) * math.log(1 - q)
    elif i < alpha:
      continue  # The term is 0, skip the rest.

    s1 = log_coef_i + (i * i - i) / (2.0 * (sigma ** 2))
    s2 = log_coef_i + (i * i + i) / (2.0 * (sigma ** 2))
    log_a1 = _log_add(log_a1, s1)
    log_a2 = _log_add(log_a2, s2)

  log_a = _log_add(math.log(1 - q) + log_a1, math.log(q) + log_a2)
  if FLAGS.rdp_verbose:
    print("A: by binomial expansion {} = {} + {}".format(
        _log_print(log_a),
        _log_print(math.log(1 - q) + log_a1), _log_print(math.log(q) + log_a2)))
  return float(log_a)
def _compute_log_a_frac(q, sigma, alpha):
  """Compute log(A_alpha) for fractional alpha.

  Expands A_alpha as an infinite binomial series and accumulates the four
  partial terms in log space until all of them drop below the precision
  threshold.

  Args:
    q: the sampling ratio, in (0, 1).
    sigma: the noise sigma.
    alpha: the (fractional) RDP order.

  Returns:
    log(A_alpha) as a float.
  """
  # The four parts of A_alpha in the log space:
  log_a11, log_a12 = -np.inf, -np.inf
  log_a21, log_a22 = -np.inf, -np.inf
  i = 0

  z0, _ = _compute_zs(sigma, q)

  while True:  # do ... until loop
    coef = special.binom(alpha, i)
    log_coef = math.log(abs(coef))
    j = alpha - i

    log_t1 = log_coef + i * math.log(q) + j * math.log(1 - q)
    log_t2 = log_coef + j * math.log(q) + i * math.log(1 - q)

    log_e11 = math.log(.5) + _log_erfc((i - z0) / (math.sqrt(2) * sigma))
    log_e12 = math.log(.5) + _log_erfc((z0 - j) / (math.sqrt(2) * sigma))
    log_e21 = math.log(.5) + _log_erfc((i - (z0 - 1)) / (math.sqrt(2) * sigma))
    log_e22 = math.log(.5) + _log_erfc((z0 - 1 - j) / (math.sqrt(2) * sigma))

    log_s11 = log_t1 + (i * i - i) / (2 * (sigma ** 2)) + log_e11
    log_s12 = log_t2 + (j * j - j) / (2 * (sigma ** 2)) + log_e12
    log_s21 = log_t1 + (i * i + i) / (2 * (sigma ** 2)) + log_e21
    log_s22 = log_t2 + (j * j + j) / (2 * (sigma ** 2)) + log_e22

    # The binomial coefficients alternate in sign past i > alpha.
    if coef > 0:
      log_a11 = _log_add(log_a11, log_s11)
      log_a12 = _log_add(log_a12, log_s12)
      log_a21 = _log_add(log_a21, log_s21)
      log_a22 = _log_add(log_a22, log_s22)
    else:
      log_a11 = _log_sub(log_a11, log_s11)
      log_a12 = _log_sub(log_a12, log_s12)
      log_a21 = _log_sub(log_a21, log_s21)
      log_a22 = _log_sub(log_a22, log_s22)

    i += 1
    # Bug fix: the termination test previously read
    # max(log_s11, log_s21, log_s21, log_s22) — log_s21 twice and log_s12
    # never — so convergence of the second term was not verified. Check all
    # four partial terms before stopping.
    if max(log_s11, log_s12, log_s21, log_s22) < -30:
      break

  log_a = _log_add(
      math.log(1. - q) + _log_add(log_a11, log_a12),
      math.log(q) + _log_add(log_a21, log_a22))
  return log_a
def _compute_log_a(q, sigma, alpha):
  """Dispatch log(A_alpha) to the integer or fractional implementation."""
  return (_compute_log_a_int(q, sigma, int(alpha))
          if float(alpha).is_integer()
          else _compute_log_a_frac(q, sigma, alpha))
def _log_erfc(x):
# Can be replaced with a single call to log_ntdr if available:
# return np.log(2.) + special.log_ntdr(-x * 2**.5)
r = special.erfc(x)
if r == 0.0:
# Using the Laurent series at infinity for the tail of the erfc function:
# erfc(x) ~ exp(-x^2-.5/x^2+.625/x^4)/(x*pi^.5)
# To verify in Mathematica:
# Series[Log[Erfc[x]] + Log[x] + Log[Pi]/2 + x^2, {x, Infinity, 6}]
return (-math.log(math.pi) / 2 - math.log(x) - x ** 2 - .5 * x ** -2 +
.625 * x ** -4 - 37. / 24. * x ** -6 + 353. / 64. * x ** -8)
else:
return math.log(r)
def _compute_zs(sigma, q):
z0 = sigma ** 2 * math.log(1 / q - 1) + .5
z1 = min(z0 - 2, z0 / 2)
return z0, z1
def _compute_log_b0(sigma, q, alpha, z1):
  """Return an approximation to B0 or None if failed to converge.

  Sums an alternating binomial series term by term; `s` accumulates the
  signed terms while (log_b0, sign) track the current binomial coefficient
  in log space.
  """
  z0, _ = _compute_zs(sigma, q)
  s, log_term, log_b0, k, sign, max_log_term = 0, 1., 0, 0, 1, -np.inf
  # Keep adding new terms until precision is no longer preserved.
  # Don't stop on the negative.
  while (k < alpha or (log_term > max_log_term - 36 and log_term > -30) or
         sign < 0.):
    log_b1 = k * (k - 2 * z0) / (2 * sigma ** 2)
    log_b2 = _log_erfc((k - z1) / (math.sqrt(2) * sigma))
    log_term = log_b0 + log_b1 + log_b2
    max_log_term = max(max_log_term, log_term)
    s += sign * math.exp(log_term)
    k += 1
    # Maintain invariant: sign * exp(log_b0) = {-alpha choose k}
    log_b0 += math.log(abs(-alpha - k + 1)) - math.log(k)
    sign *= -1

  if s == 0:  # May happen if all terms are < 1e-324.
    return -np.inf
  if s < 0 or math.log(s) < max_log_term - 25:  # The series failed to converge.
    return None
  c = math.log(.5) - math.log(1 - q) * alpha
  return c + math.log(s)
def _bound_log_b1(sigma, q, alpha, z1):
  """Upper-bound log(B1) via the mixture density's envelope beyond z1."""
  shifted = math.log(q) + (2 * z1 - 1.) / (2 * sigma ** 2)
  log_c = _log_add(math.log(1 - q), shifted)
  return math.log(.5) - log_c * alpha + _log_erfc(z1 / (math.sqrt(2) * sigma))
def _bound_log_b(q, sigma, alpha):
  """Compute a numerically stable bound on log(B_alpha).

  Splits B into B0 (series approximation) and B1 (analytic envelope) at a
  cut point m, and bisects m between z1 and z0 until the combined bound is
  tight enough or the bisection interval is exhausted.
  """
  if q == 1.:  # If the sampling rate is 100%, A and B are symmetric.
    return _compute_log_a(q, sigma, alpha)

  z0, z1 = _compute_zs(sigma, q)
  log_b_bound = np.inf

  # Puts a lower bound on B1: it cannot be less than its value at z0.
  log_lb_b1 = _bound_log_b1(sigma, q, alpha, z0)

  while z0 - z1 > 1e-3:
    m = (z0 + z1) / 2
    log_b0 = _compute_log_b0(sigma, q, alpha, m)
    if log_b0 is None:
      # The B0 series did not converge at this cut; move the cut down.
      z0 = m
      continue
    log_b1 = _bound_log_b1(sigma, q, alpha, m)
    log_b_bound = min(log_b_bound, _log_add(log_b0, log_b1))
    log_b_min_bound = _log_add(log_b0, log_lb_b1)
    if (log_b_bound < 0 or
        log_b_min_bound < 0 or
        log_b_bound > log_b_min_bound + .01):
      # If the bound is likely to be too loose, move z1 closer to z0 and repeat.
      z1 = m
    else:
      break

  return log_b_bound
def _log_bound_b_elementary(q, alpha):
return -math.log(1 - q) * alpha
def _compute_delta(orders, rdp, eps):
"""Compute delta given an RDP curve and target epsilon.
Args:
orders: An array (or a scalar) of orders.
rdp: A list (or a scalar) of RDP guarantees.
eps: The target epsilon.
Returns:
Pair of (delta, optimal_order).
Raises:
ValueError: If input is malformed.
"""
orders_vec = np.atleast_1d(orders)
rdp_vec = np.atleast_1d(rdp)
if len(orders_vec) != len(rdp_vec):
raise ValueError("Input lists must have the same length.")
deltas = np.exp((rdp_vec - eps) * (orders_vec - 1))
idx_opt = np.argmin(deltas)
return min(deltas[idx_opt], 1.), orders_vec[idx_opt]
def _compute_eps(orders, rdp, delta):
"""Compute epsilon given an RDP curve and target delta.
Args:
orders: An array (or a scalar) of orders.
rdp: A list (or a scalar) of RDP guarantees.
delta: The target delta.
Returns:
Pair of (eps, optimal_order).
Raises:
ValueError: If input is malformed.
"""
orders_vec = np.atleast_1d(orders)
rdp_vec = np.atleast_1d(rdp)
if len(orders_vec) != len(rdp_vec):
raise ValueError("Input lists must have the same length.")
eps = rdp_vec - math.log(delta) / (orders_vec - 1)
idx_opt = np.nanargmin(eps) # Ignore NaNs
return eps[idx_opt], orders_vec[idx_opt]
def _compute_rdp(q, sigma, steps, alpha):
  """Compute RDP at a single order alpha for `steps` compositions.

  Takes the moment A at order alpha-1, compares it against the elementary
  bound on B and, when the elementary bound is not already dominant, the
  tighter numerical bound on B; returns the resulting RDP value.
  """
  log_moment_a = _compute_log_a(q, sigma, alpha - 1)

  log_bound_b = _log_bound_b_elementary(q, alpha - 1)  # does not require sigma

  if log_bound_b < log_moment_a:
    if FLAGS.rdp_verbose:
      print("Elementary bound suffices : {} < {}".format(
          _log_print(log_bound_b), _log_print(log_moment_a)))
  else:
    # The elementary bound is weaker than A; try the numerical bound on B.
    log_bound_b2 = _bound_log_b(q, sigma, alpha - 1)
    if math.isnan(log_bound_b2):
      if FLAGS.rdp_verbose:
        print("B bound failed to converge")
    else:
      if FLAGS.rdp_verbose and (log_bound_b2 < log_bound_b):
        print("Elementary bound is stronger: {} < {}".format(
            _log_print(log_bound_b2), _log_print(log_bound_b)))
      log_bound_b = min(log_bound_b, log_bound_b2)

  # RDP at order alpha is max(A, B) scaled by steps / (alpha - 1).
  return max(log_moment_a, log_bound_b) * steps / (alpha - 1)
def compute_rdp(q, sigma, steps, orders):
  """Compute RDP of Gaussian mechanism with sampling for given parameters.

  Args:
    q: The sampling ratio.
    sigma: The noise sigma.
    steps: The number of steps.
    orders: An array (or a scalar) of RDP orders.

  Returns:
    The RDPs at all orders, can be np.inf.
  """
  if np.isscalar(orders):
    return _compute_rdp(q, sigma, steps, orders)
  # Vector case: evaluate each order independently.
  return np.array(
      [_compute_rdp(q, sigma, steps, order) for order in orders], dtype=float)
def get_privacy_spent(orders, rdp, target_eps=None, target_delta=None):
  """Convert an RDP curve into an (eps, delta)-DP guarantee.

  Exactly one of target_eps / target_delta must be provided; the other
  quantity is derived from the RDP curve at the optimal order.

  Args:
    orders: An array (or a scalar) of RDP orders.
    rdp: An array of RDP values.
    target_eps: If not None, the epsilon for which we compute the
      corresponding delta.
    target_delta: If not None, the delta for which we compute the
      corresponding epsilon.

  Returns:
    A triple (eps, delta, opt_order).

  Raises:
    ValueError: Unless exactly one of target_eps and target_delta is None.
  """
  if target_eps is None and target_delta is None:
    raise ValueError(
        "Exactly one out of eps and delta must be None. (Both are).")
  if target_eps is not None and target_delta is not None:
    raise ValueError(
        "Exactly one out of eps and delta must be None. (None is).")
  if target_delta is None:
    # Fixed epsilon: find the smallest achievable delta.
    delta, opt_order = _compute_delta(orders, rdp, target_eps)
    return target_eps, delta, opt_order
  # Fixed delta: find the smallest achievable epsilon.
  eps, opt_order = _compute_eps(orders, rdp, target_delta)
  return eps, target_delta, opt_order
def main(_):
  """No-op entry point; this module is a library of RDP helpers."""
# Presumably absl.app: parses command-line flags (e.g. --rdp_verbose used by
# _compute_rdp) before invoking main. TODO confirm `app` import above this view.
if __name__ == "__main__":
  app.run(main)
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for rdp_accountant.py."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import absltest
import mpmath as mp
import numpy as np
import rdp_accountant
class TestGaussianMoments(absltest.TestCase):
  """Validates rdp_accountant's closed-form moment bounds against brute-force
  multi-precision numerical integration (mpmath)."""
  ##############################
  # MULTI-PRECISION ARITHMETIC #
  ##############################
  def _pdf_gauss_mp(self, x, sigma, mean):
    # Gaussian density N(mean, sigma^2) evaluated at x with mpmath precision.
    return 1. / mp.sqrt(2. * sigma ** 2 * mp.pi) * mp.exp(
        -(x - mean) ** 2 / (2. * sigma ** 2))
  def _integral_inf_mp(self, fn):
    # Integrate fn over the entire real line.
    integral, _ = mp.quad(
        fn, [-mp.inf, mp.inf], error=True,
        maxdegree=7)  # maxdegree = 6 is not enough
    return integral
  def _integral_bounded_mp(self, fn, lb, ub):
    # Integrate fn over [lb, ub].
    integral, _ = mp.quad(fn, [lb, ub], error=True)
    return integral
  def _distributions_mp(self, sigma, q):
    # Returns the densities of N(0, sigma^2), N(1, sigma^2), and their
    # q-mixture, as mpmath callables.
    mu0 = lambda y: self._pdf_gauss_mp(y, sigma=sigma, mean=0)
    mu1 = lambda y: self._pdf_gauss_mp(y, sigma=sigma, mean=1.)
    mu = lambda y: (1 - q) * mu0(y) + q * mu1(y)
    return mu0, mu1, mu
  def compute_a_mp(self, sigma, q, order, verbose=False):
    """Compute A_lambda for arbitrary lambda by numerical integration."""
    mp.dps = 100  # 100 decimal digits of working precision
    mu0, mu1, mu = self._distributions_mp(sigma, q)
    a_lambda_fn = lambda z: mu(z) * (mu(z) / mu0(z)) ** order
    a_lambda = self._integral_inf_mp(a_lambda_fn)
    if verbose:
      # Split A into its mu0- and mu1-weighted components for inspection.
      a_lambda_first_term_fn = lambda z: mu0(z) * (mu(z) / mu0(z)) ** order
      a_lambda_second_term_fn = lambda z: mu1(z) * (mu(z) / mu0(z)) ** order
      a_lambda_first_term = self._integral_inf_mp(a_lambda_first_term_fn)
      a_lambda_second_term = self._integral_inf_mp(a_lambda_second_term_fn)
      print("A: by numerical integration {} = {} + {}".format(
          a_lambda, (1 - q) * a_lambda_first_term, q * a_lambda_second_term))
    return a_lambda
  def compute_b_mp(self, sigma, q, order, verbose=False):
    """Compute B_lambda for arbitrary lambda by numerical integration."""
    mu0, _, mu = self._distributions_mp(sigma, q)
    b_lambda_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** order
    b_numeric = self._integral_inf_mp(b_lambda_fn)
    if verbose:
      _, z1 = rdp_accountant._compute_zs(sigma, q)
      print("z1 = ", z1)
      print("x in the Taylor series = ", q / (1 - q) * np.exp(
          (2 * z1 - 1) / (2 * sigma ** 2)))
      # Split the integral at z1 to inspect each piece separately.
      b0_numeric = self._integral_bounded_mp(b_lambda_fn, -np.inf, z1)
      b1_numeric = self._integral_bounded_mp(b_lambda_fn, z1, +np.inf)
      print("B: numerically {} = {} + {}".format(b_numeric, b0_numeric,
                                                 b1_numeric))
    return float(b_numeric)
  def _compute_rdp_mp(self, q, sigma, order):
    # Logs of both moments computed via high-precision integration.
    log_a_mp = float(mp.log(self.compute_a_mp(sigma, q, order)))
    log_b_mp = float(mp.log(self.compute_b_mp(sigma, q, order)))
    return log_a_mp, log_b_mp
  # TEST ROUTINES
  def _almost_equal(self, a, b, rtol, atol=0):
    # Analogue of np.testing.assert_allclose(a, b, rtol, atol).
    self.assertBetween(a, b * (1 - rtol) - atol, b * (1 + rtol) + atol)
  def _compare_bounds(self, q, sigma, order):
    # Checks the library's cheap closed-form bounds against the exact
    # multi-precision values for a single (q, sigma, order) triple.
    log_a_mp, log_b_mp = self._compute_rdp_mp(q, sigma, order)
    log_a = rdp_accountant._compute_log_a(q, sigma, order)
    log_bound_b = rdp_accountant._bound_log_b(q, sigma, order)
    if log_a_mp < 1000 and log_a_mp > 1e-6:
      self._almost_equal(log_a, log_a_mp, rtol=1e-6)
    else:  # be more tolerant for _very_ large or small logarithms
      self._almost_equal(log_a, log_a_mp, rtol=1e-3, atol=1e-14)
    if np.isfinite(log_bound_b):
      # Ignore divergence between the bound and exact value of B if
      # they don't matter anyway (bound on A is larger) or q > .5
      if log_bound_b > log_a and q <= .5:
        self._almost_equal(log_b_mp, log_bound_b, rtol=1e-6, atol=1e-14)
    if np.isfinite(log_a_mp) and np.isfinite(log_b_mp):
      # We hypothesize that this assertion is always true; no proof yet.
      self.assertLessEqual(log_b_mp, log_a_mp + 1e-6)
  def test_compute_rdp(self):
    # Golden values for both the scalar and the vector order arguments.
    rdp_scalar = rdp_accountant.compute_rdp(0.1, 2, 10, 5)
    self.assertAlmostEqual(rdp_scalar, 0.07737, places=5)
    rdp_vec = rdp_accountant.compute_rdp(0.01, 2.5, 50, [1.5, 2.5, 5, 50, 100])
    correct = [0.00065, 0.001085, 0.00218075, 0.023846, 167.416307]
    for i in range(len(rdp_vec)):
      self.assertAlmostEqual(rdp_vec[i], correct[i], places=5)
  def test_compare_with_mp(self):
    # Compare the cheap computation with an expensive, multi-precision
    # computation for a few parameters. Takes a few seconds.
    self._compare_bounds(q=.01, sigma=.1, order=.5)
    self._compare_bounds(q=.1, sigma=1., order=5)
    self._compare_bounds(q=.5, sigma=2., order=32.5)
    for q in (1e-6, .1, .999):
      for sigma in (.1, 10., 100.):
        for order in (1.01, 2, 255.9, 256):
          self._compare_bounds(q, sigma, order)
  def test_get_privacy_spent(self):
    # Round trip: the eps computed at target_delta should map back to the
    # same delta when fed in as target_eps.
    orders = range(2, 33)
    rdp = rdp_accountant.compute_rdp(0.01, 4, 10000, orders)
    eps, delta, opt_order = rdp_accountant.get_privacy_spent(orders, rdp,
                                                             target_delta=1e-5)
    self.assertAlmostEqual(eps, 1.258575, places=5)
    self.assertEqual(opt_order, 20)
    eps, delta, _ = rdp_accountant.get_privacy_spent(orders, rdp,
                                                     target_eps=1.258575)
    self.assertAlmostEqual(delta, 1e-5)
  def test_compute_privacy_loss(self):
    # RDP composes additively across heterogeneous mechanisms.
    parameters = [(0.01, 4, 10000), (0.1, 2, 100)]
    delta = 1e-5
    orders = (1.25, 1.5, 1.75, 2., 2.5, 3., 4., 5., 6., 7., 8., 10., 12., 14.,
              16., 20., 24., 28., 32., 64., 256.)
    rdp = np.zeros_like(orders, dtype=float)
    for q, sigma, steps in parameters:
      rdp += rdp_accountant.compute_rdp(q, sigma, steps, orders)
    eps, delta, opt_order = rdp_accountant.get_privacy_spent(orders, rdp,
                                                             target_delta=delta)
    self.assertAlmostEqual(eps, 3.276237, places=5)
    self.assertEqual(opt_order, 8)
if __name__ == "__main__":
  absltest.main()  # discover and run the TestGaussianMoments cases
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment