Commit 86906935, authored by Nicolas Papernot and committed by Ilya Mironov
Browse files

remove all code related to differential privacy (#6045)

parent d32d957a
"""Produces two plots. One compares aggregators and their analyses. The other
illustrates sources of privacy loss for Confident-GNMax.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of two files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import pickle
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
from collections import namedtuple
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
import smooth_sensitivity as pate_ss
plt.style.use('ggplot')

# Command-line configuration for this script.
FLAGS = flags.FLAGS
# When set, run_or_load_all_analyses reuses the pickled results if the cache
# file already exists instead of recomputing them.
flags.DEFINE_boolean('cache', False,
                     'Read results of privacy analysis from cache.')
# Path passed to np.load; expected to hold the votes matrix.
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).')
flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).')
flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).')
# Optional: when given, the votes matrix is truncated to this many rows.
flags.DEFINE_integer('queries', None, 'Number of queries made by the student.')
flags.DEFINE_float('delta', 1e-8, 'Target delta.')

flags.mark_flag_as_required('counts_file')
flags.mark_flag_as_required('threshold')
flags.mark_flag_as_required('sigma1')
flags.mark_flag_as_required('sigma2')

# Per-query breakdown of the privacy cost produced by
# analyze_gnmax_conf_data_dep: contributions of step 1, step 2, smooth
# sensitivity (ss) and the delta term.
Partition = namedtuple('Partition', ['step1', 'step2', 'ss', 'delta'])
def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
  """Accumulates the data-independent privacy cost over all queries.

  Args:
    votes: Matrix of votes, one query per row.
    threshold: Threshold of the selection step, or None to always answer.
    sigma1: Noise scale of the selection step, or None to always answer.
    sigma2: Noise scale of the argmax step.
    delta: Target delta passed to pate.compute_eps_from_delta.

  Returns:
    A pair (eps_cum, answered) of arrays with one entry per query: the
    cumulative epsilon and the cumulative expected number of answers.
  """
  orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
  num_queries = votes.shape[0]
  use_threshold = threshold is not None and sigma1 is not None

  eps_cum = np.full(num_queries, np.nan, dtype=float)
  answered = np.zeros(num_queries)
  rdp_total = np.zeros(len(orders))
  answered_total = 0

  for i in range(num_queries):
    row = votes[i,]
    if use_threshold:
      q_step1 = np.exp(pate.compute_logpr_answered(threshold, sigma1, row))
      rdp_total += pate.rdp_data_independent_gaussian(sigma1, orders)
    else:
      q_step1 = 1.  # always answer

    answered_total += q_step1
    answered[i] = answered_total

    # The argmax step is only paid for with the probability of answering.
    rdp_total += q_step1 * pate.rdp_data_independent_gaussian(sigma2, orders)
    eps_cum[i], order_opt = pate.compute_eps_from_delta(orders, rdp_total,
                                                        delta)

    queries_done = i + 1
    if queries_done % 1000 == 0:  # Progress report every 1000 queries.
      print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
            'at order = {:.2f}.'.format(
                queries_done,
                answered[i],
                eps_cum[i],
                order_opt))
      sys.stdout.flush()

  return eps_cum, answered
def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta):
  """Accumulates the data-dependent privacy cost over all queries.

  Args:
    votes: Matrix of votes, one query per row, one class per column. The
      number of teachers is taken to be the sum of the first row.
    threshold: Threshold of the selection step, or None to always answer.
    sigma1: Noise scale of the selection step, or None to always answer.
    sigma2: Noise scale of the argmax step.
    delta: Target delta.

  Returns:
    A 4-tuple (eps_partitioned, answered, ss_std_opt, order_opt) of arrays
    with one entry per query: the Partition of the privacy cost at the optimal
    order, the cumulative expected number of answers, the value ss * sigma_ss
    at the optimal order, and the optimal order itself.
  """
  # Short list of orders.
  # orders = np.round(np.logspace(np.log10(20), np.log10(200), num=20))

  # Long list of orders.
  orders = np.concatenate((np.arange(20, 40, .2),
                           np.arange(40, 75, .5),
                           np.logspace(np.log10(75), np.log10(200), num=20)))

  n = votes.shape[0]
  num_classes = votes.shape[1]
  num_teachers = int(sum(votes[0,]))

  # Per-order flags; orders marked True skip the local-sensitivity bounds
  # below (their contribution is set to zero).
  if threshold is not None and sigma1 is not None:
    is_data_ind_step1 = pate.is_data_independent_always_opt_gaussian(
        num_teachers, num_classes, sigma1, orders)
  else:
    is_data_ind_step1 = [True] * len(orders)

  is_data_ind_step2 = pate.is_data_independent_always_opt_gaussian(
      num_teachers, num_classes, sigma2, orders)

  eps_partitioned = np.full(n, None, dtype=Partition)
  order_opt = np.full(n, None, dtype=float)
  ss_std_opt = np.full(n, None, dtype=float)
  answered = np.zeros(n)

  # Running totals, one entry (or row) per order.
  rdp_step1_total = np.zeros(len(orders))
  rdp_step2_total = np.zeros(len(orders))

  ls_total = np.zeros((len(orders), num_teachers))
  answered_total = 0

  for i in range(n):
    v = votes[i,]

    if threshold is not None and sigma1 is not None:
      logq_step1 = pate.compute_logpr_answered(threshold, sigma1, v)
      rdp_step1_total += pate.compute_rdp_threshold(logq_step1, sigma1, orders)
    else:
      logq_step1 = 0.  # always answer

    pr_answered = np.exp(logq_step1)
    logq_step2 = pate.compute_logq_gaussian(v, sigma2)
    # Step 2 is weighted by the probability that the query is answered.
    rdp_step2_total += pr_answered * pate.rdp_gaussian(logq_step2, sigma2,
                                                       orders)

    answered_total += pr_answered

    # Recomputed from scratch for every query, sized to the current orders.
    rdp_ss = np.zeros(len(orders))
    ss_std = np.zeros(len(orders))

    for j, order in enumerate(orders):
      if not is_data_ind_step1[j]:
        ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(v,
            num_teachers, threshold, sigma1, order)
      else:
        ls_step1 = np.full(num_teachers, 0, dtype=float)

      if not is_data_ind_step2[j]:
        ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
            v, num_teachers, sigma2, order)
      else:
        ls_step2 = np.full(num_teachers, 0, dtype=float)

      ls_total[j,] += ls_step1 + pr_answered * ls_step2

      beta_ss = .49 / order
      ss = pate_ss.compute_discounted_max(beta_ss, ls_total[j,])
      # NOTE(review): divides by ss; presumably ss > 0 here -- confirm what
      # compute_discounted_max returns when every bound is zero.
      sigma_ss = ((order * math.exp(2 * beta_ss)) / ss) ** (1 / 3)
      rdp_ss[j] = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
          beta_ss, sigma_ss, order)
      ss_std[j] = ss * sigma_ss

    rdp_total = rdp_step1_total + rdp_step2_total + rdp_ss
    answered[i] = answered_total
    _, order_opt[i] = pate.compute_eps_from_delta(orders, rdp_total, delta)
    order_idx = np.searchsorted(orders, order_opt[i])

    # Since optimal orders are always non-increasing, shrink orders array
    # and all cumulative arrays to speed up computation.
    # ls_total and the is_data_ind_* lists keep their original length; they
    # are only indexed by j < len(orders), so the leading entries stay aligned.
    if order_idx < len(orders):
      orders = orders[:order_idx + 1]
      rdp_step1_total = rdp_step1_total[:order_idx + 1]
      rdp_step2_total = rdp_step2_total[:order_idx + 1]

    eps_partitioned[i] = Partition(step1=rdp_step1_total[order_idx],
                                   step2=rdp_step2_total[order_idx],
                                   ss=rdp_ss[order_idx],
                                   delta=-math.log(delta) / (order_opt[i] - 1))
    ss_std_opt[i] = ss_std[order_idx]
    # NOTE(review): "% 1" is always 0, so this prints for every query after
    # the first; the data-independent analysis uses 1000 -- confirm intent.
    if i > 0 and (i + 1) % 1 == 0:
      print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} '
            'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, '
            'step2 = {:.3f}, ss = {:.3f}'.format(
                i + 1,
                answered[i],
                sum(eps_partitioned[i]),
                ss_std_opt[i],
                order_opt[i],
                eps_partitioned[i].delta,
                eps_partitioned[i].step1,
                eps_partitioned[i].step2,
                eps_partitioned[i].ss))
      sys.stdout.flush()

  return eps_partitioned, answered, ss_std_opt, order_opt
def plot_comparison(figures_dir, simple_ind, conf_ind, simple_dep, conf_dep):
  """Plots privacy cost versus answered queries for four GNMax analyses.

  Args:
    figures_dir: Directory where 'comparison.pdf' is written.
    simple_ind: (eps, answered) pair for Simple GNMax, data-ind analysis.
    conf_ind: (eps, answered) pair for Confident GNMax, data-ind analysis.
    simple_dep: 4-tuple whose first two items are the partitioned eps and the
      answered counts for Simple GNMax, data-dep analysis.
    conf_dep: Same as simple_dep, for Confident GNMax.
  """

  def pivot(x_axis, eps, answered):
    """Returns eps at the first query where `answered` reaches each x."""
    eps_at_x = np.full(len(x_axis), np.nan, dtype=float)
    for pos, target in enumerate(x_axis):
      query_idx = np.searchsorted(answered, target)
      if query_idx >= len(eps):
        continue  # Not enough answered queries: leave the point as nan.
      eps_at_x[pos] = eps[query_idx]
    return eps_at_x

  def pivot_dep(x_axis, data_dep):
    """Same as pivot, after summing each partition into a single eps."""
    partitions, answered, _, _ = data_dep
    totals = [sum(part) for part in partitions]
    return pivot(x_axis, totals, answered)

  xlim = 10000
  x_axis = range(0, xlim, 10)

  y_simple_ind = pivot(x_axis, *simple_ind)
  y_conf_ind = pivot(x_axis, *conf_ind)
  y_simple_dep = pivot_dep(x_axis, simple_dep)
  y_conf_dep = pivot_dep(x_axis, conf_dep)

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)

  # (values, line style, color, legend label), drawn in this order.
  curves = (
      (y_simple_ind, '--', 'r', r'Simple GNMax, data-ind analysis'),
      (y_conf_ind, '--', 'b', r'Confident GNMax, data-ind analysis'),
      (y_simple_dep, '-', 'r', r'Simple GNMax, data-dep analysis'),
      (y_conf_dep, '-', 'b', r'Confident GNMax, data-dep analysis'),
  )
  for y_values, line_style, color, label in curves:
    ax.plot(x_axis, y_values, ls=line_style, color=color, lw=3, label=label)

  plt.xticks(np.arange(0, xlim + 1000, 2000))
  plt.xlim([0, xlim])
  plt.ylim(bottom=0)
  plt.legend(fontsize=16)
  ax.set_xlabel('Number of queries answered', fontsize=16)
  ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  ax.tick_params(labelsize=14)

  plot_filename = os.path.join(figures_dir, 'comparison.pdf')
  print('Saving the graph to ' + plot_filename)
  fig.savefig(plot_filename, bbox_inches='tight')
  plt.show()
def plot_partition(figures_dir, gnmax_conf, print_order):
  """Plots an expert version of the privacy-per-answered-query graph.

  The total privacy cost is drawn as stacked bands (delta, step1, step2, ss)
  with a band of +/- ss_std_opt around the total.

  Args:
    figures_dir: A name of the directory where to save the plot.
    gnmax_conf: A 4-tuple (eps_partitioned, answered, ss_std_opt, order_opt)
      with one entry per query: the allocation of the privacy cost (objects
      with delta, step1, step2 and ss attributes), the cumulative number of
      queries answered, the spread drawn around the total, and the optimal
      orders.
    print_order: If true, the optimal order is drawn on a secondary y-axis.
  """
  eps_partitioned, answered, ss_std_opt, order_opt = gnmax_conf

  xlim = 10000
  x = range(0, int(xlim), 10)
  lenx = len(x)
  y0 = np.full(lenx, np.nan, dtype=float)  # delta
  y1 = np.full(lenx, np.nan, dtype=float)  # delta + step1
  y2 = np.full(lenx, np.nan, dtype=float)  # delta + step1 + step2
  y3 = np.full(lenx, np.nan, dtype=float)  # delta + step1 + step2 + ss
  noise_std = np.full(lenx, np.nan, dtype=float)
  y_right = np.full(lenx, np.nan, dtype=float)

  for i in range(lenx):
    # First query at which the cumulative answered count reaches x[i].
    idx = np.searchsorted(answered, x[i])
    if idx < len(eps_partitioned):
      y0[i] = eps_partitioned[idx].delta
      y1[i] = y0[i] + eps_partitioned[idx].step1
      y2[i] = y1[i] + eps_partitioned[idx].step2
      y3[i] = y2[i] + eps_partitioned[idx].ss
      noise_std[i] = ss_std_opt[idx]
      y_right[i] = order_opt[idx]

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  fig.patch.set_alpha(0)

  ax.plot(x, y3, color='b', ls='-', label=r'Total privacy cost', linewidth=1)

  for y in (y0, y1, y2):
    ax.plot(x, y, color='b', ls='-', label=r'_nolegend_', alpha=.5, linewidth=1)

  ax.fill_between(x, [0] * lenx, y0.tolist(), facecolor='b', alpha=.5)
  ax.fill_between(x, y0.tolist(), y1.tolist(), facecolor='b', alpha=.4)
  ax.fill_between(x, y1.tolist(), y2.tolist(), facecolor='b', alpha=.3)
  ax.fill_between(x, y2.tolist(), y3.tolist(), facecolor='b', alpha=.2)

  ax.fill_between(x, (y3 - noise_std).tolist(), (y3 + noise_std).tolist(),
                  facecolor='r', alpha=.5)

  plt.xticks(np.arange(0, xlim + 1000, 2000))
  plt.xlim([0, xlim])
  ax.set_ylim([0, 3.])

  ax.set_xlabel('Number of queries answered', fontsize=16)
  ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)

  if print_order:
    # Secondary axis (right-hand side) for the optimal order.
    ax2 = ax.twinx()
    ax2.plot(x, y_right, 'r', ls='-', label=r'Optimal order', linewidth=5,
             alpha=.5)
    ax2.grid(False)
    ax2.set_ylim([0, 200.])

  ax.tick_params(labelsize=14)
  plot_filename = os.path.join(figures_dir, 'partition.pdf')
  print('Saving the graph to ' + plot_filename)
  fig.savefig(plot_filename, bbox_inches='tight', dpi=800)
  plt.show()
def run_all_analyses(votes, threshold, sigma1, sigma2, delta):
  """Runs both analyses for Simple GNMax and for Confident GNMax.

  Simple GNMax is obtained by passing None for threshold and sigma1.

  Returns:
    A 4-tuple (simple_ind, conf_ind, simple_dep, conf_dep).
  """
  results = []
  # Data-independent analyses first, then data-dependent ones.
  for analyze in (analyze_gnmax_conf_data_ind, analyze_gnmax_conf_data_dep):
    results.append(analyze(votes, None, None, sigma2, delta))
    results.append(analyze(votes, threshold, sigma1, sigma2, delta))
  return tuple(results)
def run_or_load_all_analyses():
  """Returns the results of all analyses, using the on-disk cache if allowed.

  When --cache is set and the cache file exists, the pickled results are
  returned. Otherwise votes are loaded from --counts_file (truncated to
  --queries rows if that flag is given), all analyses are run, and the result
  is written to the cache file.

  Returns:
    The 4-tuple produced by run_all_analyses.

  Raises:
    ValueError: If --queries exceeds the number of rows in the votes file.
  """
  temp_filename = os.path.expanduser('~/tmp/partition_cached.pkl')

  if FLAGS.cache and os.path.isfile(temp_filename):
    print('Reading from cache ' + temp_filename)
    # NOTE: unpickling runs arbitrary code; only use a cache file written by
    # this script.
    with open(temp_filename, 'rb') as f:
      return pickle.load(f)

  fin_name = os.path.expanduser(FLAGS.counts_file)
  print('Reading raw votes from ' + fin_name)
  sys.stdout.flush()

  votes = np.load(fin_name)

  if FLAGS.queries is not None:
    if votes.shape[0] < FLAGS.queries:
      raise ValueError('Expect {} rows, got {} in {}'.format(
          FLAGS.queries, votes.shape[0], fin_name))
    # Truncate the votes matrix to the number of queries made.
    votes = votes[:FLAGS.queries, ]

  all_analyses = run_all_analyses(votes, FLAGS.threshold, FLAGS.sigma1,
                                  FLAGS.sigma2, FLAGS.delta)

  print('Writing to cache ' + temp_filename)
  # Create the cache directory first so that the results of the (slow)
  # analyses are not lost to a missing ~/tmp.
  cache_dir = os.path.dirname(temp_filename)
  if not os.path.isdir(cache_dir):
    os.makedirs(cache_dir)
  with open(temp_filename, 'wb') as f:
    pickle.dump(all_analyses, f)

  return all_analyses
def main(argv):
  """Runs (or loads) the analyses and produces both plots."""
  del argv  # Unused.

  analyses = run_or_load_all_analyses()
  simple_ind, conf_ind, simple_dep, conf_dep = analyses

  output_dir = os.path.expanduser(FLAGS.figures_dir)

  plot_comparison(output_dir, simple_ind, conf_ind, simple_dep, conf_dep)
  plot_partition(output_dir, conf_dep, True)
  plt.close('all')


if __name__ == '__main__':
  app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots graphs for the slide deck.
A script in support of the PATE2 paper. The input is a file containing a numpy
array of votes, one query per row, one class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output graphs are visualized using the TkAgg backend.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
import random
plt.style.use('ggplot')

# Command-line configuration for this script.
FLAGS = flags.FLAGS
# Path passed to np.load in main; expected to hold the votes matrix.
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
# Read by setup_plot to make the figure background transparent.
flags.DEFINE_boolean('transparent', False, 'Set background to transparent.')

flags.mark_flag_as_required('counts_file')
def setup_plot():
  """Creates a 4.7 x 4.5 figure, transparent if --transparent is set.

  Returns:
    The (figure, axes) pair from plt.subplots().
  """
  figure, axes = plt.subplots()
  figure.set_figheight(4.5)
  figure.set_figwidth(4.7)

  if FLAGS.transparent:
    figure.patch.set_alpha(0)

  return figure, axes
def plot_rdp_curve_per_example(votes, sigmas):
  """Plots data-dependent and data-independent RDP curves per example.

  Args:
    votes: Matrix of votes, one example per row.
    sigmas: Noise scales; one curve of each kind is drawn per sigma.
  """
  orders = np.linspace(1., 100., endpoint=True, num=1000)
  orders[0] = 1.001  # Replace the first order (exactly 1) by a close value.
  _, ax = setup_plot()

  # Data-dependent curves: one per (example, sigma) pair.
  for row in range(votes.shape[0]):
    example = votes[row,]
    for sigma in sigmas:
      logq = pate.compute_logq_gaussian(example, sigma)
      rdp_curve = pate.rdp_gaussian(logq, sigma, orders)
      curve_label = r'Data-dependent bound, $\sigma$={}'.format(int(sigma))
      ax.plot(orders, rdp_curve, alpha=1., label=curve_label, linewidth=5)

  # Data-independent curves: one per sigma.
  for sigma in sigmas:
    bound = pate.rdp_data_independent_gaussian(sigma, orders)
    bound_label = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    ax.plot(orders, bound, alpha=.3, label=bound_label, linewidth=10)

  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  ax.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
def plot_rdp_of_sigma(v, order):
  """Plots the data-dependent RDP of one example as a function of sigma.

  Args:
    v: Vector of votes for a single example.
    order: The order at which RDP is evaluated.
  """
  sigmas = np.linspace(1., 1000., endpoint=True, num=1000)
  fig, ax = setup_plot()

  y = np.zeros(len(sigmas))

  for i, sigma in enumerate(sigmas):
    logq = pate.compute_logq_gaussian(v, sigma)
    y[i] = pate.rdp_gaussian(logq, sigma, order)

  ax.plot(sigmas, y, alpha=.8, linewidth=5)

  plt.xlim(xmin=1, xmax=1000)
  plt.ylim(ymin=0)
  # Hide the y tick labels. A boolean is required: the string 'off' is truthy
  # and does not hide the labels in current matplotlib.
  ax.tick_params(labelleft=False)
  plt.xlabel(r'Noise $\sigma$', fontsize=16)
  plt.ylabel(r'RDP at order $\alpha={}$'.format(order), fontsize=16)
  ax.tick_params(labelsize=14)

  plt.show()
def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders,
                      target_answered):
  """Accumulates RDP until the expected number of answers hits a target.

  Args:
    votes: Iterable of vote vectors, one per query.
    threshold: Threshold of the selection step.
    sigma1: Noise scale of the selection step.
    sigma2: Noise scale of the argmax step.
    orders: Orders at which RDP is evaluated.
    target_answered: Expected number of answered queries at which to stop.

  Returns:
    Cumulative RDP (one value per order) at the point where the expected
    number of answered queries first reaches target_answered.

  Raises:
    AssertionError: If all votes are consumed before reaching the target.
  """
  rdp_cum = np.zeros(len(orders))
  answered = 0
  for i, v in enumerate(votes):
    v = sorted(v, reverse=True)
    q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
    logq_step2 = pate.compute_logq_gaussian(v, sigma2)
    rdp = pate.rdp_gaussian(logq_step2, sigma2, orders)
    # The cost of step 2 is weighted by the probability of answering.
    rdp_cum += q_step1 * rdp

    answered += q_step1
    if answered >= target_answered:
      # i is zero-based, so i + 1 queries have been processed.
      print('Processed {} queries to answer {}.'.format(i + 1,
                                                        target_answered))
      return rdp_cum

  # Raised explicitly (not via `assert False`) so that running with -O cannot
  # turn this failure into a silent `return None`.
  raise AssertionError(
      'Never reached {} answered queries.'.format(target_answered))
def plot_rdp_total(votes, sigmas):
  """Plots the cumulative data-dependent RDP curve for each sigma.

  Each curve is the output of compute_rdp_curve with threshold 5000,
  sigma1 1000 and a target of 2000 answered queries.

  Args:
    votes: Matrix of votes, one query per row.
    sigmas: Noise scales of the argmax step; one curve is drawn per sigma.
  """
  target_answered = 2000
  orders = np.linspace(1., 100., endpoint=True, num=100)
  orders[0] = 1.1  # Replace the first order (exactly 1) by a larger value.

  _, ax = setup_plot()

  for sigma in sigmas:
    total_rdp = compute_rdp_curve(votes, 5000, 1000, sigma, orders,
                                  target_answered)
    curve_label = r'Data-dependent bound, $\sigma$={}'.format(int(sigma))
    ax.plot(orders, total_rdp, alpha=.8, label=curve_label, linewidth=5)

  # The data-independent bounds are intentionally not drawn on this plot.

  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0005, .001, .0015, .002])

  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  ax.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
def plot_data_ind_curve():
  """Plots the data-independent RDP bound for sigma = 1 on orders 1 to 10."""
  _, ax = setup_plot()

  orders = np.linspace(1., 10., endpoint=True, num=1000)
  orders[0] = 1.01  # Replace the first order (exactly 1) by a larger value.

  bound = pate.rdp_data_independent_gaussian(1., orders)
  ax.plot(orders, bound, alpha=.5, color='gray', linewidth=10)

  plt.xlim(xmin=1, xmax=10)
  plt.ylim(ymin=0)
  plt.xticks([1, 3, 5, 7, 9])
  ax.tick_params(labelsize=14)
  plt.show()
def plot_two_data_ind_curves():
  """Plots the data-independent RDP bounds for sigma = 100 and sigma = 150."""
  orders = np.linspace(1., 100., endpoint=True, num=1000)
  orders[0] = 1.001  # Replace the first order (exactly 1) by a close value.

  _, ax = setup_plot()

  for sigma in [100, 150]:
    bound = pate.rdp_data_independent_gaussian(sigma, orders)
    bound_label = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    ax.plot(orders, bound, alpha=.3, label=bound_label, linewidth=10)

  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  ax.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
def scatter_plot(votes, threshold, sigma1, sigma2, order):
  """Scatter-plots RDP against the max count for randomly selected queries.

  Each query is kept with the probability of passing the threshold check
  (or always, when threshold or sigma1 is None).

  Args:
    votes: Iterable of vote vectors, one per query.
    threshold: Threshold of the selection step, or None.
    sigma1: Noise scale of the selection step, or None.
    sigma2: Noise scale of the argmax step.
    order: The order at which RDP is evaluated.
  """
  _, ax = setup_plot()
  thresholding = threshold is not None and sigma1 is not None

  max_counts = []
  rdp_values = []
  for v in votes:
    if thresholding:
      q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
    else:
      q_step1 = 1.

    # One random draw per query, whether or not thresholding is in effect.
    if not random.random() < q_step1:
      continue

    logq_step2 = pate.compute_logq_gaussian(v, sigma2)
    max_counts.append(max(v))
    rdp_values.append(pate.rdp_gaussian(logq_step2, sigma2, order))

  print('Selected {} queries.'.format(len(max_counts)))

  # The data-independent bound is intentionally not drawn on this plot.
  ax.set_yscale('log')
  plt.xlim(xmin=0, xmax=5000)
  plt.ylim(ymin=1e-300, ymax=1)
  plt.yticks([1, 1e-100, 1e-200, 1e-300])
  plt.scatter(max_counts, rdp_values, s=1, alpha=0.5)
  plt.ylabel(r'RDP at $\alpha={}$'.format(order), fontsize=16)
  plt.xlabel(r'max count', fontsize=16)
  ax.tick_params(labelsize=14)
  plt.show()
def main(argv):
  """Shows all slide-deck plots one after another."""
  del argv  # Unused.

  votes_path = os.path.expanduser(FLAGS.counts_file)
  print('Reading raw votes from ' + votes_path)
  sys.stdout.flush()

  # Plots that do not need the votes file.
  plot_data_ind_curve()
  plot_two_data_ind_curves()

  # Plots for a single hand-picked example.
  v1 = [2550, 2200, 250]  # based on votes[2,]
  sigmas = (100., 150.)
  plot_rdp_curve_per_example(np.array([v1]), sigmas)
  plot_rdp_of_sigma(np.array(v1), 20.)

  # Plots computed from the votes file.
  votes = np.load(votes_path)
  plot_rdp_total(votes[:12000, ], sigmas)

  scatter_votes = votes[:6000, ]
  scatter_plot(scatter_votes, None, None, 100, 20)  # w/o thresholding
  scatter_plot(scatter_votes, 3500, 1500, 100, 20)  # with thresholding


if __name__ == '__main__':
  app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Illustrates how noisy thresholding check changes distribution of queries.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is one of two graphs depending on the setting of the plot variable.
The output is written to a pdf file.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')

# Command-line configuration for this script.
FLAGS = flags.FLAGS
# Adjacent string literals are joined verbatim, so the first fragment must end
# with a space to keep the help text readable.
flags.DEFINE_enum('plot', 'small', ['small', 'large'], 'Selects which of '
                  'the two plots is produced.')
# Path passed to np.load in main; expected to hold the votes matrix.
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('plot_file', '', 'Plot file to write.')

flags.mark_flag_as_required('counts_file')
def compute_count_per_bin(bin_num, votes):
  """Tabulates number of examples in each bin.

  An example is binned by the share of votes received by its top class:
  bin k covers shares in [k / bin_num, (k + 1) / bin_num), and the last bin
  also includes a share of exactly 1 (all votes for one class).

  Args:
    bin_num: Number of bins.
    votes: A matrix of votes, where each row contains votes in one instance.

  Returns:
    Array of counts of length bin_num.
  """
  sums = np.sum(votes, axis=1)
  # Check that all rows contain the same number of votes.
  assert max(sums) == min(sums)

  s = max(sums)

  counts = np.zeros(bin_num)
  n = votes.shape[0]

  for i in range(n):
    v = votes[i,]
    # A unanimous row gives max(v) == s, i.e. index bin_num; clamp it into
    # the last bin instead of failing the range check below.
    bin_idx = min(int(math.floor(max(v) * bin_num / s)), bin_num - 1)
    assert 0 <= bin_idx < bin_num
    counts[bin_idx] += 1

  return counts
def compute_privacy_cost_per_bins(bin_num, votes, sigma2, order):
  """Outputs average privacy cost per bin.

  An example is binned by the share of votes received by its top class; a
  share of exactly 1 falls into the last bin.

  Args:
    bin_num: Number of bins.
    votes: A matrix of votes, where each row contains votes in one instance.
    sigma2: The scale (std) of the Gaussian noise. (Same as sigma_2 in
      Algorithms 1 and 2.)
    order: The Renyi order for which privacy cost is computed.

  Returns:
    Expected eps of RDP (ignoring delta) per example in each bin. Bins that
    contain no examples come out as nan (0 / 0).
  """
  n = votes.shape[0]

  bin_counts = np.zeros(bin_num)
  bin_rdp = np.zeros(bin_num)  # RDP at order=order

  for i in range(n):
    v = votes[i,]
    logq = pate.compute_logq_gaussian(v, sigma2)
    rdp_at_order = pate.rdp_gaussian(logq, sigma2, order)

    # A unanimous row gives index bin_num; clamp it into the last bin
    # instead of failing the range check below.
    bin_idx = min(int(math.floor(max(v) * bin_num / sum(v))), bin_num - 1)
    assert 0 <= bin_idx < bin_num
    bin_counts[bin_idx] += 1
    bin_rdp[bin_idx] += rdp_at_order
    if (i + 1) % 1000 == 0:  # Progress report every 1000 examples.
      print('example {}'.format(i + 1))
      sys.stdout.flush()

  return bin_rdp / bin_counts
def compute_expected_answered_per_bin(bin_num, votes, threshold, sigma1):
  """Computes expected number of answers per bin.

  An example is binned by the share of votes received by its top class; a
  share of exactly 1 falls into the last bin.

  Args:
    bin_num: Number of bins.
    votes: A matrix of votes, where each row contains votes in one instance.
    threshold: The threshold against which check is performed.
    sigma1: The std of the Gaussian noise with which check is performed. (Same
      as sigma_1 in Algorithms 1 and 2.)

  Returns:
    Expected number of queries answered per bin.
  """
  n = votes.shape[0]

  bin_answered = np.zeros(bin_num)

  for i in range(n):
    v = votes[i,]
    # Probability that this query passes the noisy threshold check.
    p = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
    # A unanimous row gives index bin_num; clamp it into the last bin
    # instead of failing the range check below.
    bin_idx = min(int(math.floor(max(v) * bin_num / sum(v))), bin_num - 1)
    assert 0 <= bin_idx < bin_num
    bin_answered[bin_idx] += p
    if (i + 1) % 1000 == 0:  # Progress report every 1000 examples.
      print('example {}'.format(i + 1))
      sys.stdout.flush()

  return bin_answered
def main(argv):
  """Draws the plot selected by --plot and saves it to --plot_file.

  The 'small' plot compares all queries with the expected answers of
  Confident-GNMax using 5 bins. The 'large' plot uses 10 bins, adds a second,
  more aggressive threshold and overlays the per-query privacy cost.

  Raises:
    ValueError: If --plot is neither 'small' nor 'large'.
  """
  del argv  # Unused.
  fin_name = os.path.expanduser(FLAGS.counts_file)
  print('Reading raw votes from ' + fin_name)
  sys.stdout.flush()

  votes = np.load(fin_name)
  votes = votes[:4000,]  # truncate to 4000 samples

  if FLAGS.plot == 'small':
    bin_num = 5
    m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
  elif FLAGS.plot == 'large':
    bin_num = 10
    m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
    a_check = compute_expected_answered_per_bin(bin_num, votes, 5000, 1500)
    eps = compute_privacy_cost_per_bins(bin_num, votes, 100, 50)
  else:
    raise ValueError('--plot flag must be one of ["small", "large"]')

  counts = compute_count_per_bin(bin_num, votes)
  # Left edges of the bins, in percent.
  bins = np.linspace(0, 100, num=bin_num, endpoint=False)

  plt.close('all')
  fig, ax = plt.subplots()
  if FLAGS.plot == 'small':
    fig.set_figheight(5)
    fig.set_figwidth(5)
    ax.bar(
        bins,
        counts,
        20,
        color='orangered',
        linestyle='dotted',
        linewidth=5,
        edgecolor='red',
        fill=False,
        alpha=.5,
        align='edge',
        label='LNMax answers')
    ax.bar(
        bins,
        m_check,
        20,
        color='g',
        alpha=.5,
        linewidth=0,
        edgecolor='g',
        align='edge',
        label='Confident-GNMax\nanswers')
  elif FLAGS.plot == 'large':
    fig.set_figheight(4.7)
    fig.set_figwidth(7)
    ax.bar(
        bins,
        counts,
        10,
        linestyle='dashed',
        linewidth=5,
        edgecolor='red',
        fill=False,
        alpha=.5,
        align='edge',
        label='LNMax answers')
    ax.bar(
        bins,
        m_check,
        10,
        color='g',
        alpha=.5,
        linewidth=0,
        edgecolor='g',
        align='edge',
        label='Confident-GNMax\nanswers (moderate)')
    ax.bar(
        bins,
        a_check,
        10,
        color='b',
        alpha=.5,
        align='edge',
        label='Confident-GNMax\nanswers (aggressive)')
    # Secondary (log-scale) axis for the per-query privacy cost.
    ax2 = ax.twinx()
    bin_centers = [x + 5 for x in bins]
    ax2.plot(bin_centers, eps, 'ko', alpha=.8)
    ax2.set_ylim([1e-200, 1.])
    ax2.set_yscale('log')
    ax2.grid(False)
    ax2.set_yticks([1e-3, 1e-50, 1e-100, 1e-150, 1e-200])
    # A boolean is required here: the string 'off' is truthy and does not
    # hide the minor ticks in current matplotlib.
    plt.tick_params(which='minor', right=False)
    ax2.set_ylabel(r'Per query privacy cost $\varepsilon$', fontsize=16)

  plt.xlim([0, 100])
  ax.set_ylim([0, 2500])
  ax.set_xlabel('Percentage of teachers that agree', fontsize=16)
  ax.set_ylabel('Number of queries answered', fontsize=16)
  vals = ax.get_xticks()
  ax.set_xticklabels([str(int(x)) + '%' for x in vals])
  ax.tick_params(labelsize=14, bottom=True, top=True, left=True, right=True)
  ax.legend(loc=2, prop={'size': 16})

  # simple: 'figures/noisy_thresholding_check_perf.pdf')
  # detailed: 'figures/noisy_thresholding_check_perf_details.pdf'

  # --plot_file defaults to ''; saving to an empty path fails, so only save
  # when a file name was actually given.
  if FLAGS.plot_file:
    print('Saving the graph to ' + FLAGS.plot_file)
    plt.savefig(os.path.expanduser(FLAGS.plot_file), bbox_inches='tight')
  plt.show()


if __name__ == '__main__':
  app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots three graphs illustrating cost of privacy per answered query.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of three pdf files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import pickle
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')

# Command-line configuration for this script.
FLAGS = flags.FLAGS
flags.DEFINE_boolean('cache', False,
                     'Read results of privacy analysis from cache.')
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')

flags.mark_flag_as_required('counts_file')
def run_analysis(votes, mechanism, noise_scale, params):
  """Computes data-dependent privacy.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf')
    noise_scale: A mechanism privacy parameter.
    params: Other privacy parameters. Only read for 'gnmax_conf', where the
      keys 't' and 'sigma1' are looked up.

  Returns:
    Four lists: cumulative privacy cost epsilon, how privacy budget is split,
    how many queries were answered, optimal order.

  Raises:
    ValueError: If mechanism is not one of the three supported names.
  """

  def compute_partition(order_opt, eps):
    # Splits eps at the optimal order into its components. Reads orders,
    # delta and the cumulative arrays from the enclosing scope, which are
    # assigned further below, before the first call.
    order_opt_idx = np.searchsorted(orders, order_opt)
    if mechanism == 'gnmax_conf':
      p = (rdp_select_cum[order_opt_idx],
           rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx],
           -math.log(delta) / (order_opt - 1))
    else:
      p = (rdp_cum[order_opt_idx], -math.log(delta) / (order_opt - 1))
    return [x / eps for x in p]  # Ensures that sum(x) == 1

  # Short list of orders.
  # orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
  #                  np.logspace(np.log10(50), np.log10(1000), num=20))))

  # Long list of orders.
  orders = np.concatenate((np.arange(2, 100 + 1, .5),
                           np.logspace(np.log10(100), np.log10(500), num=100)))
  delta = 1e-8

  n = votes.shape[0]
  eps_total = np.zeros(n)
  partition = [None] * n
  order_opt = np.full(n, np.nan, dtype=float)
  answered = np.zeros(n, dtype=float)

  # Running sums over queries, one entry per order.
  rdp_cum = np.zeros(len(orders))
  rdp_sqrd_cum = np.zeros(len(orders))
  rdp_select_cum = np.zeros(len(orders))
  answered_sum = 0

  for i in range(n):
    v = votes[i,]
    if mechanism == 'lnmax':
      logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
      rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
      rdp_sqrd = rdp_query ** 2
      pr_answered = 1
    elif mechanism == 'gnmax':
      logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
      rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
      rdp_sqrd = rdp_query ** 2
      pr_answered = 1
    elif mechanism == 'gnmax_conf':
      logq_step1 = pate.compute_logpr_answered(params['t'], params['sigma1'], v)
      logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
      q_step1 = np.exp(logq_step1)
      # Log of the smaller of q_step1 and 1 - q_step1.
      logq_step1_min = min(logq_step1, math.log1p(-q_step1))
      rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min,
                                          2 ** .5 * params['sigma1'], orders)
      rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale, orders)
      rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
      # The expression below evaluates
      #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
      rdp_sqrd = (
          rdp_gnmax_step1 ** 2 + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2
          + q_step1 * rdp_gnmax_step2 ** 2)
      rdp_select_cum += rdp_gnmax_step1
      pr_answered = q_step1
    else:
      raise ValueError(
          'Mechanism must be one of ["lnmax", "gnmax", "gnmax_conf"]')

    rdp_cum += rdp_query
    rdp_sqrd_cum += rdp_sqrd
    answered_sum += pr_answered

    answered[i] = answered_sum
    eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
        orders, rdp_cum, delta)
    partition[i] = compute_partition(order_opt[i], eps_total[i])

    if i > 0 and (i + 1) % 1000 == 0:
      # NOTE(review): the sums cover i + 1 queries but are divided by i --
      # confirm whether that is intended. Only the printed std is affected.
      rdp_var = rdp_sqrd_cum / i - (
          rdp_cum / i) ** 2  # Ignore Bessel's correction.
      order_opt_idx = np.searchsorted(orders, order_opt[i])
      eps_std = ((i + 1) * rdp_var[order_opt_idx]) ** .5  # Std of the sum.
      print(
          'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
          'at order = {:.2f} (contribution from delta = {:.3f})'.format(
              i + 1, answered_sum, eps_total[i], eps_std, order_opt[i],
              -math.log(delta) / (order_opt[i] - 1)))
      sys.stdout.flush()

  return eps_total, partition, answered, order_opt
def print_plot_small(figures_dir, eps_lap, eps_gnmax, answered_gnmax):
  """Plots the cumulative privacy cost of LNMax against Confident-GNMax.

  The x axis is the number of queries answered, sampled every 10 queries up
  to `xlim`. The figure is saved as 'lnmax_vs_gnmax.pdf' in `figures_dir` and
  then shown interactively.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism, indexed by
      query number.
    eps_gnmax: The cumulative privacy costs of the Gaussian mechanism.
    answered_gnmax: The cumulative count of queries answered (same length as
      eps_gnmax); used to map "queries answered" back to a query index.
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  # Points without data stay NaN; this covers inputs shorter than `xlim`
  # queries, which previously raised an IndexError for the Laplace curve.
  y_lap = np.full(len(x_axis), np.nan, dtype=float)
  y_gnmax = np.full(len(x_axis), np.nan, dtype=float)

  for i, x in enumerate(x_axis):
    if x < len(eps_lap):
      y_lap[i] = eps_lap[x]
    # First query index by which at least x queries were (expected to be)
    # answered.
    idx = np.searchsorted(answered_gnmax, x)
    if idx < len(eps_gnmax):
      y_gnmax[i] = eps_gnmax[idx]

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  ax.plot(
      x_axis, y_lap, color='r', ls='--', label='LNMax', alpha=.5, linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax,
      color='g',
      ls='-',
      label='Confident-GNMax',
      alpha=.5,
      linewidth=5)
  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, xlim])
  plt.ylim([0, 6.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)
  fout_name = os.path.join(figures_dir, 'lnmax_vs_gnmax.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
def print_plot_large(figures_dir, eps_lap, eps_gnmax1, answered_gnmax1,
                     eps_gnmax2, partition_gnmax2, answered_gnmax2):
  """Plots a graph of LNMax vs GNMax with two parameters.

  In addition to the three cost curves, the area under the second GNMax curve
  is split into two shaded bands according to `partition_gnmax2`. The figure
  is saved as 'lnmax_vs_2xgnmax_large.pdf' in `figures_dir` and then shown.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism.
    eps_gnmax1: The cumulative privacy costs of the Gaussian mechanism (set 1).
    answered_gnmax1: The cumulative count of queries answered (set 1).
    eps_gnmax2: The cumulative privacy costs of the Gaussian mechanism (set 2).
    partition_gnmax2: Allocation of eps for set 2; each entry unpacks into
      three values, of which the first two are used as the step 1 and step 2
      shares.
    answered_gnmax2: The cumulative count of queries answered (set 2).
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  lenx = len(x_axis)
  # Points without data stay NaN; this covers inputs shorter than `xlim`
  # queries, which previously raised an IndexError for the Laplace curve.
  y_lap = np.full(lenx, np.nan, dtype=float)
  y_gnmax1 = np.full(lenx, np.nan, dtype=float)
  y_gnmax2 = np.full(lenx, np.nan, dtype=float)
  y1_gnmax2 = np.full(lenx, np.nan, dtype=float)

  for i, x in enumerate(x_axis):
    if x < len(eps_lap):
      y_lap[i] = eps_lap[x]
    idx1 = np.searchsorted(answered_gnmax1, x)
    if idx1 < len(eps_gnmax1):
      y_gnmax1[i] = eps_gnmax1[idx1]
    idx2 = np.searchsorted(answered_gnmax2, x)
    if idx2 < len(eps_gnmax2):
      y_gnmax2[i] = eps_gnmax2[idx2]
      fraction_step1, fraction_step2, _ = partition_gnmax2[idx2]
      # Portion of the total cost attributed to step 1, with the two shares
      # renormalized to sum to one.
      y1_gnmax2[i] = eps_gnmax2[idx2] * fraction_step1 / (
          fraction_step1 + fraction_step2)

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  ax.plot(
      x_axis,
      y_lap,
      color='r',
      ls='dashed',
      label='LNMax',
      alpha=.5,
      linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax1,
      color='g',
      ls='-',
      label='Confident-GNMax (moderate)',
      alpha=.5,
      linewidth=5)
  ax.plot(
      x_axis,
      y_gnmax2,
      color='b',
      ls='-',
      label='Confident-GNMax (aggressive)',
      alpha=.5,
      linewidth=5)
  # Hatched band: step 1 share of the set 2 cost.
  ax.fill_between(
      x_axis, [0] * lenx,
      y1_gnmax2.tolist(),
      facecolor='b',
      alpha=.3,
      hatch='\\')
  ax.plot(
      x_axis,
      y1_gnmax2,
      color='b',
      ls='-',
      label='_nolegend_',
      alpha=.5,
      linewidth=1)
  # Plain band: the remainder of the set 2 cost.
  ax.fill_between(
      x_axis, y1_gnmax2.tolist(), y_gnmax2.tolist(), facecolor='b', alpha=.3)
  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, xlim])
  plt.ylim([0, 1.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)
  fout_name = os.path.join(figures_dir, 'lnmax_vs_2xgnmax_large.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
def run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2):
  """Runs the Laplace analysis, then one Confident-GNMax analysis per setting.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    lambda_laplace: The scale of the Laplace noise (lambda).
    gnmax_parameters: A list of parameters for GNMax.
    sigma2: Shared parameter for the GNMax mechanisms.

  Returns:
    A 5-tuple: the Laplace eps values as returned by run_analysis, followed by
    four lists (eps, partition, answered, optimal order), each holding one
    run_analysis result per entry of gnmax_parameters.
  """
  print('=== Laplace Mechanism ===')
  eps_lap, _, _, _ = run_analysis(votes, 'lnmax', lambda_laplace, None)
  print()

  # The plain ('gnmax') mechanism is intentionally not analyzed here; its
  # results are not consumed anywhere for now.
  eps_gnmax, partition_gmax, answered, order_opt = [], [], [], []
  for params in gnmax_parameters:
    print('=== Gaussian Mechanism (confident) {}: ==='.format(params))
    eps, partition, answered_cum, orders = run_analysis(
        votes, 'gnmax_conf', sigma2, params)
    eps_gnmax.append(eps)
    partition_gmax.append(partition)
    answered.append(answered_cum)
    order_opt.append(orders)
    print()

  return eps_lap, eps_gnmax, partition_gmax, answered, order_opt
def main(argv):
  """Runs (or loads cached) analyses and produces the two comparison plots."""
  del argv  # Unused.

  lambda_laplace = 50.  # corresponds to eps = 1. / lambda_laplace

  # Parameters of the GNMax.
  gnmax_parameters = (
      {'t': 1000, 'sigma1': 500},
      {'t': 3500, 'sigma1': 1500},
      {'t': 5000, 'sigma1': 1500},
  )
  sigma2 = 100  # GNMax parameters differ only in Step 1 (selection).

  ftemp_name = '/tmp/precomputed.pkl'
  figures_dir = os.path.expanduser(FLAGS.figures_dir)

  cache_available = FLAGS.cache and os.path.isfile(ftemp_name)
  if cache_available:
    print('Reading from cache ' + ftemp_name)
    # NOTE(review): unpickling a file at a fixed path under /tmp trusts
    # whoever wrote that file; only use --cache on a machine you control.
    with open(ftemp_name, 'rb') as f:
      results = pickle.load(f)
  else:
    fin_name = os.path.expanduser(FLAGS.counts_file)
    print('Reading raw votes from ' + fin_name)
    sys.stdout.flush()
    votes = np.load(fin_name)
    results = run_all_analyses(votes, lambda_laplace, gnmax_parameters,
                               sigma2)
    # The cache is refreshed on every fresh run, regardless of --cache.
    print('Writing to cache ' + ftemp_name)
    with open(ftemp_name, 'wb') as f:
      pickle.dump(tuple(results), f)

  (eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
   orders_opt_gnmax) = results

  # Small plot uses the first setting; large plot compares the other two.
  print_plot_small(figures_dir, eps_lap, eps_gnmax[0], answered_gnmax[0])
  print_plot_large(figures_dir, eps_lap, eps_gnmax[1], answered_gnmax[1],
                   eps_gnmax[2], partition_gmax[2], answered_gnmax[2])
  plt.close('all')


if __name__ == '__main__':
  app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Performs privacy analysis of GNMax with smooth sensitivity.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
Several flavors of the GNMax algorithm can be analyzed.
- Plain GNMax (argmax w/ Gaussian noise) is assumed when arguments threshold
and sigma1 are missing.
- Confident GNMax (thresholding + argmax w/ Gaussian noise) is used when
threshold, sigma1, and sigma2 are given.
- Interactive GNMax (two- or multi-round) is triggered by specifying
baseline_file, which provides baseline values for votes selection in Step 1.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import numpy as np
import core as pate
import smooth_sensitivity as pate_ss
# Command-line flags of the smooth-sensitivity analysis script.
FLAGS = flags.FLAGS
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('baseline_file', None, 'File with baseline scores.')
flags.DEFINE_boolean('data_independent', False,
'Force data-independent bounds.')
# --threshold and --sigma1 must be given together or omitted together; main()
# rejects any other combination.
flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).')
flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).')
flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).')
flags.DEFINE_integer('queries', None, 'Number of queries made by the student.')
flags.DEFINE_float('delta', 1e-8, 'Target delta.')
flags.DEFINE_float(
'order', None,
'Fixes a Renyi DP order (if unspecified, finds an optimal order from a '
'hardcoded list).')
flags.DEFINE_integer(
'teachers', None,
'Number of teachers (if unspecified, derived from the counts file).')
# Only the counts file and sigma2 are mandatory.
flags.mark_flag_as_required('counts_file')
flags.mark_flag_as_required('sigma2')
def _check_conditions(sigma, num_classes, orders):
  """Verifies conditions C5 and C6 for every order and reports failures.

  The actual check is delegated to pate_ss.check_conditions, which returns one
  flag per condition for a single order. For each order at most one failure
  is printed (C5 takes precedence over C6).

  Args:
    sigma: Noise parameter forwarded to pate_ss.check_conditions.
    num_classes: Number of classes, forwarded as is.
    orders: Iterable of orders to check.

  Returns:
    Truthy iff both conditions hold for all orders.
  """
  print('Checking conditions C5 and C6 for all orders.')
  sys.stdout.flush()

  all_hold = True
  for alpha in orders:
    c5_ok, c6_ok = pate_ss.check_conditions(sigma, num_classes, alpha)
    all_hold &= c5_ok and c6_ok
    if not c5_ok:
      print('Condition C5 does not hold for order =', alpha)
      continue
    if not c6_ok:
      print('Condition C6 does not hold for order =', alpha)

  if all_hold:
    print('Conditions C5-C6 hold for all orders.')
  sys.stdout.flush()
  return all_hold
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
  """Computes the (data-dependent) RDP curve for Confident GNMax.

  Accumulates the expected RDP cost of all queries over `orders`, printing a
  progress line every 1000 queries and after the last one.

  Args:
    votes: Matrix of votes, one query per row.
    baseline: Matrix of the same shape as votes; subtracted from the votes for
      the thresholding step only.
    threshold: Threshold of step 1, or None to skip thresholding.
    sigma1: Noise parameter of step 1 (ignored if threshold is None).
    sigma2: Noise parameter of step 2.
    delta: Target delta for the conversion to (eps, delta)-DP.
    orders: Array of Renyi orders.
    data_ind: If True, use data-independent bounds for both steps.

  Returns:
    The order (an element of `orders`) that minimizes eps after all queries.
  """
  rdp_cum = np.zeros(len(orders))
  rdp_sqrd_cum = np.zeros(len(orders))
  answered = 0
  num_queries = votes.shape[0]

  for i, v in enumerate(votes):
    if threshold is None:
      logq_step1 = 0  # No thresholding, always proceed to step 2.
      rdp_step1 = np.zeros(len(orders))
    else:
      logq_step1 = pate.compute_logpr_answered(threshold, sigma1,
                                               v - baseline[i,])
      if data_ind:
        rdp_step1 = pate.compute_rdp_data_independent_threshold(sigma1, orders)
      else:
        rdp_step1 = pate.compute_rdp_threshold(logq_step1, sigma1, orders)

    if data_ind:
      rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
    else:
      logq_step2 = pate.compute_logq_gaussian(v, sigma2)
      rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

    q_step1 = np.exp(logq_step1)
    rdp = rdp_step1 + rdp_step2 * q_step1
    # The expression below evaluates
    #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
    rdp_sqrd = (
        rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
        q_step1 * rdp_step2**2)
    rdp_sqrd_cum += rdp_sqrd
    rdp_cum += rdp
    answered += q_step1

    processed = i + 1  # Number of queries accumulated so far.
    if processed % 1000 == 0 or processed == num_queries:
      # Variance estimate without Bessel's correction. Normalizing by the
      # number of processed queries (not by the index i) avoids both an
      # off-by-one and a division by zero when there is a single query.
      rdp_var = rdp_sqrd_cum / processed - (rdp_cum / processed)**2
      # Rounding can push a zero variance slightly below zero.
      rdp_var = np.maximum(rdp_var, 0.)
      eps_total, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
      order_opt_idx = np.searchsorted(orders, order_opt)
      eps_std = (processed * rdp_var[order_opt_idx])**.5  # Std of the sum.
      print(
          'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
          'at order = {:.2f} (contribution from delta = {:.3f})'.format(
              processed, answered, eps_total, eps_std, order_opt,
              -math.log(delta) / (order_opt - 1)))
      sys.stdout.flush()

  _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
  return order_opt
def _search_beta(betas, ls_cum, rdp_cum, order):
  """Returns (beta, ss, sigma_ss) with the lowest cost over the beta grid.

  Returns (None, None, None) if no candidate has a cost below infinity.
  """
  best_cost = np.inf
  best = (None, None, None)
  for beta in betas:
    ss = pate_ss.compute_discounted_max(beta, ls_cum)
    # Solution to the minimization problem:
    #   min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
    sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1 / 3)
    cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
        beta, sigma_ss, order)
    # Cost captures exact_eps + cost of releasing SS + two stds of noise.
    cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
    if cost < best_cost:
      best_cost = cost
      best = (beta, ss, sigma_ss)
  return best


def _find_optimal_smooth_sensitivity_parameters(
    votes, baseline, num_teachers, threshold, sigma1, sigma2, delta, ind_step1,
    ind_step2, order):
  """Optimizes smooth sensitivity parameters by minimizing a cost function.

  The cost function is
        exact_eps + cost of GNSS + two stds of noise,
  i.e., the upper end of the confidence interval of the sanitized privacy
  budget. The parameters are re-optimized after every query; progress is
  printed every 100 queries and after the last one.

  Since optimization is done with full view of sensitive data, the results
  cannot be released.

  Returns:
    Tuple (beta, ss, sigma_ss) found after the last query.
  """
  rdp_cum = 0
  answered_cum = 0
  ls_cum = 0

  # Define a plausible range for the beta values.
  betas = np.arange(.3 / order, .495 / order, .01 / order)
  cost_delta = math.log(1 / delta) / (order - 1)
  fully_data_independent = ind_step1 and ind_step2
  last_index = votes.shape[0] - 1

  for i, v in enumerate(votes):
    # Step 1 (thresholding).
    if threshold is None:
      log_pr_answered = 0
      rdp1 = 0
      ls_step1 = np.zeros(num_teachers)
    else:
      log_pr_answered = pate.compute_logpr_answered(threshold, sigma1,
                                                    v - baseline[i,])
      if ind_step1:  # apply data-independent bound for step 1.
        rdp1 = pate.compute_rdp_data_independent_threshold(sigma1, order)
        ls_step1 = np.zeros(num_teachers)
      else:
        rdp1 = pate.compute_rdp_threshold(log_pr_answered, sigma1, order)
        ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
            v - baseline[i,], num_teachers, threshold, sigma1, order)

    pr_answered = math.exp(log_pr_answered)
    answered_cum += pr_answered

    # Step 2 (GNMax).
    if ind_step2:  # apply data-independent bound for step 2.
      rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
      ls_step2 = np.zeros(num_teachers)
    else:
      logq_step2 = pate.compute_logq_gaussian(v, sigma2)
      rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
      # Compute smooth sensitivity.
      ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
          v, num_teachers, sigma2, order)

    rdp_cum += rdp1 + pr_answered * rdp2
    ls_cum += ls_step1 + pr_answered * ls_step2  # Expected local sensitivity.

    if fully_data_independent:
      # Data-independent bounds.
      beta_opt, ss_opt, sigma_ss_opt = 0., 0., np.inf
    else:
      # Data-dependent bounds.
      beta_opt, ss_opt, sigma_ss_opt = _search_beta(betas, ls_cum, rdp_cum,
                                                    order)

    if (i + 1) % 100 == 0 or i == last_index:
      eps_before_ss = rdp_cum + cost_delta
      eps_with_ss = (
          eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
              beta_opt, sigma_ss_opt, order))
      print('{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
            '{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'.
            format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
                   ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
      sys.stdout.flush()

  # Return optimal parameters for the last iteration.
  return beta_opt, ss_opt, sigma_ss_opt
####################
# HELPER FUNCTIONS #
####################
def _load_votes(counts_file, baseline_file, queries):
  """Loads the votes matrix and the matching baseline from .npy files.

  Args:
    counts_file: Path to the votes file ('~' is expanded).
    baseline_file: Path to the baseline file, or None for an all-zero baseline
      of the same shape as the votes.
    queries: Number of leading rows to keep, or None to keep all rows.

  Returns:
    Pair (votes, baseline), both truncated to `queries` rows if requested.

  Raises:
    ValueError: If the two files differ in shape, or if there are fewer rows
      than `queries`.
  """
  counts_path = os.path.expanduser(counts_file)
  print('Reading raw votes from ' + counts_path)
  sys.stdout.flush()
  votes = np.load(counts_path)
  print('Shape of the votes matrix = {}'.format(votes.shape))

  if baseline_file is None:
    baseline = np.zeros_like(votes)
  else:
    baseline_path = os.path.expanduser(baseline_file)
    print('Reading baseline values from ' + baseline_path)
    sys.stdout.flush()
    baseline = np.load(baseline_path)
    if votes.shape != baseline.shape:
      raise ValueError(
          'Counts file and baseline file must have the same shape. Got {} and '
          '{} instead.'.format(votes.shape, baseline.shape))

  if queries is None:
    print('Process all {} input rows. (Use --queries flag to truncate.)'.format(
        votes.shape[0]))
    return votes, baseline

  if votes.shape[0] < queries:
    raise ValueError('Expect {} rows, got {} in {}'.format(
        queries, votes.shape[0], counts_file))
  # Truncate both matrices to the number of queries made.
  return votes[:queries,], baseline[:queries,]
def _count_teachers(votes):
  """Derives the number of teachers as the (common) sum of votes per row.

  Args:
    votes: Matrix of votes, one query per row.

  Returns:
    The number of teachers as an int.

  Raises:
    ValueError: If the matrix has no rows, or if the rows do not all sum to
      the same number.
  """
  row_totals = np.sum(votes, axis=1)
  if row_totals.size == 0:
    raise ValueError(
        'Matrix of votes is empty: cannot derive the number of teachers.')
  # Vectorized reductions instead of Python-level min/max over the array.
  num_teachers = int(np.max(row_totals))
  if np.min(row_totals) != num_teachers:
    raise ValueError(
        'Matrix of votes is malformed: the number of votes is not the same '
        'across rows.')
  return num_teachers
def _is_data_ind_step1(num_teachers, threshold, sigma1, orders):
  """Tells whether step 1 (thresholding) needs only data-independent bounds.

  Without a threshold there is no step 1, so the answer is trivially True.
  Otherwise the per-order answers of
  pate.is_data_independent_always_opt_threshold are combined with np.all.
  """
  if threshold is not None:
    per_order = pate.is_data_independent_always_opt_threshold(
        num_teachers, threshold, sigma1, orders)
    return np.all(per_order)
  return True
def _is_data_ind_step2(num_teachers, num_classes, sigma, orders):
  """Tells whether step 2 (GNMax) needs only data-independent bounds.

  Combines the per-order answers of
  pate.is_data_independent_always_opt_gaussian with np.all.
  """
  per_order = pate.is_data_independent_always_opt_gaussian(
      num_teachers, num_classes, sigma, orders)
  return np.all(per_order)
def main(argv):
  """Runs the RDP analysis and, if needed, the smooth sensitivity analysis."""
  del argv  # Unused.

  threshold_given = FLAGS.threshold is not None
  sigma1_given = FLAGS.sigma1 is not None
  if threshold_given != sigma1_given:
    raise ValueError(
        '--threshold flag and --sigma1 flag must be present or absent '
        'simultaneously.')

  if FLAGS.order is not None:
    orders = np.array([FLAGS.order])
  else:
    # Long list of orders: a dense grid up to 100, log-spaced up to 500.
    # (A shorter alternative: integers 2..50 plus 20 log-spaced, rounded
    # points between 50 and 1000.)
    dense_part = np.arange(2, 100 + 1, .5)
    sparse_part = np.logspace(np.log10(100), np.log10(500), num=100)
    orders = np.concatenate((dense_part, sparse_part))

  votes, baseline = _load_votes(FLAGS.counts_file, FLAGS.baseline_file,
                                FLAGS.queries)

  num_teachers = FLAGS.teachers
  if num_teachers is None:
    num_teachers = _count_teachers(votes)

  num_classes = votes.shape[1]

  # The optimal order found by the RDP analysis is used for everything below.
  order = _compute_rdp(votes, baseline, FLAGS.threshold, FLAGS.sigma1,
                       FLAGS.sigma2, FLAGS.delta, orders,
                       FLAGS.data_independent)

  ind_step1 = _is_data_ind_step1(num_teachers, FLAGS.threshold, FLAGS.sigma1,
                                 order)
  ind_step2 = _is_data_ind_step2(num_teachers, num_classes, FLAGS.sigma2, order)

  if FLAGS.data_independent or (ind_step1 and ind_step2):
    print('Nothing to do here, all analyses are data-independent.')
    return

  conditions_ok = _check_conditions(FLAGS.sigma2, num_classes, [order])
  if not conditions_ok:
    return  # Quit early: sufficient conditions for correctness fail to hold.

  beta_opt, ss_opt, sigma_ss_opt = _find_optimal_smooth_sensitivity_parameters(
      votes, baseline, num_teachers, FLAGS.threshold, FLAGS.sigma1,
      FLAGS.sigma2, FLAGS.delta, ind_step1, ind_step2, order)

  print('Optimal beta = {:.4f}, E[SS_beta] = {:.4}, sigma_ss = {:.2f}'.format(
      beta_opt, ss_opt, sigma_ss_opt))


if __name__ == '__main__':
  app.run(main)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
import matplotlib
import os
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# Command-line flags of the accuracy plot script.
FLAGS = flags.FLAGS
# Defaults to the empty string, i.e. no output file name is set.
flags.DEFINE_string('plot_file', '', 'Output file name.')
# Hard-coded data series for the accuracy plot. Each qa_* list (x values,
# plotted as "number of queries answered") is paired index by index with the
# acc_* list of the same suffix (y values, plotted as student accuracy in %).

# range() must be materialized: on Python 3 it is not a list and cannot be
# concatenated to one.
qa_lnmax = [500, 750] + list(range(1000, 12500, 500))

acc_lnmax = [43.3, 52.3, 59.8, 66.7, 68.8, 70.5, 71.6, 72.3, 72.6, 72.9, 73.4,
             73.4, 73.7, 73.9, 74.2, 74.4, 74.5, 74.7, 74.8, 75, 75.1, 75.1,
             75.4, 75.4, 75.4]

qa_gnmax = [456, 683, 908, 1353, 1818, 2260, 2702, 3153, 3602, 4055, 4511, 4964,
            5422, 5875, 6332, 6792, 7244, 7696, 8146, 8599, 9041, 9496, 9945,
            10390, 10842]

acc_gnmax = [39.6, 52.2, 59.6, 66.6, 69.6, 70.5, 71.8, 72, 72.7, 72.9, 73.3,
             73.4, 73.4, 73.8, 74, 74.2, 74.4, 74.5, 74.5, 74.7, 74.8, 75, 75.1,
             75.1, 75.4]

qa_gnmax_aggressive = [167, 258, 322, 485, 647, 800, 967, 1133, 1282, 1430,
                       1573, 1728, 1889, 2028, 2190, 2348, 2510, 2668, 2950,
                       3098, 3265, 3413, 3581, 3730]

acc_gnmax_aggressive = [17.8, 26.8, 39.3, 48, 55.7, 61, 62.8, 64.8, 65.4, 66.7,
                        66.2, 68.3, 68.3, 68.7, 69.1, 70, 70.2, 70.5, 70.9,
                        70.7, 71.3, 71.3, 71.3, 71.8]
def main(argv):
  """Plots student accuracy against the number of queries answered.

  Draws the LNMax and Confident-GNMax series, annotates four points with
  their epsilon values, saves the figure to --plot_file (if set), and shows
  it.
  """
  del argv  # Unused.

  plt.close('all')
  fig, ax = plt.subplots()
  fig.set_figheight(4.7)
  fig.set_figwidth(5)
  ax.plot(qa_lnmax, acc_lnmax, color='r', ls='--', linewidth=5., marker='o',
          alpha=.5, label='LNMax')
  ax.plot(qa_gnmax, acc_gnmax, color='g', ls='-', linewidth=5., marker='o',
          alpha=.5, label='Confident-GNMax')
  # The aggressive Confident-GNMax series (qa_gnmax_aggressive /
  # acc_gnmax_aggressive) is intentionally not plotted.

  plt.xticks([0, 2000, 4000, 6000])
  plt.xlim([0, 6000])
  plt.ylim([65, 76])
  ax.tick_params(labelsize=14)
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel('Student test accuracy (%)', fontsize=16)
  plt.legend(loc=2, prop={'size': 16})

  # Epsilon annotations: (x, y) position, value, and color matching the curve.
  x = [400, 2116, 4600, 4680]
  y = [69.5, 68.5, 74, 72.5]
  annotations = [0.76, 2.89, 1.42, 5.76]
  color_annotations = ['g', 'r', 'g', 'r']
  for i, txt in enumerate(annotations):
    ax.annotate(r'${\varepsilon=}$' + str(txt), (x[i], y[i]), fontsize=16,
                color=color_annotations[i])

  plot_filename = os.path.expanduser(FLAGS.plot_file)
  # --plot_file defaults to '', which is not a valid file name; skip saving
  # in that case so that the figure can still be shown.
  if plot_filename:
    plt.savefig(plot_filename, bbox_inches='tight')
  plt.show()


if __name__ == '__main__':
  app.run(main)
Implementation of an RDP privacy accountant and smooth sensitivity analysis for
the PATE framework. The underlying theory and supporting experiments appear in
"Scalable Private Learning with PATE" by Nicolas Papernot, Shuang Song, Ilya
Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar Erlingsson (ICLR 2018,
https://arxiv.org/abs/1802.08908).
## Overview
The PATE ('Private Aggregation of Teacher Ensembles') framework was introduced
by Papernot et al. in "Semi-supervised Knowledge Transfer for Deep Learning from
Private Training Data" (ICLR 2017, https://arxiv.org/abs/1610.05755). The
framework enables model-agnostic training that provably provides [differential
privacy](https://en.wikipedia.org/wiki/Differential_privacy) of the training
dataset.
The framework consists of _teachers_, the _student_ model, and the _aggregator_. The
teachers are models trained on disjoint subsets of the training datasets. The student
model has access to an insensitive (e.g., public) unlabelled dataset, which is labelled by
interacting with the ensemble of teachers via the _aggregator_. The aggregator tallies
outputs of the teacher models, and either forwards a (noisy) aggregate to the student, or
refuses to answer.
Differential privacy is enforced by the aggregator. The privacy guarantees can be _data-independent_,
which means that they are solely a function of the aggregator's parameters. Alternatively, privacy
analysis can be _data-dependent_, which allows for finer reasoning where, under certain conditions on
the input distribution, the final privacy guarantees can be improved relative to the data-independent
analysis. Data-dependent privacy guarantees may themselves be a function of sensitive data, and
therefore publishing these guarantees requires its own sanitization procedure. In our case,
sanitization of data-dependent privacy guarantees proceeds via _smooth sensitivity_ analysis.
The common machinery used for all privacy analyses in this repository is the
R&eacute;nyi differential privacy, or RDP (see https://arxiv.org/abs/1702.07476).
This repository contains implementations of privacy accountants and smooth
sensitivity analysis for several data-independent and data-dependent mechanisms that together
comprise the PATE framework.
### Requirements
* Python, version &ge; 2.7
* absl (see [here](https://github.com/abseil/abseil-py), or just type `pip install absl-py`)
* numpy
* scipy
* sympy (for smooth sensitivity analysis)
* unittest (for testing)
### Self-testing
To verify the installation run
```bash
$ python core_test.py
$ python smooth_sensitivity_test.py
```
## Files in this directory
* core.py &mdash; RDP privacy accountant for several vote aggregators (GNMax,
Threshold, Laplace).
* smooth_sensitivity.py &mdash; Smooth sensitivity analysis for GNMax and
Threshold mechanisms.
* core_test.py and smooth_sensitivity_test.py &mdash; Unit tests for the
files above.
## Contact information
You may direct your comments to mironov@google.com and PR to @ilyamironov.
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Core functions for RDP analysis in PATE framework.
This library comprises the core functions for doing differentially private
analysis of the PATE architecture and its various Noisy Max and other
mechanisms.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from absl import app
import numpy as np
import scipy.stats
def _logaddexp(x):
  """Addition in the log space. Analogue of numpy.logaddexp for a list.

  Args:
    x: A non-empty list or array of log-values.

  Returns:
    log(sum(exp(x))), computed by factoring out the maximum to avoid overflow.
  """
  # Accept plain lists as documented; `x - m` requires an ndarray.
  x = np.asarray(x, dtype=float)
  m = np.max(x)
  if np.isneginf(m):
    # All terms are exp(-inf) = 0; without this guard x - m would be NaN.
    return -np.inf
  return m + math.log(np.sum(np.exp(x - m)))
def _log1mexp(x):
  """Numerically stable computation of log(1-exp(x)) for x <= 0.

  Two algebraically equivalent formulas are used, switching at x = -1.

  Raises:
    ValueError: If x is not in the range (-inf, 0], including NaN.
  """
  if x == 0:
    return -np.inf
  if x < -1:
    return math.log1p(-math.exp(x))
  if x < 0:
    return math.log(-math.expm1(x))
  # Reached for positive arguments and for NaN (all comparisons are False).
  raise ValueError("Argument must be non-positive.")
def compute_eps_from_delta(orders, rdp, delta):
  """Translates between RDP and (eps, delta)-DP.

  For each order, eps = rdp - log(delta) / (order - 1); the smallest eps and
  the order attaining it are returned.

  Args:
    orders: A list (or a scalar) of orders.
    rdp: A list (or a scalar) of RDP guarantees (of the same length as orders).
    delta: Target delta.

  Returns:
    Pair of (eps, optimal_order).

  Raises:
    ValueError: If input is malformed.
  """
  # atleast_1d makes the documented scalar case work; len() of a scalar
  # raises a TypeError.
  orders_vec = np.atleast_1d(orders)
  rdp_vec = np.atleast_1d(rdp)
  if len(orders_vec) != len(rdp_vec):
    raise ValueError("Input lists must have the same length.")
  eps = rdp_vec - math.log(delta) / (orders_vec - 1)
  idx_opt = np.argmin(eps)
  return eps[idx_opt], orders_vec[idx_opt]
#####################
# RDP FOR THE GNMAX #
#####################
def compute_logq_gaussian(counts, sigma):
  """Returns an upper bound on ln Pr[outcome != argmax] for GNMax.

  Implementation of Proposition 7.

  Args:
    counts: A numpy array of scores.
    sigma: The standard deviation of the Gaussian noise in the GNMax mechanism.

  Returns:
    logq: Natural log of the probability that outcome is different from argmax.
  """
  num_classes = len(counts)
  winner = np.argmax(counts)
  # Gaps between the top score and every other score (winner excluded).
  gaps = np.delete(counts[winner] - counts, winner)

  # Upper bound q via a union bound rather than a more precise calculation.
  # The difference of two noisy scores has variance 2 * sigma^2.
  pairwise_scale = math.sqrt(2 * sigma**2)
  logq = _logaddexp(scipy.stats.norm.logsf(gaps, scale=pairwise_scale))

  # A more accurate estimate via the multivariate normal CDF (covariance
  # sigma^2 * (ones + identity) over the n - 1 gaps) is deliberately not used:
  #   1. Numerical instability;
  #   2. Not covered by smooth sensitivity analysis.

  # Cap the bound at log(1 - 1/n).
  cap = math.log(1 - (1 / num_classes))
  return min(logq, cap)
def rdp_data_independent_gaussian(sigma, orders):
  """Computes a data-independent RDP curve for GNMax.

  Implementation of Proposition 8. The bound is order / sigma^2.

  Args:
    sigma: Standard deviation of Gaussian noise.
    orders: An array_like list of Renyi orders.

  Returns:
    Upper bound on RDP for all orders. A scalar if orders is a scalar.

  Raises:
    ValueError: If the input is malformed.
  """
  # Convert first: comparing a plain list with a number raises a TypeError.
  orders_arr = np.asarray(orders)
  if sigma < 0 or np.any(orders_arr <= 1):  # not defined for alpha=1
    raise ValueError("Inputs are malformed.")

  variance = sigma**2
  if np.isscalar(orders):
    return orders / variance
  return np.atleast_1d(orders_arr) / variance
def rdp_gaussian(logq, sigma, orders):
  """Bounds RDP from above of GNMax given an upper bound on q (Theorem 6).

  Starts from the data-independent bound order / sigma^2 and lowers it, for
  the orders where the data-dependent bound applies, to the data-dependent
  value.

  Args:
    logq: Natural logarithm of the probability of a non-argmax outcome.
    sigma: Standard deviation of Gaussian noise.
    orders: An array_like list of Renyi orders.

  Returns:
    Upper bound on RDP for all orders. A scalar if orders is a scalar.

  Raises:
    ValueError: If the input is malformed.
  """
  # Work on an array throughout so that plain lists are accepted as well.
  orders_vec = np.atleast_1d(orders)
  if logq > 0 or sigma < 0 or np.any(orders_vec <= 1):  # undefined for alpha=1
    raise ValueError("Inputs are malformed.")

  if np.isneginf(logq):  # If the mechanism's output is fixed, it has 0-DP.
    if np.isscalar(orders):
      return 0.
    # The np.float alias no longer exists in NumPy; builtin float is the same
    # dtype.
    return np.full_like(orders_vec, 0., dtype=float)

  variance = sigma**2

  # Use two different higher orders: mu_hi1 and mu_hi2 computed according to
  # Proposition 10.
  mu_hi2 = math.sqrt(variance * -logq)
  mu_hi1 = mu_hi2 + 1

  ret = orders_vec / variance  # baseline: data-independent bound

  # Filter out entries where data-dependent bound does not apply.
  mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1)

  rdp_hi1 = mu_hi1 / variance
  rdp_hi2 = mu_hi2 / variance

  log_a2 = (mu_hi2 - 1) * rdp_hi2

  # Make sure q is in the increasing wrt q range and A is positive.
  # np.any(mask) must stay first: it implies mu_hi2 > 1, which keeps the
  # logarithms below well defined.
  if (np.any(mask) and logq <= log_a2 - mu_hi2 *
      (math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))) and
      -logq > rdp_hi2):
    # Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when x ~ 0.
    log1q = _log1mexp(logq)  # log1q = log(1-q)
    log_a = (orders_vec - 1) * (
        log1q - _log1mexp((logq + rdp_hi2) * (1 - 1 / mu_hi2)))
    log_b = (orders_vec - 1) * (rdp_hi1 - logq / (mu_hi1 - 1))

    # Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large x, y.
    log_s = np.logaddexp(log1q + log_a, logq + log_b)
    ret[mask] = np.minimum(ret, log_s / (orders_vec - 1))[mask]

  assert np.all(ret >= 0)

  if np.isscalar(orders):
    # np.asscalar no longer exists in NumPy; ndarray.item() is its equivalent.
    return ret.item()
  return ret
def is_data_independent_always_opt_gaussian(num_teachers, num_classes, sigma,
                                            orders):
  """Tests whether data-ind bound is always optimal for GNMax.

  The data-dependent bound is evaluated on a unanimous vote (all teachers
  voting for one class) and compared with the data-independent bound.

  Args:
    num_teachers: Number of teachers.
    num_classes: Number of classes.
    sigma: Standard deviation of the Gaussian noise.
    orders: An array_like list of Renyi orders.

  Returns:
    Boolean array of length |orders| (a scalar if orders is a scalar). True if
    the data-independent bound is always the same as the data-dependent bound.
  """
  other_classes = [0] * (num_classes - 1)
  unanimous_votes = np.array([num_teachers] + other_classes)
  logq_unanimous = compute_logq_gaussian(unanimous_votes, sigma)

  data_dependent = rdp_gaussian(logq_unanimous, sigma, orders)
  data_independent = rdp_data_independent_gaussian(sigma, orders)
  return np.isclose(data_dependent, data_independent)
###################################
# RDP FOR THE THRESHOLD MECHANISM #
###################################
def compute_logpr_answered(t, sigma, counts):
  """Computes log of the probability that a noisy threshold is crossed.

  Args:
    t: The threshold.
    sigma: The stdev of the Gaussian noise added to the threshold.
    counts: An array of votes.

  Returns:
    Natural log of the probability that max is larger than a noisy threshold.
  """
  # Compared to the paper, max(counts) is rounded to the nearest integer. This
  # is done to facilitate computation of smooth sensitivity for the case of
  # the interactive mechanism, where votes are not necessarily integer.
  top_votes = round(max(counts))
  margin = t - top_votes
  return scipy.stats.norm.logsf(margin, scale=sigma)
def compute_rdp_data_independent_threshold(sigma, orders):
  """Computes the data-independent RDP curve of the threshold mechanism.

  Reuses the GNMax bound with the noise scaled by sqrt(2): the input to the
  threshold mechanism has stability 1, compared to GNMax, which has
  stability = 2.
  """
  scaled_sigma = 2**.5 * sigma
  return rdp_data_independent_gaussian(scaled_sigma, orders)
def compute_rdp_threshold(log_pr_answered, sigma, orders):
  """Computes the data-dependent RDP curve of the threshold mechanism.

  The less likely of the two outcomes (answered / not answered) plays the
  role of q in the GNMax bound. The noise is scaled by sqrt(2): the input to
  the threshold mechanism has stability 1, compared to GNMax, which has
  stability = 2.
  """
  log_pr_not_answered = _log1mexp(log_pr_answered)
  logq = min(log_pr_answered, log_pr_not_answered)
  scaled_sigma = 2**.5 * sigma
  return rdp_gaussian(logq, scaled_sigma, orders)
def is_data_independent_always_opt_threshold(num_teachers, threshold, sigma,
                                             orders):
  """Tests whether data-ind bound is always optimal for the threshold mechanism.

  Args:
    num_teachers: Number of teachers.
    threshold: The cut-off threshold.
    sigma: Standard deviation of the Gaussian noise.
    orders: An array_like list of Renyi orders.

  Returns:
    Boolean array of length |orders| (a scalar if orders is a scalar). True if
    the data-independent bound is always the same as the data-dependent bound.
  """
  # Since the data-dependent bound depends only on max(votes), it suffices to
  # check whether the data-dependent bounds are better than data-independent
  # bounds in the extreme cases when max(votes) is minimal or maximal.
  # For both Confident GNMax and Interactive GNMax it holds that
  #   0 <= max(votes) <= num_teachers.
  # The upper bound is trivial in both cases.
  # The lower bound is trivial for Confident GNMax (and a stronger one, based on
  # the pigeonhole principle, is possible).
  # For Interactive GNMax (Algorithm 2), the lower bound follows from the
  # following argument. Since the votes vector is the difference between the
  # actual teachers' votes and the student's baseline, we need to argue that
  # max(n_j - M * p_j) >= 0.
  # The bound holds because sum_j n_j = sum M * p_j = M. Thus,
  # sum_j (n_j - M * p_j) = 0, and max_j (n_j - M * p_j) >= 0 as needed.
  logq1 = compute_logpr_answered(threshold, sigma, [0])
  logq2 = compute_logpr_answered(threshold, sigma, [num_teachers])

  rdp_dep1 = compute_rdp_threshold(logq1, sigma, orders)
  rdp_dep2 = compute_rdp_threshold(logq2, sigma, orders)

  rdp_ind = compute_rdp_data_independent_threshold(sigma, orders)
  # Elementwise conjunction: Python's `and` raises for arrays with more than
  # one element, so it only worked for a single order.
  return np.logical_and(
      np.isclose(rdp_dep1, rdp_ind), np.isclose(rdp_dep2, rdp_ind))
#############################
# RDP FOR THE LAPLACE NOISE #
#############################
def compute_logq_laplace(counts, lmbd):
  """Computes an upper bound on log Pr[outcome != argmax] for LNMax.

  Args:
    counts: A list (or 1-D array) of scores.
    lmbd: The lambda parameter of the Laplace distribution ~exp(-|x| / lambda).

  Returns:
    logq: Natural log of the probability that outcome is different from argmax.
      The value is capped at log(1 - 1 / len(counts)).
  """
  # For noisy max, we only get an upper bound via the union bound. See Lemma 4
  # in https://arxiv.org/abs/1610.05755.
  #
  # Pr[j beats i*] = (2 + gap(j, i*)) / (4 * exp(gap(j, i*))),
  # where gap(j, i*) = (counts[i*] - counts[j]) / lmbd >= 0.
  # Proof at http://mathoverflow.net/questions/66763/

  # Coerce to a float array so that plain Python lists (as documented above)
  # support the element-wise arithmetic below.
  counts = np.asarray(counts, dtype=float)
  idx_max = np.argmax(counts)

  # Every entry is <= 0 after normalization; the winner itself is exactly 0.
  counts_normalized = (counts - counts[idx_max]) / lmbd
  # Drop the winner: the union bound runs over the competing classes only.
  counts_rest = np.delete(counts_normalized, idx_max)

  logq = _logaddexp(np.log(2 - counts_rest) + math.log(.25) + counts_rest)

  return min(logq, math.log(1 - (1.0 / len(counts))))
def rdp_pure_eps(logq, pure_eps, orders):
  """Computes the RDP value given logq and pure privacy eps.

  Implementation of https://arxiv.org/abs/1610.05755, Theorem 3.

  The bound used is the min of three terms. The first term is from
  https://arxiv.org/pdf/1605.02065.pdf.
  The second term is based on the fact that when event has probability (1-q) for
  q close to zero, q can only change by exp(eps), which corresponds to a
  much smaller multiplicative change in (1-q).
  The third term comes directly from the privacy guarantee.

  Args:
    logq: Natural logarithm of the probability of a non-optimal outcome.
    pure_eps: eps parameter for DP
    orders: array_like list of moments to compute. Entries must differ from 1,
      since the second term divides by (orders - 1).

  Returns:
    Array of upper bounds on rdp (a scalar if orders is a scalar).
  """
  orders_vec = np.atleast_1d(orders)
  q = math.exp(logq)

  # Default for the data-dependent term is +inf so that it never wins the min.
  # The dtype is forced to float: with integer orders, full_like would inherit
  # an integer dtype that cannot represent inf.
  log_t = np.full_like(orders_vec, np.inf, dtype=float)
  if q <= 1 / (math.exp(pure_eps) + 1):
    # The data-dependent term is only used when q is small enough.
    logt_one = math.log1p(-q) + (
        math.log1p(-q) - _log1mexp(pure_eps + logq)) * (
            orders_vec - 1)
    logt_two = logq + pure_eps * (orders_vec - 1)
    log_t = np.logaddexp(logt_one, logt_two)

  ret = np.minimum(
      np.minimum(0.5 * pure_eps * pure_eps * orders_vec,
                 log_t / (orders_vec - 1)), pure_eps)

  if np.isscalar(orders):
    # ndarray.item() replaces np.asscalar, which newer NumPy no longer ships.
    return ret.item()
  else:
    return ret
def main(argv):
  """Entry-point stub: accepts the argv list and performs no work."""
  del argv  # Unused.


# Launch main through app.run only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for pate.core."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unittest
import numpy as np
import core as pate
class PateTest(unittest.TestCase):
  """Unit tests for rdp_gaussian and compute_eps_from_delta in pate.core.

  The checks live in underscore-prefixed helpers; only the two public test_*
  methods at the bottom are picked up by the test runner.
  """

  def _test_rdp_gaussian_value_errors(self):
    """Each argument triple below must make rdp_gaussian raise ValueError."""
    rejected_args = (
        (1.0, 1.0, np.array([2, 3, 4])),
        (np.log(0.5), -1.0, np.array([2, 3, 4])),
        (np.log(0.5), 1.0, np.array([1, 3, 4])),
    )
    for logq, sigma, orders in rejected_args:
      with self.assertRaises(ValueError):
        pate.rdp_gaussian(logq, sigma, orders)

  def _test_rdp_gaussian_as_function_of_q(self):
    """Checks the shape of rdp_gaussian as a function of q around q0.

    Below q0 (data-dependent range) the value must strictly decrease as q is
    halved; above q0 (data-independent range) it must stay equal to its value
    at q0.
    """
    orders = [1.1, 2.5, 32, 250]
    sigmas = [1.5, 15, 1500, 15000]

    # Hand calculated -log(q0)s arranged in a 'sigma major' ordering, i.e. one
    # entry per (sigma, order) pair with order varying fastest. None marks the
    # pair that is skipped (sigma == 1.5 and order == 250).
    neglogq0s = [
        2.8, 2.6, 427, None, 4.8, 4.0, 4.7, 275, 9.6, 8.8, 6.0, 4, 12, 11.2,
        8.6, 6.4
    ]
    sigma_order_grid = [(sigma, order) for sigma in sigmas for order in orders]

    log_halvings = np.array([0, np.log(2), np.log(4), np.log(8)])
    q_increments = np.array([0.1, 0.2, 0.3, 0.4])

    for (sigma, order), neglogq0 in zip(sigma_order_grid, neglogq0s):
      if neglogq0 is None:
        continue

      rdp_at_q0 = pate.rdp_gaussian(-neglogq0, sigma, order)

      # Data-dependent range: q0, q0/2, q0/4, q0/8 in log space.
      logq_dds = -neglogq0 - log_halvings
      for logq_larger, logq_smaller in zip(logq_dds[:-1], logq_dds[1:]):
        self.assertGreater(
            pate.rdp_gaussian(logq_larger, sigma, order),
            pate.rdp_gaussian(logq_smaller, sigma, order))

      # Data-independent range: q0 shifted upwards by fixed increments.
      for q in np.exp(-neglogq0) + q_increments:
        self.assertEqual(rdp_at_q0,
                         pate.rdp_gaussian(np.log(q), sigma, order))

  def _test_compute_eps_from_delta_value_error(self):
    """Four orders paired with three rdp values must raise ValueError."""
    four_orders = [1.1, 2, 3, 4]
    three_rdps = [1, 2, 3]
    with self.assertRaises(ValueError):
      pate.compute_eps_from_delta(four_orders, three_rdps, 0.001)

  def _test_compute_eps_from_delta_monotonicity(self):
    """Epsilon must be non-increasing as delta grows, for every sigma."""
    orders = [1.1, 2.5, 250.0]
    sigmas = [1e-3, 1.0, 1e5]
    deltas = [1e-60, 1e-6, 0.1, 0.999]  # Listed in increasing order.

    for sigma in sigmas:
      rdps_for_gaussian = np.array(orders) / (2 * sigma**2)
      eps_per_delta = [
          pate.compute_eps_from_delta(orders, rdps_for_gaussian, delta)[0]
          for delta in deltas
      ]
      # Already being in descending order is the same as being monotone.
      self.assertEqual(eps_per_delta, sorted(eps_per_delta, reverse=True))

  def _test_compute_q0(self):
    """Prints rdp_gaussian over a logq grid, five values per output line.

    Stub code to search a logq space and figure out logq0 by eyeballing
    results. This code does not run with the tests. Remove underscore to run.
    """
    sigma = 15
    order = 250

    for position, logq in enumerate(np.arange(-290, -270, 1), 1):
      sys.stdout.write("\t%0.5g: %0.10g" %
                       (logq, pate.rdp_gaussian(logq, sigma, order)))
      sys.stdout.flush()
      if position % 5 == 0:
        print("")

  def test_rdp_gaussian(self):
    """Runs both rdp_gaussian helper checks."""
    self._test_rdp_gaussian_value_errors()
    self._test_rdp_gaussian_as_function_of_q()

  def test_compute_eps_from_delta(self):
    """Runs both compute_eps_from_delta helper checks."""
    self._test_compute_eps_from_delta_value_error()
    self._test_compute_eps_from_delta_monotonicity()
# Run the test cases defined above when this file is executed directly.
if __name__ == "__main__":
  unittest.main()
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for pate.smooth_sensitivity."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
import smooth_sensitivity as pate_ss
class PateSmoothSensitivityTest(unittest.TestCase):
  """Regression tests comparing pate.smooth_sensitivity to stored values."""

  def test_check_conditions(self):
    """check_conditions must return the stored pair for each argument set."""
    cases = (
        ((20, 10, 25.), (True, False)),
        ((30, 10, 25.), (True, True)),
    )
    for args, expected in cases:
      self.assertEqual(pate_ss.check_conditions(*args), expected)

  def _assert_all_close(self, x, y):
    """Asserts that two numpy arrays are close.

    The arrays must have equal length and agree element-wise to a relative
    tolerance of 1e-8 (no absolute tolerance).
    """
    self.assertEqual(len(x), len(y))
    within_tolerance = np.allclose(x, y, rtol=1e-8, atol=0)
    self.assertTrue(within_tolerance)

  def test_compute_local_sensitivity_bounds_gnmax(self):
    """Compares GNMax local sensitivity bounds with stored reference arrays."""
    # Test for "going right" in the smooth sensitivity computation.
    votes_right = np.array([10, 0, 0])
    expected_right = np.array(
        [3.13503646e-17, 1.60178280e-08, 5.90681786e-03] + [5.99981308e+00] * 7)
    actual_right = pate_ss.compute_local_sensitivity_bounds_gnmax(
        votes_right, 10, .5, 1.5)
    self._assert_all_close(actual_right, expected_right)

    # Test for "going left" in the smooth sensitivity computation.
    votes_left = np.array([1000, 500, 300, 200, 0])
    actual_left = pate_ss.compute_local_sensitivity_bounds_gnmax(
        votes_left, 2000, 250., 10.)
    expected_left = np.array([0.] * 298 + [2.77693450548e-7, 2.10853979548e-6] +
                             [2.73113623988e-6] * 1700)
    self._assert_all_close(actual_left, expected_left)

  def test_compute_local_sensitivity_bounds_threshold(self):
    """Compares threshold-mechanism bounds with stored reference arrays."""
    counts1_3 = np.array([20, 10, 0])
    num_teachers = sum(counts1_3)
    sigma = 2
    order = 10

    def bounds_for(counts, threshold):
      # All cases share num_teachers, sigma and order.
      return pate_ss.compute_local_sensitivity_bounds_threshold(
          counts, num_teachers, threshold, sigma, order)

    # High threshold.
    expected_high = np.array([0] * 3 + [
        1.48454129e-04, 1.47826870e-02, 3.94153241e-02, 6.45775697e-02,
        9.01543247e-02, 1.16054002e-01, 1.42180452e-01, 1.42180452e-01,
        1.48454129e-04, 1.47826870e-02, 3.94153241e-02, 6.45775697e-02,
        9.01543266e-02, 1.16054000e-01, 1.42180452e-01, 1.68302106e-01,
        1.93127860e-01
    ] + [0] * 10)
    self._assert_all_close(bounds_for(counts1_3, 16), expected_high)

    # Low threshold.
    expected_low = np.array([
        1.60212079e-01, 2.07021132e-01, 2.07021132e-01, 1.93127860e-01,
        1.68302106e-01, 1.42180452e-01, 1.16054002e-01, 9.01543247e-02,
        6.45775697e-02, 3.94153241e-02, 1.47826870e-02, 1.48454129e-04
    ] + [0] * 18)
    self._assert_all_close(bounds_for(counts1_3, 2), expected_low)

    # Very high threshold (larger than the number of teachers).
    expected_very_high = np.array([
        1.35750725752e-19, 1.88990500499e-17, 2.05403154065e-15,
        1.74298153642e-13, 1.15489723995e-11, 5.97584949325e-10,
        2.41486826748e-08, 7.62150641922e-07, 1.87846248741e-05,
        0.000360973025976, 0.000360973025976, 2.76377015215e-50,
        1.00904975276e-53, 2.87254164748e-57, 6.37583360761e-61,
        1.10331620211e-64, 1.48844393335e-68, 1.56535552444e-72,
        1.28328011060e-76, 8.20047697109e-81
    ] + [0] * 10)
    self._assert_all_close(bounds_for(counts1_3, 50), expected_very_high)

    # Fractional values, for both the counts and the threshold.
    fractional_counts = np.array([19.5, -5.1, 0])
    expected_fractional = np.array([
        0.0620410301, 0.0875807131, 0.113451958, 0.139561671, 0.1657074530,
        0.1908244840, 0.2070270720, 0.207027072, 0.169718100, 0.0575152142,
        0.00678695871
    ] + [0] * 6 + [0.000536304908, 0.0172181073, 0.041909870] + [0] * 10)
    self._assert_all_close(
        bounds_for(fractional_counts, 10.1), expected_fractional)
# Run the test cases defined above when this file is executed directly.
if __name__ == "__main__":
  unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment