Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
86906935
Commit
86906935
authored
Jan 14, 2019
by
Nicolas Papernot
Committed by
Ilya Mironov
Jan 14, 2019
Browse files
remove all code related to differential privacy (#6045)
parent
d32d957a
Changes
31
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
0 additions
and
2864 deletions
+0
-2864
research/differential_privacy/pate/ICLR2018/plot_partition.py
...arch/differential_privacy/pate/ICLR2018/plot_partition.py
+0
-397
research/differential_privacy/pate/ICLR2018/plots_for_slides.py
...ch/differential_privacy/pate/ICLR2018/plots_for_slides.py
+0
-283
research/differential_privacy/pate/ICLR2018/rdp_bucketized.py
...arch/differential_privacy/pate/ICLR2018/rdp_bucketized.py
+0
-263
research/differential_privacy/pate/ICLR2018/rdp_cumulative.py
...arch/differential_privacy/pate/ICLR2018/rdp_cumulative.py
+0
-378
research/differential_privacy/pate/ICLR2018/smooth_sensitivity_table.py
...rential_privacy/pate/ICLR2018/smooth_sensitivity_table.py
+0
-358
research/differential_privacy/pate/ICLR2018/utility_queries_answered.py
...rential_privacy/pate/ICLR2018/utility_queries_answered.py
+0
-75
research/differential_privacy/pate/README.md
research/differential_privacy/pate/README.md
+0
-71
research/differential_privacy/pate/core.py
research/differential_privacy/pate/core.py
+0
-370
research/differential_privacy/pate/core_test.py
research/differential_privacy/pate/core_test.py
+0
-124
research/differential_privacy/pate/smooth_sensitivity.py
research/differential_privacy/pate/smooth_sensitivity.py
+0
-419
research/differential_privacy/pate/smooth_sensitivity_test.py
...arch/differential_privacy/pate/smooth_sensitivity_test.py
+0
-126
No files found.
research/differential_privacy/pate/ICLR2018/plot_partition.py
deleted
100644 → 0
View file @
d32d957a
"""Produces two plots. One compares aggregators and their analyses. The other
illustrates sources of privacy loss for Confident-GNMax.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of two files.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
os
import
pickle
import
sys
sys
.
path
.
append
(
'..'
)
# Main modules reside in the parent directory.
from
absl
import
app
from
absl
import
flags
from
collections
import
namedtuple
import
matplotlib
matplotlib
.
use
(
'TkAgg'
)
import
matplotlib.pyplot
as
plt
# pylint: disable=g-import-not-at-top
import
numpy
as
np
import
core
as
pate
import
smooth_sensitivity
as
pate_ss
plt
.
style
.
use
(
'ggplot'
)
# Command-line flags (absl). Their values are read through FLAGS by the
# functions below.
FLAGS = flags.FLAGS

flags.DEFINE_boolean('cache', False,
                     'Read results of privacy analysis from cache.')
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).')
flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).')
flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).')
flags.DEFINE_integer('queries', None, 'Number of queries made by the student.')
flags.DEFINE_float('delta', 1e-8, 'Target delta.')

# These flags default to None and must be supplied on the command line.
flags.mark_flag_as_required('counts_file')
flags.mark_flag_as_required('threshold')
flags.mark_flag_as_required('sigma1')
flags.mark_flag_as_required('sigma2')

# Per-query record of the four additive terms of the privacy cost; the
# data-dependent analysis reports sum(partition) as the total epsilon.
Partition = namedtuple('Partition', ['step1', 'step2', 'ss', 'delta'])
def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
  """Accumulates the data-independent privacy cost query by query.

  For every row of votes the cumulative RDP curve is advanced with the
  data-independent Gaussian bounds and converted to an epsilon for the given
  delta. If threshold or sigma1 is None the selection step is skipped and
  each query is counted as answered with probability one.

  Args:
    votes: A matrix of votes, one query per row.
    threshold: Threshold of the selection step, or None.
    sigma1: Noise scale of the selection step, or None.
    sigma2: Noise scale of the argmax step.
    delta: Target delta handed to pate.compute_eps_from_delta.

  Returns:
    A pair (eps_cum, answered) of arrays with one entry per query: the
    cumulative epsilon and the cumulative expected number of answers.
  """
  orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
  num_queries = votes.shape[0]
  selection_enabled = threshold is not None and sigma1 is not None

  eps_cum = np.full(num_queries, np.nan, dtype=float)
  answered = np.zeros(num_queries)
  rdp_total = np.zeros(len(orders))
  answered_total = 0

  for query in range(num_queries):
    row = votes[query,]

    if selection_enabled:
      q_step1 = np.exp(pate.compute_logpr_answered(threshold, sigma1, row))
      rdp_total += pate.rdp_data_independent_gaussian(sigma1, orders)
    else:
      # Without a selection step every query is answered.
      q_step1 = 1.

    answered_total += q_step1
    answered[query] = answered_total

    rdp_total += q_step1 * pate.rdp_data_independent_gaussian(sigma2, orders)

    eps, order_opt = pate.compute_eps_from_delta(orders, rdp_total, delta)
    eps_cum[query] = eps

    # Progress report after every 1000 processed queries.
    processed = query + 1
    if processed % 1000 == 0:
      print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
            'at order = {:.2f}.'.format(processed, answered[query],
                                        eps_cum[query], order_opt))
      sys.stdout.flush()

  return eps_cum, answered
def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta):
  """Accumulates the data-dependent privacy cost query by query.

  For every row of votes the function advances the cumulative RDP of the
  selection step (step 1) and of the argmax step (step 2), adds the RDP cost
  of the smooth-sensitivity noise, and picks the order that
  pate.compute_eps_from_delta reports as optimal. If threshold or sigma1 is
  None, step 1 is skipped and every query is treated as answered.

  Args:
    votes: A matrix of votes, one query per row, one class per column.
    threshold: Threshold of the selection step, or None.
    sigma1: Noise scale of the selection step, or None.
    sigma2: Noise scale of the argmax step.
    delta: Target delta.

  Returns:
    A 4-tuple of arrays with one entry per query:
    eps_partitioned (Partition records), answered (cumulative expected number
    of answered queries), ss_std_opt (ss * sigma_ss at the optimal order) and
    order_opt (the optimal order).
  """
  # Short list of orders.
  # orders = np.round(np.logspace(np.log10(20), np.log10(200), num=20))

  # Long list of orders.
  orders = np.concatenate((np.arange(20, 40, .2),
                           np.arange(40, 75, .5),
                           np.logspace(np.log10(75), np.log10(200), num=20)))

  n = votes.shape[0]
  num_classes = votes.shape[1]
  # The teacher count is taken from the first row only; presumably every row
  # sums to the same total -- TODO confirm against the data.
  num_teachers = int(sum(votes[0,]))

  if threshold is not None and sigma1 is not None:
    is_data_ind_step1 = pate.is_data_independent_always_opt_gaussian(
        num_teachers, num_classes, sigma1, orders)
  else:
    is_data_ind_step1 = [True] * len(orders)

  is_data_ind_step2 = pate.is_data_independent_always_opt_gaussian(
      num_teachers, num_classes, sigma2, orders)

  # Per-query outputs; np.full(..., None, dtype=float) starts them as NaN.
  eps_partitioned = np.full(n, None, dtype=Partition)
  order_opt = np.full(n, None, dtype=float)
  ss_std_opt = np.full(n, None, dtype=float)
  answered = np.zeros(n)

  # Running totals, one entry (or row) per order.
  rdp_step1_total = np.zeros(len(orders))
  rdp_step2_total = np.zeros(len(orders))

  ls_total = np.zeros((len(orders), num_teachers))
  answered_total = 0

  for i in range(n):
    v = votes[i,]

    if threshold is not None and sigma1 is not None:
      logq_step1 = pate.compute_logpr_answered(threshold, sigma1, v)
      rdp_step1_total += pate.compute_rdp_threshold(logq_step1, sigma1, orders)
    else:
      logq_step1 = 0.  # always answer

    pr_answered = np.exp(logq_step1)
    logq_step2 = pate.compute_logq_gaussian(v, sigma2)
    # Step 2 only costs privacy when the query is answered, hence the weight.
    rdp_step2_total += pr_answered * pate.rdp_gaussian(logq_step2, sigma2,
                                                       orders)

    answered_total += pr_answered

    # Smooth-sensitivity cost and noise std, recomputed for every query.
    rdp_ss = np.zeros(len(orders))
    ss_std = np.zeros(len(orders))

    for j, order in enumerate(orders):
      # Orders flagged as data-independent contribute zero local sensitivity.
      if not is_data_ind_step1[j]:
        ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
            v, num_teachers, threshold, sigma1, order)
      else:
        ls_step1 = np.full(num_teachers, 0, dtype=float)

      if not is_data_ind_step2[j]:
        ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
            v, num_teachers, sigma2, order)
      else:
        ls_step2 = np.full(num_teachers, 0, dtype=float)

      ls_total[j,] += ls_step1 + pr_answered * ls_step2

      beta_ss = .49 / order
      ss = pate_ss.compute_discounted_max(beta_ss, ls_total[j,])
      sigma_ss = ((order * math.exp(2 * beta_ss)) / ss)**(1 / 3)
      rdp_ss[j] = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
          beta_ss, sigma_ss, order)
      ss_std[j] = ss * sigma_ss

    rdp_total = rdp_step1_total + rdp_step2_total + rdp_ss
    answered[i] = answered_total
    _, order_opt[i] = pate.compute_eps_from_delta(orders, rdp_total, delta)
    order_idx = np.searchsorted(orders, order_opt[i])

    # Since optimal orders are always non-increasing, shrink orders array
    # and all cumulative arrays to speed up computation.
    # rdp_ss and ss_std keep this iteration's (pre-shrink) length, so
    # order_idx remains a valid index into them below.
    if order_idx < len(orders):
      orders = orders[:order_idx + 1]
      rdp_step1_total = rdp_step1_total[:order_idx + 1]
      rdp_step2_total = rdp_step2_total[:order_idx + 1]

    eps_partitioned[i] = Partition(step1=rdp_step1_total[order_idx],
                                   step2=rdp_step2_total[order_idx],
                                   ss=rdp_ss[order_idx],
                                   delta=-math.log(delta) / (order_opt[i] - 1))
    ss_std_opt[i] = ss_std[order_idx]
    # NOTE(review): (i + 1) % 1 is always 0, so this prints for every query
    # after the first; the data-independent analysis reports every 1000
    # queries -- confirm whether per-query output is intended.
    if i > 0 and (i + 1) % 1 == 0:
      print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} '
            'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, '
            'step2 = {:.3f}, ss = {:.3f}'.format(
                i + 1, answered[i], sum(eps_partitioned[i]), ss_std_opt[i],
                order_opt[i], eps_partitioned[i].delta,
                eps_partitioned[i].step1, eps_partitioned[i].step2,
                eps_partitioned[i].ss))
      sys.stdout.flush()

  return eps_partitioned, answered, ss_std_opt, order_opt
def plot_comparison(figures_dir, simple_ind, conf_ind, simple_dep, conf_dep):
  """Plots epsilon against answered queries for four GNMax analyses.

  Args:
    figures_dir: Directory where comparison.pdf is written.
    simple_ind: (eps, answered) pair for the analysis without selection,
      data-independent.
    conf_ind: (eps, answered) pair for the analysis with selection,
      data-independent.
    simple_dep: 4-tuple whose first two items are the partitioned eps and the
      answered counts, data-dependent analysis without selection.
    conf_dep: Same layout as simple_dep, with selection.
  """
  max_answered = 10000
  grid = range(0, max_answered, 10)

  def eps_on_grid(eps, answered):
    # For each grid point, takes eps at the first query whose cumulative
    # answered count reaches it; points never reached stay NaN.
    curve = np.full(len(grid), np.nan, dtype=float)
    for pos, target in enumerate(grid):
      hit = np.searchsorted(answered, target)
      if hit >= len(eps):
        continue
      curve[pos] = eps[hit]
    return curve

  def eps_on_grid_dep(data_dep):
    # Collapses each Partition record to its total before resampling.
    partitions, answered, _, _ = data_dep
    totals = [sum(part) for part in partitions]
    return eps_on_grid(totals, answered)

  # (curve, line style, colour, legend label), in drawing order.
  series = (
      (eps_on_grid(*simple_ind), '--', 'r',
       r'Simple GNMax, data-ind analysis'),
      (eps_on_grid(*conf_ind), '--', 'b',
       r'Confident GNMax, data-ind analysis'),
      (eps_on_grid_dep(simple_dep), '-', 'r',
       r'Simple GNMax, data-dep analysis'),
      (eps_on_grid_dep(conf_dep), '-', 'b',
       r'Confident GNMax, data-dep analysis'),
  )

  figure, axes = plt.subplots()
  figure.set_figheight(4.5)
  figure.set_figwidth(4.7)

  for curve, line_style, colour, legend_label in series:
    axes.plot(grid, curve, ls=line_style, color=colour, lw=3,
              label=legend_label)

  plt.xticks(np.arange(0, max_answered + 1000, 2000))
  plt.xlim([0, max_answered])
  plt.ylim(bottom=0)
  plt.legend(fontsize=16)
  axes.set_xlabel('Number of queries answered', fontsize=16)
  axes.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$',
                  fontsize=16)
  axes.tick_params(labelsize=14)

  target_path = os.path.join(figures_dir, 'comparison.pdf')
  print('Saving the graph to ' + target_path)
  figure.savefig(target_path, bbox_inches='tight')
  plt.show()
def plot_partition(figures_dir, gnmax_conf, print_order):
  """Plots the privacy cost per answered query, split into its contributions.

  The stacked bands show, bottom to top, the delta term, step 1, step 2 and
  the smooth-sensitivity term; a red band of +/- the smooth-sensitivity noise
  std is drawn around the total.

  Args:
    figures_dir: A name of the directory where to save the plot
      (written as partition.pdf).
    gnmax_conf: A 4-tuple (eps_partitioned, answered, ss_std_opt, order_opt):
      per-query Partition records, cumulative number of queries answered,
      noise std at the optimal order, and the optimal orders.
    print_order: If true, the optimal order is drawn against a secondary
      y-axis.
  """
  eps_partitioned, answered, ss_std_opt, order_opt = gnmax_conf

  xlim = 10000
  x = range(0, int(xlim), 10)
  lenx = len(x)
  y0 = np.full(lenx, np.nan, dtype=float)  # delta
  y1 = np.full(lenx, np.nan, dtype=float)  # delta + step1
  y2 = np.full(lenx, np.nan, dtype=float)  # delta + step1 + step2
  y3 = np.full(lenx, np.nan, dtype=float)  # delta + step1 + step2 + ss
  noise_std = np.full(lenx, np.nan, dtype=float)

  y_right = np.full(lenx, np.nan, dtype=float)

  for i in range(lenx):
    # First query at which the cumulative answered count reaches x[i]; grid
    # points that are never reached keep their NaN and are not drawn.
    idx = np.searchsorted(answered, x[i])
    if idx < len(eps_partitioned):
      y0[i] = eps_partitioned[idx].delta
      y1[i] = y0[i] + eps_partitioned[idx].step1
      y2[i] = y1[i] + eps_partitioned[idx].step2
      y3[i] = y2[i] + eps_partitioned[idx].ss

      noise_std[i] = ss_std_opt[idx]
      y_right[i] = order_opt[idx]

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)
  fig.patch.set_alpha(0)

  ax.plot(x, y3, color='b', ls='-', label=r'Total privacy cost', linewidth=1)

  for y in (y0, y1, y2):
    ax.plot(x, y, color='b', ls='-', label=r'_nolegend_', alpha=.5,
            linewidth=1)

  ax.fill_between(x, [0] * lenx, y0.tolist(), facecolor='b', alpha=.5)
  ax.fill_between(x, y0.tolist(), y1.tolist(), facecolor='b', alpha=.4)
  ax.fill_between(x, y1.tolist(), y2.tolist(), facecolor='b', alpha=.3)
  ax.fill_between(x, y2.tolist(), y3.tolist(), facecolor='b', alpha=.2)

  ax.fill_between(x, (y3 - noise_std).tolist(), (y3 + noise_std).tolist(),
                  facecolor='r', alpha=.5)

  plt.xticks(np.arange(0, xlim + 1000, 2000))
  plt.xlim([0, xlim])
  ax.set_ylim([0, 3.])

  ax.set_xlabel('Number of queries answered', fontsize=16)
  ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)

  if print_order:
    ax2 = ax.twinx()
    ax2.plot(x, y_right, 'r', ls='-', label=r'Optimal order', linewidth=5,
             alpha=.5)
    ax2.grid(False)
    ax2.set_ylim([0, 200.])

  ax.tick_params(labelsize=14)
  plot_filename = os.path.join(figures_dir, 'partition.pdf')
  print('Saving the graph to ' + plot_filename)
  fig.savefig(plot_filename, bbox_inches='tight', dpi=800)
  plt.show()
def run_all_analyses(votes, threshold, sigma1, sigma2, delta):
  """Runs both analyses, each without and with the selection step.

  Args:
    votes: A matrix of votes, one query per row.
    threshold: Threshold of the selection step.
    sigma1: Noise scale of the selection step.
    sigma2: Noise scale of the argmax step.
    delta: Target delta.

  Returns:
    A 4-tuple (simple_ind, conf_ind, simple_dep, conf_dep) holding the
    results of the data-independent and data-dependent analyses; the "simple"
    variants pass None for threshold and sigma1.
  """
  analyses = (analyze_gnmax_conf_data_ind, analyze_gnmax_conf_data_dep)
  selection_settings = ((None, None), (threshold, sigma1))

  results = []
  for analyze in analyses:
    for step1_threshold, step1_sigma in selection_settings:
      results.append(
          analyze(votes, step1_threshold, step1_sigma, sigma2, delta))
  return tuple(results)
def run_or_load_all_analyses():
  """Returns the results of all analyses, from cache or freshly computed.

  With --cache set and a cache file present, the pickled results are loaded.
  Otherwise the votes are read from --counts_file, optionally truncated to
  --queries rows, analysed, and the results are written to the cache file.

  Returns:
    The tuple produced by run_all_analyses.

  Raises:
    ValueError: If --queries exceeds the number of rows in the votes file.
  """
  temp_filename = os.path.expanduser('~/tmp/partition_cached.pkl')

  if FLAGS.cache and os.path.isfile(temp_filename):
    print('Reading from cache ' + temp_filename)
    # NOTE(review): pickle executes arbitrary code on load; the cache file
    # must only ever be one written by this script. The cache is also not
    # keyed on the flag values, so results computed with other parameters
    # would be reused as-is -- confirm this is acceptable.
    with open(temp_filename, 'rb') as f:
      all_analyses = pickle.load(f)
  else:
    fin_name = os.path.expanduser(FLAGS.counts_file)
    print('Reading raw votes from ' + fin_name)
    sys.stdout.flush()

    votes = np.load(fin_name)

    if FLAGS.queries is not None:
      if votes.shape[0] < FLAGS.queries:
        raise ValueError('Expect {} rows, got {} in {}'.format(
            FLAGS.queries, votes.shape[0], fin_name))
      # Truncate the votes matrix to the number of queries made.
      votes = votes[:FLAGS.queries,]

    all_analyses = run_all_analyses(votes, FLAGS.threshold, FLAGS.sigma1,
                                    FLAGS.sigma2, FLAGS.delta)

    # Make sure the cache directory exists so that the results of the
    # analysis above are not lost to a failed open().
    cache_dir = os.path.dirname(temp_filename)
    if not os.path.isdir(cache_dir):
      os.makedirs(cache_dir)

    print('Writing to cache ' + temp_filename)
    with open(temp_filename, 'wb') as f:
      pickle.dump(all_analyses, f)

  return all_analyses
def main(argv):
  """Computes (or loads) all analyses and renders both figures."""
  del argv  # Unused.

  simple_ind, conf_ind, simple_dep, conf_dep = run_or_load_all_analyses()
  output_dir = os.path.expanduser(FLAGS.figures_dir)

  plot_comparison(output_dir, simple_ind, conf_ind, simple_dep, conf_dep)
  # The partition plot is drawn for the data-dependent analysis with
  # selection, including the optimal-order overlay.
  plot_partition(output_dir, gnmax_conf=conf_dep, print_order=True)
  plt.close('all')


if __name__ == '__main__':
  app.run(main)
research/differential_privacy/pate/ICLR2018/plots_for_slides.py
deleted
100644 → 0
View file @
d32d957a
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots graphs for the slide deck.
A script in support of the PATE2 paper. The input is a file containing a numpy
array of votes, one query per row, one class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output graphs are visualized using the TkAgg backend.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
os
import
sys
sys
.
path
.
append
(
'..'
)
# Main modules reside in the parent directory.
from
absl
import
app
from
absl
import
flags
import
matplotlib
matplotlib
.
use
(
'TkAgg'
)
import
matplotlib.pyplot
as
plt
# pylint: disable=g-import-not-at-top
import
numpy
as
np
import
core
as
pate
import
random
plt
.
style
.
use
(
'ggplot'
)
# Command-line flags (absl).
FLAGS = flags.FLAGS

flags.DEFINE_string('counts_file', None, 'Counts file.')
# NOTE(review): figures_dir is defined but never read in this script; the
# plots are only shown on screen -- confirm whether it can be dropped.
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
flags.DEFINE_boolean('transparent', False, 'Set background to transparent.')

# counts_file defaults to None and must be supplied on the command line.
flags.mark_flag_as_required('counts_file')
def setup_plot():
  """Creates a 4.7 x 4.5 figure with a single axes.

  The figure background is made fully transparent when --transparent is set.

  Returns:
    The (figure, axes) pair.
  """
  figure, axes = plt.subplots()
  figure.set_figheight(4.5)
  figure.set_figwidth(4.7)

  if FLAGS.transparent:
    figure.patch.set_alpha(0)

  return figure, axes
def plot_rdp_curve_per_example(votes, sigmas):
  """Plots RDP against order for each example and noise level.

  A data-dependent curve is drawn for every (row of votes, sigma) pair,
  followed by one data-independent curve per sigma.

  Args:
    votes: A matrix of votes, one example per row.
    sigmas: Noise scales to plot.
  """
  orders = np.linspace(1., 100., endpoint=True, num=1000)
  orders[0] = 1.001
  _, axes = setup_plot()

  for example in votes:
    for sigma in sigmas:
      logq = pate.compute_logq_gaussian(example, sigma)
      curve = pate.rdp_gaussian(logq, sigma, orders)
      legend_label = r'Data-dependent bound, $\sigma$={}'.format(int(sigma))
      axes.plot(orders, curve, alpha=1., label=legend_label, linewidth=5)

  for sigma in sigmas:
    curve = pate.rdp_data_independent_gaussian(sigma, orders)
    legend_label = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    axes.plot(orders, curve, alpha=.3, label=legend_label, linewidth=10)

  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  axes.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
def plot_rdp_of_sigma(v, order):
  """Plots the data-dependent RDP of one vote vector as a function of sigma.

  Args:
    v: A vector of votes for a single query.
    order: The order at which RDP is evaluated.
  """
  sigmas = np.linspace(1., 1000., endpoint=True, num=1000)
  fig, ax = setup_plot()

  y = np.zeros(len(sigmas))

  for i, sigma in enumerate(sigmas):
    logq = pate.compute_logq_gaussian(v, sigma)
    y[i] = pate.rdp_gaussian(logq, sigma, order)

  ax.plot(sigmas, y, alpha=.8, linewidth=5)

  plt.xlim(xmin=1, xmax=1000)
  plt.ylim(ymin=0)
  # plt.yticks([0, .0004, .0008, .0012])
  # Hide the y tick labels. This must be a boolean: the string 'off' is
  # truthy, so matplotlib versions that no longer translate it would keep
  # the labels visible.
  ax.tick_params(labelleft=False)
  plt.xlabel(r'Noise $\sigma$', fontsize=16)
  plt.ylabel(r'RDP at order $\alpha={}$'.format(order), fontsize=16)
  ax.tick_params(labelsize=14)

  # plt.legend(loc=0, fontsize=13)
  plt.show()
def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders,
                      target_answered):
  """Accumulates data-dependent RDP until enough queries are answered.

  Each query contributes its step-2 RDP weighted by the probability that the
  selection step answers it; accumulation stops once the expected number of
  answered queries reaches target_answered.

  Args:
    votes: An iterable of vote vectors, one per query.
    threshold: Threshold of the selection step.
    sigma1: Noise scale of the selection step.
    sigma2: Noise scale of the argmax step.
    orders: The orders at which RDP is evaluated.
    target_answered: Expected number of answered queries to reach.

  Returns:
    The cumulative RDP, one value per order.

  Raises:
    AssertionError: If votes is exhausted before target_answered is reached.
  """
  rdp_cum = np.zeros(len(orders))

  answered = 0
  for i, v in enumerate(votes):
    v = sorted(v, reverse=True)
    q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
    logq_step2 = pate.compute_logq_gaussian(v, sigma2)
    rdp = pate.rdp_gaussian(logq_step2, sigma2, orders)
    rdp_cum += q_step1 * rdp

    answered += q_step1
    if answered >= target_answered:
      # i is zero-based, so i + 1 queries have been processed at this point.
      print('Processed {} queries to answer {}.'.format(i + 1,
                                                        target_answered))
      return rdp_cum

  # Raised explicitly rather than via `assert False`, which is removed under
  # `python -O` and would let the function silently return None.
  raise AssertionError(
      'Never reached {} answered queries.'.format(target_answered))
def plot_rdp_total(votes, sigmas):
  """Plots the cumulative data-dependent RDP curve for each sigma.

  Every curve is accumulated with compute_rdp_curve until 2000 queries are
  expected to be answered, using threshold 5000 and selection noise 1000.

  Args:
    votes: A matrix of votes, one query per row.
    sigmas: Noise scales of the argmax step to plot.
  """
  orders = np.linspace(1., 100., endpoint=True, num=100)
  orders[0] = 1.1

  _, axes = setup_plot()

  selection_threshold = 5000
  selection_sigma = 1000
  target_answered = 2000

  for sigma in sigmas:
    curve = compute_rdp_curve(votes, selection_threshold, selection_sigma,
                              sigma, orders, target_answered)
    legend_label = r'Data-dependent bound, $\sigma$={}'.format(int(sigma))
    axes.plot(orders, curve, alpha=.8, label=legend_label, linewidth=5)

  # The matching data-independent curves are deliberately not drawn here.

  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0005, .001, .0015, .002])

  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  axes.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
def plot_data_ind_curve():
  """Plots the data-independent RDP curve for sigma = 1 over orders 1 to 10."""
  _, axes = setup_plot()

  orders = np.linspace(1., 10., endpoint=True, num=1000)
  orders[0] = 1.01

  curve = pate.rdp_data_independent_gaussian(1., orders)
  axes.plot(orders, curve, alpha=.5, color='gray', linewidth=10)

  plt.xlim(xmin=1, xmax=10)
  plt.ylim(ymin=0)
  plt.xticks([1, 3, 5, 7, 9])
  axes.tick_params(labelsize=14)
  plt.show()
def plot_two_data_ind_curves():
  """Plots the data-independent RDP curves for sigma = 100 and sigma = 150."""
  orders = np.linspace(1., 100., endpoint=True, num=1000)
  orders[0] = 1.001

  _, axes = setup_plot()

  for sigma in [100, 150]:
    curve = pate.rdp_data_independent_gaussian(sigma, orders)
    legend_label = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    axes.plot(orders, curve, alpha=.3, label=legend_label, linewidth=10)

  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  axes.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
def scatter_plot(votes, threshold, sigma1, sigma2, order):
  """Scatter-plots per-query RDP against the largest vote count.

  Each query is kept with the probability that the selection step answers it
  (always kept when threshold or sigma1 is None); kept queries are plotted
  at (max vote count, data-dependent RDP at the given order).

  Args:
    votes: An iterable of vote vectors, one per query.
    threshold: Threshold of the selection step, or None.
    sigma1: Noise scale of the selection step, or None.
    sigma2: Noise scale of the argmax step.
    order: The order at which RDP is evaluated.
  """
  _, axes = setup_plot()
  selection_enabled = threshold is not None and sigma1 is not None

  max_counts = []
  rdp_values = []
  for row in votes:
    if selection_enabled:
      q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, row))
    else:
      q_step1 = 1.

    # Simulate the selection step: one random draw per query.
    if random.random() >= q_step1:
      continue

    logq_step2 = pate.compute_logq_gaussian(row, sigma2)
    max_counts.append(max(row))
    rdp_values.append(pate.rdp_gaussian(logq_step2, sigma2, order))

  print('Selected {} queries.'.format(len(max_counts)))

  axes.set_yscale('log')
  plt.xlim(xmin=0, xmax=5000)
  plt.ylim(ymin=1e-300, ymax=1)
  plt.yticks([1, 1e-100, 1e-200, 1e-300])
  plt.scatter(max_counts, rdp_values, s=1, alpha=0.5)
  plt.ylabel(r'RDP at $\alpha={}$'.format(order), fontsize=16)
  plt.xlabel(r'max count', fontsize=16)
  axes.tick_params(labelsize=14)
  plt.show()
def main(argv):
  """Shows every slide-deck plot in sequence."""
  del argv  # Unused.

  votes_path = os.path.expanduser(FLAGS.counts_file)
  print('Reading raw votes from ' + votes_path)
  sys.stdout.flush()

  plot_data_ind_curve()
  plot_two_data_ind_curves()

  example_votes = [2550, 2200, 250]  # based on votes[2,]
  noise_levels = (100., 150.)
  plot_rdp_curve_per_example(np.array([example_votes]), noise_levels)
  plot_rdp_of_sigma(np.array(example_votes), 20.)

  # The votes file is only loaded here, after the plots that do not need it.
  votes = np.load(votes_path)

  plot_rdp_total(votes[:12000,], noise_levels)

  scatter_votes = votes[:6000,]
  scatter_plot(scatter_votes, None, None, 100, 20)  # w/o thresholding
  scatter_plot(scatter_votes, 3500, 1500, 100, 20)  # with thresholding


if __name__ == '__main__':
  app.run(main)
research/differential_privacy/pate/ICLR2018/rdp_bucketized.py
deleted
100644 → 0
View file @
d32d957a
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Illustrates how noisy thresholding check changes distribution of queries.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is one of two graphs depending on the setting of the plot variable.
The output is written to a pdf file.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
os
import
sys
sys
.
path
.
append
(
'..'
)
# Main modules reside in the parent directory.
from
absl
import
app
from
absl
import
flags
import
matplotlib
matplotlib
.
use
(
'TkAgg'
)
import
matplotlib.pyplot
as
plt
# pylint: disable=g-import-not-at-top
import
numpy
as
np
import
core
as
pate
plt
.
style
.
use
(
'ggplot'
)
# Command-line flags (absl).
FLAGS = flags.FLAGS

# The two help-string fragments are concatenated, so the first one needs its
# trailing space.
flags.DEFINE_enum('plot', 'small', ['small', 'large'], 'Selects which of '
                  'the two plots is produced.')
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('plot_file', '', 'Plot file to write.')

# counts_file defaults to None and must be supplied on the command line.
flags.mark_flag_as_required('counts_file')
def compute_count_per_bin(bin_num, votes):
  """Tabulates number of examples in each bin.

  An example is binned by the fraction of votes received by its top class;
  a unanimous example (fraction 1) is counted in the last bin.

  Args:
    bin_num: Number of bins.
    votes: A matrix of votes, where each row contains votes in one instance.

  Returns:
    Array of counts of length bin_num.
  """
  sums = np.sum(votes, axis=1)
  # Check that all rows contain the same number of votes.
  assert max(sums) == min(sums)

  s = max(sums)

  counts = np.zeros(bin_num)
  n = votes.shape[0]

  # range rather than xrange, which does not exist on Python 3.
  for i in range(n):
    v = votes[i,]
    # max(v) == s maps to bin_num, one past the last bin; clamp it so that
    # unanimous votes land in the top bin.
    bin_idx = min(int(math.floor(max(v) * bin_num / s)), bin_num - 1)
    assert 0 <= bin_idx < bin_num
    counts[bin_idx] += 1

  return counts
def compute_privacy_cost_per_bins(bin_num, votes, sigma2, order):
  """Outputs average privacy cost per bin.

  An example is binned by the fraction of votes received by its top class;
  a unanimous example (fraction 1) is counted in the last bin.

  Args:
    bin_num: Number of bins.
    votes: A matrix of votes, where each row contains votes in one instance.
    sigma2: The scale (std) of the Gaussian noise. (Same as sigma_2 in
      Algorithms 1 and 2.)
    order: The Renyi order for which privacy cost is computed.

  Returns:
    Expected eps of RDP (ignoring delta) per example in each bin. Bins that
    received no example evaluate to 0 / 0, i.e. NaN.
  """
  n = votes.shape[0]

  bin_counts = np.zeros(bin_num)
  bin_rdp = np.zeros(bin_num)  # RDP at order=order

  # range rather than xrange, which does not exist on Python 3.
  for i in range(n):
    v = votes[i,]
    logq = pate.compute_logq_gaussian(v, sigma2)
    rdp_at_order = pate.rdp_gaussian(logq, sigma2, order)

    # max(v) == sum(v) maps to bin_num, one past the last bin; clamp it so
    # that unanimous votes land in the top bin.
    bin_idx = min(int(math.floor(max(v) * bin_num / sum(v))), bin_num - 1)
    assert 0 <= bin_idx < bin_num
    bin_counts[bin_idx] += 1
    bin_rdp[bin_idx] += rdp_at_order
    if (i + 1) % 1000 == 0:
      print('example {}'.format(i + 1))
      sys.stdout.flush()

  return bin_rdp / bin_counts
def compute_expected_answered_per_bin(bin_num, votes, threshold, sigma1):
  """Computes expected number of answers per bin.

  An example is binned by the fraction of votes received by its top class;
  a unanimous example (fraction 1) is counted in the last bin.

  Args:
    bin_num: Number of bins.
    votes: A matrix of votes, where each row contains votes in one instance.
    threshold: The threshold against which check is performed.
    sigma1: The std of the Gaussian noise with which check is performed. (Same
      as sigma_1 in Algorithms 1 and 2.)

  Returns:
    Expected number of queries answered per bin.
  """
  n = votes.shape[0]

  bin_answered = np.zeros(bin_num)

  # range rather than xrange, which does not exist on Python 3.
  for i in range(n):
    v = votes[i,]
    p = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
    # max(v) == sum(v) maps to bin_num, one past the last bin; clamp it so
    # that unanimous votes land in the top bin.
    bin_idx = min(int(math.floor(max(v) * bin_num / sum(v))), bin_num - 1)
    assert 0 <= bin_idx < bin_num
    bin_answered[bin_idx] += p
    if (i + 1) % 1000 == 0:
      print('example {}'.format(i + 1))
      sys.stdout.flush()

  return bin_answered
def main(argv):
  """Draws the noisy-thresholding histogram selected by --plot.

  Reads the votes from --counts_file, keeps the first 4000 rows, bins the
  queries by the share of teachers that agree, and saves the bar chart to
  --plot_file. The 'large' variant additionally shows a second threshold and
  the per-query privacy cost on a logarithmic secondary axis.
  """
  del argv  # Unused.
  fin_name = os.path.expanduser(FLAGS.counts_file)
  print('Reading raw votes from ' + fin_name)
  sys.stdout.flush()

  votes = np.load(fin_name)
  votes = votes[:4000,]  # truncate to 4000 samples

  if FLAGS.plot == 'small':
    bin_num = 5
    m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
  elif FLAGS.plot == 'large':
    bin_num = 10
    m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
    a_check = compute_expected_answered_per_bin(bin_num, votes, 5000, 1500)
    eps = compute_privacy_cost_per_bins(bin_num, votes, 100, 50)
  else:
    raise ValueError('--plot flag must be one of ["small", "large"]')

  counts = compute_count_per_bin(bin_num, votes)
  # Left edges of the bins on a 0-100 percentage axis.
  bins = np.linspace(0, 100, num=bin_num, endpoint=False)

  plt.close('all')
  fig, ax = plt.subplots()
  if FLAGS.plot == 'small':
    fig.set_figheight(5)
    fig.set_figwidth(5)
    ax.bar(
        bins,
        counts,
        20,
        color='orangered',
        linestyle='dotted',
        linewidth=5,
        edgecolor='red',
        fill=False,
        alpha=.5,
        align='edge',
        label='LNMax answers')
    ax.bar(
        bins,
        m_check,
        20,
        color='g',
        alpha=.5,
        linewidth=0,
        edgecolor='g',
        align='edge',
        label='Confident-GNMax\nanswers')
  elif FLAGS.plot == 'large':
    fig.set_figheight(4.7)
    fig.set_figwidth(7)
    ax.bar(
        bins,
        counts,
        10,
        linestyle='dashed',
        linewidth=5,
        edgecolor='red',
        fill=False,
        alpha=.5,
        align='edge',
        label='LNMax answers')
    ax.bar(
        bins,
        m_check,
        10,
        color='g',
        alpha=.5,
        linewidth=0,
        edgecolor='g',
        align='edge',
        label='Confident-GNMax\nanswers (moderate)')
    ax.bar(
        bins,
        a_check,
        10,
        color='b',
        alpha=.5,
        align='edge',
        label='Confident-GNMax\nanswers (aggressive)')
    ax2 = ax.twinx()
    bin_centers = [x + 5 for x in bins]
    ax2.plot(bin_centers, eps, 'ko', alpha=.8)
    ax2.set_ylim([1e-200, 1.])
    ax2.set_yscale('log')
    ax2.grid(False)
    ax2.set_yticks([1e-3, 1e-50, 1e-100, 1e-150, 1e-200])
    # Hide the minor ticks on the right. This must be a boolean: the string
    # 'off' is truthy, so matplotlib versions that no longer translate it
    # would keep the ticks visible.
    plt.tick_params(which='minor', right=False)
    ax2.set_ylabel(r'Per query privacy cost $\varepsilon$', fontsize=16)

  plt.xlim([0, 100])
  ax.set_ylim([0, 2500])
  # ax.set_yscale('log')
  ax.set_xlabel('Percentage of teachers that agree', fontsize=16)
  ax.set_ylabel('Number of queries answered', fontsize=16)
  vals = ax.get_xticks()
  ax.set_xticklabels([str(int(x)) + '%' for x in vals])
  ax.tick_params(labelsize=14, bottom=True, top=True, left=True, right=True)
  ax.legend(loc=2, prop={'size': 16})

  # simple: 'figures/noisy_thresholding_check_perf.pdf')
  # detailed: 'figures/noisy_thresholding_check_perf_details.pdf'

  print('Saving the graph to ' + FLAGS.plot_file)
  plt.savefig(os.path.expanduser(FLAGS.plot_file), bbox_inches='tight')
  plt.show()


if __name__ == '__main__':
  app.run(main)
research/differential_privacy/pate/ICLR2018/rdp_cumulative.py
deleted
100644 → 0
View file @
d32d957a
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots three graphs illustrating cost of privacy per answered query.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of three pdf files.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
os
import
pickle
import
sys
sys
.
path
.
append
(
'..'
)
# Main modules reside in the parent directory.
from
absl
import
app
from
absl
import
flags
import
matplotlib
matplotlib
.
use
(
'TkAgg'
)
import
matplotlib.pyplot
as
plt
# pylint: disable=g-import-not-at-top
import
numpy
as
np
import
core
as
pate
plt
.
style
.
use
(
'ggplot'
)
# Command-line flags of this script; main() reads them through FLAGS.
FLAGS = flags.FLAGS
# When set, main() reuses previously pickled analysis results if they exist.
flags.DEFINE_boolean('cache', False,
                     'Read results of privacy analysis from cache.')
# Path of the votes matrix; main() loads it with np.load.
flags.DEFINE_string('counts_file', None, 'Counts file.')
# Directory that receives the generated pdf figures.
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
# counts_file defaults to None, so the script cannot run without it.
flags.mark_flag_as_required('counts_file')
def run_analysis(votes, mechanism, noise_scale, params):
  """Computes data-dependent privacy.

  Queries are processed in row order; after each one the cumulative RDP curve
  is converted to an (eps, delta)-guarantee with delta fixed at 1e-8. A
  progress line is printed after every 1000 queries.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf')
    noise_scale: A mechanism privacy parameter.
    params: Other privacy parameters. Only read for 'gnmax_conf', in which
      case it must provide the keys 't' and 'sigma1'.

  Returns:
    Four lists: cumulative privacy cost epsilon, how privacy budget is split,
    how many queries were answered, optimal order.

  Raises:
    ValueError: If mechanism is not one of the supported names (detected when
      the first row is processed).
  """

  def compute_partition(order_opt, eps):
    """Returns the fraction of eps contributed by each cost component."""
    order_opt_idx = np.searchsorted(orders, order_opt)
    if mechanism == 'gnmax_conf':
      # (selection step, answering step, contribution from delta).
      p = (rdp_select_cum[order_opt_idx],
           rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx],
           -math.log(delta) / (order_opt - 1))
    else:
      # (mechanism cost, contribution from delta).
      p = (rdp_cum[order_opt_idx], -math.log(delta) / (order_opt - 1))
    return [x / eps for x in p]  # Ensures that sum(x) == 1

  # Short list of orders.
  # orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
  #                  np.logspace(np.log10(50), np.log10(1000), num=20))))

  # Long list of orders.
  orders = np.concatenate((np.arange(2, 100 + 1, .5),
                           np.logspace(np.log10(100), np.log10(500), num=100)))
  delta = 1e-8

  n = votes.shape[0]
  eps_total = np.zeros(n)
  partition = [None] * n
  order_opt = np.full(n, np.nan, dtype=float)
  answered = np.zeros(n, dtype=float)

  # Running sums over the queries seen so far, one entry per order.
  rdp_cum = np.zeros(len(orders))
  rdp_sqrd_cum = np.zeros(len(orders))
  rdp_select_cum = np.zeros(len(orders))
  answered_sum = 0

  for i in range(n):
    v = votes[i,]
    if mechanism == 'lnmax':
      logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
      rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
      rdp_sqrd = rdp_query**2
      pr_answered = 1
    elif mechanism == 'gnmax':
      logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
      rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
      rdp_sqrd = rdp_query**2
      pr_answered = 1
    elif mechanism == 'gnmax_conf':
      logq_step1 = pate.compute_logpr_answered(params['t'], params['sigma1'], v)
      logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
      q_step1 = np.exp(logq_step1)
      # Use the smaller of log(q) and log(1 - q) for the selection step.
      logq_step1_min = min(logq_step1, math.log1p(-q_step1))
      rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min,
                                          2**.5 * params['sigma1'], orders)
      rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale, orders)
      rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
      # The expression below evaluates
      #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
      rdp_sqrd = (
          rdp_gnmax_step1**2 + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2
          + q_step1 * rdp_gnmax_step2**2)
      rdp_select_cum += rdp_gnmax_step1
      pr_answered = q_step1
    else:
      raise ValueError(
          'Mechanism must be one of ["lnmax", "gnmax", "gnmax_conf"]')

    rdp_cum += rdp_query
    rdp_sqrd_cum += rdp_sqrd
    answered_sum += pr_answered

    answered[i] = answered_sum
    eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
        orders, rdp_cum, delta)
    partition[i] = compute_partition(order_opt[i], eps_total[i])

    if i > 0 and (i + 1) % 1000 == 0:
      # The running sums cover i + 1 queries, so that is the sample count of
      # the (population) variance estimate.
      num_queries = i + 1
      rdp_var = rdp_sqrd_cum / num_queries - (
          rdp_cum / num_queries)**2  # Ignore Bessel's correction.
      order_opt_idx = np.searchsorted(orders, order_opt[i])
      eps_std = (num_queries * rdp_var[order_opt_idx])**.5  # Std of the sum.
      print(
          'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
          'at order = {:.2f} (contribution from delta = {:.3f})'.format(
              i + 1, answered_sum, eps_total[i], eps_std, order_opt[i],
              -math.log(delta) / (order_opt[i] - 1)))
      sys.stdout.flush()

  return eps_total, partition, answered, order_opt
def print_plot_small(figures_dir, eps_lap, eps_gnmax, answered_gnmax):
  """Plots the privacy cost of LNMax against Confident-GNMax.

  The x axis (number of queries answered) is sampled every 10 queries on
  [0, 6000). The figure is saved as 'lnmax_vs_gnmax.pdf' and then shown.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism.
    eps_gnmax: The cumulative privacy costs of the Gaussian mechanism
    answered_gnmax: The cumulative count of queries answered.
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  num_points = len(x_axis)

  y_lap = np.zeros(num_points, dtype=float)
  # Points past the end of the GNMax data stay NaN.
  y_gnmax = np.full(num_points, np.nan, dtype=float)
  for pos, num_answered in enumerate(x_axis):
    y_lap[pos] = eps_lap[num_answered]
    # Map an answered-queries count back to an index into eps_gnmax.
    query_idx = np.searchsorted(answered_gnmax, num_answered)
    if query_idx < len(eps_gnmax):
      y_gnmax[pos] = eps_gnmax[query_idx]

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)

  # Both curves share the same transparency and width.
  curve_style = {'alpha': .5, 'linewidth': 5}
  ax.plot(x_axis, y_lap, color='r', ls='--', label='LNMax', **curve_style)
  ax.plot(
      x_axis, y_gnmax, color='g', ls='-', label='Confident-GNMax',
      **curve_style)

  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, 6000])
  plt.ylim([0, 6.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)

  fout_name = os.path.join(figures_dir, 'lnmax_vs_gnmax.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
def print_plot_large(figures_dir, eps_lap, eps_gnmax1, answered_gnmax1,
                     eps_gnmax2, partition_gnmax2, answered_gnmax2):
  """Plots LNMax against Confident-GNMax under two parameter sets.

  For the second parameter set the area under the curve is split into the
  part attributed to the first entry of the partition (hatched) and the rest.
  The figure is saved as 'lnmax_vs_2xgnmax_large.pdf' and then shown.

  Args:
    figures_dir: A name of the directory where to save the plot.
    eps_lap: The cumulative privacy costs of the Laplace mechanism.
    eps_gnmax1: The cumulative privacy costs of the Gaussian mechanism (set 1).
    answered_gnmax1: The cumulative count of queries answered (set 1).
    eps_gnmax2: The cumulative privacy costs of the Gaussian mechanism (set 2).
    partition_gnmax2: Allocation of eps for set 2.
    answered_gnmax2: The cumulative count of queries answered (set 2).
  """
  xlim = 6000
  x_axis = range(0, int(xlim), 10)
  lenx = len(x_axis)

  y_lap = np.zeros(lenx)
  # Points past the end of the available data stay NaN.
  y_gnmax1 = np.full(lenx, np.nan, dtype=float)
  y_gnmax2 = np.full(lenx, np.nan, dtype=float)
  y1_gnmax2 = np.full(lenx, np.nan, dtype=float)

  for pos, num_answered in enumerate(x_axis):
    y_lap[pos] = eps_lap[num_answered]

    first_idx = np.searchsorted(answered_gnmax1, num_answered)
    if first_idx < len(eps_gnmax1):
      y_gnmax1[pos] = eps_gnmax1[first_idx]

    second_idx = np.searchsorted(answered_gnmax2, num_answered)
    if second_idx < len(eps_gnmax2):
      eps_at_idx = eps_gnmax2[second_idx]
      y_gnmax2[pos] = eps_at_idx
      # The third partition entry is ignored; the first two are renormalized
      # to find the share of eps due to the first one.
      fraction_step1, fraction_step2, _ = partition_gnmax2[second_idx]
      y1_gnmax2[pos] = eps_at_idx * fraction_step1 / (
          fraction_step1 + fraction_step2)

  fig, ax = plt.subplots()
  fig.set_figheight(4.5)
  fig.set_figwidth(4.7)

  thick = {'alpha': .5, 'linewidth': 5}
  ax.plot(x_axis, y_lap, color='r', ls='dashed', label='LNMax', **thick)
  ax.plot(
      x_axis, y_gnmax1, color='g', ls='-', label='Confident-GNMax (moderate)',
      **thick)
  ax.plot(
      x_axis, y_gnmax2, color='b', ls='-',
      label='Confident-GNMax (aggressive)', **thick)

  # Hatched band from zero up to the first-component curve.
  ax.fill_between(
      x_axis, [0] * lenx, y1_gnmax2.tolist(), facecolor='b', alpha=.3,
      hatch='\\')
  ax.plot(
      x_axis, y1_gnmax2, color='b', ls='-', label='_nolegend_', alpha=.5,
      linewidth=1)
  # Plain band between the first-component curve and the total.
  ax.fill_between(
      x_axis, y1_gnmax2.tolist(), y_gnmax2.tolist(), facecolor='b', alpha=.3)

  plt.xticks(np.arange(0, 7000, 1000))
  plt.xlim([0, xlim])
  plt.ylim([0, 1.])
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
  plt.legend(loc=2, fontsize=13)  # loc=2 -- upper left
  ax.tick_params(labelsize=14)

  fout_name = os.path.join(figures_dir, 'lnmax_vs_2xgnmax_large.pdf')
  print('Saving the graph to ' + fout_name)
  fig.savefig(fout_name, bbox_inches='tight')
  plt.show()
def run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2):
  """Runs the Laplace analysis, then one Confident-GNMax analysis per setting.

  Args:
    votes: A matrix of votes, where each row contains votes in one instance.
    lambda_laplace: The scale of the Laplace noise (lambda).
    gnmax_parameters: A list of parameters for GNMax.
    sigma2: Shared parameter for the GNMax mechanisms.

  Returns:
    A tuple of five items: the Laplace epsilons, followed by four lists
    (epsilons, partitions, answered counts, optimal orders), each holding one
    entry per element of gnmax_parameters.
  """
  print('=== Laplace Mechanism ===')
  # Only the cumulative epsilons of the Laplace run are kept.
  eps_lap = run_analysis(votes, 'lnmax', lambda_laplace, None)[0]
  print()

  # Does not go anywhere, for now
  # print('=== Gaussian Mechanism (simple) ===')
  # eps, _, _, _ = run_analysis(votes[:n,], 'gnmax', sigma1, None)

  eps_gnmax = []
  partition_gmax = []
  answered = []
  order_opt = []
  for setting in gnmax_parameters:
    print('=== Gaussian Mechanism (confident) {}: ==='.format(setting))
    eps, split, num_answered, best_order = run_analysis(
        votes, 'gnmax_conf', sigma2, setting)
    eps_gnmax.append(eps)
    partition_gmax.append(split)
    answered.append(num_answered)
    order_opt.append(best_order)
    print()

  return eps_lap, eps_gnmax, partition_gmax, answered, order_opt
def main(argv):
  """Computes (or loads cached) privacy analyses and draws both figures."""
  del argv  # Unused.

  lambda_laplace = 50.  # corresponds to eps = 1. / lambda_laplace

  # Parameters of the GNMax; they differ only in Step 1 (selection).
  gnmax_parameters = (
      {'t': 1000, 'sigma1': 500},
      {'t': 3500, 'sigma1': 1500},
      {'t': 5000, 'sigma1': 1500},
  )
  sigma2 = 100

  # NOTE(review): the cache is a pickle at a fixed path under /tmp. Unpickling
  # a file that another user could have planted can execute arbitrary code;
  # consider a private cache location before relying on --cache.
  ftemp_name = '/tmp/precomputed.pkl'

  figures_dir = os.path.expanduser(FLAGS.figures_dir)

  cache_hit = FLAGS.cache and os.path.isfile(ftemp_name)
  if not cache_hit:
    fin_name = os.path.expanduser(FLAGS.counts_file)
    print('Reading raw votes from ' + fin_name)
    sys.stdout.flush()

    votes = np.load(fin_name)

    (eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
     orders_opt_gnmax) = run_all_analyses(votes, lambda_laplace,
                                          gnmax_parameters, sigma2)

    # The results are written out even when --cache was not requested.
    print('Writing to cache ' + ftemp_name)
    with open(ftemp_name, 'wb') as f:
      pickle.dump((eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
                   orders_opt_gnmax), f)
  else:
    print('Reading from cache ' + ftemp_name)
    with open(ftemp_name, 'rb') as f:
      (eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
       orders_opt_gnmax) = pickle.load(f)

  # The small plot uses the first GNMax setting; the large one the other two.
  print_plot_small(figures_dir, eps_lap, eps_gnmax[0], answered_gnmax[0])
  print_plot_large(figures_dir, eps_lap, eps_gnmax[1], answered_gnmax[1],
                   eps_gnmax[2], partition_gmax[2], answered_gnmax[2])
  plt.close('all')
# Run main through absl's app.run only when this file is executed as a script,
# so importing the module has no side effect beyond its definitions.
if __name__ == '__main__':
  app.run(main)
research/differential_privacy/pate/ICLR2018/smooth_sensitivity_table.py
deleted
100644 → 0
View file @
d32d957a
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Performs privacy analysis of GNMax with smooth sensitivity.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
Several flavors of the GNMax algorithm can be analyzed.
- Plain GNMax (argmax w/ Gaussian noise) is assumed when arguments threshold
and sigma2 are missing.
- Confident GNMax (thresholding + argmax w/ Gaussian noise) is used when
threshold, sigma1, and sigma2 are given.
- Interactive GNMax (two- or multi-round) is triggered by specifying
baseline_file, which provides baseline values for votes selection in Step 1.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
os
import
sys
sys
.
path
.
append
(
'..'
)
# Main modules reside in the parent directory.
from
absl
import
app
from
absl
import
flags
import
numpy
as
np
import
core
as
pate
import
smooth_sensitivity
as
pate_ss
# Command-line flags of this script; main() reads them through FLAGS.
FLAGS = flags.FLAGS
# Input files, both loaded with np.load. baseline_file is optional.
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('baseline_file', None, 'File with baseline scores.')
flags.DEFINE_boolean('data_independent', False,
                     'Force data-independent bounds.')
# main() requires threshold and sigma1 to be given together or not at all.
flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).')
flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).')
flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).')
# When set, only the first `queries` rows of the input are analyzed.
flags.DEFINE_integer('queries', None, 'Number of queries made by the student.')
flags.DEFINE_float('delta', 1e-8, 'Target delta.')
flags.DEFINE_float(
    'order', None,
    'Fixes a Renyi DP order (if unspecified, finds an optimal order from a '
    'hardcoded list).')
flags.DEFINE_integer(
    'teachers', None,
    'Number of teachers (if unspecified, derived from the counts file).')
# Both flags default to None, so the script cannot run without them.
flags.mark_flag_as_required('counts_file')
flags.mark_flag_as_required('sigma2')
def _check_conditions(sigma, num_classes, orders):
  """Verifies conditions C5 and C6 for every order and reports failures.

  The actual check is delegated to pate_ss.check_conditions, one order at a
  time. For a failing order only the first violated condition is printed.

  Args:
    sigma: Noise parameter forwarded to the check.
    num_classes: Number of classes forwarded to the check.
    orders: An iterable of orders to verify.

  Returns:
    A truthy value iff both conditions hold for all orders.
  """
  print('Checking conditions C5 and C6 for all orders.')
  sys.stdout.flush()

  all_ok = True
  for alpha in orders:
    c5_ok, c6_ok = pate_ss.check_conditions(sigma, num_classes, alpha)
    all_ok &= c5_ok and c6_ok
    if not c5_ok:
      failure = 'Condition C5 does not hold for order ='
    elif not c6_ok:
      failure = 'Condition C6 does not hold for order ='
    else:
      continue
    print(failure, alpha)

  if all_ok:
    print('Conditions C5-C6 hold for all orders.')
  sys.stdout.flush()
  return all_ok
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
  """Computes the (data-dependent) RDP curve for Confident GNMax.

  The per-query RDP costs are accumulated over all rows of votes. A progress
  line is printed after every 1000 queries and after the last one.

  Args:
    votes: A matrix of votes, one query per row.
    baseline: A matrix indexed like votes; row i is subtracted from the votes
      of query i before thresholding. Only read when threshold is not None.
    threshold: Threshold of step 1, or None to skip thresholding (every query
      then proceeds to step 2 at no step-1 cost).
    sigma1: Noise parameter of step 1 (selection).
    sigma2: Noise parameter of step 2 (argmax).
    delta: Target delta used to convert RDP into epsilon.
    orders: An array of Renyi orders.
    data_ind: If true, use data-independent bounds for both steps.

  Returns:
    The order that minimizes epsilon for the cumulative RDP curve, as reported
    by pate.compute_eps_from_delta.
  """
  rdp_cum = np.zeros(len(orders))
  rdp_sqrd_cum = np.zeros(len(orders))
  answered = 0

  for i, v in enumerate(votes):
    if threshold is None:
      logq_step1 = 0  # No thresholding, always proceed to step 2.
      rdp_step1 = np.zeros(len(orders))
    else:
      logq_step1 = pate.compute_logpr_answered(threshold, sigma1,
                                               v - baseline[i,])
      if data_ind:
        rdp_step1 = pate.compute_rdp_data_independent_threshold(sigma1, orders)
      else:
        rdp_step1 = pate.compute_rdp_threshold(logq_step1, sigma1, orders)

    if data_ind:
      rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
    else:
      logq_step2 = pate.compute_logq_gaussian(v, sigma2)
      rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

    q_step1 = np.exp(logq_step1)
    rdp = rdp_step1 + rdp_step2 * q_step1
    # The expression below evaluates
    #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
    rdp_sqrd = (
        rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
        q_step1 * rdp_step2**2)
    rdp_sqrd_cum += rdp_sqrd
    rdp_cum += rdp
    answered += q_step1
    if ((i + 1) % 1000 == 0) or (i == votes.shape[0] - 1):
      # The running sums cover i + 1 queries. Dividing by that count (rather
      # than by i) also avoids a division by zero when the report fires on
      # the very first row, i.e. for a single-row input.
      num_queries = i + 1
      rdp_var = rdp_sqrd_cum / num_queries - (
          rdp_cum / num_queries)**2  # Ignore Bessel's correction.
      eps_total, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
      order_opt_idx = np.searchsorted(orders, order_opt)
      eps_std = (num_queries * rdp_var[order_opt_idx])**.5  # Std of the sum.
      print(
          'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
          'at order = {:.2f} (contribution from delta = {:.3f})'.format(
              i + 1, answered, eps_total, eps_std, order_opt,
              -math.log(delta) / (order_opt - 1)))
      sys.stdout.flush()

  _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)

  return order_opt
def _find_optimal_smooth_sensitivity_parameters(
    votes, baseline, num_teachers, threshold, sigma1, sigma2, delta, ind_step1,
    ind_step2, order):
  """Optimizes smooth sensitivity parameters by minimizing a cost function.

  The cost function is
        exact_eps + cost of GNSS + two stds of noise,
  which captures the upper bound of the confidence interval of the sanitized
  privacy budget.

  Since optimization is done with full view of sensitive data, the results
  cannot be released.

  The parameters are re-optimized after every query over a fixed grid of beta
  values derived from order; a progress line is printed after every 100
  queries and after the last one.

  Args:
    votes: A matrix of votes, one query per row.
    baseline: A matrix indexed like votes; row i is subtracted from the votes
      of query i before thresholding. Only read when threshold is not None.
    num_teachers: Number of teachers; also the length of the all-zero local
      sensitivity arrays used for data-independent steps.
    threshold: Threshold of step 1, or None to skip thresholding.
    sigma1: Noise parameter of step 1 (selection).
    sigma2: Noise parameter of step 2 (GNMax).
    delta: Target delta; only used for the reported epsilon.
    ind_step1: If true, apply the data-independent bound to step 1.
    ind_step2: If true, apply the data-independent bound to step 2.
    order: A single Renyi order at which all costs are evaluated.

  Returns:
    A tuple (beta_opt, ss_opt, sigma_ss_opt) for the last query processed.
    NOTE(review): these names are first assigned inside the loop, so an empty
    votes matrix would presumably fail at the return statement -- confirm that
    callers never pass one.
  """
  rdp_cum = 0
  answered_cum = 0
  ls_cum = 0

  # Define a plausible range for the beta values.
  betas = np.arange(.3 / order, .495 / order, .01 / order)
  # Additive term that turns RDP at this order into an epsilon for delta.
  cost_delta = math.log(1 / delta) / (order - 1)

  for i, v in enumerate(votes):
    if threshold is None:
      # No thresholding: the query is always answered and step 1 is free.
      log_pr_answered = 0
      rdp1 = 0
      ls_step1 = np.zeros(num_teachers)
    else:
      log_pr_answered = pate.compute_logpr_answered(threshold, sigma1,
                                                    v - baseline[i,])
      if ind_step1:  # apply data-independent bound for step 1 (thresholding).
        rdp1 = pate.compute_rdp_data_independent_threshold(sigma1, order)
        ls_step1 = np.zeros(num_teachers)
      else:
        rdp1 = pate.compute_rdp_threshold(log_pr_answered, sigma1, order)
        ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
            v - baseline[i,], num_teachers, threshold, sigma1, order)

    pr_answered = math.exp(log_pr_answered)
    answered_cum += pr_answered

    if ind_step2:  # apply data-independent bound for step 2 (GNMax).
      rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
      ls_step2 = np.zeros(num_teachers)
    else:
      logq_step2 = pate.compute_logq_gaussian(v, sigma2)
      rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
      # Compute smooth sensitivity.
      ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
          v, num_teachers, sigma2, order)

    # Step 2 only contributes when the query is answered, hence the weighting
    # by pr_answered in both running sums.
    rdp_cum += rdp1 + pr_answered * rdp2
    ls_cum += ls_step1 + pr_answered * ls_step2  # Expected local sensitivity.

    if ind_step1 and ind_step2:
      # Data-independent bounds.
      cost_opt, beta_opt, ss_opt, sigma_ss_opt = None, 0., 0., np.inf
    else:
      # Data-dependent bounds.
      cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None

      # Grid search: keep the beta with the lowest total cost.
      for beta in betas:
        ss = pate_ss.compute_discounted_max(beta, ls_cum)

        # Solution to the minimization problem:
        #   min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
        sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1 / 3)
        cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
            beta, sigma_ss, order)

        # Cost captures exact_eps + cost of releasing SS + two stds of noise.
        cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
        if cost < cost_opt:
          cost_opt, beta_opt, ss_opt, sigma_ss_opt = cost, beta, ss, sigma_ss

    if ((i + 1) % 100 == 0) or (i == votes.shape[0] - 1):
      eps_before_ss = rdp_cum + cost_delta
      eps_with_ss = (
          eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
              beta_opt, sigma_ss_opt, order))
      print('{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
            '{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'.
            format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
                   ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
      sys.stdout.flush()

  # Return optimal parameters for the last iteration.
  return beta_opt, ss_opt, sigma_ss_opt
####################
# HELPER FUNCTIONS #
####################
def _load_votes(counts_file, baseline_file, queries):
  """Loads the votes matrix and its baseline, optionally truncating both.

  Args:
    counts_file: Path of the votes matrix ('~' is expanded).
    baseline_file: Path of a baseline matrix of the same shape, or None to use
      an all-zero baseline.
    queries: Number of leading rows to keep, or None to keep all rows.

  Returns:
    A pair (votes, baseline) of arrays with the same shape.

  Raises:
    ValueError: If the two files differ in shape, or if the votes matrix has
      fewer rows than queries.
  """
  votes_path = os.path.expanduser(counts_file)
  print('Reading raw votes from ' + votes_path)
  sys.stdout.flush()

  votes = np.load(votes_path)
  print('Shape of the votes matrix = {}'.format(votes.shape))

  if baseline_file is None:
    baseline = np.zeros_like(votes)
  else:
    baseline_path = os.path.expanduser(baseline_file)
    print('Reading baseline values from ' + baseline_path)
    sys.stdout.flush()
    baseline = np.load(baseline_path)
    if votes.shape != baseline.shape:
      raise ValueError(
          'Counts file and baseline file must have the same shape. Got {} and '
          '{} instead.'.format(votes.shape, baseline.shape))

  if queries is None:
    print('Process all {} input rows. (Use --queries flag to truncate.)'.format(
        votes.shape[0]))
    return votes, baseline

  num_rows = votes.shape[0]
  if num_rows < queries:
    raise ValueError('Expect {} rows, got {} in {}'.format(
        queries, num_rows, counts_file))
  # Truncate the votes matrix to the number of queries made.
  return votes[:queries,], baseline[:queries,]
def _count_teachers(votes):
  """Derives the number of teachers as the common row sum of votes.

  Args:
    votes: A matrix of votes, one query per row.

  Returns:
    The largest row sum, truncated to an int.

  Raises:
    ValueError: If the smallest row sum differs from the returned count.
  """
  row_totals = np.sum(votes, axis=1)
  teachers = int(max(row_totals))
  if min(row_totals) == teachers:
    return teachers

  raise ValueError(
      'Matrix of votes is malformed: the number of votes is not the same '
      'across rows.')
def _is_data_ind_step1(num_teachers, threshold, sigma1, orders):
  """Tells whether step 1 (thresholding) is treated as data-independent.

  Without a threshold there is no step 1, so the answer is True. Otherwise
  the question is delegated to pate and must hold for all given orders.
  """
  if threshold is not None:
    per_order = pate.is_data_independent_always_opt_threshold(
        num_teachers, threshold, sigma1, orders)
    return np.all(per_order)
  return True
def _is_data_ind_step2(num_teachers, num_classes, sigma, orders):
  """Tells whether step 2 (GNMax) is treated as data-independent.

  The question is delegated to pate and must hold for all given orders.
  """
  per_order = pate.is_data_independent_always_opt_gaussian(
      num_teachers, num_classes, sigma, orders)
  return np.all(per_order)
def main(argv):
  """Runs the RDP analysis and, if needed, the smooth sensitivity analysis.

  Raises:
    ValueError: If exactly one of --threshold and --sigma1 is given.
  """
  del argv  # Unused.

  has_threshold = FLAGS.threshold is not None
  has_sigma1 = FLAGS.sigma1 is not None
  if has_threshold != has_sigma1:
    raise ValueError(
        '--threshold flag and --sigma1 flag must be present or absent '
        'simultaneously.')

  if FLAGS.order is not None:
    # A single order fixed by the user.
    orders = np.array([FLAGS.order])
  else:
    # Long list of orders.
    fine_grid = np.arange(2, 100 + 1, .5)
    log_grid = np.logspace(np.log10(100), np.log10(500), num=100)
    orders = np.concatenate((fine_grid, log_grid))
    # Short list of orders.
    # orders = np.round(
    #     np.concatenate((np.arange(2, 50 + 1, 1),
    #                     np.logspace(np.log10(50), np.log10(1000), num=20))))

  votes, baseline = _load_votes(FLAGS.counts_file, FLAGS.baseline_file,
                                FLAGS.queries)

  num_teachers = (
      _count_teachers(votes) if FLAGS.teachers is None else FLAGS.teachers)
  num_classes = votes.shape[1]

  # The remaining analysis is carried out at the optimal order only.
  order = _compute_rdp(votes, baseline, FLAGS.threshold, FLAGS.sigma1,
                       FLAGS.sigma2, FLAGS.delta, orders,
                       FLAGS.data_independent)

  ind_step1 = _is_data_ind_step1(num_teachers, FLAGS.threshold, FLAGS.sigma1,
                                 order)
  ind_step2 = _is_data_ind_step2(num_teachers, num_classes, FLAGS.sigma2,
                                 order)

  if FLAGS.data_independent or (ind_step1 and ind_step2):
    print('Nothing to do here, all analyses are data-independent.')
    return

  conditions_hold = _check_conditions(FLAGS.sigma2, num_classes, [order])
  if not conditions_hold:
    return  # Quit early: sufficient conditions for correctness fail to hold.

  optimum = _find_optimal_smooth_sensitivity_parameters(
      votes, baseline, num_teachers, FLAGS.threshold, FLAGS.sigma1,
      FLAGS.sigma2, FLAGS.delta, ind_step1, ind_step2, order)
  beta_opt, ss_opt, sigma_ss_opt = optimum

  print('Optimal beta = {:.4f}, E[SS_beta] = {:.4}, sigma_ss = {:.2f}'.format(
      beta_opt, ss_opt, sigma_ss_opt))
# Run main through absl's app.run only when this file is executed as a script,
# so importing the module has no side effect beyond its definitions.
if __name__ == '__main__':
  app.run(main)
research/differential_privacy/pate/ICLR2018/utility_queries_answered.py
deleted
100644 → 0
View file @
d32d957a
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
absl
import
app
from
absl
import
flags
import
matplotlib
import
os
matplotlib
.
use
(
'TkAgg'
)
import
matplotlib.pyplot
as
plt
plt
.
style
.
use
(
'ggplot'
)
# Command-line flags of this script; main() reads plot_file through FLAGS and
# passes it (after '~' expansion) to plt.savefig.
FLAGS = flags.FLAGS
flags.DEFINE_string('plot_file', '', 'Output file name.')
# Numbers of queries answered for the LNMax data points (x values of the LNMax
# curve drawn by main). range() is wrapped in list() because on Python 3 it is
# not a list and `list + range` raises TypeError.
qa_lnmax = [500, 750] + list(range(1000, 12500, 500))
# Hard-coded data series. Each qa_* list holds numbers of queries answered and
# the acc_* list of the same suffix holds the matching accuracies; main()
# plots them as x and y values under the axis label
# 'Student test accuracy (%)'.

# Accuracies for the LNMax curve (paired with qa_lnmax).
acc_lnmax = [
    43.3, 52.3, 59.8, 66.7, 68.8, 70.5, 71.6, 72.3, 72.6, 72.9, 73.4, 73.4,
    73.7, 73.9, 74.2, 74.4, 74.5, 74.7, 74.8, 75, 75.1, 75.1, 75.4, 75.4, 75.4
]

# Confident-GNMax curve.
qa_gnmax = [
    456, 683, 908, 1353, 1818, 2260, 2702, 3153, 3602, 4055, 4511, 4964, 5422,
    5875, 6332, 6792, 7244, 7696, 8146, 8599, 9041, 9496, 9945, 10390, 10842
]

acc_gnmax = [
    39.6, 52.2, 59.6, 66.6, 69.6, 70.5, 71.8, 72, 72.7, 72.9, 73.3, 73.4, 73.4,
    73.8, 74, 74.2, 74.4, 74.5, 74.5, 74.7, 74.8, 75, 75.1, 75.1, 75.4
]

# Confident-GNMax (aggressive) curve; only referenced by a commented-out plot
# call in main().
qa_gnmax_aggressive = [
    167, 258, 322, 485, 647, 800, 967, 1133, 1282, 1430, 1573, 1728, 1889,
    2028, 2190, 2348, 2510, 2668, 2950, 3098, 3265, 3413, 3581, 3730
]

acc_gnmax_aggressive = [
    17.8, 26.8, 39.3, 48, 55.7, 61, 62.8, 64.8, 65.4, 66.7, 66.2, 68.3, 68.3,
    68.7, 69.1, 70, 70.2, 70.5, 70.9, 70.7, 71.3, 71.3, 71.3, 71.8
]
def main(argv):
  """Plots student accuracy against the number of queries answered.

  Draws the LNMax and Confident-GNMax curves, annotates four points with
  their epsilon values, saves the figure to --plot_file and shows it.
  """
  del argv  # Unused.

  plt.close('all')
  fig, ax = plt.subplots()
  fig.set_figheight(4.7)
  fig.set_figwidth(5)

  # Both curves share width, marker and transparency.
  curve_style = {'linewidth': 5., 'marker': 'o', 'alpha': .5}
  ax.plot(
      qa_lnmax, acc_lnmax, color='r', ls='--', label='LNMax', **curve_style)
  ax.plot(
      qa_gnmax, acc_gnmax, color='g', ls='-', label='Confident-GNMax',
      **curve_style)
  # ax.plot(qa_gnmax_aggressive, acc_gnmax_aggressive, color='b', ls='-', marker='o', alpha=.5, label='Confident-GNMax (aggressive)')

  plt.xticks([0, 2000, 4000, 6000])
  plt.xlim([0, 6000])
  # ax.set_yscale('log')
  plt.ylim([65, 76])
  ax.tick_params(labelsize=14)
  plt.xlabel('Number of queries answered', fontsize=16)
  plt.ylabel('Student test accuracy (%)', fontsize=16)
  plt.legend(loc=2, prop={'size': 16})

  # Epsilon labels: position, value and color are matched by index.
  x = [400, 2116, 4600, 4680]
  y = [69.5, 68.5, 74, 72.5]
  annotations = [0.76, 2.89, 1.42, 5.76]
  color_annotations = ['g', 'r', 'g', 'r']
  for x_pos, y_pos, txt, text_color in zip(x, y, annotations,
                                           color_annotations):
    ax.annotate(
        r'${\varepsilon=}$' + str(txt), (x_pos, y_pos),
        fontsize=16,
        color=text_color)

  plot_filename = os.path.expanduser(FLAGS.plot_file)
  plt.savefig(plot_filename, bbox_inches='tight')
  plt.show()
# Run main through absl's app.run only when this file is executed as a script,
# so importing the module has no side effect beyond its definitions.
if __name__ == '__main__':
  app.run(main)
research/differential_privacy/pate/README.md
deleted
100644 → 0
View file @
d32d957a
Implementation of an RDP privacy accountant and smooth sensitivity analysis for
the PATE framework. The underlying theory and supporting experiments appear in
"Scalable Private Learning with PATE" by Nicolas Papernot, Shuang Song, Ilya
Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar Erlingsson (ICLR 2018,
https://arxiv.org/abs/1802.08908).
## Overview
The PATE ('Private Aggregation of Teacher Ensembles') framework was introduced
by Papernot et al. in "Semi-supervised Knowledge Transfer for Deep Learning from
Private Training Data" (ICLR 2017, https://arxiv.org/abs/1610.05755). The
framework enables model-agnostic training that provably provides
[
differential
privacy
](
https://en.wikipedia.org/wiki/Differential_privacy
)
of the training
dataset.
The framework consists of _teachers_, the _student_ model, and the _aggregator_. The
teachers are models trained on disjoint subsets of the training datasets. The student
model has access to an insensitive (e.g., public) unlabelled dataset, which is labelled by
interacting with the ensemble of teachers via the _aggregator_. The aggregator tallies
outputs of the teacher models, and either forwards a (noisy) aggregate to the student, or
refuses to answer.
Differential privacy is enforced by the aggregator. The privacy guarantees can be _data-independent_,
which means that they are solely the function of the aggregator's parameters. Alternatively, privacy
analysis can be _data-dependent_, which allows for finer reasoning where, under certain conditions on
the input distribution, the final privacy guarantees can be improved relative to the data-independent
analysis. Data-dependent privacy guarantees may themselves be a function of sensitive data, and
therefore publishing these guarantees requires its own sanitization procedure. In our case,
sanitization of data-dependent privacy guarantees proceeds via _smooth sensitivity_ analysis.
The common machinery used for all privacy analyses in this repository is
Rényi differential privacy, or RDP (see https://arxiv.org/abs/1702.07476).
This repository contains implementations of privacy accountants and smooth
sensitivity analysis for several data-independent and data-dependent mechanisms that together
comprise the PATE framework.
### Requirements
*
Python, version
≥
2.7
*
absl (see
[
here
](
https://github.com/abseil/abseil-py
)
, or just type
`pip install absl-py`
)
*
numpy
*
scipy
*
sympy (for smooth sensitivity analysis)
*
unittest (for testing)
### Self-testing
To verify the installation run
```
bash
$
python core_test.py
$
python smooth_sensitivity_test.py
```
## Files in this directory
*
core.py
—
RDP privacy accountant for several vote aggregators (GNMax,
Threshold, Laplace).
*
smooth_sensitivity.py
—
Smooth sensitivity analysis for GNMax and
Threshold mechanisms.
*
core_test.py and smooth_sensitivity_test.py
—
Unit tests for the
files above.
## Contact information
You may direct your comments to mironov@google.com and PR to @ilyamironov.
research/differential_privacy/pate/core.py
deleted
100644 → 0
View file @
d32d957a
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Core functions for RDP analysis in PATE framework.
This library comprises the core functions for doing differentially private
analysis of the PATE architecture and its various Noisy Max and other
mechanisms.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
from
absl
import
app
import
numpy
as
np
import
scipy.stats
def _logaddexp(x):
  """Addition in the log space. Analogue of numpy.logaddexp for a list.

  Computes log(sum(exp(x))) by factoring out the largest element, so that the
  exponentials cannot overflow.

  Args:
    x: A non-empty one-dimensional sequence (list or numpy array) of
      log-values.

  Returns:
    log(sum(exp(x))). If the largest element is infinite it is returned as is
    (-inf when every term is log(0), +inf when some term is +inf).
  """
  # Accept plain lists as the docstring promises; a no-op for numpy arrays.
  x = np.asarray(x)
  m = max(x)
  if np.isinf(m):
    # Shifting by an infinite maximum would evaluate inf - inf = nan.
    return m
  return m + math.log(sum(np.exp(x - m)))
def _log1mexp(x):
  """Evaluates log(1 - exp(x)) for x <= 0 in a numerically stable way.

  Two equivalent formulas are used, switching at x = -1; x == 0 maps to -inf.

  Raises:
    ValueError: If x is not a non-positive number.
  """
  if x < -1:
    # exp(x) is small here, so log1p keeps the precision.
    return math.log1p(-math.exp(x))
  if x < 0:
    # exp(x) is close to 1 here; expm1 avoids the cancellation in 1 - exp(x).
    return math.log(-math.expm1(x))
  if x == 0:
    return -np.inf
  raise ValueError("Argument must be non-positive.")
def compute_eps_from_delta(orders, rdp, delta):
  """Converts an RDP curve into the best (eps, delta)-DP guarantee.

  For each order the epsilon is rdp - log(delta) / (order - 1); the smallest
  one is returned together with the order that attains it.

  Args:
    orders: A list of orders.
    rdp: A list of RDP guarantees (of the same length as orders).
    delta: Target delta.

  Returns:
    Pair of (eps, optimal_order).

  Raises:
    ValueError: If input is malformed.
  """
  if len(orders) != len(rdp):
    raise ValueError("Input lists must have the same length.")

  delta_term = math.log(delta) / (np.array(orders) - 1)
  eps_per_order = np.array(rdp) - delta_term
  best = np.argmin(eps_per_order)
  return eps_per_order[best], orders[best]
#####################
# RDP FOR THE GNMAX #
#####################
def compute_logq_gaussian(counts, sigma):
    """Returns an upper bound on ln Pr[outcome != argmax] for GNMax.

    Implementation of Proposition 7.

    Args:
      counts: A numpy array (or other array_like) of scores.
      sigma: The standard deviation of the Gaussian noise in the GNMax
        mechanism.

    Returns:
      logq: Natural log of the probability that outcome is different from
        argmax. -inf when there is only one class, since the outcome is then
        always the argmax.
    """
    counts = np.asarray(counts)
    n = len(counts)
    if n == 1:
        # No competing class: both the union bound (empty sum) and the
        # log(1 - 1/n) cap below would otherwise fail.
        return -np.inf

    variance = sigma**2
    idx_max = np.argmax(counts)
    counts_normalized = counts[idx_max] - counts
    counts_rest = counts_normalized[np.arange(n) != idx_max]  # exclude one index

    # Upper bound q via a union bound rather than a more precise calculation.
    logq = _logaddexp(
        scipy.stats.norm.logsf(counts_rest, scale=math.sqrt(2 * variance)))

    # A sketch of a more accurate estimate, which is currently disabled for two
    # reasons:
    # 1. Numerical instability;
    # 2. Not covered by smooth sensitivity analysis.
    # covariance = variance * (np.ones((n - 1, n - 1)) + np.identity(n - 1))
    # logq = np.log1p(-statsmodels.sandbox.distributions.extras.mvnormcdf(
    #     counts_rest, np.zeros(n - 1), covariance, maxpts=1e4))

    # Cap the bound at log(1 - 1/n).
    return min(logq, math.log(1 - (1 / n)))
def rdp_data_independent_gaussian(sigma, orders):
    """Computes a data-independent RDP curve for GNMax.

    Implementation of Proposition 8.

    Args:
      sigma: Standard deviation of Gaussian noise.
      orders: An array_like list of Renyi orders (a scalar is also accepted).

    Returns:
      Upper bound on RDP for all orders. A scalar if orders is a scalar.

    Raises:
      ValueError: If the input is malformed.
    """
    # Vectorize first: comparing a plain Python list with `<= 1` raises
    # TypeError instead of performing the element-wise check.
    orders_vec = np.atleast_1d(orders)
    if sigma < 0 or np.any(orders_vec <= 1):  # not defined for alpha=1
        raise ValueError("Inputs are malformed.")

    variance = sigma**2
    if np.isscalar(orders):
        return orders / variance
    else:
        return orders_vec / variance
def rdp_gaussian(logq, sigma, orders):
    """Bounds RDP from above of GNMax given an upper bound on q (Theorem 6).

    Args:
      logq: Natural logarithm of the probability of a non-argmax outcome.
      sigma: Standard deviation of Gaussian noise.
      orders: An array_like list of Renyi orders (a scalar is also accepted).

    Returns:
      Upper bound on RDP for all orders. A scalar if orders is a scalar.

    Raises:
      ValueError: If the input is malformed.
    """
    # Vectorize once and use orders_vec throughout, so that plain Python lists
    # work in the comparisons and arithmetic below.
    orders_vec = np.atleast_1d(orders)

    if logq > 0 or sigma < 0 or np.any(orders_vec <= 1):  # not defined for alpha=1
        raise ValueError("Inputs are malformed.")

    if np.isneginf(logq):  # If the mechanism's output is fixed, it has 0-DP.
        if np.isscalar(orders):
            return 0.
        else:
            # The builtin float replaces the removed np.float alias.
            return np.full_like(orders_vec, 0., dtype=float)

    variance = sigma**2

    # Use two different higher orders: mu_hi1 and mu_hi2 computed according to
    # Proposition 10.
    mu_hi2 = math.sqrt(variance * -logq)
    mu_hi1 = mu_hi2 + 1

    ret = orders_vec / variance  # baseline: data-independent bound

    # Filter out entries where data-dependent bound does not apply.
    mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1)

    rdp_hi1 = mu_hi1 / variance
    rdp_hi2 = mu_hi2 / variance

    log_a2 = (mu_hi2 - 1) * rdp_hi2

    # Make sure q is in the increasing wrt q range and A is positive.
    # np.any(mask) comes first on purpose: it implies mu_hi2 > 1, so the
    # divisions by (mu_hi1 - 1) and (mu_hi2 - 1) are safe when evaluated.
    if (np.any(mask) and
            logq <= log_a2 - mu_hi2 * (math.log(1 + 1 / (mu_hi1 - 1)) +
                                       math.log(1 + 1 / (mu_hi2 - 1))) and
            -logq > rdp_hi2):
        # Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when
        # x ~ 0.
        log1q = _log1mexp(logq)  # log1q = log(1-q)
        log_a = (orders_vec - 1) * (
            log1q - _log1mexp((logq + rdp_hi2) * (1 - 1 / mu_hi2)))
        log_b = (orders_vec - 1) * (rdp_hi1 - logq / (mu_hi1 - 1))

        # Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large
        # x, y.
        log_s = np.logaddexp(log1q + log_a, logq + log_b)
        ret[mask] = np.minimum(ret, log_s / (orders_vec - 1))[mask]

    # Internal sanity check: an RDP bound can never be negative.
    assert np.all(ret >= 0)

    if np.isscalar(orders):
        # ndarray.item() replaces the removed np.asscalar.
        return ret.item()
    else:
        return ret
def is_data_independent_always_opt_gaussian(num_teachers, num_classes, sigma,
                                            orders):
    """Tests whether data-ind bound is always optimal for GNMax.

    The data-dependent bound is evaluated on a unanimous vote (all teachers
    voting for a single class) and compared with the data-independent bound.

    Args:
      num_teachers: Number of teachers.
      num_classes: Number of classes.
      sigma: Standard deviation of the Gaussian noise.
      orders: An array_like list of Renyi orders.

    Returns:
      Boolean array of length |orders| (a scalar if orders is a scalar). True
      if the data-independent bound is always the same as the data-dependent
      bound.
    """
    # Every teacher votes for class 0; all other classes receive no votes.
    votes = [num_teachers] + [0] * (num_classes - 1)
    logq_unanimous = compute_logq_gaussian(np.array(votes), sigma)

    data_dependent = rdp_gaussian(logq_unanimous, sigma, orders)
    data_independent = rdp_data_independent_gaussian(sigma, orders)
    return np.isclose(data_dependent, data_independent)
###################################
# RDP FOR THE THRESHOLD MECHANISM #
###################################
def compute_logpr_answered(t, sigma, counts):
    """Computes log of the probability that a noisy threshold is crossed.

    Args:
      t: The threshold.
      sigma: The stdev of the Gaussian noise added to the threshold.
      counts: An array of votes.

    Returns:
      Natural log of the probability that max is larger than a noisy
      threshold.
    """
    # Compared to the paper, max(counts) is rounded to the nearest integer.
    # This is done to facilitate computation of smooth sensitivity for the case
    # of the interactive mechanism, where votes are not necessarily integer.
    top_votes = round(max(counts))
    margin = t - top_votes
    return scipy.stats.norm.logsf(margin, scale=sigma)
def compute_rdp_data_independent_threshold(sigma, orders):
    """Computes the data-independent RDP curve of the threshold mechanism.

    Args:
      sigma: The stdev of the Gaussian noise added to the threshold.
      orders: An array_like list of Renyi orders.

    Returns:
      The result of rdp_data_independent_gaussian evaluated at the rescaled
      noise level sqrt(2) * sigma.
    """
    # The input to the threshold mechanism has stability 1, compared to
    # GNMax, which has stability = 2. Hence the sqrt(2) factor below.
    rescaled_sigma = 2 ** 0.5 * sigma
    return rdp_data_independent_gaussian(rescaled_sigma, orders)
def compute_rdp_threshold(log_pr_answered, sigma, orders):
    """Computes the data-dependent RDP curve of the threshold mechanism.

    Args:
      log_pr_answered: Natural log of the probability that the query is
        answered (e.g. the value returned by compute_logpr_answered).
      sigma: The stdev of the Gaussian noise added to the threshold.
      orders: An array_like list of Renyi orders.

    Returns:
      The result of rdp_gaussian for the less likely of the two outcomes
      (answered / not answered) at noise level sqrt(2) * sigma.
    """
    # The mechanism has two outcomes; q is the probability of the rarer one.
    log_pr_not_answered = _log1mexp(log_pr_answered)
    logq = min(log_pr_answered, log_pr_not_answered)
    # The input to the threshold mechanism has stability 1, compared to
    # GNMax, which has stability = 2. Hence the sqrt(2) factor below.
    rescaled_sigma = 2 ** 0.5 * sigma
    return rdp_gaussian(logq, rescaled_sigma, orders)
def is_data_independent_always_opt_threshold(num_teachers, threshold, sigma,
                                             orders):
    """Tests whether data-ind bound is always optimal for the threshold mechanism.

    Args:
      num_teachers: Number of teachers.
      threshold: The cut-off threshold.
      sigma: Standard deviation of the Gaussian noise.
      orders: An array_like list of Renyi orders.

    Returns:
      Boolean array of length |orders| (a scalar if orders is a scalar). True
      if the data-independent bound is always the same as the data-dependent
      bound.
    """
    # Since the data-dependent bound depends only on max(votes), it suffices to
    # check whether the data-dependent bounds are better than data-independent
    # bounds in the extreme cases when max(votes) is minimal or maximal.
    # For both Confident GNMax and Interactive GNMax it holds that
    #   0 <= max(votes) <= num_teachers.
    # The upper bound is trivial in both cases.
    # The lower bound is trivial for Confident GNMax (and a stronger one, based
    # on the pigeonhole principle, is possible).
    # For Interactive GNMax (Algorithm 2), the lower bound follows from the
    # following argument. Since the votes vector is the difference between the
    # actual teachers' votes and the student's baseline, we need to argue that
    #   max(n_j - M * p_j) >= 0.
    # The bound holds because sum_j n_j = sum M * p_j = M. Thus,
    #   sum_j (n_j - M * p_j) = 0, and max_j (n_j - M * p_j) >= 0 as needed.
    logq1 = compute_logpr_answered(threshold, sigma, [0])
    logq2 = compute_logpr_answered(threshold, sigma, [num_teachers])

    rdp_dep1 = compute_rdp_threshold(logq1, sigma, orders)
    rdp_dep2 = compute_rdp_threshold(logq2, sigma, orders)

    rdp_ind = compute_rdp_data_independent_threshold(sigma, orders)
    # Element-wise AND: the Python `and` operator raises "truth value of an
    # array is ambiguous" as soon as orders has more than one element.
    return np.logical_and(
        np.isclose(rdp_dep1, rdp_ind), np.isclose(rdp_dep2, rdp_ind))
#############################
# RDP FOR THE LAPLACE NOISE #
#############################
def compute_logq_laplace(counts, lmbd):
    """Computes an upper bound on log Pr[outcome != argmax] for LNMax.

    Args:
      counts: A list (or numpy array) of scores.
      lmbd: The lambda parameter of the Laplace distribution
        ~exp(-|x| / lambda).

    Returns:
      logq: Natural log of the probability that outcome is different from
        argmax. -inf when there is only one class, since the outcome is then
        always the argmax.
    """
    # For noisy max, we only get an upper bound via the union bound. See Lemma
    # 4 in https://arxiv.org/abs/1610.05755.
    #
    # Pr[j beats i*] = (2 + gap(j, i*)) / (4 * exp(gap(j, i*)))
    # proof at http://mathoverflow.net/questions/66763/

    # Convert up front so that plain Python lists support the arithmetic below.
    counts = np.asarray(counts)
    n = len(counts)
    if n == 1:
        # No competing class: both the union bound (empty sum) and the
        # log(1 - 1/n) cap below would otherwise fail.
        return -np.inf

    idx_max = np.argmax(counts)
    # Non-positive entries: minus the gap to the top score, in units of lmbd.
    counts_normalized = (counts - counts[idx_max]) / lmbd
    counts_rest = counts_normalized[np.arange(n) != idx_max]  # exclude one index

    logq = _logaddexp(np.log(2 - counts_rest) + math.log(.25) + counts_rest)

    # Cap the bound at log(1 - 1/n).
    return min(logq, math.log(1 - (1 / n)))
def rdp_pure_eps(logq, pure_eps, orders):
    """Computes the RDP value given logq and pure privacy eps.

    Implementation of https://arxiv.org/abs/1610.05755, Theorem 3.

    The bound used is the min of three terms. The first term is from
    https://arxiv.org/pdf/1605.02065.pdf.
    The second term is based on the fact that when an event has probability
    (1-q) for q close to zero, q can only change by exp(eps), which corresponds
    to a much smaller multiplicative change in (1-q).
    The third term comes directly from the privacy guarantee.

    Args:
      logq: Natural logarithm of the probability of a non-optimal outcome.
      pure_eps: eps parameter for DP
      orders: array_like list of moments to compute.

    Returns:
      Array of upper bounds on rdp (a scalar if orders is a scalar).
    """
    orders_vec = np.atleast_1d(orders)
    q = math.exp(logq)
    # Force a float dtype: an integer orders array cannot hold np.inf.
    log_t = np.full_like(orders_vec, np.inf, dtype=float)
    # The data-dependent term is only used for sufficiently small q; otherwise
    # log_t stays at +inf and drops out of the minimum below.
    if q <= 1 / (math.exp(pure_eps) + 1):
        logt_one = math.log1p(-q) + (
            math.log1p(-q) - _log1mexp(pure_eps + logq)) * (orders_vec - 1)
        logt_two = logq + pure_eps * (orders_vec - 1)
        log_t = np.logaddexp(logt_one, logt_two)

    ret = np.minimum(
        np.minimum(0.5 * pure_eps * pure_eps * orders_vec,
                   log_t / (orders_vec - 1)), pure_eps)
    if np.isscalar(orders):
        # ndarray.item() replaces the removed np.asscalar.
        return ret.item()
    else:
        return ret
def main(argv):
    """Entry point for script execution; intentionally does nothing.

    Args:
      argv: Command-line arguments; ignored.
    """
    del argv  # Unused.
# When run as a script (rather than imported), hand main to absl's app.run.
if __name__ == "__main__":
    app.run(main)
research/differential_privacy/pate/core_test.py
deleted
100644 → 0
View file @
d32d957a
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for pate.core."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
sys
import
unittest
import
numpy
as
np
import
core
as
pate
class PateTest(unittest.TestCase):
    """Unit tests for the RDP helpers of the `core` module (imported as pate)."""

    def _test_rdp_gaussian_value_errors(self):
        """Checks that rdp_gaussian rejects malformed arguments."""
        # Test for ValueErrors.
        valid_logq = np.log(0.5)
        malformed_inputs = (
            (1.0, 1.0, np.array([2, 3, 4])),  # positive logq
            (valid_logq, -1.0, np.array([2, 3, 4])),  # negative sigma
            (valid_logq, 1.0, np.array([1, 3, 4])),  # order equal to 1
        )
        for logq, sigma, orders in malformed_inputs:
            with self.assertRaises(ValueError):
                pate.rdp_gaussian(logq, sigma, orders)

    def _test_rdp_gaussian_as_function_of_q(self):
        """Checks how rdp_gaussian behaves on either side of the cut-off q0."""
        # Test for data-independent and data-dependent ranges over q.
        # Hand calculated -log(q0)s: one row per sigma, one column per order.
        orders = [1.1, 2.5, 32, 250]
        neglogq0_table = (
            (1.5, [2.8, 2.6, 427, None]),
            (15, [4.8, 4.0, 4.7, 275]),
            (1500, [9.6, 8.8, 6.0, 4]),
            (15000, [12, 11.2, 8.6, 6.4]),
        )

        for sigma, neglogq0_row in neglogq0_table:
            for order, neglogq0 in zip(orders, neglogq0_row):
                if neglogq0 is None:  # sigma == 1.5 and order == 250:
                    continue

                rdp_at_q0 = pate.rdp_gaussian(-neglogq0, sigma, order)

                # Data-dependent range. (Successively halve the value of q.)
                halvings = np.array(
                    [0] + [np.log(factor) for factor in (2, 4, 8)])
                logq_dds = -neglogq0 - halvings
                # Check that in q_dds, rdp is decreasing.
                for logq_hi, logq_lo in zip(logq_dds[:-1], logq_dds[1:]):
                    self.assertGreater(
                        pate.rdp_gaussian(logq_hi, sigma, order),
                        pate.rdp_gaussian(logq_lo, sigma, order))

                # Data-independent range.
                q_dids = np.exp(-neglogq0) + np.array([0.1, 0.2, 0.3, 0.4])
                # Check that in q_dids, rdp is constant.
                for q in q_dids:
                    self.assertEqual(
                        rdp_at_q0, pate.rdp_gaussian(np.log(q), sigma, order))

    def _test_compute_eps_from_delta_value_error(self):
        """Checks that mismatched input lengths are rejected."""
        # Test for ValueError.
        mismatched_orders = [1.1, 2, 3, 4]
        mismatched_rdp = [1, 2, 3]
        with self.assertRaises(ValueError):
            pate.compute_eps_from_delta(mismatched_orders, mismatched_rdp, 0.001)

    def _test_compute_eps_from_delta_monotonicity(self):
        """Checks that epsilon does not increase as delta grows."""
        # Test for monotonicity with respect to delta.
        orders = [1.1, 2.5, 250.0]
        sigmas = [1e-3, 1.0, 1e5]
        deltas = [1e-60, 1e-6, 0.1, 0.999]
        for sigma in sigmas:
            rdps_for_gaussian = np.array(orders) / (2 * sigma**2)
            list_of_eps = [
                pate.compute_eps_from_delta(orders, rdps_for_gaussian, delta)[0]
                for delta in deltas
            ]

            # Check that in list_of_eps, epsilons are decreasing (as delta
            # increases).
            self.assertEqual(list_of_eps, sorted(list_of_eps, reverse=True))

    def _test_compute_q0(self):
        """Prints rdp_gaussian over a range of logq values, five per row."""
        # Stub code to search a logq space and figure out logq0 by eyeballing
        # results. This code does not run with the tests. Remove underscore to
        # run.
        sigma = 15
        order = 250
        logqs = np.arange(-290, -270, 1)
        for count, logq in enumerate(logqs, 1):
            rdp = pate.rdp_gaussian(logq, sigma, order)
            sys.stdout.write("\t%0.5g: %0.10g" % (logq, rdp))
            sys.stdout.flush()
            if count % 5 == 0:
                print("")

    def test_rdp_gaussian(self):
        """Runs all rdp_gaussian checks."""
        self._test_rdp_gaussian_value_errors()
        self._test_rdp_gaussian_as_function_of_q()

    def test_compute_eps_from_delta(self):
        """Runs all compute_eps_from_delta checks."""
        self._test_compute_eps_from_delta_value_error()
        self._test_compute_eps_from_delta_monotonicity()
# Run the test cases of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
research/differential_privacy/pate/smooth_sensitivity.py
deleted
100644 → 0
View file @
d32d957a
This diff is collapsed.
Click to expand it.
research/differential_privacy/pate/smooth_sensitivity_test.py
deleted
100644 → 0
View file @
d32d957a
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for pate.smooth_sensitivity."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
smooth_sensitivity
as
pate_ss
class PateSmoothSensitivityTest(unittest.TestCase):
    """Regression tests for the `smooth_sensitivity` module (pate_ss)."""

    def test_check_conditions(self):
        """Compares check_conditions against two known outcomes."""
        cases = (
            ((20, 10, 25.0), (True, False)),
            ((30, 10, 25.0), (True, True)),
        )
        for args, expected in cases:
            self.assertEqual(pate_ss.check_conditions(*args), expected)

    def _assert_all_close(self, x, y):
        """Asserts that two numpy arrays are close."""
        self.assertEqual(len(x), len(y))
        arrays_match = np.allclose(x, y, rtol=1e-8, atol=0)
        self.assertTrue(arrays_match)

    def test_compute_local_sensitivity_bounds_gnmax(self):
        """Compares the GNMax local sensitivity bounds with stored values."""
        # Test for "going right" in the smooth sensitivity computation.
        counts_right = np.array([10, 0, 0])
        sigma_right = 0.5
        order_right = 1.5
        expected_right = np.array(
            [3.13503646e-17, 1.60178280e-08, 5.90681786e-03] +
            [5.99981308e+00] * 7)
        actual_right = pate_ss.compute_local_sensitivity_bounds_gnmax(
            counts_right, 10, sigma_right, order_right)
        self._assert_all_close(actual_right, expected_right)

        # Test for "going left" in the smooth sensitivity computation.
        counts_left = np.array([1000, 500, 300, 200, 0])
        sigma_left = 250.0
        order_left = 10.0
        actual_left = pate_ss.compute_local_sensitivity_bounds_gnmax(
            counts_left, 2000, sigma_left, order_left)
        expected_left = np.array(
            [0.] * 298 +
            [2.77693450548e-7, 2.10853979548e-6] +
            [2.73113623988e-6] * 1700)
        self._assert_all_close(actual_left, expected_left)

    def test_compute_local_sensitivity_bounds_threshold(self):
        """Compares the threshold local sensitivity bounds with stored values."""
        integer_counts = np.array([20, 10, 0])
        num_teachers = sum(integer_counts)
        sigma = 2
        order = 10

        def bounds_for(counts, threshold):
            # All scenarios share num_teachers, sigma and order.
            return pate_ss.compute_local_sensitivity_bounds_threshold(
                counts, num_teachers, threshold, sigma, order)

        # High threshold.
        out_high = bounds_for(integer_counts, 16)
        expected_high = np.array(
            [0] * 3 + [
                1.48454129e-04, 1.47826870e-02, 3.94153241e-02,
                6.45775697e-02, 9.01543247e-02, 1.16054002e-01,
                1.42180452e-01, 1.42180452e-01, 1.48454129e-04,
                1.47826870e-02, 3.94153241e-02, 6.45775697e-02,
                9.01543266e-02, 1.16054000e-01, 1.42180452e-01,
                1.68302106e-01, 1.93127860e-01
            ] + [0] * 10)
        self._assert_all_close(out_high, expected_high)

        # Low threshold.
        out_low = bounds_for(integer_counts, 2)
        expected_low = np.array([
            1.60212079e-01, 2.07021132e-01, 2.07021132e-01, 1.93127860e-01,
            1.68302106e-01, 1.42180452e-01, 1.16054002e-01, 9.01543247e-02,
            6.45775697e-02, 3.94153241e-02, 1.47826870e-02, 1.48454129e-04
        ] + [0] * 18)
        self._assert_all_close(out_low, expected_low)

        # Very high threshold (larger than the number of teachers).
        out_very_high = bounds_for(integer_counts, 50)
        expected_very_high = np.array([
            1.35750725752e-19, 1.88990500499e-17, 2.05403154065e-15,
            1.74298153642e-13, 1.15489723995e-11, 5.97584949325e-10,
            2.41486826748e-08, 7.62150641922e-07, 1.87846248741e-05,
            0.000360973025976, 0.000360973025976, 2.76377015215e-50,
            1.00904975276e-53, 2.87254164748e-57, 6.37583360761e-61,
            1.10331620211e-64, 1.48844393335e-68, 1.56535552444e-72,
            1.28328011060e-76, 8.20047697109e-81
        ] + [0] * 10)
        self._assert_all_close(out_very_high, expected_very_high)

        # Fractional values.
        fractional_counts = np.array([19.5, -5.1, 0])
        out_fractional = bounds_for(fractional_counts, 10.1)
        expected_fractional = np.array([
            0.0620410301, 0.0875807131, 0.113451958, 0.139561671,
            0.1657074530, 0.1908244840, 0.2070270720, 0.207027072,
            0.169718100, 0.0575152142, 0.00678695871
        ] + [0] * 6 + [0.000536304908, 0.0172181073, 0.041909870] + [0] * 10)
        self._assert_all_close(out_fractional, expected_fractional)
# Run the test cases of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment