"""
This script computes bounds on the privacy cost of training the
student model from noisy aggregation of labels predicted by teachers.
It should be used only after training the student (and therefore the
teachers as well). We however include the label files required to
reproduce key results from our paper (https://arxiv.org/abs/1610.05755): 
the epsilon bounds for MNIST and SVHN students.

The command that computes the epsilon bound associated
with the training of the MNIST student model (100 label queries
with a (1/20)*2=0.1 epsilon bound each) is:

python analysis.py 
  --counts_file=mnist_250_teachers_labels.npy 
  --indices_file=mnist_250_teachers_100_indices_used_by_student.npy

The command that computes the epsilon bound associated
with the training of the SVHN student model (1000 label queries
with a (1/20)*2=0.1 epsilon bound each) is:

python analysis.py 
  --counts_file=svhn_250_teachers_labels.npy 
  --max_examples=1000
  --delta=1e-6
"""
import os
import math
import numpy as np
import tensorflow as tf

from input import maybe_download  # maybe_download lives in this repo's input.py

# These parameters can be changed to compute bounds for different failure rates
# or different model predictions.

tf.flags.DEFINE_integer("moments",8, "Number of moments")
tf.flags.DEFINE_float("noise_eps", 0.1, "Eps value for each call to noisymax.")
tf.flags.DEFINE_float("delta", 1e-5, "Target value of delta.")
tf.flags.DEFINE_float("beta", 0.09, "Value of beta for smooth sensitivity")
tf.flags.DEFINE_string("counts_file","","Numpy matrix with raw counts")
tf.flags.DEFINE_string("indices_file","",
    "File containting a numpy matrix with indices used."
    "Optional. Use the first max_examples indices if this is not provided.")
tf.flags.DEFINE_integer("max_examples",1000,
    "Number of examples to use. We will use the first"
    " max_examples many examples from the counts_file"
    " or indices_file to do the privacy cost estimate")
tf.flags.DEFINE_float("too_small", 1e-10, "Small threshold to avoid log of 0")
tf.flags.DEFINE_bool("input_is_counts", False, "False if labels, True if counts")

FLAGS = tf.flags.FLAGS


def compute_q_noisy_max(counts, noise_eps):
  """Returns ~ Pr[outcome != winner].

  Args:
    counts: a list of scores
    noise_eps: privacy parameter for noisy_max
  Returns:
    q: the probability that outcome is different from true winner.
  """
  # For noisy max, we only get an upper bound:
  # Pr[j beats i*] <= (2 + gap(j, i*)) / (4 * exp(gap(j, i*)))
  # proof at http://mathoverflow.net/questions/66763/
  # tight-bounds-on-probability-of-sum-of-laplace-random-variables

  counts = np.asarray(counts)  # accept plain lists as well as numpy arrays
  winner = np.argmax(counts)
  counts_normalized = noise_eps * (counts - counts[winner])
  counts_rest = np.array(
      [counts_normalized[i] for i in range(len(counts)) if i != winner])
  q = 0.0
  for c in counts_rest:
    gap = -c
    q += (gap + 2.0) / (4.0 * math.exp(gap))
  return min(q, 1.0 - (1.0 / len(counts)))


def compute_q_noisy_max_approx(counts, noise_eps):
  """Returns ~ Pr[outcome != winner].

  Args:
    counts: a list of scores
    noise_eps: privacy parameter for noisy_max
  Returns:
    q: the probability that outcome is different from true winner.
  """
  # For noisy max, we only get an upper bound:
  # Pr[j beats i*] <= (2 + gap(j, i*)) / (4 * exp(gap(j, i*)))
  # proof at http://mathoverflow.net/questions/66763/
  # tight-bounds-on-probability-of-sum-of-laplace-random-variables
  # This code uses an approximation that is faster and easier
  # to get a local sensitivity bound for.

  counts = np.asarray(counts)  # accept plain lists as well as numpy arrays
  winner = np.argmax(counts)
  counts_normalized = noise_eps * (counts - counts[winner])
  counts_rest = np.array(
      [counts_normalized[i] for i in range(len(counts)) if i != winner])
  gap = -max(counts_rest)
  q = (len(counts) - 1) * (gap + 2.0) / (4.0 * math.exp(gap))
  return min(q, 1.0 - (1.0 / len(counts)))
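
# Illustrative sanity check (toy numbers, not from the paper): confident vote
# histograms make q, and hence the per-query moment, essentially zero. With
# 250 teachers, 10 classes, and noise_eps = 0.1:
#
#   toy_counts = np.array([245, 5, 0, 0, 0, 0, 0, 0, 0, 0])
#   q = compute_q_noisy_max_approx(toy_counts, 0.1)
#   # rescaled gap = 0.1 * (245 - 5) = 24, so
#   # q <= 9 * 26 / (4 * exp(24)) ~ 2e-9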


def logmgf_exact(q, priv_eps, l):
  """Computes the logmgf value given q and privacy eps.

  The bound used is the min of three terms. The first term is from
  https://arxiv.org/pdf/1605.02065.pdf.
  The second term is based on the fact that when an event has probability
  (1-q) for q close to zero, q can only change by a factor of exp(eps),
  which corresponds to a much smaller multiplicative change in (1-q).
  The third term comes directly from the privacy guarantee.
  Args:
    q: pr of non-optimal outcome
    priv_eps: eps parameter for DP
    l: moment to compute.
  Returns:
    Upper bound on logmgf
  """
  if q < 0.5:
    t_one = (1 - q) * math.pow((1 - q) / (1 - math.exp(priv_eps) * q), l)
    t_two = q * math.exp(priv_eps * l)
    t = t_one + t_two
    try:
      log_t = math.log(t)
    except ValueError:
      print("Got ValueError in math.log for values: " + str((q, priv_eps, l, t)))
      log_t = priv_eps * l
  else:
    log_t = priv_eps * l

  return min(0.5 * priv_eps * priv_eps * l * (l + 1), log_t, priv_eps * l)
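
# Limiting behavior (read off directly from the expression above): at q = 0,
# t_one = 1 and t_two = 0, so log_t = 0 and a unanimous query contributes
# nothing to the moment; at q >= 0.5 the bound degrades to the
# data-independent min(0.5 * priv_eps**2 * l * (l + 1), priv_eps * l).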


def logmgf_from_counts(counts, noise_eps, l):
  """Compute the logmgf bound for one noisy-max query.

  The ReportNoisyMax mechanism with parameter noise_eps is 2*noise_eps-DP
  in our setting, where one count can go up by 1 and another can go down
  by 1.
  """

  q = compute_q_noisy_max(counts, noise_eps)
  return logmgf_exact(q, 2.0 * noise_eps, l)


def sens_at_k(counts, noise_eps, l, k):
  """Return sensitivity at distance k.

  Args:
    counts: an array of scores
    noise_eps: noise parameter used
    l: moment whose sensitivity is being computed
    k: distance
  Returns:
    sensitivity: at distance k
  """
  counts_sorted = sorted(counts, reverse=True)
  if 0.5 * noise_eps * l > 1:
    print("l too large to compute sensitivity")
    return 0
  # The worst-case neighbor at distance k moves k votes from the winner to
  # the runner-up. Now we can assume that at k, the gap remains positive,
  # or we have reached the point where logmgf_exact is determined by the
  # first term and is independent of q.
  # Compare the top two counts (using the sorted copy so this also works
  # for unsorted input).
  if counts_sorted[0] < counts_sorted[1] + k:
    return 0
  counts_sorted[0] -= k
  counts_sorted[1] += k
  val = logmgf_from_counts(counts_sorted, noise_eps, l)
  counts_sorted[0] -= 1
  counts_sorted[1] += 1
  val_changed = logmgf_from_counts(counts_sorted, noise_eps, l)
  return val_changed - val


def smoothed_sens(counts, noise_eps, l, beta):
  """Compute beta-smooth sensitivity.

  Args:
    counts: array of scores
    noise_eps: noise parameter
    l: moment of interest
    beta: smoothness parameter
  Returns:
    smooth_sensitivity: a beta smooth upper bound
  """
  # Smooth sensitivity is the max over k of exp(-beta * k) times the
  # sensitivity at distance k. Once the sensitivity reaches 0 (the top-two
  # gap has closed), it stays 0 for larger k, so we can stop early.
  k = 0
  smoothed_sensitivity = sens_at_k(counts, noise_eps, l, k)
  while k < max(counts):
    k += 1
    sensitivity_at_k = sens_at_k(counts, noise_eps, l, k)
    smoothed_sensitivity = max(
        smoothed_sensitivity,
        math.exp(-beta * k) * sensitivity_at_k)
    if sensitivity_at_k == 0.0:
      break
  return smoothed_sensitivity


def main(unused_argv):
  ##################################################################
  # If we are reproducing results from paper https://arxiv.org/abs/1610.05755,
  # download the required binaries with label information.
  ##################################################################

  # Binaries for MNIST results
  paper_binaries_mnist = [
      "https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_labels.npy?raw=true",
      "https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_100_indices_used_by_student.npy?raw=true"]
  if (FLAGS.counts_file == "mnist_250_teachers_labels.npy"
      or FLAGS.indices_file == "mnist_250_teachers_100_indices_used_by_student.npy"):
    maybe_download(paper_binaries_mnist, os.getcwd())

  # Binaries for SVHN results
  paper_binaries_svhn = [
      "https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/svhn_250_teachers_labels.npy?raw=true"]
  if FLAGS.counts_file == "svhn_250_teachers_labels.npy":
    maybe_download(paper_binaries_svhn, os.getcwd())

  input_mat = np.load(FLAGS.counts_file)
  if FLAGS.input_is_counts:
    counts_mat = input_mat
  else:
    # In this case, the input is the raw predictions. Transform the
    # (num_teachers x n) label matrix into per-example vote counts,
    # assuming 10 classes (as in MNIST and SVHN).
    num_teachers, n = input_mat.shape
    counts_mat = np.zeros((n, 10)).astype(np.int32)
    for i in range(n):
      for j in range(num_teachers):
        counts_mat[i, input_mat[j, i]] += 1
  n = counts_mat.shape[0]
  num_examples = min(n, FLAGS.max_examples)

  if not FLAGS.indices_file:
    indices = np.arange(num_examples)
  else:
    index_list = np.load(FLAGS.indices_file)
    indices = index_list[:num_examples]

  l_list = 1.0 + np.arange(FLAGS.moments)
  beta = FLAGS.beta
  total_log_mgf_nm = np.zeros(len(l_list))
  total_ss_nm = np.zeros(len(l_list))
  noise_eps = FLAGS.noise_eps
  
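  # The moments accountant composes additively: the total log-mgf bound is the
  # sum of the per-query log-mgf bounds (see the moments accountant of
  # https://arxiv.org/abs/1607.00133), so we simply accumulate over the
  # queried examples below.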
  for i in indices:
    total_log_mgf_nm += np.array(
        [logmgf_from_counts(counts_mat[i], noise_eps, l)
         for l in l_list])
    total_ss_nm += np.array(
        [smoothed_sens(counts_mat[i], noise_eps, l, beta)
         for l in l_list])
  delta = FLAGS.delta

  # By the moments accountant tail bound, delta = exp(alpha(l) - eps * l).
  # Solving for eps gives eps = (alpha(l) - ln(delta)) / l; we report the
  # minimum over the moments l below.
  eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list

  print "Epsilons (Noisy Max): " + str(eps_list_nm)
  print "Smoothed sensitivities (Noisy Max): " + str(total_ss_nm / l_list)

  # If beta < eps / 2 ln (1/delta), then adding noise Lap(1) * 2 SS/eps
  # is eps,delta DP
  # Also if beta < eps / 2(gamma +1), then adding noise 2(gamma+1) SS eta / eps
  # where eta has density proportional to 1 / (1+|z|^gamma) is eps-DP
  # Both from Corolloary 2.4 in
  # http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
  # Print the first one's scale
  ss_eps = 2.0 * beta * math.log(1/delta)
  ss_scale = 2.0 / ss_eps
  print "To get an " + str(ss_eps) + "-DP estimate of epsilon, "
  print "..add noise ~ " + str(ss_scale)
  print "... times " + str(total_ss_nm / l_list)
  print "Epsilon = " + str(min(eps_list_nm)) + "."
  if min(eps_list_nm) == eps_list_nm[-1]:
    print "Warning: May not have used enough values of l"

  # Data-independent bound, as the mechanism is 2*noise_eps-DP.
  data_ind_log_mgf = num_examples * np.array(
      [logmgf_exact(1.0, 2.0 * noise_eps, l) for l in l_list])

  data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list
  print("Data independent bound = " + str(min(data_ind_eps_list)) + ".")

  return


if __name__ == "__main__":
  tf.app.run()
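
# A quick way to exercise this script end to end without downloading the paper
# binaries (toy_labels.npy is a hypothetical file name, not one of the
# released files; random labels give near-uniform vote counts, so the
# resulting bound will be close to the data-independent one):
#
#   python -c "import numpy as np; \
#     np.save('toy_labels.npy', np.random.randint(0, 10, size=(250, 100)))"
#   python analysis.py --counts_file=toy_labels.npy --max_examples=100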