Commit d5fc3ef0 authored by pkulzc

Merge remote-tracking branch 'upstream/master'

parents 6b72b5cd 57b99319
@@ -154,10 +154,10 @@ def _convert_dataset(dataset_split):
             i + 1, num_images, shard_id))
         sys.stdout.flush()
         # Read the image.
-        image_data = tf.gfile.FastGFile(image_files[i], 'r').read()
+        image_data = tf.gfile.FastGFile(image_files[i], 'rb').read()
         height, width = image_reader.read_image_dims(image_data)
         # Read the semantic segmentation annotation.
-        seg_data = tf.gfile.FastGFile(label_files[i], 'r').read()
+        seg_data = tf.gfile.FastGFile(label_files[i], 'rb').read()
         seg_height, seg_width = label_reader.read_image_dims(seg_data)
         if height != seg_height or width != seg_width:
           raise RuntimeError('Shape mismatched between image and label.')
...
@@ -125,7 +125,10 @@ def _bytes_list_feature(values):
   Returns:
     A TF-Feature.
   """
-  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
+  def norm2bytes(value):
+    return value.encode() if isinstance(value, str) else value
+
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[norm2bytes(values)]))


 def image_seg_to_tfexample(image_data, filename, height, width, seg_data):
...
@@ -114,13 +114,13 @@ def _convert_dataset(dataset_split):
       # Read the image.
       image_filename = os.path.join(
           FLAGS.image_folder, filenames[i] + '.' + FLAGS.image_format)
-      image_data = tf.gfile.FastGFile(image_filename, 'r').read()
+      image_data = tf.gfile.FastGFile(image_filename, 'rb').read()
       height, width = image_reader.read_image_dims(image_data)
       # Read the semantic segmentation annotation.
       seg_filename = os.path.join(
           FLAGS.semantic_segmentation_folder,
           filenames[i] + '.' + FLAGS.label_format)
-      seg_data = tf.gfile.FastGFile(seg_filename, 'r').read()
+      seg_data = tf.gfile.FastGFile(seg_filename, 'rb').read()
       seg_height, seg_width = label_reader.read_image_dims(seg_data)
       if height != seg_height or width != seg_width:
         raise RuntimeError('Shape mismatched between image and label.')
...
@@ -17,6 +17,7 @@
 See model.py for more details and usage.
 """
+import six
 import math
 import tensorflow as tf
 from deeplab import common
@@ -144,7 +145,7 @@ def main(unused_argv):
     metrics_to_values, metrics_to_updates = (
         tf.contrib.metrics.aggregate_metric_map(metric_map))
-    for metric_name, metric_value in metrics_to_values.iteritems():
+    for metric_name, metric_value in six.iteritems(metrics_to_values):
       slim.summaries.add_scalar_summary(
           metric_value, metric_name, print_summary=True)
@@ -163,7 +164,7 @@ def main(unused_argv):
         checkpoint_dir=FLAGS.checkpoint_dir,
         logdir=FLAGS.eval_logdir,
         num_evals=num_batches,
-        eval_op=metrics_to_updates.values(),
+        eval_op=list(metrics_to_updates.values()),
         max_number_of_evaluations=num_eval_iters,
         eval_interval_secs=FLAGS.eval_interval_secs)
...
@@ -17,6 +17,7 @@
 See model.py for more details and usage.
 """
+import six
 import tensorflow as tf
 from deeplab import common
 from deeplab import model
@@ -66,6 +67,9 @@ flags.DEFINE_integer('save_interval_secs', 1200,
 flags.DEFINE_integer('save_summaries_secs', 600,
                      'How often, in seconds, we compute the summaries.')
+flags.DEFINE_boolean('save_summaries_images', False,
+                     'Save sample inputs, labels, and semantic predictions as images to summary.')
+
 # Settings for training strategy.
 flags.DEFINE_enum('learning_policy', 'poly', ['poly', 'step'],
@@ -177,6 +181,10 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
   """
   samples = inputs_queue.dequeue()
+
+  # Add name to input and label nodes so we can add to summary.
+  samples[common.IMAGE] = tf.identity(samples[common.IMAGE], name=common.IMAGE)
+  samples[common.LABEL] = tf.identity(samples[common.LABEL], name=common.LABEL)
   model_options = common.ModelOptions(
       outputs_to_num_classes=outputs_to_num_classes,
       crop_size=FLAGS.train_crop_size,
@@ -190,7 +198,13 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
       is_training=True,
       fine_tune_batch_norm=FLAGS.fine_tune_batch_norm)
-  for output, num_classes in outputs_to_num_classes.iteritems():
+  # Add name to graph node so we can add to summary.
+  outputs_to_scales_to_logits[common.OUTPUT_TYPE][model._MERGED_LOGITS_SCOPE] = tf.identity(
+      outputs_to_scales_to_logits[common.OUTPUT_TYPE][model._MERGED_LOGITS_SCOPE],
+      name=common.OUTPUT_TYPE
+  )
+
+  for output, num_classes in six.iteritems(outputs_to_num_classes):
     train_utils.add_softmax_cross_entropy_loss_for_each_scale(
         outputs_to_scales_to_logits[output],
         samples[common.LABEL],
@@ -217,7 +231,7 @@ def main(unused_argv):
   assert FLAGS.train_batch_size % config.num_clones == 0, (
       'Training batch size not divisble by number of clones (GPUs).')
-  clone_batch_size = FLAGS.train_batch_size / config.num_clones
+  clone_batch_size = int(FLAGS.train_batch_size / config.num_clones)
   # Get dataset-dependent information.
   dataset = segmentation_dataset.get_dataset(
@@ -226,7 +240,7 @@ def main(unused_argv):
   tf.gfile.MakeDirs(FLAGS.train_logdir)
   tf.logging.info('Training on %s set', FLAGS.train_split)
-  with tf.Graph().as_default():
+  with tf.Graph().as_default() as graph:
     with tf.device(config.inputs_device()):
       samples = input_generator.get(
           dataset,
@@ -267,6 +281,22 @@ def main(unused_argv):
       for model_var in slim.get_model_variables():
         summaries.add(tf.summary.histogram(model_var.op.name, model_var))
+      # Add summaries for images, labels, semantic predictions.
+      if FLAGS.save_summaries_images:
+        summary_image = graph.get_tensor_by_name(
+            ('%s/%s:0' % (first_clone_scope, common.IMAGE)).strip('/'))
+        summaries.add(tf.summary.image('samples/%s' % common.IMAGE, summary_image))
+
+        summary_label = tf.cast(graph.get_tensor_by_name(
+            ('%s/%s:0' % (first_clone_scope, common.LABEL)).strip('/')), tf.uint8)
+        summaries.add(tf.summary.image('samples/%s' % common.LABEL, summary_label))
+
+        predictions = tf.cast(tf.expand_dims(tf.argmax(graph.get_tensor_by_name(
+            ('%s/%s:0' % (first_clone_scope, common.OUTPUT_TYPE)).strip('/')),
+            3), -1), tf.uint8)
+        summaries.add(tf.summary.image('samples/%s' % common.OUTPUT_TYPE, predictions))
+
       # Add summaries for losses.
       for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
         summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))
...
@@ -14,6 +14,8 @@
 # ==============================================================================
 """Utility functions for training."""
+import six
+
 import tensorflow as tf
 slim = tf.contrib.slim
@@ -44,7 +46,7 @@ def add_softmax_cross_entropy_loss_for_each_scale(scales_to_logits,
   if labels is None:
     raise ValueError('No label for softmax cross entropy loss.')
-  for scale, logits in scales_to_logits.iteritems():
+  for scale, logits in six.iteritems(scales_to_logits):
     loss_scope = None
     if scope:
       loss_scope = '%s_%s' % (scope, scale)
...
Scripts in support of the paper "Scalable Private Learning with PATE" by Nicolas
Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar
Erlingsson (ICLR 2018, https://arxiv.org/abs/1802.08908).
### Requirements
* Python, version ≥ 2.7
* absl (see [here](https://github.com/abseil/abseil-py), or just type `pip install absl-py`)
* matplotlib
* numpy
* scipy
* sympy (for smooth sensitivity analysis)
* write access to the current directory (otherwise, the output directories in download.py and the *.sh scripts
must be changed)
## Reproducing Figures 1 and 5, and Table 2
Before running any of the analysis scripts, create the data/ directory and download the votes files by running\
`$ python download.py`

To generate Figures 1 and 5, run\
`$ sh generate_figures.sh`\
The output is written to the figures/ directory.

For Table 2, run (this may take several hours)\
`$ sh generate_table.sh`\
The output is written to the console.

For data-independent bounds (for comparison with Table 2), run\
`$ sh generate_table_data_independent.sh`\
The output is written to the console.
## Files in this directory
* generate_figures.sh --- Master script for generating Figures 1 and 5.
* generate_table.sh --- Master script for generating Table 2.
* generate_table_data_independent.sh --- Master script for computing data-independent
bounds.
* rdp_bucketized.py --- Script for producing Figures 1 (right) and 5 (right).
* rdp_cumulative.py --- Script for producing Figure 1 (left, middle), Figure 5
(left), and partition.pdf (a detailed breakdown of privacy costs per
source).
* smooth_sensitivity_table.py --- Script for generating Table 2.
* rdp_flow.py and plot_ls_q.py are currently not used.
* download.py --- Utility script for populating the data/ directory.
All Python scripts take flags. Run `script_name.py --help` for help on flags.
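For reference, each votes file loaded by these scripts is a NumPy array with one query per row and one class per column, where every row sums to the number of teachers. A minimal sketch of producing a synthetic input in that format (the file name `data/synthetic_250_teachers.npy` is hypothetical, not one of the downloaded files):

```python
import numpy as np

num_queries, num_classes, num_teachers = 100, 10, 250

# Each row is a histogram of teacher votes over classes and sums to num_teachers.
rng = np.random.RandomState(0)
probs = rng.dirichlet(np.ones(num_classes), size=num_queries)
votes = np.array([rng.multinomial(num_teachers, p) for p in probs])

np.save('data/synthetic_250_teachers.npy', votes)  # assumes data/ already exists
```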
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to download votes files to the data/ directory.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import urllib
import os
import tarfile
FILE_URI = 'https://storage.googleapis.com/pate-votes/votes.gz'
DATA_DIR = 'data/'
def download():
print('Downloading ' + FILE_URI)
tar_filename, _ = urllib.request.urlretrieve(FILE_URI)
print('Unpacking ' + tar_filename)
with tarfile.open(tar_filename, "r:gz") as tar:
tar.extractall(DATA_DIR)
print('Done!')
if __name__ == '__main__':
if not os.path.exists(DATA_DIR):
print('Data directory does not exist. Creating ' + DATA_DIR)
os.makedirs(DATA_DIR)
download()
#!/bin/bash
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
counts_file="data/glyph_5000_teachers.npy"
output_dir="figures/"
executable1="python rdp_bucketized.py"
executable2="python rdp_cumulative.py"
mkdir -p $output_dir
if [ ! -d "$output_dir" ]; then
echo "Directory $output_dir does not exist."
exit 1
fi
$executable1 \
--plot=small \
--counts_file=$counts_file \
--plot_file=$output_dir"noisy_thresholding_check_perf.pdf"
$executable1 \
--plot=large \
--counts_file=$counts_file \
--plot_file=$output_dir"noisy_thresholding_check_perf_details.pdf"
$executable2 \
--cache=False \
--counts_file=$counts_file \
--figures_dir=$output_dir
#!/bin/bash
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
echo "Reproducing Table 2. Takes a couple of hours."
executable="python smooth_sensitivity_table.py"
data_dir="data"
echo
echo "######## MNIST ########"
echo
$executable \
--counts_file=$data_dir"/mnist_250_teachers.npy" \
--threshold=200 \
--sigma1=150 \
--sigma2=40 \
--queries=640 \
--delta=1e-5
echo
echo "######## SVHN ########"
echo
$executable \
--counts_file=$data_dir"/svhn_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=8500 \
--delta=1e-6
echo
echo "######## Adult ########"
echo
$executable \
--counts_file=$data_dir"/adult_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=1500 \
--delta=1e-5
echo
echo "######## Glyph (Confident) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_5000_teachers.npy" \
--threshold=1000 \
--sigma1=500 \
--sigma2=100 \
--queries=12000 \
--delta=1e-8
echo
echo "######## Glyph (Interactive, Round 1) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round1.npy" \
--threshold=3500 \
--sigma1=1500 \
--sigma2=100 \
--delta=1e-8
echo
echo "######## Glyph (Interactive, Round 2) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round2.npy" \
--baseline_file=$data_dir"/glyph_round2_student.npy" \
--threshold=3500 \
--sigma1=2000 \
--sigma2=200 \
--teachers=5000 \
--delta=1e-8
#!/bin/bash
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
echo "Table 2 with data-independent analysis."
executable="python smooth_sensitivity_table.py"
data_dir="data"
echo
echo "######## MNIST ########"
echo
$executable \
--counts_file=$data_dir"/mnist_250_teachers.npy" \
--threshold=200 \
--sigma1=150 \
--sigma2=40 \
--queries=640 \
--delta=1e-5 \
--data_independent
echo
echo "######## SVHN ########"
echo
$executable \
--counts_file=$data_dir"/svhn_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=8500 \
--delta=1e-6 \
--data_independent
echo
echo "######## Adult ########"
echo
$executable \
--counts_file=$data_dir"/adult_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=1500 \
--delta=1e-5 \
--data_independent
echo
echo "######## Glyph (Confident) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_5000_teachers.npy" \
--threshold=1000 \
--sigma1=500 \
--sigma2=100 \
--queries=12000 \
--delta=1e-8 \
--data_independent
echo
echo "######## Glyph (Interactive, Round 1) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round1.npy" \
--threshold=3500 \
--sigma1=1500 \
--sigma2=100 \
--delta=1e-8 \
--data_independent
echo
echo "######## Glyph (Interactive, Round 2) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round2.npy" \
--baseline_file=$data_dir"/glyph_round2_student.npy" \
--threshold=3500 \
--sigma1=2000 \
--sigma2=200 \
--teachers=5000 \
--delta=1e-8 \
--order=8.5 \
--data_independent
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots LS(q).
A script in support of the PATE2 paper. NOT PRESENTLY USED.
The output is written to a specified directory as a pdf file.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import smooth_sensitivity as pate_ss
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_string('figures_dir', '', 'Path where the output is written to.')
def compute_ls_q(sigma, order, num_classes):
def beta(q):
return pate_ss._compute_rdp_gnmax(sigma, math.log(q), order)
def bu(q):
return pate_ss._compute_bu_gnmax(q, sigma, order)
def bl(q):
return pate_ss._compute_bl_gnmax(q, sigma, order)
def delta_beta(q):
if q == 0 or q > .8:
return 0
beta_q = beta(q)
beta_bu_q = beta(bu(q))
beta_bl_q = beta(bl(q))
assert beta_bl_q <= beta_q <= beta_bu_q
return beta_bu_q - beta_q # max(beta_bu_q - beta_q, beta_q - beta_bl_q)
logq0 = pate_ss.compute_logq0_gnmax(sigma, order)
logq1 = pate_ss._compute_logq1(sigma, order, num_classes)
print(math.exp(logq1), math.exp(logq0))
xs = np.linspace(0, .1, num=1000, endpoint=True)
ys = [delta_beta(x) for x in xs]
return xs, ys
def main(argv):
del argv # Unused.
sigma = 20
order = 20.
num_classes = 10
# sigma = 20
# order = 25.
# num_classes = 10
x_axis, ys = compute_ls_q(sigma, order, num_classes)
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
ax.plot(x_axis, ys, alpha=.8, linewidth=5)
plt.xlabel('Number of queries answered', fontsize=16)
plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'ls_of_q.pdf')
print('Saving the graph to ' + fout_name)
# Save the figure before showing it; otherwise the printed path is never written.
fig.savefig(fout_name, bbox_inches='tight')
plt.show()
plt.close('all')
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Illustrates how noisy thresholding check changes distribution of queries.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is one of two graphs depending on the setting of the plot variable.
The output is written to a pdf file.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_enum('plot', 'small', ['small', 'large'],
                  'Selects which of the two plots is produced.')
flags.DEFINE_string('counts_file', '', 'Counts file.')
flags.DEFINE_string('plot_file', '', 'Plot file to write.')
def compute_count_per_bin(bin_num, votes):
"""Tabulates number of examples in each bin.
Args:
bin_num: Number of bins.
votes: A matrix of votes, where each row contains votes in one instance.
Returns:
Array of counts of length bin_num.
"""
sums = np.sum(votes, axis=1)
# Check that all rows contain the same number of votes.
assert max(sums) == min(sums)
s = max(sums)
counts = np.zeros(bin_num)
n = votes.shape[0]
for i in xrange(n):
v = votes[i,]
bin_idx = int(math.floor(max(v) * bin_num / s))
assert 0 <= bin_idx < bin_num
counts[bin_idx] += 1
return counts
def compute_privacy_cost_per_bins(bin_num, votes, sigma2, order):
"""Outputs average privacy cost per bin.
Args:
bin_num: Number of bins.
votes: A matrix of votes, where each row contains votes in one instance.
sigma2: The scale (std) of the Gaussian noise. (Same as sigma_2 in
Algorithms 1 and 2.)
order: The Renyi order for which privacy cost is computed.
Returns:
Expected eps of RDP (ignoring delta) per example in each bin.
"""
n = votes.shape[0]
bin_counts = np.zeros(bin_num)
bin_rdp = np.zeros(bin_num) # RDP at order=order
for i in xrange(n):
v = votes[i,]
logq = pate.compute_logq_gaussian(v, sigma2)
rdp_at_order = pate.rdp_gaussian(logq, sigma2, order)
bin_idx = int(math.floor(max(v) * bin_num / sum(v)))
assert 0 <= bin_idx < bin_num
bin_counts[bin_idx] += 1
bin_rdp[bin_idx] += rdp_at_order
if (i + 1) % 1000 == 0:
print('example {}'.format(i + 1))
sys.stdout.flush()
return bin_rdp / bin_counts
def compute_expected_answered_per_bin(bin_num, votes, threshold, sigma1):
"""Computes expected number of answers per bin.
Args:
bin_num: Number of bins.
votes: A matrix of votes, where each row contains votes in one instance.
threshold: The threshold against which check is performed.
sigma1: The std of the Gaussian noise with which check is performed. (Same
as sigma_1 in Algorithms 1 and 2.)
Returns:
Expected number of queries answered per bin.
"""
n = votes.shape[0]
bin_answered = np.zeros(bin_num)
for i in xrange(n):
v = votes[i,]
p = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
bin_idx = int(math.floor(max(v) * bin_num / sum(v)))
assert 0 <= bin_idx < bin_num
bin_answered[bin_idx] += p
if (i + 1) % 1000 == 0:
print('example {}'.format(i + 1))
sys.stdout.flush()
return bin_answered
def main(argv):
del argv # Unused.
fin_name = os.path.expanduser(FLAGS.counts_file)
print('Reading raw votes from ' + fin_name)
sys.stdout.flush()
votes = np.load(fin_name)
votes = votes[:4000,] # truncate to 4000 samples
if FLAGS.plot == 'small':
bin_num = 5
m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
elif FLAGS.plot == 'large':
bin_num = 10
m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
a_check = compute_expected_answered_per_bin(bin_num, votes, 5000, 1500)
eps = compute_privacy_cost_per_bins(bin_num, votes, 100, 50)
counts = compute_count_per_bin(bin_num, votes)
bins = np.linspace(0, 100, num=bin_num, endpoint=False)
plt.close('all')
fig, ax = plt.subplots()
if FLAGS.plot == 'small':
fig.set_figheight(4.7)
fig.set_figwidth(5)
ax.bar(
bins,
counts,
20,
color='orangered',
linestyle='dashed',
linewidth=5,
edgecolor='red',
fill=False,
alpha=.5,
align='edge',
label='LNMax answers')
ax.bar(
bins,
m_check,
20,
color='b',
alpha=.5,
linewidth=0,
edgecolor='g',
align='edge',
label='Confident-GNMax\nanswers')
elif FLAGS.plot == 'large':
fig.set_figheight(4.7)
fig.set_figwidth(7)
ax.bar(
bins,
counts,
10,
linestyle='dashed',
linewidth=5,
edgecolor='red',
fill=False,
alpha=.5,
align='edge',
label='LNMax answers')
ax.bar(
bins,
m_check,
10,
color='g',
alpha=.5,
linewidth=0,
edgecolor='g',
align='edge',
label='Confident-GNMax\nanswers (moderate)')
ax.bar(
bins,
a_check,
10,
color='b',
alpha=.5,
align='edge',
label='Confident-GNMax\nanswers (aggressive)')
ax2 = ax.twinx()
bin_centers = [x + 5 for x in bins]
ax2.plot(bin_centers, eps, 'ko', alpha=.8)
ax2.set_ylim([1e-200, 1.])
ax2.set_yscale('log')
ax2.grid(False)
ax2.set_yticks([1e-3, 1e-50, 1e-100, 1e-150, 1e-200])
plt.tick_params(which='minor', right='off')
ax2.set_ylabel(r'Per query privacy cost $\varepsilon$', fontsize=16)
plt.xlim([0, 100])
# ax.set_yscale('log')
ax.set_xlabel('Percentage of teachers that agree', fontsize=16)
ax.set_ylabel('Number of queries answered', fontsize=16)
vals = ax.get_xticks()
ax.set_xticklabels([str(int(x)) + '%' for x in vals])
ax.tick_params(labelsize=14)
ax.legend(loc=2, prop={'size': 16})
# simple: 'figures/noisy_thresholding_check_perf.pdf')
# detailed: 'figures/noisy_thresholding_check_perf_details.pdf'
print('Saving the graph to ' + FLAGS.plot_file)
plt.savefig(os.path.expanduser(FLAGS.plot_file), bbox_inches='tight')
plt.show()
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots three graphs illustrating cost of privacy per answered query.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of three pdf files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import pickle
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_boolean('cache', False,
'Read results of privacy analysis from cache.')
flags.DEFINE_string('counts_file', '', 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
def run_analysis(votes, mechanism, noise_scale, params):
"""Computes data-dependent privacy.
Args:
votes: A matrix of votes, where each row contains votes in one instance.
mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf')
noise_scale: A mechanism privacy parameter.
params: Other privacy parameters.
Returns:
Four lists: cumulative privacy cost epsilon, how privacy budget is split,
how many queries were answered, optimal order.
"""
def compute_partition(order_opt, eps):
order_opt_idx = np.searchsorted(orders, order_opt)
if mechanism == 'gnmax_conf':
p = (rdp_select_cum[order_opt_idx],
rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx],
-math.log(delta) / (order_opt - 1))
else:
p = (rdp_cum[order_opt_idx], -math.log(delta) / (order_opt - 1))
return [x / eps for x in p] # Ensures that sum(x) == 1
# Short list of orders.
# orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
# np.logspace(np.log10(50), np.log10(1000), num=20))))
# Long list of orders.
orders = np.concatenate((np.arange(2, 100 + 1, .5),
np.logspace(np.log10(100), np.log10(500), num=100)))
delta = 1e-8
n = votes.shape[0]
eps_total = np.zeros(n)
partition = np.full(n, None, dtype=object)
order_opt = np.full(n, None, dtype=float)
answered = np.zeros(n)
rdp_cum = np.zeros(len(orders))
rdp_sqrd_cum = np.zeros(len(orders))
rdp_select_cum = np.zeros(len(orders))
answered_sum = 0
for i in xrange(n):
v = votes[i,]
if mechanism == 'lnmax':
logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
rdp_sqrd = rdp_query**2
pr_answered = 1
elif mechanism == 'gnmax':
logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
rdp_sqrd = rdp_query**2
pr_answered = 1
elif mechanism == 'gnmax_conf':
logq_step1 = pate.compute_logpr_answered(params['t'], params['sigma1'], v)
logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
q_step1 = np.exp(logq_step1)
logq_step1_min = min(logq_step1, math.log1p(-q_step1))
rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min,
2**.5 * params['sigma1'], orders)
rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale, orders)
rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
# The expression below evaluates
# E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
rdp_sqrd = (
rdp_gnmax_step1**2 + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2 +
q_step1 * rdp_gnmax_step2**2)
rdp_select_cum += rdp_gnmax_step1
pr_answered = q_step1
rdp_cum += rdp_query
rdp_sqrd_cum += rdp_sqrd
answered_sum += pr_answered
answered[i] = answered_sum
eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
orders, rdp_cum, delta)
partition[i] = compute_partition(order_opt[i], eps_total[i])
if i > 0 and (i + 1) % 1000 == 0:
rdp_var = rdp_sqrd_cum / i - (
rdp_cum / i)**2 # Ignore Bessel's correction.
order_opt_idx = np.searchsorted(orders, order_opt[i])
eps_std = ((i + 1) * rdp_var[order_opt_idx])**.5 # Std of the sum.
print(
'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
'at order = {:.2f} (contribution from delta = {:.3f})'.format(
i + 1, answered_sum, eps_total[i], eps_std, order_opt[i],
-math.log(delta) / (order_opt[i] - 1)))
sys.stdout.flush()
return eps_total, partition, answered, order_opt
def print_plot_small(figures_dir, eps_lap, eps_gnmax, answered_gnmax):
"""Plots a graph of LNMax vs GNMax.
Args:
figures_dir: A name of the directory where to save the plot.
eps_lap: The cumulative privacy costs of the Laplace mechanism.
eps_gnmax: The cumulative privacy costs of the Gaussian mechanism
answered_gnmax: The cumulative count of queries answered.
"""
xlim = 6000
x_axis = range(0, int(xlim), 10)
y_lap = np.zeros(len(x_axis))
y_gnmax = np.full(len(x_axis), None, dtype=float)
for i in xrange(len(x_axis)):
x = x_axis[i]
y_lap[i] = eps_lap[x]
idx = np.searchsorted(answered_gnmax, x)
if idx < len(eps_gnmax):
y_gnmax[i] = eps_gnmax[idx]
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
ax.plot(
x_axis, y_lap, color='r', ls='--', label='LNMax', alpha=.5, linewidth=5)
ax.plot(
x_axis,
y_gnmax,
color='g',
ls='-',
label='Confident-GNMax',
alpha=.5,
linewidth=5)
plt.xticks(np.arange(0, 7000, 1000))
plt.xlim([0, 6000])
plt.ylim([0, 6.])
plt.xlabel('Number of queries answered', fontsize=16)
plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
plt.legend(loc=2, fontsize=13) # loc=2 -- upper left
ax.tick_params(labelsize=14)
fout_name = os.path.join(figures_dir, 'lnmax_vs_gnmax.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.show()
def print_plot_large(figures_dir, eps_lap, eps_gnmax1, answered_gnmax1,
eps_gnmax2, partition_gnmax2, answered_gnmax2):
"""Plots a graph of LNMax vs GNMax with two parameters.
Args:
figures_dir: A name of the directory where to save the plot.
eps_lap: The cumulative privacy costs of the Laplace mechanism.
eps_gnmax1: The cumulative privacy costs of the Gaussian mechanism (set 1).
answered_gnmax1: The cumulative count of queries answered (set 1).
eps_gnmax2: The cumulative privacy costs of the Gaussian mechanism (set 2).
partition_gnmax2: Allocation of eps for set 2.
answered_gnmax2: The cumulative count of queries answered (set 2).
"""
xlim = 6000
x_axis = range(0, int(xlim), 10)
lenx = len(x_axis)
y_lap = np.zeros(lenx)
y_gnmax1 = np.full(lenx, np.nan, dtype=float)
y_gnmax2 = np.full(lenx, np.nan, dtype=float)
y1_gnmax2 = np.full(lenx, np.nan, dtype=float)
for i in xrange(lenx):
x = x_axis[i]
y_lap[i] = eps_lap[x]
idx1 = np.searchsorted(answered_gnmax1, x)
if idx1 < len(eps_gnmax1):
y_gnmax1[i] = eps_gnmax1[idx1]
idx2 = np.searchsorted(answered_gnmax2, x)
if idx2 < len(eps_gnmax2):
y_gnmax2[i] = eps_gnmax2[idx2]
fraction_step1, fraction_step2, _ = partition_gnmax2[idx2]
y1_gnmax2[i] = eps_gnmax2[idx2] * fraction_step1 / (
fraction_step1 + fraction_step2)
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
ax.plot(
x_axis,
y_lap,
color='r',
ls='dashed',
label='LNMax',
alpha=.5,
linewidth=5)
ax.plot(
x_axis,
y_gnmax1,
color='g',
ls='-',
label='Confident-GNMax (moderate)',
alpha=.5,
linewidth=5)
ax.plot(
x_axis,
y_gnmax2,
color='b',
ls='-',
label='Confident-GNMax (aggressive)',
alpha=.5,
linewidth=5)
ax.fill_between(
x_axis, [0] * lenx,
y1_gnmax2.tolist(),
facecolor='b',
alpha=.3,
hatch='\\')
ax.plot(
x_axis,
y1_gnmax2,
color='b',
ls='-',
label='_nolegend_',
alpha=.5,
linewidth=1)
ax.fill_between(
x_axis, y1_gnmax2.tolist(), y_gnmax2.tolist(), facecolor='b', alpha=.3)
plt.xticks(np.arange(0, 7000, 1000))
plt.xlim([0, xlim])
plt.ylim([0, 1.])
plt.xlabel('Number of queries answered', fontsize=16)
plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
plt.legend(loc=2, fontsize=13) # loc=2 -- upper left
ax.tick_params(labelsize=14)
fout_name = os.path.join(figures_dir, 'lnmax_vs_2xgnmax_large.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.show()
def print_plot_partition(figures_dir, eps, partition, answered, order_opt):
"""Plots an expert version of the privacy-per-answered-query graph.
Args:
figures_dir: A name of the directory where to save the plot.
eps: The cumulative privacy cost.
partition: Allocation of the privacy cost.
answered: Cumulative number of queries answered.
order_opt: The list of optimal orders.
"""
xlim = 6000
x = range(0, int(xlim), 10)
lenx = len(x)
y = np.full(lenx, np.nan, dtype=float)
y1 = np.full(lenx, np.nan, dtype=float)
y2 = np.full(lenx, np.nan, dtype=float)
y_right = np.full(lenx, np.nan, dtype=float)
for i in xrange(lenx):
idx = np.searchsorted(answered, x[i])
if idx < len(eps):
y[i] = eps[idx]
fraction_step1, _, fraction_delta = partition[idx]
y1[i] = fraction_delta * y[i]
y2[i] = (fraction_delta + fraction_step1) * y[i]
y_right[i] = order_opt[idx]
# plt.close('all')
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
l1 = ax.plot(
x, y, color='b', ls='-', label=r'Total privacy cost', linewidth=5).pop()
ax.plot(x, y1, color='b', ls='-', label=r'_nolegend_', alpha=.5, linewidth=1)
ax.plot(x, y2, color='b', ls='-', label=r'_nolegend_', alpha=.5, linewidth=1)
ax.fill_between(x, [0] * lenx, y1.tolist(), facecolor='b', alpha=.1)
ax.fill_between(x, y1.tolist(), y2.tolist(), facecolor='b', alpha=.2)
ax.fill_between(x, y2.tolist(), y.tolist(), facecolor='b', alpha=.3)
t1 = 300
ax.text(x[t1], y1[t1] * .35, r'due to $\delta$', alpha=.5, fontsize=18)
ax.text(
x[t1],
y1[t1] + (y2[t1] - y1[t1]) * .6,
r'selection ($\sigma_1$)',
alpha=.5,
fontsize=18)
t2 = 550
ax.annotate(
r'answering ($\sigma_2$)',
xy=(x[t2], (y[t2] + y2[t2]) / 2),
xytext=(x[200], y[t2] * 1.1),
arrowprops=dict(facecolor='black', shrink=0.005, alpha=.5),
fontsize=18,
alpha=.5)
ax2 = ax.twinx()
l2 = ax2.plot(
x, y_right, 'r', ls='-', label=r'Optimal order', linewidth=5,
alpha=.5).pop()
ax2.grid(False)
ax2.set_ylabel(r'Optimal Renyi order', fontsize=16)
plt.xticks(np.arange(0, 7000, 1000))
plt.xlim([0, xlim])
ax.set_ylim([0, 1.])
ax2.set_ylim([0, 200.])
ax.set_xlabel('Number of queries answered', fontsize=16)
ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
# Merging legends.
ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=0, fontsize=13)
ax.tick_params(labelsize=14)
fout_name = os.path.join(figures_dir, 'partition.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.show()
def run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2):
"""Sequentially runs all analyses.
Args:
votes: A matrix of votes, where each row contains votes in one instance.
lambda_laplace: The scale of the Laplace noise (lambda).
gnmax_parameters: A list of parameters for GNMax.
sigma2: Shared parameter for the GNMax mechanisms.
Returns:
Five lists whose length is the number of queries.
"""
print('=== Laplace Mechanism ===')
eps_lap, _, _, _ = run_analysis(votes, 'lnmax', lambda_laplace, None)
print()
# Does not go anywhere, for now
# print('=== Gaussian Mechanism (simple) ===')
# eps, _, _, _ = run_analysis(votes[:n,], 'gnmax', sigma1, None)
eps_gnmax = [[] for p in gnmax_parameters]
partition_gmax = [[] for p in gnmax_parameters]
answered = [[] for p in gnmax_parameters]
order_opt = [[] for p in gnmax_parameters]
for i, p in enumerate(gnmax_parameters):
print('=== Gaussian Mechanism (confident) {}: ==='.format(p))
eps_gnmax[i], partition_gmax[i], answered[i], order_opt[i] = run_analysis(
votes, 'gnmax_conf', sigma2, p)
print()
return eps_lap, eps_gnmax, partition_gmax, answered, order_opt
def main(argv):
del argv # Unused.
lambda_laplace = 50. # corresponds to eps = 1. / lambda_laplace
# Parameters of the GNMax mechanism.
gnmax_parameters = ({
't': 1000,
'sigma1': 500
}, {
't': 3500,
'sigma1': 1500
}, {
't': 5000,
'sigma1': 1500
})
sigma2 = 100 # GNMax parameters differ only in Step 1 (selection).
ftemp_name = '/tmp/precomputed.pkl'
figures_dir = os.path.expanduser(FLAGS.figures_dir)
if FLAGS.cache and os.path.isfile(ftemp_name):
print('Reading from cache ' + ftemp_name)
with open(ftemp_name, 'rb') as f:
(eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
orders_opt_gnmax) = pickle.load(f)
else:
fin_name = os.path.expanduser(FLAGS.counts_file)
print('Reading raw votes from ' + fin_name)
sys.stdout.flush()
votes = np.load(fin_name)
(eps_lap, eps_gnmax, partition_gmax,
answered_gnmax, orders_opt_gnmax) = run_all_analyses(
votes, lambda_laplace, gnmax_parameters, sigma2)
print('Writing to cache ' + ftemp_name)
with open(ftemp_name, 'wb') as f:
pickle.dump((eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
orders_opt_gnmax), f)
print_plot_small(figures_dir, eps_lap, eps_gnmax[0], answered_gnmax[0])
print_plot_large(figures_dir, eps_lap, eps_gnmax[1], answered_gnmax[1],
eps_gnmax[2], partition_gmax[2], answered_gnmax[2])
print_plot_partition(figures_dir, eps_gnmax[2], partition_gmax[2],
answered_gnmax[2], orders_opt_gnmax[2])
plt.close('all')
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots two graphs illustrating cost of privacy per answered query.
PRESENTLY NOT USED.
A script in support of the PATE2 paper. The input is a file containing a numpy
array of votes, one query per row, one class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of two pdf files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_string('counts_file', '', 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
def plot_rdp_curve_per_example(votes, sigmas):
orders = np.linspace(1., 100., endpoint=True, num=1000)
orders[0] = 1.5
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
styles = [':', '-']
labels = ['ex1', 'ex2']
for i in xrange(votes.shape[0]):
print(sorted(votes[i,], reverse=True)[:10])
for sigma in sigmas:
logq = pate.compute_logq_gaussian(votes[i,], sigma)
rdp = pate.rdp_gaussian(logq, sigma, orders)
ax.plot(
orders,
rdp,
label=r'{} $\sigma$={}'.format(labels[i], int(sigma)),
linestyle=styles[i],
linewidth=5)
for sigma in sigmas:
ax.plot(
orders,
pate.rdp_data_independent_gaussian(sigma, orders),
alpha=.3,
label=r'Data-ind bound $\sigma$={}'.format(int(sigma)),
linewidth=10)
plt.yticks([0, .01])
plt.xlabel(r'Order $\lambda$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\lambda$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'rdp_flow1.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.legend(loc=0, fontsize=13)
plt.show()
def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders,
target_answered):
rdp_cum = np.zeros(len(orders))
answered = 0
for i, v in enumerate(votes):
v = sorted(v, reverse=True)
q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
logq_step2 = pate.compute_logq_gaussian(v, sigma2)
rdp = pate.rdp_gaussian(logq_step2, sigma2, orders)
rdp_cum += q_step1 * rdp
answered += q_step1
if answered >= target_answered:
print('Processed {} queries to answer {}.'.format(i, target_answered))
return rdp_cum
assert False, 'Never reached {} answered queries.'.format(target_answered)
def plot_rdp_total(votes, sigmas):
orders = np.linspace(1., 100., endpoint=True, num=100)
orders[0] = 1.5
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
for sigma in sigmas:
rdp = compute_rdp_curve(votes, 5000, 1000, sigma, orders, 2000)
ax.plot(
orders,
rdp,
alpha=.8,
label=r'$\sigma$={}'.format(int(sigma)),
linewidth=5)
plt.xlabel(r'Order $\lambda$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\lambda$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'rdp_flow2.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.legend(loc=0, fontsize=13)
plt.show()
def main(argv):
del argv # Unused.
fin_name = os.path.expanduser(FLAGS.counts_file)
print('Reading raw votes from ' + fin_name)
sys.stdout.flush()
votes = np.load(fin_name)
votes = votes[:12000,] # truncate to 12000 samples
v1 = [2550, 2200, 250] # based on votes[2,]
v2 = [2600, 2200, 200] # based on votes[381,]
plot_rdp_curve_per_example(np.array([v1, v2]), (100., 150.))
plot_rdp_total(votes, (100., 150.))
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Performs privacy analysis of GNMax with smooth sensitivity.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
Several flavors of the GNMax algorithm can be analyzed.
- Plain GNMax (argmax w/ Gaussian noise) is assumed when arguments threshold
and sigma2 are missing.
- Confident GNMax (thresholding + argmax w/ Gaussian noise) is used when
threshold, sigma1, and sigma2 are given.
- Interactive GNMax (two- or multi-round) is triggered by specifying
baseline_file, which provides baseline values for votes selection in Step 1.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import numpy as np
import core as pate
import smooth_sensitivity as pate_ss
FLAGS = flags.FLAGS
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('baseline_file', None, 'File with baseline scores.')
flags.DEFINE_boolean('data_independent', False,
'Force data-independent bounds.')
flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).')
flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).')
flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).')
flags.DEFINE_integer('queries', None, 'Number of queries made by the student.')
flags.DEFINE_float('delta', 1e-8, 'Target delta.')
flags.DEFINE_float(
'order', None,
'Fixes a Renyi DP order (if unspecified, finds an optimal order from a '
'hardcoded list).')
flags.DEFINE_integer(
'teachers', None,
'Number of teachers (if unspecified, derived from the counts file).')
flags.mark_flag_as_required('counts_file')
flags.mark_flag_as_required('sigma2')
def _check_conditions(sigma, num_classes, orders):
"""Symbolic-numeric verification of conditions C5 and C6.
The conditions on the beta function are verified by constructing the beta
function symbolically, and then checking that its derivative (computed
symbolically) is non-negative within the interval of conjectured monotonicity.
The last check is performed numerically.
"""
print('Checking conditions C5 and C6 for all orders.')
sys.stdout.flush()
conditions_hold = True
for order in orders:
cond5, cond6 = pate_ss.check_conditions(sigma, num_classes, order)
conditions_hold &= cond5 and cond6
if not cond5:
print('Condition C5 does not hold for order =', order)
elif not cond6:
print('Condition C6 does not hold for order =', order)
if conditions_hold:
print('Conditions C5-C6 hold for all orders.')
sys.stdout.flush()
return conditions_hold
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
data_ind):
"""Computes the (data-dependent) RDP curve for Confident GNMax."""
rdp_cum = np.zeros(len(orders))
rdp_sqrd_cum = np.zeros(len(orders))
answered = 0
for i, v in enumerate(votes):
if threshold is None:
logq_step1 = 0 # No thresholding, always proceed to step 2.
rdp_step1 = np.zeros(len(orders))
else:
logq_step1 = pate.compute_logpr_answered(threshold, sigma1,
v - baseline[i,])
if data_ind:
rdp_step1 = pate.compute_rdp_data_independent_threshold(sigma1, orders)
else:
rdp_step1 = pate.compute_rdp_threshold(logq_step1, sigma1, orders)
if data_ind:
rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
else:
logq_step2 = pate.compute_logq_gaussian(v, sigma2)
rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)
q_step1 = np.exp(logq_step1)
rdp = rdp_step1 + rdp_step2 * q_step1
# The expression below evaluates
# E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
rdp_sqrd = (
rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
q_step1 * rdp_step2**2)
rdp_sqrd_cum += rdp_sqrd
rdp_cum += rdp
answered += q_step1
if ((i + 1) % 1000 == 0) or (i == votes.shape[0] - 1):
rdp_var = rdp_sqrd_cum / i - (
rdp_cum / i)**2 # Ignore Bessel's correction.
eps_total, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
order_opt_idx = np.searchsorted(orders, order_opt)
eps_std = ((i + 1) * rdp_var[order_opt_idx])**.5 # Std of the sum.
print(
'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
'at order = {:.2f} (contribution from delta = {:.3f})'.format(
i + 1, answered, eps_total, eps_std, order_opt,
-math.log(delta) / (order_opt - 1)))
sys.stdout.flush()
_, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
return order_opt
def _find_optimal_smooth_sensitivity_parameters(
votes, baseline, num_teachers, threshold, sigma1, sigma2, delta, ind_step1,
ind_step2, order):
"""Optimizes smooth sensitivity parameters by minimizing a cost function.
The cost function is
exact_eps + cost of GNSS + two stds of noise,
which captures that upper bound of the confidence interval of the sanitized
privacy budget.
Since optimization is done with full view of sensitive data, the results
cannot be released.
"""
rdp_cum = 0
answered_cum = 0
ls_cum = 0
# Define a plausible range for the beta values.
betas = np.arange(.3 / order, .495 / order, .01 / order)
cost_delta = math.log(1 / delta) / (order - 1)
for i, v in enumerate(votes):
if threshold is None:
log_pr_answered = 0
rdp1 = 0
ls_step1 = np.zeros(num_teachers)
else:
log_pr_answered = pate.compute_logpr_answered(threshold, sigma1,
v - baseline[i,])
if ind_step1: # apply data-independent bound for step 1 (thresholding).
rdp1 = pate.compute_rdp_data_independent_threshold(sigma1, order)
ls_step1 = np.zeros(num_teachers)
else:
rdp1 = pate.compute_rdp_threshold(log_pr_answered, sigma1, order)
ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
v - baseline[i,], num_teachers, threshold, sigma1, order)
pr_answered = math.exp(log_pr_answered)
answered_cum += pr_answered
if ind_step2: # apply data-independent bound for step 2 (GNMax).
rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
ls_step2 = np.zeros(num_teachers)
else:
logq_step2 = pate.compute_logq_gaussian(v, sigma2)
rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
# Compute smooth sensitivity.
ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
v, num_teachers, sigma2, order)
rdp_cum += rdp1 + pr_answered * rdp2
ls_cum += ls_step1 + pr_answered * ls_step2 # Expected local sensitivity.
if ind_step1 and ind_step2:
# Data-independent bounds.
cost_opt, beta_opt, ss_opt, sigma_ss_opt = None, 0., 0., np.inf
else:
# Data-dependent bounds.
cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None
for beta in betas:
ss = pate_ss.compute_discounted_max(beta, ls_cum)
# Solution to the minimization problem:
# min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1 / 3)
cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
beta, sigma_ss, order)
# Cost captures exact_eps + cost of releasing SS + two stds of noise.
cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
if cost < cost_opt:
cost_opt, beta_opt, ss_opt, sigma_ss_opt = cost, beta, ss, sigma_ss
if ((i + 1) % 100 == 0) or (i == votes.shape[0] - 1):
eps_before_ss = rdp_cum + cost_delta
eps_with_ss = (
eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
beta_opt, sigma_ss_opt, order))
print('{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
'{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'.
format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
sys.stdout.flush()
# Return optimal parameters for the last iteration.
return beta_opt, ss_opt, sigma_ss_opt
####################
# HELPER FUNCTIONS #
####################
def _load_votes(counts_file, baseline_file, queries):
counts_file_expanded = os.path.expanduser(counts_file)
print('Reading raw votes from ' + counts_file_expanded)
sys.stdout.flush()
votes = np.load(counts_file_expanded)
print('Shape of the votes matrix = {}'.format(votes.shape))
if baseline_file is not None:
baseline_file_expanded = os.path.expanduser(baseline_file)
print('Reading baseline values from ' + baseline_file_expanded)
sys.stdout.flush()
baseline = np.load(baseline_file_expanded)
if votes.shape != baseline.shape:
raise ValueError(
'Counts file and baseline file must have the same shape. Got {} and '
'{} instead.'.format(votes.shape, baseline.shape))
else:
baseline = np.zeros_like(votes)
if queries is not None:
if votes.shape[0] < queries:
raise ValueError('Expect {} rows, got {} in {}'.format(
queries, votes.shape[0], counts_file))
# Truncate the votes matrix to the number of queries made.
votes = votes[:queries,]
baseline = baseline[:queries,]
else:
print('Process all {} input rows. (Use --queries flag to truncate.)'.format(
votes.shape[0]))
return votes, baseline
def _count_teachers(votes):
s = np.sum(votes, axis=1)
num_teachers = int(max(s))
if min(s) != num_teachers:
raise ValueError(
'Matrix of votes is malformed: the number of votes is not the same '
'across rows.')
return num_teachers
def _is_data_ind_step1(num_teachers, threshold, sigma1, orders):
if threshold is None:
return True
return np.all(
pate.is_data_independent_always_opt_threshold(num_teachers, threshold,
sigma1, orders))
def _is_data_ind_step2(num_teachers, num_classes, sigma, orders):
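  """Returns True if the data-independent GNMax bound is always optimal (step 2)."""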
return np.all(
pate.is_data_independent_always_opt_gaussian(num_teachers, num_classes,
sigma, orders))
def main(argv):
del argv # Unused.
if (FLAGS.threshold is None) != (FLAGS.sigma1 is None):
raise ValueError(
'--threshold flag and --sigma1 flag must be present or absent '
'simultaneously.')
if FLAGS.order is None:
# Long list of orders.
orders = np.concatenate((np.arange(2, 100 + 1, .5),
np.logspace(np.log10(100), np.log10(500),
num=100)))
# Short list of orders.
# orders = np.round(
# np.concatenate((np.arange(2, 50 + 1, 1),
# np.logspace(np.log10(50), np.log10(1000), num=20))))
else:
orders = np.array([FLAGS.order])
votes, baseline = _load_votes(FLAGS.counts_file, FLAGS.baseline_file,
FLAGS.queries)
if FLAGS.teachers is None:
num_teachers = _count_teachers(votes)
else:
num_teachers = FLAGS.teachers
num_classes = votes.shape[1]
order = _compute_rdp(votes, baseline, FLAGS.threshold, FLAGS.sigma1,
FLAGS.sigma2, FLAGS.delta, orders,
FLAGS.data_independent)
ind_step1 = _is_data_ind_step1(num_teachers, FLAGS.threshold, FLAGS.sigma1,
order)
ind_step2 = _is_data_ind_step2(num_teachers, num_classes, FLAGS.sigma2, order)
if FLAGS.data_independent or (ind_step1 and ind_step2):
print('Nothing to do here, all analyses are data-independent.')
return
if not _check_conditions(FLAGS.sigma2, num_classes, [order]):
return # Quit early: sufficient conditions for correctness fail to hold.
beta_opt, ss_opt, sigma_ss_opt = _find_optimal_smooth_sensitivity_parameters(
votes, baseline, num_teachers, FLAGS.threshold, FLAGS.sigma1,
FLAGS.sigma2, FLAGS.delta, ind_step1, ind_step2, order)
print('Optimal beta = {:.4f}, E[SS_beta] = {:.4}, sigma_ss = {:.2f}'.format(
beta_opt, ss_opt, sigma_ss_opt))
if __name__ == '__main__':
app.run(main)
Implementation of an RDP privacy accountant and smooth sensitivity analysis for
the PATE framework. The underlying theory and supporting experiments appear in
"Scalable Private Learning with PATE" by Nicolas Papernot, Shuang Song, Ilya
Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar Erlingsson (ICLR 2018,
https://arxiv.org/abs/1802.08908).
## Overview
The PATE ('Private Aggregation of Teacher Ensembles') framework was introduced
by Papernot et al. in "Semi-supervised Knowledge Transfer for Deep Learning from
Private Training Data" (ICLR 2017, https://arxiv.org/abs/1610.05755). The
framework enables model-agnostic training that provably provides [differential
privacy](https://en.wikipedia.org/wiki/Differential_privacy) of the training
dataset.
The framework consists of _teachers_, the _student_ model, and the _aggregator_. The
teachers are models trained on disjoint subsets of the sensitive training data. The student
model has access to an insensitive (i.e., public) unlabelled dataset, which is labelled by
interacting with the ensemble of teachers via the _aggregator_. The aggregator tallies
outputs of the teacher models, and either forwards a (noisy) aggregate to the student, or
refuses to answer.
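For intuition, the noisy-argmax step performed by a GNMax-style aggregator can be sketched in a
few lines of NumPy. The helper and the numbers below are purely illustrative and are not part of
this repository:

```python
import numpy as np

def gnmax_aggregate(votes, sigma):
  """Returns the argmax class after adding Gaussian noise to each vote count."""
  noisy_counts = votes + np.random.normal(scale=sigma, size=votes.shape)
  return int(np.argmax(noisy_counts))

votes = np.array([48., 2., 0., 0.])  # e.g., 50 teachers voting over 4 classes
label = gnmax_aggregate(votes, sigma=40.)
```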
Differential privacy is enforced by the aggregator. The privacy guarantees can be _data-independent_,
which means that they are solely a function of the aggregator's parameters. Alternatively, the privacy
analysis can be _data-dependent_, which allows for finer reasoning: under certain conditions on the
input distribution, the final privacy guarantees can be improved relative to the data-independent
analysis. Data-dependent privacy guarantees may themselves be a function of sensitive data, and
therefore publishing them requires a sanitization procedure of its own. In our case, sanitization of
data-dependent privacy guarantees proceeds via _smooth sensitivity_ analysis.
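As a rough sketch of what that sanitization involves, the functions in `smooth_sensitivity.py` can be
combined as follows. All numeric values below (votes, noise scales, beta, and the R&eacute;nyi order,
which is explained in the next paragraph) are illustrative, and the snippet assumes that `core.py` and
`smooth_sensitivity.py` are importable from the working directory:

```python
import numpy as np
import smooth_sensitivity as pate_ss

votes = np.array([48., 2., 0., 0.])  # One query: 50 teachers, 4 classes.
num_teachers, sigma, order, beta = 50, 40., 20., .01

# Upper bounds on the local sensitivity of the RDP bound at every distance d.
ls = pate_ss.compute_local_sensitivity_bounds_gnmax(votes, num_teachers, sigma, order)
# Smooth sensitivity is the beta-discounted maximum: max_d exp(-beta * d) * ls[d].
ss = pate_ss.compute_discounted_max(beta, ls)
# RDP cost of releasing the data-dependent guarantee with Gaussian noise whose
# scale is proportional to the smooth sensitivity.
sigma_ss = 5.
release_cost = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(beta, sigma_ss, order)
```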
The common machinery used for all privacy analyses in this repository is the
R&eacute;nyi differential privacy, or RDP (see https://arxiv.org/abs/1702.07476).
This repository contains implementations of privacy accountants and smooth
sensitivity analysis for several data-independent and data-dependent mechanisms that together
comprise the PATE framework.
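As a minimal usage sketch of the accountant (again with made-up votes, noise scale, and delta, and
assuming `core.py` is on the import path), a single GNMax query can be analyzed like this:

```python
import numpy as np
import core as pate

votes = np.array([48., 2., 0., 0.])  # One query's teacher votes.
sigma = 40.                          # Scale of the Gaussian noise in GNMax.
orders = np.arange(2., 200., .5)     # Renyi orders to track.

logq = pate.compute_logq_gaussian(votes, sigma)  # log Pr[outcome != argmax].
rdp = pate.rdp_gaussian(logq, sigma, orders)     # Data-dependent RDP of this query.
eps, opt_order = pate.compute_eps_from_delta(orders, rdp, delta=1e-8)
```

In practice the per-query RDP vectors are summed over all answered queries before the final
conversion to an (eps, delta) guarantee.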
### Requirements
* Python, version &ge; 2.7
* absl (see [here](https://github.com/abseil/abseil-py), or just type `pip install absl-py`)
* numpy
* scipy
* sympy (for smooth sensitivity analysis)
* unittest (for testing)
### Self-testing
To verify the installation run
```bash
$ python core_test.py
$ python smooth_sensitivity_test.py
```
## Files in this directory
* core.py --- RDP privacy accountant for several vote aggregators (GNMax,
Threshold, Laplace).
* smooth_sensitivity.py --- Smooth sensitivity analysis for GNMax and
Threshold mechanisms.
* core_test.py and smooth_sensitivity_test.py --- Unit tests for the
files above.
## Contact information
You may direct your comments to mironov@google.com and PRs to @ilyamironov.
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Core functions for RDP analysis in PATE framework.
This library comprises the core functions for doing differentially private
analysis of the PATE architecture and its various Noisy Max and other
mechanisms.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from absl import app
import numpy as np
import scipy.stats
def _logaddexp(x):
"""Addition in the log space. Analogue of numpy.logaddexp for a list."""
m = max(x)
return m + math.log(sum(np.exp(x - m)))
def _log1mexp(x):
"""Numerically stable computation of log(1-exp(x))."""
if x < -1:
return math.log1p(-math.exp(x))
elif x < 0:
return math.log(-math.expm1(x))
elif x == 0:
return -np.inf
else:
raise ValueError("Argument must be non-positive.")
def compute_eps_from_delta(orders, rdp, delta):
"""Translates between RDP and (eps, delta)-DP.
Args:
orders: A list (or a scalar) of orders.
rdp: A list of RDP guarantees (of the same length as orders).
delta: Target delta.
Returns:
Pair of (eps, optimal_order).
Raises:
ValueError: If input is malformed.
"""
if len(orders) != len(rdp):
raise ValueError("Input lists must have the same length.")
eps = np.array(rdp) - math.log(delta) / (np.array(orders) - 1)
idx_opt = np.argmin(eps)
return eps[idx_opt], orders[idx_opt]
#####################
# RDP FOR THE GNMAX #
#####################
def compute_logq_gaussian(counts, sigma):
"""Returns an upper bound on ln Pr[outcome != argmax] for GNMax.
Implementation of Proposition 7.
Args:
counts: A numpy array of scores.
sigma: The standard deviation of the Gaussian noise in the GNMax mechanism.
Returns:
logq: Natural log of the probability that outcome is different from argmax.
"""
n = len(counts)
variance = sigma**2
idx_max = np.argmax(counts)
counts_normalized = counts[idx_max] - counts
counts_rest = counts_normalized[np.arange(n) != idx_max] # exclude one index
# Upper bound q via a union bound rather than a more precise calculation.
logq = _logaddexp(
scipy.stats.norm.logsf(counts_rest, scale=math.sqrt(2 * variance)))
# A sketch of a more accurate estimate, which is currently disabled for two
# reasons:
# 1. Numerical instability;
# 2. Not covered by smooth sensitivity analysis.
# covariance = variance * (np.ones((n - 1, n - 1)) + np.identity(n - 1))
# logq = np.log1p(-statsmodels.sandbox.distributions.extras.mvnormcdf(
# counts_rest, np.zeros(n - 1), covariance, maxpts=1e4))
return min(logq, math.log(1 - (1 / n)))
def rdp_data_independent_gaussian(sigma, orders):
"""Computes a data-independent RDP curve for GNMax.
Implementation of Proposition 8.
Args:
sigma: Standard deviation of Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
    Upper bound on RDP for all orders. A scalar if orders is a scalar.
Raises:
ValueError: If the input is malformed.
"""
if sigma < 0 or np.any(orders <= 1): # not defined for alpha=1
raise ValueError("Inputs are malformed.")
variance = sigma**2
if np.isscalar(orders):
return orders / variance
else:
return np.atleast_1d(orders) / variance
def rdp_gaussian(logq, sigma, orders):
"""Bounds RDP from above of GNMax given an upper bound on q (Theorem 6).
Args:
logq: Natural logarithm of the probability of a non-argmax outcome.
sigma: Standard deviation of Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
    Upper bound on RDP for all orders. A scalar if orders is a scalar.
Raises:
ValueError: If the input is malformed.
"""
if logq > 0 or sigma < 0 or np.any(orders <= 1): # not defined for alpha=1
raise ValueError("Inputs are malformed.")
if np.isneginf(logq): # If the mechanism's output is fixed, it has 0-DP.
if np.isscalar(orders):
return 0.
else:
return np.full_like(orders, 0., dtype=np.float)
variance = sigma**2
# Use two different higher orders: mu_hi1 and mu_hi2 computed according to
# Proposition 10.
mu_hi2 = math.sqrt(variance * -logq)
mu_hi1 = mu_hi2 + 1
orders_vec = np.atleast_1d(orders)
ret = orders_vec / variance # baseline: data-independent bound
# Filter out entries where data-dependent bound does not apply.
mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1)
rdp_hi1 = mu_hi1 / variance
rdp_hi2 = mu_hi2 / variance
log_a2 = (mu_hi2 - 1) * rdp_hi2
# Make sure q is in the increasing wrt q range and A is positive.
if (np.any(mask) and logq <= log_a2 - mu_hi2 *
(math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))) and
-logq > rdp_hi2):
# Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when x ~ 0.
log1q = _log1mexp(logq) # log1q = log(1-q)
log_a = (orders - 1) * (
log1q - _log1mexp((logq + rdp_hi2) * (1 - 1 / mu_hi2)))
log_b = (orders - 1) * (rdp_hi1 - logq / (mu_hi1 - 1))
# Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large x, y.
log_s = np.logaddexp(log1q + log_a, logq + log_b)
ret[mask] = np.minimum(ret, log_s / (orders - 1))[mask]
assert np.all(ret >= 0)
if np.isscalar(orders):
return np.asscalar(ret)
else:
return ret
def is_data_independent_always_opt_gaussian(num_teachers, num_classes, sigma,
orders):
"""Tests whether data-ind bound is always optimal for GNMax.
Args:
num_teachers: Number of teachers.
num_classes: Number of classes.
sigma: Standard deviation of the Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Boolean array of length |orders| (a scalar if orders is a scalar). True if
the data-independent bound is always the same as the data-dependent bound.
"""
unanimous = np.array([num_teachers] + [0] * (num_classes - 1))
logq = compute_logq_gaussian(unanimous, sigma)
rdp_dep = rdp_gaussian(logq, sigma, orders)
rdp_ind = rdp_data_independent_gaussian(sigma, orders)
return np.isclose(rdp_dep, rdp_ind)
###################################
# RDP FOR THE THRESHOLD MECHANISM #
###################################
def compute_logpr_answered(t, sigma, counts):
"""Computes log of the probability that a noisy threshold is crossed.
Args:
t: The threshold.
sigma: The stdev of the Gaussian noise added to the threshold.
counts: An array of votes.
Returns:
Natural log of the probability that max is larger than a noisy threshold.
"""
# Compared to the paper, max(counts) is rounded to the nearest integer. This
# is done to facilitate computation of smooth sensitivity for the case of
# the interactive mechanism, where votes are not necessarily integer.
return scipy.stats.norm.logsf(t - round(max(counts)), scale=sigma)
def compute_rdp_data_independent_threshold(sigma, orders):
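  """Computes the data-independent RDP bound for the threshold mechanism."""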
# The input to the threshold mechanism has stability 1, compared to
# GNMax, which has stability = 2. Hence the sqrt(2) factor below.
return rdp_data_independent_gaussian(2**.5 * sigma, orders)
def compute_rdp_threshold(log_pr_answered, sigma, orders):
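  """Computes a data-dependent RDP bound for the threshold mechanism."""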
logq = min(log_pr_answered, _log1mexp(log_pr_answered))
# The input to the threshold mechanism has stability 1, compared to
# GNMax, which has stability = 2. Hence the sqrt(2) factor below.
return rdp_gaussian(logq, 2**.5 * sigma, orders)
def is_data_independent_always_opt_threshold(num_teachers, threshold, sigma,
orders):
"""Tests whether data-ind bound is always optimal for the threshold mechanism.
Args:
num_teachers: Number of teachers.
threshold: The cut-off threshold.
sigma: Standard deviation of the Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Boolean array of length |orders| (a scalar if orders is a scalar). True if
the data-independent bound is always the same as the data-dependent bound.
"""
# Since the data-dependent bound depends only on max(votes), it suffices to
# check whether the data-dependent bounds are better than data-independent
# bounds in the extreme cases when max(votes) is minimal or maximal.
# For both Confident GNMax and Interactive GNMax it holds that
# 0 <= max(votes) <= num_teachers.
# The upper bound is trivial in both cases.
# The lower bound is trivial for Confident GNMax (and a stronger one, based on
# the pigeonhole principle, is possible).
# For Interactive GNMax (Algorithm 2), the lower bound follows from the
# following argument. Since the votes vector is the difference between the
# actual teachers' votes and the student's baseline, we need to argue that
# max(n_j - M * p_j) >= 0.
# The bound holds because sum_j n_j = sum M * p_j = M. Thus,
# sum_j (n_j - M * p_j) = 0, and max_j (n_j - M * p_j) >= 0 as needed.
logq1 = compute_logpr_answered(threshold, sigma, [0])
logq2 = compute_logpr_answered(threshold, sigma, [num_teachers])
rdp_dep1 = compute_rdp_threshold(logq1, sigma, orders)
rdp_dep2 = compute_rdp_threshold(logq2, sigma, orders)
rdp_ind = compute_rdp_data_independent_threshold(sigma, orders)
  return np.logical_and(
      np.isclose(rdp_dep1, rdp_ind), np.isclose(rdp_dep2, rdp_ind))
#############################
# RDP FOR THE LAPLACE NOISE #
#############################
def compute_logq_laplace(counts, lmbd):
"""Computes an upper bound on log Pr[outcome != argmax] for LNMax.
Args:
counts: A list of scores.
lmbd: The lambda parameter of the Laplace distribution ~exp(-|x| / lambda).
Returns:
logq: Natural log of the probability that outcome is different from argmax.
"""
# For noisy max, we only get an upper bound via the union bound. See Lemma 4
# in https://arxiv.org/abs/1610.05755.
#
  # Pr[j beats i*] <= (2 + gap(j, i*)) / 4 * exp(-gap(j, i*)),
  # where gap(j, i*) = (counts[i*] - counts[j]) / lambda >= 0;
  # proof at http://mathoverflow.net/questions/66763/
idx_max = np.argmax(counts)
counts_normalized = (counts - counts[idx_max]) / lmbd
counts_rest = np.array(
[counts_normalized[i] for i in range(len(counts)) if i != idx_max])
logq = _logaddexp(np.log(2 - counts_rest) + math.log(.25) + counts_rest)
return min(logq, math.log(1 - (1 / len(counts))))
def rdp_pure_eps(logq, pure_eps, orders):
"""Computes the RDP value given logq and pure privacy eps.
Implementation of https://arxiv.org/abs/1610.05755, Theorem 3.
The bound used is the min of three terms. The first term is from
https://arxiv.org/pdf/1605.02065.pdf.
The second term is based on the fact that when event has probability (1-q) for
q close to zero, q can only change by exp(eps), which corresponds to a
much smaller multiplicative change in (1-q)
The third term comes directly from the privacy guarantee.
Args:
logq: Natural logarithm of the probability of a non-optimal outcome.
pure_eps: eps parameter for DP
orders: array_like list of moments to compute.
Returns:
Array of upper bounds on rdp (a scalar if orders is a scalar).
"""
orders_vec = np.atleast_1d(orders)
q = math.exp(logq)
log_t = np.full_like(orders_vec, np.inf)
if q <= 1 / (math.exp(pure_eps) + 1):
logt_one = math.log1p(-q) + (
math.log1p(-q) - _log1mexp(pure_eps + logq)) * (
orders_vec - 1)
logt_two = logq + pure_eps * (orders_vec - 1)
log_t = np.logaddexp(logt_one, logt_two)
ret = np.minimum(
np.minimum(0.5 * pure_eps * pure_eps * orders_vec,
log_t / (orders_vec - 1)), pure_eps)
if np.isscalar(orders):
return np.asscalar(ret)
else:
return ret
def main(argv):
del argv # Unused.
if __name__ == "__main__":
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for google3.experimental.brain.privacy.pate.pate."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unittest
import numpy as np
import core as pate
class PateTest(unittest.TestCase):
def _test_rdp_gaussian_value_errors(self):
# Test for ValueErrors.
with self.assertRaises(ValueError):
pate.rdp_gaussian(1.0, 1.0, np.array([2, 3, 4]))
with self.assertRaises(ValueError):
pate.rdp_gaussian(np.log(0.5), -1.0, np.array([2, 3, 4]))
with self.assertRaises(ValueError):
pate.rdp_gaussian(np.log(0.5), 1.0, np.array([1, 3, 4]))
def _test_rdp_gaussian_as_function_of_q(self):
# Test for data-independent and data-dependent ranges over q.
# The following corresponds to orders 1.1, 2.5, 32, 250
# sigmas 1.5, 15, 1500, 15000.
# Hand calculated -log(q0)s arranged in a 'sigma major' ordering.
neglogq0s = [
2.8, 2.6, 427, None, 4.8, 4.0, 4.7, 275, 9.6, 8.8, 6.0, 4, 12, 11.2,
8.6, 6.4
]
idx_neglogq0s = 0 # To iterate through neglogq0s.
orders = [1.1, 2.5, 32, 250]
sigmas = [1.5, 15, 1500, 15000]
for sigma in sigmas:
for order in orders:
curr_neglogq0 = neglogq0s[idx_neglogq0s]
idx_neglogq0s += 1
if curr_neglogq0 is None: # sigma == 1.5 and order == 250:
continue
rdp_at_q0 = pate.rdp_gaussian(-curr_neglogq0, sigma, order)
# Data-dependent range. (Successively halve the value of q.)
logq_dds = (-curr_neglogq0 - np.array(
[0, np.log(2), np.log(4), np.log(8)]))
# Check that in q_dds, rdp is decreasing.
for idx in range(len(logq_dds) - 1):
self.assertGreater(
pate.rdp_gaussian(logq_dds[idx], sigma, order),
pate.rdp_gaussian(logq_dds[idx + 1], sigma, order))
# Data-independent range.
q_dids = np.exp(-curr_neglogq0) + np.array([0.1, 0.2, 0.3, 0.4])
# Check that in q_dids, rdp is constant.
for q in q_dids:
self.assertEqual(rdp_at_q0, pate.rdp_gaussian(
np.log(q), sigma, order))
def _test_compute_eps_from_delta_value_error(self):
# Test for ValueError.
with self.assertRaises(ValueError):
pate.compute_eps_from_delta([1.1, 2, 3, 4], [1, 2, 3], 0.001)
def _test_compute_eps_from_delta_monotonicity(self):
# Test for monotonicity with respect to delta.
orders = [1.1, 2.5, 250.0]
sigmas = [1e-3, 1.0, 1e5]
deltas = [1e-60, 1e-6, 0.1, 0.999]
for sigma in sigmas:
list_of_eps = []
rdps_for_gaussian = np.array(orders) / (2 * sigma**2)
for delta in deltas:
list_of_eps.append(
pate.compute_eps_from_delta(orders, rdps_for_gaussian, delta)[0])
# Check that in list_of_eps, epsilons are decreasing (as delta increases).
sorted_list_of_eps = list(list_of_eps)
sorted_list_of_eps.sort(reverse=True)
self.assertEqual(list_of_eps, sorted_list_of_eps)
def _test_compute_q0(self):
# Stub code to search a logq space and figure out logq0 by eyeballing
# results. This code does not run with the tests. Remove underscore to run.
sigma = 15
order = 250
logqs = np.arange(-290, -270, 1)
count = 0
for logq in logqs:
count += 1
sys.stdout.write("\t%0.5g: %0.10g" %
(logq, pate.rdp_gaussian(logq, sigma, order)))
sys.stdout.flush()
if count % 5 == 0:
print("")
def test_rdp_gaussian(self):
self._test_rdp_gaussian_value_errors()
self._test_rdp_gaussian_as_function_of_q()
def test_compute_eps_from_delta(self):
self._test_compute_eps_from_delta_value_error()
self._test_compute_eps_from_delta_monotonicity()
if __name__ == "__main__":
unittest.main()
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions for smooth sensitivity analysis for PATE mechanisms.
This library implements functionality for doing smooth sensitivity analysis
for Gaussian Noise Max (GNMax), Threshold with Gaussian noise, and Gaussian
Noise with Smooth Sensitivity (GNSS) mechanisms.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from absl import app
import numpy as np
import scipy
import sympy as sp
import core as pate
################################
# SMOOTH SENSITIVITY FOR GNMAX #
################################
# Global dictionary for storing cached q0 values keyed by (sigma, order).
_logq0_cache = {}
def _compute_logq0(sigma, order):
key = (sigma, order)
if key in _logq0_cache:
return _logq0_cache[key]
logq0 = compute_logq0_gnmax(sigma, order)
_logq0_cache[key] = logq0 # Update the global variable.
return logq0
def _compute_logq1(sigma, order, num_classes):
logq0 = _compute_logq0(sigma, order) # Most likely already cached.
logq1 = math.log(_compute_bl_gnmax(math.exp(logq0), sigma, num_classes))
assert logq1 <= logq0
return logq1
def _compute_mu1_mu2_gnmax(sigma, logq):
# Computes mu1, mu2 according to Proposition 10.
mu2 = sigma * math.sqrt(-logq)
mu1 = mu2 + 1
return mu1, mu2
def _compute_data_dep_bound_gnmax(sigma, logq, order):
# Applies Theorem 6 in Appendix without checking that logq satisfies necessary
# constraints. The pre-conditions must be assured by comparing logq against
# logq0 by the caller.
variance = sigma**2
mu1, mu2 = _compute_mu1_mu2_gnmax(sigma, logq)
eps1 = mu1 / variance
eps2 = mu2 / variance
log1q = np.log1p(-math.exp(logq)) # log1q = log(1-q)
log_a = (order - 1) * (
log1q - (np.log1p(-math.exp((logq + eps2) * (1 - 1 / mu2)))))
log_b = (order - 1) * (eps1 - logq / (mu1 - 1))
return np.logaddexp(log1q + log_a, logq + log_b) / (order - 1)
def _compute_rdp_gnmax(sigma, logq, order):
logq0 = _compute_logq0(sigma, order)
if logq >= logq0:
return pate.rdp_data_independent_gaussian(sigma, order)
else:
return _compute_data_dep_bound_gnmax(sigma, logq, order)
def compute_logq0_gnmax(sigma, order):
"""Computes the point where we start using data-independent bounds.
Args:
sigma: std of the Gaussian noise
order: Renyi order lambda
Returns:
logq0: the point above which the data-ind bound overtakes data-dependent
bound.
"""
def _check_validity_conditions(logq):
# Function returns true iff logq is in the range where data-dependent bound
# is valid. (Theorem 6 in Appendix.)
mu1, mu2 = _compute_mu1_mu2_gnmax(sigma, logq)
if mu1 < order:
return False
eps2 = mu2 / sigma**2
# Do computation in the log space. The condition below comes from Lemma 9
# from Appendix.
return (logq <= (mu2 - 1) * eps2 - mu2 * math.log(mu1 / (mu1 - 1) * mu2 /
(mu2 - 1)))
def _compare_dep_vs_ind(logq):
return (_compute_data_dep_bound_gnmax(sigma, logq, order) -
pate.rdp_data_independent_gaussian(sigma, order))
# Natural upper bounds on q0.
logub = min(-(1 + 1. / sigma)**2, -((order - 1) / sigma)**2, -1 / sigma**2)
assert _check_validity_conditions(logub)
# If data-dependent bound is already better, we are done already.
if _compare_dep_vs_ind(logub) < 0:
return logub
# Identifying a reasonable lower bound to bracket logq0.
loglb = 2 * logub # logub is negative, and thus loglb < logub.
while _compare_dep_vs_ind(loglb) > 0:
assert loglb > -10000, "The lower bound on q0 is way too low."
loglb *= 1.5
logq0, r = scipy.optimize.brentq(
_compare_dep_vs_ind, loglb, logub, full_output=True)
assert r.converged, "The root finding procedure failed to converge."
assert _check_validity_conditions(logq0) # just in case.
return logq0
def _compute_bl_gnmax(q, sigma, num_classes):
return ((num_classes - 1) / 2 * scipy.special.erfc(
1 / sigma + scipy.special.erfcinv(2 * q / (num_classes - 1))))
def _compute_bu_gnmax(q, sigma, num_classes):
return min(1, (num_classes - 1) / 2 * scipy.special.erfc(
-1 / sigma + scipy.special.erfcinv(2 * q / (num_classes - 1))))
def _compute_local_sens_gnmax(logq, sigma, num_classes, order):
"""Implements Algorithm 3 (computes an upper bound on local sensitivity).
(See Proposition 13 for proof of correctness.)
"""
logq0 = _compute_logq0(sigma, order)
logq1 = _compute_logq1(sigma, order, num_classes)
if logq1 <= logq <= logq0:
logq = logq1
beta = _compute_rdp_gnmax(sigma, logq, order)
beta_bu_q = _compute_rdp_gnmax(
sigma, math.log(_compute_bu_gnmax(math.exp(logq), sigma, num_classes)),
order)
beta_bl_q = _compute_rdp_gnmax(
sigma, math.log(_compute_bl_gnmax(math.exp(logq), sigma, num_classes)),
order)
return max(beta_bu_q - beta, beta - beta_bl_q)
def compute_local_sensitivity_bounds_gnmax(votes, num_teachers, sigma, order):
"""Computes a list of max-LS-at-distance-d for the GNMax mechanism.
A more efficient implementation of Algorithms 4 and 5 working in time
O(teachers*classes). A naive implementation is O(teachers^2*classes) or worse.
Args:
votes: A numpy array of votes.
num_teachers: Total number of voting teachers.
    sigma: Standard deviation of the Gaussian noise.
order: The Renyi order.
Returns:
A numpy array of local sensitivities at distances d, 0 <= d <= num_teachers.
"""
num_classes = len(votes) # Called m in the paper.
logq0 = _compute_logq0(sigma, order)
logq1 = _compute_logq1(sigma, order, num_classes)
logq = pate.compute_logq_gaussian(votes, sigma)
plateau = _compute_local_sens_gnmax(logq1, sigma, num_classes, order)
res = np.full(num_teachers, plateau)
if logq1 <= logq <= logq0:
return res
# Invariant: votes is sorted in the non-increasing order.
votes = sorted(votes, reverse=True)
res[0] = _compute_local_sens_gnmax(logq, sigma, num_classes, order)
curr_d = 0
go_left = logq > logq0 # Otherwise logq < logq1 and we go right.
# Iterate while the following is true:
# 1. If we are going left, logq is still larger than logq0 and we may still
# increase the gap between votes[0] and votes[1].
# 2. If we are going right, logq is still smaller than logq1.
while ((go_left and logq > logq0 and votes[1] > 0) or
(not go_left and logq < logq1)):
curr_d += 1
if go_left: # Try decreasing logq.
votes[0] += 1
votes[1] -= 1
idx = 1
# Restore the invariant. (Can be implemented more efficiently by keeping
# track of the range of indices equal to votes[1]. Does not seem to matter
# for the overall running time.)
while idx < len(votes) - 1 and votes[idx] < votes[idx + 1]:
votes[idx], votes[idx + 1] = votes[idx + 1], votes[idx]
idx += 1
else: # Go right, i.e., try increasing logq.
votes[0] -= 1
votes[1] += 1 # The invariant holds since otherwise logq >= logq1.
logq = pate.compute_logq_gaussian(votes, sigma)
res[curr_d] = _compute_local_sens_gnmax(logq, sigma, num_classes, order)
return res
##################################################
# SMOOTH SENSITIVITY FOR THE THRESHOLD MECHANISM #
##################################################
# A global dictionary of RDPs for various threshold values. Indexed by a 4-tuple
# (num_teachers, threshold, sigma, order).
_rdp_thresholds = {}
def _compute_rdp_list_threshold(num_teachers, threshold, sigma, order):
key = (num_teachers, threshold, sigma, order)
if key in _rdp_thresholds:
return _rdp_thresholds[key]
res = np.zeros(num_teachers + 1)
for v in range(0, num_teachers + 1):
logp = scipy.stats.norm.logsf(threshold - v, scale=sigma)
res[v] = pate.compute_rdp_threshold(logp, sigma, order)
_rdp_thresholds[key] = res
return res
def compute_local_sensitivity_bounds_threshold(counts, num_teachers, threshold,
sigma, order):
"""Computes a list of max-LS-at-distance-d for the threshold mechanism."""
def _compute_ls(v):
ls_step_up, ls_step_down = None, None
if v > 0:
ls_step_down = abs(rdp_list[v - 1] - rdp_list[v])
if v < num_teachers:
ls_step_up = abs(rdp_list[v + 1] - rdp_list[v])
    # At least one of the two is defined; skip the missing one (Python 3's max()
    # does not accept None).
    return max(x for x in (ls_step_down, ls_step_up) if x is not None)
cur_max = int(round(max(counts)))
rdp_list = _compute_rdp_list_threshold(num_teachers, threshold, sigma, order)
ls = np.zeros(num_teachers)
for d in range(max(cur_max, num_teachers - cur_max)):
ls_up, ls_down = None, None
if cur_max + d <= num_teachers:
ls_up = _compute_ls(cur_max + d)
if cur_max - d >= 0:
ls_down = _compute_ls(cur_max - d)
    ls[d] = max(x for x in (ls_up, ls_down) if x is not None)
return ls
#############################################
# PROCEDURES FOR SMOOTH SENSITIVITY RELEASE #
#############################################
# A global dictionary of exponentially decaying arrays. Indexed by beta.
dict_beta_discount = {}
def compute_discounted_max(beta, a):
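  """Returns max over d of a[d] * exp(-beta * d), i.e., the beta-discounted maximum."""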
n = len(a)
if beta not in dict_beta_discount or (len(dict_beta_discount[beta]) < n):
dict_beta_discount[beta] = np.exp(-beta * np.arange(n))
return max(a * dict_beta_discount[beta][:n])
def compute_smooth_sensitivity_gnmax(beta, counts, num_teachers, sigma, order):
"""Computes smooth sensitivity of a single application of GNMax."""
  ls = compute_local_sensitivity_bounds_gnmax(counts, num_teachers, sigma,
                                              order)
return compute_discounted_max(beta, ls)
def compute_rdp_of_smooth_sensitivity_gaussian(beta, sigma, order):
"""Computes the RDP curve for the GNSS mechanism.
Implements Theorem 23 (https://arxiv.org/pdf/1802.08908.pdf).
"""
if beta > 0 and not 1 < order < 1 / (2 * beta):
raise ValueError("Order outside the (1, 1/(2*beta)) range.")
return order * math.exp(2 * beta) / sigma**2 + (
-.5 * math.log(1 - 2 * order * beta) + beta * order) / (
order - 1)
def compute_params_for_ss_release(eps, delta):
"""Computes sigma for additive Gaussian noise scaled by smooth sensitivity.
Presently not used. (We proceed via RDP analysis.)
Compute beta, sigma for applying Lemma 2.6 (full version of Nissim et al.) via
Lemma 2.10.
"""
# Rather than applying Lemma 2.10 directly, which would give suboptimal alpha,
# (see http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf),
# we extract a sufficient condition on alpha from its proof.
#
# Let a = rho_(delta/2)(Z_1). Then solve for alpha such that
# 2 alpha a + alpha^2 = eps/2.
a = scipy.special.ndtri(1 - delta / 2)
alpha = math.sqrt(a**2 + eps / 2) - a
beta = eps / (2 * scipy.special.chdtri(1, delta / 2))
return alpha, beta
#######################################################
# SYMBOLIC-NUMERIC VERIFICATION OF CONDITIONS C5--C6. #
#######################################################
def _construct_symbolic_beta(q, sigma, order):
mu2 = sigma * sp.sqrt(sp.log(1 / q))
mu1 = mu2 + 1
eps1 = mu1 / sigma**2
eps2 = mu2 / sigma**2
a = (1 - q) / (1 - (q * sp.exp(eps2))**(1 - 1 / mu2))
b = sp.exp(eps1) / q**(1 / (mu1 - 1))
s = (1 - q) * a**(order - 1) + q * b**(order - 1)
return (1 / (order - 1)) * sp.log(s)
def _construct_symbolic_bu(q, sigma, m):
return (m - 1) / 2 * sp.erfc(sp.erfcinv(2 * q / (m - 1)) - 1 / sigma)
def _is_non_decreasing(fn, q, bounds):
"""Verifies whether the function is non-decreasing within a range.
Args:
fn: Symbolic function of a single variable.
q: The name of f's variable.
bounds: Pair of (lower_bound, upper_bound) reals.
Returns:
True iff the function is non-decreasing in the range.
"""
diff_fn = sp.diff(fn, q) # Symbolically compute the derivative.
diff_fn_lambdified = sp.lambdify(
q,
diff_fn,
modules=[
"numpy", {
"erfc": scipy.special.erfc,
"erfcinv": scipy.special.erfcinv
}
])
r = scipy.optimize.minimize_scalar(
diff_fn_lambdified, bounds=bounds, method="bounded")
assert r.success, "Minimizer failed to converge."
return r.fun >= 0 # Check whether the derivative is non-negative.
def check_conditions(sigma, m, order):
"""Checks conditions C5 and C6 (Section B.4.2 in Appendix)."""
q = sp.symbols("q", positive=True, real=True)
beta = _construct_symbolic_beta(q, sigma, order)
q0 = math.exp(compute_logq0_gnmax(sigma, order))
cond5 = _is_non_decreasing(beta, q, (0, q0))
if cond5:
bl_q0 = _compute_bl_gnmax(q0, sigma, m)
bu = _construct_symbolic_bu(q, sigma, m)
delta_beta = beta.subs(q, bu) - beta
cond6 = _is_non_decreasing(delta_beta, q, (0, bl_q0))
else:
cond6 = False # Skip the check, since Condition 5 is false already.
return (cond5, cond6)
def main(argv):
del argv # Unused.
if __name__ == "__main__":
app.run(main)