Commit b3247557 authored by Dheera Venkatraman's avatar Dheera Venkatraman
Browse files

add flag for saving images to summary; strings moved to common.py'

parents 75c931fd 2041d5ca
...@@ -199,9 +199,9 @@ def main(_): ...@@ -199,9 +199,9 @@ def main(_):
return return
with tf.gfile.FastGFile(FLAGS.original_image) as image_file: with tf.gfile.FastGFile(FLAGS.original_image) as image_file:
img1_str = image_file.read() img1_str = image_file.read('rb')
with tf.gfile.FastGFile(FLAGS.compared_image) as image_file: with tf.gfile.FastGFile(FLAGS.compared_image) as image_file:
img2_str = image_file.read() img2_str = image_file.read('rb')
input_img = tf.placeholder(tf.string) input_img = tf.placeholder(tf.string)
decoded_image = tf.expand_dims(tf.image.decode_png(input_img, channels=3), 0) decoded_image = tf.expand_dims(tf.image.decode_png(input_img, channels=3), 0)
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# ============================================================================== # ==============================================================================
"""Tests for xception.py.""" """Tests for xception.py."""
import six
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
...@@ -309,7 +310,7 @@ class XceptionNetworkTest(tf.test.TestCase): ...@@ -309,7 +310,7 @@ class XceptionNetworkTest(tf.test.TestCase):
'xception/middle_flow/block1': [2, 14, 14, 4], 'xception/middle_flow/block1': [2, 14, 14, 4],
'xception/exit_flow/block1': [2, 7, 7, 8], 'xception/exit_flow/block1': [2, 7, 7, 8],
'xception/exit_flow/block2': [2, 7, 7, 16]} 'xception/exit_flow/block2': [2, 7, 7, 16]}
for endpoint, shape in endpoint_to_shape.iteritems(): for endpoint, shape in six.iteritems(endpoint_to_shape):
self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape)
def testFullyConvolutionalEndpointShapes(self): def testFullyConvolutionalEndpointShapes(self):
...@@ -330,7 +331,7 @@ class XceptionNetworkTest(tf.test.TestCase): ...@@ -330,7 +331,7 @@ class XceptionNetworkTest(tf.test.TestCase):
'xception/middle_flow/block1': [2, 21, 21, 4], 'xception/middle_flow/block1': [2, 21, 21, 4],
'xception/exit_flow/block1': [2, 11, 11, 8], 'xception/exit_flow/block1': [2, 11, 11, 8],
'xception/exit_flow/block2': [2, 11, 11, 16]} 'xception/exit_flow/block2': [2, 11, 11, 16]}
for endpoint, shape in endpoint_to_shape.iteritems(): for endpoint, shape in six.iteritems(endpoint_to_shape):
self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape)
def testAtrousFullyConvolutionalEndpointShapes(self): def testAtrousFullyConvolutionalEndpointShapes(self):
...@@ -352,7 +353,7 @@ class XceptionNetworkTest(tf.test.TestCase): ...@@ -352,7 +353,7 @@ class XceptionNetworkTest(tf.test.TestCase):
'xception/middle_flow/block1': [2, 41, 41, 4], 'xception/middle_flow/block1': [2, 41, 41, 4],
'xception/exit_flow/block1': [2, 41, 41, 8], 'xception/exit_flow/block1': [2, 41, 41, 8],
'xception/exit_flow/block2': [2, 41, 41, 16]} 'xception/exit_flow/block2': [2, 41, 41, 16]}
for endpoint, shape in endpoint_to_shape.iteritems(): for endpoint, shape in six.iteritems(endpoint_to_shape):
self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape)
def testAtrousFullyConvolutionalValues(self): def testAtrousFullyConvolutionalValues(self):
......
...@@ -154,10 +154,10 @@ def _convert_dataset(dataset_split): ...@@ -154,10 +154,10 @@ def _convert_dataset(dataset_split):
i + 1, num_images, shard_id)) i + 1, num_images, shard_id))
sys.stdout.flush() sys.stdout.flush()
# Read the image. # Read the image.
image_data = tf.gfile.FastGFile(image_files[i], 'r').read() image_data = tf.gfile.FastGFile(image_files[i], 'rb').read()
height, width = image_reader.read_image_dims(image_data) height, width = image_reader.read_image_dims(image_data)
# Read the semantic segmentation annotation. # Read the semantic segmentation annotation.
seg_data = tf.gfile.FastGFile(label_files[i], 'r').read() seg_data = tf.gfile.FastGFile(label_files[i], 'rb').read()
seg_height, seg_width = label_reader.read_image_dims(seg_data) seg_height, seg_width = label_reader.read_image_dims(seg_data)
if height != seg_height or width != seg_width: if height != seg_height or width != seg_width:
raise RuntimeError('Shape mismatched between image and label.') raise RuntimeError('Shape mismatched between image and label.')
......
...@@ -125,7 +125,10 @@ def _bytes_list_feature(values): ...@@ -125,7 +125,10 @@ def _bytes_list_feature(values):
Returns: Returns:
A TF-Feature. A TF-Feature.
""" """
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values])) def norm2bytes(value):
return value.encode() if isinstance(value, str) else value
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[norm2bytes(values)]))
def image_seg_to_tfexample(image_data, filename, height, width, seg_data): def image_seg_to_tfexample(image_data, filename, height, width, seg_data):
......
...@@ -114,13 +114,13 @@ def _convert_dataset(dataset_split): ...@@ -114,13 +114,13 @@ def _convert_dataset(dataset_split):
# Read the image. # Read the image.
image_filename = os.path.join( image_filename = os.path.join(
FLAGS.image_folder, filenames[i] + '.' + FLAGS.image_format) FLAGS.image_folder, filenames[i] + '.' + FLAGS.image_format)
image_data = tf.gfile.FastGFile(image_filename, 'r').read() image_data = tf.gfile.FastGFile(image_filename, 'rb').read()
height, width = image_reader.read_image_dims(image_data) height, width = image_reader.read_image_dims(image_data)
# Read the semantic segmentation annotation. # Read the semantic segmentation annotation.
seg_filename = os.path.join( seg_filename = os.path.join(
FLAGS.semantic_segmentation_folder, FLAGS.semantic_segmentation_folder,
filenames[i] + '.' + FLAGS.label_format) filenames[i] + '.' + FLAGS.label_format)
seg_data = tf.gfile.FastGFile(seg_filename, 'r').read() seg_data = tf.gfile.FastGFile(seg_filename, 'rb').read()
seg_height, seg_width = label_reader.read_image_dims(seg_data) seg_height, seg_width = label_reader.read_image_dims(seg_data)
if height != seg_height or width != seg_width: if height != seg_height or width != seg_width:
raise RuntimeError('Shape mismatched between image and label.') raise RuntimeError('Shape mismatched between image and label.')
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
See model.py for more details and usage. See model.py for more details and usage.
""" """
import six
import math import math
import tensorflow as tf import tensorflow as tf
from deeplab import common from deeplab import common
...@@ -144,7 +145,7 @@ def main(unused_argv): ...@@ -144,7 +145,7 @@ def main(unused_argv):
metrics_to_values, metrics_to_updates = ( metrics_to_values, metrics_to_updates = (
tf.contrib.metrics.aggregate_metric_map(metric_map)) tf.contrib.metrics.aggregate_metric_map(metric_map))
for metric_name, metric_value in metrics_to_values.iteritems(): for metric_name, metric_value in six.iteritems(metrics_to_values):
slim.summaries.add_scalar_summary( slim.summaries.add_scalar_summary(
metric_value, metric_name, print_summary=True) metric_value, metric_name, print_summary=True)
...@@ -163,7 +164,7 @@ def main(unused_argv): ...@@ -163,7 +164,7 @@ def main(unused_argv):
checkpoint_dir=FLAGS.checkpoint_dir, checkpoint_dir=FLAGS.checkpoint_dir,
logdir=FLAGS.eval_logdir, logdir=FLAGS.eval_logdir,
num_evals=num_batches, num_evals=num_batches,
eval_op=metrics_to_updates.values(), eval_op=list(metrics_to_updates.values()),
max_number_of_evaluations=num_eval_iters, max_number_of_evaluations=num_eval_iters,
eval_interval_secs=FLAGS.eval_interval_secs) eval_interval_secs=FLAGS.eval_interval_secs)
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
See model.py for more details and usage. See model.py for more details and usage.
""" """
import six
import tensorflow as tf import tensorflow as tf
from deeplab import common from deeplab import common
from deeplab import model from deeplab import model
...@@ -66,6 +67,9 @@ flags.DEFINE_integer('save_interval_secs', 1200, ...@@ -66,6 +67,9 @@ flags.DEFINE_integer('save_interval_secs', 1200,
flags.DEFINE_integer('save_summaries_secs', 600, flags.DEFINE_integer('save_summaries_secs', 600,
'How often, in seconds, we compute the summaries.') 'How often, in seconds, we compute the summaries.')
flags.DEFINE_boolean('save_summaries_images', False,
'Save sample inputs, labels, and semantic predictions as images to summary.')
# Settings for training strategy. # Settings for training strategy.
flags.DEFINE_enum('learning_policy', 'poly', ['poly', 'step'], flags.DEFINE_enum('learning_policy', 'poly', ['poly', 'step'],
...@@ -197,10 +201,10 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label): ...@@ -197,10 +201,10 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
# add name to graph node so we can add to summary # add name to graph node so we can add to summary
outputs_to_scales_to_logits[common.OUTPUT_TYPE][model._MERGED_LOGITS_SCOPE] = tf.identity( outputs_to_scales_to_logits[common.OUTPUT_TYPE][model._MERGED_LOGITS_SCOPE] = tf.identity(
outputs_to_scales_to_logits[common.OUTPUT_TYPE][model._MERGED_LOGITS_SCOPE], outputs_to_scales_to_logits[common.OUTPUT_TYPE][model._MERGED_LOGITS_SCOPE],
name = 'semantic_merged_logits' name = OUTPUT_MERGED_LOGITS_NODE
) )
for output, num_classes in outputs_to_num_classes.iteritems(): for output, num_classes in six.iteritems(outputs_to_num_classes):
train_utils.add_softmax_cross_entropy_loss_for_each_scale( train_utils.add_softmax_cross_entropy_loss_for_each_scale(
outputs_to_scales_to_logits[output], outputs_to_scales_to_logits[output],
samples[common.LABEL], samples[common.LABEL],
...@@ -227,7 +231,7 @@ def main(unused_argv): ...@@ -227,7 +231,7 @@ def main(unused_argv):
assert FLAGS.train_batch_size % config.num_clones == 0, ( assert FLAGS.train_batch_size % config.num_clones == 0, (
'Training batch size not divisble by number of clones (GPUs).') 'Training batch size not divisble by number of clones (GPUs).')
clone_batch_size = FLAGS.train_batch_size / config.num_clones clone_batch_size = int(FLAGS.train_batch_size / config.num_clones)
# Get dataset-dependent information. # Get dataset-dependent information.
dataset = segmentation_dataset.get_dataset( dataset = segmentation_dataset.get_dataset(
...@@ -278,14 +282,15 @@ def main(unused_argv): ...@@ -278,14 +282,15 @@ def main(unused_argv):
summaries.add(tf.summary.histogram(model_var.op.name, model_var)) summaries.add(tf.summary.histogram(model_var.op.name, model_var))
# Add summaries for images, labels, semantic predictions # Add summaries for images, labels, semantic predictions
summary_image = graph.get_tensor_by_name(first_clone_scope + '/input_image:0') if FLAGS.save_summaries_images:
summaries.add(tf.summary.image('samples/input_image', summary_image)) summary_image = graph.get_tensor_by_name('%s/%s:0' % first_clone_slope, INPUT_LABEL_NODE)
summaries.add(tf.summary.image('samples/%s' % INPUT_IMAGE_NODE, summary_image))
summary_label = tf.cast(graph.get_tensor_by_name(first_clone_scope + '/input_label:0'), tf.uint8) summary_label = tf.cast(graph.get_tensor_by_name('%s/%s:0' % first_clone_slope, INPUT_LABEL_NODE), tf.uint8)
summaries.add(tf.summary.image('samples/input_label', summary_label)) summaries.add(tf.summary.image('samples/%s' % INPUT_LABEL_NODE, summary_label))
predictions = tf.cast(tf.expand_dims(tf.argmax(graph.get_tensor_by_name(first_clone_scope + '/semantic_merged_logits:0'), 3), -1), tf.uint8) predictions = tf.cast(tf.expand_dims(tf.argmax(graph.get_tensor_by_name('%s/%s:0' % first_clone_scope, OUTPUT_MERGED_LOGITS_NODE), 3), -1), tf.uint8)
summaries.add(tf.summary.image('samples/semantic_predictions', predictions)) summaries.add(tf.summary.image('samples/%s' % OUTPUT_MERGED_LOGITS_NODE, predictions))
# Add summaries for losses. # Add summaries for losses.
for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope): for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
# ============================================================================== # ==============================================================================
"""Utility functions for training.""" """Utility functions for training."""
import six
import tensorflow as tf import tensorflow as tf
slim = tf.contrib.slim slim = tf.contrib.slim
...@@ -44,7 +46,7 @@ def add_softmax_cross_entropy_loss_for_each_scale(scales_to_logits, ...@@ -44,7 +46,7 @@ def add_softmax_cross_entropy_loss_for_each_scale(scales_to_logits,
if labels is None: if labels is None:
raise ValueError('No label for softmax cross entropy loss.') raise ValueError('No label for softmax cross entropy loss.')
for scale, logits in scales_to_logits.iteritems(): for scale, logits in six.iteritems(scales_to_logits):
loss_scope = None loss_scope = None
if scope: if scope:
loss_scope = '%s_%s' % (scope, scale) loss_scope = '%s_%s' % (scope, scale)
......
Scripts in support of the paper "Scalable Private Learning with PATE" by Nicolas
Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar
Erlingsson (ICLR 2018, https://arxiv.org/abs/1802.08908).
### Requirements
* Python, version ≥ 2.7
* absl (see [here](https://github.com/abseil/abseil-py), or just type `pip install absl-py`)
* matplotlib
* numpy
* scipy
* sympy (for smooth sensitivity analysis)
* write access to current directory (otherwise, output directories in download.py and *.sh scripts
must be changed)
## Reproducing Figures 1 and 5, and Table 2
Before running any of the analysis scripts, create the data/ directory and download votes files by running\
`$ python download.py`
To generate Figures 1 and 5 run\
`$ sh generate_figures.sh`\
The output is written to the figures/ directory.
For Table 2 run (may take several hours)\
`$ sh generate_table.sh`\
The output is written to the console.
For data-independent bounds (for comparing with Table 2), run\
`$ sh generate_table_data_independent.sh`\
The output is written to the console.
## Files in this directory
* generate_figures.sh --- Master script for generating Figures 1 and 5.
* generate_table.sh --- Master script for generating Table 2.
* generate_table_data_independent.sh --- Master script for computing data-independent
bounds.
* rdp_bucketized.py --- Script for producing Figures 1 (right) and 5 (right).
* rdp_cumulative.py --- Script for producing Figure 1 (left, middle), Figure 5
(left), and partition.pdf (a detailed breakdown of privacy costs per
source).
* smooth_sensitivity_table.py --- Script for generating Table 2.
* rdp_flow.py and plot_ls_q.py are currently not used.
* download.py --- Utility script for populating the data/ directory.
All Python files take flags. Run script_name.py --help for help on flags.
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to download votes files to the data/ directory.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import urllib
import os
import tarfile
FILE_URI = 'https://storage.googleapis.com/pate-votes/votes.gz'
DATA_DIR = 'data/'
def download():
print('Downloading ' + FILE_URI)
tar_filename, _ = urllib.request.urlretrieve(FILE_URI)
print('Unpacking ' + tar_filename)
with tarfile.open(tar_filename, "r:gz") as tar:
tar.extractall(DATA_DIR)
print('Done!')
if __name__ == '__main__':
if not os.path.exists(DATA_DIR):
print('Data directory does not exist. Creating ' + DATA_DIR)
os.makedirs(DATA_DIR)
download()
#!/bin/bash
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
counts_file="data/glyph_5000_teachers.npy"
output_dir="figures/"
executable1="python rdp_bucketized.py"
executable2="python rdp_cumulative.py"
mkdir -p $output_dir
if [ ! -d "$output_dir" ]; then
echo "Directory $output_dir does not exist."
exit 1
fi
$executable1 \
--plot=small \
--counts_file=$counts_file \
--plot_file=$output_dir"noisy_thresholding_check_perf.pdf"
$executable1 \
--plot=large \
--counts_file=$counts_file \
--plot_file=$output_dir"noisy_thresholding_check_perf_details.pdf"
$executable2 \
--cache=False \
--counts_file=$counts_file \
--figures_dir=$output_dir
#!/bin/bash
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
echo "Reproducing Table 2. Takes a couple of hours."
executable="python smooth_sensitivity_table.py"
data_dir="data"
echo
echo "######## MNIST ########"
echo
$executable \
--counts_file=$data_dir"/mnist_250_teachers.npy" \
--threshold=200 \
--sigma1=150 \
--sigma2=40 \
--queries=640 \
--delta=1e-5
echo
echo "######## SVHN ########"
echo
$executable \
--counts_file=$data_dir"/svhn_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=8500 \
--delta=1e-6
echo
echo "######## Adult ########"
echo
$executable \
--counts_file=$data_dir"/adult_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=1500 \
--delta=1e-5
echo
echo "######## Glyph (Confident) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_5000_teachers.npy" \
--threshold=1000 \
--sigma1=500 \
--sigma2=100 \
--queries=12000 \
--delta=1e-8
echo
echo "######## Glyph (Interactive, Round 1) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round1.npy" \
--threshold=3500 \
--sigma1=1500 \
--sigma2=100 \
--delta=1e-8
echo
echo "######## Glyph (Interactive, Round 2) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round2.npy" \
--baseline_file=$data_dir"/glyph_round2_student.npy" \
--threshold=3500 \
--sigma1=2000 \
--sigma2=200 \
--teachers=5000 \
--delta=1e-8
#!/bin/bash
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
echo "Table 2 with data-independent analysis."
executable="python smooth_sensitivity_table.py"
data_dir="data"
echo
echo "######## MNIST ########"
echo
$executable \
--counts_file=$data_dir"/mnist_250_teachers.npy" \
--threshold=200 \
--sigma1=150 \
--sigma2=40 \
--queries=640 \
--delta=1e-5 \
--data_independent
echo
echo "######## SVHN ########"
echo
$executable \
--counts_file=$data_dir"/svhn_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=8500 \
--delta=1e-6 \
--data_independent
echo
echo "######## Adult ########"
echo
$executable \
--counts_file=$data_dir"/adult_250_teachers.npy" \
--threshold=300 \
--sigma1=200 \
--sigma2=40 \
--queries=1500 \
--delta=1e-5 \
--data_independent
echo
echo "######## Glyph (Confident) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_5000_teachers.npy" \
--threshold=1000 \
--sigma1=500 \
--sigma2=100 \
--queries=12000 \
--delta=1e-8 \
--data_independent
echo
echo "######## Glyph (Interactive, Round 1) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round1.npy" \
--threshold=3500 \
--sigma1=1500 \
--sigma2=100 \
--delta=1e-8 \
--data_independent
echo
echo "######## Glyph (Interactive, Round 2) ########"
echo
$executable \
--counts_file=$data_dir"/glyph_round2.npy" \
--baseline_file=$data_dir"/glyph_round2_student.npy" \
--threshold=3500 \
--sigma1=2000 \
--sigma2=200 \
--teachers=5000 \
--delta=1e-8 \
--order=8.5 \
--data_independent
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots LS(q).
A script in support of the PATE2 paper. NOT PRESENTLY USED.
The output is written to a specified directory as a pdf file.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import smooth_sensitivity as pate_ss
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_string('figures_dir', '', 'Path where the output is written to.')
def compute_ls_q(sigma, order, num_classes):
def beta(q):
return pate_ss._compute_rdp_gnmax(sigma, math.log(q), order)
def bu(q):
return pate_ss._compute_bu_gnmax(q, sigma, order)
def bl(q):
return pate_ss._compute_bl_gnmax(q, sigma, order)
def delta_beta(q):
if q == 0 or q > .8:
return 0
beta_q = beta(q)
beta_bu_q = beta(bu(q))
beta_bl_q = beta(bl(q))
assert beta_bl_q <= beta_q <= beta_bu_q
return beta_bu_q - beta_q # max(beta_bu_q - beta_q, beta_q - beta_bl_q)
logq0 = pate_ss.compute_logq0_gnmax(sigma, order)
logq1 = pate_ss._compute_logq1(sigma, order, num_classes)
print(math.exp(logq1), math.exp(logq0))
xs = np.linspace(0, .1, num=1000, endpoint=True)
ys = [delta_beta(x) for x in xs]
return xs, ys
def main(argv):
del argv # Unused.
sigma = 20
order = 20.
num_classes = 10
# sigma = 20
# order = 25.
# num_classes = 10
x_axis, ys = compute_ls_q(sigma, order, num_classes)
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
ax.plot(x_axis, ys, alpha=.8, linewidth=5)
plt.xlabel('Number of queries answered', fontsize=16)
plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'ls_of_q.pdf')
print('Saving the graph to ' + fout_name)
plt.show()
plt.close('all')
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Illustrates how noisy thresholding check changes distribution of queries.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is one of two graphs depending on the setting of the plot variable.
The output is written to a pdf file.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_enum('plot', 'small', ['small', 'large'], 'Selects which of'
'the two plots is produced.')
flags.DEFINE_string('counts_file', '', 'Counts file.')
flags.DEFINE_string('plot_file', '', 'Plot file to write.')
def compute_count_per_bin(bin_num, votes):
"""Tabulates number of examples in each bin.
Args:
bin_num: Number of bins.
votes: A matrix of votes, where each row contains votes in one instance.
Returns:
Array of counts of length bin_num.
"""
sums = np.sum(votes, axis=1)
# Check that all rows contain the same number of votes.
assert max(sums) == min(sums)
s = max(sums)
counts = np.zeros(bin_num)
n = votes.shape[0]
for i in xrange(n):
v = votes[i,]
bin_idx = int(math.floor(max(v) * bin_num / s))
assert 0 <= bin_idx < bin_num
counts[bin_idx] += 1
return counts
def compute_privacy_cost_per_bins(bin_num, votes, sigma2, order):
"""Outputs average privacy cost per bin.
Args:
bin_num: Number of bins.
votes: A matrix of votes, where each row contains votes in one instance.
sigma2: The scale (std) of the Gaussian noise. (Same as sigma_2 in
Algorithms 1 and 2.)
order: The Renyi order for which privacy cost is computed.
Returns:
Expected eps of RDP (ignoring delta) per example in each bin.
"""
n = votes.shape[0]
bin_counts = np.zeros(bin_num)
bin_rdp = np.zeros(bin_num) # RDP at order=order
for i in xrange(n):
v = votes[i,]
logq = pate.compute_logq_gaussian(v, sigma2)
rdp_at_order = pate.rdp_gaussian(logq, sigma2, order)
bin_idx = int(math.floor(max(v) * bin_num / sum(v)))
assert 0 <= bin_idx < bin_num
bin_counts[bin_idx] += 1
bin_rdp[bin_idx] += rdp_at_order
if (i + 1) % 1000 == 0:
print('example {}'.format(i + 1))
sys.stdout.flush()
return bin_rdp / bin_counts
def compute_expected_answered_per_bin(bin_num, votes, threshold, sigma1):
"""Computes expected number of answers per bin.
Args:
bin_num: Number of bins.
votes: A matrix of votes, where each row contains votes in one instance.
threshold: The threshold against which check is performed.
sigma1: The std of the Gaussian noise with which check is performed. (Same
as sigma_1 in Algorithms 1 and 2.)
Returns:
Expected number of queries answered per bin.
"""
n = votes.shape[0]
bin_answered = np.zeros(bin_num)
for i in xrange(n):
v = votes[i,]
p = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
bin_idx = int(math.floor(max(v) * bin_num / sum(v)))
assert 0 <= bin_idx < bin_num
bin_answered[bin_idx] += p
if (i + 1) % 1000 == 0:
print('example {}'.format(i + 1))
sys.stdout.flush()
return bin_answered
def main(argv):
del argv # Unused.
fin_name = os.path.expanduser(FLAGS.counts_file)
print('Reading raw votes from ' + fin_name)
sys.stdout.flush()
votes = np.load(fin_name)
votes = votes[:4000,] # truncate to 4000 samples
if FLAGS.plot == 'small':
bin_num = 5
m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
elif FLAGS.plot == 'large':
bin_num = 10
m_check = compute_expected_answered_per_bin(bin_num, votes, 3500, 1500)
a_check = compute_expected_answered_per_bin(bin_num, votes, 5000, 1500)
eps = compute_privacy_cost_per_bins(bin_num, votes, 100, 50)
counts = compute_count_per_bin(bin_num, votes)
bins = np.linspace(0, 100, num=bin_num, endpoint=False)
plt.close('all')
fig, ax = plt.subplots()
if FLAGS.plot == 'small':
fig.set_figheight(4.7)
fig.set_figwidth(5)
ax.bar(
bins,
counts,
20,
color='orangered',
linestyle='dashed',
linewidth=5,
edgecolor='red',
fill=False,
alpha=.5,
align='edge',
label='LNMax answers')
ax.bar(
bins,
m_check,
20,
color='b',
alpha=.5,
linewidth=0,
edgecolor='g',
align='edge',
label='Confident-GNMax\nanswers')
elif FLAGS.plot == 'large':
fig.set_figheight(4.7)
fig.set_figwidth(7)
ax.bar(
bins,
counts,
10,
linestyle='dashed',
linewidth=5,
edgecolor='red',
fill=False,
alpha=.5,
align='edge',
label='LNMax answers')
ax.bar(
bins,
m_check,
10,
color='g',
alpha=.5,
linewidth=0,
edgecolor='g',
align='edge',
label='Confident-GNMax\nanswers (moderate)')
ax.bar(
bins,
a_check,
10,
color='b',
alpha=.5,
align='edge',
label='Confident-GNMax\nanswers (aggressive)')
ax2 = ax.twinx()
bin_centers = [x + 5 for x in bins]
ax2.plot(bin_centers, eps, 'ko', alpha=.8)
ax2.set_ylim([1e-200, 1.])
ax2.set_yscale('log')
ax2.grid(False)
ax2.set_yticks([1e-3, 1e-50, 1e-100, 1e-150, 1e-200])
plt.tick_params(which='minor', right='off')
ax2.set_ylabel(r'Per query privacy cost $\varepsilon$', fontsize=16)
plt.xlim([0, 100])
# ax.set_yscale('log')
ax.set_xlabel('Percentage of teachers that agree', fontsize=16)
ax.set_ylabel('Number of queries answered', fontsize=16)
vals = ax.get_xticks()
ax.set_xticklabels([str(int(x)) + '%' for x in vals])
ax.tick_params(labelsize=14)
ax.legend(loc=2, prop={'size': 16})
# simple: 'figures/noisy_thresholding_check_perf.pdf')
# detailed: 'figures/noisy_thresholding_check_perf_details.pdf'
print('Saving the graph to ' + FLAGS.plot_file)
plt.savefig(os.path.expanduser(FLAGS.plot_file), bbox_inches='tight')
plt.show()
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots three graphs illustrating cost of privacy per answered query.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
The input is a file containing a numpy array of votes, one query per row, one
class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of three pdf files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import pickle
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_boolean('cache', False,
'Read results of privacy analysis from cache.')
flags.DEFINE_string('counts_file', '', 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
def run_analysis(votes, mechanism, noise_scale, params):
"""Computes data-dependent privacy.
Args:
votes: A matrix of votes, where each row contains votes in one instance.
mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf')
noise_scale: A mechanism privacy parameter.
params: Other privacy parameters.
Returns:
Four lists: cumulative privacy cost epsilon, how privacy budget is split,
how many queries were answered, optimal order.
"""
def compute_partition(order_opt, eps):
order_opt_idx = np.searchsorted(orders, order_opt)
if mechanism == 'gnmax_conf':
p = (rdp_select_cum[order_opt_idx],
rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx],
-math.log(delta) / (order_opt - 1))
else:
p = (rdp_cum[order_opt_idx], -math.log(delta) / (order_opt - 1))
return [x / eps for x in p] # Ensures that sum(x) == 1
# Short list of orders.
# orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
# np.logspace(np.log10(50), np.log10(1000), num=20))))
# Long list of orders.
orders = np.concatenate((np.arange(2, 100 + 1, .5),
np.logspace(np.log10(100), np.log10(500), num=100)))
delta = 1e-8
n = votes.shape[0]
eps_total = np.zeros(n)
partition = np.full(n, None, dtype=object)
order_opt = np.full(n, None, dtype=float)
answered = np.zeros(n)
rdp_cum = np.zeros(len(orders))
rdp_sqrd_cum = np.zeros(len(orders))
rdp_select_cum = np.zeros(len(orders))
answered_sum = 0
for i in xrange(n):
v = votes[i,]
if mechanism == 'lnmax':
logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
rdp_sqrd = rdp_query**2
pr_answered = 1
elif mechanism == 'gnmax':
logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
rdp_sqrd = rdp_query**2
pr_answered = 1
elif mechanism == 'gnmax_conf':
logq_step1 = pate.compute_logpr_answered(params['t'], params['sigma1'], v)
logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
q_step1 = np.exp(logq_step1)
logq_step1_min = min(logq_step1, math.log1p(-q_step1))
rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min,
2**.5 * params['sigma1'], orders)
rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale, orders)
rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
# The expression below evaluates
# E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
rdp_sqrd = (
rdp_gnmax_step1**2 + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2 +
q_step1 * rdp_gnmax_step2**2)
rdp_select_cum += rdp_gnmax_step1
pr_answered = q_step1
rdp_cum += rdp_query
rdp_sqrd_cum += rdp_sqrd
answered_sum += pr_answered
answered[i] = answered_sum
eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
orders, rdp_cum, delta)
partition[i] = compute_partition(order_opt[i], eps_total[i])
if i > 0 and (i + 1) % 1000 == 0:
rdp_var = rdp_sqrd_cum / i - (
rdp_cum / i)**2 # Ignore Bessel's correction.
order_opt_idx = np.searchsorted(orders, order_opt[i])
eps_std = ((i + 1) * rdp_var[order_opt_idx])**.5 # Std of the sum.
print(
'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
'at order = {:.2f} (contribution from delta = {:.3f})'.format(
i + 1, answered_sum, eps_total[i], eps_std, order_opt[i],
-math.log(delta) / (order_opt[i] - 1)))
sys.stdout.flush()
return eps_total, partition, answered, order_opt
def print_plot_small(figures_dir, eps_lap, eps_gnmax, answered_gnmax):
"""Plots a graph of LNMax vs GNMax.
Args:
figures_dir: A name of the directory where to save the plot.
eps_lap: The cumulative privacy costs of the Laplace mechanism.
eps_gnmax: The cumulative privacy costs of the Gaussian mechanism
answered_gnmax: The cumulative count of queries answered.
"""
xlim = 6000
x_axis = range(0, int(xlim), 10)
y_lap = np.zeros(len(x_axis))
y_gnmax = np.full(len(x_axis), None, dtype=float)
for i in xrange(len(x_axis)):
x = x_axis[i]
y_lap[i] = eps_lap[x]
idx = np.searchsorted(answered_gnmax, x)
if idx < len(eps_gnmax):
y_gnmax[i] = eps_gnmax[idx]
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
ax.plot(
x_axis, y_lap, color='r', ls='--', label='LNMax', alpha=.5, linewidth=5)
ax.plot(
x_axis,
y_gnmax,
color='g',
ls='-',
label='Confident-GNMax',
alpha=.5,
linewidth=5)
plt.xticks(np.arange(0, 7000, 1000))
plt.xlim([0, 6000])
plt.ylim([0, 6.])
plt.xlabel('Number of queries answered', fontsize=16)
plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
plt.legend(loc=2, fontsize=13) # loc=2 -- upper left
ax.tick_params(labelsize=14)
fout_name = os.path.join(figures_dir, 'lnmax_vs_gnmax.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.show()
def print_plot_large(figures_dir, eps_lap, eps_gnmax1, answered_gnmax1,
eps_gnmax2, partition_gnmax2, answered_gnmax2):
"""Plots a graph of LNMax vs GNMax with two parameters.
Args:
figures_dir: A name of the directory where to save the plot.
eps_lap: The cumulative privacy costs of the Laplace mechanism.
eps_gnmax1: The cumulative privacy costs of the Gaussian mechanism (set 1).
answered_gnmax1: The cumulative count of queries answered (set 1).
eps_gnmax2: The cumulative privacy costs of the Gaussian mechanism (set 2).
partition_gnmax2: Allocation of eps for set 2.
answered_gnmax2: The cumulative count of queries answered (set 2).
"""
xlim = 6000
x_axis = range(0, int(xlim), 10)
lenx = len(x_axis)
y_lap = np.zeros(lenx)
y_gnmax1 = np.full(lenx, np.nan, dtype=float)
y_gnmax2 = np.full(lenx, np.nan, dtype=float)
y1_gnmax2 = np.full(lenx, np.nan, dtype=float)
for i in xrange(lenx):
x = x_axis[i]
y_lap[i] = eps_lap[x]
idx1 = np.searchsorted(answered_gnmax1, x)
if idx1 < len(eps_gnmax1):
y_gnmax1[i] = eps_gnmax1[idx1]
idx2 = np.searchsorted(answered_gnmax2, x)
if idx2 < len(eps_gnmax2):
y_gnmax2[i] = eps_gnmax2[idx2]
fraction_step1, fraction_step2, _ = partition_gnmax2[idx2]
y1_gnmax2[i] = eps_gnmax2[idx2] * fraction_step1 / (
fraction_step1 + fraction_step2)
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
ax.plot(
x_axis,
y_lap,
color='r',
ls='dashed',
label='LNMax',
alpha=.5,
linewidth=5)
ax.plot(
x_axis,
y_gnmax1,
color='g',
ls='-',
label='Confident-GNMax (moderate)',
alpha=.5,
linewidth=5)
ax.plot(
x_axis,
y_gnmax2,
color='b',
ls='-',
label='Confident-GNMax (aggressive)',
alpha=.5,
linewidth=5)
ax.fill_between(
x_axis, [0] * lenx,
y1_gnmax2.tolist(),
facecolor='b',
alpha=.3,
hatch='\\')
ax.plot(
x_axis,
y1_gnmax2,
color='b',
ls='-',
label='_nolegend_',
alpha=.5,
linewidth=1)
ax.fill_between(
x_axis, y1_gnmax2.tolist(), y_gnmax2.tolist(), facecolor='b', alpha=.3)
plt.xticks(np.arange(0, 7000, 1000))
plt.xlim([0, xlim])
plt.ylim([0, 1.])
plt.xlabel('Number of queries answered', fontsize=16)
plt.ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
plt.legend(loc=2, fontsize=13) # loc=2 -- upper left
ax.tick_params(labelsize=14)
fout_name = os.path.join(figures_dir, 'lnmax_vs_2xgnmax_large.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.show()
def print_plot_partition(figures_dir, eps, partition, answered, order_opt):
"""Plots an expert version of the privacy-per-answered-query graph.
Args:
figures_dir: A name of the directory where to save the plot.
eps: The cumulative privacy cost.
partition: Allocation of the privacy cost.
answered: Cumulative number of queries answered.
order_opt: The list of optimal orders.
"""
xlim = 6000
x = range(0, int(xlim), 10)
lenx = len(x)
y = np.full(lenx, np.nan, dtype=float)
y1 = np.full(lenx, np.nan, dtype=float)
y2 = np.full(lenx, np.nan, dtype=float)
y_right = np.full(lenx, np.nan, dtype=float)
for i in xrange(lenx):
idx = np.searchsorted(answered, x[i])
if idx < len(eps):
y[i] = eps[idx]
fraction_step1, _, fraction_delta = partition[idx]
y1[i] = fraction_delta * y[i]
y2[i] = (fraction_delta + fraction_step1) * y[i]
y_right[i] = order_opt[idx]
# plt.close('all')
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
l1 = ax.plot(
x, y, color='b', ls='-', label=r'Total privacy cost', linewidth=5).pop()
ax.plot(x, y1, color='b', ls='-', label=r'_nolegend_', alpha=.5, linewidth=1)
ax.plot(x, y2, color='b', ls='-', label=r'_nolegend_', alpha=.5, linewidth=1)
ax.fill_between(x, [0] * lenx, y1.tolist(), facecolor='b', alpha=.1)
ax.fill_between(x, y1.tolist(), y2.tolist(), facecolor='b', alpha=.2)
ax.fill_between(x, y2.tolist(), y.tolist(), facecolor='b', alpha=.3)
t1 = 300
ax.text(x[t1], y1[t1] * .35, r'due to $\delta$', alpha=.5, fontsize=18)
ax.text(
x[t1],
y1[t1] + (y2[t1] - y1[t1]) * .6,
r'selection ($\sigma_1$)',
alpha=.5,
fontsize=18)
t2 = 550
ax.annotate(
r'answering ($\sigma_2$)',
xy=(x[t2], (y[t2] + y2[t2]) / 2),
xytext=(x[200], y[t2] * 1.1),
arrowprops=dict(facecolor='black', shrink=0.005, alpha=.5),
fontsize=18,
alpha=.5)
ax2 = ax.twinx()
l2 = ax2.plot(
x, y_right, 'r', ls='-', label=r'Optimal order', linewidth=5,
alpha=.5).pop()
ax2.grid(False)
ax2.set_ylabel(r'Optimal Renyi order', fontsize=16)
plt.xticks(np.arange(0, 7000, 1000))
plt.xlim([0, xlim])
ax.set_ylim([0, 1.])
ax2.set_ylim([0, 200.])
ax.set_xlabel('Number of queries answered', fontsize=16)
ax.set_ylabel(r'Privacy cost $\varepsilon$ at $\delta=10^{-8}$', fontsize=16)
# Merging legends.
ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=0, fontsize=13)
ax.tick_params(labelsize=14)
fout_name = os.path.join(figures_dir, 'partition.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.show()
def run_all_analyses(votes, lambda_laplace, gnmax_parameters, sigma2):
"""Sequentially runs all analyses.
Args:
votes: A matrix of votes, where each row contains votes in one instance.
lambda_laplace: The scale of the Laplace noise (lambda).
gnmax_parameters: A list of parameters for GNMax.
sigma2: Shared parameter for the GNMax mechanisms.
Returns:
Five lists whose length is the number of queries.
"""
print('=== Laplace Mechanism ===')
eps_lap, _, _, _ = run_analysis(votes, 'lnmax', lambda_laplace, None)
print()
# Does not go anywhere, for now
# print('=== Gaussian Mechanism (simple) ===')
# eps, _, _, _ = run_analysis(votes[:n,], 'gnmax', sigma1, None)
eps_gnmax = [[] for p in gnmax_parameters]
partition_gmax = [[] for p in gnmax_parameters]
answered = [[] for p in gnmax_parameters]
order_opt = [[] for p in gnmax_parameters]
for i, p in enumerate(gnmax_parameters):
print('=== Gaussian Mechanism (confident) {}: ==='.format(p))
eps_gnmax[i], partition_gmax[i], answered[i], order_opt[i] = run_analysis(
votes, 'gnmax_conf', sigma2, p)
print()
return eps_lap, eps_gnmax, partition_gmax, answered, order_opt
def main(argv):
del argv # Unused.
lambda_laplace = 50. # corresponds to eps = 1. / lambda_laplace
# Paramaters of the GNMax
gnmax_parameters = ({
't': 1000,
'sigma1': 500
}, {
't': 3500,
'sigma1': 1500
}, {
't': 5000,
'sigma1': 1500
})
sigma2 = 100 # GNMax parameters differ only in Step 1 (selection).
ftemp_name = '/tmp/precomputed.pkl'
figures_dir = os.path.expanduser(FLAGS.figures_dir)
if FLAGS.cache and os.path.isfile(ftemp_name):
print('Reading from cache ' + ftemp_name)
with open(ftemp_name, 'rb') as f:
(eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
orders_opt_gnmax) = pickle.load(f)
else:
fin_name = os.path.expanduser(FLAGS.counts_file)
print('Reading raw votes from ' + fin_name)
sys.stdout.flush()
votes = np.load(fin_name)
(eps_lap, eps_gnmax, partition_gmax,
answered_gnmax, orders_opt_gnmax) = run_all_analyses(
votes, lambda_laplace, gnmax_parameters, sigma2)
print('Writing to cache ' + ftemp_name)
with open(ftemp_name, 'wb') as f:
pickle.dump((eps_lap, eps_gnmax, partition_gmax, answered_gnmax,
orders_opt_gnmax), f)
print_plot_small(figures_dir, eps_lap, eps_gnmax[0], answered_gnmax[0])
print_plot_large(figures_dir, eps_lap, eps_gnmax[1], answered_gnmax[1],
eps_gnmax[2], partition_gmax[2], answered_gnmax[2])
print_plot_partition(figures_dir, eps_gnmax[2], partition_gmax[2],
answered_gnmax[2], orders_opt_gnmax[2])
plt.close('all')
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plots two graphs illustrating cost of privacy per answered query.
PRESENTLY NOT USED.
A script in support of the PATE2 paper. The input is a file containing a numpy
array of votes, one query per row, one class per column. Ex:
43, 1821, ..., 3
31, 16, ..., 0
...
0, 86, ..., 438
The output is written to a specified directory and consists of two pdf files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
import numpy as np
import core as pate
plt.style.use('ggplot')
FLAGS = flags.FLAGS
flags.DEFINE_string('counts_file', '', 'Counts file.')
flags.DEFINE_string('figures_dir', '', 'Path where figures are written to.')
def plot_rdp_curve_per_example(votes, sigmas):
orders = np.linspace(1., 100., endpoint=True, num=1000)
orders[0] = 1.5
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
styles = [':', '-']
labels = ['ex1', 'ex2']
for i in xrange(votes.shape[0]):
print(sorted(votes[i,], reverse=True)[:10])
for sigma in sigmas:
logq = pate.compute_logq_gaussian(votes[i,], sigma)
rdp = pate.rdp_gaussian(logq, sigma, orders)
ax.plot(
orders,
rdp,
label=r'{} $\sigma$={}'.format(labels[i], int(sigma)),
linestyle=styles[i],
linewidth=5)
for sigma in sigmas:
ax.plot(
orders,
pate.rdp_data_independent_gaussian(sigma, orders),
alpha=.3,
label=r'Data-ind bound $\sigma$={}'.format(int(sigma)),
linewidth=10)
plt.yticks([0, .01])
plt.xlabel(r'Order $\lambda$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\lambda$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'rdp_flow1.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.legend(loc=0, fontsize=13)
plt.show()
def compute_rdp_curve(votes, threshold, sigma1, sigma2, orders,
target_answered):
rdp_cum = np.zeros(len(orders))
answered = 0
for i, v in enumerate(votes):
v = sorted(v, reverse=True)
q_step1 = math.exp(pate.compute_logpr_answered(threshold, sigma1, v))
logq_step2 = pate.compute_logq_gaussian(v, sigma2)
rdp = pate.rdp_gaussian(logq_step2, sigma2, orders)
rdp_cum += q_step1 * rdp
answered += q_step1
if answered >= target_answered:
print('Processed {} queries to answer {}.'.format(i, target_answered))
return rdp_cum
assert False, 'Never reached {} answered queries.'.format(target_answered)
def plot_rdp_total(votes, sigmas):
orders = np.linspace(1., 100., endpoint=True, num=100)
orders[0] = 1.5
fig, ax = plt.subplots()
fig.set_figheight(4.5)
fig.set_figwidth(4.7)
for sigma in sigmas:
rdp = compute_rdp_curve(votes, 5000, 1000, sigma, orders, 2000)
ax.plot(
orders,
rdp,
alpha=.8,
label=r'$\sigma$={}'.format(int(sigma)),
linewidth=5)
plt.xlabel(r'Order $\lambda$', fontsize=16)
plt.ylabel(r'RDP value $\varepsilon$ at $\lambda$', fontsize=16)
ax.tick_params(labelsize=14)
fout_name = os.path.join(FLAGS.figures_dir, 'rdp_flow2.pdf')
print('Saving the graph to ' + fout_name)
fig.savefig(fout_name, bbox_inches='tight')
plt.legend(loc=0, fontsize=13)
plt.show()
def main(argv):
del argv # Unused.
fin_name = os.path.expanduser(FLAGS.counts_file)
print('Reading raw votes from ' + fin_name)
sys.stdout.flush()
votes = np.load(fin_name)
votes = votes[:12000,] # truncate to 4000 samples
v1 = [2550, 2200, 250] # based on votes[2,]
v2 = [2600, 2200, 200] # based on votes[381,]
plot_rdp_curve_per_example(np.array([v1, v2]), (100., 150.))
plot_rdp_total(votes, (100., 150.))
if __name__ == '__main__':
app.run(main)
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Performs privacy analysis of GNMax with smooth sensitivity.
A script in support of the paper "Scalable Private Learning with PATE" by
Nicolas Papernot, Shuang Song, Ilya Mironov, Ananth Raghunathan, Kunal Talwar,
Ulfar Erlingsson (https://arxiv.org/abs/1802.08908).
Several flavors of the GNMax algorithm can be analyzed.
- Plain GNMax (argmax w/ Gaussian noise) is assumed when arguments threshold
and sigma2 are missing.
- Confident GNMax (thresholding + argmax w/ Gaussian noise) is used when
threshold, sigma1, and sigma2 are given.
- Interactive GNMax (two- or multi-round) is triggered by specifying
baseline_file, which provides baseline values for votes selection in Step 1.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
sys.path.append('..') # Main modules reside in the parent directory.
from absl import app
from absl import flags
import numpy as np
import core as pate
import smooth_sensitivity as pate_ss
FLAGS = flags.FLAGS
flags.DEFINE_string('counts_file', None, 'Counts file.')
flags.DEFINE_string('baseline_file', None, 'File with baseline scores.')
flags.DEFINE_boolean('data_independent', False,
'Force data-independent bounds.')
flags.DEFINE_float('threshold', None, 'Threshold for step 1 (selection).')
flags.DEFINE_float('sigma1', None, 'Sigma for step 1 (selection).')
flags.DEFINE_float('sigma2', None, 'Sigma for step 2 (argmax).')
flags.DEFINE_integer('queries', None, 'Number of queries made by the student.')
flags.DEFINE_float('delta', 1e-8, 'Target delta.')
flags.DEFINE_float(
'order', None,
'Fixes a Renyi DP order (if unspecified, finds an optimal order from a '
'hardcoded list).')
flags.DEFINE_integer(
'teachers', None,
'Number of teachers (if unspecified, derived from the counts file).')
flags.mark_flag_as_required('counts_file')
flags.mark_flag_as_required('sigma2')
def _check_conditions(sigma, num_classes, orders):
"""Symbolic-numeric verification of conditions C5 and C6.
The conditions on the beta function are verified by constructing the beta
function symbolically, and then checking that its derivative (computed
symbolically) is non-negative within the interval of conjectured monotonicity.
The last check is performed numerically.
"""
print('Checking conditions C5 and C6 for all orders.')
sys.stdout.flush()
conditions_hold = True
for order in orders:
cond5, cond6 = pate_ss.check_conditions(sigma, num_classes, order)
conditions_hold &= cond5 and cond6
if not cond5:
print('Condition C5 does not hold for order =', order)
elif not cond6:
print('Condition C6 does not hold for order =', order)
if conditions_hold:
print('Conditions C5-C6 hold for all orders.')
sys.stdout.flush()
return conditions_hold
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
data_ind):
"""Computes the (data-dependent) RDP curve for Confident GNMax."""
rdp_cum = np.zeros(len(orders))
rdp_sqrd_cum = np.zeros(len(orders))
answered = 0
for i, v in enumerate(votes):
if threshold is None:
logq_step1 = 0 # No thresholding, always proceed to step 2.
rdp_step1 = np.zeros(len(orders))
else:
logq_step1 = pate.compute_logpr_answered(threshold, sigma1,
v - baseline[i,])
if data_ind:
rdp_step1 = pate.compute_rdp_data_independent_threshold(sigma1, orders)
else:
rdp_step1 = pate.compute_rdp_threshold(logq_step1, sigma1, orders)
if data_ind:
rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
else:
logq_step2 = pate.compute_logq_gaussian(v, sigma2)
rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)
q_step1 = np.exp(logq_step1)
rdp = rdp_step1 + rdp_step2 * q_step1
# The expression below evaluates
# E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
rdp_sqrd = (
rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
q_step1 * rdp_step2**2)
rdp_sqrd_cum += rdp_sqrd
rdp_cum += rdp
answered += q_step1
if ((i + 1) % 1000 == 0) or (i == votes.shape[0] - 1):
rdp_var = rdp_sqrd_cum / i - (
rdp_cum / i)**2 # Ignore Bessel's correction.
eps_total, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
order_opt_idx = np.searchsorted(orders, order_opt)
eps_std = ((i + 1) * rdp_var[order_opt_idx])**.5 # Std of the sum.
print(
'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
'at order = {:.2f} (contribution from delta = {:.3f})'.format(
i + 1, answered, eps_total, eps_std, order_opt,
-math.log(delta) / (order_opt - 1)))
sys.stdout.flush()
_, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
return order_opt
def _find_optimal_smooth_sensitivity_parameters(
votes, baseline, num_teachers, threshold, sigma1, sigma2, delta, ind_step1,
ind_step2, order):
"""Optimizes smooth sensitivity parameters by minimizing a cost function.
The cost function is
exact_eps + cost of GNSS + two stds of noise,
which captures that upper bound of the confidence interval of the sanitized
privacy budget.
Since optimization is done with full view of sensitive data, the results
cannot be released.
"""
rdp_cum = 0
answered_cum = 0
ls_cum = 0
# Define a plausible range for the beta values.
betas = np.arange(.3 / order, .495 / order, .01 / order)
cost_delta = math.log(1 / delta) / (order - 1)
for i, v in enumerate(votes):
if threshold is None:
log_pr_answered = 0
rdp1 = 0
ls_step1 = np.zeros(num_teachers)
else:
log_pr_answered = pate.compute_logpr_answered(threshold, sigma1,
v - baseline[i,])
if ind_step1: # apply data-independent bound for step 1 (thresholding).
rdp1 = pate.compute_rdp_data_independent_threshold(sigma1, order)
ls_step1 = np.zeros(num_teachers)
else:
rdp1 = pate.compute_rdp_threshold(log_pr_answered, sigma1, order)
ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
v - baseline[i,], num_teachers, threshold, sigma1, order)
pr_answered = math.exp(log_pr_answered)
answered_cum += pr_answered
if ind_step2: # apply data-independent bound for step 2 (GNMax).
rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
ls_step2 = np.zeros(num_teachers)
else:
logq_step2 = pate.compute_logq_gaussian(v, sigma2)
rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
# Compute smooth sensitivity.
ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
v, num_teachers, sigma2, order)
rdp_cum += rdp1 + pr_answered * rdp2
ls_cum += ls_step1 + pr_answered * ls_step2 # Expected local sensitivity.
if ind_step1 and ind_step2:
# Data-independent bounds.
cost_opt, beta_opt, ss_opt, sigma_ss_opt = None, 0., 0., np.inf
else:
# Data-dependent bounds.
cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None
for beta in betas:
ss = pate_ss.compute_discounted_max(beta, ls_cum)
# Solution to the minimization problem:
# min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1 / 3)
cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
beta, sigma_ss, order)
# Cost captures exact_eps + cost of releasing SS + two stds of noise.
cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
if cost < cost_opt:
cost_opt, beta_opt, ss_opt, sigma_ss_opt = cost, beta, ss, sigma_ss
if ((i + 1) % 100 == 0) or (i == votes.shape[0] - 1):
eps_before_ss = rdp_cum + cost_delta
eps_with_ss = (
eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
beta_opt, sigma_ss_opt, order))
print('{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
'{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'.
format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
sys.stdout.flush()
# Return optimal parameters for the last iteration.
return beta_opt, ss_opt, sigma_ss_opt
####################
# HELPER FUNCTIONS #
####################
def _load_votes(counts_file, baseline_file, queries):
counts_file_expanded = os.path.expanduser(counts_file)
print('Reading raw votes from ' + counts_file_expanded)
sys.stdout.flush()
votes = np.load(counts_file_expanded)
print('Shape of the votes matrix = {}'.format(votes.shape))
if baseline_file is not None:
baseline_file_expanded = os.path.expanduser(baseline_file)
print('Reading baseline values from ' + baseline_file_expanded)
sys.stdout.flush()
baseline = np.load(baseline_file_expanded)
if votes.shape != baseline.shape:
raise ValueError(
'Counts file and baseline file must have the same shape. Got {} and '
'{} instead.'.format(votes.shape, baseline.shape))
else:
baseline = np.zeros_like(votes)
if queries is not None:
if votes.shape[0] < queries:
raise ValueError('Expect {} rows, got {} in {}'.format(
queries, votes.shape[0], counts_file))
# Truncate the votes matrix to the number of queries made.
votes = votes[:queries,]
baseline = baseline[:queries,]
else:
print('Process all {} input rows. (Use --queries flag to truncate.)'.format(
votes.shape[0]))
return votes, baseline
def _count_teachers(votes):
s = np.sum(votes, axis=1)
num_teachers = int(max(s))
if min(s) != num_teachers:
raise ValueError(
'Matrix of votes is malformed: the number of votes is not the same '
'across rows.')
return num_teachers
def _is_data_ind_step1(num_teachers, threshold, sigma1, orders):
if threshold is None:
return True
return np.all(
pate.is_data_independent_always_opt_threshold(num_teachers, threshold,
sigma1, orders))
def _is_data_ind_step2(num_teachers, num_classes, sigma, orders):
return np.all(
pate.is_data_independent_always_opt_gaussian(num_teachers, num_classes,
sigma, orders))
def main(argv):
del argv # Unused.
if (FLAGS.threshold is None) != (FLAGS.sigma1 is None):
raise ValueError(
'--threshold flag and --sigma1 flag must be present or absent '
'simultaneously.')
if FLAGS.order is None:
# Long list of orders.
orders = np.concatenate((np.arange(2, 100 + 1, .5),
np.logspace(np.log10(100), np.log10(500),
num=100)))
# Short list of orders.
# orders = np.round(
# np.concatenate((np.arange(2, 50 + 1, 1),
# np.logspace(np.log10(50), np.log10(1000), num=20))))
else:
orders = np.array([FLAGS.order])
votes, baseline = _load_votes(FLAGS.counts_file, FLAGS.baseline_file,
FLAGS.queries)
if FLAGS.teachers is None:
num_teachers = _count_teachers(votes)
else:
num_teachers = FLAGS.teachers
num_classes = votes.shape[1]
order = _compute_rdp(votes, baseline, FLAGS.threshold, FLAGS.sigma1,
FLAGS.sigma2, FLAGS.delta, orders,
FLAGS.data_independent)
ind_step1 = _is_data_ind_step1(num_teachers, FLAGS.threshold, FLAGS.sigma1,
order)
ind_step2 = _is_data_ind_step2(num_teachers, num_classes, FLAGS.sigma2, order)
if FLAGS.data_independent or (ind_step1 and ind_step2):
print('Nothing to do here, all analyses are data-independent.')
return
if not _check_conditions(FLAGS.sigma2, num_classes, [order]):
return # Quit early: sufficient conditions for correctness fail to hold.
beta_opt, ss_opt, sigma_ss_opt = _find_optimal_smooth_sensitivity_parameters(
votes, baseline, num_teachers, FLAGS.threshold, FLAGS.sigma1,
FLAGS.sigma2, FLAGS.delta, ind_step1, ind_step2, order)
print('Optimal beta = {:.4f}, E[SS_beta] = {:.4}, sigma_ss = {:.2f}'.format(
beta_opt, ss_opt, sigma_ss_opt))
if __name__ == '__main__':
app.run(main)
Implementation of an RDP privacy accountant and smooth sensitivity analysis for
the PATE framework. The underlying theory and supporting experiments appear in
"Scalable Private Learning with PATE" by Nicolas Papernot, Shuang Song, Ilya
Mironov, Ananth Raghunathan, Kunal Talwar, Ulfar Erlingsson (ICLR 2018,
https://arxiv.org/abs/1802.08908).
## Overview
The PATE ('Private Aggregation of Teacher Ensembles') framework was introduced
by Papernot et al. in "Semi-supervised Knowledge Transfer for Deep Learning from
Private Training Data" (ICLR 2017, https://arxiv.org/abs/1610.05755). The
framework enables model-agnostic training that provably provides [differential
privacy](https://en.wikipedia.org/wiki/Differential_privacy) of the training
dataset.
The framework consists of _teachers_, the _student_ model, and the _aggregator_. The
teachers are models trained on disjoint subsets of the training datasets. The student
model has access to an insensitive (i.e., public) unlabelled dataset, which is labelled by
interacting with the ensemble of teachers via the _aggregator_. The aggregator tallies
outputs of the teacher models, and either forwards a (noisy) aggregate to the student, or
refuses to answer.
Differential privacy is enforced by the aggregator. The privacy guarantees can be _data-independent_,
which means that they are solely the function of the aggregator's parameters. Alternatively, privacy
analysis can be _data-dependent_, which allows for finer reasoning where, under certain conditions on
the input distribution, the final privacy guarantees can be improved relative to the data-independent
analysis. Data-dependent privacy guarantees may, by themselves, be a function of sensitive data and
therefore publishing these guarantees requires its own sanitization procedure. In our case
sanitization of data-dependent privacy guarantees proceeds via _smooth sensitivity_ analysis.
The common machinery used for all privacy analyses in this repository is the
R&eacute;nyi differential privacy, or RDP (see https://arxiv.org/abs/1702.07476).
This repository contains implementations of privacy accountants and smooth
sensitivity analysis for several data-independent and data-dependent mechanism that together
comprise the PATE framework.
### Requirements
* Python, version &ge; 2.7
* absl (see [here](https://github.com/abseil/abseil-py), or just type `pip install absl-py`)
* numpy
* scipy
* sympy (for smooth sensitivity analysis)
* unittest (for testing)
### Self-testing
To verify the installation run
```bash
$ python core_test.py
$ python smooth_sensitivity_test.py
```
## Files in this directory
* core.py --- RDP privacy accountant for several vote aggregators (GNMax,
Threshold, Laplace).
* smooth_sensitivity.py --- Smooth sensitivity analysis for GNMax and
Threshold mechanisms.
* core_test.py and smooth_sensitivity_test.py --- Unit tests for the
files above.
## Contact information
You may direct your comments to mironov@google.com and PR to @ilyamironov.
# Copyright 2017 The 'Scalable Private Learning with PATE' Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Core functions for RDP analysis in PATE framework.
This library comprises the core functions for doing differentially private
analysis of the PATE architecture and its various Noisy Max and other
mechanisms.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from absl import app
import numpy as np
import scipy.stats
def _logaddexp(x):
"""Addition in the log space. Analogue of numpy.logaddexp for a list."""
m = max(x)
return m + math.log(sum(np.exp(x - m)))
def _log1mexp(x):
"""Numerically stable computation of log(1-exp(x))."""
if x < -1:
return math.log1p(-math.exp(x))
elif x < 0:
return math.log(-math.expm1(x))
elif x == 0:
return -np.inf
else:
raise ValueError("Argument must be non-positive.")
def compute_eps_from_delta(orders, rdp, delta):
"""Translates between RDP and (eps, delta)-DP.
Args:
orders: A list (or a scalar) of orders.
rdp: A list of RDP guarantees (of the same length as orders).
delta: Target delta.
Returns:
Pair of (eps, optimal_order).
Raises:
ValueError: If input is malformed.
"""
if len(orders) != len(rdp):
raise ValueError("Input lists must have the same length.")
eps = np.array(rdp) - math.log(delta) / (np.array(orders) - 1)
idx_opt = np.argmin(eps)
return eps[idx_opt], orders[idx_opt]
#####################
# RDP FOR THE GNMAX #
#####################
def compute_logq_gaussian(counts, sigma):
"""Returns an upper bound on ln Pr[outcome != argmax] for GNMax.
Implementation of Proposition 7.
Args:
counts: A numpy array of scores.
sigma: The standard deviation of the Gaussian noise in the GNMax mechanism.
Returns:
logq: Natural log of the probability that outcome is different from argmax.
"""
n = len(counts)
variance = sigma**2
idx_max = np.argmax(counts)
counts_normalized = counts[idx_max] - counts
counts_rest = counts_normalized[np.arange(n) != idx_max] # exclude one index
# Upper bound q via a union bound rather than a more precise calculation.
logq = _logaddexp(
scipy.stats.norm.logsf(counts_rest, scale=math.sqrt(2 * variance)))
# A sketch of a more accurate estimate, which is currently disabled for two
# reasons:
# 1. Numerical instability;
# 2. Not covered by smooth sensitivity analysis.
# covariance = variance * (np.ones((n - 1, n - 1)) + np.identity(n - 1))
# logq = np.log1p(-statsmodels.sandbox.distributions.extras.mvnormcdf(
# counts_rest, np.zeros(n - 1), covariance, maxpts=1e4))
return min(logq, math.log(1 - (1 / n)))
def rdp_data_independent_gaussian(sigma, orders):
"""Computes a data-independent RDP curve for GNMax.
Implementation of Proposition 8.
Args:
sigma: Standard deviation of Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Upper bound on RPD for all orders. A scalar if orders is a scalar.
Raises:
ValueError: If the input is malformed.
"""
if sigma < 0 or np.any(orders <= 1): # not defined for alpha=1
raise ValueError("Inputs are malformed.")
variance = sigma**2
if np.isscalar(orders):
return orders / variance
else:
return np.atleast_1d(orders) / variance
def rdp_gaussian(logq, sigma, orders):
"""Bounds RDP from above of GNMax given an upper bound on q (Theorem 6).
Args:
logq: Natural logarithm of the probability of a non-argmax outcome.
sigma: Standard deviation of Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Upper bound on RPD for all orders. A scalar if orders is a scalar.
Raises:
ValueError: If the input is malformed.
"""
if logq > 0 or sigma < 0 or np.any(orders <= 1): # not defined for alpha=1
raise ValueError("Inputs are malformed.")
if np.isneginf(logq): # If the mechanism's output is fixed, it has 0-DP.
if np.isscalar(orders):
return 0.
else:
return np.full_like(orders, 0., dtype=np.float)
variance = sigma**2
# Use two different higher orders: mu_hi1 and mu_hi2 computed according to
# Proposition 10.
mu_hi2 = math.sqrt(variance * -logq)
mu_hi1 = mu_hi2 + 1
orders_vec = np.atleast_1d(orders)
ret = orders_vec / variance # baseline: data-independent bound
# Filter out entries where data-dependent bound does not apply.
mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1)
rdp_hi1 = mu_hi1 / variance
rdp_hi2 = mu_hi2 / variance
log_a2 = (mu_hi2 - 1) * rdp_hi2
# Make sure q is in the increasing wrt q range and A is positive.
if (np.any(mask) and logq <= log_a2 - mu_hi2 *
(math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))) and
-logq > rdp_hi2):
# Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when x ~ 0.
log1q = _log1mexp(logq) # log1q = log(1-q)
log_a = (orders - 1) * (
log1q - _log1mexp((logq + rdp_hi2) * (1 - 1 / mu_hi2)))
log_b = (orders - 1) * (rdp_hi1 - logq / (mu_hi1 - 1))
# Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large x, y.
log_s = np.logaddexp(log1q + log_a, logq + log_b)
ret[mask] = np.minimum(ret, log_s / (orders - 1))[mask]
assert np.all(ret >= 0)
if np.isscalar(orders):
return np.asscalar(ret)
else:
return ret
def is_data_independent_always_opt_gaussian(num_teachers, num_classes, sigma,
orders):
"""Tests whether data-ind bound is always optimal for GNMax.
Args:
num_teachers: Number of teachers.
num_classes: Number of classes.
sigma: Standard deviation of the Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Boolean array of length |orders| (a scalar if orders is a scalar). True if
the data-independent bound is always the same as the data-dependent bound.
"""
unanimous = np.array([num_teachers] + [0] * (num_classes - 1))
logq = compute_logq_gaussian(unanimous, sigma)
rdp_dep = rdp_gaussian(logq, sigma, orders)
rdp_ind = rdp_data_independent_gaussian(sigma, orders)
return np.isclose(rdp_dep, rdp_ind)
###################################
# RDP FOR THE THRESHOLD MECHANISM #
###################################
def compute_logpr_answered(t, sigma, counts):
"""Computes log of the probability that a noisy threshold is crossed.
Args:
t: The threshold.
sigma: The stdev of the Gaussian noise added to the threshold.
counts: An array of votes.
Returns:
Natural log of the probability that max is larger than a noisy threshold.
"""
# Compared to the paper, max(counts) is rounded to the nearest integer. This
# is done to facilitate computation of smooth sensitivity for the case of
# the interactive mechanism, where votes are not necessarily integer.
return scipy.stats.norm.logsf(t - round(max(counts)), scale=sigma)
def compute_rdp_data_independent_threshold(sigma, orders):
# The input to the threshold mechanism has stability 1, compared to
# GNMax, which has stability = 2. Hence the sqrt(2) factor below.
return rdp_data_independent_gaussian(2**.5 * sigma, orders)
def compute_rdp_threshold(log_pr_answered, sigma, orders):
logq = min(log_pr_answered, _log1mexp(log_pr_answered))
# The input to the threshold mechanism has stability 1, compared to
# GNMax, which has stability = 2. Hence the sqrt(2) factor below.
return rdp_gaussian(logq, 2**.5 * sigma, orders)
def is_data_independent_always_opt_threshold(num_teachers, threshold, sigma,
orders):
"""Tests whether data-ind bound is always optimal for the threshold mechanism.
Args:
num_teachers: Number of teachers.
threshold: The cut-off threshold.
sigma: Standard deviation of the Gaussian noise.
orders: An array_like list of Renyi orders.
Returns:
Boolean array of length |orders| (a scalar if orders is a scalar). True if
the data-independent bound is always the same as the data-dependent bound.
"""
# Since the data-dependent bound depends only on max(votes), it suffices to
# check whether the data-dependent bounds are better than data-independent
# bounds in the extreme cases when max(votes) is minimal or maximal.
# For both Confident GNMax and Interactive GNMax it holds that
# 0 <= max(votes) <= num_teachers.
# The upper bound is trivial in both cases.
# The lower bound is trivial for Confident GNMax (and a stronger one, based on
# the pigeonhole principle, is possible).
# For Interactive GNMax (Algorithm 2), the lower bound follows from the
# following argument. Since the votes vector is the difference between the
# actual teachers' votes and the student's baseline, we need to argue that
# max(n_j - M * p_j) >= 0.
# The bound holds because sum_j n_j = sum M * p_j = M. Thus,
# sum_j (n_j - M * p_j) = 0, and max_j (n_j - M * p_j) >= 0 as needed.
logq1 = compute_logpr_answered(threshold, sigma, [0])
logq2 = compute_logpr_answered(threshold, sigma, [num_teachers])
rdp_dep1 = compute_rdp_threshold(logq1, sigma, orders)
rdp_dep2 = compute_rdp_threshold(logq2, sigma, orders)
rdp_ind = compute_rdp_data_independent_threshold(sigma, orders)
return np.isclose(rdp_dep1, rdp_ind) and np.isclose(rdp_dep2, rdp_ind)
#############################
# RDP FOR THE LAPLACE NOISE #
#############################
def compute_logq_laplace(counts, lmbd):
"""Computes an upper bound on log Pr[outcome != argmax] for LNMax.
Args:
counts: A list of scores.
lmbd: The lambda parameter of the Laplace distribution ~exp(-|x| / lambda).
Returns:
logq: Natural log of the probability that outcome is different from argmax.
"""
# For noisy max, we only get an upper bound via the union bound. See Lemma 4
# in https://arxiv.org/abs/1610.05755.
#
# Pr[ j beats i*] = (2+gap(j,i*))/ 4 exp(gap(j,i*)
# proof at http://mathoverflow.net/questions/66763/
idx_max = np.argmax(counts)
counts_normalized = (counts - counts[idx_max]) / lmbd
counts_rest = np.array(
[counts_normalized[i] for i in range(len(counts)) if i != idx_max])
logq = _logaddexp(np.log(2 - counts_rest) + math.log(.25) + counts_rest)
return min(logq, math.log(1 - (1 / len(counts))))
def rdp_pure_eps(logq, pure_eps, orders):
"""Computes the RDP value given logq and pure privacy eps.
Implementation of https://arxiv.org/abs/1610.05755, Theorem 3.
The bound used is the min of three terms. The first term is from
https://arxiv.org/pdf/1605.02065.pdf.
The second term is based on the fact that when event has probability (1-q) for
q close to zero, q can only change by exp(eps), which corresponds to a
much smaller multiplicative change in (1-q)
The third term comes directly from the privacy guarantee.
Args:
logq: Natural logarithm of the probability of a non-optimal outcome.
pure_eps: eps parameter for DP
orders: array_like list of moments to compute.
Returns:
Array of upper bounds on rdp (a scalar if orders is a scalar).
"""
orders_vec = np.atleast_1d(orders)
q = math.exp(logq)
log_t = np.full_like(orders_vec, np.inf)
if q <= 1 / (math.exp(pure_eps) + 1):
logt_one = math.log1p(-q) + (
math.log1p(-q) - _log1mexp(pure_eps + logq)) * (
orders_vec - 1)
logt_two = logq + pure_eps * (orders_vec - 1)
log_t = np.logaddexp(logt_one, logt_two)
ret = np.minimum(
np.minimum(0.5 * pure_eps * pure_eps * orders_vec,
log_t / (orders_vec - 1)), pure_eps)
if np.isscalar(orders):
return np.asscalar(ret)
else:
return ret
def main(argv):
del argv # Unused.
if __name__ == "__main__":
app.run(main)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment