Commit 1f4747a4 authored by pkulzc

Merge remote-tracking branch 'upstream/master'

parents d2d01f4f a7aa25d3
......@@ -28,4 +28,6 @@ If you would like to make any fixes or improvements to the models, please [submi
The *Official Models* are made available as a Python module. To run the models and associated scripts, add the top-level ***/models*** folder to the Python path with the command: `export PYTHONPATH="$PYTHONPATH:/path/to/models"`
To install dependencies, pass `-r official/requirements.txt` to pip (i.e. `pip3 install --user -r official/requirements.txt`).
To make Official Models easier to use, we are planning to create a pip installable Official Models package. This is being tracked in [#917](https://github.com/tensorflow/models/issues/917).
......@@ -15,7 +15,7 @@
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "DATETIME"
"type": "TIMESTAMP"
},
{
"description": "The tensorflow version information.",
......@@ -58,7 +58,7 @@
"type": "RECORD"
},
{
"description": "Enviornment variables when the benchmark run is executed.",
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
......@@ -74,7 +74,27 @@
}
],
"mode": "REPEATED",
"name": "enviornment_variable",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
......
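For orientation, a benchmark_run record that satisfies this schema, as assembled by the BenchmarkLogger changes later in this commit, might look roughly like the following Python dict (all values are illustrative):

run_info = {
    "model_name": "resnet",
    # TIMESTAMP-compatible string produced with "%Y-%m-%dT%H:%M:%S.%fZ".
    "run_date": "2018-04-18T17:25:43.123456Z",
    "tensorflow_version": {"version": "1.8.0", "git_hash": "v1.8.0-0-g0000000"},
    "environment_variable": [
        {"name": "PATH", "value": "/usr/local/bin:/usr/bin"}],
    "tensorflow_environment_variables": [
        {"name": "TF_ENABLE_WINOGRAD_NONFUSED", "value": "1"}],
    "machine_config": {
        "cpu_info": {"num_cores": 8, "cpu_info": "Intel(R) Xeon(R) CPU",
                     "mhz_per_cpu": 2300.0},
        "gpu_info": {"count": 1, "model": "Tesla P100-PCIE-16GB"},
        "memory_total": 64 * 1024 ** 3,
        "memory_available": 48 * 1024 ** 3,
    },
}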
......@@ -175,11 +175,14 @@ def validate_batch_size_for_multi_gpu(batch_size):
raise ValueError(err)
def main(_):
def main(argv):
parser = MNISTArgParser()
flags = parser.parse_args(args=argv[1:])
model_function = model_fn
if FLAGS.multi_gpu:
validate_batch_size_for_multi_gpu(FLAGS.batch_size)
if flags.multi_gpu:
validate_batch_size_for_multi_gpu(flags.batch_size)
# There are two steps required if using multi-GPU: (1) wrap the model_fn,
# and (2) wrap the optimizer. The first happens here, and (2) happens
......@@ -187,16 +190,16 @@ def main(_):
model_function = tf.contrib.estimator.replicate_model_fn(
model_fn, loss_reduction=tf.losses.Reduction.MEAN)
data_format = FLAGS.data_format
data_format = flags.data_format
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
mnist_classifier = tf.estimator.Estimator(
model_fn=model_function,
model_dir=FLAGS.model_dir,
model_dir=flags.model_dir,
params={
'data_format': data_format,
'multi_gpu': FLAGS.multi_gpu
'multi_gpu': flags.multi_gpu
})
# Set up training and evaluation input functions.
......@@ -206,35 +209,35 @@ def main(_):
# When choosing shuffle buffer sizes, larger sizes result in better
# randomness, while smaller sizes use less memory. MNIST is a small
# enough dataset that we can easily shuffle the full epoch.
ds = dataset.train(FLAGS.data_dir)
ds = ds.cache().shuffle(buffer_size=50000).batch(FLAGS.batch_size)
ds = dataset.train(flags.data_dir)
ds = ds.cache().shuffle(buffer_size=50000).batch(flags.batch_size)
# Iterate through the dataset a set number (`epochs_between_evals`) of times
# during each training session.
ds = ds.repeat(FLAGS.epochs_between_evals)
ds = ds.repeat(flags.epochs_between_evals)
return ds
def eval_input_fn():
return dataset.test(FLAGS.data_dir).batch(
FLAGS.batch_size).make_one_shot_iterator().get_next()
return dataset.test(flags.data_dir).batch(
flags.batch_size).make_one_shot_iterator().get_next()
# Set up hook that outputs training logs every 100 steps.
train_hooks = hooks_helper.get_train_hooks(
FLAGS.hooks, batch_size=FLAGS.batch_size)
flags.hooks, batch_size=flags.batch_size)
# Train and evaluate model.
for _ in range(FLAGS.train_epochs // FLAGS.epochs_between_evals):
for _ in range(flags.train_epochs // flags.epochs_between_evals):
mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
print('\nEvaluation results:\n\t%s\n' % eval_results)
# Export the model
if FLAGS.export_dir is not None:
if flags.export_dir is not None:
image = tf.placeholder(tf.float32, [None, 28, 28])
input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
'image': image,
})
mnist_classifier.export_savedmodel(FLAGS.export_dir, input_fn)
mnist_classifier.export_savedmodel(flags.export_dir, input_fn)
class MNISTArgParser(argparse.ArgumentParser):
......@@ -243,14 +246,9 @@ class MNISTArgParser(argparse.ArgumentParser):
def __init__(self):
super(MNISTArgParser, self).__init__(parents=[
parsers.BaseParser(),
parsers.ImageModelParser()])
self.add_argument(
'--export_dir',
type=str,
help='[default: %(default)s] If set, a SavedModel serialization of the '
'model will be exported to this directory at the end of training. '
'See the README for more details and relevant links.')
parsers.ImageModelParser(),
parsers.ExportParser(),
])
self.set_defaults(
data_dir='/tmp/mnist_data',
......@@ -261,6 +259,4 @@ class MNISTArgParser(argparse.ArgumentParser):
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
parser = MNISTArgParser()
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
main(argv=sys.argv)
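The change above drops the module-level FLAGS global in favor of an argparse namespace built inside main(argv). A minimal sketch of the same pattern, using a hypothetical ToyArgParser composed from the shared parser mix-ins:

import argparse
import sys

import tensorflow as tf  # pylint: disable=g-bad-import-order

from official.utils.arg_parsers import parsers


class ToyArgParser(argparse.ArgumentParser):
  """Hypothetical parser built from the shared parser mix-ins."""

  def __init__(self):
    super(ToyArgParser, self).__init__(parents=[
        parsers.BaseParser(),
        parsers.ImageModelParser(),
        parsers.ExportParser(),
    ])
    self.set_defaults(data_dir='/tmp/toy_data', model_dir='/tmp/toy_model')


def main(argv):
  # Flags live in a local namespace rather than a global FLAGS object.
  flags = ToyArgParser().parse_args(args=argv[1:])
  print('Would train for %d epochs on data in %s' %
        (flags.train_epochs, flags.data_dir))


if __name__ == '__main__':
  tf.logging.set_verbosity(tf.logging.INFO)
  main(argv=sys.argv)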
......@@ -38,8 +38,6 @@ from official.mnist import dataset as mnist_dataset
from official.mnist import mnist
from official.utils.arg_parsers import parsers
FLAGS = None
def loss(logits, labels):
return tf.reduce_mean(
......@@ -97,35 +95,38 @@ def test(model, dataset):
tf.contrib.summary.scalar('accuracy', accuracy.result())
def main(_):
def main(argv):
parser = MNISTEagerArgParser()
flags = parser.parse_args(args=argv[1:])
tfe.enable_eager_execution()
# Automatically determine device and data_format
(device, data_format) = ('/gpu:0', 'channels_first')
if FLAGS.no_gpu or tfe.num_gpus() <= 0:
if flags.no_gpu or tfe.num_gpus() <= 0:
(device, data_format) = ('/cpu:0', 'channels_last')
# If data_format is defined in FLAGS, overwrite automatically set value.
if FLAGS.data_format is not None:
if flags.data_format is not None:
data_format = flags.data_format
print('Using device %s, and data format %s.' % (device, data_format))
# Load the datasets
train_ds = mnist_dataset.train(FLAGS.data_dir).shuffle(60000).batch(
FLAGS.batch_size)
test_ds = mnist_dataset.test(FLAGS.data_dir).batch(FLAGS.batch_size)
train_ds = mnist_dataset.train(flags.data_dir).shuffle(60000).batch(
flags.batch_size)
test_ds = mnist_dataset.test(flags.data_dir).batch(flags.batch_size)
# Create the model and optimizer
model = mnist.Model(data_format)
optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)
optimizer = tf.train.MomentumOptimizer(flags.lr, flags.momentum)
# Create file writers for writing TensorBoard summaries.
if FLAGS.output_dir:
if flags.output_dir:
# Create directories to which summaries will be written
# tensorboard --logdir=<output_dir>
# can then be used to see the recorded summaries.
train_dir = os.path.join(FLAGS.output_dir, 'train')
test_dir = os.path.join(FLAGS.output_dir, 'eval')
tf.gfile.MakeDirs(FLAGS.output_dir)
train_dir = os.path.join(flags.output_dir, 'train')
test_dir = os.path.join(flags.output_dir, 'eval')
tf.gfile.MakeDirs(flags.output_dir)
else:
train_dir = None
test_dir = None
......@@ -135,19 +136,19 @@ def main(_):
test_dir, flush_millis=10000, name='test')
# Create and restore checkpoint (if one exists on the path)
checkpoint_prefix = os.path.join(FLAGS.model_dir, 'ckpt')
checkpoint_prefix = os.path.join(flags.model_dir, 'ckpt')
step_counter = tf.train.get_or_create_global_step()
checkpoint = tfe.Checkpoint(
model=model, optimizer=optimizer, step_counter=step_counter)
# Restore variables on creation if a checkpoint exists.
checkpoint.restore(tf.train.latest_checkpoint(FLAGS.model_dir))
checkpoint.restore(tf.train.latest_checkpoint(flags.model_dir))
# Train and evaluate for a set number of epochs.
with tf.device(device):
for _ in range(FLAGS.train_epochs):
for _ in range(flags.train_epochs):
start = time.time()
with summary_writer.as_default():
train(model, optimizer, train_ds, step_counter, FLAGS.log_interval)
train(model, optimizer, train_ds, step_counter, flags.log_interval)
end = time.time()
print('\nTrain time for epoch #%d (%d total steps): %f' %
(checkpoint.save_counter.numpy() + 1,
......@@ -205,6 +206,4 @@ class MNISTEagerArgParser(argparse.ArgumentParser):
)
if __name__ == '__main__':
parser = MNISTEagerArgParser()
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
main(argv=sys.argv)
......@@ -46,6 +46,10 @@ tf.flags.DEFINE_string(
"metadata.")
# Model specific parameters
tf.flags.DEFINE_string(
"master", default=None,
help="GRPC URL of the master (e.g. grpc://ip.address.of.tpu:8470). You "
"must specify either this flag or --tpu.")
tf.flags.DEFINE_string("data_dir", "",
"Path to directory containing the MNIST dataset")
tf.flags.DEFINE_string("model_dir", None, "Estimator model_dir")
......@@ -132,11 +136,24 @@ def main(argv):
del argv # Unused.
tf.logging.set_verbosity(tf.logging.INFO)
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
if FLAGS.master is None and FLAGS.tpu is None:
raise RuntimeError('You must specify either --master or --tpu.')
if FLAGS.master is not None:
if FLAGS.tpu is not None:
tf.logging.warn('Both --master and --tpu are set. Ignoring '
'--tpu and using --master.')
tpu_grpc_url = FLAGS.master
else:
tpu_cluster_resolver = (
tf.contrib.cluster_resolver.TPUClusterResolver(
FLAGS.tpu,
zone=FLAGS.tpu_zone,
project=FLAGS.gcp_project))
tpu_grpc_url = tpu_cluster_resolver.get_master()
run_config = tf.contrib.tpu.RunConfig(
cluster=tpu_cluster_resolver,
master=tpu_grpc_url,
evaluation_master=tpu_grpc_url,
model_dir=FLAGS.model_dir,
session_config=tf.ConfigProto(
allow_soft_placement=True, log_device_placement=True),
......
psutil>=5.4.3
py-cpuinfo>=3.3.0
google-cloud-bigquery>=0.31.0
\ No newline at end of file
......@@ -51,4 +51,13 @@ The model will begin training and will automatically evaluate itself on the vali
Note that there are a number of other options you can specify, including `--model_dir` to choose where to store the model and `--resnet_size` to choose the model size (options include ResNet-18 through ResNet-200). See [`resnet.py`](resnet.py) for the full list of options.
### Pre-trained model
You can download a 190 MB pre-trained version of ResNet-50 achieving 75.3% top-1 single-crop accuracy here: [resnet50_2017_11_30.tar.gz](http://download.tensorflow.org/models/official/resnet50_2017_11_30.tar.gz). Simply download and uncompress the file, and point the model to the extracted directory using the `--model_dir` flag.
You can download 190 MB pre-trained versions of ResNet-50 achieving 76.3% and 75.3% (respectively) top-1 single-crop accuracy here: [resnetv2_imagenet_checkpoint.tar.gz](http://download.tensorflow.org/models/official/resnetv2_imagenet_checkpoint.tar.gz), [resnetv1_imagenet_checkpoint.tar.gz](http://download.tensorflow.org/models/official/resnetv1_imagenet_checkpoint.tar.gz). Simply download and uncompress the file, and point the model to the extracted directory using the `--model_dir` flag.
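For example, assuming the v2 archive was extracted to `/tmp/resnetv2_imagenet_checkpoint`, training can resume from it with something like `python imagenet_main.py --data_dir=/path/to/imagenet --model_dir=/tmp/resnetv2_imagenet_checkpoint` (paths illustrative).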
Other versions and formats:
* [ResNet-v2-ImageNet Checkpoint](http://download.tensorflow.org/models/official/resnetv2_imagenet_checkpoint.tar.gz)
* [ResNet-v2-ImageNet SavedModel](http://download.tensorflow.org/models/official/resnetv2_imagenet_savedmodel.tar.gz)
* [ResNet-v2-ImageNet Frozen Graph](http://download.tensorflow.org/models/official/resnetv2_imagenet_frozen_graph.pb)
* [ResNet-v1-ImageNet Checkpoint](http://download.tensorflow.org/models/official/resnetv1_imagenet_checkpoint.tar.gz)
* [ResNet-v1-ImageNet SavedModel](http://download.tensorflow.org/models/official/resnetv1_imagenet_savedmodel.tar.gz)
* [ResNet-v1-ImageNet Frozen Graph](http://download.tensorflow.org/models/official/resnetv1_imagenet_frozen_graph.pb)
......@@ -228,7 +228,10 @@ def main(argv):
flags = parser.parse_args(args=argv[1:])
input_function = flags.use_synthetic_data and get_synth_input_fn() or input_fn
resnet_run_loop.resnet_main(flags, cifar10_model_fn, input_function)
resnet_run_loop.resnet_main(
flags, cifar10_model_fn, input_function,
shape=[_HEIGHT, _WIDTH, _NUM_CHANNELS])
if __name__ == '__main__':
......
......@@ -305,7 +305,10 @@ def main(argv):
flags = parser.parse_args(args=argv[1:])
input_function = flags.use_synthetic_data and get_synth_input_fn() or input_fn
resnet_run_loop.resnet_main(flags, imagenet_model_fn, input_function)
resnet_run_loop.resnet_main(
flags, imagenet_model_fn, input_function,
shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
if __name__ == '__main__':
......
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test that the definitions of ResNet layers haven't changed.
These tests will fail if either:
a) The graph of a resnet layer changes and the change is significant enough
that it can no longer load existing checkpoints.
b) The numerical results produced by the layer change.
A warning will be issued if the graph changes, but the checkpoint still loads.
In the event that a layer change is intended, or the TensorFlow implementation
of a layer changes (and thus changes the graph), regenerate using the command:
$ python3 layer_test.py -regen
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.resnet import resnet_model
from official.utils.testing import reference_data
DATA_FORMAT = "channels_last" # CPU instructions often preclude channels_first
BATCH_SIZE = 32
BLOCK_TESTS = [
dict(bottleneck=True, projection=True, version=1, width=8, channels=4),
dict(bottleneck=True, projection=True, version=2, width=8, channels=4),
dict(bottleneck=True, projection=False, version=1, width=8, channels=4),
dict(bottleneck=True, projection=False, version=2, width=8, channels=4),
dict(bottleneck=False, projection=True, version=1, width=8, channels=4),
dict(bottleneck=False, projection=True, version=2, width=8, channels=4),
dict(bottleneck=False, projection=False, version=1, width=8, channels=4),
dict(bottleneck=False, projection=False, version=2, width=8, channels=4),
]
class BaseTest(reference_data.BaseTest):
"""Tests for core ResNet layers."""
@property
def test_name(self):
return "resnet"
def _batch_norm_ops(self, test=False):
name = "batch_norm"
g = tf.Graph()
with g.as_default():
tf.set_random_seed(self.name_to_seed(name))
input_tensor = tf.get_variable(
"input_tensor", dtype=tf.float32,
initializer=tf.random_uniform((32, 16, 16, 3), maxval=1)
)
layer = resnet_model.batch_norm(
inputs=input_tensor, data_format=DATA_FORMAT, training=True)
self._save_or_test_ops(
name=name, graph=g, ops_to_eval=[input_tensor, layer], test=test,
correctness_function=self.default_correctness_function
)
def make_projection(self, filters_out, strides, data_format):
"""1D convolution with stride projector.
Args:
filters_out: Number of filters in the projection.
strides: Stride length for convolution.
data_format: channels_first or channels_last
Returns:
A CNN projector function with kernel_size 1.
"""
def projection_shortcut(inputs):
return resnet_model.conv2d_fixed_padding(
inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
data_format=data_format)
return projection_shortcut
def _resnet_block_ops(self, test, batch_size, bottleneck, projection,
version, width, channels):
"""Test whether resnet block construction has changed.
Args:
test: Whether or not to run as a test case.
batch_size: Number of fake images in the batch. This is needed due to
batch normalization.
bottleneck: Whether or not to use bottleneck layers.
projection: Whether or not to project the input.
version: Which version of ResNet to test.
width: The width of the fake image.
channels: The number of channels in the fake image.
"""
name = "batch-size-{}_{}{}_version-{}_width-{}_channels-{}".format(
batch_size,
"bottleneck" if bottleneck else "building",
"_projection" if projection else "",
version,
width,
channels
)
if version == 1:
block_fn = resnet_model._building_block_v1
if bottleneck:
block_fn = resnet_model._bottleneck_block_v1
else:
block_fn = resnet_model._building_block_v2
if bottleneck:
block_fn = resnet_model._bottleneck_block_v2
g = tf.Graph()
with g.as_default():
tf.set_random_seed(self.name_to_seed(name))
strides = 1
channels_out = channels
projection_shortcut = None
if projection:
strides = 2
channels_out *= strides
projection_shortcut = self.make_projection(
filters_out=channels_out, strides=strides, data_format=DATA_FORMAT)
filters = channels_out
if bottleneck:
filters = channels_out // 4
input_tensor = tf.get_variable(
"input_tensor", dtype=tf.float32,
initializer=tf.random_uniform((batch_size, width, width, channels),
maxval=1)
)
layer = block_fn(inputs=input_tensor, filters=filters, training=True,
projection_shortcut=projection_shortcut, strides=strides,
data_format=DATA_FORMAT)
self._save_or_test_ops(
name=name, graph=g, ops_to_eval=[input_tensor, layer], test=test,
correctness_function=self.default_correctness_function
)
def test_batch_norm(self):
self._batch_norm_ops(test=True)
def test_block_0(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[0])
def test_block_1(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[1])
def test_block_2(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[2])
def test_block_3(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[3])
def test_block_4(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[4])
def test_block_5(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[5])
def test_block_6(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[6])
def test_block_7(self):
self._resnet_block_ops(test=True, batch_size=BATCH_SIZE, **BLOCK_TESTS[7])
def regenerate(self):
"""Create reference data files for ResNet layer tests."""
self._batch_norm_ops(test=False)
for block_params in BLOCK_TESTS:
self._resnet_block_ops(test=False, batch_size=BATCH_SIZE, **block_params)
if __name__ == "__main__":
reference_data.main(argv=sys.argv, test_class=BaseTest)
......@@ -30,7 +30,9 @@ import tensorflow as tf # pylint: disable=g-bad-import-order
from official.resnet import resnet_model
from official.utils.arg_parsers import parsers
from official.utils.export import export
from official.utils.logging import hooks_helper
from official.utils.logging import logger
################################################################################
......@@ -218,7 +220,13 @@ def resnet_model_fn(features, labels, mode, model_class,
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
# Return the predictions and the specification for serving a SavedModel
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
export_outputs={
'predict': tf.estimator.export.PredictOutput(predictions)
})
# Calculate loss, which includes softmax cross entropy and L2 regularization.
cross_entropy = tf.losses.softmax_cross_entropy(
......@@ -309,8 +317,20 @@ def validate_batch_size_for_multi_gpu(batch_size):
raise ValueError(err)
def resnet_main(flags, model_function, input_function):
"""Shared main loop for ResNet Models."""
def resnet_main(flags, model_function, input_function, shape=None):
"""Shared main loop for ResNet Models.
Args:
flags: FLAGS object that contains the params for running. See
ResnetArgParser for created flags.
model_function: the function that instantiates the Model and builds the
ops for train/eval. This will be passed directly into the estimator.
input_function: the function that processes the dataset and returns a
dataset that the estimator can train on. This will be wrapped with
all the relevant flags for running and passed to estimator.
shape: list of ints representing the shape of the images used for training.
This is only used if flags.export_dir is passed.
"""
# Using the Winograd non-fused algorithms provides a small performance boost.
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
......@@ -347,9 +367,17 @@ def resnet_main(flags, model_function, input_function):
'version': flags.version,
})
if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
benchmark_logger.log_run_info("resnet")
else:
benchmark_logger = None
for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(
flags.hooks, batch_size=flags.batch_size)
flags.hooks,
batch_size=flags.batch_size,
benchmark_log_dir=flags.benchmark_log_dir)
print('Starting a training cycle.')
......@@ -377,16 +405,38 @@ def resnet_main(flags, model_function, input_function):
steps=flags.max_train_steps)
print(eval_results)
if benchmark_logger:
benchmark_logger.log_estimator_evaluation_result(eval_results)
if flags.export_dir is not None:
warn_on_multi_gpu_export(flags.multi_gpu)
# Exports a saved model for the given classifier.
input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
shape, batch_size=flags.batch_size)
classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
def warn_on_multi_gpu_export(multi_gpu=False):
"""For the time being, multi-GPU mode does not play nicely with exporting."""
if multi_gpu:
tf.logging.warning(
'You are exporting a SavedModel while in multi-GPU mode. Note that '
'the resulting SavedModel will require the same GPUs to be available. '
'If you wish to serve the SavedModel from a different device, '
'try exporting the SavedModel with multi-GPU mode turned off.')
class ResnetArgParser(argparse.ArgumentParser):
"""Arguments for configuring and running a Resnet Model.
"""
"""Arguments for configuring and running a Resnet Model."""
def __init__(self, resnet_size_choices=None):
super(ResnetArgParser, self).__init__(parents=[
parsers.BaseParser(),
parsers.PerformanceParser(),
parsers.ImageModelParser(),
parsers.ExportParser(),
parsers.BenchmarkParser(),
])
self.add_argument(
......
......@@ -131,7 +131,7 @@ class BaseParser(argparse.ArgumentParser):
"of train hooks. "
"Example: --hooks LoggingTensorHook ExamplesPerSecondHook. "
"Allowed hook names (case-insensitive): LoggingTensorHook, "
"ProfilerHook, ExamplesPerSecondHook. "
"ProfilerHook, ExamplesPerSecondHook, LoggingMetricHook."
"See official.utils.logging.hooks_helper for details.",
metavar="<HK>"
)
......@@ -224,3 +224,70 @@ class ImageModelParser(argparse.ArgumentParser):
"was built for CPU or GPU.",
metavar="<CF>"
)
class ExportParser(argparse.ArgumentParser):
"""Parsing options for exporting saved models or other graph defs.
This is a separate parser for now, but should be made part of BaseParser
once all models are brought up to speed.
Args:
add_help: Create the "--help" flag. False if class instance is a parent.
export_dir: Create a flag to specify where a SavedModel should be exported.
"""
def __init__(self, add_help=False, export_dir=True):
super(ExportParser, self).__init__(add_help=add_help)
if export_dir:
self.add_argument(
"--export_dir", "-ed",
help="[default: %(default)s] If set, a SavedModel serialization of "
"the model will be exported to this directory at the end of "
"training. See the README for more details and relevant links.",
metavar="<ED>"
)
class BenchmarkParser(argparse.ArgumentParser):
"""Default parser for benchmark logging.
Args:
add_help: Create the "--help" flag. False if class instance is a parent.
benchmark_log_dir: Create a flag to specify location for benchmark logging.
bigquery_uploader: Create flags for uploading benchmark results to BigQuery.
"""
def __init__(self, add_help=False, benchmark_log_dir=True,
bigquery_uploader=True):
super(BenchmarkParser, self).__init__(add_help=add_help)
if benchmark_log_dir:
self.add_argument(
"--benchmark_log_dir", "-bld", default=None,
help="[default: %(default)s] The location of the benchmark logging.",
metavar="<BLD>"
)
if bigquery_uploader:
self.add_argument(
"--gcp_project", "-gp", default=None,
help="[default: %(default)s] The GCP project name where the benchmark"
" will be uploaded.",
metavar="<GP>"
)
self.add_argument(
"--bigquery_data_set", "-bds", default="test_benchmark",
help="[default: %(default)s] The Bigquery dataset name where the"
" benchmark will be uploaded.",
metavar="<BDS>"
)
self.add_argument(
"--bigquery_run_table", "-brt", default="benchmark_run",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark run information will be uploaded.",
metavar="<BRT>"
)
self.add_argument(
"--bigquery_metric_table", "-bmt", default="benchmark_metric",
help="[default: %(default)s] The Bigquery table name where the"
" benchmark metric information will be uploaded.",
metavar="<BMT>"
)
......@@ -28,7 +28,8 @@ class TestParser(argparse.ArgumentParser):
parsers.BaseParser(),
parsers.PerformanceParser(num_parallel_calls=True, inter_op=True,
intra_op=True, use_synthetic_data=True),
parsers.ImageModelParser(data_format=True)
parsers.ImageModelParser(data_format=True),
parsers.BenchmarkParser(benchmark_log_dir=True, bigquery_uploader=True)
])
......@@ -58,6 +59,20 @@ class BaseTester(unittest.TestCase):
for key, value in defaults.items():
assert namespace_vars[key] == value
def test_benchmark_setting(self):
defaults = dict(
hooks=["LoggingMetricHook"],
benchmark_log_dir="/tmp/12345",
gcp_project="project_abc",
)
parser = TestParser()
parser.set_defaults(**defaults)
namespace_vars = vars(parser.parse_args([]))
for key, value in defaults.items():
assert namespace_vars[key] == value
def test_booleans(self):
"""Test to ensure boolean flags trigger as expected.
"""
......
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convenience functions for exporting models as SavedModels or other types."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32,
batch_size=1):
"""Returns a input_receiver_fn that can be used during serving.
This expects examples to come through as float tensors, and simply
wraps them as TensorServingInputReceivers.
Arguably, this should live in tf.estimator.export. Testing here first.
Args:
shape: list representing target size of a single example.
dtype: the expected datatype for the input example
batch_size: number of input tensors that will be passed for prediction
Returns:
A function that itself returns a TensorServingInputReceiver.
"""
def serving_input_receiver_fn():
# Prep a placeholder where the input example will be fed in
features = tf.placeholder(
dtype=dtype, shape=[batch_size] + shape, name='input_tensor')
return tf.estimator.export.TensorServingInputReceiver(
features=features, receiver_tensors=features)
return serving_input_receiver_fn
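A usage sketch, mirroring how resnet_run_loop.py calls this helper; the estimator argument is assumed to be an already trained tf.estimator.Estimator, and the shape is illustrative:

from official.utils.export import export


def export_as_saved_model(estimator, export_dir):
  """Exports `estimator` as a SavedModel that expects 28x28 float images."""
  # One single-example float tensor per request; adjust shape/batch_size to
  # match the model actually being served.
  input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
      shape=[28, 28], batch_size=1)
  estimator.export_savedmodel(export_dir, input_receiver_fn)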
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for exporting utils."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.export import export
class ExportUtilsTest(tf.test.TestCase):
"""Tests for the ExportUtils."""
def test_build_tensor_serving_input_receiver_fn(self):
receiver_fn = export.build_tensor_serving_input_receiver_fn(shape=[4, 5])
with tf.Graph().as_default():
receiver = receiver_fn()
self.assertIsInstance(
receiver, tf.estimator.export.TensorServingInputReceiver)
self.assertIsInstance(receiver.features, tf.Tensor)
self.assertEqual(receiver.features.shape, tf.TensorShape([1, 4, 5]))
self.assertEqual(receiver.features.dtype, tf.float32)
self.assertIsInstance(receiver.receiver_tensors, dict)
# Note that Python 3 can no longer index .values() directly; cast to list.
self.assertEqual(list(receiver.receiver_tensors.values())[0].shape,
tf.TensorShape([1, 4, 5]))
def test_build_tensor_serving_input_receiver_fn_batch_dtype(self):
receiver_fn = export.build_tensor_serving_input_receiver_fn(
shape=[4, 5], dtype=tf.int8, batch_size=10)
with tf.Graph().as_default():
receiver = receiver_fn()
self.assertIsInstance(
receiver, tf.estimator.export.TensorServingInputReceiver)
self.assertIsInstance(receiver.features, tf.Tensor)
self.assertEqual(receiver.features.shape, tf.TensorShape([10, 4, 5]))
self.assertEqual(receiver.features.dtype, tf.int8)
self.assertIsInstance(receiver.receiver_tensors, dict)
# Note that Python 3 can no longer index .values() directly; cast to list.
self.assertEqual(list(receiver.receiver_tensors.values())[0].shape,
tf.TensorShape([10, 4, 5]))
if __name__ == "__main__":
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to upload benchmark generated by BenchmarkLogger to remote repo.
This library require google cloud bigquery lib as dependency, which can be
installed with:
> pip install --upgrade google-cloud-bigquery
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import sys
import uuid
from google.cloud import bigquery
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.arg_parsers import parsers
from official.utils.logging import logger
class BigQueryUploader(object):
"""Upload the benchmark and metric info to BigQuery."""
def __init__(self, logging_dir, gcp_project=None, credentials=None):
"""Initialized BigQueryUploader with proper setting.
Args:
logging_dir: string, logging directory that contains the benchmark log.
gcp_project: string, the name of the GCP project that the log will be
uploaded to. The default project name will be detected from local
environment if no value is provided.
credentials: google.auth.credentials. The credential to access the
BigQuery service. The default service account credential will be
detected from the local environment if no value is provided. Please use
google.oauth2.service_account.Credentials to load credentials from a local
file when the test is run outside of GCP.
"""
self._logging_dir = logging_dir
self._bq_client = bigquery.Client(
project=gcp_project, credentials=credentials)
def upload_benchmark_run(self, dataset_name, table_name, run_id):
"""Upload benchmark run information to Bigquery.
Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the data will be uploaded.
run_id: string, a unique ID that will be attached to the data, usually
in UUID4 format.
"""
expected_file = os.path.join(
self._logging_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
benchmark_json = json.load(f)
benchmark_json["model_id"] = run_id
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, [benchmark_json])
if errors:
tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))
def upload_metric(self, dataset_name, table_name, run_id):
"""Upload metric information to Bigquery.
Args:
dataset_name: string, the name of bigquery dataset where the data will be
uploaded.
table_name: string, the name of bigquery table under the dataset where
the metric data will be uploaded. This is different from the
benchmark_run table.
run_id: string, a unique ID that will be attached to the data, usually
in UUID4 format. This should be the same as the benchmark run_id.
"""
expected_file = os.path.join(
self._logging_dir, logger.METRIC_LOG_FILE_NAME)
with tf.gfile.GFile(expected_file) as f:
lines = f.readlines()
metrics = []
for line in filter(lambda l: l.strip(), lines):
metric = json.loads(line)
metric["run_id"] = run_id
metrics.append(metric)
table_ref = self._bq_client.dataset(dataset_name).table(table_name)
errors = self._bq_client.insert_rows_json(table_ref, metrics)
if errors:
tf.logging.error(
"Failed to upload benchmark info to bigquery: {}".format(errors))
def main(argv):
parser = parsers.BenchmarkParser()
flags = parser.parse_args(args=argv[1:])
if not flags.benchmark_log_dir:
print("Usage: benchmark_uploader.py --benchmark_log_dir=/some/dir")
sys.exit(1)
uploader = BigQueryUploader(
flags.benchmark_log_dir,
gcp_project=flags.gcp_project)
run_id = str(uuid.uuid4())
uploader.upload_benchmark_run(
flags.bigquery_data_set, flags.bigquery_run_table, run_id)
uploader.upload_metric(
flags.bigquery_data_set, flags.bigquery_metric_table, run_id)
if __name__ == "__main__":
main(argv=sys.argv)
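As the constructor docstring above notes, runs outside of GCP can pass explicit service-account credentials; a sketch, assuming the module is importable as official.utils.logging.benchmark_uploader (module path and key path are assumptions):

from google.oauth2 import service_account

from official.utils.logging import benchmark_uploader  # assumed module path

# Load an explicit service-account key instead of relying on the environment.
credentials = service_account.Credentials.from_service_account_file(
    "/path/to/service_account_key.json")
uploader = benchmark_uploader.BigQueryUploader(
    "/tmp/benchmark_log", gcp_project="my-gcp-project", credentials=credentials)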
......@@ -27,6 +27,7 @@ from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logging import hooks
from official.utils.logging import metric_hook
_TENSORS_TO_LOG = dict((x, x) for x in ['learning_rate',
'cross_entropy',
......@@ -122,9 +123,37 @@ def get_examples_per_second_hook(every_n_steps=100,
warm_steps=warm_steps)
def get_logging_metric_hook(benchmark_log_dir=None,
tensors_to_log=None,
every_n_secs=600,
**kwargs): # pylint: disable=unused-argument
"""Function to get LoggingMetricHook.
Args:
benchmark_log_dir: `string`, directory path to save the metric log.
tensors_to_log: List of tensor names or dictionary mapping labels to tensor
names. If not set, log _TENSORS_TO_LOG by default.
every_n_secs: `int`, the frequency for logging the metric. Defaults to every
10 minutes.
Returns:
A LoggingMetricHook that writes the specified tensor values to the benchmark
log under benchmark_log_dir.
"""
if benchmark_log_dir is None:
raise ValueError("metric_log_dir should be provided to use metric logger")
if tensors_to_log is None:
tensors_to_log = _TENSORS_TO_LOG
return metric_hook.LoggingMetricHook(
tensors=tensors_to_log,
log_dir=benchmark_log_dir,
every_n_secs=every_n_secs)
# A dictionary to map one hook name and its corresponding function
HOOKS = {
'loggingtensorhook': get_logging_tensor_hook,
'profilerhook': get_profiler_hook,
'examplespersecondhook': get_examples_per_second_hook,
'loggingmetrichook': get_logging_metric_hook,
}
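With the hook registered above, callers can request it by name together with the other hooks; a minimal sketch (directory path illustrative):

from official.utils.logging import hooks_helper

# benchmark_log_dir is forwarded to get_logging_metric_hook via **kwargs;
# the other hook factories simply ignore it.
train_hooks = hooks_helper.get_train_hooks(
    ['LoggingTensorHook', 'ExamplesPerSecondHook', 'LoggingMetricHook'],
    batch_size=128,
    benchmark_log_dir='/tmp/benchmark_log')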
......@@ -49,16 +49,19 @@ class BaseTest(unittest.TestCase):
expected_hook_name)
def test_get_train_hooks_logging_tensor_hook(self):
test_hook_name = 'LoggingTensorHook'
self.validate_train_hook_name(test_hook_name, 'loggingtensorhook')
self.validate_train_hook_name('LoggingTensorHook', 'loggingtensorhook')
def test_get_train_hooks_profiler_hook(self):
test_hook_name = 'ProfilerHook'
self.validate_train_hook_name(test_hook_name, 'profilerhook')
self.validate_train_hook_name('ProfilerHook', 'profilerhook')
def test_get_train_hooks_examples_per_second_hook(self):
test_hook_name = 'ExamplesPerSecondHook'
self.validate_train_hook_name(test_hook_name, 'examplespersecondhook')
self.validate_train_hook_name('ExamplesPerSecondHook',
'examplespersecondhook')
def test_get_logging_metric_hook(self):
test_hook_name = 'LoggingMetricHook'
self.validate_train_hook_name(test_hook_name, 'loggingmetrichook',
benchmark_log_dir='/tmp')
if __name__ == '__main__':
tf.test.main()
......@@ -13,19 +13,26 @@
# limitations under the License.
# ==============================================================================
"""Logging utilities for benchmark."""
"""Logging utilities for benchmark.
For collecting local environment metrics like CPU and memory, certain python
packages need to be installed. See README for details.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import json
import multiprocessing
import numbers
import os
import tensorflow as tf
from tensorflow.python.client import device_lib
_METRIC_LOG_FILE_NAME = "metric.log"
METRIC_LOG_FILE_NAME = "metric.log"
BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"
......@@ -37,6 +44,25 @@ class BenchmarkLogger(object):
if not tf.gfile.IsDirectory(self._logging_dir):
tf.gfile.MakeDirs(self._logging_dir)
def log_estimator_evaluation_result(self, eval_results):
"""Log the evaluation result for a estimator.
The evaluate result is a directory that contains metrics defined in
model_fn. It also contains a entry for global_step which contains the value
of the global step when evaluation was performed.
Args:
eval_results: dict, the result of evaluate() from an estimator.
"""
if not isinstance(eval_results, dict):
tf.logging.warning("eval_results should be directory for logging. Got %s",
type(eval_results))
return
global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
for key in sorted(eval_results):
if key != tf.GraphKeys.GLOBAL_STEP:
self.log_metric(key, eval_results[key], global_step=global_step)
def log_metric(self, name, value, unit=None, global_step=None, extras=None):
"""Log the benchmark metric information to local file.
......@@ -55,9 +81,12 @@ class BenchmarkLogger(object):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return
if extras:
extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
else:
extras = []
with tf.gfile.GFile(
os.path.join(self._logging_dir, _METRIC_LOG_FILE_NAME), "a") as f:
os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
metric = {
"name": name,
"value": float(value),
......@@ -72,3 +101,96 @@ class BenchmarkLogger(object):
except (TypeError, ValueError) as e:
tf.logging.warning("Failed to dump metric to log file: "
"name %s, value %s, error %s", name, value, e)
def log_run_info(self, model_name):
"""Collect most of the TF runtime information for the local env.
The schema of the run info follows official/benchmark/datastore/schema.
Args:
model_name: string, the name of the model.
"""
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)
with tf.gfile.GFile(os.path.join(
self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
try:
json.dump(run_info, f)
f.write("\n")
except (TypeError, ValueError) as e:
tf.logging.warning("Failed to dump benchmark run info to log file: %s",
e)
def _collect_tensorflow_info(run_info):
run_info["tensorflow_version"] = {
"version": tf.VERSION, "git_hash": tf.GIT_VERSION}
def _collect_tensorflow_environment_variables(run_info):
run_info["tensorflow_environment_variables"] = [
{"name": k, "value": v}
for k, v in sorted(os.environ.items()) if k.startswith("TF_")]
# The following code is mirrored from tensorflow/tools/test/system_info_lib
# which is not exposed for import.
def _collect_cpu_info(run_info):
"""Collect the CPU information for the local environment."""
cpu_info = {}
cpu_info["num_cores"] = multiprocessing.cpu_count()
# Note: cpuinfo is not installed in the TensorFlow OSS tree.
# It is installable via pip.
import cpuinfo # pylint: disable=g-import-not-at-top
info = cpuinfo.get_cpu_info()
cpu_info["cpu_info"] = info["brand"]
cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6
run_info["machine_config"]["cpu_info"] = cpu_info
def _collect_gpu_info(run_info):
"""Collect local GPU information by TF device library."""
gpu_info = {}
local_device_protos = device_lib.list_local_devices()
gpu_info["count"] = len([d for d in local_device_protos
if d.device_type == "GPU"])
# The device description is a comma-separated string from which the GPU
# model can be parsed, e.g.:
# "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0"
for d in local_device_protos:
if d.device_type == "GPU":
gpu_info["model"] = _parse_gpu_model(d.physical_device_desc)
# Assume all connected GPUs are the same model
break
run_info["machine_config"]["gpu_info"] = gpu_info
def _collect_memory_info(run_info):
# Note: psutil is not installed in the TensorFlow OSS tree.
# It is installable via pip.
import psutil # pylint: disable=g-import-not-at-top
vmem = psutil.virtual_memory()
run_info["machine_config"]["memory_total"] = vmem.total
run_info["machine_config"]["memory_available"] = vmem.available
def _parse_gpu_model(physical_device_desc):
# Assume all connected GPUs are the same model.
for kv in physical_device_desc.split(","):
k, _, v = kv.partition(":")
if k.strip() == "name":
return v.strip()
return None
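Putting the logger together, a training run records its environment once and then appends metrics as they become available; a minimal sketch (paths and values illustrative; log_run_info assumes psutil and py-cpuinfo are installed, per requirements.txt):

from official.utils.logging import logger

benchmark_logger = logger.BenchmarkLogger('/tmp/benchmark_log')

# One-time snapshot: TF version, TF_* environment variables, CPU/GPU/memory.
benchmark_logger.log_run_info('resnet')

# Individual metrics are appended as JSON lines to metric.log.
benchmark_logger.log_metric('accuracy', 0.91, global_step=1000)

# Or log everything an Estimator's evaluate() returned in one call:
# eval_results = classifier.evaluate(input_fn=eval_input_fn)
# benchmark_logger.log_estimator_evaluation_result(eval_results)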