Commit f1e3135b authored by qianyj
Browse files

update TF code

parent f0d87682
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Used to run benchmark_cnn for distributed tests.
In distributed tests, we spawn processes to run tf_cnn_benchmark tasks. We could
directly spawn tf_cnn_benchmark processes, but we want some added functionality,
such as being able to inject custom images during training. So instead, this
file is spawned as a Python process, which supports the added functionality.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags as absl_flags
import numpy as np
import tensorflow.compat.v1 as tf
import benchmark_cnn
import flags
import preprocessing
import test_util
absl_flags.DEFINE_string('fake_input', 'none',
"""What fake input to inject into benchmark_cnn. This
is ignored if --model=test_model.
Options are:
none: Do not inject any fake input.
zeros_and_ones: Half the images will be all 0s with
a label of 0. Half the images will be all 1s with a
label of 1.""")
flags.define_flags()
FLAGS = flags.FLAGS
def get_test_image_preprocessor(batch_size, params):
  """Returns the preprocessing.TestImagePreprocessor that should be injected.

  Returns None if no preprocessor should be injected.

  Args:
    batch_size: The batch size across all GPUs.
    params: BenchmarkCNN's parameters.
  Returns:
    Returns the preprocessing.TestImagePreprocessor that should be injected,
    or None.
  Raises:
    ValueError: Flag --fake_input is an invalid value.
  """
  fake_input = FLAGS.fake_input
  if fake_input == 'none':
    return None
  if fake_input != 'zeros_and_ones':
    raise ValueError('Invalid --fake_input: %s' % fake_input)
  # Build a batch whose first half is all-zero images labeled 0 and whose
  # second half is all-one images labeled 1.
  num_zeros = batch_size // 2
  images = np.zeros((batch_size, 227, 227, 3), dtype=np.float32)
  images[num_zeros:, :, :, :] = 1
  labels = np.array([0] * num_zeros + [1] * num_zeros, dtype=np.int32)
  preprocessor = preprocessing.TestImagePreprocessor(
      batch_size, [227, 227, 3], params.num_gpus,
      benchmark_cnn.get_data_type(params))
  preprocessor.set_fake_data(images, labels)
  preprocessor.expected_subset = 'validation' if params.eval else 'train'
  return preprocessor
def run_with_real_model(params):
  """Runs tf_cnn_benchmarks with a real model."""
  bench = benchmark_cnn.BenchmarkCNN(params)
  bench.print_info()
  fake_preprocessor = get_test_image_preprocessor(bench.batch_size, params)
  if fake_preprocessor is not None:
    # The test image preprocessor requires queue runners. Since this file is
    # used for testing, it is OK to access protected members.
    bench.dataset._queue_runner_required = True  # pylint: disable=protected-access
    bench.input_preprocessor = fake_preprocessor
  bench.run()
def run_with_test_model(params):
  """Runs tf_cnn_benchmarks with a test model."""
  fake_inputs = test_util.get_fake_var_update_inputs()
  test_model = test_util.TestCNNModel()
  # Show extra loss/accuracy digits so tests can compare values precisely.
  with test_util.monkey_patch(benchmark_cnn,
                              LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
    bench = benchmark_cnn.BenchmarkCNN(params, dataset=test_util.TestDataSet(),
                                       model=test_model)
    # The test model does not use labels when computing loss, so the label
    # values do not matter as long as the shape is right.
    num_examples = fake_inputs.shape[0]
    bench.input_preprocessor.set_fake_data(fake_inputs,
                                           np.array([1] * num_examples))
    bench.run()
def main(_):
  """Builds params from flags and dispatches to the appropriate runner."""
  params = benchmark_cnn.setup(benchmark_cnn.make_params_from_flags())
  if params.model == 'test_model':
    run_with_test_model(params)
  else:
    run_with_real_model(params)
if __name__ == '__main__':
  # This runner uses TF1-style graph/session APIs throughout.
  tf.disable_v2_behavior()
  tf.app.run()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark_cnn."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
import os
import re
import unittest
import mock
import numpy as np
import tensorflow.compat.v1 as tf
from google.protobuf import text_format
from tensorflow.core.framework import step_stats_pb2
from tensorflow.core.profiler import tfprof_log_pb2
from tensorflow.python.platform import test
import benchmark_cnn
import datasets
import flags
import preprocessing
import test_util
import variable_mgr_util
from platforms import util as platforms_util
def _check_has_gpu():
  """Raises ValueError if no CUDA-capable GPU is available."""
  if not test.is_gpu_available(cuda_only=True):
    raise ValueError(
        """You have asked to run part or all of this on GPU, but it appears
        that no GPU is available. If your machine has GPUs it is possible you
        do not have a version of TensorFlow with GPU support. To build with GPU
        support, add --config=cuda to the build flags.\n """)
class TfCnnBenchmarksModelTest(tf.test.TestCase):
  """Tests which are run with multiple models.

  Subclasses override get_model_name() (and optionally the test-enabling
  hooks below) to run this suite against a specific model.
  """

  def setUp(self):
    super(TfCnnBenchmarksModelTest, self).setUp()
    benchmark_cnn.setup(benchmark_cnn.make_params())

  def get_model_name(self):
    # Base class runs no model; subclasses return a model name string.
    return None

  # Return true to run tests that don't need to be run on every model.
  # This should be done for one or two cheap models.
  def extended_tests(self):
    return False

  # Return false to suppress actually running the model; this is useful
  # for tests that are large.
  def model_execution_test(self):
    return False

  # Return false to suppress actually saving and loading the model.
  def model_save_load_test(self):
    return False

  def testSaveLoadModel(self):
    """Trains, checkpoints, restores, and checks restored state."""
    _check_has_gpu()
    if not self.get_model_name() or not self.model_save_load_test():
      return
    params = benchmark_cnn.make_params(
        model=self.get_model_name(),
        num_batches=1,
        num_intra_threads=0,
        num_inter_threads=0,
        distortions=False,
        batch_size=2,
        variable_update='replicated',
        num_warmup_batches=0,
        num_gpus=2,
        train_dir=test_util.get_temp_dir('testSaveLoadModel_' +
                                         self.get_model_name()))
    # Run one batch and save the model.
    # Note that this uses a non-test session.
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.run()
    self.assertEqual(bench.init_global_step, 0)
    # Clear the default graph.
    tf.reset_default_graph()
    # Test if checkpoint had been saved.
    ckpt = tf.train.get_checkpoint_state(params.train_dir)
    match = re.match(os.path.join(params.train_dir, r'model.ckpt-(\d+).index'),
                     ckpt.model_checkpoint_path + '.index')
    self.assertTrue(match)
    self.assertGreaterEqual(int(match.group(1)), params.num_batches)
    params = params._replace(num_batches=2)
    # Reload the model
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.run()
    # Check if global step has been restored.
    self.assertNotEqual(bench.init_global_step, 0)
    ckpt = tf.train.get_checkpoint_state(params.train_dir)
    match = re.match(os.path.join(params.train_dir, r'model.ckpt-(\d+).index'),
                     ckpt.model_checkpoint_path + '.index')
    self.assertTrue(match)
    self.assertGreaterEqual(int(match.group(1)), params.num_batches)
    # Check that the batch norm moving averages are restored from checkpoints
    with tf.Graph().as_default():
      bench = benchmark_cnn.BenchmarkCNN(params)
      bench._build_model()
      saver = tf.train.Saver(bench.variable_mgr.savable_variables())
      with tf.Session(config=benchmark_cnn.create_config_proto(params)) as sess:
        benchmark_cnn.load_checkpoint(saver, sess, params.train_dir)
        sess.run(bench.variable_mgr.get_post_init_ops())
        bn_moving_vars = [
            v for v in tf.global_variables()
            if '/batchnorm' in v.name and '/moving' in v.name
        ]
        self.assertGreater(len(bn_moving_vars), 0)
        for moving_var in bn_moving_vars:
          moving_var_value = sess.run(moving_var)
          # Check that the moving means and moving variances have been restored
          # by asserting they are not their default values of 0 and 1,
          # respectively
          if '/moving_mean' in moving_var.name:
            self.assertFalse(np.array_equal(moving_var_value,
                                            np.zeros(moving_var_value.shape,
                                                     moving_var_value.dtype)))
          else:
            self.assertIn('/moving_variance', moving_var.name)
            self.assertFalse(np.array_equal(moving_var_value,
                                            np.ones(moving_var_value.shape,
                                                    moving_var_value.dtype)))

  def testModel(self):
    """Runs a single training batch of the model end-to-end."""
    _check_has_gpu()
    if not self.get_model_name() or not self.model_execution_test():
      return
    params = benchmark_cnn.make_params(
        model=self.get_model_name(),
        num_batches=1,
        num_intra_threads=1,
        num_inter_threads=12,
        batch_size=2,
        distortions=False)
    # Run this one; note that this uses a non-test session.
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.run()

  def testSendRecvVariables(self):
    self._testVariables('parameter_server')
    if self.extended_tests():
      self._testVariables('parameter_server', local_parameter_device='CPU')
      self._testVariables('parameter_server', optimizer='sgd')

  def testReplicatedVariables(self):
    self._testVariables('replicated')
    if self.extended_tests():
      self._testVariables('replicated', all_reduce_spec=None)
      self._testVariables('replicated', use_fp16=True, fp16_vars=False)
      self._testVariables(
          'replicated',
          all_reduce_spec=None,
          use_fp16=True,
          fp16_vars=False,
          fp16_enable_auto_loss_scale=True,
          fp16_inc_loss_scale_every_n=4)

  def testIndependentVariables(self):
    self._testVariables('independent')
    self._testVariables(
        'independent',
        all_reduce_spec=None,
        use_fp16=True,
        fp16_vars=False,
        fp16_enable_auto_loss_scale=True,
        fp16_inc_loss_scale_every_n=4)

  def testSummaryVerbosity(self):
    self._testVariables('parameter_server', summary_verbosity=1)
    if self.extended_tests():
      self._testVariables('parameter_server', summary_verbosity=2)
      self._testVariables('parameter_server', summary_verbosity=3)

  def testStagedVariables(self):
    self._testVariables('parameter_server', staged_vars=True)
    if self.extended_tests():
      self._testVariables('parameter_server', staged_vars=True,
                          local_parameter_device='CPU')
      self._testVariables('parameter_server', staged_vars=True, use_fp16=True,
                          fp16_vars=True)

  def _assert_correct_var_type(self, var, params):
    # Model variables should be fp16 only when fp16_vars is set; batchnorm
    # variables and gpu_cached_inputs stay fp32 regardless.
    if 'gpu_cached_inputs' not in var.name:
      if params.use_fp16 and params.fp16_vars and 'batchnorm' not in var.name:
        expected_type = tf.float16
      else:
        expected_type = tf.float32
      self.assertEqual(var.dtype.base_dtype, expected_type)

  def _testVariables(self,
                     variable_update,
                     summary_verbosity=0,
                     local_parameter_device='GPU',
                     staged_vars=False,
                     optimizer='momentum',
                     # TODO(b/80125832): Enable nccl in tests
                     # all_reduce_spec='nccl',
                     all_reduce_spec='',
                     use_fp16=False,
                     fp16_vars=False,
                     fp16_enable_auto_loss_scale=False,
                     fp16_inc_loss_scale_every_n=10):
    """Builds (but does not run) the model and validates variable placement."""
    if not self.get_model_name():
      return
    _check_has_gpu()
    params = benchmark_cnn.make_params(
        model=self.get_model_name(),
        num_batches=1,
        num_intra_threads=1,
        num_inter_threads=12,
        distortions=False,
        variable_update=variable_update,
        local_parameter_device=local_parameter_device,
        num_gpus=2,
        summary_verbosity=summary_verbosity,
        staged_vars=staged_vars,
        optimizer=optimizer,
        all_reduce_spec=all_reduce_spec,
        compact_gradient_transfer=False if all_reduce_spec == 'nccl' else True,
        use_fp16=use_fp16,
        fp16_loss_scale=2.,
        fp16_vars=fp16_vars,
        fp16_enable_auto_loss_scale=fp16_enable_auto_loss_scale,
        fp16_inc_loss_scale_every_n=fp16_inc_loss_scale_every_n,
    )
    # Test building models using multiple GPUs, but don't
    # run them.
    with self.test_session(graph=tf.Graph()):
      bench = benchmark_cnn.BenchmarkCNN(params)
      bench._build_model()
      # Rough validation of variable type and placement, depending on mode.
      all_vars = tf.global_variables() + tf.local_variables()
      if params.variable_update == 'parameter_server':
        for v in all_vars:
          tf.logging.debug('var: %s' % v.name)
          match = re.match(r'tower_(\d+)/v/gpu_cached_inputs:0', v.name)
          if match:
            self.assertEqual(v.device, '/device:GPU:%s' % match.group(1))
          elif v.name.startswith('v/'):
            self.assertEqual(v.device, '/device:%s:0' % local_parameter_device)
            self._assert_correct_var_type(v, params)
          elif v.name in ('input_processing/images:0',
                          'input_processing/labels:0', 'init_learning_rate:0',
                          'global_step:0', 'loss_scale:0',
                          'loss_scale_normal_steps:0'):
            self.assertEqual(v.device, '/device:CPU:0')
          else:
            raise ValueError('Unexpected variable %s' % v.name)
      else:
        # Replicated/independent modes: each tower keeps its own copy, so the
        # two towers must have matching variable counts.
        v0_count = 0
        v1_count = 0
        for v in all_vars:
          if v.name.startswith('tower_0/v0/'):
            self.assertEqual(v.name, 'tower_0/v0/gpu_cached_inputs:0')
            self.assertEqual(v.device, '/device:GPU:0')
          elif v.name.startswith('tower_1/v1/'):
            self.assertEqual(v.name, 'tower_1/v1/gpu_cached_inputs:0')
            self.assertEqual(v.device, '/device:GPU:1')
          elif v.name.startswith('v0/'):
            v0_count += 1
            self.assertEqual(v.device, '/device:GPU:0')
            self._assert_correct_var_type(v, params)
          elif v.name.startswith('v1/'):
            v1_count += 1
            self.assertEqual(v.device, '/device:GPU:1')
            self._assert_correct_var_type(v, params)
          elif v.name in ('input_processing/images:0',
                          'input_processing/labels:0', 'init_learning_rate:0',
                          'global_step:0', 'loss_scale:0',
                          'loss_scale_normal_steps:0'):
            self.assertEqual(v.device, '/device:CPU:0')
          else:
            raise ValueError('Unexpected variable %s' % v.name)
        self.assertEqual(v0_count, v1_count)
      # Validate summary ops in the model depending on verbosity level
      summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
      num_summary_ops = len(summary_ops)
      self.assertEqual(num_summary_ops > 0, summary_verbosity > 0)
      if summary_verbosity > 0:
        has_affine_histogram = False
        has_gradient_histogram = False
        has_log_gradients_histogram = False
        for op in summary_ops:
          if '/gradients' in op.name:
            has_gradient_histogram = True
          elif '/affine' in op.name:
            has_affine_histogram = True
          elif 'log_gradients' in op.name:
            has_log_gradients_histogram = True
        self.assertEqual(summary_verbosity >= 3, has_affine_histogram)
        self.assertEqual(summary_verbosity >= 3, has_gradient_histogram)
        self.assertEqual(summary_verbosity >= 2, has_log_gradients_histogram)
        if summary_verbosity == 1:
          self.assertLess(num_summary_ops, 10)
# Per-model subclasses: each runs the shared TfCnnBenchmarksModelTest suite
# against one model. Only cheap models enable extended_tests(); only
# resnet50 enables the save/load test.
# NOTE(review): TrivialModelTest and TestTrivialModel both target 'trivial';
# this looks like an accidental duplicate — confirm before removing either.
class TrivialModelTest(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'trivial'


class TestVgg1Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'vgg11'


class TestVgg19Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'vgg19'


class TestLenet5Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'lenet'


class TestGooglenetModel(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'googlenet'


class TestOverfeatModel(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'overfeat'


class TestAlexnetModel(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'alexnet'

  def extended_tests(self):
    return True


class TestTrivialModel(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'trivial'


class TestInceptionv3Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'inception3'

  def extended_tests(self):
    return True


class TestInceptionv4Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'inception4'


class TestResnet50Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'resnet50'

  def model_save_load_test(self):
    return True


class TestResnet101Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'resnet101'


class TestResnet152Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'resnet152'


class TestResnet50V2Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'resnet50_v2'


class TestResnet101V2Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'resnet101_v2'


class TestResnet152V2Model(TfCnnBenchmarksModelTest):

  def get_model_name(self):
    return 'resnet152_v2'
class TfCnnBenchmarksTest(tf.test.TestCase):
"""Tests that benchmark_cnn runs correctly."""
  def setUp(self):
    super(TfCnnBenchmarksTest, self).setUp()
    _check_has_gpu()
    benchmark_cnn.setup(benchmark_cnn.make_params())

  def _run_benchmark_cnn(self, params):
    """Runs BenchmarkCNN with `params` and returns the logged output lines."""
    logs = []
    # Capture everything benchmark_cnn logs so tests can inspect the output.
    benchmark_cnn.log_fn = test_util.print_and_add_to_list(logs)
    benchmark_cnn.BenchmarkCNN(params).run()
    return logs

  def _run_benchmark_cnn_with_fake_images(self, params, images, labels):
    """Runs BenchmarkCNN injecting `images`/`labels`; returns logged lines."""
    logs = []
    benchmark_cnn.log_fn = test_util.print_and_add_to_list(logs)
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.input_preprocessor = preprocessing.TestImagePreprocessor(
        params.batch_size * params.num_gpus,
        [[params.batch_size, 227, 227, 3], [params.batch_size]],
        params.num_gpus,
        bench.model.data_type)
    # The test preprocessor requires queue runners; OK to poke protected
    # state from tests.
    bench.dataset._queue_runner_required = True
    bench.input_preprocessor.set_fake_data(images, labels)
    bench.input_preprocessor.expected_subset = ('validation'
                                                if params.eval else 'train')
    bench.run()
    return logs
  def _run_benchmark_cnn_with_black_and_white_images(self, params):
    """Runs BenchmarkCNN with black and white images.

    A BenchmarkCNN is created and run with black and white images as input.
    Half the images are black (i.e., filled with 0s) and half are white
    (i.e., filled with 255s).

    Args:
      params: Params for BenchmarkCNN.

    Returns:
      A list of lines from the output of BenchmarkCNN.
    """
    # TODO(reedwm): Instead of generating images here, use black and white
    # tfrecords by calling test_util.create_black_and_white_images().
    effective_batch_size = params.batch_size * params.num_gpus
    half_batch_size = effective_batch_size // 2
    images = np.zeros((effective_batch_size, 227, 227, 3), dtype=np.float32)
    images[half_batch_size:, :, :, :] = 255
    labels = np.array([0] * half_batch_size + [1] * half_batch_size,
                      dtype=np.int32)
    return self._run_benchmark_cnn_with_fake_images(params, images, labels)

  def _train_and_eval_local(self,
                            params,
                            check_output_values=False,
                            max_final_loss=10.,
                            skip=None,
                            use_test_preprocessor=True):
    """Trains then evaluates locally via test_util.train_and_eval."""
    # TODO(reedwm): check_output_values should default to True and be enabled
    # on every test. Currently, if check_output_values=True and the calls to
    # tf.set_random_seed(...) and np.seed(...) are passed certain seed values
    # in benchmark_cnn.py, then most tests will fail. This indicates the tests
    # are brittle and could fail with small changes when
    # check_output_values=True, so check_output_values defaults to False for
    # now.

    def run_fn(run_type, inner_params):
      del run_type
      if use_test_preprocessor:
        return [
            self._run_benchmark_cnn_with_black_and_white_images(inner_params)
        ]
      else:
        return [self._run_benchmark_cnn(inner_params)]

    return test_util.train_and_eval(self, run_fn, params,
                                    check_output_values=check_output_values,
                                    max_final_loss=max_final_loss,
                                    skip=skip)
  def testAlexnet(self):
    params = test_util.get_params('testAlexnet')._replace(
        num_batches=30, init_learning_rate=0.01, model='alexnet')
    self._train_and_eval_local(params)

  def testNoPrintAccuracy(self):
    params = test_util.get_params('testNoPrintAccuracy')._replace(
        print_training_accuracy=False)
    self._train_and_eval_local(params)

  def testLowAccuracy(self):
    params = test_util.get_params('testLowAccuracy')._replace(
        print_training_accuracy=True, batch_size=5, num_batches=10)
    # We force low accuracy by having each batch containing 10 identical
    # images, each with a different label. This guarantees a top-1 accuracy
    # of exactly 0.1 and a top-5 accuracy of exactly 0.5.
    images = np.zeros((10, 227, 227, 3), dtype=np.float32)
    labels = np.arange(10, dtype=np.int32)
    logs = self._run_benchmark_cnn_with_fake_images(params, images, labels)
    training_outputs = test_util.get_training_outputs_from_logs(
        logs, params.print_training_accuracy)
    last_output = training_outputs[-1]
    # TODO(reedwm): These should be assertEqual but for some reason,
    # occasionally the accuracies are lower (Running this test 500 times,
    # these asserts failed twice). Investigate this problem.
    self.assertLessEqual(last_output.top_1_accuracy, 0.1)
    self.assertLessEqual(last_output.top_5_accuracy, 0.5)

  def testParameterServer(self):
    params = test_util.get_params('testParameterServer')
    self._train_and_eval_local(params)

  def testParameterServerStaged(self):
    params = test_util.get_params('testParameterServerStaged')._replace(
        staged_vars=True)
    self._train_and_eval_local(params)

  def testReplicated(self):
    params = test_util.get_params('testReplicated')._replace(
        variable_update='replicated')
    self._train_and_eval_local(params)

  def testIndependent(self):
    params = test_util.get_params('testIndependent')._replace(
        variable_update='independent')
    self._train_and_eval_local(params)

  def testForwardOnly(self):
    params = test_util.get_params('testForwardOnly')._replace(forward_only=True)
    # Evaluation is not supported with --forward_only, so we set skip='eval'.
    self._train_and_eval_local(params, skip='eval')

  def testForwardOnlyAndFreeze(self):
    params = test_util.get_params('testForwardOnlyAndFreeze')._replace(
        forward_only=True, freeze_when_forward_only=True, train_dir=None)
    # Training is not supported with --freeze_when_forward_only.
    self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint')

  def testNoDistortions(self):
    params = test_util.get_params('testNoDistortions')._replace(
        distortions=False)
    self._train_and_eval_local(params)

  def testCpuAsLocalParamDevice(self):
    params = test_util.get_params('testCpuAsLocalParamDevice')._replace(
        local_parameter_device='cpu')
    self._train_and_eval_local(params)

  def testNHWC(self):
    params = test_util.get_params('testNHWC')._replace(data_format='NHWC')
    self._train_and_eval_local(params)

  def testCpuAsDevice(self):
    params = test_util.get_params('testCpuAsDevice')._replace(
        device='cpu', data_format='NHWC')  # NHWC required when --device=cpu
    self._train_and_eval_local(params)

  def testMomentumParameterServer(self):
    params = test_util.get_params('testMomentumParameterServer')._replace(
        optimizer='momentum', momentum=0.8)
    self._train_and_eval_local(params)

  def testRmspropReplicated(self):
    params = test_util.get_params('testRmspropReplicated')._replace(
        variable_update='replicated',
        optimizer='rmsprop',
        rmsprop_decay=0.8,
        rmsprop_momentum=0.6,
        rmsprop_epsilon=0.7,
        init_learning_rate=0.01)
    self._train_and_eval_local(params)
  def testBatchGroupSize(self):
    params = test_util.get_params('testBatchGroupSize')._replace(
        batch_group_size=4, num_batches=100, num_warmup_batches=5)
    self._train_and_eval_local(params)

  def testGradientClip(self):
    params = test_util.get_params('testGradientClip')._replace(
        gradient_clip=100.0)
    self._train_and_eval_local(params)

  def testWeightDecay(self):
    params = test_util.get_params('testWeightDecay')._replace(
        weight_decay=0.0001)
    self._train_and_eval_local(params)

  def testNoLayers(self):
    params = test_util.get_params('testNoLayers')._replace(use_tf_layers=False)
    self._train_and_eval_local(params)

  def testSaveModelSteps(self):
    """Checks periodic checkpointing honors max_ckpts_to_keep."""
    params = test_util.get_params('testSaveModelSteps')._replace(
        save_model_steps=2, num_warmup_batches=0, num_batches=10,
        max_ckpts_to_keep=3)
    self._train_and_eval_local(params)
    for i in range(1, 20 + 1):
      # We train for 20 steps, since self._train_and_eval_local() does two
      # training runs of 10 steps each. We save a checkpoint every 2 steps and
      # keep the last 3 checkpoints, so at the end, we should have checkpoints
      # for steps 16, 18, and 20.
      matches = glob.glob(os.path.join(params.train_dir,
                                       'model.ckpt-{}.*'.format(i)))
      if i in (16, 18, 20):
        self.assertTrue(matches)
      else:
        self.assertFalse(matches)

  def testFp16WithFp32Vars(self):
    params = test_util.get_params('testFp16WithFp32Vars')._replace(
        use_fp16=True, fp16_vars=False, fp16_loss_scale=1.)
    self._train_and_eval_local(params)

  def testFp16WithFp16Vars(self):
    params = test_util.get_params('testFp16WithFp16Vars')._replace(
        use_fp16=True, fp16_vars=True)
    self._train_and_eval_local(params)

  def testXlaCompile(self):
    params = test_util.get_params('testXlaCompile')._replace(xla_compile=True)
    self._train_and_eval_local(params)

  @unittest.skip('Fails for unknown reason')
  def testXlaCompileWithFp16(self):
    params = test_util.get_params('testXlaCompileWithFp16')._replace(
        use_fp16=True, xla_compile=True)
    self._train_and_eval_local(params)

  def testGradientRepacking(self):
    params = test_util.get_params('testGradientRepacking1')._replace(
        gradient_repacking=2)
    self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint')
    params = test_util.get_params('testGradientRepacking2')._replace(
        gradient_repacking=2, use_fp16=True)
    self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint')
  def testTraceFileChromeTraceFormat(self):
    trace_file = os.path.join(self.get_temp_dir(),
                              'testTraceFileChromeTraceFormat_tracefile')
    params = test_util.get_params('testTraceFileChromeTraceFormat')._replace(
        trace_file=trace_file, use_chrome_trace_format=True)
    self._train_and_eval_local(params)
    # The trace file should exist and be non-empty.
    self.assertGreater(os.stat(trace_file).st_size, 0)

  def testTraceFileStepStatsProto(self):
    trace_file = os.path.join(self.get_temp_dir(),
                              'testTraceFileStepStatsProto_tracefile')
    params = test_util.get_params('testTraceFileStepStatsProto')._replace(
        trace_file=trace_file, use_chrome_trace_format=False)
    self._train_and_eval_local(params)
    self.assertGreater(os.stat(trace_file).st_size, 0)
    with open(trace_file) as f:
      step_stats = step_stats_pb2.StepStats()
      # The following statement should not raise an exception.
      contents = f.read()
      text_format.Merge(contents, step_stats)

  def testTfprofFile(self):
    tfprof_file = os.path.join(self.get_temp_dir(), 'testTfprofFile_tfproffile')
    params = test_util.get_params('testTfprofFile')._replace(
        tfprof_file=tfprof_file)
    self._train_and_eval_local(params, skip='eval_and_train_from_checkpoint')
    self.assertGreater(os.stat(tfprof_file).st_size, 0)
    with open(tfprof_file, 'rb') as f:
      profile_proto = tfprof_log_pb2.ProfileProto()
      # The following statement should not raise an exception.
      profile_proto.ParseFromString(f.read())

  @unittest.skip('Fails for unknown reason')
  def testMoveTrainDir(self):
    """Trains, moves train_dir, then evaluates from the moved dir."""
    params = test_util.get_params('testMoveTrainDir')
    self._train_and_eval_local(params)
    new_train_dir = params.train_dir + '_moved'
    os.rename(params.train_dir, new_train_dir)
    params = params._replace(train_dir=new_train_dir, eval=True)
    self._run_benchmark_cnn_with_black_and_white_images(params)

  @mock.patch('tensorflow.compat.v1.train.Saver')
  @mock.patch('benchmark_cnn._get_checkpoint_to_load')
  def testLoadCheckpoint(self, mock_checkpoint_to_load, mock_saver):
    """Tests load checkpoint with full path to checkpoint."""
    expected_checkpoint = '/path/to/checkpoints/model.ckpt-1243'
    mock_checkpoint_to_load.return_value = expected_checkpoint
    global_batch = benchmark_cnn.load_checkpoint(mock_saver,
                                                 None,
                                                 expected_checkpoint)
    # The global step is parsed out of the checkpoint filename suffix.
    self.assertEqual(global_batch, 1243)

  def testGetCheckpointToLoadFullPath(self):
    """Tests passing full path."""
    ckpt_path = '/foo/bar/model.ckpt-189'
    full_path = benchmark_cnn._get_checkpoint_to_load(ckpt_path)
    self.assertEqual(full_path, ckpt_path)

  def testGetCheckpointToLoadException(self):
    """Tests exception for directory without a checkpoint."""
    ckpt_path = '/foo/bar/checkpoints'
    self.assertRaises(benchmark_cnn.CheckpointNotFoundException,
                      benchmark_cnn._get_checkpoint_to_load, ckpt_path)

  @mock.patch('tensorflow.compat.v1.train.get_checkpoint_state')
  def testGetCheckpointToLoad(self, mock_checkpoint_state):
    """Tests passing path to checkpoint folder."""
    expected_checkpoint = '/path/to/checkpoints/model.ckpt-1243'
    mock_checkpoint_state.return_value = mock.Mock(
        model_checkpoint_path=expected_checkpoint)
    ckpt_path = '/path/to/checkpoints/'
    full_path = benchmark_cnn._get_checkpoint_to_load(ckpt_path)
    self.assertEqual(full_path, expected_checkpoint)
  def testImagenetPreprocessor(self):
    imagenet_dir = os.path.join(platforms_util.get_test_data_dir(),
                                'fake_tf_record_data')
    params = test_util.get_params('testImagenetPreprocessor')._replace(
        data_dir=imagenet_dir, data_name='imagenet')
    self._train_and_eval_local(params, use_test_preprocessor=False)

  def testImagenetPreprocessorNoDistortions(self):
    imagenet_dir = os.path.join(platforms_util.get_test_data_dir(),
                                'fake_tf_record_data')
    params = test_util.get_params(
        'testImagenetPreprocessorNoDistortions')._replace(
            data_dir=imagenet_dir, data_name='imagenet', distortions=False)
    self._train_and_eval_local(params, use_test_preprocessor=False)

  def testImagenetPreprocessorVerboseSummary(self):
    imagenet_dir = os.path.join(platforms_util.get_test_data_dir(),
                                'fake_tf_record_data')
    params = test_util.get_params(
        'testImagenetPreprocessorVerboseSummary')._replace(
            data_dir=imagenet_dir, data_name='imagenet', distortions=False,
            summary_verbosity=2)
    self._train_and_eval_local(params, use_test_preprocessor=False)

  def testCifar10SyntheticData(self):
    params = test_util.get_params('testCifar10SyntheticData')._replace(
        data_name='cifar10')
    self._train_and_eval_local(params)

  def testShiftRatio(self):
    """Input shift_ratio should equal task_index / num_workers."""
    test_util.monkey_patch_base_cluster_manager()
    params = benchmark_cnn.make_params(
        data_name='imagenet',
        data_dir=os.path.join(platforms_util.get_test_data_dir(),
                              'fake_tf_record_data'),
        job_name='worker',
        worker_hosts='w1,w2,w3,w4',
        ps_hosts='p1',
        task_index=0)
    self.assertEqual(
        benchmark_cnn.BenchmarkCNN(params).input_preprocessor.shift_ratio, 0.0)
    params = params._replace(task_index=3)
    self.assertEqual(
        benchmark_cnn.BenchmarkCNN(params).input_preprocessor.shift_ratio, 0.75)

  def testDistributedReplicatedSavableVars(self):
    test_util.monkey_patch_base_cluster_manager()
    params = benchmark_cnn.make_params(
        variable_update='distributed_replicated',
        model='inception4',
        data_name='imagenet',
        data_dir=os.path.join(platforms_util.get_test_data_dir(),
                              'fake_tf_record_data'),
        job_name='worker',
        worker_hosts='w1,w2,w3,w4',
        ps_hosts='p1',
        datasets_use_prefetch=False)
    bench = benchmark_cnn.BenchmarkCNN(params)
    with tf.Graph().as_default():
      bench._build_model()
      savable_vars = bench.variable_mgr.savable_variables()
      # Assert all global variables are in savable_vars
      for v in tf.global_variables():
        if not v.name.startswith(
            variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0'):
          self.assertEqual(v.name, 'global_step:0')
        name = bench.variable_mgr._strip_port(v.name)
        if name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX):
          name = name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'):]
        self.assertIn(name, savable_vars)
        self.assertIn(savable_vars[name], tf.global_variables())
      # Assert all local variables on the first tower are in savable_vars
      for v in tf.local_variables():
        if v.name.startswith('v0/'):
          name = bench.variable_mgr._strip_port(v.name)
          self.assertIn(name, savable_vars)
  def _test_preprocessing_eval(self, image_height, image_width, output_height,
                               output_width):
    """Checks eval_image resizes a constant image to a constant image."""
    image = tf.fill((image_height, image_width, 3),
                    tf.constant(128, dtype=tf.uint8))
    params = benchmark_cnn.make_params()
    new_image = preprocessing.eval_image(image, output_height, output_width, 0,
                                         'bilinear', params.summary_verbosity)
    with self.test_session() as sess:
      new_image_value = sess.run(new_image)
      self.assertAllEqual(new_image_value,
                          np.full((output_height, output_width, 3), 128,
                                  dtype=np.uint8))

  def testPreprocessingEval(self):
    self._test_preprocessing_eval(10, 10, 4, 4)
    self._test_preprocessing_eval(4, 4, 10, 10)
    self._test_preprocessing_eval(1, 100, 100, 1)
    self._test_preprocessing_eval(100, 1, 1, 100)
    self._test_preprocessing_eval(1, 100, 1, 100)

  # NOTE(review): "traing" is a typo for "train"; the name is kept to preserve
  # the existing interface.
  def _test_preprocessing_traing(self, image_buf, image_color,
                                 output_height, output_width, bbox,
                                 batch_position, resize_method, distortions,
                                 summary_verbosity, fuse_decode_and_crop):
    """Checks train_image decodes/resizes image_buf near a solid color."""
    new_image = preprocessing.train_image(
        image_buf,
        output_height,
        output_width,
        bbox,
        batch_position,
        resize_method,
        distortions,
        summary_verbosity=summary_verbosity,
        fuse_decode_and_crop=fuse_decode_and_crop)
    self.assertEqual(new_image.shape, [output_height, output_width, 3])
    with self.test_session(use_gpu=True) as sess:
      new_image_value = sess.run(new_image)
      # Loose tolerance: JPEG artifacts and resizing perturb pixel values.
      self.assertAllClose(
          new_image_value,
          np.full(
              [output_height, output_width, 3],
              image_color,
              dtype=np.float32),
          atol=50.,
          rtol=0.)

  def testPreprocessingTrain(self):
    test_data_dir = os.path.join(platforms_util.get_test_data_dir(), 'images')
    black_file = os.path.join(test_data_dir, 'black_image.jpg')
    with open(black_file, 'rb') as f:
      black_jpg_buffer = f.read()
    white_file = os.path.join(test_data_dir, 'white_image.jpg')
    with open(white_file, 'rb') as f:
      white_jpg_buffer = f.read()
    bbox = tf.zeros((1, 0, 4), dtype=tf.float32)
    batch_position = 0
    # Each size config is (output_height, output_width, resize_method)
    size_configs = [(100, 100, 'round_robin'), (150, 10, 'bilinear'),
                    (10, 150, 'nearest')]
    # Each image config is (image_buf, image_color)
    image_configs = [(white_jpg_buffer, 255), (black_jpg_buffer, 0)]
    for (image_buf, image_color) in image_configs:
      for output_height, output_width, resize_method in size_configs:
        for distortions in [True, False]:
          for summary_verbosity in [0, 2]:
            for fuse_decode_and_crop in [True, False]:
              self._test_preprocessing_traing(
                  image_buf, image_color, output_height, output_width, bbox,
                  batch_position, resize_method, distortions,
                  summary_verbosity, fuse_decode_and_crop)

  def _test_learning_rate(self, params, global_step_to_expected_learning_rate):
    """Asserts the learning-rate tensor matches each expected value."""
    self.longMessage = True  # pylint: disable=invalid-name
    bench = benchmark_cnn.BenchmarkCNN(params)
    with tf.Graph().as_default() as graph:
      bench._build_model()
      global_step = graph.get_tensor_by_name('global_step:0')
      learning_rate = graph.get_tensor_by_name('learning_rate_tensor:0')
      with self.test_session(graph=graph, use_gpu=True) as sess:
        items = global_step_to_expected_learning_rate.items()
        for global_step_val, expected_learning_rate in items:
          # Feed the global step directly so no training is needed to probe
          # the schedule.
          self.assertAlmostEqual(sess.run(learning_rate,
                                          {global_step: global_step_val}),
                                 expected_learning_rate,
                                 msg='at global_step:{}'.
                                 format(global_step_val))
def testLearningRateModelSpecificResNet(self):
params = benchmark_cnn.make_params(model='resnet50',
batch_size=256,
variable_update='parameter_server',
num_gpus=1)
self._test_learning_rate(params, {
0: 0,
150136: 0.128,
150137: 0.0128,
300273: 0.0128,
300274: 0.00128,
10000000: 0.0000128
})
def testLearningRateUserProvidedInitLr(self):
params = benchmark_cnn.make_params(model='resnet50',
batch_size=256,
variable_update='replicated',
init_learning_rate=1.)
self._test_learning_rate(params, {
0: 1.,
10000000: 1.
})
def testLearningRateUserProvidedInitLrAndWarmup(self):
params = benchmark_cnn.make_params(model='resnet50',
batch_size=256,
variable_update='replicated',
init_learning_rate=1.,
num_learning_rate_warmup_epochs=5)
self._test_learning_rate(params, {
0: 0.,
12511: 0.5,
25022: 1.,
10000000: 1.
})
def testLearningRateUserProvidedDecayInfo(self):
params = benchmark_cnn.make_params(model='resnet50',
init_learning_rate=1.,
learning_rate_decay_factor=0.5,
num_epochs_per_decay=2,
minimum_learning_rate=0.3750,
batch_size=32)
self._test_learning_rate(params, {
0: 1.,
80071: 1.,
80072: 0.5,
160143: 0.5,
160144: 0.375,
10000000: 0.375
})
def testLearningRateUserProvidedZeroDecay(self):
params = benchmark_cnn.make_params(model='resnet50',
num_learning_rate_warmup_epochs=0,
learning_rate_decay_factor=0.5,
num_epochs_per_decay=0,
minimum_learning_rate=0.3750,
batch_size=32)
with self.assertRaises(ValueError):
with tf.Graph().as_default():
# This will fail because params.learning_rate_decay_factor cannot be
# nonzero if params.num_epochs_per_decay is zero.
benchmark_cnn.BenchmarkCNN(params)._build_model()
def testLearningRateUserProvidedSchedule(self):
params = benchmark_cnn.make_params(
model='trivial',
batch_size=32,
piecewise_learning_rate_schedule='1;3;.1;5;.01')
self._test_learning_rate(params, {
0: 1.,
120108: 1.,
120109: 0.1,
200181: 0.1,
200182: 0.01,
100000000: 0.01
})
  def testNumBatchesAndEpochs(self):
    """Tests get_num_batches_and_epochs with defaults and explicit settings."""
    # Default params: _DEFAULT_NUM_BATCHES is used, and epochs are derived
    # from it (batch_size=10, 100 examples per epoch).
    params = benchmark_cnn.make_params()
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 10, 100)
    self.assertEqual(batches, benchmark_cnn._DEFAULT_NUM_BATCHES)
    self.assertAlmostEqual(epochs,
                           float(benchmark_cnn._DEFAULT_NUM_BATCHES) / 10)
    # Explicit num_batches wins: 21 batches * 25 examples / 50 per epoch
    # = 10.5 epochs.
    params = benchmark_cnn.make_params(num_batches=21)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 25, 50)
    self.assertEqual(batches, 21)
    self.assertAlmostEqual(epochs, 10.5)
    # Explicit num_epochs: 3 epochs * 3 examples / batch size 2 = 4.5 batches,
    # rounded up to 5, which corresponds to 10/3 epochs.
    params = benchmark_cnn.make_params(num_epochs=3)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3)
    self.assertEqual(batches, 5)
    self.assertAlmostEqual(epochs, 10./3.)
    # 4 epochs * 3 examples / batch size 2 = exactly 6 batches.
    params = benchmark_cnn.make_params(num_epochs=4)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3)
    self.assertEqual(batches, 6)
    self.assertAlmostEqual(epochs, 4)
    # Specifying both num_batches and num_epochs is rejected.
    with self.assertRaises(ValueError):
      params = benchmark_cnn.make_params(num_batches=100, num_epochs=100)
      benchmark_cnn.get_num_batches_and_epochs(params, 1, 1)
  def _testEvalDuringTraining(self, params, expected_num_eval_batches_found):
    """Runs train+eval with the given params and checks losses and log counts.

    Args:
      params: Params to run BenchmarkCNN with; must enable eval during
        training.
      expected_num_eval_batches_found: Expected number of per-batch eval log
        lines emitted over the whole run.
    """
    # The idea of this test is that all train images are black and all eval
    # images are white. We pass the images through the TestModel, and ensure
    # the outputs are as expected.
    batch_size = params.batch_size
    eval_batch_size = params.eval_batch_size or params.batch_size
    class TestModel(test_util.TestCNNModel):

      def __init__(self):
        super(TestModel, self).__init__()
        self.depth = 3

      def add_inference(self, cnn):
        if cnn.phase_train:
          # This will allow us to test that 100 is only added during training
          # and not during eval.
          cnn.top_layer += 100
          assert cnn.top_layer.shape[0] == batch_size
        else:
          assert cnn.top_layer.shape[0] == eval_batch_size
        # Reduce the image to a single number. The number should be (-1 + 100)
        # during training and 1 during testing.
        cnn.top_layer = tf.reshape(cnn.top_layer, (cnn.top_layer.shape[0], -1))
        cnn.top_layer = tf.reduce_mean(cnn.top_layer, axis=1)
        cnn.top_layer = tf.reshape(cnn.top_layer,
                                   (cnn.top_layer.shape[0], 1, 1, 1))
        cnn.top_size = 1
        trainable_vars = tf.trainable_variables()
        # The super method will compute image*A*B, where A=1 and B=2.
        super(TestModel, self).add_inference(cnn)
        if not cnn.phase_train:
          # Assert no new variables were added, since they should be reused from
          # training.
          assert len(trainable_vars) == len(tf.trainable_variables())
    model = TestModel()
    dataset = datasets.ImagenetDataset(params.data_dir)
    logs = []
    bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=model, dataset=dataset)
    # Capture everything benchmark_cnn logs so it can be inspected afterwards.
    with test_util.monkey_patch(benchmark_cnn,
                                log_fn=test_util.print_and_add_to_list(logs)):
      bench_cnn.run()
    training_outputs = test_util.get_training_outputs_from_logs(
        logs, print_training_accuracy=False)
    self.assertEqual(len(training_outputs), params.num_batches)
    # Train output: the (-1 + 100) computed in add_inference, times A*B = 1*2.
    expected_training_output = (-1 + 100) * 1 * 2
    for training_output in training_outputs:
      self.assertEqual(training_output.loss, expected_training_output)
    eval_outputs = test_util.get_evaluation_outputs_from_logs(logs)
    self.assertTrue(eval_outputs)
    # Eval output: 1 (no +100 in eval mode) times A*B = 1*2.
    expected_eval_output = 1 * 1 * 2
    for eval_output in eval_outputs:
      self.assertEqual(eval_output.top_1_accuracy, expected_eval_output)
      self.assertEqual(eval_output.top_5_accuracy, expected_eval_output)
    # Count per-batch eval log lines of the form "<step>\t<rate> examples/sec".
    num_eval_batches_found = 0
    eval_batch_regex = re.compile(r'^\d+\t[0-9.]+ examples/sec$')
    for log in logs:
      if eval_batch_regex.match(log):
        num_eval_batches_found += 1
    self.assertEqual(num_eval_batches_found, expected_num_eval_batches_found)
  def testEvalDuringTraining(self):
    """Tests the various --eval_during_training_* flags end to end."""
    data_dir = test_util.create_black_and_white_images()
    base_params = test_util.get_params('testEvalDuringTraining')
    train_dir = base_params.train_dir
    base_params = base_params._replace(
        train_dir=None, print_training_accuracy=False, num_warmup_batches=0,
        num_batches=7, num_eval_batches=2, display_every=1,
        init_learning_rate=0, weight_decay=0,
        distortions=False, data_dir=data_dir)
    # Evaluating every 2 steps gives (num_batches // 2 + 1) evaluations of
    # num_eval_batches batches each.
    expected_num_eval_batches_found = (
        base_params.num_eval_batches * (base_params.num_batches // 2 + 1))
    # Test --eval_during_training_every_n_steps
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='parameter_server'),
        expected_num_eval_batches_found)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated'),
        expected_num_eval_batches_found)
    # Also exercise summaries without dataset prefetching.
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated',
                             summary_verbosity=2,
                             save_summaries_steps=2,
                             datasets_use_prefetch=False),
        expected_num_eval_batches_found)
    # Also exercise fp16, checkpointing, and a different eval batch size.
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated',
                             use_fp16=True, train_dir=train_dir,
                             eval_batch_size=base_params.batch_size + 2),
        expected_num_eval_batches_found)
    # Test --eval_during_training_every_n_epochs
    # An epoch interval equivalent to 2 training steps.
    every_n_epochs = (2 * base_params.batch_size * base_params.num_gpus /
                      datasets.IMAGENET_NUM_TRAIN_IMAGES)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_epochs=every_n_epochs,
                             variable_update='replicated'),
        expected_num_eval_batches_found)
    # Test --eval_during_training_at_specified_steps
    # Steps at or past num_batches are not reached; the "1 +" accounts for one
    # additional evaluation (per the expected-count formula below).
    list_steps = [2, 3, 5, 7, 1000]
    num_eval_steps = 1 + sum(1 for step in list_steps
                             if step < base_params.num_batches)
    expected_num_eval_batches_found = (
        base_params.num_eval_batches * num_eval_steps)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_at_specified_steps=list_steps,
                             variable_update='replicated'),
        expected_num_eval_batches_found)
    # Test --eval_during_training_at_specified_epochs
    # The same steps expressed as fractional epochs.
    list_epochs = [(step * base_params.batch_size * base_params.num_gpus /
                    datasets.IMAGENET_NUM_TRAIN_IMAGES)
                   for step in list_steps]
    self._testEvalDuringTraining(
        base_params._replace(
            eval_during_training_at_specified_epochs=list_epochs,
            variable_update='replicated'),
        expected_num_eval_batches_found)
    # Test --eval_during_training_every_n_steps runs with synthetic data.
    params = base_params._replace(
        variable_update='replicated', data_dir=None,
        eval_during_training_every_n_steps=2, num_batches=2)
    benchmark_cnn.BenchmarkCNN(params).run()
def testEvalDuringTrainingNumEpochs(self):
params = benchmark_cnn.make_params(
batch_size=1, eval_batch_size=2, eval_during_training_every_n_steps=1,
num_batches=30, num_eval_epochs=100 / datasets.IMAGENET_NUM_VAL_IMAGES)
bench_cnn = benchmark_cnn.BenchmarkCNN(params)
self.assertEqual(bench_cnn.num_batches, 30)
self.assertAlmostEqual(bench_cnn.num_epochs,
30 / datasets.IMAGENET_NUM_TRAIN_IMAGES)
self.assertAlmostEqual(bench_cnn.num_eval_batches, 50)
self.assertAlmostEqual(bench_cnn.num_eval_epochs,
100 / datasets.IMAGENET_NUM_VAL_IMAGES)
def testEarlyStopping(self):
params = benchmark_cnn.make_params(
batch_size=2,
display_every=1,
num_batches=100,
eval_during_training_every_n_steps=2,
stop_at_top_1_accuracy=0.4,
)
with mock.patch.object(benchmark_cnn.BenchmarkCNN, '_eval_once',
side_effect=[(0.1, 0.1), (0.5, 0.5), (0.2, 0.2)]
) as mock_eval_once:
logs = []
bench_cnn = benchmark_cnn.BenchmarkCNN(params)
with test_util.monkey_patch(benchmark_cnn,
log_fn=test_util.print_and_add_to_list(logs)):
bench_cnn.run()
training_outputs = test_util.get_training_outputs_from_logs(
logs, print_training_accuracy=False)
# We should stop after the second evaluation, and we evaluate every 2
# steps. So there should be 2 * 2 = 4 training outputs.
self.assertEqual(len(training_outputs), 4)
self.assertEqual(mock_eval_once.call_count, 2)
def testOutOfRangeErrorsAreNotIgnored(self):
error_msg = 'Fake OutOfRangeError error message'
with mock.patch.object(benchmark_cnn.BenchmarkCNN, 'benchmark_with_session',
side_effect=tf.errors.OutOfRangeError(None, None,
error_msg)):
with self.assertRaisesRegex(RuntimeError, error_msg):
benchmark_cnn.BenchmarkCNN(benchmark_cnn.make_params()).run()
def testInvalidFlags(self):
params = benchmark_cnn.make_params(device='cpu', data_format='NCHW')
with self.assertRaises(ValueError):
benchmark_cnn.BenchmarkCNN(params)
params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
variable_update='replicated',
all_reduce_spec='nccl')
with self.assertRaises(ValueError):
benchmark_cnn.BenchmarkCNN(params)
# Automatic loss scaling is only supported for 'replicated', 'ps',
# and 'independent' variable_updates.
invalid_variable_updates = [
'distributed_replicated', 'distributed_all_reduce'
]
for variable_update in invalid_variable_updates:
params = benchmark_cnn.make_params(
use_fp16=True,
fp16_vars=True,
fp16_enable_auto_loss_scale=True,
variable_update=variable_update)
with self.assertRaises(ValueError):
benchmark_cnn.BenchmarkCNN(params)
# Automatic loss scaling is not supported for 'nccl'.
params = benchmark_cnn.make_params(
use_fp16=True,
fp16_vars=True,
fp16_enable_auto_loss_scale=True,
all_reduce_spec='nccl')
with self.assertRaises(ValueError):
benchmark_cnn.BenchmarkCNN(params)
# Automatic loss scaling is not supported for 'staged_vars'.
params = benchmark_cnn.make_params(
use_fp16=True,
fp16_vars=True,
fp16_enable_auto_loss_scale=True,
staged_vars=True)
with self.assertRaises(ValueError):
benchmark_cnn.BenchmarkCNN(params)
def testMakeParams(self):
default_params = benchmark_cnn.make_params()
self.assertEqual(default_params.model,
flags.param_specs['model'].default_value)
params = benchmark_cnn.make_params(model='foo')
self.assertEqual(params.model, 'foo')
with self.assertRaises(ValueError):
benchmark_cnn.make_params(job_name='foo')
with self.assertRaises(ValueError):
benchmark_cnn.make_params(gpu_memory_frac_for_testing=-1.)
class VariableUpdateTest(tf.test.TestCase):
  """Tests that variables are updated correctly.

  These tests use a very simple deterministic model. For example, some tests
  use the model

    loss = image * A * B

  where image is a 1x1 images (with a single scalar value), and A and B are
  scalar variables. Tests will run tf_cnn_benchmarks with such a model, on a
  sequence of scalar images, and assert that the losses are the correct value.
  Since the losses depend on the variables, this indirectly tests variables are
  updated correctly.
  """

  def setUp(self):
    super(VariableUpdateTest, self).setUp()
    _check_has_gpu()
    benchmark_cnn.setup(benchmark_cnn.make_params())

  def _get_benchmark_cnn_losses(self, inputs, params):
    """Returns the losses of BenchmarkCNN on the given inputs and params."""
    logs = []
    model = test_util.TestCNNModel()
    # Show extra loss digits so losses can be compared precisely against the
    # manually computed values.
    with test_util.monkey_patch(benchmark_cnn,
                                log_fn=test_util.print_and_add_to_list(logs),
                                LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
      bench = benchmark_cnn.BenchmarkCNN(
          params, dataset=test_util.TestDataSet(), model=model)
      # The test model does not use labels when computing loss, so the label
      # values do not matter as long as it's the right shape.
      labels = np.array([1] * inputs.shape[0])
      bench.input_preprocessor.set_fake_data(inputs, labels)
      if bench.eval_input_preprocessor:
        bench.eval_input_preprocessor.set_fake_data(inputs, labels)
      bench.run()
    outputs = test_util.get_training_outputs_from_logs(
        logs, params.print_training_accuracy)
    return [x.loss for x in outputs]

  def _test_variable_update(self, params):
    """Tests variables are updated correctly when the given params are used.

    A BenchmarkCNN is created with a TestCNNModel, and is run with some scalar
    images. The losses are then compared with the losses obtained with
    TestCNNModel().manually_compute_losses()

    Args:
      params: a Params tuple used to create BenchmarkCNN.
    """
    inputs = test_util.get_fake_var_update_inputs()
    actual_losses = self._get_benchmark_cnn_losses(inputs, params)
    expected_losses, = test_util.TestCNNModel().manually_compute_losses(
        inputs, 1, params)
    # fp16 arithmetic is less precise, so use a looser tolerance.
    rtol = 3e-2 if params.use_fp16 else 1e-5
    self.assertAllClose(actual_losses[:len(expected_losses)], expected_losses,
                        rtol=rtol, atol=0.)

  def _test_variable_updates(self, params,
                             var_updates=('parameter_server', 'replicated')):
    """Runs _test_variable_update once per variable_update mode."""
    for var_update in var_updates:
      self._test_variable_update(params._replace(variable_update=var_update))

  def testDefault(self):
    """Tests the default variable-update params."""
    params = test_util.get_var_update_params()
    self._test_variable_updates(params)

  # For some reason, this test doesn't always pass
  # def testCpuAsDevice(self):
  #   params = test_util.get_var_update_params()._replace(
  #       device='cpu',
  #       data_format='NHWC') # NHWC required when --device=cpu
  #   self._test_variable_updates(params)

  def testCpuAsLocalParamDevice(self):
    """Tests with the CPU as the local parameter device."""
    params = test_util.get_var_update_params()._replace(
        local_parameter_device='cpu')
    self._test_variable_updates(params)

  def testFp16(self):
    """Tests with fp16 enabled."""
    params = test_util.get_var_update_params()._replace(use_fp16=True)
    self._test_variable_updates(params)

  def testMomentum(self):
    """Tests with the momentum optimizer."""
    params = test_util.get_var_update_params()._replace(optimizer='momentum')
    self._test_variable_updates(params)

  def testRmsprop(self):
    """Tests with the rmsprop optimizer."""
    params = test_util.get_var_update_params()._replace(optimizer='rmsprop')
    self._test_variable_updates(params)

  def testNoLayers(self):
    """Tests with use_tf_layers disabled."""
    params = test_util.get_var_update_params()._replace(use_tf_layers=False)
    self._test_variable_updates(params)

  def testVariousAllReduceSpecs(self):
    """Tests with the pscpu and psgpu all-reduce specs."""
    # We do not test xring, because it requires all Variables to have at least
    # two elements.
    params = test_util.get_var_update_params()._replace(all_reduce_spec='pscpu')
    self._test_variable_updates(params, var_updates=('replicated',))
    params = params._replace(all_reduce_spec='psgpu')
    self._test_variable_updates(params, var_updates=('replicated',))
    # TODO(b/80125832): Enable nccl in tests
    # params = params._replace(all_reduce_spec='nccl',
    #                          compact_gradient_transfer=False)
    # self._test_variable_updates(params, var_updates=('replicated',))

  def testPrintBaseLoss(self):
    """Tests that reporting the base loss does not affect the updates."""
    params = test_util.get_var_update_params()._replace(
        loss_type_to_report='base_loss')
    self._test_variable_updates(params)

  def testSingleL2LossOp(self):
    """Tests with a single L2 loss op."""
    params = test_util.get_var_update_params()._replace(
        single_l2_loss_op=True)
    self._test_variable_updates(params)

  def testResourceVars(self):
    """Tests with resource variables."""
    params = test_util.get_var_update_params()._replace(
        use_resource_vars=True)
    self._test_variable_updates(params)

  def testEvalDuringTrainingEveryNSteps(self):
    """Tests training correctness when also evaluating every step."""
    # TODO(reedwm): Test that the eval results are correct. This only tests that
    # training results are correct.
    params = test_util.get_var_update_params()._replace(
        eval_during_training_every_n_steps=1)
    self._test_variable_updates(params, var_updates=('replicated',))
class VariableMgrLocalReplicatedTest(tf.test.TestCase):
  """Tests gradient aggregation in the replicated variable manager."""

  def _test_grad_aggregation_with_var_mgr(self, variable_mgr, num_towers,
                                          num_vars, deferred_grads):
    """Checks the variable manager sums per-tower gradients correctly.

    Args:
      variable_mgr: The variable manager under test.
      num_towers: Number of simulated towers (one per GPU device).
      num_vars: Number of (gradient, variable) pairs per tower.
      deferred_grads: Whether relaxed consistency is in use, in which case
        each session run returns the sums from the previous run.
    """
    tower_devices = ['/gpu:%d' % i for i in range(num_towers)]
    tower_grads = []
    expected_sums = [0.] * num_vars
    for i, tower_device in enumerate(tower_devices):
      with tf.device(tower_device):
        grad_vars = []
        for j in range(num_vars):
          # Give each (tower, var) pair a distinct value so an aggregation
          # mistake changes the expected sums.
          n = num_towers * i + j
          grad_vars.append((tf.constant(n, dtype=tf.float32),
                            tf.Variable(n, dtype=tf.float32)))
          expected_sums[j] += n
      tower_grads.append(grad_vars)
    _, agg_device_grads = variable_mgr.preprocess_device_grads(
        tower_grads)
    # Build the expected aggregated structure, mirroring the list-vs-tuple
    # nesting that the variable manager produced.
    expected_device_grads = []
    for i in range(num_towers):
      expected_grad_vars = []
      for j in range(num_vars):
        expected_grad_and_var = [expected_sums[j], num_towers * i + j]
        if isinstance(agg_device_grads[i][j], tuple):
          # agg_device_grads[i][j] can be a list or tuple.
          expected_grad_and_var = tuple(expected_grad_and_var)
        expected_grad_vars.append(expected_grad_and_var)
      if isinstance(agg_device_grads[i], tuple):
        # agg_device_grads[i] can be a list or tuple.
        expected_grad_vars = tuple(expected_grad_vars)
      expected_device_grads.append(expected_grad_vars)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
      # tf.initialize_all_variables() has been deprecated for years;
      # tf.global_variables_initializer() is the equivalent replacement.
      sess.run(tf.global_variables_initializer())
      sess.run(variable_mgr._warmup_ops)
      if deferred_grads:
        # With deferred grads, the result of a session run is always the summed
        # gradients from the previous session run.
        sess.run(agg_device_grads)
        feed_dict = {g: 0 for grad_vars in tower_grads for g, _ in grad_vars}
        agg_device_grads_ = sess.run(agg_device_grads, feed_dict)
      else:
        agg_device_grads_ = sess.run(agg_device_grads)
    self.assertEqual(agg_device_grads_, expected_device_grads)

  def _test_grad_aggregation(self, params, num_vars):
    """Creates a BenchmarkCNN for params and tests its variable manager."""
    bench = benchmark_cnn.BenchmarkCNN(params)
    deferred_grads = (params.variable_consistency == 'relaxed')
    self._test_grad_aggregation_with_var_mgr(bench.variable_mgr, bench.num_gpus,
                                             num_vars, deferred_grads)

  def test_grad_aggregation(self):
    """Tests aggregation across many replicated-variable configurations."""
    base_params = benchmark_cnn.make_params(num_gpus=10,
                                            variable_update='replicated',
                                            use_fp16=True)
    params = base_params
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(variable_consistency='relaxed')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(variable_consistency='relaxed',
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(num_gpus=8, hierarchical_copy=True)
    self._test_grad_aggregation(params, 10)
    # TODO(b/80125832): Enable nccl in tests
    # params = base_params._replace(all_reduce_spec='nccl',
    #                               compact_gradient_transfer=False,
    #                               # For some reason, this test freezes when
    #                               # num_gpus=10
    #                               num_gpus=8)
    # self._test_grad_aggregation(params, 10)
    params = base_params._replace(all_reduce_spec='pscpu')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(num_gpus=8,
                                  gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  hierarchical_copy=True)
    self._test_grad_aggregation(params, 10)
    # TODO(b/80125832): Enable nccl in tests
    # params = base_params._replace(num_gpus=8,
    #                               gradient_repacking=3,
    #                               variable_consistency='relaxed',
    #                               all_reduce_spec='nccl',
    #                               compact_gradient_transfer=False)
    # self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  all_reduce_spec='pscpu')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  all_reduce_spec='xring')
    self._test_grad_aggregation(params, 10)
if __name__ == '__main__':
  # These tests use TF1-style graph-mode APIs, so disable TF2 behavior.
  tf.disable_v2_behavior()
  tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for CNN benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import threading
import numpy as np
import tensorflow.compat.v1 as tf
def tensorflow_version_tuple(version=None):
  """Returns a TensorFlow version as a (major, minor, patch) tuple.

  Args:
    version: Optional version string to parse. Defaults to tf.__version__,
      preserving the original no-argument behavior.

  Returns:
    Tuple of (int major, int minor, str patch). The patch component is kept
    as a string and retains any suffix (e.g. '0-rc1' or '0.dev20200101').
  """
  v = version if version is not None else tf.__version__
  # Split at most twice so versions with extra dot-separated components
  # (e.g. '2.1.0.dev20200101') parse instead of raising ValueError.
  major, minor, patch = v.split('.', 2)
  return (int(major), int(minor), patch)
def tensorflow_version():
  """Returns the TF version as a single int, e.g. 1012 for version 1.12.x."""
  major, minor, _ = tensorflow_version_tuple()
  return major * 1000 + minor
def log_fn(log):
  """Writes a log line to stdout, flushing so output interleaves correctly."""
  sys.stdout.write('%s\n' % (log,))
  sys.stdout.flush()
def roll_numpy_batches(array, batch_size, shift_ratio):
  """Moves a proportion of batches from start to the end of the array.

  This function moves a proportion of batches, specified by `shift_ratio`, from
  the starts of the array to the end. The number of batches moved is rounded
  down to the nearest integer. For example,

  ```
  roll_numpy_batches([1, 2, 3, 4, 5, 6], 2, 0.34) == [3, 4, 5, 6, 1, 2]
  ```

  Args:
    array: A Numpy array whose first dimension is the batch dimension.
    batch_size: The batch size.
    shift_ratio: Proportion of batches to move from the start of the array to
      the end of the array.

  Returns:
    A new Numpy array, with a proportion of the batches at the start of `array`
    moved to the end.

  Raises:
    ValueError: If the array's first dimension is not a multiple of
      batch_size.
  """
  num_items = array.shape[0]
  # Validate explicitly instead of with `assert`, which is stripped when
  # Python runs with -O.
  if num_items % batch_size != 0:
    raise ValueError(
        'array size (%d) must be a multiple of batch_size (%d)' %
        (num_items, batch_size))
  num_batches = num_items // batch_size
  starting_batch = int(num_batches * shift_ratio)
  starting_item = starting_batch * batch_size
  return np.roll(array, -starting_item, axis=0)
# For Python 2.7 compatibility, we do not use threading.Barrier.
class Barrier(object):
  """Implements a lightweight Barrier.

  Useful for synchronizing a fixed number of threads at known synchronization
  points. Threads block on 'wait()' and simultaneously return once they have
  all made that call.

  # Implementation adapted from boost/thread/barrier.hpp
  """

  def __init__(self, parties):
    """Create a barrier, initialised to 'parties' threads."""
    self.cond = threading.Condition(threading.Lock())
    self.parties = parties
    # Indicates the number of waiting parties.
    self.waiting = 0
    # generation is needed to deal with spurious wakeups. If self.cond.wait()
    # wakes up for other reasons, generation will force it go back to wait().
    self.generation = 0
    # Once broken (via abort()), wait() returns immediately forever after.
    self.broken = False

  def wait(self):
    """Wait for the barrier."""
    with self.cond:
      # Check if the barrier has been disabled or not.
      if self.broken:
        return
      gen = self.generation
      self.waiting += 1
      if self.waiting == self.parties:
        # Last thread to arrive: reset the count, advance the generation, and
        # release everyone blocked below.
        self.waiting = 0
        self.generation += 1
        self.cond.notify_all()
      # loop because of spurious wakeups
      while gen == self.generation:
        self.cond.wait()

  # TODO(huangyp): Remove this method once we find a way to know which step
  # is the last barrier.
  def abort(self):
    """Clear existing barrier and disable this barrier."""
    with self.cond:
      if self.waiting > 0:
        # Release any threads currently blocked in wait().
        self.generation += 1
        self.cond.notify_all()
      self.broken = True
class ImageProducer(object):
  """An image producer that puts images into a staging area periodically.

  This class is useful for periodically running a set of ops, `put_ops` on a
  different thread every `batch_group_size` steps.

  The notify_image_consumption() method is used to increment an internal counter
  so that every `batch_group_size` times it is called, `put_ops` is executed. A
  barrier is placed so that notify_image_consumption() will block until
  the previous call to `put_ops` has been executed.

  The start() method is used to start the thread that runs `put_ops`.

  The done() method waits until the last put_ops is executed and stops the
  thread.

  The purpose of this class is to fill an image input pipeline every
  `batch_group_size` steps. Suppose `put_ops` supplies `batch_group_size` images
  to the input pipeline when run, and that every step, 1 batch of images is
  consumed. Then, by calling notify_image_consumption() every step, images are
  supplied to the input pipeline at the same amount they are consumed.

  Example usage:
  ```
  put_ops = ... # Enqueues `batch_group_size` batches to a StagingArea
  get_op = ...  # Dequeues 1 batch, and does some operations on it
  batch_group_size = 4
  with tf.Session() as sess:
    image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size)
    image_producer.start()
    for _ in range(100):
      sess.run(get_op)
      image_producer.notify_image_consumption()
  ```
  """

  def __init__(self, sess, put_ops, batch_group_size, use_python32_barrier):
    """Initializes the producer.

    Args:
      sess: Session used to run put_ops.
      put_ops: Op (or ops) that refills the staging area when run.
      batch_group_size: Number of consumptions between runs of put_ops.
      use_python32_barrier: If True and running on Python >= 3.2, use
        threading.Barrier instead of the local Barrier class.
    """
    self.sess = sess
    self.num_gets = 0
    self.put_ops = put_ops
    self.batch_group_size = batch_group_size
    self.done_event = threading.Event()
    if (use_python32_barrier and
        sys.version_info[0] == 3 and sys.version_info[1] >= 2):
      self.put_barrier = threading.Barrier(2)
    else:
      self.put_barrier = Barrier(2)

  def _should_put(self):
    # A put is due once every batch_group_size consumptions.
    return (self.num_gets + 1) % self.batch_group_size == 0

  def done(self):
    """Stop the image producer."""
    self.done_event.set()
    # Abort the barrier so the producer thread is not left blocked in wait().
    self.put_barrier.abort()
    self.thread.join()

  def start(self):
    """Start the image producer."""
    # Prime the staging area once before the producer thread takes over.
    self.sess.run([self.put_ops])
    self.thread = threading.Thread(target=self._loop_producer)
    # Set daemon to true to allow Ctrl + C to terminate all threads.
    self.thread.daemon = True
    self.thread.start()

  def notify_image_consumption(self):
    """Increment the counter of image_producer by 1.

    This should only be called by the main thread that consumes images and runs
    the model computation. One batch of images should be consumed between
    calling start() and the first call to this method. Then, one batch of images
    should be consumed between any two successive calls to this method.
    """
    if self._should_put():
      self.put_barrier.wait()
    self.num_gets += 1

  def _loop_producer(self):
    # Producer thread: refill the staging area, then rendezvous with the
    # consumer at the barrier. (Uses is_set(); isSet() is a deprecated alias.)
    while not self.done_event.is_set():
      self.sess.run([self.put_ops])
      self.put_barrier.wait()
class BaseClusterManager(object):
  """The manager for the cluster of servers running the benchmark."""

  def __init__(self, params):
    """Builds a ClusterSpec from comma-separated worker/ps host lists."""
    cluster = {'worker': params.worker_hosts.split(',')}
    if params.ps_hosts:
      cluster['ps'] = params.ps_hosts.split(',')
    self._cluster_spec = tf.train.ClusterSpec(cluster)

  def get_target(self):
    """Returns a target to be passed to tf.Session()."""
    raise NotImplementedError('get_target must be implemented by subclass')

  def join_server(self):
    """Joins the server; subclasses must implement."""
    raise NotImplementedError('join must be implemented by subclass')

  def get_cluster_spec(self):
    """Returns the cluster's tf.train.ClusterSpec."""
    return self._cluster_spec

  def num_workers(self):
    """Returns the number of worker tasks in the cluster."""
    return len(self._cluster_spec.job_tasks('worker'))

  def num_ps(self):
    """Returns the number of parameter-server tasks, or 0 if there are none."""
    if 'ps' not in self._cluster_spec.jobs:
      return 0
    return len(self._cluster_spec.job_tasks('ps'))
class GrpcClusterManager(BaseClusterManager):
  """A cluster manager for a cluster networked with gRPC."""

  def __init__(self, params, config_proto):
    """Creates the in-process server, or the controller's worker target."""
    super(GrpcClusterManager, self).__init__(params)
    if params.job_name == 'controller':
      # The controller runs no server of its own; it targets the first worker.
      self._target = 'grpc://%s' % self._cluster_spec.job_tasks('worker')[0]
      return
    self._server = tf.train.Server(self._cluster_spec,
                                   job_name=params.job_name,
                                   task_index=params.task_index,
                                   config=config_proto,
                                   protocol=params.server_protocol)
    self._target = self._server.target

  def get_target(self):
    """Returns the gRPC target string to pass to tf.Session()."""
    return self._target

  def join_server(self):
    """Blocks on the in-process server."""
    return self._server.join()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_cnn_benchmarks.cnn_util."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import threading
import time
import tensorflow.compat.v1 as tf
import cnn_util
class CnnUtilBarrierTest(tf.test.TestCase):
  """Tests for cnn_util.Barrier."""

  def testBarrier(self):
    """Every thread must pass wait() round i before any enters round i+1."""
    num_tasks = 20
    num_waits = 4
    barrier = cnn_util.Barrier(num_tasks)
    threads = []
    sync_matrix = []
    for _ in range(num_tasks):
      sync_times = [0] * num_waits
      worker = threading.Thread(
          target=self._run_task, args=(barrier, sync_times))
      worker.start()
      threads.append(worker)
      sync_matrix.append(sync_times)
    for worker in threads:
      worker.join()
    for wait_index in range(num_waits - 1):
      # Max of times at iteration i < min of times at iteration i + 1.
      latest_this_round = max(
          sync_matrix[i][wait_index] for i in range(num_tasks))
      earliest_next_round = min(
          sync_matrix[i][wait_index + 1] for i in range(num_tasks))
      self.assertLessEqual(latest_this_round, earliest_next_round)

  def _run_task(self, barrier, sync_times):
    # Record the entry time of each wait round, then block on the barrier.
    for wait_index in range(len(sync_times)):
      sync_times[wait_index] = time.time()
      barrier.wait()

  def testBarrierAbort(self):
    """abort() releases waiters, so joining the thread must not hang."""
    num_tasks = 2
    sync_times = [0]
    barrier = cnn_util.Barrier(num_tasks)
    waiter = threading.Thread(
        target=self._run_task, args=(barrier, sync_times))
    waiter.start()
    barrier.abort()
    # The thread won't be blocked by the aborted barrier.
    waiter.join()
class ImageProducerTest(tf.test.TestCase):
  """Tests for cnn_util.ImageProducer."""

  def _slow_tensorflow_op(self):
    """Returns a TensorFlow op that takes approximately 0.1s to complete."""
    def slow_func(v):
      time.sleep(0.1)
      return v
    return tf.py_func(slow_func, [tf.constant(0.)], tf.float32).op

  def _test_image_producer(self, batch_group_size, put_slower_than_get):
    """Checks the producer keeps the simulated staging level within bounds.

    Args:
      batch_group_size: Number of batches the producer adds per put.
      put_slower_than_get: If True the put op is artificially slow; otherwise
        the get op is slow. The bounds must hold in both directions.
    """
    # We use the variable x to simulate a staging area of images. x represents
    # the number of batches in the staging area.
    x = tf.Variable(0, dtype=tf.int32)
    if put_slower_than_get:
      put_dep = self._slow_tensorflow_op()
      get_dep = tf.no_op()
    else:
      put_dep = tf.no_op()
      get_dep = self._slow_tensorflow_op()
    with tf.control_dependencies([put_dep]):
      put_op = x.assign_add(batch_group_size, use_locking=True)
    with tf.control_dependencies([get_dep]):
      get_op = x.assign_sub(1, use_locking=True)
    with self.test_session() as sess:
      sess.run(tf.variables_initializer([x]))
      image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size,
                                              use_python32_barrier=False)
      image_producer.start()
      for _ in range(5 * batch_group_size):
        sess.run(get_op)
        # We assert x is nonnegative, to ensure image_producer never causes
        # an unstage op to block. We assert x is at most 2 * batch_group_size,
        # to ensure it doesn't use too much memory by storing too many batches
        # in the staging area.
        self.assertGreaterEqual(sess.run(x), 0)
        self.assertLessEqual(sess.run(x), 2 * batch_group_size)
        image_producer.notify_image_consumption()
      self.assertGreaterEqual(sess.run(x), 0)
      self.assertLessEqual(sess.run(x), 2 * batch_group_size)
      image_producer.done()
      time.sleep(0.1)
      # The bounds must also hold after the producer has been stopped.
      self.assertGreaterEqual(sess.run(x), 0)
      self.assertLessEqual(sess.run(x), 2 * batch_group_size)

  def test_image_producer(self):
    """Runs the producer check for several group sizes, in both orderings."""
    self._test_image_producer(1, False)
    self._test_image_producer(1, True)
    self._test_image_producer(2, False)
    self._test_image_producer(2, True)
    self._test_image_producer(3, False)
    self._test_image_producer(3, True)
    self._test_image_producer(8, False)
    self._test_image_producer(8, True)
if __name__ == '__main__':
  # These tests target TF1 graph-mode APIs (tf.py_func, variable init, etc.),
  # so V2 behavior is explicitly disabled before running them.
  tf.disable_v2_behavior()
  tf.test.main()
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""COCO-style evaluation metrics.
Forked from reference model implementation.
COCO API: github.com/cocodataset/cocoapi/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import tempfile
from absl import flags
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import six
import tensorflow.compat.v1 as tf
import mlperf
import ssd_constants
FLAGS = flags.FLAGS
# pycocotools references the Python 2 `unicode` builtin, which does not exist
# on Python 3; alias it to `str` so the library keeps working.
# https://github.com/cocodataset/cocoapi/issues/49
if six.PY3:
  import pycocotools.coco
  pycocotools.coco.unicode = str
def async_eval_runner(queue_predictions, queue_results, val_json_file):
  """Consumes prediction batches from a queue and publishes COCO metrics.

  Loops until the sentinel message 'STOP' (the poison pill) is received.
  Every other message must be a (step, predictions) tuple; the computed
  metrics are enqueued on `queue_results` as (step, results).
  """
  # iter(callable, sentinel) calls get() repeatedly and stops when the
  # returned value equals 'STOP'.
  for message in iter(queue_predictions.get, 'STOP'):
    step, predictions = message
    queue_results.put((step, compute_map(predictions, val_json_file)))
def compute_map(predictions, val_json_file):
  """Use model predictions to compute mAP.

  Args:
    predictions: a list of tuples returned by decoded_predictions function,
      each containing the following elements:
      image source_id, box coordinates in XYWH order, probability score, label
    val_json_file: path to COCO annotation file
  Returns:
    A dictionary that maps all COCO metrics (keys) to their values
  """
  if not val_json_file.startswith("gs://"):
    annotation_path = val_json_file
  else:
    # Stage the GCS annotation file on local disk, since pycocotools needs a
    # real filesystem path; clean the temp copy up at interpreter exit.
    _, annotation_path = tempfile.mkstemp(suffix=".json")
    tf.gfile.Remove(annotation_path)
    tf.gfile.Copy(val_json_file, annotation_path)
    atexit.register(tf.gfile.Remove, annotation_path)
  ground_truth = COCO(annotation_path)
  detections = ground_truth.loadRes(np.array(predictions))
  evaluator = COCOeval(ground_truth, detections, iouType='bbox')
  evaluator.evaluate()
  evaluator.accumulate()
  evaluator.summarize()
  print("Current AP: {:.5f}".format(evaluator.stats[0]))
  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
  # Prefix with "COCO" to group in TensorBoard.
  return {"COCO/" + key: value
          for key, value in zip(metric_names, evaluator.stats)}
def calc_iou(target, candidates):
  """Computes IoU between one target box and each candidate box.

  Boxes are in corner format [left, top, right, bottom] (same convention for
  target and candidates).

  Args:
    target: 1-D numpy array with the 4 coordinates of one box.
    candidates: 2-D numpy array of shape (num_candidates, 4).

  Returns:
    1-D numpy array of length num_candidates with IoU values in [0, 1].
  """
  target_tiled = np.tile(target[np.newaxis, :], (candidates.shape[0], 1))
  # Left Top & Right Bottom of the intersection rectangle.
  lt = np.maximum(target_tiled[:, :2], candidates[:, :2])
  rb = np.minimum(target_tiled[:, 2:], candidates[:, 2:])
  delta = np.maximum(rb - lt, 0)
  intersect = delta[:, 0] * delta[:, 1]
  # Bug fix: each box's area must be its own right-bottom minus its own
  # left-top. The previous code computed both areas as
  # target_tiled[:,2:] - candidates[:,:2], mixing corners of different boxes
  # and producing wrong IoU values (e.g. 1.0 for partially-overlapping boxes).
  delta1 = target_tiled[:, 2:] - target_tiled[:, :2]
  area1 = delta1[:, 0] * delta1[:, 1]
  delta2 = candidates[:, 2:] - candidates[:, :2]
  area2 = delta2[:, 0] * delta2[:, 1]
  iou = intersect / (area1 + area2 - intersect)
  return iou
# TODO(haoyuzhang): Rewrite this NumPy based implementation to TensorFlow based
# implementation under ssd_model.py accuracy_function.
def decode_predictions(labels_and_predictions):
  """Decode predictions and remove unused boxes and labels.

  Converts each example's raw boxes/scores into a flat list of COCO-style
  detections: [source_id, x, y, w, h, score, class], in absolute pixels.
  """
  decoded = []
  for example in labels_and_predictions:
    image_id = int(example[ssd_constants.SOURCE_ID])
    boxes = example[ssd_constants.PRED_BOXES]
    scores = example[ssd_constants.PRED_SCORES]
    locs, labels, probs = decode_single(
        boxes, scores, ssd_constants.OVERLAP_CRITERIA,
        ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)
    raw_height, raw_width, _ = example[ssd_constants.RAW_SHAPE]
    for loc, label, prob in zip(locs, labels, probs):
      # Ordering convention differs, hence [1], [0] rather than [0], [1]
      left = loc[1] * raw_width
      top = loc[0] * raw_height
      width = (loc[3] - loc[1]) * raw_width
      height = (loc[2] - loc[0]) * raw_height
      decoded.append([image_id, left, top, width, height, prob,
                      ssd_constants.CLASS_INV_MAP[label]])
  mlperf.logger.log(key=mlperf.tags.NMS_THRESHOLD,
                    value=ssd_constants.OVERLAP_CRITERIA)
  mlperf.logger.log(key=mlperf.tags.NMS_MAX_DETECTIONS,
                    value=ssd_constants.MAX_NUM_EVAL_BOXES)
  return decoded
def decode_single(bboxes_in, scores_in, criteria, max_output, max_num=200):
  """Decodes one example's boxes/scores with per-class NMS.

  Reference to https://github.com/amdegroot/ssd.pytorch

  Args:
    bboxes_in: numpy array of candidate boxes, one row per box.
    scores_in: numpy array of per-class scores, one column per class; column 0
      is background and is skipped.
    criteria: IoU threshold above which boxes are suppressed.
    max_output: maximum number of detections returned overall.
    max_num: maximum number of candidates fed to NMS per class.

  Returns:
    Tuple (boxes, labels, scores) of numpy arrays, keeping the `max_output`
    highest-scoring detections. If nothing survives thresholding, returns a
    single dummy detection so downstream code always has one row.
  """
  bboxes_out = []
  scores_out = []
  labels_out = []
  for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)):
    score = np.squeeze(score, 1)
    # skip background
    if i == 0:
      continue
    mask = score > ssd_constants.MIN_SCORE
    if not np.any(mask):
      continue
    bboxes, score = bboxes_in[mask, :], score[mask]
    # Keep only the indices of the max_num highest scores, lowest first.
    # (Removed a dead `score_sorted = score[score_idx_sorted]` local that was
    # computed but never used.)
    score_idx_sorted = np.argsort(score)
    score_idx_sorted = score_idx_sorted[-max_num:]
    candidates = []
    # perform non-maximum suppression
    while len(score_idx_sorted):
      idx = score_idx_sorted[-1]
      bboxes_sorted = bboxes[score_idx_sorted, :]
      bboxes_idx = bboxes[idx, :]
      iou = calc_iou(bboxes_idx, bboxes_sorted)
      # Drop everything too similar to the current best box; the best box
      # itself has IoU 1 with itself, so it is removed from the queue too.
      score_idx_sorted = score_idx_sorted[iou < criteria]
      candidates.append(idx)
    bboxes_out.append(bboxes[candidates, :])
    scores_out.append(score[candidates])
    labels_out.extend([i]*len(candidates))
  if not scores_out:
    tf.logging.info("No objects detected. Returning dummy values.")
    return (
        np.zeros(shape=(1, 4), dtype=np.float32),
        np.zeros(shape=(1,), dtype=np.int32),
        np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE,
    )
  bboxes_out = np.concatenate(bboxes_out, axis=0)
  scores_out = np.concatenate(scores_out, axis=0)
  labels_out = np.array(labels_out)
  # Keep only the max_output highest-scoring detections across all classes.
  max_ids = np.argsort(scores_out)[-max_output:]
  return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Constants used in tf_cnn_benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from enum import Enum
# Prefixes used to tag names of fetched eval/accuracy results.
# Results fetched with this prefix will not be reduced. Instead, they will be
# passed as matrices to model's postprocess function.
UNREDUCED_ACCURACY_OP_PREFIX = "tensor:"
# Eval result values with this name prefix will be included in summary.
SIMPLE_VALUE_RESULT_PREFIX = "simple_value:"
class BenchmarkMode(object):
  """Benchmark running mode."""
  # String constants naming each phase combination the benchmark can run.
  TRAIN = "training"
  EVAL = "evaluation"
  TRAIN_AND_EVAL = "training + evaluation"
  FORWARD_ONLY = "forward only"
class NetworkTopology(str, Enum):
  """Network topology describes how multiple GPUs are inter-connected.

  Subclassing str means each member compares equal to, and can be used as,
  its plain string value.
  """
  # DGX-1 uses hybrid cube mesh topology with the following device peer to peer
  # matrix:
  # DMA: 0 1 2 3 4 5 6 7
  # 0:   Y Y Y Y Y N N N
  # 1:   Y Y Y Y N Y N N
  # 2:   Y Y Y Y N N Y N
  # 3:   Y Y Y Y N N N Y
  # 4:   Y N N N Y Y Y Y
  # 5:   N Y N N Y Y Y Y
  # 6:   N N Y N Y Y Y Y
  # 7:   N N N Y Y Y Y Y
  DGX1 = "dgx1"
  # V100 in GCP are connected with the following device peer to peer matrix.
  # In this topology, bandwidth of the connection depends on if it uses NVLink
  # or PCIe link.
  # DMA: 0 1 2 3 4 5 6 7
  # 0:   Y Y Y Y N Y N N
  # 1:   Y Y Y Y N N N N
  # 2:   Y Y Y Y N N N Y
  # 3:   Y Y Y Y N N N N
  # 4:   N N N N Y Y Y Y
  # 5:   Y N N N Y Y Y Y
  # 6:   N N N N Y Y Y Y
  # 7:   N N Y N Y Y Y Y
  GCP_V100 = "gcp_v100"
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN builder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import contextlib
import numpy as np
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import mlperf
from tensorflow.python.layers import convolutional as conv_layers
from tensorflow.python.layers import core as core_layers
from tensorflow.python.layers import normalization as normalization_layers
from tensorflow.python.layers import pooling as pooling_layers
from tensorflow.python.training import moving_averages
# Maps a data_format string to the index of the channel axis of a 4-D tensor.
_data_format_to_channel_axis = {'NCHW': 1, 'NHWC': 3}
class ConvNetBuilder(object):
  """Builder of cnn net.

  Maintains a `top_layer`/`top_size` cursor: each layer method consumes the
  current top layer (unless given an explicit input_layer) and advances the
  cursor to its output.
  """
  def __init__(self,
               input_op,
               input_nchan,
               phase_train,
               use_tf_layers,
               data_format='NCHW',
               dtype=tf.float32,
               variable_dtype=tf.float32):
    """Initializes the builder.

    Args:
      input_op: tensor the network is built on top of.
      input_nchan: number of channels of input_op.
      phase_train: whether the graph is built for training.
      use_tf_layers: if True, build with tf.layers classes; otherwise use raw
        tf.nn ops and manually-created variables.
      data_format: 'NCHW' or 'NHWC'.
      dtype: dtype of activations.
      variable_dtype: dtype in which variables are stored.
    """
    self.top_layer = input_op
    self.top_size = input_nchan
    self.phase_train = phase_train
    self.use_tf_layers = use_tf_layers
    self.data_format = data_format
    self.dtype = dtype
    self.variable_dtype = variable_dtype
    # Per-layer-type counters used to generate unique layer names
    # ('conv0', 'conv1', ...).
    self.counts = defaultdict(lambda: 0)
    self.use_batch_norm = False
    self.batch_norm_config = {}  # 'decay': 0.997, 'scale': True}
    self.channel_pos = ('channels_last'
                        if data_format == 'NHWC' else 'channels_first')
    # Cursor for the auxiliary arm; see switch_to_aux_top_layer().
    self.aux_top_layer = None
    self.aux_top_size = 0
  def get_custom_getter(self):
    """Returns a custom getter that this class's methods must be called under.
    All methods of this class must be called under a variable scope that was
    passed this custom getter. Example:
    ```python
    network = ConvNetBuilder(...)
    with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
      network.conv(...)
      # Call more methods of network here
    ```
    Currently, this custom getter only does anything if self.use_tf_layers is
    True. In that case, it causes variables to be stored as dtype
    self.variable_type, then casted to the requested dtype, instead of directly
    storing the variable as the requested dtype.
    """
    def inner_custom_getter(getter, *args, **kwargs):
      """Custom getter that forces variables to have type self.variable_type."""
      if not self.use_tf_layers:
        return getter(*args, **kwargs)
      requested_dtype = kwargs['dtype']
      if not (requested_dtype == tf.float32 and
              self.variable_dtype == tf.float16):
        # Only change the variable dtype if doing so does not decrease variable
        # precision.
        kwargs['dtype'] = self.variable_dtype
      var = getter(*args, **kwargs)
      # This if statement is needed to guard the cast, because batch norm
      # assigns directly to the return value of this custom getter. The cast
      # makes the return value not a variable so it cannot be assigned. Batch
      # norm variables are always in fp32 so this if statement is never
      # triggered for them.
      if var.dtype.base_dtype != requested_dtype:
        var = tf.cast(var, requested_dtype)
      return var
    return inner_custom_getter
  @contextlib.contextmanager
  def switch_to_aux_top_layer(self):
    """Context that construct cnn in the auxiliary arm."""
    if self.aux_top_layer is None:
      raise RuntimeError('Empty auxiliary top layer in the network.')
    # Swap the main cursor for the auxiliary one for the duration of the
    # `with` block, then restore the main cursor.
    saved_top_layer = self.top_layer
    saved_top_size = self.top_size
    self.top_layer = self.aux_top_layer
    self.top_size = self.aux_top_size
    yield
    self.aux_top_layer = self.top_layer
    self.aux_top_size = self.top_size
    self.top_layer = saved_top_layer
    self.top_size = saved_top_size
  def get_variable(self, name, shape, dtype, cast_dtype, *args, **kwargs):
    """Creates a variable stored as `dtype` and returns it cast to `cast_dtype`."""
    # TODO(reedwm): Currently variables and gradients are transferred to other
    # devices and machines as type `dtype`, not `cast_dtype`. In particular,
    # this means in fp16 mode, variables are transferred as fp32 values, not
    # fp16 values, which uses extra bandwidth.
    var = tf.get_variable(name, shape, dtype, *args, **kwargs)
    return tf.cast(var, cast_dtype)
  def _conv2d_impl(self, input_layer, num_channels_in, filters, kernel_size,
                   strides, padding, kernel_initializer):
    """Builds a bias-free conv2d, via tf.layers or a raw tf.nn.conv2d op."""
    if self.use_tf_layers:
      return conv_layers.conv2d(input_layer, filters, kernel_size, strides,
                                padding, self.channel_pos,
                                kernel_initializer=kernel_initializer,
                                use_bias=False)
    else:
      weights_shape = [kernel_size[0], kernel_size[1], num_channels_in, filters]
      # We use the name 'conv2d/kernel' so the variable has the same name as its
      # tf.layers equivalent. This way, if a checkpoint is written when
      # self.use_tf_layers == True, it can be loaded when
      # self.use_tf_layers == False, and vice versa.
      weights = self.get_variable('conv2d/kernel', weights_shape,
                                  self.variable_dtype, self.dtype,
                                  initializer=kernel_initializer)
      if self.data_format == 'NHWC':
        strides = [1] + strides + [1]
      else:
        strides = [1, 1] + strides
      return tf.nn.conv2d(input_layer, weights, strides, padding,
                          data_format=self.data_format)
  def conv(self,
           num_out_channels,
           k_height,
           k_width,
           d_height=1,
           d_width=1,
           mode='SAME',
           input_layer=None,
           num_channels_in=None,
           use_batch_norm=None,
           stddev=None,
           activation='relu',
           bias=0.0,
           kernel_initializer=None):
    """Construct a conv2d layer on top of cnn."""
    if input_layer is None:
      input_layer = self.top_layer
    if num_channels_in is None:
      num_channels_in = self.top_size
    if stddev is not None and kernel_initializer is None:
      kernel_initializer = tf.truncated_normal_initializer(stddev=stddev)
    if kernel_initializer is None:
      kernel_initializer = tf.variance_scaling_initializer()
    name = 'conv' + str(self.counts['conv'])
    self.counts['conv'] += 1
    with tf.variable_scope(name):
      # Strides are built in NHWC order, then transposed for NCHW.
      strides = [1, d_height, d_width, 1]
      if self.data_format == 'NCHW':
        strides = [strides[0], strides[3], strides[1], strides[2]]
      if mode != 'SAME_RESNET':
        conv = self._conv2d_impl(input_layer, num_channels_in, num_out_channels,
                                 kernel_size=[k_height, k_width],
                                 strides=[d_height, d_width], padding=mode,
                                 kernel_initializer=kernel_initializer)
      else:  # Special padding mode for ResNet models
        if d_height == 1 and d_width == 1:
          conv = self._conv2d_impl(input_layer, num_channels_in,
                                   num_out_channels,
                                   kernel_size=[k_height, k_width],
                                   strides=[d_height, d_width], padding='SAME',
                                   kernel_initializer=kernel_initializer)
        else:
          # Explicitly pad so strided convolutions crop symmetrically, then
          # convolve with VALID padding.
          rate = 1  # Unused (for 'a trous' convolutions)
          kernel_height_effective = k_height + (k_height - 1) * (rate - 1)
          pad_h_beg = (kernel_height_effective - 1) // 2
          pad_h_end = kernel_height_effective - 1 - pad_h_beg
          kernel_width_effective = k_width + (k_width - 1) * (rate - 1)
          pad_w_beg = (kernel_width_effective - 1) // 2
          pad_w_end = kernel_width_effective - 1 - pad_w_beg
          padding = [[0, 0], [pad_h_beg, pad_h_end],
                     [pad_w_beg, pad_w_end], [0, 0]]
          if self.data_format == 'NCHW':
            padding = [padding[0], padding[3], padding[1], padding[2]]
          padded_input_layer = tf.pad(input_layer, padding)
          conv = self._conv2d_impl(padded_input_layer, num_channels_in,
                                   num_out_channels,
                                   kernel_size=[k_height, k_width],
                                   strides=[d_height, d_width], padding='VALID',
                                   kernel_initializer=kernel_initializer)
      if use_batch_norm is None:
        use_batch_norm = self.use_batch_norm
      mlperf.logger.log_conv2d(input_tensor=input_layer, output_tensor=conv,
                               stride_height=d_height, stride_width=d_width,
                               filters=num_out_channels,
                               initializer=kernel_initializer,
                               use_bias=not use_batch_norm and bias is not None)
      if not use_batch_norm:
        if bias is not None:
          biases = self.get_variable('biases', [num_out_channels],
                                     self.variable_dtype, self.dtype,
                                     initializer=tf.constant_initializer(bias))
          biased = tf.reshape(
              tf.nn.bias_add(conv, biases, data_format=self.data_format),
              conv.get_shape())
        else:
          biased = conv
      else:
        self.top_layer = conv
        self.top_size = num_out_channels
        biased = self.batch_norm(**self.batch_norm_config)
      if activation == 'relu':
        mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU)
        conv1 = tf.nn.relu(biased)
      elif activation == 'linear' or activation is None:
        conv1 = biased
      elif activation == 'tanh':
        conv1 = tf.nn.tanh(biased)
      else:
        raise KeyError('Invalid activation type \'%s\'' % activation)
      self.top_layer = conv1
      self.top_size = num_out_channels
      return conv1
  def _pool(self,
            pool_name,
            pool_function,
            k_height,
            k_width,
            d_height,
            d_width,
            mode,
            input_layer,
            num_channels_in):
    """Construct a pooling layer."""
    if input_layer is None:
      input_layer = self.top_layer
    else:
      self.top_size = num_channels_in
    name = pool_name + str(self.counts[pool_name])
    self.counts[pool_name] += 1
    if self.use_tf_layers:
      pool = pool_function(
          input_layer, [k_height, k_width], [d_height, d_width],
          padding=mode,
          data_format=self.channel_pos,
          name=name)
    else:
      if self.data_format == 'NHWC':
        ksize = [1, k_height, k_width, 1]
        strides = [1, d_height, d_width, 1]
      else:
        ksize = [1, 1, k_height, k_width]
        strides = [1, 1, d_height, d_width]
      # NOTE(review): this branch always uses tf.nn.max_pool, even when called
      # from apool() — average pooling with use_tf_layers=False silently
      # max-pools instead. Confirm whether that path is intended/supported.
      pool = tf.nn.max_pool(input_layer, ksize, strides, padding=mode,
                            data_format=self.data_format, name=name)
    if pool_name == 'mpool':
      mlperf.logger.log_max_pool(input_tensor=input_layer,
                                 output_tensor=pool)
    self.top_layer = pool
    return pool
  def mpool(self,
            k_height,
            k_width,
            d_height=2,
            d_width=2,
            mode='VALID',
            input_layer=None,
            num_channels_in=None):
    """Construct a max pooling layer."""
    return self._pool('mpool', pooling_layers.max_pooling2d, k_height, k_width,
                      d_height, d_width, mode, input_layer, num_channels_in)
  def apool(self,
            k_height,
            k_width,
            d_height=2,
            d_width=2,
            mode='VALID',
            input_layer=None,
            num_channels_in=None):
    """Construct an average pooling layer."""
    return self._pool('apool', pooling_layers.average_pooling2d, k_height,
                      k_width, d_height, d_width, mode, input_layer,
                      num_channels_in)
  def reshape(self, shape, input_layer=None):
    """Reshapes the top layer (or `input_layer`) to `shape`."""
    if input_layer is None:
      input_layer = self.top_layer
    self.top_layer = tf.reshape(input_layer, shape)
    self.top_size = shape[-1]  # HACK This may not always work
    return self.top_layer
  def affine(self,
             num_out_channels,
             input_layer=None,
             num_channels_in=None,
             bias=0.0,
             stddev=None,
             activation='relu'):
    """Constructs a fully-connected (dense) layer on top of cnn."""
    if input_layer is None:
      input_layer = self.top_layer
    if num_channels_in is None:
      num_channels_in = self.top_size
    name = 'affine' + str(self.counts['affine'])
    self.counts['affine'] += 1
    with tf.variable_scope(name):
      # He-style init: larger stddev for relu to preserve activation variance.
      init_factor = 2. if activation == 'relu' else 1.
      stddev = stddev or np.sqrt(init_factor / num_channels_in)
      kernel = self.get_variable(
          'weights', [num_channels_in, num_out_channels],
          self.variable_dtype, self.dtype,
          initializer=tf.truncated_normal_initializer(stddev=stddev))
      biases = self.get_variable('biases', [num_out_channels],
                                 self.variable_dtype, self.dtype,
                                 initializer=tf.constant_initializer(bias))
      mlperf.logger.log(key=mlperf.tags.MODEL_HP_DENSE,
                        value=num_out_channels)
      logits = tf.nn.xw_plus_b(input_layer, kernel, biases)
      if activation == 'relu':
        mlperf.logger.log(key=mlperf.tags.MODEL_HP_RELU)
        affine1 = tf.nn.relu(logits, name=name)
      elif activation == 'linear' or activation is None:
        affine1 = logits
      else:
        raise KeyError('Invalid activation type \'%s\'' % activation)
      self.top_layer = affine1
      self.top_size = num_out_channels
      return affine1
  def inception_module(self, name, cols, input_layer=None, in_size=None):
    """Builds parallel columns of layers and concatenates their outputs.

    `cols` is a list of columns; each column is a list of layer specs of the
    form (layer_type, *args) where layer_type is 'conv', 'mpool', 'apool', or
    'share' (reuse the matching layer of the previous column).
    """
    if input_layer is None:
      input_layer = self.top_layer
    if in_size is None:
      in_size = self.top_size
    name += str(self.counts[name])
    self.counts[name] += 1
    with tf.variable_scope(name):
      col_layers = []
      col_layer_sizes = []
      for c, col in enumerate(cols):
        col_layers.append([])
        col_layer_sizes.append([])
        for l, layer in enumerate(col):
          ltype, args = layer[0], layer[1:]
          # Only the first layer of each column reads the module input; later
          # layers chain off the builder's cursor.
          kwargs = {
              'input_layer': input_layer,
              'num_channels_in': in_size
          } if l == 0 else {}
          if ltype == 'conv':
            self.conv(*args, **kwargs)
          elif ltype == 'mpool':
            self.mpool(*args, **kwargs)
          elif ltype == 'apool':
            self.apool(*args, **kwargs)
          elif ltype == 'share':  # Share matching layer from previous column
            self.top_layer = col_layers[c - 1][l]
            self.top_size = col_layer_sizes[c - 1][l]
          else:
            raise KeyError(
                'Invalid layer type for inception module: \'%s\'' % ltype)
          col_layers[c].append(self.top_layer)
          col_layer_sizes[c].append(self.top_size)
      catdim = 3 if self.data_format == 'NHWC' else 1
      self.top_layer = tf.concat([layers[-1] for layers in col_layers], catdim)
      self.top_size = sum([sizes[-1] for sizes in col_layer_sizes])
      return self.top_layer
  def spatial_mean(self, keep_dims=False):
    """Averages the top layer over its spatial (height/width) dimensions."""
    name = 'spatial_mean' + str(self.counts['spatial_mean'])
    self.counts['spatial_mean'] += 1
    axes = [1, 2] if self.data_format == 'NHWC' else [2, 3]
    self.top_layer = tf.reduce_mean(
        self.top_layer, axes, keepdims=keep_dims, name=name)
    return self.top_layer
  def dropout(self, keep_prob=0.5, input_layer=None):
    """Adds a dropout layer; a no-op (keep_prob=1) when not training."""
    if input_layer is None:
      input_layer = self.top_layer
    else:
      self.top_size = None
    # NOTE(review): unlike the other layer methods, the 'dropout' counter is
    # never incremented, so repeated dropout layers share the name 'dropout0'.
    # Confirm whether that is intentional (dropout creates no variables).
    name = 'dropout' + str(self.counts['dropout'])
    with tf.variable_scope(name):
      if not self.phase_train:
        keep_prob = 1.0
      if self.use_tf_layers:
        dropout = core_layers.dropout(input_layer, 1. - keep_prob,
                                      training=self.phase_train)
      else:
        dropout = tf.nn.dropout(input_layer, keep_prob)
      self.top_layer = dropout
      return dropout
  def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Batch normalization on `input_layer` without tf.layers."""
    # We make this function as similar as possible to the
    # tf.contrib.layers.batch_norm, to minimize the differences between using
    # layers and not using layers.
    shape = input_layer.shape
    num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
    beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                             initializer=tf.zeros_initializer())
    if use_scale:
      gamma = self.get_variable('gamma', [num_channels], tf.float32,
                                tf.float32, initializer=tf.ones_initializer())
    else:
      gamma = tf.constant(1.0, tf.float32, [num_channels])
    # For moving variables, we use tf.get_variable instead of self.get_variable,
    # since self.get_variable returns the result of tf.cast which we cannot
    # assign to.
    moving_mean = tf.get_variable('moving_mean', [num_channels],
                                  tf.float32,
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
    moving_variance = tf.get_variable('moving_variance', [num_channels],
                                      tf.float32,
                                      initializer=tf.ones_initializer(),
                                      trainable=False)
    if self.phase_train:
      bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, epsilon=epsilon,
          data_format=self.data_format, is_training=True)
      # Update the moving statistics as part of UPDATE_OPS so training keeps
      # them current for later inference.
      mean_update = moving_averages.assign_moving_average(
          moving_mean, batch_mean, decay=decay, zero_debias=False)
      variance_update = moving_averages.assign_moving_average(
          moving_variance, batch_variance, decay=decay, zero_debias=False)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
    else:
      bn, _, _ = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, mean=moving_mean,
          variance=moving_variance, epsilon=epsilon,
          data_format=self.data_format, is_training=False)
    return bn
  def batch_norm(self, input_layer=None, decay=0.999, scale=False,
                 epsilon=0.001):
    """Adds a Batch Normalization layer."""
    if input_layer is None:
      input_layer = self.top_layer
    else:
      self.top_size = None
    name = 'batchnorm' + str(self.counts['batchnorm'])
    self.counts['batchnorm'] += 1
    center = True
    with tf.variable_scope(name) as scope:
      if self.use_tf_layers:
        layer_obj = normalization_layers.BatchNormalization(
            momentum=decay,
            scale=scale,
            epsilon=epsilon,
            fused=True,
            axis=_data_format_to_channel_axis[self.data_format],
            # We pass this 'scope' argument for compatibility with checkpoints
            # created with the contrib version of batch norm. tf_cnn_benchmarks
            # used to use the contrib version.
            _scope=scope,
            center=center,
            name=scope.name)
        bn = layer_obj.apply(input_layer, training=self.phase_train)
      else:
        bn = self._batch_norm_without_layers(input_layer, decay, scale, epsilon)
    self.top_layer = bn
    self.top_size = bn.shape[3] if self.data_format == 'NHWC' else bn.shape[1]
    self.top_size = int(self.top_size)
    mlperf.logger.log_batch_norm(
        input_tensor=input_layer, output_tensor=bn, momentum=decay,
        epsilon=epsilon, center=center, scale=scale, training=self.phase_train)
    return bn
  def lrn(self, depth_radius, bias, alpha, beta):
    """Adds a local response normalization layer."""
    name = 'lrn' + str(self.counts['lrn'])
    self.counts['lrn'] += 1
    self.top_layer = tf.nn.lrn(
        self.top_layer, depth_radius, bias, alpha, beta, name=name)
    return self.top_layer
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmark dataset utilities.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from abc import abstractmethod
import os
import numpy as np
import six
from six.moves import cPickle
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
from tensorflow.python.platform import gfile
import preprocessing
# Number of images in the ImageNet (ILSVRC 2012) train/validation splits.
IMAGENET_NUM_TRAIN_IMAGES = 1281167
IMAGENET_NUM_VAL_IMAGES = 50000
# Number of images in the COCO train/validation splits used by this benchmark.
COCO_NUM_TRAIN_IMAGES = 118287
COCO_NUM_VAL_IMAGES = 4952
class Dataset(object):
  """Abstract class for cnn benchmarks dataset."""
  def __init__(self,
               name,
               data_dir=None,
               queue_runner_required=False,
               num_classes=None):
    # data_dir of None/'' means no real data: use_synthetic_gpu_inputs() is
    # then True.
    self.name = name
    self.data_dir = data_dir
    self._queue_runner_required = queue_runner_required
    self._num_classes = num_classes
  def tf_record_pattern(self, subset):
    """Returns the glob pattern matching TFRecord files of `subset`."""
    return os.path.join(self.data_dir, '%s-*-of-*' % subset)
  def reader(self):
    """Returns the TensorFlow reader used to read record files."""
    return tf.TFRecordReader()
  @property
  def num_classes(self):
    """Number of label classes, or None if not set."""
    return self._num_classes
  @num_classes.setter
  def num_classes(self, val):
    self._num_classes = val
  @abstractmethod
  def num_examples_per_epoch(self, subset):
    """Returns the number of examples in `subset`; subclasses must override."""
    pass
  def __str__(self):
    return self.name
  def get_input_preprocessor(self, input_preprocessor='default'):
    """Returns the preprocessor class registered for this dataset name."""
    assert not self.use_synthetic_gpu_inputs()
    return _SUPPORTED_INPUT_PREPROCESSORS[self.name][input_preprocessor]
  def queue_runner_required(self):
    """Returns whether reading this dataset requires TF queue runners."""
    return self._queue_runner_required
  def use_synthetic_gpu_inputs(self):
    """Returns True when no data_dir is set, i.e. inputs are synthetic."""
    return not self.data_dir
class LibrispeechDataset(Dataset):
  """Configuration for LibriSpeech dataset."""

  def __init__(self, data_dir=None):
    super(LibrispeechDataset, self).__init__(
        'librispeech', data_dir, num_classes=29)

  def tf_record_pattern(self, subset):
    """Returns the TFRecord glob for `subset`, or '' for unknown subsets."""
    patterns = {
        'train': 'train-clean-*.tfrecords',
        'validation': 'test-clean.tfrecords',
    }
    if subset not in patterns:
      return ''
    return os.path.join(self.data_dir, patterns[subset])

  def num_examples_per_epoch(self, subset='train'):
    """Returns a fixed epoch size, identical for every subset."""
    del subset
    return 2  # TODO(laigd): currently this is an arbitrary number.
class ImageDataset(Dataset):
  """Abstract class for image datasets."""
  def __init__(self,
               name,
               height,
               width,
               depth=None,
               data_dir=None,
               queue_runner_required=False,
               num_classes=1001):
    """Initializes the image dataset.

    Args:
      name: dataset name, passed through to Dataset.
      height: image height in pixels.
      width: image width in pixels.
      depth: number of channels; a falsy value (None/0) defaults to 3.
      data_dir: directory containing input data, or None for synthetic data.
      queue_runner_required: whether reading the data needs a queue runner.
      num_classes: number of label classes.
    """
    super(ImageDataset, self).__init__(name, data_dir, queue_runner_required,
                                       num_classes)
    self.height = height
    self.width = width
    self.depth = depth or 3
class ImagenetDataset(ImageDataset):
  """Configuration for Imagenet dataset."""

  def __init__(self, data_dir=None):
    super(ImagenetDataset, self).__init__(
        'imagenet', 300, 300, data_dir=data_dir)

  def num_examples_per_epoch(self, subset='train'):
    """Returns the number of ImageNet examples in `subset`."""
    sizes = {'train': IMAGENET_NUM_TRAIN_IMAGES,
             'validation': IMAGENET_NUM_VAL_IMAGES}
    if subset not in sizes:
      raise ValueError('Invalid data subset "%s"' % subset)
    return sizes[subset]
class Cifar10Dataset(ImageDataset):
  """Configuration for cifar 10 dataset.

  It will mount all the input images to memory.
  """

  def __init__(self, data_dir=None):
    super(Cifar10Dataset, self).__init__(
        'cifar10',
        32,
        32,
        data_dir=data_dir,
        queue_runner_required=True,
        num_classes=11)

  def read_data_files(self, subset='train'):
    """Reads from data file and returns images and labels in a numpy array."""
    assert self.data_dir, ('Cannot call `read_data_files` when using synthetic '
                           'data')
    if subset == 'train':
      filenames = [os.path.join(self.data_dir, 'data_batch_%d' % i)
                   for i in xrange(1, 6)]
    elif subset == 'validation':
      filenames = [os.path.join(self.data_dir, 'test_batch')]
    else:
      raise ValueError('Invalid data subset "%s"' % subset)
    # python2 does not have the encoding parameter
    decode_kwargs = {} if six.PY2 else {'encoding': 'bytes'}
    batches = []
    for filename in filenames:
      with gfile.Open(filename, 'rb') as f:
        batches.append(cPickle.load(f, **decode_kwargs))
    # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
    # input format.
    all_images = np.concatenate(
        [batch[b'data'] for batch in batches]).astype(np.float32)
    all_labels = np.concatenate([batch[b'labels'] for batch in batches])
    return all_images, all_labels

  def num_examples_per_epoch(self, subset='train'):
    """Returns the number of CIFAR-10 examples in `subset`."""
    sizes = {'train': 50000, 'validation': 10000}
    if subset not in sizes:
      raise ValueError('Invalid data subset "%s"' % subset)
    return sizes[subset]
class COCODataset(ImageDataset):
  """Configuration for COCO dataset."""

  def __init__(self, data_dir=None, image_size=300):
    super(COCODataset, self).__init__(
        'coco', image_size, image_size, data_dir=data_dir, num_classes=81)

  def num_examples_per_epoch(self, subset='train'):
    """Returns the number of COCO examples in `subset`."""
    sizes = {'train': COCO_NUM_TRAIN_IMAGES,
             'validation': COCO_NUM_VAL_IMAGES}
    if subset not in sizes:
      raise ValueError('Invalid data subset "%s"' % subset)
    return sizes[subset]
# Registry mapping dataset name -> Dataset subclass; used by create_dataset().
_SUPPORTED_DATASETS = {
    'imagenet': ImagenetDataset,
    'cifar10': Cifar10Dataset,
    'librispeech': LibrispeechDataset,
    'coco': COCODataset,
}
# Registry mapping dataset name -> {preprocessor name -> preprocessor class};
# looked up by Dataset.get_input_preprocessor().
_SUPPORTED_INPUT_PREPROCESSORS = {
    'imagenet': {
        'default': preprocessing.RecordInputImagePreprocessor,
        'official_models_imagenet': preprocessing.ImagenetPreprocessor,
    },
    'cifar10': {
        'default': preprocessing.Cifar10ImagePreprocessor
    },
    'librispeech': {
        'default': preprocessing.LibrispeechPreprocessor
    },
    'coco': {
        'default': preprocessing.COCOPreprocessor
    },
}
def create_dataset(data_dir, data_name):
  """Create a Dataset instance based on data_dir and data_name.

  Args:
    data_dir: Directory containing the dataset, or None/empty to use synthetic
      data.
    data_name: Name of the dataset. If None, the name is inferred from
      data_dir.

  Returns:
    A Dataset instance for the requested dataset.

  Raises:
    ValueError: If the dataset name cannot be inferred from data_dir, or the
      dataset is not one of the supported datasets.
  """
  if not data_dir and not data_name:
    # When using synthetic data, use synthetic imagenet images by default.
    data_name = 'imagenet'
  # Infer dataset name from data_dir if data_name is not provided.
  if data_name is None:
    for supported_name in _SUPPORTED_DATASETS:
      if supported_name in data_dir:
        data_name = supported_name
        break
    else:  # Failed to identify dataset name from data dir.
      raise ValueError('Could not identify name of dataset. '
                       'Please specify with --data_name option.')
  if data_name not in _SUPPORTED_DATASETS:
    # sorted() already returns a list, so no comprehension is needed.
    raise ValueError('Unknown dataset. Must be one of %s' %
                     ', '.join(sorted(_SUPPORTED_DATASETS)))
  return _SUPPORTED_DATASETS[data_name](data_dir)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains functions to define flags and params.
Calling a DEFINE_* function will add a ParamSpec namedtuple to the param_spec
dict. The DEFINE_* arguments match those in absl. Calling define_flags() creates
a command-line flag for every ParamSpec defined by a DEFINE_* functions.
The reason we don't use absl flags directly is that we want to be able to use
tf_cnn_benchmarks as a library. When using it as a library, we don't want to
define any flags, but instead pass parameters to the BenchmarkCNN constructor.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
from absl import flags as absl_flags
import six
FLAGS = absl_flags.FLAGS
# ParamSpec describes one of benchmark_cnn.BenchmarkCNN's parameters: the absl
# flag type ('string', 'boolean', ...), the default value, the help text, and
# any extra keyword arguments (bounds, enum values) for the absl DEFINE_* call.
ParamSpec = namedtuple('_ParamSpec',
                       ['flag_type', 'default_value', 'description',
                        'kwargs'])
# Maps from parameter name to its ParamSpec.
param_specs = {}
def DEFINE_string(name, default, help):  # pylint: disable=invalid-name,redefined-builtin
  """Records a string param spec for `name` in param_specs."""
  param_specs[name] = ParamSpec(
      flag_type='string', default_value=default, description=help, kwargs={})
def DEFINE_boolean(name, default, help):  # pylint: disable=invalid-name,redefined-builtin
  """Records a boolean param spec for `name` in param_specs."""
  param_specs[name] = ParamSpec(
      flag_type='boolean', default_value=default, description=help, kwargs={})
def DEFINE_integer(name, default, help, lower_bound=None, upper_bound=None):  # pylint: disable=invalid-name,redefined-builtin
  """Records an integer param spec, with optional bounds, in param_specs."""
  bounds = {'lower_bound': lower_bound, 'upper_bound': upper_bound}
  param_specs[name] = ParamSpec('integer', default, help, bounds)
def DEFINE_float(name, default, help, lower_bound=None, upper_bound=None):  # pylint: disable=invalid-name,redefined-builtin
  """Records a float param spec, with optional bounds, in param_specs."""
  bounds = {'lower_bound': lower_bound, 'upper_bound': upper_bound}
  param_specs[name] = ParamSpec('float', default, help, bounds)
def DEFINE_enum(name, default, enum_values, help):  # pylint: disable=invalid-name,redefined-builtin
  """Records an enum param spec restricted to `enum_values` in param_specs."""
  param_specs[name] = ParamSpec('enum', default, help,
                                {'enum_values': enum_values})
def DEFINE_list(name, default, help):  # pylint: disable=invalid-name,redefined-builtin
  """Records a list param spec for `name` in param_specs."""
  param_specs[name] = ParamSpec(
      flag_type='list', default_value=default, description=help, kwargs={})
def define_flags(specs=None):
  """Define a command line flag for each ParamSpec in flags.param_specs.

  Args:
    specs: Dict mapping flag name to ParamSpec; defaults to the module-level
      param_specs when falsy.

  Raises:
    ValueError: If a spec has an unrecognized flag_type.
  """
  specs = specs or param_specs
  # Dispatch table from ParamSpec.flag_type to the absl DEFINE_* function.
  definers = {
      'boolean': absl_flags.DEFINE_boolean,
      'float': absl_flags.DEFINE_float,
      'integer': absl_flags.DEFINE_integer,
      'string': absl_flags.DEFINE_string,
      'enum': absl_flags.DEFINE_enum,
      'list': absl_flags.DEFINE_list
  }
  for name, spec in six.iteritems(specs):
    if spec.flag_type not in definers:
      raise ValueError('Unknown flag_type %s' % spec.flag_type)
    definers[spec.flag_type](name, spec.default_value,
                             help=spec.description,
                             **spec.kwargs)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmark various leading indicators CNNs.
The purpose of these tests is to test each model as a high level baseline and
to ensure the various variable_update options have not regressed. Not all
options are tested. The tests focus on the most viable options.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ctypes
import logging
import os
import sys
from absl import flags
from absl.testing import absltest # pylint: disable=unused-import
import tensorflow.compat.v1 as tf # pylint: disable=g-bad-import-order
import benchmark_cnn
from platforms import util as platforms_util
# Command-line override for the number of measured batches; None means each
# benchmark keeps its own default.
flags.DEFINE_integer('num_batches', None,
                     'number of batches to run, excluding warmup')
class BenchmarkBase(tf.test.Benchmark):
  """Base class for all benchmarks in this file."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Base class for all benchmarks in this file.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
                constructor forward compatible in case PerfZero provides more
                named arguments before updating the constructor.
    """
    # Load default values if the benchmark is not run with absl.app.run()
    if not flags.FLAGS.is_parsed():
      flags.FLAGS.mark_as_parsed()
    self.fake_data_dir = os.path.join(platforms_util.get_test_data_dir(),
                                      'fake_tf_record_data')
    self.output_dir = output_dir
    if root_data_dir is None:
      # Default location of the imagenet 2012 TFRecord data.
      self.data_dir = ('/readahead/200M/placer/prod/home/distbelief/'
                       'imagenet-tensorflow/imagenet-2012-tfrecord')
    else:
      self.data_dir = os.path.join(root_data_dir, 'imagenet')

  def _run_benchmark(self, params):
    """Run a CNN benchmark and report its results.

    Args:
      params: Params tuple, typically created by benchmark_cnn.make_params or
        benchmark_cnn.make_params_from_flags.
    """
    logging.info('Running benchmark [%s]', self._get_name())
    params = benchmark_cnn.setup(params)
    bench = benchmark_cnn.BenchmarkCNN(params)
    bench.print_info()
    stats = bench.run()
    # Only forward the metrics the reporting infrastructure knows about.
    extras = {}
    extras['examples_per_sec'] = stats.get('images_per_sec')
    if 'last_average_loss' in stats:
      extras['last_average_loss'] = stats['last_average_loss']
    if 'top_1_accuracy' in stats:
      extras['top_1_accuracy'] = stats['top_1_accuracy']
    if 'top_5_accuracy' in stats:
      extras['top_5_accuracy'] = stats['top_5_accuracy']
    self.report_benchmark(
        iters=stats.get('num_steps'),
        wall_time=stats.get('average_wall_time'),
        extras=extras)

  def _shared_params(self):
    """Returns shared parameters for all benchmarks in this file."""
    params = {}
    if flags.FLAGS.num_batches is not None:
      params['num_batches'] = flags.FLAGS.num_batches
    if self.output_dir is not None:
      params['benchmark_log_dir'] = self.output_dir
    return benchmark_cnn.make_params(**params)

  def _binary_search_batch_size(self, params, init_batch_size):
    """Find the max batch_size using binary search.

    Doubles the batch size until a run raises ResourceExhaustedError, then
    binary-searches the resulting range. The result is reported via
    report_benchmark under the 'max_batch_size' extra.

    Args:
      params: Params tuple to run with; batch_size, num_batches and
        num_warmup_batches are overridden during the search.
      init_batch_size: Positive batch size to start the search from.
    """
    assert init_batch_size > 0
    low_batch_size = 0
    high_batch_size = None
    batch_size = init_batch_size
    # No need to run a warmup or many batches; if it doesn't OOM after 10
    # batches, it should work in general.
    params = params._replace(num_batches=10, num_warmup_batches=0)
    # Find high_batch_size first.
    tf.logging.info(
        'Looking for upper bound to batch size, starting with %d' % batch_size)
    while high_batch_size is None:
      tf.logging.info('Trying batch_size %d' % batch_size)
      params = params._replace(batch_size=batch_size)
      bench = benchmark_cnn.BenchmarkCNN(params)
      bench.print_info()
      try:
        bench.run()
        low_batch_size = batch_size
        batch_size *= 2
      except tf.errors.ResourceExhaustedError:
        high_batch_size = batch_size - 1
    # Binary search. Invariant: low_batch_size is known to work and
    # high_batch_size + 1 is known to fail. The original message reported
    # "(low, batch_size]", which excluded the known-good low bound and
    # included the batch size that just OOMed; use the correct closed range.
    tf.logging.info(
        'Max batch size is in range [%d, %d]. Starting binary search to find '
        'exact max batch size.' % (low_batch_size, high_batch_size))
    while low_batch_size < high_batch_size:
      batch_size = (low_batch_size + high_batch_size + 1) // 2
      tf.logging.info('Trying batch_size %d' % batch_size)
      params = params._replace(batch_size=batch_size)
      bench = benchmark_cnn.BenchmarkCNN(params)
      bench.print_info()
      try:
        bench.run()
        low_batch_size = batch_size
      except tf.errors.ResourceExhaustedError:
        high_batch_size = batch_size - 1
    self.report_benchmark(extras={'max_batch_size': low_batch_size})
class Resnet50BenchmarksInferenceCpu(BenchmarkBase):
  """Benchmarks for ResNet50 inference on CPU."""

  def _shared_params(self):
    """Returns shared parameters for all ResNet50 CPU inference benchmarks."""
    base = BenchmarkBase._shared_params(self)
    return base._replace(
        num_gpus=1,
        model='resnet50',
        num_warmup_batches=5,
        num_batches=50,
        distortions=False,
        forward_only=True,
        device='cpu',
        data_format='NHWC',
        num_intra_threads=0)

  def benchmark_synth_forward_batch1(self):
    """Tests 1 CPU batch size 1."""
    self._run_benchmark(self._shared_params()._replace(batch_size=1))

  def benchmark_synth_forward_batch16(self):
    """Tests 1 CPU batch size 16."""
    self._run_benchmark(self._shared_params()._replace(batch_size=16))
class FrozenResnet50BenchmarksInferenceCpu(Resnet50BenchmarksInferenceCpu):
  """Benchmarks for ResNet50 frozen graph inference on CPU."""

  def _shared_params(self):
    """Same as the parent's params, but with graph freezing enabled."""
    parent = super(FrozenResnet50BenchmarksInferenceCpu, self)._shared_params()
    return parent._replace(freeze_when_forward_only=True)
class Resnet50BenchmarksInference(BenchmarkBase):
  """Benchmarks for ResNet50 inference."""

  def _shared_params(self):
    """Returns shared parameters for all ResNet50 inference benchmarks."""
    base = BenchmarkBase._shared_params(self)
    return base._replace(
        num_gpus=1, model='resnet50', distortions=False, forward_only=True)

  def benchmark_synth_forward_batch128(self):
    """Tests 1 GPU batch size 128."""
    self._run_benchmark(self._shared_params()._replace(batch_size=128))

  def benchmark_fp16_synth_forward_batch128(self):
    """Tests 1 GPU batch size 128 FP16."""
    self._run_benchmark(
        self._shared_params()._replace(batch_size=128, use_fp16=True))

  def benchmark_fp16_synth_forward_batch16(self):
    """Tests 1 GPU batch size 16 FP16."""
    self._run_benchmark(
        self._shared_params()._replace(batch_size=16, use_fp16=True))

  def benchmark_xla_synth_forward_batch128(self):
    """Tests 1 GPU batch size 128 with XLA."""
    self._run_benchmark(
        self._shared_params()._replace(batch_size=128, xla=True))

  def benchmark_fp16_xla_synth_forward_batch128(self):
    """Tests 1 GPU batch size 128 FP16 with XLA."""
    self._run_benchmark(self._shared_params()._replace(
        batch_size=128, use_fp16=True, xla=True))

  def benchmark_fp16_xla_synth_forward_batch16(self):
    """Tests 1 GPU batch size 16 FP16 with XLA."""
    self._run_benchmark(self._shared_params()._replace(
        batch_size=16, use_fp16=True, xla=True))
class FrozenResnet50BenchmarksInference(Resnet50BenchmarksInference):
  """Benchmarks for ResNet50 frozen graph inference."""

  def _shared_params(self):
    """Same as the parent's params, but with graph freezing enabled."""
    parent = super(FrozenResnet50BenchmarksInference, self)._shared_params()
    return parent._replace(freeze_when_forward_only=True)

  def benchmark_trt_synth_forward_batch128(self):
    """Tests 1 GPU batch size 128."""
    self._run_benchmark(
        self._shared_params()._replace(batch_size=128, trt_mode='FP32'))

  # TODO(laigd): enable fp16 tests for TF-TRT, it's currently not supported yet.
  # def benchmark_fp16_trt_synth_forward_batch128(self):
  #   """Tests 1 GPU batch size 128 FP16."""
  #   params = self._shared_params()._replace(
  #       batch_size=128, use_fp16=True, trt_mode='FP16')
  #   self._run_benchmark(params)
  # Test with batch size 16 to compare with native TF GPU implementation and
  # XLA.
  # def benchmark_fp16_trt_synth_forward_batch16(self):
  #   """Tests 1 GPU batch size 16 FP16."""
  #   params = self._shared_params()._replace(
  #       batch_size=16, use_fp16=True, trt_mode='FP16')
  #   self._run_benchmark(params)
class Resnet50Benchmarks(BenchmarkBase):
  """Benchmark resnet50 configurations."""
  def _shared_params(self):
    """Returns shared parameters for all ResNet50 benchmarks."""
    return BenchmarkBase._shared_params(self)._replace(
        model='resnet50', batch_size=128, distortions=False,
        optimizer='momentum')
  # FP16 runs use a larger batch size (256), presumably because mixed
  # precision lowers per-image memory use -- TODO confirm.
  def _shared_params_fp16(self):
    """Returns shared parameters for all ResNet50 FP16 benchmarks."""
    return BenchmarkBase._shared_params(self)._replace(
        model='resnet50',
        batch_size=256,
        distortions=False,
        use_fp16=True,
        optimizer='momentum',
        loss_type_to_report='base_loss',
        compute_lr_on_cpu=True,
        single_l2_loss_op=True
    )
  def benchmark_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data."""
    params = self._shared_params()._replace(num_gpus=1)
    self._run_benchmark(params)
  def benchmark_fake_1gpu_gpuparams(self):
    """Tests 1 gpu with fake data."""
    params = self._shared_params()._replace(
        num_gpus=1, data_dir=self.fake_data_dir, data_name='imagenet')
    self._run_benchmark(params)
  def benchmark_synth_1gpu_max_batch_size(self):
    """Finds largest batch size that can be run with 1 gpu using synth data."""
    params = self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server')
    self._binary_search_batch_size(params, init_batch_size=128)
  def benchmark_synth_4gpu_gpureplicated(self):
    """Tests 4 gpu with synthetic data with parameters replicated."""
    params = self._shared_params()._replace(
        num_gpus=4,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  def benchmark_synth_8gpu_gpureplicated(self):
    """Tests 8 gpu with synthetic data with parameters replicated."""
    params = self._shared_params()._replace(
        num_gpus=8,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  def benchmark_fake_8gpu_gpureplicated(self):
    """Tests 8 gpu with fake data with parameters replicated."""
    params = self._shared_params()._replace(
        num_gpus=8,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  # FP16 mixed-precision tests.
  def benchmark_fp16_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data with parameters on the gpu."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1, variable_update='parameter_server')
    self._run_benchmark(params)
  def benchmark_fp16_synth_1gpu_gpuparams_batch128(self):
    """Tests 1 gpu with synthetic data with parameters on the gpu."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1, batch_size=128, variable_update='parameter_server')
    self._run_benchmark(params)
  def benchmark_fp16_synth_4gpu_gpureplicated(self):
    """Tests 4 gpu with synthetic data with nccl and all_reduce."""
    params = self._shared_params_fp16()._replace(
        num_gpus=4,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  def benchmark_fp16_synth_8gpu_gpureplicated(self):
    """Tests 8 gpu with synthetic with nccl and all_reduce."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  def benchmark_fp16_fake_1gpu_gpuparams(self):
    """Tests 1 gpus with fake data."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='parameter_server')
    self._run_benchmark(params)
  def benchmark_fp16_fake_8gpu_gpureplicated(self):
    """Tests 8 gpus with fake data."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  def benchmark_fp16_fakedistort_8gpu_gpureplicated(self):
    """Tests 8 gpus with fake distorted data."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        distortions=True,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  # XLA versions of Resnet50 tests only for single GPU.
  def benchmark_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data with XLA."""
    params = self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, synthetic data with XLA."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True)
    self._run_benchmark(params)
  # Test does not run as part of continuous testing on guitar.
  def benchmark_ng_xla_batch64_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with XLA, synth data, and batch 64."""
    params = self._shared_params()._replace(
        num_gpus=1, batch_size=64, variable_update='parameter_server', xla=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_batch64_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, XLA, synth data, and batch 64."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1,
        batch_size=64,
        variable_update='parameter_server',
        xla=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_batch128_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, XLA, and synth data."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1,
        batch_size=128,
        variable_update='parameter_server',
        xla=True)
    self._run_benchmark(params)
  def benchmark_xla_synth_1gpu_max_batch_size(self):
    """Finds largest batch that can be run with XLA, 1 gpu, and synth data."""
    params = self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True)
    self._binary_search_batch_size(params, init_batch_size=128)
  def benchmark_xla_real_1gpu_gpuparams(self):
    """Tests 1 gpu with real data with XLA."""
    params = self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.data_dir,
        variable_update='parameter_server',
        xla=True)
    self._run_benchmark(params)
  # Test does not run as part of continuous testing.
  def benchmark_xla_fake_1gpu_gpuparams(self):
    """Tests 1 gpu with fake data with XLA."""
    params = self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='parameter_server',
        xla=True)
    self._run_benchmark(params)
  # Test does not run as part of continuous testing.
  def benchmark_xla_fakedistort_1gpu_gpuparams(self):
    """Tests 1 gpu with fake distorted data with XLA."""
    params = self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        distortions=True,
        variable_update='parameter_server',
        xla=True)
    self._run_benchmark(params)
class Resnet50v15Benchmarks(BenchmarkBase):
  """Benchmark various ResNet50V1.5 configurations.
  ResNetV1.5 differs from V1 in stride 2 is used in the first 3x3 convolution of
  each block instead of the first 1x1 convolution.
  """
  def _shared_params_fp16(self):
    """Returns shared parameters for all ResNet50v1.5 FP16 benchmarks."""
    return BenchmarkBase._shared_params(self)._replace(
        model='resnet50_v1.5',
        batch_size=256,
        distortions=False,
        use_fp16=True,
        optimizer='momentum',
        loss_type_to_report='base_loss',
        compute_lr_on_cpu=True,
        single_l2_loss_op=True
    )
  def benchmark_fp16_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data."""
    params = self._shared_params_fp16()._replace(num_gpus=1)
    self._run_benchmark(params)
  def benchmark_fp16_batch256_synth_8gpu_gpuparams(self):
    """Tests 8 gpus with synthetic data at batch 256."""
    params = self._shared_params_fp16()._replace(num_gpus=8)
    self._run_benchmark(params)
  def benchmark_fp16_batch128_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data at batch 128 (useful for small GPUs)."""
    params = self._shared_params_fp16()._replace(num_gpus=1, batch_size=128)
    self._run_benchmark(params)
  def benchmark_fp16_fake_1gpu_gpuparams(self):
    """Tests 1 gpu with fake data."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1, data_dir=self.fake_data_dir, data_name='imagenet')
    self._run_benchmark(params)
  def benchmark_fp16_synth_8gpu_gpureplicated(self):
    """Tests 8 gpu with synthetic data with parameters replicated."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        num_batches=200,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  def benchmark_fp16_fake_8gpu_gpureplicated(self):
    """Tests 8 gpu with fake data with parameters replicated."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        num_batches=200,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2)
    self._run_benchmark(params)
  # XLA versions of Resnet50v1.5 tests.
  def benchmark_fp16_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, synthetic data with XLA."""
    params = self._shared_params_fp16()._replace(num_gpus=1, xla=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_batch128_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, batch128, synthetic data with XLA."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1, batch_size=128, xla=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_compile_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data."""
    params = self._shared_params_fp16()._replace(num_gpus=1, xla_compile=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_compile_batch128_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data at batch 128 (useful for small GPUs)."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1, num_batches=200, batch_size=128, xla_compile=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_batch256_synth_8gpu_gpuparams(self):
    """Tests 8 gpu with synthetic data and xla autojit."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8, num_batches=200, batch_size=256, xla=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_compile_fake_1gpu_gpuparams(self):
    """Tests 1 gpu with fake data."""
    params = self._shared_params_fp16()._replace(
        num_gpus=1,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        xla_compile=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_compile_synth_8gpu_gpureplicated(self):
    """Tests 8 gpu with synthetic data with parameters replicated."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        num_batches=200,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2,
        xla_compile=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_synth_8gpu_gpureplicated(self):
    """Tests 8 gpu with synthetic data with parameters replicated."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        num_batches=200,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2,
        xla=True)
    self._run_benchmark(params)
  def benchmark_fp16_xla_compile_fake_8gpu_gpureplicated(self):
    """Tests 8 gpu with fake data with parameters replicated."""
    params = self._shared_params_fp16()._replace(
        num_gpus=8,
        num_batches=200,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2,
        xla_compile=True)
    self._run_benchmark(params)
class Vgg16Benchmarks(BenchmarkBase):
  """Benchmark various vgg16 configurations."""

  def _shared_params(self):
    """Returns shared parameters for all vgg16 benchmarks."""
    base = BenchmarkBase._shared_params(self)
    return base._replace(model='vgg16', batch_size=128, distortions=False)

  def benchmark_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data with parameters on gpu."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server'))

  def benchmark_fp16_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data with parameters on gpu."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, use_fp16=True, variable_update='parameter_server'))

  def benchmark_synth_8gpu_gpureplicated(self):
    """Tests 8 gpu with synthetic data with parameters replicated."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=8,
        all_reduce_spec='nccl',
        variable_update='replicated',
        compact_gradient_transfer=False,
        gradient_repacking=2))

  # XLA versions of VGG16 tests only for single GPU.
  def benchmark_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True))

  def benchmark_fp16_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, synthetic data, and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True,
        use_fp16=True))

  # Test does not run as part of continuous testing.
  def benchmark_xla_fake_1gpu_gpuparams(self):
    """Tests 1 gpu with fake data and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='parameter_server',
        xla=True))

  def benchmark_xla_real_1gpu_gpuparams(self):
    """Tests 1 gpu with real data and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.data_dir,
        variable_update='parameter_server',
        xla=True))
class TrivialBenchmarks(BenchmarkBase):
  """Benchmarks for the trivial model.

  The purpose of these tests is to verify the upper bound for the input
  pipeline. Fake data creates an upperbound on the input pipeline throughput.
  """

  def _shared_params(self):
    """Returns shared parameters for all trivial benchmarks."""
    return BenchmarkBase._shared_params(self)._replace(
        model='trivial',
        num_gpus=8,
        distortions=False,
        variable_update='independent',
        data_dir=self.fake_data_dir)

  def benchmark_fake_64batch(self):
    self._run_benchmark(self._shared_params()._replace(
        batch_size=64, data_name='imagenet'))

  def benchmark_fake_128batch(self):
    self._run_benchmark(self._shared_params()._replace(
        batch_size=128, data_name='imagenet'))

  def benchmark_fake_256batch(self):
    self._run_benchmark(self._shared_params()._replace(
        batch_size=256, data_name='imagenet'))

  def benchmark_fakedistort_128batch(self):
    self._run_benchmark(self._shared_params()._replace(
        batch_size=128, data_name='imagenet', distortions=True))
class AlexnetBenchmarks(BenchmarkBase):
  """Benchmarks for alexnet."""

  def _shared_params(self):
    """Returns shared parameters for all alexnet benchmarks."""
    base = BenchmarkBase._shared_params(self)
    return base._replace(model='alexnet', batch_size=512, distortions=False)

  def benchmark_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data with parameters on gpu."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server'))

  def benchmark_fp16_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data with parameters on gpu."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, use_fp16=True, variable_update='parameter_server'))

  def benchmark_synth_8gpu_gpureplicated(self):
    """Tests 8 gpus with synthetic data with parameters replicated."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=8,
        variable_update='replicated',
        all_reduce_spec='nccl',
        compact_gradient_transfer=False,
        gradient_repacking=2))

  def benchmark_fake_8gpu_gpureplicated(self):
    """Tests 8 gpus with fake data with parameters replicated."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=8,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='replicated',
        all_reduce_spec='nccl',
        compact_gradient_transfer=False,
        gradient_repacking=2))

  # XLA Benchmark tests for AlexNet.
  def benchmark_xla_synth_1gpuparams(self):
    """Tests 1 gpu with synthetic data and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True))

  def benchmark_fp16_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, synthetic data and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True,
        use_fp16=True))

  # Test does not run as part of continuous testing.
  def benchmark_xla_fake_1gpuparams(self):
    """Tests 1 gpu with fake data and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='parameter_server',
        xla=True))

  def benchmark_xla_real_1gpuparams(self):
    """Tests 1 gpu with real data and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.data_dir,
        variable_update='parameter_server',
        xla=True))
class InceptionV3Benchmarks(BenchmarkBase):
  """Benchmark for InceptionV3."""

  def _shared_params(self):
    """Returns shared parameters for all InceptionV3 benchmarks."""
    base = BenchmarkBase._shared_params(self)
    return base._replace(model='inception3', batch_size=64, distortions=False)

  def benchmark_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server'))

  def benchmark_fp16_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic data."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, use_fp16=True, variable_update='parameter_server'))

  def benchmark_synth_1gpu_max_batch_size(self):
    """Finds largest batch size that can be run with 1 gpu using synth data."""
    self._binary_search_batch_size(
        self._shared_params()._replace(
            num_gpus=1, variable_update='parameter_server'),
        init_batch_size=128)

  def benchmark_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with synthetic and XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True))

  def benchmark_fp16_xla_synth_1gpu_gpuparams(self):
    """Tests 1 gpu with fp16, XLA and synthetic data."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1, variable_update='parameter_server', xla=True,
        use_fp16=True))

  def benchmark_xla_synth_1gpu_max_batch_size(self):
    """Finds largest batch that can be run with XLA, 1 gpu, and synth data."""
    self._binary_search_batch_size(
        self._shared_params()._replace(
            num_gpus=1, variable_update='parameter_server', xla=True),
        init_batch_size=128)

  # Test does not run as part of continuous testing.
  def benchmark_xla_fake_1gpu_gpuparams(self):
    """Tests 1 gpu with fake data with XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.fake_data_dir,
        data_name='imagenet',
        variable_update='parameter_server',
        xla=True))

  def benchmark_xla_real_1gpu_gpuparams(self):
    """Tests 1 gpu with real data with XLA."""
    self._run_benchmark(self._shared_params()._replace(
        num_gpus=1,
        data_dir=self.data_dir,
        variable_update='parameter_server',
        xla=True))
class NcfBenchmarks(BenchmarkBase):
  """Benchmarks for neural collaborative filtering."""

  def _shared_params(self):
    """Returns shared parameters for all NCF benchmarks."""
    return BenchmarkBase._shared_params(self)._replace(
        model='ncf', batch_size=64*1024, num_gpus=1, num_warmup_batches=1)

  def benchmark_synth_1gpu_gpuparams(self):
    self._run_benchmark(
        self._shared_params()._replace(variable_update='parameter_server'))

  def benchmark_fp16_synth_1gpu_gpuparams(self):
    self._run_benchmark(self._shared_params()._replace(
        variable_update='parameter_server', use_fp16=True))

  def benchmark_xla_synth_1gpu_gpuparams(self):
    self._run_benchmark(self._shared_params()._replace(
        variable_update='parameter_server', xla=True))

  def benchmark_fp16_xla_synth_1gpu_gpuparams(self):
    self._run_benchmark(self._shared_params()._replace(
        variable_update='parameter_server', xla=True, use_fp16=True))

  def benchmark_xla_compile_synth_1gpu_gpuparams(self):
    self._run_benchmark(self._shared_params()._replace(
        variable_update='parameter_server', xla_compile=True))

  def benchmark_fp16_xla_compile_synth_1gpu_gpuparams(self):
    self._run_benchmark(self._shared_params()._replace(
        variable_update='parameter_server', xla_compile=True, use_fp16=True))
class DeepSpeech2Benchmarks(BenchmarkBase):
  """Benchmarks for the DeepSpeech2 model."""

  def _shared_params(self):
    # Every DeepSpeech2 benchmark uses librispeech, batch size 32, one GPU.
    return BenchmarkBase._shared_params(self)._replace(
        model='deepspeech2', batch_size=32, num_gpus=1,
        data_name='librispeech')

  def _run_with(self, **overrides):
    # Applies per-benchmark overrides on top of the shared params and runs.
    self._run_benchmark(self._shared_params()._replace(**overrides))

  def benchmark_synth_1gpu_gpuparams(self):
    self._run_with(variable_update='parameter_server')

  def benchmark_xla_synth_1gpu_gpuparams(self):
    self._run_with(variable_update='parameter_server', xla=True)

  def benchmark_xla_compile_synth_1gpu_gpuparams(self):
    self._run_with(variable_update='parameter_server', xla_compile=True)
class SsdBenchmarks(BenchmarkBase):
  """Benchmarks for SSD model."""

  def _cudnn_version(self):
    # Returns the cuDNN version loaded into this process as an int
    # (e.g. 7300 for cuDNN 7.3), or None if it cannot be determined.
    if sys.platform == 'win32':
      # ctypes.cdll.LoadLibrary(None) is not usable on Windows.
      return None
    # Loading None yields a handle to the running process itself, letting us
    # probe for cuDNN symbols that TensorFlow has already loaded.
    lib = ctypes.cdll.LoadLibrary(None)
    if hasattr(lib, 'cudnnGetErrorString'):
      version = lib.cudnnGetVersion()
      return version
    return None

  def _shared_params(self):
    """Returns params shared by all SSD benchmarks.

    Raises:
      RuntimeError: If the detected cuDNN version is missing or older than
        7.3, which SSD needs for fp16 support (b/112048183).
    """
    cudnn_version = self._cudnn_version()
    if cudnn_version is None or cudnn_version < 7300:
      raise RuntimeError(
          'Needs at least cuDNN 7.3 to work with fp16 (b/112048183). '
          'Build with --define=use_experimental_cudnn=1')
    # NOTE(review): the helper names 'backborn'/'backboard' below are as
    # declared in platforms_util; presumably intentional — confirm there.
    return BenchmarkBase._shared_params(self)._replace(
        # TODO(b/115672206): Replace backbone model and data dir with replicated
        # placer location for better performance.
        backbone_model_path=platforms_util.get_ssd_backborn_model_file(),  # pylint: disable=line-too-long
        data_dir=platforms_util.get_ssd_backboard_data_dir(),
        batch_size=128,
        data_name='coco',
        model='ssd300',
        num_batches=10,
        num_warmup_batches=1,
        num_gpus=1,
        optimizer='momentum',
        momentum=0.9,
        weight_decay=5e-4,
        loss_type_to_report='base_loss',
        single_l2_loss_op=True,
        compute_lr_on_cpu=True,
    )

  def benchmark_xla_compile_real_1gpu_gpuparams(self):
    """Tests 1 gpu with real data and xla_compile."""
    params = self._shared_params()._replace(
        num_gpus=1,
        xla_compile=True,
    )
    self._run_benchmark(params)

  def benchmark_real_1gpu_gpuparams(self):
    """Tests 1 gpu with real data."""
    params = self._shared_params()._replace(num_gpus=1,)
    self._run_benchmark(params)

  def benchmark_xla_compile_fp16_real_1gpu_gpuparams(self):
    """Tests 1 gpu with real data, fp16 and xla_compile."""
    params = self._shared_params()._replace(
        num_gpus=1, xla_compile=True, use_fp16=True)
    self._run_benchmark(params)

  def benchmark_fp16_real_1gpu_gpuparams(self):
    """Tests 1 gpu with real data and fp16."""
    params = self._shared_params()._replace(num_gpus=1, use_fp16=True)
    self._run_benchmark(params)

  def benchmark_xla_compile_real_8gpu_gpuparams(self):
    """Tests 8 gpus with real data and xla_compile."""
    params = self._shared_params()._replace(
        num_gpus=8,
        xla_compile=True,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2,
        num_batches=50,
    )
    self._run_benchmark(params)

  def benchmark_real_8gpu_gpuparams(self):
    """Tests 8 gpus with real data."""
    params = self._shared_params()._replace(
        num_gpus=8,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2,
        num_batches=50,
    )
    self._run_benchmark(params)

  def benchmark_xla_compile_fp16_real_8gpu_gpuparams(self):
    """Tests 8 gpus with real data, fp16 and xla_compile."""
    params = self._shared_params()._replace(
        num_gpus=8,
        xla_compile=True,
        use_fp16=True,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2,
        num_batches=50,
    )
    self._run_benchmark(params)

  def benchmark_fp16_real_8gpu_gpuparams(self):
    """Tests 8 gpus with real data and fp16."""
    params = self._shared_params()._replace(
        num_gpus=8,
        use_fp16=True,
        variable_update='replicated',
        all_reduce_spec='nccl',
        gradient_repacking=2,
        num_batches=50,
    )
    self._run_benchmark(params)
# Run under TF1 semantics; tf.test.main discovers the benchmark/test classes.
if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains functions related to MLPerf compliance.
MLPerf requires submissions to log what the benchmark does, in order to verify
that the benchmark meets the MLPerf requirements. This module contains a global
object `logger` that is used by other files to log what tf_cnn_benchmarks does
for compliance.
By default, `logger` does nothing, as the MLPerf compliance logs are verbose and
unnecessary if one is not concerned about MLPerf compliance. The logger can be
enabled by using the `mlperf_logger` context manager.
To enable the logger with `mlperf_logger`, the MLPerf compliance library at
https://github.com/mlperf/training/tree/master/compliance is required. If
the logger is not enabled, the library is not needed.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import contextlib
import os
import sys
import tensorflow.compat.v1 as tf
# pylint: disable=g-import-not-at-top
try:
  # Not all users have the MLPerf compliance library, so we don't want to
  # unconditionally crash if these imports fail.
  from mlperf_compliance import mlperf_log
  from mlperf_compliance import resnet_log_helper
  from mlperf_compliance import tags
  import_successful = True
except ImportError:
  # The logger cannot be enabled in this case since the MLPerf library isn't
  # found. We return empty strings from the `tags` attribute so that
  # the benchmark can still run without crashing. These empty tags are passed
  # to an instance of `NullMlPerfLogger`, which does not log anything and
  # ignores the tag values.
  class _Tags(object):
    # Stand-in for `mlperf_compliance.tags`: any attribute lookup yields ''.
    def __getattr__(self, item):
      return ''
  tags = _Tags()
  import_successful = False
# pylint: enable=g-import-not-at-top
# Per-model logging info: the print function to call, the set of tags valid
# for the model, and the model name to emit in MLPerf log lines.
_ModelInfo = namedtuple('_ModelInfo', ['print_fn', 'tag_set',
                                       'mlperf_model_name'])

# Prefix required on every MLPerf v0.5.0 compliance log line.
_MLPERF_LOG_PREFIX = ':::MLPv0.5.0'
class MlPerfLogger(object):
  """Logs various aspects about a benchmark run for MLPerf compliance."""

  def __init__(self, model):
    """Creates a compliance logger for `model`.

    Args:
      model: the model being benchmarked; must be 'resnet50_v1.5' or 'ssd300'.

    Raises:
      ValueError: if `model` is not one of the MLPerf-supported models.
    """
    # The mlperf_log helpers report caller locations relative to this root.
    self._root_dir = os.path.split(os.path.abspath(__file__))[0]
    mlperf_log.ROOT_DIR_RESNET = self._root_dir
    mlperf_log.ROOT_DIR_SSD = self._root_dir
    self.model = model
    model_to_info = {
        'resnet50_v1.5': _ModelInfo(mlperf_log.resnet_print,
                                    mlperf_log.RESNET_TAG_SET, tags.RESNET),
        'ssd300': _ModelInfo(mlperf_log.ssd_print, mlperf_log.SSD_TAG_SET,
                             tags.SSD)
    }
    try:
      self._log_fn, self.tag_set, self.mlperf_model_name = model_to_info[model]
    except KeyError:
      raise ValueError('--ml_perf_compliance_logging is only compatible when '
                       '--model is one of the following: ' +
                       ', '.join(model_to_info.keys()))

  def log(self, key, value=None, stack_offset=2):
    """Logs `key`/`value` if `key` is a valid tag for this model.

    Invalid keys are printed and skipped rather than raising, so callers can
    log unconditionally. `stack_offset` tells the underlying print function
    how many frames up the real caller is.
    """
    if key in self.tag_set:
      self._log_fn(key, value, stack_offset)
    else:
      print('Ignoring MLPerf logging item key=%s, value=%s for model %s' %
            (key, value, self.model))

  def log_deferred_tensor_value(self, key, tensor_value, global_step,
                                stack_offset=2, every_n=1):
    """Logs the value of a tensor when the graph is run."""
    # The caller location is captured at graph-construction time, since the
    # Python stack is gone by the time the print op actually runs.
    caller = '(%s)' % mlperf_log.get_caller(stack_offset, self._root_dir)
    def create_print_op():
      return tf.print(_MLPERF_LOG_PREFIX, self.mlperf_model_name,
                      tf.timestamp(), caller, key,
                      ': { "deferred": true, "value":', tensor_value, '}',
                      output_stream=sys.stdout)
    # Only print on steps where global_step is a multiple of every_n.
    maybe_print = tf.cond(tf.equal(global_step % every_n, 0), create_print_op,
                          tf.no_op)
    with tf.control_dependencies([maybe_print]):
      # Returned tensor carries the print as a control dependency, so using
      # it forces the (possible) log line to be emitted.
      return tf.identity(tensor_value)

  def log_max_pool(self, input_tensor, output_tensor):
    # Only ResNet logs layer-level tags; other models ignore this call.
    if self.model == 'resnet50_v1.5':
      resnet_log_helper.log_max_pool(input_tensor, output_tensor)

  def log_begin_block(self, input_tensor, block_type):
    if self.model == 'resnet50_v1.5':
      resnet_log_helper.log_begin_block(input_tensor, block_type)

  def log_end_block(self, output_tensor):
    if self.model == 'resnet50_v1.5':
      resnet_log_helper.log_end_block(output_tensor)

  def log_projection(self, input_tensor, output_tensor):
    if self.model == 'resnet50_v1.5':
      resnet_log_helper.log_projection(input_tensor, output_tensor)

  def log_conv2d(self, input_tensor, output_tensor, stride_height, stride_width,
                 filters, initializer, use_bias):
    """Log a conv2d call."""
    if self.model == 'resnet50_v1.5':
      assert stride_height == stride_width, (
          '--ml_perf_compliance_logging does not support convolutions where '
          'the stride height is not equal to the stride width. '
          'stride_height=%d, stride_width=%d' % (stride_height, stride_width))
      # Normalize the initializer object to the string name MLPerf expects.
      if isinstance(initializer, tf.truncated_normal_initializer) or (
          isinstance(initializer, tf.variance_scaling_initializer) and
          initializer.distribution == 'truncated_normal'):
        initializer = tags.TRUNCATED_NORMAL
      elif (isinstance(initializer, tf.glorot_uniform_initializer) or
            initializer is None):
        initializer = 'glorot_uniform'
      resnet_log_helper.log_conv2d(input_tensor, output_tensor, stride_width,
                                   filters, initializer, use_bias)

  def log_batch_norm(self, input_tensor, output_tensor, momentum, epsilon,
                     center, scale, training):
    if self.model == 'resnet50_v1.5':
      resnet_log_helper.log_batch_norm(input_tensor, output_tensor, momentum,
                                       epsilon, center, scale, training)

  def log_train_epochs(self, num_epochs):
    """Logs all the TRAIN_EPOCHs log lines."""
    num_epochs_int = int(num_epochs)
    for i in range(num_epochs_int):
      # MLPerf allows us to print all the train epochs at once instead of
      # printing them as we do them.
      self.log(key=mlperf_log.TRAIN_EPOCH, value=i, stack_offset=3)
    if num_epochs_int != num_epochs:
      # Log the trailing fractional epoch with a note about its size.
      value = (str(num_epochs_int) +
               ', but this epoch only has {}% of the examples of a normal epoch'
               .format(100 * (num_epochs - num_epochs_int)))
      self.log(key=mlperf_log.TRAIN_EPOCH, value=value, stack_offset=3)

  def log_input_resize_aspect_preserving(self, height, width, scale_factor):
    # MLPerf's resize tag carries a single 'min' dimension, so only square
    # target sizes can be represented.
    assert height == width, (
        '--ml_perf_compliance_logging does not support models with nonsquare '
        'images. Cannot process image with height=%d and width=%d' %
        (height, width))
    self.log(key=tags.INPUT_RESIZE_ASPECT_PRESERVING,
             value={'min': int(height * scale_factor)})

  def log_eval_epoch(self, tag, global_step, batch_size, stack_offset=2):
    # SSD logs the epoch number; 118287 is the COCO train-set size used to
    # convert steps to epochs (NOTE(review): confirm against the dataset).
    if self.model == 'resnet50_v1.5':
      self.log(key=tag, stack_offset=stack_offset+1)
    elif self.model == 'ssd300':
      epoch = int(global_step * batch_size / 118287)
      self.log(key=tag, value=epoch, stack_offset=stack_offset+1)

  def log_eval_accuracy(self, accuracy, global_step, batch_size,
                        examples_per_epoch, stack_offset=2):
    """Logs eval accuracy."""
    epoch = int(global_step * batch_size / examples_per_epoch)
    eval_accuracy = {'epoch': epoch, 'value': accuracy}
    eval_iteration_accuracy = {'iteration': global_step, 'value': accuracy}
    self.log(key=tags.EVAL_ACCURACY, value=eval_accuracy,
             stack_offset=stack_offset+1)
    self.log(key=tags.EVAL_ITERATION_ACCURACY,
             value=eval_iteration_accuracy,
             stack_offset=stack_offset+1)
def _empty_fn(*args, **kwargs):
del args, kwargs
class NullMlPerfLogger(object):
  """Drop-in replacement for `MlPerfLogger` that logs nothing.

  Exposes the same interface as `MlPerfLogger`, but every call is a no-op.
  This is the logger used when MLPerf compliance logging is disabled, which
  is the default behavior.
  """

  def __getattr__(self, item):
    # Any logging method resolves to a shared no-op callable.
    return _empty_fn

  def log_deferred_tensor_value(self, key, tensor_value, *args, **kwargs):
    """Returns `tensor_value` unchanged; no deferred print op is attached."""
    del key, args, kwargs
    return tensor_value
# A global singleton logger. By default, it's the null logger but can be
# switched to an MlPerfLogger with `mlperf_logger()`, which reassigns and
# later restores this binding.
logger = NullMlPerfLogger()
@contextlib.contextmanager
def mlperf_logger(use_mlperf_logger, model):
  """Context manager that optionally installs the MLPerf compliance logger.

  While active and `use_mlperf_logger` is True, the module-global `logger`
  is an `MlPerfLogger` for `model`; the previous logger is restored on exit,
  even if the body raises. When `use_mlperf_logger` is False this is a no-op.

  Args:
    use_mlperf_logger: If True, enables the mlperf logger. If False, this
      function does nothing.
    model: The model that will be logged. Required, because different models
      must log different things for MLPerf compliance.

  Yields:
    Nothing.

  Raises:
    ImportError: If `use_mlperf_logger` is True but the MLPerf compliance
      library cannot be imported
  """
  global logger
  if not use_mlperf_logger:
    yield
    return
  if not import_successful:
    raise ImportError('Failed to import MLPerf compliance library, which is '
                      'required when --ml_perf_compliance_logging is '
                      'specified. Clone this repo and add this directory '
                      'https://github.com/mlperf/training/tree/master/'
                      'compliance to the PYTHONPATH environmental variable.')
  # Construct before swapping so a construction failure leaves `logger` as-is.
  previous_logger = logger
  logger = MlPerfLogger(model)
  try:
    yield
  finally:
    logger = previous_logger
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains tests related to MLPerf.
Note this test only passes if the MLPerf compliance library is installed.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import Counter
import logging
import re
import six
import tensorflow.compat.v1 as tf
import benchmark_cnn
import datasets
import mlperf
import test_util
from models import model
from mlperf_compliance import mlperf_log
class _MlPerfTestModel(model.CNNModel):
  """A model to test the MLPerf compliance logging on."""

  def __init__(self):
    # Tiny model: 224x224 images, batch size 2, learning rate 1.
    super(_MlPerfTestModel, self).__init__(
        'mlperf_test_model', image_size=224, batch_size=2, learning_rate=1)

  def add_inference(self, cnn):
    # Expects NCHW input: (batch, 3, 224, 224).
    assert cnn.top_layer.shape[1:] == (3, 224, 224)

    # Minimal layer stack: conv+BN, max-pool, flatten, single affine output.
    cnn.conv(1, 1, 1, 1, 1, use_batch_norm=True)
    cnn.mpool(1, 1, 1, 1, num_channels_in=1)
    cnn.reshape([-1, 224 * 224])
    cnn.affine(1, activation=None)

    # Assert that the batch norm variables are filtered out for L2 loss.
    variables = tf.global_variables() + tf.local_variables()
    assert len(variables) > len(self.filter_l2_loss_vars(variables))
class MlPerfComplianceTest(tf.test.TestCase):
  """Tests the MLPerf compliance logs.

  This serves as a quick check that we probably didn't break the compliance
  logging. It is not meant to be as comprehensive as the official MLPerf
  compliance checker will be.
  """

  def setUp(self):
    super(MlPerfComplianceTest, self).setUp()
    benchmark_cnn.setup(benchmark_cnn.make_params())

  # Map between regex and the number of times we expect to see that regex in
  # the logs. Entries commented out with the comment FIXME indicate that
  # tf_cnn_benchmarks currently fails compliance in that regard, and needs to
  # be fixed to be MLPerf compliant.
  EXPECTED_LOG_REGEXES = {
      # Preprocessing tags
      mlperf.tags.INPUT_ORDER: 2,  # 1 for training, 1 for eval
      # We pass --tf_random_seed=9876 in the test.
      r'%s: 9876' % mlperf.tags.RUN_SET_RANDOM_SEED: 2,
      # The Numpy random seed is hardcoded to 4321.
      r'%s: 4321' % mlperf.tags.RUN_SET_RANDOM_SEED: 2,
      r'%s: %d' % (mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES,
                   datasets.IMAGENET_NUM_TRAIN_IMAGES): 1,
      r'%s: %d' % (mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES,
                   datasets.IMAGENET_NUM_VAL_IMAGES): 1,
      mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES + '.*': 1,
      mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV + '.*': 1,
      mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE + '.*': 1,
      mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE + '.*': 1,
      mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS + '.*': 1,
      mlperf.tags.INPUT_RANDOM_FLIP + '.*': 1,
      r'%s: \[224, 224\].*' % mlperf.tags.INPUT_CENTRAL_CROP: 1,
      r'%s: \[123.68, 116.78, 103.94\].*' % mlperf.tags.INPUT_MEAN_SUBTRACTION:
          2,
      r'%s: {"min": 256}.*' % mlperf.tags.INPUT_RESIZE_ASPECT_PRESERVING: 1,
      # 1 for training, 1 for eval
      r'%s: \[224, 224\].*' % mlperf.tags.INPUT_RESIZE: 2,

      # Resnet model tags
      mlperf.tags.MODEL_HP_BATCH_NORM + '.*': 2,
      # 2 for training, 2 for eval. Although there's only 1 conv2d, each conv2d
      # produces 2 logs.
      mlperf.tags.MODEL_HP_CONV2D_FIXED_PADDING + '.*': 4,
      mlperf.tags.MODEL_HP_RELU + '.*': 2,
      mlperf.tags.MODEL_HP_INITIAL_MAX_POOL + '.*': 2,
      # This entry was accidentally duplicated in the dict literal; the
      # identical duplicate has been removed.
      mlperf.tags.MODEL_HP_DENSE + '.*': 4,
      # Note that tags our test model does not emit, like MODEL_HP_SHORTCUT_ADD,
      # are omitted here.
      r'%s: "categorical_cross_entropy".*' % mlperf.tags.MODEL_HP_LOSS_FN: 1,
      # 1 for training, 2 because the _MlPerfTestModel calls this when building
      # the model for both training and eval
      r'%s: true' % mlperf.tags.MODEL_EXCLUDE_BN_FROM_L2: 3,
      r'%s: 0.5.*' % mlperf.tags.MODEL_L2_REGULARIZATION: 1,

      # Note we do not handle OPT_LR, since that is printed to stderr using
      # tf.Print, which we cannot easily intercept.

      # Other tags
      '%s: "%s"' % (mlperf.tags.OPT_NAME, mlperf.tags.SGD_WITH_MOMENTUM): 1,
      '%s: 0.5' % mlperf.tags.OPT_MOMENTUM: 1,
      mlperf.tags.RUN_START: 1,
      '%s: 2' % mlperf.tags.INPUT_BATCH_SIZE: 1,
      mlperf.tags.TRAIN_LOOP: 1,
      mlperf.tags.TRAIN_EPOCH + '.*': 1,
      '%s: 2' % mlperf.tags.INPUT_SIZE: 2,
      mlperf.tags.EVAL_START: 2,
      mlperf.tags.EVAL_STOP: 2,
      '%s: 6' % mlperf.tags.EVAL_SIZE: 2,
      mlperf.tags.EVAL_ACCURACY + '.*': 2,
      '%s: 2.0' % mlperf.tags.EVAL_TARGET: 2,
      mlperf.tags.RUN_STOP + '.*': 1,
      mlperf.tags.RUN_FINAL: 1
  }
  # Compile the regexes once; the Counter is compared against the observed
  # per-regex match counts in testMlPerfCompliance.
  EXPECTED_LOG_REGEXES = Counter({re.compile(k): v for
                                  k, v in EXPECTED_LOG_REGEXES.items()})

  def testMlPerfCompliance(self):
    """Runs a tiny training job and checks the emitted compliance lines."""
    string_io = six.StringIO()
    handler = logging.StreamHandler(string_io)
    data_dir = test_util.create_black_and_white_images()
    try:
      mlperf_log.LOGGER.addHandler(handler)
      params = benchmark_cnn.make_params(data_dir=data_dir,
                                         data_name='imagenet',
                                         batch_size=2,
                                         num_warmup_batches=0,
                                         num_batches=2,
                                         num_eval_batches=3,
                                         eval_during_training_every_n_steps=1,
                                         distortions=False,
                                         weight_decay=0.5,
                                         optimizer='momentum',
                                         momentum=0.5,
                                         stop_at_top_1_accuracy=2.0,
                                         tf_random_seed=9876,
                                         ml_perf=True)
      with mlperf.mlperf_logger(use_mlperf_logger=True, model='resnet50_v1.5'):
        bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=_MlPerfTestModel())
        bench_cnn.run()
      logs = string_io.getvalue().splitlines()

      # Count how many log lines each expected regex matched.
      log_regexes = Counter()
      for log in logs:
        for regex in self.EXPECTED_LOG_REGEXES:
          if regex.search(log):
            log_regexes[regex] += 1
      if log_regexes != self.EXPECTED_LOG_REGEXES:
        # Report only the regexes whose observed count differs from expected.
        diff_counter = Counter(log_regexes)
        diff_counter.subtract(self.EXPECTED_LOG_REGEXES)
        differences = []
        for regex in (k for k in diff_counter.keys() if diff_counter[k]):
          found_count = log_regexes[regex]
          expected_count = self.EXPECTED_LOG_REGEXES[regex]
          differences.append(' For regex %s: Found %d lines matching but '
                             'expected to find %d' %
                             (regex.pattern, found_count, expected_count))
        raise AssertionError('Logs did not match expected logs. Differences:\n'
                             '%s' % '\n'.join(differences))
    finally:
      mlperf_log.LOGGER.removeHandler(handler)
# Run under TF1 semantics; tf.test.main discovers the test classes above.
if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Alexnet model configuration.
References:
Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton
ImageNet Classification with Deep Convolutional Neural Networks
Advances in Neural Information Processing Systems. 2012
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v1 as tf
from models import model
class AlexnetModel(model.CNNModel):
  """AlexNet CNN model (Krizhevsky et al., 2012)."""

  def __init__(self, params=None):
    # Image size is 224 plus the 3 pixels of padding that the VALID
    # convolution in add_inference requires; batch size 512, LR 0.005.
    super(AlexnetModel, self).__init__(
        'alexnet', 224 + 3, 512, 0.005, params=params)

  def add_inference(self, cnn):
    """Builds the AlexNet layer stack on top of `cnn`."""
    # Note: VALID requires padding the images by 3 in width and height.
    cnn.conv(64, 11, 11, 4, 4, 'VALID')
    cnn.mpool(3, 3, 2, 2)
    cnn.conv(192, 5, 5)
    cnn.mpool(3, 3, 2, 2)
    # Three back-to-back 3x3 convolutions.
    for depth in (384, 384, 256):
      cnn.conv(depth, 3, 3)
    cnn.mpool(3, 3, 2, 2)
    cnn.reshape([-1, 256 * 6 * 6])
    # Two fully-connected layers, each followed by dropout.
    for _ in range(2):
      cnn.affine(4096)
      cnn.dropout()
class AlexnetCifar10Model(model.CNNModel):
  """AlexNet-style CNN for the CIFAR datasets.

  The model architecture follows the one defined in the tensorflow tutorial
  model.
  Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py
  Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf
  """

  def __init__(self, params=None):
    # 32x32 inputs, batch size 128, initial learning rate 0.1.
    super(AlexnetCifar10Model, self).__init__(
        'alexnet', 32, 128, 0.1, params=params)

  def add_inference(self, cnn):
    """Builds conv/pool/LRN layers followed by two affine layers."""
    cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2)
    cnn.mpool(3, 3, 2, 2, mode='SAME')
    cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
    cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2)
    cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
    cnn.mpool(3, 3, 2, 2, mode='SAME')
    # Flatten everything but the batch dimension.
    dims = cnn.top_layer.get_shape().as_list()
    cnn.reshape([-1, dims[1] * dims[2] * dims[3]])
    cnn.affine(384, stddev=0.04, bias=0.1)
    cnn.affine(192, stddev=0.04, bias=0.1)

  def get_learning_rate(self, global_step, batch_size):
    """Decays the LR by 10x every 100 epochs (50000 examples per epoch)."""
    examples_per_epoch = 50000
    epochs_per_decay = 100
    steps_per_decay = epochs_per_decay * examples_per_epoch // batch_size
    return tf.train.exponential_decay(
        self.learning_rate, global_step, steps_per_decay, 0.1, staircase=True)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Densenet model configuration.
References:
"Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
from models import model as model_lib
class DensenetCifar10Model(model_lib.CNNModel):
  """DenseNet configuration for CIFAR-10 (Huang et al., 2016)."""

  def __init__(self, model, layer_counts, growth_rate, params=None):
    # Number of feature maps each dense-block layer appends to its input.
    self.growth_rate = growth_rate
    super(DensenetCifar10Model, self).__init__(
        model, 32, 64, 0.1, layer_counts=layer_counts, params=params)
    self.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True}

  def dense_block(self, cnn, growth_rate):
    """Appends one BN-ReLU-Conv layer and concatenates it with its input."""
    block_input = cnn.top_layer
    out = cnn.batch_norm(block_input, **self.batch_norm_config)
    out = tf.nn.relu(out)
    out = cnn.conv(growth_rate, 3, 3, 1, 1,
                   stddev=np.sqrt(2.0 / 9 / growth_rate),
                   activation=None, input_layer=out)
    if cnn.channel_pos == 'channels_last':
      concat_axis = 3
    else:
      concat_axis = 1
    cnn.top_layer = tf.concat([block_input, out], concat_axis)
    cnn.top_size += growth_rate

  def transition_layer(self, cnn):
    """BN-ReLU-1x1 conv followed by 2x2 average pooling."""
    width = cnn.top_size
    cnn.batch_norm(**self.batch_norm_config)
    cnn.top_layer = tf.nn.relu(cnn.top_layer)
    cnn.conv(width, 1, 1, 1, 1, stddev=np.sqrt(2.0 / 9 / width))
    cnn.apool(2, 2, 2, 2)

  def add_inference(self, cnn):
    """Builds three dense blocks separated by transition layers."""
    if self.layer_counts is None:
      raise ValueError('Layer counts not specified for %s' % self.get_model())
    if self.growth_rate is None:
      raise ValueError('Growth rate not specified for %s' % self.get_model())
    cnn.conv(16, 3, 3, 1, 1, activation=None)
    for block_index, num_layers in enumerate(self.layer_counts[:3]):
      for _ in xrange(num_layers):
        self.dense_block(cnn, self.growth_rate)
      if block_index < 2:
        # No transition layer after the final dense block.
        self.transition_layer(cnn)
    cnn.batch_norm(**self.batch_norm_config)
    cnn.top_layer = tf.nn.relu(cnn.top_layer)
    if cnn.channel_pos == 'channels_last':
      channel_index = 3
    else:
      channel_index = 1
    cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_index]
    cnn.spatial_mean()

  def get_learning_rate(self, global_step, batch_size):
    """Piecewise-constant LR with 10x drops after epochs 150, 225 and 300."""
    batches_per_epoch = 50000 // batch_size
    boundaries = list(batches_per_epoch *
                      np.array([150, 225, 300], dtype=np.int64))
    return tf.train.piecewise_constant(global_step, boundaries,
                                       [0.1, 0.01, 0.001, 0.0001])
def create_densenet40_k12_model():
  """Returns a DenseNet-40 model with growth rate k=12."""
  return DensenetCifar10Model('densenet40_k12', (12, 12, 12), 12)
def create_densenet100_k12_model():
  """Returns a DenseNet-100 model with growth rate k=12."""
  return DensenetCifar10Model('densenet100_k12', (32, 32, 32), 12)
def create_densenet100_k24_model():
  """Returns a DenseNet-100 model with growth rate k=24."""
  return DensenetCifar10Model('densenet100_k24', (32, 32, 32), 24)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""DeepSpeech2 model configuration.
References:
https://arxiv.org/abs/1512.02595
Deep Speech 2: End-to-End Speech Recognition in English and Mandarin
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
import constants
from cnn_util import log_fn
from models import model as model_lib
from tensorflow.python.ops import variables # pylint: disable=g-direct-tensorflow-import
class DeepSpeechDecoder(object):
  """Greedy decoder implementation for Deep Speech model."""

  def __init__(self, labels, blank_index=28):
    """Decoder initialization.

    Args:
      labels: a string specifying the speech labels for the decoder to use.
      blank_index: an integer specifying index for the blank character.
        Defaults to 28.
    """
    self.labels = labels
    self.blank_index = blank_index
    # Maps a class index to its character, e.g. {0: labels[0], ...}.
    self.int_to_char = dict(enumerate(labels))

  @staticmethod
  def _edit_distance_module():
    """Lazily imports and returns the `nltk.metrics.distance` module.

    Returns:
      The `nltk.metrics.distance` module.

    Raises:
      ImportError: with installation instructions when nltk is missing;
        unrelated import failures are re-raised unchanged.
    """
    try:
      from nltk.metrics import distance  # pylint: disable=g-import-not-at-top
      return distance
    except ImportError as e:
      # Bug fix: the original checked `e.message`, which does not exist on
      # Python 3 exceptions and raised AttributeError; use str(e) instead.
      if 'nltk' not in str(e):
        raise
      raise ImportError('To use the experimental deepspeech model, you must '
                        'pip install -U nltk')

  def convert_to_string(self, sequence):
    """Converts a sequence of class indexes into the corresponding string."""
    return ''.join(self.int_to_char[i] for i in sequence)

  def wer(self, decode, target):
    """Computes the Word Error Rate (WER).

    WER is defined as the edit distance between the two provided sentences
    after tokenizing to words.

    Args:
      decode: string of the decoded output.
      target: a string for the ground truth label.

    Returns:
      A float number for the WER of the current decode-target pair.
    """
    distance = self._edit_distance_module()
    # Map each distinct word to a single character so the word error rate
    # reduces to a character-level edit distance.
    words = set(decode.split() + target.split())
    word2char = dict(zip(words, range(len(words))))
    new_decode = [chr(word2char[w]) for w in decode.split()]
    new_target = [chr(word2char[w]) for w in target.split()]
    return distance.edit_distance(''.join(new_decode), ''.join(new_target))

  def cer(self, decode, target):
    """Computes the Character Error Rate (CER).

    CER is defined as the edit distance between the two given strings.

    Args:
      decode: a string of the decoded output.
      target: a string for the ground truth label.

    Returns:
      A float number denoting the CER for the current sentence pair.
    """
    distance = self._edit_distance_module()
    return distance.edit_distance(decode, target)

  def decode(self, char_indexes):
    """Decode the best guess from class indexes using greedy algorithm."""
    # Merge runs of repeated indexes, then drop the blank index (CTC-style).
    merged = [k for k, _ in itertools.groupby(char_indexes)]
    return self.convert_to_string(
        [k for k in merged if k != self.blank_index])

  def decode_logits(self, logits):
    """Decode the best guess from per-timestep logits (greedy)."""
    # Choose the class with maximum probability at each timestep.
    best = list(np.argmax(logits, axis=1))
    return self.decode(best)
class DeepSpeech2Model(model_lib.Model):
  """Define DeepSpeech2 model."""

  # Supported rnn cells, keyed by the `rnn_type` constructor argument.
  SUPPORTED_RNNS = {
      'lstm': tf.nn.rnn_cell.BasicLSTMCell,
      'rnn': tf.nn.rnn_cell.RNNCell,
      'gru': tf.nn.rnn_cell.GRUCell,
  }
  # Parameters for batch normalization (see _batch_norm).
  BATCH_NORM_EPSILON = 1e-5
  BATCH_NORM_DECAY = 0.997
  # Number of output filters for each convolution layer.
  CONV_FILTERS = 32
  def __init__(self,
               num_rnn_layers=5,
               rnn_type='lstm',
               is_bidirectional=True,
               rnn_hidden_size=800,
               use_bias=True,
               params=None):
    """Initialize DeepSpeech2 model.

    Args:
      num_rnn_layers: an integer, the number of rnn layers (default: 5).
      rnn_type: a string, one of the supported rnn cells: gru, rnn or lstm.
      is_bidirectional: a boolean to indicate if the rnn layer is bidirectional.
      rnn_hidden_size: an integer for the number of hidden units in the RNN
        cell.
      use_bias: a boolean specifying whether to use a bias in the last fc layer.
      params: the params from BenchmarkCNN.
    """
    super(DeepSpeech2Model, self).__init__(
        'deepspeech2',
        batch_size=128,
        learning_rate=0.0005,
        fp16_loss_scale=128,
        params=params)
    self.num_rnn_layers = num_rnn_layers
    self.rnn_type = rnn_type
    self.is_bidirectional = is_bidirectional
    self.rnn_hidden_size = rnn_hidden_size
    self.use_bias = use_bias
    # Fixed input/label geometry. NOTE(review): these presumably match the
    # librispeech preprocessing pipeline — confirm against the input reader.
    self.num_feature_bins = 161
    self.max_time_steps = 3494
    self.max_label_length = 576
def _batch_norm(self, inputs, training):
"""Batch normalization layer.
Note that the momentum to use will affect validation accuracy over time.
Batch norm has different behaviors during training/evaluation. With a large
momentum, the model takes longer to get a near-accurate estimation of the
moving mean/variance over the entire training dataset, which means we need
more iterations to see good evaluation results. If the training data is
evenly distributed over the feature space, we can also try setting a smaller
momentum (such as 0.1) to get good evaluation result sooner.
Args:
inputs: input data for batch norm layer.
training: a boolean to indicate if it is in training stage.
Returns:
tensor output from batch norm layer.
"""
return tf.layers.batch_normalization(
inputs=inputs,
momentum=DeepSpeech2Model.BATCH_NORM_DECAY,
epsilon=DeepSpeech2Model.BATCH_NORM_EPSILON,
fused=True,
training=training)
def _conv_bn_layer(self, inputs, padding, filters, kernel_size, strides,
layer_id, training):
"""Defines 2D convolutional + batch normalization layer.
Args:
inputs: input data for convolution layer.
padding: padding to be applied before convolution layer.
filters: an integer, number of output filters in the convolution.
kernel_size: a tuple specifying the height and width of the 2D convolution
window.
strides: a tuple specifying the stride length of the convolution.
layer_id: an integer specifying the layer index.
training: a boolean to indicate which stage we are in (training/eval).
Returns:
tensor output from the current layer.
"""
# Perform symmetric padding on the feature dimension of time_step
# This step is required to avoid issues when RNN output sequence is shorter
# than the label length.
inputs = tf.pad(
inputs,
[[0, 0], [padding[0], padding[0]], [padding[1], padding[1]], [0, 0]])
inputs = tf.layers.conv2d(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding='valid',
use_bias=False,
activation=tf.nn.relu6,
name='cnn_{}'.format(layer_id))
return self._batch_norm(inputs, training)
def _rnn_layer(self, inputs, rnn_cell, rnn_hidden_size, layer_id,
use_batch_norm, is_bidirectional, training):
"""Defines a batch normalization + rnn layer.
Args:
inputs: input tensors for the current layer.
rnn_cell: RNN cell instance to use.
rnn_hidden_size: an integer for the dimensionality of the rnn output
space.
layer_id: an integer for the index of current layer.
use_batch_norm: a boolean specifying whether to perform batch
normalization on input states.
is_bidirectional: a boolean specifying whether the rnn layer is
bi-directional.
training: a boolean to indicate which stage we are in (training/eval).
Returns:
tensor output for the current layer.
"""
if use_batch_norm:
inputs = self._batch_norm(inputs, training)
# Construct forward/backward RNN cells.
fw_cell = rnn_cell(
num_units=rnn_hidden_size, name='rnn_fw_{}'.format(layer_id))
if is_bidirectional:
bw_cell = rnn_cell(
num_units=rnn_hidden_size, name='rnn_bw_{}'.format(layer_id))
outputs, _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw=fw_cell,
cell_bw=bw_cell,
inputs=inputs,
dtype=tf.float32,
swap_memory=True)
rnn_outputs = tf.concat(outputs, -1)
else:
rnn_outputs = tf.nn.dynamic_rnn(
fw_cell, inputs, dtype=tf.float32, swap_memory=True)
return rnn_outputs
def get_input_data_types(self, subset):
"""Returns the list of data types of the inputs."""
del subset # Same data types for both train and validation subsets.
return [self.data_type, tf.int32, tf.int32, tf.int32]
def get_input_shapes(self, subset):
"""Returns the list of shapes of the padded inputs."""
del subset # Same shapes for both train and validation subsets
return [
[self.batch_size, self.max_time_steps, self.num_feature_bins, 1],
[self.batch_size, self.max_label_length],
[self.batch_size, 1],
[self.batch_size, 1],
]
def get_synthetic_inputs(self, input_name, nclass):
inputs = tf.random_uniform(self.get_input_shapes('train')[0],
dtype=self.get_input_data_types('train')[0])
inputs = variables.VariableV1(inputs, trainable=False,
collections=[tf.GraphKeys.LOCAL_VARIABLES],
name=input_name)
labels = tf.convert_to_tensor(
np.random.randint(28, size=[self.batch_size, self.max_label_length]))
input_lengths = tf.convert_to_tensor(
[self.max_time_steps] * self.batch_size)
label_lengths = tf.convert_to_tensor(
[self.max_label_length] * self.batch_size)
return [inputs, labels, input_lengths, label_lengths]
# TODO(laigd): support fp16.
# TODO(laigd): support multiple gpus.
def build_network(self, inputs, phase_train=True, nclass=29):
"""Builds the forward pass of the deepspeech2 model.
Args:
inputs: The input list of the model.
phase_train: True during training. False during evaluation.
nclass: Number of classes that the input spectrogram can belong to.
Returns:
A BuildNetworkResult which contains the logits and model-specific extra
information.
"""
inputs = inputs[0] # Get the spectrogram feature.
# Two cnn layers.
inputs = self._conv_bn_layer(
inputs,
padding=(20, 5),
filters=DeepSpeech2Model.CONV_FILTERS,
kernel_size=(41, 11),
strides=(2, 2),
layer_id=1,
training=phase_train)
inputs = self._conv_bn_layer(
inputs,
padding=(10, 5),
filters=DeepSpeech2Model.CONV_FILTERS,
kernel_size=(21, 11),
strides=(2, 1),
layer_id=2,
training=phase_train)
# output of conv_layer2 with the shape of
# [batch_size (N), times (T), features (F), channels (C)].
# Convert the conv output to rnn input.
# batch_size = tf.shape(inputs)[0]
feat_size = inputs.get_shape().as_list()[2]
inputs = tf.reshape(
inputs,
[self.batch_size, -1, feat_size * DeepSpeech2Model.CONV_FILTERS])
# RNN layers.
rnn_cell = DeepSpeech2Model.SUPPORTED_RNNS[self.rnn_type]
for layer_counter in xrange(self.num_rnn_layers):
# No batch normalization on the first layer.
use_batch_norm = (layer_counter != 0)
inputs = self._rnn_layer(inputs, rnn_cell, self.rnn_hidden_size,
layer_counter + 1, use_batch_norm,
self.is_bidirectional, phase_train)
# FC layer with batch norm.
inputs = self._batch_norm(inputs, phase_train)
logits = tf.layers.dense(inputs, nclass, use_bias=self.use_bias)
return model_lib.BuildNetworkResult(logits=logits, extra_info=None)
def loss_function(self, inputs, build_network_result):
"""Computes the ctc loss for the current batch of predictions.
Args:
inputs: the input list of the model.
build_network_result: a BuildNetworkResult returned by build_network().
Returns:
The loss tensor of the model.
"""
logits = build_network_result.logits
actual_time_steps = inputs[2]
probs = tf.nn.softmax(logits)
ctc_time_steps = tf.shape(probs)[1]
ctc_input_length = tf.to_float(
tf.multiply(actual_time_steps, ctc_time_steps))
ctc_input_length = tf.to_int32(
tf.floordiv(ctc_input_length, tf.to_float(self.max_time_steps)))
label_length = inputs[3]
label_length = tf.to_int32(tf.squeeze(label_length))
ctc_input_length = tf.to_int32(tf.squeeze(ctc_input_length))
labels = inputs[1]
sparse_labels = tf.to_int32(
tf.keras.backend.ctc_label_dense_to_sparse(labels, label_length))
y_pred = tf.log(
tf.transpose(probs, perm=[1, 0, 2]) + tf.keras.backend.epsilon())
losses = tf.expand_dims(
tf.nn.ctc_loss(
labels=sparse_labels,
inputs=y_pred,
sequence_length=ctc_input_length,
ignore_longer_outputs_than_inputs=True),
axis=1)
loss = tf.reduce_mean(losses)
return loss
PROBABILITY_TENSOR = 'deepspeech2_prob'
LABEL_TENSOR = 'deepspeech2_label'
def accuracy_function(self, inputs, logits):
"""Returns the ops to evaluate the model performance."""
# Get probabilities of each predicted class
probs = tf.nn.softmax(logits)
assert probs.shape.as_list()[0] == self.batch_size
return {
(constants.UNREDUCED_ACCURACY_OP_PREFIX + self.PROBABILITY_TENSOR):
probs,
(constants.UNREDUCED_ACCURACY_OP_PREFIX + self.LABEL_TENSOR):
inputs[1],
}
def postprocess(self, results):
"""Postprocess results returned from model in Python."""
probs = results[self.PROBABILITY_TENSOR]
total_wer, total_cer = 0, 0
speech_labels = " abcdefghijklmnopqrstuvwxyz'-"
greedy_decoder = DeepSpeechDecoder(speech_labels)
# Evaluate the performance using WER (Word Error Rate) and CER (Character
# Error Rate) as metrics.
targets = results[self.LABEL_TENSOR] # The ground truth transcript
for i in range(self.batch_size):
# Decode string.
predicted_str = greedy_decoder.decode_logits(probs[i])
expected_str = greedy_decoder.decode(targets[i])
# Compute CER.
total_cer += (greedy_decoder.cer(predicted_str, expected_str) /
len(expected_str))
# Compute WER.
total_wer += (greedy_decoder.wer(predicted_str, expected_str) /
len(expected_str.split()))
# Get mean value
total_cer /= self.batch_size
total_wer /= self.batch_size
log_fn('total CER: {:f}; total WER: {:f}; total example: {:d}.'.format(
total_cer, total_wer, self.batch_size))
# TODO(laigd): get rid of top_N_accuracy bindings in benchmark_cnn.py
return {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrap the official recommendation model in a tf_cnn_benchmarks Model.
This allows the recommendation NCF model to be used in tf_cnn_benchmarks.
Currently, the implementation is fairly hacky, because tf_cnn_benchmarks is
intended to be used only with CNNs.
Only synthetic data with 1 GPU is currently supported.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v1 as tf
from models import model
# Obtained by running the official NCF model with the following command:
# python ncf_main.py --dataset ml-20m
# and printing the number of users and items here:
# https://github.com/tensorflow/models/blob/d089975f630a8a01be63e45ef08a31be14bb96b4/official/recommendation/data_preprocessing.py#L68
# Cardinality of the MovieLens 20M dataset (see the comment above for the
# provenance of these numbers).
_NUM_USERS_20M = 138493
_NUM_ITEMS_20M = 26744
# TODO(reedwm): Support multi-GPU. Currently keras layers, which this model
# uses, ignore variable_scopes, which we rely on for multi-GPU support.
# TODO(reedwm): Support real data. This will require a significant refactor.
# TODO(reedwm): All-reduce IndexedSlices more effectively.
# TODO(reedwm): Support the 1M variant of this model.
class NcfModel(model.Model):
  r"""A model.Model wrapper around the official NCF recommendation model.

  To do an NCF run with synthetic data that roughly matches what the official
  model does, run:

  python tf_cnn_benchmarks.py --optimizer=adam --model=ncf --batch_size=65536 \
      --weight_decay=0 --sparse_to_dense_grads
  """

  def __init__(self, params=None):
    super(NcfModel, self).__init__(
        'official_ncf', batch_size=2048, learning_rate=0.0005,
        fp16_loss_scale=128, params=params)
    if self.fp16_vars:
      raise ValueError('NCF model only supports float32 variables for now.')

  def build_network(self, inputs, phase_train=True, nclass=1001):
    """Builds the NeuMF network from the official models repository.

    Args:
      inputs: list of (users, items, labels) tensors.
      phase_train: unused; the keras model behaves the same in both phases.
      nclass: unused.

    Returns:
      A model.BuildNetworkResult with the NeuMF logits.
    """
    try:
      from official.recommendation import neumf_model  # pylint: disable=g-import-not-at-top
    except ImportError as e:
      # Bug fix: Python 3 exceptions have no `.message` attribute; the
      # original `e.message` raised AttributeError instead of performing
      # this check. str(e) works on both Python 2 and 3.
      if 'neumf_model' not in str(e):
        raise
      raise ImportError('To use the experimental NCF model, you must clone the '
                        'repo https://github.com/tensorflow/models and add '
                        'tensorflow/models to the PYTHONPATH.')
    del nclass
    users, items, _ = inputs
    params = {
        'num_users': _NUM_USERS_20M,
        'num_items': _NUM_ITEMS_20M,
        'model_layers': (256, 256, 128, 64),
        'mf_dim': 64,
        'mf_regularization': 0,
        'mlp_reg_layers': (0, 0, 0, 0),
        'use_tpu': False
    }
    user_input = tf.keras.layers.Input(tensor=users, name='user_input')
    item_input = tf.keras.layers.Input(tensor=items, name='item_input')
    if self.data_type == tf.float32:
      keras_model = neumf_model.construct_model(user_input, item_input, params)
      logits = keras_model.output
    else:
      assert self.data_type == tf.float16
      old_floatx = tf.keras.backend.floatx()
      try:
        tf.keras.backend.set_floatx('float16')
        # We cannot rely on the variable_scope's fp16 custom getter here,
        # because the NCF model uses keras layers, which ignore variable scopes.
        # So we use a variable_creator_scope instead.
        with tf.variable_creator_scope(_fp16_variable_creator):
          keras_model = neumf_model.construct_model(user_input, item_input,
                                                    params)
        logits = tf.cast(keras_model.output, tf.float32)
      finally:
        # Always restore the global floatx, even if model construction fails.
        tf.keras.backend.set_floatx(old_floatx)
    return model.BuildNetworkResult(logits=logits, extra_info=None)

  def loss_function(self, inputs, build_network_result):
    """Returns the sigmoid cross-entropy loss expressed via softmax."""
    logits = build_network_result.logits

    # Softmax with the first column of ones is equivalent to sigmoid.
    # TODO(reedwm): Actually, the first column should be zeros to be equivalent
    # to sigmoid. But, we keep it at ones to match the official models.
    logits = tf.concat([tf.ones(logits.shape, dtype=logits.dtype), logits],
                       axis=1)

    return tf.losses.sparse_softmax_cross_entropy(
        labels=inputs[2],
        logits=logits
    )

  def get_synthetic_inputs(self, input_name, nclass):
    """Returns the ops to generate synthetic inputs and labels."""
    def users_init_val():
      return tf.random_uniform((self.batch_size, 1), minval=0,
                               maxval=_NUM_USERS_20M, dtype=tf.int32)
    users = tf.Variable(users_init_val, dtype=tf.int32, trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES],
                        name='synthetic_users')
    def items_init_val():
      return tf.random_uniform((self.batch_size, 1), minval=0,
                               maxval=_NUM_ITEMS_20M, dtype=tf.int32)
    items = tf.Variable(items_init_val, dtype=tf.int32, trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES],
                        name='synthetic_items')
    def labels_init_val():
      return tf.random_uniform((self.batch_size,), minval=0, maxval=2,
                               dtype=tf.int32)
    labels = tf.Variable(labels_init_val, dtype=tf.int32, trainable=False,
                         collections=[tf.GraphKeys.LOCAL_VARIABLES],
                         name='synthetic_labels')
    return [users, items, labels]

  def get_input_shapes(self, subset):
    """Returns the shapes of the (users, items, labels) inputs."""
    del subset
    return [[self.batch_size, 1], [self.batch_size, 1], [self.batch_size]]

  def get_input_data_types(self, subset):
    """Returns the data types of the (users, items, labels) inputs."""
    del subset
    # Bug fix: the original returned `self.int32` (a nonexistent attribute)
    # for the first entry; all three inputs are tf.int32.
    return [tf.int32, tf.int32, tf.int32]
def _fp16_variable_creator(next_creator, **kwargs):
  """Variable creator that backs fp16 variables with fp32 storage.

  Requests for float16 variables are rewritten so the underlying variable is
  created in float32, and the returned tensor is the variable cast back to
  float16. Non-fp16 requests are passed through unchanged.
  """
  requested_dtype = kwargs.get('dtype', None)
  initial_value = kwargs.get('initial_value', None)
  # Infer the dtype from a concrete (non-callable) initial value when the
  # caller did not specify one explicitly.
  if requested_dtype is None:
    if initial_value is not None and not callable(initial_value):
      requested_dtype = initial_value.dtype
  if requested_dtype != tf.float16:
    # Anything that is not fp16 is created as requested.
    return next_creator(**kwargs)
  # Rewrite the creation request: initialize and store in float32.
  if callable(initial_value):
    fp32_initializer = lambda: tf.cast(initial_value(), tf.float32)
  else:
    fp32_initializer = tf.cast(initial_value, tf.float32)
  kwargs['dtype'] = tf.float32
  kwargs['initial_value'] = fp32_initializer
  fp32_variable = next_creator(**kwargs)
  # Callers see the fp16 view of the fp32 master variable.
  return tf.cast(fp32_variable, dtype=tf.float16)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Googlenet model configuration.
References:
Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich
Going deeper with convolutions
arXiv preprint arXiv:1409.4842 (2014)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from models import model
class GooglenetModel(model.CNNModel):
  """GoogLeNet (Inception v1), per Szegedy et al., 2014."""

  def __init__(self, params=None):
    super(GooglenetModel, self).__init__(
        'googlenet', 224, 32, 0.005, params=params)

  def add_inference(self, cnn):
    def inception_v1(cnn, conv1x1, reduce3x3, conv3x3, reduce5x5, conv5x5,
                     pool_proj):
      # One inception module: four parallel branches, concatenated by
      # inception_module (1x1 / reduced 3x3 / reduced 5x5 / pooled 1x1).
      branches = [
          [('conv', conv1x1, 1, 1)],
          [('conv', reduce3x3, 1, 1), ('conv', conv3x3, 3, 3)],
          [('conv', reduce5x5, 1, 1), ('conv', conv5x5, 5, 5)],
          [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', pool_proj, 1, 1)],
      ]
      cnn.inception_module('incept_v1', branches)

    # Stem.
    cnn.conv(64, 7, 7, 2, 2)
    cnn.mpool(3, 3, 2, 2, mode='SAME')
    cnn.conv(64, 1, 1)
    cnn.conv(192, 3, 3)
    cnn.mpool(3, 3, 2, 2, mode='SAME')
    # Inception blocks 3a-3b.
    inception_v1(cnn, 64, 96, 128, 16, 32, 32)
    inception_v1(cnn, 128, 128, 192, 32, 96, 64)
    cnn.mpool(3, 3, 2, 2, mode='SAME')
    # Inception blocks 4a-4e.
    inception_v1(cnn, 192, 96, 208, 16, 48, 64)
    inception_v1(cnn, 160, 112, 224, 24, 64, 64)
    inception_v1(cnn, 128, 128, 256, 24, 64, 64)
    inception_v1(cnn, 112, 144, 288, 32, 64, 64)
    inception_v1(cnn, 256, 160, 320, 32, 128, 128)
    cnn.mpool(3, 3, 2, 2, mode='SAME')
    # Inception blocks 5a-5b.
    inception_v1(cnn, 256, 160, 320, 32, 128, 128)
    inception_v1(cnn, 384, 192, 384, 48, 128, 128)
    # Global average pool and flatten to the final 1024-d feature vector.
    cnn.apool(7, 7, 1, 1, mode='VALID')
    cnn.reshape([-1, 1024])
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Inception model configuration.
Includes multiple models: inception3, inception4, inception-resnet2.
References:
Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi
Inception-v4, Inception-ResNet and the Impact of Residual Connections on
Learning
Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich
Going Deeper with Convolutions
http://arxiv.org/pdf/1409.4842v1.pdf
Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
Zbigniew Wojna
Rethinking the Inception Architecture for Computer Vision
arXiv preprint arXiv:1512.00567 (2015)
Inception v3 model: http://arxiv.org/abs/1512.00567
Inception v4 and Resnet V2 architectures: http://arxiv.org/abs/1602.07261
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import xrange # pylint: disable=redefined-builtin
from models import model
class Inceptionv3Model(model.CNNModel):
  """InceptionV3."""
  def __init__(self, auxiliary=False, params=None):
    # `auxiliary` enables the auxiliary classifier head attached after the
    # 17x17 stage, as described in the Inception v3 paper.
    self._auxiliary = auxiliary
    super(Inceptionv3Model, self).__init__(
        'inception3', 299, 32, 0.005, params=params)
  def add_inference(self, cnn):
    """Builds the Inception v3 tower on top of the `cnn` network builder."""
    def inception_v3_a(cnn, n):
      # 35x35 module: 1x1 / 5x5 / double-3x3 / pooled-projection branches.
      cols = [[('conv', 64, 1, 1)], [('conv', 48, 1, 1), ('conv', 64, 5, 5)],
              [('conv', 64, 1, 1), ('conv', 96, 3, 3), ('conv', 96, 3, 3)],
              [('apool', 3, 3, 1, 1, 'SAME'), ('conv', n, 1, 1)]]
      cnn.inception_module('incept_v3_a', cols)
    def inception_v3_b(cnn):
      # Grid-size reduction 35x35 -> 17x17 via stride-2 branches.
      cols = [[('conv', 384, 3, 3, 2, 2, 'VALID')],
              [('conv', 64, 1, 1),
               ('conv', 96, 3, 3),
               ('conv', 96, 3, 3, 2, 2, 'VALID')],
              [('mpool', 3, 3, 2, 2, 'VALID')]]
      cnn.inception_module('incept_v3_b', cols)
    def inception_v3_c(cnn, n):
      # 17x17 module with 7x7 convolutions factorized into 1x7 and 7x1.
      cols = [[('conv', 192, 1, 1)],
              [('conv', n, 1, 1), ('conv', n, 1, 7), ('conv', 192, 7, 1)],
              [('conv', n, 1, 1), ('conv', n, 7, 1), ('conv', n, 1, 7),
               ('conv', n, 7, 1), ('conv', 192, 1, 7)],
              [('apool', 3, 3, 1, 1, 'SAME'), ('conv', 192, 1, 1)]]
      cnn.inception_module('incept_v3_c', cols)
    def inception_v3_d(cnn):
      # Grid-size reduction 17x17 -> 8x8.
      cols = [[('conv', 192, 1, 1), ('conv', 320, 3, 3, 2, 2, 'VALID')],
              [('conv', 192, 1, 1), ('conv', 192, 1, 7), ('conv', 192, 7, 1),
               ('conv', 192, 3, 3, 2, 2, 'VALID')],
              [('mpool', 3, 3, 2, 2, 'VALID')]]
      cnn.inception_module('incept_v3_d', cols)
    def inception_v3_e(cnn, pooltype):
      # 8x8 module with expanded-filter-bank branches; ('share',) entries
      # reuse the previous column's layers up to that point.
      cols = [[('conv', 320, 1, 1)], [('conv', 384, 1, 1), ('conv', 384, 1, 3)],
              [('share',), ('conv', 384, 3, 1)],
              [('conv', 448, 1, 1), ('conv', 384, 3, 3), ('conv', 384, 1, 3)],
              [('share',), ('share',), ('conv', 384, 3, 1)],
              [('mpool' if pooltype == 'max' else 'apool', 3, 3, 1, 1, 'SAME'),
               ('conv', 192, 1, 1)]]
      cnn.inception_module('incept_v3_e', cols)
    def incept_v3_aux(cnn):
      # Auxiliary classifier head, built on a side branch of the main tower.
      assert cnn.aux_top_layer is None
      cnn.aux_top_layer = cnn.top_layer
      cnn.aux_top_size = cnn.top_size
      with cnn.switch_to_aux_top_layer():
        cnn.apool(5, 5, 3, 3, mode='VALID')
        cnn.conv(128, 1, 1, mode='SAME')
        cnn.conv(768, 5, 5, mode='VALID', stddev=0.01)
        cnn.reshape([-1, 768])
    cnn.use_batch_norm = True
    # Stem: trailing comments give the spatial size/depth entering each layer.
    cnn.conv(32, 3, 3, 2, 2, mode='VALID')  # 299 x 299 x 3
    cnn.conv(32, 3, 3, 1, 1, mode='VALID')  # 149 x 149 x 32
    cnn.conv(64, 3, 3, 1, 1, mode='SAME')  # 147 x 147 x 64
    cnn.mpool(3, 3, 2, 2, mode='VALID')  # 147 x 147 x 64
    cnn.conv(80, 1, 1, 1, 1, mode='VALID')  # 73 x 73 x 80
    cnn.conv(192, 3, 3, 1, 1, mode='VALID')  # 71 x 71 x 192
    cnn.mpool(3, 3, 2, 2, 'VALID')  # 35 x 35 x 192
    inception_v3_a(cnn, 32)  # 35 x 35 x 256 mixed.
    inception_v3_a(cnn, 64)  # 35 x 35 x 288 mixed_1.
    inception_v3_a(cnn, 64)  # 35 x 35 x 288 mixed_2
    inception_v3_b(cnn)  # 17 x 17 x 768 mixed_3
    inception_v3_c(cnn, 128)  # 17 x 17 x 768 mixed_4
    inception_v3_c(cnn, 160)  # 17 x 17 x 768 mixed_5
    inception_v3_c(cnn, 160)  # 17 x 17 x 768 mixed_6
    inception_v3_c(cnn, 192)  # 17 x 17 x 768 mixed_7
    if self._auxiliary:
      incept_v3_aux(cnn)  # Auxillary Head logits
    inception_v3_d(cnn)  # 17 x 17 x 1280 mixed_8
    inception_v3_e(cnn, 'avg')  # 8 x 8 x 2048 mixed_9
    inception_v3_e(cnn, 'max')  # 8 x 8 x 2048 mixed_10
    cnn.apool(8, 8, 1, 1, 'VALID')  # 8 x 8 x 2048
    cnn.reshape([-1, 2048])  # 1 x 1 x 2048
# Stem functions
def inception_v4_sa(cnn):
  """Inception v4 stem block A: parallel max-pool and stride-2 conv."""
  branches = [
      [('mpool', 3, 3, 2, 2, 'VALID')],
      [('conv', 96, 3, 3, 2, 2, 'VALID')],
  ]
  cnn.inception_module('incept_v4_sa', branches)
def inception_v4_sb(cnn):
  """Inception v4 stem block B: plain 3x3 branch vs. factorized-7x7 branch."""
  branches = [
      [('conv', 64, 1, 1), ('conv', 96, 3, 3, 1, 1, 'VALID')],
      [('conv', 64, 1, 1), ('conv', 64, 7, 1), ('conv', 64, 1, 7),
       ('conv', 96, 3, 3, 1, 1, 'VALID')],
  ]
  cnn.inception_module('incept_v4_sb', branches)
def inception_v4_sc(cnn):
  """Inception v4 stem block C: stride-2 conv alongside max-pool."""
  branches = [
      [('conv', 192, 3, 3, 2, 2, 'VALID')],
      [('mpool', 3, 3, 2, 2, 'VALID')],
  ]
  cnn.inception_module('incept_v4_sc', branches)
# Reduction functions
def inception_v4_ra(cnn, k, l, m, n):
  """Inception v4 reduction block A; k/l/m/n are the branch filter counts."""
  branches = [
      [('mpool', 3, 3, 2, 2, 'VALID')],
      [('conv', n, 3, 3, 2, 2, 'VALID')],
      [('conv', k, 1, 1), ('conv', l, 3, 3), ('conv', m, 3, 3, 2, 2, 'VALID')],
  ]
  cnn.inception_module('incept_v4_ra', branches)
def inception_v4_rb(cnn):
  """Inception v4 reduction block B, with a factorized-7x7 branch."""
  branches = [
      [('mpool', 3, 3, 2, 2, 'VALID')],
      [('conv', 192, 1, 1), ('conv', 192, 3, 3, 2, 2, 'VALID')],
      [('conv', 256, 1, 1), ('conv', 256, 1, 7), ('conv', 320, 7, 1),
       ('conv', 320, 3, 3, 2, 2, 'VALID')],
  ]
  cnn.inception_module('incept_v4_rb', branches)
class Inceptionv4Model(model.CNNModel):
  """Inceptionv4."""
  def __init__(self, params=None):
    super(Inceptionv4Model, self).__init__(
        'inception4', 299, 32, 0.005, params=params)
  def add_inference(self, cnn):
    """Builds the Inception v4 tower on top of the `cnn` network builder."""
    def inception_v4_a(cnn):
      # 35x35 "Inception-A" module.
      cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 96, 1, 1)],
              [('conv', 96, 1, 1)], [('conv', 64, 1, 1), ('conv', 96, 3, 3)],
              [('conv', 64, 1, 1), ('conv', 96, 3, 3), ('conv', 96, 3, 3)]]
      cnn.inception_module('incept_v4_a', cols)
    def inception_v4_b(cnn):
      # 17x17 "Inception-B" module with factorized 7x7 convolutions.
      cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 128, 1, 1)],
              [('conv', 384, 1, 1)],
              [('conv', 192, 1, 1), ('conv', 224, 1, 7), ('conv', 256, 7, 1)],
              [('conv', 192, 1, 1), ('conv', 192, 1, 7), ('conv', 224, 7, 1),
               ('conv', 224, 1, 7), ('conv', 256, 7, 1)]]
      cnn.inception_module('incept_v4_b', cols)
    def inception_v4_c(cnn):
      # 8x8 "Inception-C" module; ('share',) entries reuse the previous
      # column's layers up to that point.
      cols = [[('apool', 3, 3, 1, 1, 'SAME'), ('conv', 256, 1, 1)],
              [('conv', 256, 1, 1)], [('conv', 384, 1, 1), ('conv', 256, 1, 3)],
              [('share',), ('conv', 256, 3, 1)],
              [('conv', 384, 1, 1), ('conv', 448, 1, 3), ('conv', 512, 3, 1),
               ('conv', 256, 3, 1)], [('share',), ('share',), ('share',),
                                      ('conv', 256, 1, 3)]]
      cnn.inception_module('incept_v4_c', cols)
    cnn.use_batch_norm = True
    # Stem convolutions followed by the three stem inception blocks.
    cnn.conv(32, 3, 3, 2, 2, mode='VALID')
    cnn.conv(32, 3, 3, 1, 1, mode='VALID')
    cnn.conv(64, 3, 3)
    inception_v4_sa(cnn)
    inception_v4_sb(cnn)
    inception_v4_sc(cnn)
    # 4 x Inception-A, reduction, 7 x Inception-B, reduction, 3 x Inception-C.
    for _ in xrange(4):
      inception_v4_a(cnn)
    inception_v4_ra(cnn, 192, 224, 256, 384)
    for _ in xrange(7):
      inception_v4_b(cnn)
    inception_v4_rb(cnn)
    for _ in xrange(3):
      inception_v4_c(cnn)
    cnn.spatial_mean()
    # Keep-probability 0.8 dropout before the final classifier.
    cnn.dropout(0.8)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment