Commit a67c28c8 authored by A. Unique TensorFlower, committed by saberkun

Internal change

PiperOrigin-RevId: 372471631
parent 027e5dc6
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common benchmark class for model garden models."""
import os
import pprint
# Import libraries
from absl import logging
import tensorflow as tf
from tensorflow.python.platform import benchmark # pylint: disable=unused-import
from official.common import registry_imports # pylint: disable=unused-import
from official.benchmark import benchmark_lib
from official.benchmark import benchmark_definitions
from official.benchmark import config_utils
from official.core import exp_factory
from official.modeling import hyperparams
def _get_benchmark_params(benchmark_models):
"""Formats benchmark params into a list."""
parameterized_benchmark_params = []
for _, benchmarks in benchmark_models.items():
for name, params in benchmarks.items():
for execution_mode in ['performance', 'accuracy']:
benchmark_name = '{}.{}'.format(name, execution_mode)
benchmark_params = (
benchmark_name, # First arg is used by ParameterizedBenchmark.
benchmark_name,
params['experiment_type'],
execution_mode,
params['platform'],
params['precision'],
params['metric_bounds'],
params.get('config_files') or [],
params.get('params_override') or None)
parameterized_benchmark_params.append(benchmark_params)
return parameterized_benchmark_params
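# For illustration, the 'image_classification.resnet50.tpu.4x4.bf16' entry in
# benchmark_definitions expands into one tuple per execution mode, e.g.:
#
#   ('image_classification.resnet50.tpu.4x4.bf16.performance',  # ParameterizedBenchmark name
#    'image_classification.resnet50.tpu.4x4.bf16.performance',  # benchmark_name
#    'resnet_imagenet',   # experiment_type
#    'performance',       # execution_mode
#    'tpu.4x4',           # platform
#    'bfloat16',          # precision
#    [{'name': 'accuracy', 'min_value': 0.76, 'max_value': 0.77}],  # metric_bounds
#    ['official/vision/beta/configs/experiments/'
#     'image_classification/imagenet_resnet50_tpu.yaml'],           # config_files
#    None)                # params_override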
class BaseBenchmark( # pylint: disable=undefined-variable
tf.test.Benchmark, metaclass=benchmark.ParameterizedBenchmark):
"""Common Benchmark.
benchmark.ParameterizedBenchmark is used to auto create benchmarks from
benchmark method according to the benchmarks defined in
benchmark_definitions. The name of the new benchmark methods is
benchmark__{benchmark_name}. _get_benchmark_params is used to generate the
benchmark name and args.
"""
_benchmark_parameters = _get_benchmark_params(
benchmark_definitions.VISION_BENCHMARKS) + _get_benchmark_params(
benchmark_definitions.NLP_BENCHMARKS)
def __init__(self,
output_dir=None,
tpu=None):
"""Initialize class.
Args:
output_dir: Base directory to store all output for the test.
tpu: (optional) TPU name to use in a TPU benchmark.
"""
if os.getenv('BENCHMARK_OUTPUT_DIR'):
self.output_dir = os.getenv('BENCHMARK_OUTPUT_DIR')
elif output_dir:
self.output_dir = output_dir
else:
self.output_dir = '/tmp'
if os.getenv('BENCHMARK_TPU'):
self._resolved_tpu = os.getenv('BENCHMARK_TPU')
elif tpu:
self._resolved_tpu = tpu
else:
self._resolved_tpu = None
def _get_model_dir(self, folder_name):
"""Returns directory to store info, e.g. saved model and event log."""
return os.path.join(self.output_dir, folder_name)
def benchmark(self,
benchmark_name,
experiment_type,
execution_mode,
platform,
precision,
metric_bounds,
config_files,
params_override):
params = exp_factory.get_exp_config(experiment_type)
for config_file in config_files:
file_path = config_utils.get_config_path(config_file)
params = hyperparams.override_params_dict(
params, file_path, is_strict=True)
if params_override:
params = hyperparams.override_params_dict(
params, params_override, is_strict=True)
# The platform string has the format 'tpu.[n]x[n]' (TPU topology) or
# 'gpu.[n]' (number of GPUs).
if 'tpu' in platform:
params.runtime.distribution_strategy = 'tpu'
params.runtime.tpu = self._resolved_tpu
elif 'gpu' in platform:
params.runtime.num_gpus = int(platform.split('.')[-1])
params.runtime.distribution_strategy = 'mirrored'
else:
raise NotImplementedError('platform: {} is not supported'.format(platform))
params.runtime.mixed_precision_dtype = precision
params.validate()
params.lock()
tf.io.gfile.makedirs(self._get_model_dir(benchmark_name))
hyperparams.save_params_dict_to_yaml(
params,
os.path.join(self._get_model_dir(benchmark_name), 'params.yaml'))
pp = pprint.PrettyPrinter()
logging.info('Final experiment parameters: %s',
pp.pformat(params.as_dict()))
benchmark_data = benchmark_lib.run_benchmark(
execution_mode, params, self._get_model_dir(benchmark_name))
metrics = []
if execution_mode == 'accuracy':
for metric_bound in metric_bounds:
metric = {
'name': metric_bound['name'],
'value': benchmark_data['metrics'][metric_bound['name']],
'min_value': metric_bound['min_value'],
'max_value': metric_bound['max_value']
}
metrics.append(metric)
metrics.append({'name': 'startup_time',
'value': benchmark_data['startup_time']})
metrics.append({'name': 'exp_per_second',
'value': benchmark_data['examples_per_second']})
self.report_benchmark(
iters=-1,
wall_time=benchmark_data['wall_time'],
metrics=metrics,
extras={'model_name': benchmark_name.split('.')[0],
'platform': platform,
'implementation': 'orbit.ctl',
'parameters': precision})
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model garden benchmark definitions."""
# tf-vision benchmarks
IMAGE_CLASSIFICATION_BENCHMARKS = {
'image_classification.resnet50.tpu.4x4.bf16':
dict(
experiment_type='resnet_imagenet',
platform='tpu.4x4',
precision='bfloat16',
metric_bounds=[{
'name': 'accuracy',
'min_value': 0.76,
'max_value': 0.77
}],
config_files=['official/vision/beta/configs/experiments/'
'image_classification/imagenet_resnet50_tpu.yaml']),
'image_classification.resnet50.gpu.8.fp16':
dict(
experiment_type='resnet_imagenet',
platform='gpu.8',
precision='float16',
metric_bounds=[{
'name': 'accuracy',
'min_value': 0.76,
'max_value': 0.77
}],
config_files=['official/vision/beta/configs/experiments/'
'image_classification/imagenet_resnet50_gpu.yaml'])
}
VISION_BENCHMARKS = {
'image_classification': IMAGE_CLASSIFICATION_BENCHMARKS,
}
NLP_BENCHMARKS = {
}
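# A minimal sketch of the entry schema expected by _get_benchmark_params; the
# task and benchmark names below are illustrative only, not real definitions:
#
# NLP_BENCHMARKS = {
#     'some_task': {
#         'some_task.some_model.gpu.8.fp16':
#             dict(
#                 experiment_type='...',  # experiment name registered via exp_factory
#                 platform='gpu.8',       # 'tpu.[n]x[n]' or 'gpu.[n]'
#                 precision='float16',    # becomes runtime.mixed_precision_dtype
#                 metric_bounds=[{'name': 'accuracy',
#                                 'min_value': 0.0,
#                                 'max_value': 1.0}],  # checked in accuracy mode
#                 config_files=[],        # optional YAML config overrides
#                 params_override=None),  # optional params override
#     },
# }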
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TFM common benchmark training driver."""
import os
import time
from typing import Any, Mapping, Optional
from absl import logging
import orbit
import tensorflow as tf
from official.common import distribute_utils
from official.core import config_definitions
from official.core import task_factory
from official.core import train_utils
from official.modeling import performance
def run_benchmark(
execution_mode: str,
params: config_definitions.ExperimentConfig,
model_dir: str,
distribution_strategy: Optional[tf.distribute.Strategy] = None
) -> Mapping[str, Any]:
"""Runs benchmark for a specific experiment.
Args:
execution_mode: A 'str' specifying the mode; either 'accuracy' or
'performance'.
params: ExperimentConfig instance.
model_dir: A 'str', a path to store model checkpoints and summaries.
distribution_strategy: A tf.distribute.Strategy to use. If specified,
it will be used instead of inferring the strategy from params.
Returns:
benchmark_data: A dict containing the benchmark timing and metric data.
"""
# For GPU runs, allow option to set thread mode
if params.runtime.gpu_thread_mode:
os.environ['TF_GPU_THREAD_MODE'] = params.runtime.gpu_thread_mode
logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE'])
# Sets the mixed-precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
# can significantly speed up models by using float16 on GPUs and bfloat16 on
# TPUs. loss_scale takes effect only when the dtype is float16.
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
strategy = distribution_strategy or distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
with strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
trainer = train_utils.create_trainer(
params,
task,
train=True,
evaluate=(execution_mode == 'accuracy'))
# Initialize the model if possible, e.g., from a pre-trained checkpoint.
trainer.initialize()
steps_per_loop = params.trainer.steps_per_loop if (
execution_mode == 'accuracy') else 100
controller = orbit.Controller(
strategy=strategy,
trainer=trainer,
evaluator=trainer if (execution_mode == 'accuracy') else None,
global_step=trainer.global_step,
steps_per_loop=steps_per_loop)
logging.info('Starting execution in mode: %s', execution_mode)
with strategy.scope():
# Train for one loop; the first loop time includes warmup time.
first_loop_start_time = time.time()
controller.train(steps=steps_per_loop)
first_loop_time = time.time() - first_loop_start_time
# Train for a second loop; this measures steady-state training time.
second_loop_start_time = time.time()
controller.train(steps=2 * steps_per_loop)
second_loop_time = time.time() - second_loop_start_time
if execution_mode == 'accuracy':
controller.train(steps=params.trainer.train_steps)
wall_time = time.time() - first_loop_start_time
eval_logs = trainer.evaluate(
tf.convert_to_tensor(params.trainer.validation_steps))
benchmark_data = {'metrics': eval_logs}
elif execution_mode == 'performance':
benchmark_data = {}
else:
raise NotImplementedError(
'The benchmark execution mode is not implemented: %s' %
execution_mode)
# The first training loop time contains startup time plus training time, while
# the second training loop time is purely training time. Startup time can be
# recovered by subtracting the second training loop time from the first
# training loop time.
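# For illustration only (made-up numbers): if the first loop takes 380s and the
# second takes 300s, startup_time is about 80s; with steps_per_loop=100 and a
# global batch size of 4096, examples_per_second is about 100 * 4096 / 300,
# i.e. roughly 1365.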
startup_time = first_loop_time - second_loop_time
wall_time = time.time() - first_loop_start_time
examples_per_second = (
    steps_per_loop * params.task.train_data.global_batch_size /
    second_loop_time)
benchmark_data.update(
dict(
examples_per_second=examples_per_second,
wall_time=wall_time,
startup_time=startup_time))
return benchmark_data
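# A minimal usage sketch, assuming an experiment registered as 'mock' (as in
# the unit test below); the mode and model_dir here are illustrative:
#
#   from official.core import exp_factory
#   params = exp_factory.get_exp_config('mock')
#   data = run_benchmark('performance', params, model_dir='/tmp/benchmark_run')
#   print(data['examples_per_second'], data['startup_time'], data['wall_time'])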
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow_models.official.benchmark.benchmark_lib."""
# pylint: disable=g-direct-tensorflow-import
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.common import registry_imports # pylint: disable=unused-import
from official.benchmark import benchmark_lib
from official.core import exp_factory
from official.modeling import hyperparams
def all_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],)
class BenchmarkLibTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(BenchmarkLibTest, self).setUp()
self._test_config = {
'trainer': {
'steps_per_loop': 10,
'optimizer_config': {
'optimizer': {
'type': 'sgd'
},
'learning_rate': {
'type': 'constant'
}
},
'continuous_eval_timeout': 5,
'train_steps': 20,
'validation_steps': 10
},
}
@combinations.generate(
combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
execution_mode=['performance', 'accuracy'],
))
def test_benchmark(self, distribution, execution_mode):
model_dir = self.get_temp_dir()
params = exp_factory.get_exp_config('mock')
params = hyperparams.override_params_dict(
params, self._test_config, is_strict=True)
benchmark_data = benchmark_lib.run_benchmark(execution_mode,
params,
model_dir,
distribution)
self.assertIn('examples_per_second', benchmark_data)
self.assertIn('wall_time', benchmark_data)
self.assertIn('startup_time', benchmark_data)
if execution_mode == 'accuracy':
self.assertIn('metrics', benchmark_data)
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks config utils."""
import os
def get_config_path(
config_file: str,
base_dir: str = '') -> str:
"""Gets the absolute path of the config file."""
return os.path.join(base_dir, config_file)
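# Example, reusing a config path that appears in benchmark_definitions:
#   get_config_path('official/vision/beta/configs/experiments/'
#                   'image_classification/imagenet_resnet50_gpu.yaml')
# returns that relative path unchanged, since base_dir defaults to ''.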
@@ -334,6 +334,18 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
FLAGS.dtype = 'fp16'
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16(self):
"""Test Keras model with 8 GPUs with tf.keras mixed precision."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.dtype = 'fp16'
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_8_gpu_eager(self):
"""Test Keras model with 8 GPUs, eager, fp32."""
self._setup()