Commit 472e2f80 authored by zhanggzh

Merge remote-tracking branch 'tf_model/main'

parents d91296eb f3a14f85
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils to annotate and trace benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
from absl.testing import flagsaver
FLAGS = flags.FLAGS
flags.DEFINE_multi_string(
'benchmark_method_flags', None,
    'Optional list of runtime flags of the form key=value. Specify '
    'multiple times to set different flags. These override the FLAGS '
    'object directly after the hardcoded settings in individual benchmark '
    'methods, just before they call _run_and_report_benchmark. For example, '
    'if we set --benchmark_method_flags=train_steps=10 and a benchmark '
    'method hardcodes FLAGS.train_steps=10000 before calling '
    '_run_and_report_benchmark, the benchmark will only run for 10 steps. '
    'This is useful for debugging/profiling workflows.')
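# A minimal usage sketch (the benchmark module name and flag values below are
# illustrative only): each --benchmark_method_flags entry is a key=value
# override that takes effect just before _run_and_report_benchmark runs:
#
#   python3 some_bert_benchmark.py \
#     --benchmark_method_flags=train_steps=10 \
#     --benchmark_method_flags=steps_per_loop=5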
def enable_runtime_flags(decorated_func):
"""Sets attributes from --benchmark_method_flags for method execution.
  The @enable_runtime_flags decorator temporarily applies the flag overrides
  passed in via --benchmark_method_flags and runs the decorated function in
  that context.
A user can set --benchmark_method_flags=train_steps=5 to run the benchmark
method in the snippet below with FLAGS.train_steps=5 for debugging (without
modifying the benchmark code).
class ModelBenchmark():
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
# run benchmark ...
# report benchmark results ...
def benchmark_method(self):
FLAGS.train_steps = 1000
...
self._run_and_report_benchmark()
  Args:
    decorated_func: The benchmark method to run; it is called after any setup
      code in the caller has already set some flags.
  Returns:
    runner: The wrapped method, which executes in a temporary context where
      the flag overrides from --benchmark_method_flags are active.
"""
def runner(*args, **kwargs):
"""Creates a temporary context to activate --benchmark_method_flags."""
if FLAGS.benchmark_method_flags:
saved_flag_values = flagsaver.save_flag_values()
for key_value in FLAGS.benchmark_method_flags:
key, value = key_value.split('=', 1)
try:
numeric_float = float(value)
numeric_int = int(numeric_float)
if abs(numeric_int) == abs(numeric_float):
flag_value = numeric_int
else:
flag_value = numeric_float
except ValueError:
flag_value = value
logging.info('Setting --%s=%s', key, flag_value)
setattr(FLAGS, key, flag_value)
else:
saved_flag_values = None
try:
result = decorated_func(*args, **kwargs)
return result
finally:
if saved_flag_values:
flagsaver.restore_flag_values(saved_flag_values)
return runner
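# The value parsing above is intentionally permissive: values that parse as
# integral floats become ints, other numerics become floats, and anything else
# stays a string. A hypothetical sketch of the resulting flag values:
#
#   --benchmark_method_flags=train_steps=10       -> 10          (int)
#   --benchmark_method_flags=learning_rate=5e-5   -> 5e-05       (float)
#   --benchmark_method_flags=model_dir=/tmp/bert  -> '/tmp/bert' (str)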
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes BERT benchmarks and accuracy tests."""
import json
import math
import os
import time
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import bert_benchmark_utils as benchmark_utils
from official.benchmark import owner_utils
from official.common import distribute_utils
from official.legacy.bert import configs
from official.legacy.bert import run_classifier
# pylint: disable=line-too-long
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_model.ckpt'
CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_train.tf_record'
CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_eval.tf_record'
CLASSIFIER_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_meta_data'
MODEL_CONFIG_FILE_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_config.json'
# pylint: enable=line-too-long
TMP_DIR = os.getenv('TMPDIR')
FLAGS = flags.FLAGS
class BertClassifyBenchmarkBase(benchmark_utils.BertBenchmarkBase):
"""Base class to hold methods common to test classes in the module."""
def __init__(self, output_dir=None, tpu=None):
super(BertClassifyBenchmarkBase, self).__init__(output_dir, tpu=tpu)
self.num_epochs = None
self.num_steps_per_epoch = None
FLAGS.steps_per_loop = 1
@flagsaver.flagsaver
def _run_bert_classifier(self, callbacks=None, use_ds=True):
"""Starts BERT classification task."""
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
epochs = self.num_epochs if self.num_epochs else FLAGS.num_train_epochs
if self.num_steps_per_epoch:
steps_per_epoch = self.num_steps_per_epoch
else:
train_data_size = input_meta_data['train_data_size']
steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
warmup_steps = int(epochs * steps_per_epoch * 0.1)
eval_steps = int(
math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))
if self.tpu:
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy='tpu', tpu_address=self.tpu)
else:
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy='mirrored' if use_ds else 'off',
num_gpus=self.num_gpus)
max_seq_length = input_meta_data['max_seq_length']
train_input_fn = run_classifier.get_dataset_fn(
FLAGS.train_data_path,
max_seq_length,
FLAGS.train_batch_size,
is_training=True)
eval_input_fn = run_classifier.get_dataset_fn(
FLAGS.eval_data_path,
max_seq_length,
FLAGS.eval_batch_size,
is_training=False)
_, summary = run_classifier.run_bert_classifier(
strategy,
bert_config,
input_meta_data,
FLAGS.model_dir,
epochs,
steps_per_epoch,
FLAGS.steps_per_loop,
eval_steps,
warmup_steps,
FLAGS.learning_rate,
FLAGS.init_checkpoint,
train_input_fn,
eval_input_fn,
training_callbacks=False,
custom_callbacks=callbacks)
return summary
class BertClassifyBenchmarkReal(BertClassifyBenchmarkBase):
"""Short benchmark performance tests for BERT model.
  Tests BERT classification performance in different GPU and TPU
  configurations. The test cases below follow the naming convention
  `benchmark_(number of gpus)_gpu_(dataset type)` for GPUs and
  `benchmark_(topology)_tpu_(dataset type)` for TPUs.
"""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
super(BertClassifyBenchmarkReal, self).__init__(
output_dir=output_dir, tpu=tpu)
self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
self.bert_config_file = MODEL_CONFIG_FILE_PATH
self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
# Since we only care about performance metrics, we limit
# the number of training steps and epochs to prevent unnecessarily
# long tests.
self.num_steps_per_epoch = 100
self.num_epochs = 1
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=0,
max_accuracy=1,
use_ds=True):
"""Starts BERT performance benchmark test."""
start_time_sec = time.time()
summary = self._run_bert_classifier(
callbacks=[self.timer_callback], use_ds=use_ds)
wall_time_sec = time.time() - start_time_sec
# Since we do not load from any pretrained checkpoints, we ignore all
# accuracy metrics.
summary.pop('eval_metrics', None)
summary['start_time_sec'] = start_time_sec
super(BertClassifyBenchmarkReal, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def benchmark_1_gpu_mrpc(self):
"""Test BERT model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_1_gpu_mrpc_xla(self):
"""Test BERT model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_xla')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
FLAGS.enable_xla = True
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_1_gpu_mrpc_no_dist_strat(self):
"""Test BERT model performance with 1 GPU, no distribution strategy."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_no_dist_strat')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_mrpc(self):
"""Test BERT model performance with 8 GPUs."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_mrpc(self):
"""Test BERT model performance with 2x2 TPU."""
self._setup()
FLAGS.steps_per_loop = 50
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 32
FLAGS.eval_batch_size = 32
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
class BertClassifyAccuracy(BertClassifyBenchmarkBase):
"""Short accuracy test for BERT model.
  Tests BERT classification task model accuracy. The test cases below follow
  the naming convention `benchmark_(number of gpus)_gpu_(dataset type)`.
"""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
self.bert_config_file = MODEL_CONFIG_FILE_PATH
self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
super(BertClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=0.84,
max_accuracy=0.88):
"""Starts BERT accuracy benchmark test."""
start_time_sec = time.time()
summary = self._run_bert_classifier(callbacks=[self.timer_callback])
wall_time_sec = time.time() - start_time_sec
super(BertClassifyAccuracy, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def _setup(self):
super(BertClassifyAccuracy, self)._setup()
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.init_checkpoint = self.pretrained_checkpoint_path
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_mrpc(self):
"""Run BERT model accuracy test with 8 GPUs.
    Due to the comparatively small size of the MRPC dataset, the training
    accuracy metric has high variance between runs, so we allow a wide
    accuracy range (84% to 88%).
"""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_8_gpu_mrpc_xla(self):
"""Run BERT model accuracy test with 8 GPUs with XLA."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc_xla')
FLAGS.enable_xla = True
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_mrpc(self):
"""Run BERT model accuracy test on 2x2 TPU."""
self._setup()
FLAGS.steps_per_loop = 50
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mrpc')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions or classes shared between BERT benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
# pylint: disable=g-bad-import-order
import numpy as np
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.utils.flags import core as flags_core
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
FLAGS = flags.FLAGS
class BenchmarkTimerCallback(tf.keras.callbacks.Callback):
"""Callback that records time it takes to run each batch."""
def __init__(self, num_batches_to_skip=10):
super(BenchmarkTimerCallback, self).__init__()
self.batch_start_times = {}
self.batch_stop_times = {}
def on_batch_begin(self, batch, logs=None):
self.batch_start_times[batch] = time.time()
def on_batch_end(self, batch, logs=None):
# If there are multiple steps_per_loop, the end batch index will not be the
# same as the starting index. Use the last starting index instead.
if batch not in self.batch_start_times:
batch = max(self.batch_start_times.keys())
self.batch_stop_times[batch] = time.time()
  def get_examples_per_sec(self, batch_size, num_batches_to_skip=1):
    """Returns examples/sec averaged over timed batches, skipping warmup."""
    batch_durations = []
    for batch in self.batch_start_times:
      if batch in self.batch_stop_times and batch >= num_batches_to_skip:
        batch_durations.append(self.batch_stop_times[batch] -
                               self.batch_start_times[batch])
    return batch_size / np.mean(batch_durations)
  def get_startup_time(self, program_start_time):
    """Returns seconds elapsed from program start until the first batch."""
    return self.batch_start_times[0] - program_start_time
class BertBenchmarkBase(PerfZeroBenchmark):
"""Base class to hold methods common to test classes."""
local_flags = None
def __init__(self, output_dir=None, tpu=None, **kwargs):
super(BertBenchmarkBase, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
self.num_gpus = 8
self.timer_callback = None
def _setup(self):
"""Sets up and resets flags before each test."""
super(BertBenchmarkBase, self)._setup()
self.timer_callback = BenchmarkTimerCallback()
def _report_benchmark(self, stats, wall_time_sec, min_accuracy, max_accuracy):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from BERT models with known entries.
      wall_time_sec: the duration of the benchmark execution in seconds.
min_accuracy: Minimum classification accuracy constraint to verify
correctness of the model.
max_accuracy: Maximum classification accuracy constraint to verify
correctness of the model.
"""
metrics = [{
'name': 'training_loss',
'value': stats['train_loss'],
}]
if self.timer_callback:
metrics.append({
'name':
'exp_per_second',
'value':
self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size *
FLAGS.steps_per_loop)
})
else:
metrics.append({
'name': 'exp_per_second',
'value': 0.0,
})
if self.timer_callback and 'start_time_sec' in stats:
metrics.append({
'name': 'startup_time',
'value': self.timer_callback.get_startup_time(stats['start_time_sec'])
})
if 'eval_metrics' in stats:
metrics.append({
'name': 'eval_accuracy',
'value': stats['eval_metrics'],
'min_value': min_accuracy,
'max_value': max_accuracy,
})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=stats['total_training_steps'],
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
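# For reference, each entry in the `metrics` list passed to report_benchmark
# above is a plain dict; an illustrative (not real) example of its shape:
#
#   {'name': 'eval_accuracy', 'value': 0.86,
#    'min_value': 0.84, 'max_value': 0.88}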
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes benchmark testing for bert pretraining."""
# pylint: disable=line-too-long
import json
import os
import time
from typing import Optional
from absl import flags
from absl import logging
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import bert_benchmark_utils
from official.benchmark import owner_utils
from official.common import distribute_utils
from official.legacy.bert import run_pretraining
from official.utils.flags import core as flags_core
# Pretrain masked language modeling accuracy range:
MIN_MLM_ACCURACY = 0.635
MAX_MLM_ACCURACY = 0.645
# Pretrain next sentence prediction accuracy range:
MIN_NSP_ACCURACY = 0.94
MAX_NSP_ACCURACY = 0.96
# Pretrain masked language modeling accuracy range:
MIN_MLM_ACCURACY_GPU = 0.378
MAX_MLM_ACCURACY_GPU = 0.388
# Pretrain next sentence prediction accuracy range:
MIN_NSP_ACCURACY_GPU = 0.82
MAX_NSP_ACCURACY_GPU = 0.84
BERT_PRETRAIN_FILES_SEQ128 = 'gs://mlcompass-data/bert/pretraining_data/seq_128/wikipedia.tfrecord*,gs://mlcompass-data/bert/pretraining_data/seq_128/books.tfrecord*'
BERT_BASE_CONFIG_FILE = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_config.json'
FLAGS = flags.FLAGS
class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
"""Benchmark accuracy tests for BERT Pretraining."""
def __init__(self,
output_dir: Optional[str] = None,
tpu: Optional[str] = None,
**kwargs):
"""Inits BertPretrainAccuracyBenchmark class.
Args:
      output_dir: Directory to which outputs such as log files are written.
tpu: TPU name to use in a TPU benchmark.
**kwargs: Additional keyword arguments.
"""
super(BertPretrainAccuracyBenchmark, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
def _get_distribution_strategy(self, ds_type='mirrored'):
"""Gets the distribution strategy.
Args:
ds_type: String, the distribution strategy type to be used. Can be
'mirrored', 'multi_worker_mirrored', 'tpu' and 'off'.
Returns:
      A `tf.distribute.DistributionStrategy` object.
"""
if self.tpu or ds_type == 'tpu':
return distribute_utils.get_distribution_strategy(
distribution_strategy='tpu', tpu_address=self.tpu)
elif ds_type == 'multi_worker_mirrored':
# Configures cluster spec for multi-worker distribution strategy.
_ = distribute_utils.configure_cluster(FLAGS.worker_hosts,
FLAGS.task_index)
return distribute_utils.get_distribution_strategy(
distribution_strategy=ds_type,
num_gpus=FLAGS.num_gpus,
all_reduce_alg=FLAGS.all_reduce_alg)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, summary_path: str, report_accuracy: bool,
ds_type: str):
"""Runs and reports the benchmark given the provided configuration."""
distribution = self._get_distribution_strategy(ds_type=ds_type)
logging.info('Flags: %s', flags_core.get_nondefault_flags_as_str())
start_time_sec = time.time()
run_pretraining.run_bert_pretrain(
strategy=distribution, custom_callbacks=self.timer_callback)
wall_time_sec = time.time() - start_time_sec
# For GPU multi-worker, the summary text file is only generated on chief
# (metrics aggregated), so only chief has to report the result.
if tf.io.gfile.exists(summary_path):
with tf.io.gfile.GFile(summary_path, 'rb') as reader:
summary = json.loads(reader.read().decode('utf-8'))
self._report_benchmark(summary, start_time_sec, wall_time_sec,
report_accuracy, ds_type)
def _report_benchmark(self, summary, start_time_sec, wall_time_sec,
report_accuracy, ds_type):
metrics = [{
'name': 'train_loss',
'value': summary['train_loss'],
}, {
'name':
'exp_per_second',
'value':
self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size *
FLAGS.steps_per_loop)
}, {
'name': 'startup_time',
'value': self.timer_callback.get_startup_time(start_time_sec)
}]
if report_accuracy:
if ds_type == 'tpu':
min_mlm_acc = MIN_MLM_ACCURACY
max_mlm_acc = MAX_MLM_ACCURACY
min_nsp_acc = MIN_NSP_ACCURACY
max_nsp_acc = MAX_NSP_ACCURACY
else:
min_mlm_acc = MIN_MLM_ACCURACY_GPU
max_mlm_acc = MAX_MLM_ACCURACY_GPU
min_nsp_acc = MIN_NSP_ACCURACY_GPU
max_nsp_acc = MAX_NSP_ACCURACY_GPU
metrics.extend([{
'name': 'masked_lm_accuracy',
'value': summary['masked_lm_accuracy'],
'min_value': min_mlm_acc,
'max_value': max_mlm_acc,
}, {
'name': 'next_sentence_accuracy',
'value': summary['next_sentence_accuracy'],
'min_value': min_nsp_acc,
'max_value': max_nsp_acc,
}])
self.report_benchmark(
iters=summary['total_training_steps'],
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_core.get_nondefault_flags_as_str()})
def _specify_common_flags(self):
FLAGS.bert_config_file = BERT_BASE_CONFIG_FILE
FLAGS.learning_rate = 1e-4
FLAGS.warmup_steps = 10000
FLAGS.steps_per_loop = 10000
FLAGS.input_files = BERT_PRETRAIN_FILES_SEQ128
FLAGS.max_seq_length = 128
FLAGS.max_predictions_per_seq = 20
def _specify_tpu_common_flags(self):
FLAGS.distribution_strategy = 'tpu'
FLAGS.dtype = 'bf16'
def _specify_gpu_common_flags(self):
FLAGS.distribution_strategy = 'mirrored'
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
@owner_utils.Owner('tf-model-garden')
def benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps(self):
"""Test bert pretraining with 8x8 TPU for 500k steps."""
# This is used for accuracy test.
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.train_batch_size = 512
FLAGS.num_steps_per_epoch = 500000
FLAGS.num_train_epochs = 1
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Set train_summary_interval to -1 to disable training summary, because
# writing summary to gcs may fail and summaries are not needed for this
# accuracy benchmark test.
FLAGS.train_summary_interval = -1
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=True,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_2x2_tpu_bf16_seq128_10k_steps(self):
"""Test bert pretraining with 2x2 TPU for 10000 steps."""
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.train_batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_2x2_tpu_bf16_seq128_10k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_2x2_tpu_bf16_seq128_10k_steps_mlir(self):
"""Test bert pretraining with 2x2 TPU with MLIR for 10000 steps."""
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.train_batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_2x2_tpu_bf16_seq128_10k_steps_mlir')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
tf.config.experimental.enable_mlir_bridge()
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_4x4_tpu_bf16_seq128_10k_steps(self):
"""Test bert pretraining with 4x4 TPU for 10000 steps."""
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.train_batch_size = 512
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_4x4_tpu_bf16_seq128_10k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_4x4_tpu_bf16_seq128_10k_steps_mlir(self):
"""Test bert pretraining with 4x4 TPU with MLIR for 10000 steps."""
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.train_batch_size = 512
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_4x4_tpu_bf16_seq128_10k_steps_mlir')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
tf.config.experimental.enable_mlir_bridge()
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_4x4_tpu_bf16_seq128_1k_steps(self):
"""Test bert pretraining with 4x4 TPU for 1000 steps."""
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.train_batch_size = 512
FLAGS.warmup_steps = 0
FLAGS.num_steps_per_epoch = 1000
FLAGS.num_train_epochs = 1
FLAGS.steps_per_loop = 500
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_4x4_tpu_bf16_seq128_1k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_8x8_tpu_bf16_seq128_10k_steps(self):
"""Test bert pretraining with 8x8 TPU for 10000 steps."""
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.train_batch_size = 512
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_8x8_tpu_bf16_seq128_10k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_8x16_tpu_bf16_seq128_1k_steps(self):
"""Test bert pretraining with 8x16 TPU for 1000 steps."""
self._setup()
self._specify_common_flags()
self._specify_tpu_common_flags()
FLAGS.train_batch_size = 4096
FLAGS.warmup_steps = 0
FLAGS.num_steps_per_epoch = 1000
FLAGS.num_train_epochs = 1
FLAGS.steps_per_loop = 500
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_8x16_tpu_bf16_seq128_1k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_accuracy_1x8_gpu_fp16_seq128_15k_steps(self):
"""Test bert pretraining with 8 GPU for 15k steps."""
# This is used for accuracy test.
self._setup()
self._specify_common_flags()
self._specify_gpu_common_flags()
FLAGS.num_gpus = 8
FLAGS.train_batch_size = 96
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 3
FLAGS.steps_per_loop = 5000
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_1x8_gpu_fp16_seq128_15k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Set train_summary_interval to -1 to disable training summary, because
# writing summary to gcs may fail and summaries are not needed for this
# accuracy benchmark test.
FLAGS.train_summary_interval = -1
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=True,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_perf_1x1_gpu_fp16_seq128_200_steps(self):
"""Test bert pretraining with 1 GPU for 200 steps."""
self._setup()
self._specify_common_flags()
self._specify_gpu_common_flags()
FLAGS.num_steps_per_epoch = 200
FLAGS.num_train_epochs = 1
FLAGS.num_gpus = 1
FLAGS.train_batch_size = 12
FLAGS.steps_per_loop = 100
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_1x1_gpu_fp16_seq128_200_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_perf_1x8_gpu_fp16_seq128_200_steps(self):
"""Test bert pretraining with 8 GPU for 200 steps."""
self._setup()
self._specify_common_flags()
self._specify_gpu_common_flags()
FLAGS.num_steps_per_epoch = 200
FLAGS.num_train_epochs = 1
FLAGS.num_gpus = 8
FLAGS.train_batch_size = 96
FLAGS.steps_per_loop = 100
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_1x8_gpu_fp16_seq128_200_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
class BertPretrainMultiWorkerBenchmark(BertPretrainAccuracyBenchmark):
"""Bert pretrain distributed benchmark tests with multiple workers."""
def __init__(self, output_dir=None, tpu=None, **kwargs):
super(BertPretrainMultiWorkerBenchmark, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
def _specify_gpu_mwms_flags(self):
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.all_reduce_alg = 'nccl'
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
FLAGS.num_gpus = 8
@owner_utils.Owner('tf-dist-strat')
def benchmark_accuracy_mwms_1x8_gpu_fp16_seq128_15k_steps(self):
"""Test bert pretraining with 8 GPU for 15k steps."""
# This is used for accuracy test.
self._setup()
self._specify_common_flags()
self._specify_gpu_mwms_flags()
FLAGS.train_batch_size = 96
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 3
FLAGS.steps_per_loop = 5000
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_mwms_1x8_gpu_fp16_seq128_15k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Set train_summary_interval to -1 to disable training summary, because
# writing summary to gcs may fail and summaries are not needed for this
# accuracy benchmark test.
FLAGS.train_summary_interval = -1
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=True,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_accuracy_mwms_2x8_gpu_fp16_seq128_15k_steps(self):
"""Test bert pretraining with 2x8 GPU for 15k steps."""
# This is used for accuracy test.
self._setup()
self._specify_common_flags()
self._specify_gpu_mwms_flags()
    # Use the same global batch size as the accuracy_mwms_1x8 benchmark.
FLAGS.train_batch_size = 96
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 3
FLAGS.steps_per_loop = 5000
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_mwms_2x8_gpu_fp16_seq128_15k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Set train_summary_interval to -1 to disable training summary, because
# writing summary to gcs may fail and summaries are not needed for this
# accuracy benchmark test.
FLAGS.train_summary_interval = -1
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=True,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_perf_mwms_1x8_gpu_fp16_seq128_200_steps(self):
"""Test bert pretraining with 1x8 GPU for 200 steps."""
self._setup()
self._specify_common_flags()
self._specify_gpu_mwms_flags()
FLAGS.num_steps_per_epoch = 200
FLAGS.num_train_epochs = 1
FLAGS.train_batch_size = 96 * 1
FLAGS.steps_per_loop = 100
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_mwms_1x8_gpu_fp16_seq128_200_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_perf_mwms_2x8_gpu_fp16_seq128_200_steps(self):
"""Test bert pretraining with 2x8 GPU for 200 steps."""
self._setup()
self._specify_common_flags()
self._specify_gpu_mwms_flags()
FLAGS.num_steps_per_epoch = 200
FLAGS.num_train_epochs = 1
FLAGS.train_batch_size = 96 * 2
FLAGS.steps_per_loop = 100
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_mwms_2x8_gpu_fp16_seq128_200_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_perf_mwms_8x8_gpu_fp16_seq128_200_steps(self):
"""Test bert pretraining with 8x8 GPU for 200 steps."""
self._setup()
self._specify_common_flags()
self._specify_gpu_mwms_flags()
FLAGS.num_steps_per_epoch = 200
FLAGS.num_train_epochs = 1
    FLAGS.train_batch_size = 96 * 8
FLAGS.steps_per_loop = 100
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_mwms_8x8_gpu_fp16_seq128_200_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path,
report_accuracy=False,
ds_type=FLAGS.distribution_strategy)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes BERT SQuAD benchmarks and accuracy tests."""
import json
import os
import time
from absl import flags
from absl import logging
from absl.testing import flagsaver
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import bert_benchmark_utils as benchmark_utils
from official.benchmark import owner_utils
from official.common import distribute_utils
from official.legacy.bert import run_squad
from official.utils.misc import keras_utils
# pylint: disable=line-too-long
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_model.ckpt'
SQUAD_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_train.tf_record'
SQUAD_PREDICT_FILE = 'gs://tf-perfzero-data/bert/squad/dev-v1.1.json'
SQUAD_VOCAB_FILE = 'gs://tf-perfzero-data/bert/squad/vocab.txt'
SQUAD_MEDIUM_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_medium_meta_data'
SQUAD_LONG_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_long_meta_data'
SQUAD_FULL_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_full_meta_data'
MODEL_CONFIG_FILE_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_config.json'
# pylint: enable=line-too-long
TMP_DIR = os.getenv('TMPDIR')
FLAGS = flags.FLAGS
class BertSquadBenchmarkBase(benchmark_utils.BertBenchmarkBase):
"""Base class to hold methods common to test classes in the module."""
def __init__(self, output_dir=None, tpu=None, **kwargs):
super(BertSquadBenchmarkBase, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
def _read_training_summary_from_file(self):
"""Reads the training summary from a file."""
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
with tf.io.gfile.GFile(summary_path, 'rb') as reader:
return json.loads(reader.read().decode('utf-8'))
def _read_input_meta_data_from_file(self):
"""Reads the input metadata from a file."""
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
return json.loads(reader.read().decode('utf-8'))
def _get_distribution_strategy(self, ds_type='mirrored'):
"""Gets the distribution strategy.
Args:
ds_type: String, the distribution strategy type to be used. Can be
'mirrored', 'multi_worker_mirrored', 'tpu' and 'off'.
Returns:
      A `tf.distribute.DistributionStrategy` object.
"""
if self.tpu or ds_type == 'tpu':
return distribute_utils.get_distribution_strategy(
distribution_strategy='tpu', tpu_address=self.tpu)
elif ds_type == 'multi_worker_mirrored':
# Configures cluster spec for multi-worker distribution strategy.
_ = distribute_utils.configure_cluster(FLAGS.worker_hosts,
FLAGS.task_index)
return distribute_utils.get_distribution_strategy(
distribution_strategy=ds_type,
num_gpus=self.num_gpus,
all_reduce_alg=FLAGS.all_reduce_alg)
def _init_gpu_and_data_threads(self):
"""Set env variables before any TF calls."""
if FLAGS.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=FLAGS.per_gpu_thread_count,
gpu_thread_mode=FLAGS.tf_gpu_thread_mode,
num_gpus=self.num_gpus,
datasets_num_private_threads=FLAGS.datasets_num_private_threads)
@flagsaver.flagsaver
def _train_squad(self, run_eagerly=False, ds_type='mirrored'):
"""Runs BERT SQuAD training. Uses mirrored strategy by default."""
self._init_gpu_and_data_threads()
input_meta_data = self._read_input_meta_data_from_file()
strategy = self._get_distribution_strategy(ds_type)
run_squad.train_squad(
strategy=strategy,
input_meta_data=input_meta_data,
run_eagerly=run_eagerly,
custom_callbacks=[self.timer_callback])
@flagsaver.flagsaver
def _evaluate_squad(self, ds_type='mirrored'):
"""Runs BERT SQuAD evaluation. Uses mirrored strategy by default."""
self._init_gpu_and_data_threads()
input_meta_data = self._read_input_meta_data_from_file()
strategy = self._get_distribution_strategy(ds_type)
if input_meta_data.get('version_2_with_negative', False):
      logging.error(
          'In-memory evaluation results for SQuAD v2 are not accurate')
eval_metrics = run_squad.eval_squad(
strategy=strategy, input_meta_data=input_meta_data)
# Use F1 score as reported evaluation metric.
self.eval_metrics = eval_metrics['final_f1']
class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
"""Short benchmark performance tests for BERT SQuAD model.
Tests BERT SQuAD performance in different GPU configurations.
  The test cases below follow the naming convention
  `benchmark_(number of gpus)_gpu` for GPUs and
  `benchmark_(topology)_tpu` for TPUs.
"""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
super(BertSquadBenchmarkReal, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
def _setup(self):
"""Sets up the benchmark and SQuAD flags."""
super(BertSquadBenchmarkReal, self)._setup()
FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
FLAGS.predict_file = SQUAD_PREDICT_FILE
FLAGS.vocab_file = SQUAD_VOCAB_FILE
FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
FLAGS.num_train_epochs = 1
FLAGS.steps_per_loop = 100
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, run_eagerly=False, ds_type='mirrored'):
"""Runs the benchmark and reports various metrics."""
if FLAGS.train_batch_size <= 4 or run_eagerly:
FLAGS.input_meta_data_path = SQUAD_MEDIUM_INPUT_META_DATA_PATH
else:
FLAGS.input_meta_data_path = SQUAD_LONG_INPUT_META_DATA_PATH
start_time_sec = time.time()
self._train_squad(run_eagerly=run_eagerly, ds_type=ds_type)
wall_time_sec = time.time() - start_time_sec
summary = self._read_training_summary_from_file()
summary['start_time_sec'] = start_time_sec
super(BertSquadBenchmarkReal, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=0,
max_accuracy=1)
def benchmark_1_gpu(self):
"""Tests BERT SQuAD model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad')
FLAGS.train_batch_size = 4
self._run_and_report_benchmark()
def benchmark_1_gpu_eager(self):
"""Tests BERT SQuAD model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_eager')
FLAGS.train_batch_size = 2
self._run_and_report_benchmark(run_eagerly=True)
def benchmark_1_gpu_xla(self):
"""Tests BERT SQuAD model performance with 1 GPU with XLA."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla_squad')
# XLA runs out of memory when running with batch size 4.
FLAGS.train_batch_size = 3
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat(self):
"""Tests BERT SQuAD model performance with 1 GPU without DS."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat_squad')
FLAGS.train_batch_size = 4
self._run_and_report_benchmark(ds_type='off')
def benchmark_1_gpu_eager_no_dist_strat(self):
"""Tests BERT SQuAD model performance with 1 GPU with eager execution."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_eager_no_dist_strat_squad')
FLAGS.train_batch_size = 4
self._run_and_report_benchmark(ds_type='off', run_eagerly=True)
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu(self):
"""Tests BERT SQuAD model performance with 8 GPUs."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
FLAGS.train_batch_size = 24
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_eager(self):
"""Tests BERT SQuAD model performance with 1 GPU and FP16."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_fp16_eager')
FLAGS.train_batch_size = 4
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark(run_eagerly=True)
def benchmark_1_gpu_fp16(self):
"""Tests BERT SQuAD model performance with 1 GPU and FP16."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_fp16')
FLAGS.train_batch_size = 4
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_1_gpu_xla_fp16(self):
"""Tests BERT SQuAD model performance with 1 GPU with XLA and FP16."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla_squad_fp16')
FLAGS.train_batch_size = 4
FLAGS.enable_xla = True
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Tests BERT SQuAD model performance with 8 GPUs."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_fp16')
FLAGS.train_batch_size = 32
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_8_gpu_xla_fp16(self):
"""Tests BERT SQuAD model performance with 8 GPUs with XLA."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_fp16')
FLAGS.train_batch_size = 32
FLAGS.enable_xla = True
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_8_gpu_xla_tf32(self):
"""Tests BERT SQuAD model performance with 8 GPUs with XLA using TF32."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_xla_tf32')
FLAGS.train_batch_size = 32
FLAGS.enable_xla = True
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_8_gpu_xla_fp32_no_tf32(self):
"""Tests BERT SQuAD model performance with 8 GPUs with XLA using FP32."""
self._setup()
tf.config.experimental.enable_tensor_float_32_execution(False)
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_xla_fp32_no_tf32')
FLAGS.train_batch_size = 32
FLAGS.enable_xla = True
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu(self):
"""Tests BERT SQuAD model performance with 2x2 TPU."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu')
FLAGS.train_batch_size = 48
FLAGS.predict_batch_size = 48
FLAGS.mode = 'train'
FLAGS.learning_rate = 8e-5
FLAGS.num_train_epochs = 1
FLAGS.steps_per_loop = 100
FLAGS.do_lower_case = True
FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
self._run_and_report_benchmark()
class BertSquadAccuracy(BertSquadBenchmarkBase):
"""Short accuracy test for BERT SQuAD model.
  Tests BERT SQuAD accuracy. The test cases below follow the naming convention
  `benchmark_(number of gpus)_gpu` for GPUs and
  `benchmark_(topology)_tpu` for TPUs.
"""
def __init__(self, output_dir=None, tpu=None, **kwargs):
super(BertSquadAccuracy, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
def _setup(self):
"""Sets up the benchmark and SQuAD flags."""
super(BertSquadAccuracy, self)._setup()
FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
FLAGS.predict_file = SQUAD_PREDICT_FILE
FLAGS.vocab_file = SQUAD_VOCAB_FILE
FLAGS.input_meta_data_path = SQUAD_FULL_INPUT_META_DATA_PATH
FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
FLAGS.num_train_epochs = 2
FLAGS.steps_per_loop = 100
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, run_eagerly=False, ds_type='mirrored'):
"""Runs the benchmark and reports various metrics."""
start_time_sec = time.time()
self._train_squad(run_eagerly=run_eagerly, ds_type=ds_type)
self._evaluate_squad(ds_type=ds_type)
wall_time_sec = time.time() - start_time_sec
summary = self._read_training_summary_from_file()
summary['eval_metrics'] = self.eval_metrics
summary['start_time_sec'] = start_time_sec
super(BertSquadAccuracy, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=0.900,
max_accuracy=0.920)
def benchmark_1_gpu_eager(self):
"""Tests BERT SQuAD model accuracy with 1 GPU with eager execution."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_eager')
FLAGS.train_batch_size = 4
self._run_and_report_benchmark(ds_type='off', run_eagerly=True)
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu(self):
"""Tests BERT SQuAD model accuracy with 8 GPUs."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
FLAGS.train_batch_size = 24
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Tests BERT SQuAD model accuracy with 8 GPUs and FP16."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_fp16')
FLAGS.train_batch_size = 32
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_8_gpu_xla(self):
"""Tests BERT SQuAD model accuracy with 8 GPUs."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_xla')
FLAGS.train_batch_size = 32
FLAGS.enable_xla = True
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu(self):
"""Tests BERT SQuAD model accuracy with 2x2 TPU."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu')
FLAGS.train_batch_size = 48
self._run_and_report_benchmark()
class BertSquadMultiWorkerAccuracy(BertSquadBenchmarkBase):
"""BERT SQuAD distributed accuracy tests with multiple workers."""
def __init__(self, output_dir=None, tpu=None, **kwargs):
super(BertSquadMultiWorkerAccuracy, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
def _setup(self):
"""Sets up the benchmark and SQuAD flags."""
super(BertSquadMultiWorkerAccuracy, self)._setup()
FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
FLAGS.predict_file = SQUAD_PREDICT_FILE
FLAGS.vocab_file = SQUAD_VOCAB_FILE
FLAGS.input_meta_data_path = SQUAD_FULL_INPUT_META_DATA_PATH
FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
FLAGS.num_train_epochs = 2
FLAGS.steps_per_loop = 100
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, use_ds=True, run_eagerly=False):
"""Runs the benchmark and reports various metrics."""
start_time_sec = time.time()
self._train_squad(run_eagerly=run_eagerly, ds_type='multi_worker_mirrored')
self._evaluate_squad(ds_type='multi_worker_mirrored')
wall_time_sec = time.time() - start_time_sec
summary = self._read_training_summary_from_file()
summary['eval_metrics'] = self.eval_metrics
super(BertSquadMultiWorkerAccuracy, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=0.900,
max_accuracy=0.920)
def _benchmark_common(self, num_workers, all_reduce_alg):
"""Common to all benchmarks in this class."""
self._setup()
num_gpus = 8
FLAGS.num_gpus = num_gpus
FLAGS.dtype = 'fp16'
FLAGS.enable_xla = False
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 32
FLAGS.model_dir = self._get_model_dir(
'benchmark_8_gpu_{}_worker_fp16_{}_tweaked'.format(
num_workers, all_reduce_alg))
FLAGS.train_batch_size = 4 * num_gpus * num_workers
FLAGS.all_reduce_alg = all_reduce_alg
self._run_and_report_benchmark()
def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
"""8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(num_workers=2, all_reduce_alg='ring')
def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(num_workers=2, all_reduce_alg='nccl')
def benchmark_8_gpu_8_workers_fp16_ring_tweaked(self):
"""8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(num_workers=8, all_reduce_alg='ring')
def benchmark_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(num_workers=8, all_reduce_alg='nccl')
class BertSquadMultiWorkerBenchmark(BertSquadBenchmarkBase):
"""BERT SQuAD distributed benchmark tests with multiple workers."""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
super(BertSquadMultiWorkerBenchmark, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
def _setup(self):
"""Sets up the benchmark and SQuAD flags."""
super(BertSquadMultiWorkerBenchmark, self)._setup()
FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
FLAGS.predict_file = SQUAD_PREDICT_FILE
FLAGS.vocab_file = SQUAD_VOCAB_FILE
FLAGS.input_meta_data_path = SQUAD_FULL_INPUT_META_DATA_PATH
FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
FLAGS.num_train_epochs = 1
FLAGS.steps_per_loop = 100
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, use_ds=True, run_eagerly=False):
"""Runs the benchmark and reports various metrics."""
if FLAGS.train_batch_size <= 4 * 8:
FLAGS.input_meta_data_path = SQUAD_LONG_INPUT_META_DATA_PATH
else:
FLAGS.input_meta_data_path = SQUAD_FULL_INPUT_META_DATA_PATH
start_time_sec = time.time()
self._train_squad(run_eagerly=run_eagerly, ds_type='multi_worker_mirrored')
wall_time_sec = time.time() - start_time_sec
summary = self._read_training_summary_from_file()
summary['start_time_sec'] = start_time_sec
super(BertSquadMultiWorkerBenchmark, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=0,
max_accuracy=1)
def _benchmark_common(self, num_workers, all_reduce_alg):
"""Common to all benchmarks in this class."""
self._setup()
num_gpus = 8
FLAGS.num_gpus = num_gpus
FLAGS.dtype = 'fp16'
FLAGS.enable_xla = False
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 32
FLAGS.model_dir = self._get_model_dir(
'benchmark_8_gpu_{}_worker_fp16_{}_tweaked'.format(
num_workers, all_reduce_alg))
FLAGS.train_batch_size = 4 * num_gpus * num_workers
FLAGS.all_reduce_alg = all_reduce_alg
self._run_and_report_benchmark()
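  # Note: the global batch size set in _benchmark_common above scales as
  # 4 * num_gpus * num_workers; e.g. 8 GPUs per worker and 2 workers give a
  # global batch size of 64.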
def benchmark_8_gpu_1_worker_fp16_ring_tweaked(self):
"""8 GPUs per worker, 1 worker, fp16, ring all-reduce."""
self._benchmark_common(num_workers=1, all_reduce_alg='ring')
def benchmark_8_gpu_1_worker_fp16_nccl_tweaked(self):
"""8 GPUs per worker, 1 worker, fp16, nccl all-reduce."""
self._benchmark_common(num_workers=1, all_reduce_alg='nccl')
def benchmark_8_gpu_2_workers_fp16_ring_tweaked(self):
"""8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(num_workers=2, all_reduce_alg='ring')
def benchmark_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(num_workers=2, all_reduce_alg='nccl')
def benchmark_8_gpu_8_workers_fp16_ring_tweaked(self):
"""8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(num_workers=8, all_reduce_alg='ring')
def benchmark_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(num_workers=8, all_reduce_alg='nccl')
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks config utils."""
import os
from typing import Optional
def get_config_path(
config_file: str,
base_dir: Optional[str] = None) -> str:
"""Gets the absolute path of the config file."""
resolved_base_dir = '' if base_dir is None else base_dir
return os.path.join(resolved_base_dir, config_file)
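# A small usage sketch of get_config_path (the paths are illustrative only):
#
#   get_config_path('configs/bert_base.yaml', base_dir='/workspace')
#   # -> '/workspace/configs/bert_base.yaml'
#   get_config_path('configs/bert_base.yaml')
#   # -> 'configs/bert_base.yaml'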
[
{
"description": "The ID of the benchmark run, where this metric should tie to.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The name of the metric, which should be descriptive. E.g. training_loss, accuracy.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The unit of the metric. E.g. MB per sec.",
"mode": "NULLABLE",
"name": "unit",
"type": "STRING"
},
{
"description": "The value of the metric.",
"mode": "NULLABLE",
"name": "value",
"type": "FLOAT"
},
{
"description": "The timestamp when the metric is recorded.",
"mode": "REQUIRED",
"name": "timestamp",
"type": "TIMESTAMP"
},
{
"description": "The global step when this metric is recorded.",
"mode": "NULLABLE",
"name": "global_step",
"type": "INTEGER"
},
{
"description": "Free format metadata for the extra information about the metric.",
"mode": "REPEATED",
"name": "extras",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "name",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
]
}
]
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "model_id",
"type": "STRING"
},
{
"description": "The name of the model, E.g ResNet50, LeNet-5 etc.",
"mode": "REQUIRED",
"name": "model_name",
"type": "STRING"
},
{
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "TIMESTAMP"
},
{
"description": "The unique name for a test by the combination of key parameters, eg batch size, num of GPU, etc. It is hardware independent.",
"mode": "NULLABLE",
"name": "test_id",
"type": "STRING"
},
{
"description": "The tensorflow version information.",
"fields": [
{
"description": "Version of the tensorflow. E.g. 1.7.0-rc0",
"mode": "REQUIRED",
"name": "version",
"type": "STRING"
},
{
"description": "Git Hash of the tensorflow",
"mode": "NULLABLE",
"name": "git_hash",
"type": "STRING"
},
{
"description": "The channel of the tensorflow binary, eg, nightly, RC, final, custom.",
"mode": "NULLABLE",
"name": "channel",
"type": "STRING"
},
{
"description": "Identify anything special about the build, eg CUDA 10, NCCL, MKL, etc.",
"mode": "NULLABLE",
"name": "build_type",
"type": "STRING"
}
],
"mode": "REQUIRED",
"name": "tensorflow_version",
"type": "RECORD"
},
{
"description": "The arbitrary attribute of the model.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
},
{
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
"description": "The list of parameters run with the model. It could contain hyperparameters or others.",
"fields": [
{
"description": "The name of the parameter.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The string value of the parameter.",
"mode": "NULLABLE",
"name": "string_value",
"type": "STRING"
},
{
"description": "The bool value of the parameter.",
"mode": "NULLABLE",
"name": "bool_value",
"type": "STRING"
},
{
"description": "The int/long value of the parameter.",
"mode": "NULLABLE",
"name": "long_value",
"type": "INTEGER"
},
{
"description": "The double/float value of parameter.",
"mode": "NULLABLE",
"name": "float_value",
"type": "FLOAT"
}
],
"mode": "REPEATED",
"name": "run_parameters",
"type": "RECORD"
},
{
"description": "The dataset that run with the benchmark.",
"mode": "NULLABLE",
"name": "dataset",
"type": "RECORD",
"fields": [
{
"description": "The name of the dataset that the model is trained/validated with. E.g ImageNet, mnist.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the dataset.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"description": "Used to differentiate from AWS, GCE or DGX-1 at a high level",
"mode": "NULLABLE",
"name": "test_environment",
"type": "STRING"
},
{
"description": "The machine configuration of the benchmark run.",
"mode": "NULLABLE",
"name": "machine_config",
"type": "RECORD",
"fields": [
{
"description": "The platform information of the benchmark run.",
"mode": "NULLABLE",
"name": "platform_info",
"type": "RECORD",
"fields": [
{
"description": "Eg: 64bit.",
"mode": "NULLABLE",
"name": "bits",
"type": "STRING"
},
{
"description": "Eg: ELF.",
"mode": "NULLABLE",
"name": "linkage",
"type": "STRING"
},
{
"description": "Eg: i386.",
"mode": "NULLABLE",
"name": "machine",
"type": "STRING"
},
{
"description": "Eg: 3.13.0-76-generic.",
"mode": "NULLABLE",
"name": "release",
"type": "STRING"
},
{
"description": "Eg: Linux.",
"mode": "NULLABLE",
"name": "system",
"type": "STRING"
},
{
"description": "Eg: #120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016.",
"mode": "NULLABLE",
"name": "version",
"type": "STRING"
}
]
},
{
"description": "The CPU information of the benchmark run.",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "num_cores",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "num_cores_allowed",
"type": "INTEGER"
},
{
"description" : "How fast are those CPUs.",
"mode": "NULLABLE",
"name": "mhz_per_cpu",
"type": "FLOAT"
},
{
"description" : "Additional CPU info, Eg: Intel Ivybridge with HyperThreading (24 cores).",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "STRING"
},
{
"description" : "What kind of cpu scaling is enabled on the host. Eg performance, ondemand, conservative, mixed.",
"mode": "NULLABLE",
"name": "cpu_governor",
"type": "STRING"
},
{
"description": "Cache size of the CPUs.",
"mode": "NULLABLE",
"name": "cache_size",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "level",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "size",
"type": "INTEGER"
}
]
}
]
},
{
"mode": "NULLABLE",
"name": "gpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "count",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "model",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "cuda_version",
"type": "STRING"
}
]
},
{
"description": "The cloud instance inforation if the benchmark run is executed on cloud",
"mode": "NULLABLE",
"name": "cloud_info",
"type": "RECORD",
"fields": [
{
"description": "The instance type, E.g. n1-standard-4.",
"mode": "NULLABLE",
"name": "instance_type",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the cloud info.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"mode": "NULLABLE",
"name": "memory_total",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "memory_available",
"type": "STRING"
}
]
}
]
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The status of the run for the benchmark. Eg, running, failed, success",
"mode": "REQUIRED",
"name": "status",
"type": "STRING"
}
]
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.utils.flags import core as flags_core
class KerasBenchmark(PerfZeroBenchmark):
"""Base benchmark class with methods to simplify testing."""
def __init__(self,
output_dir=None,
default_flags=None,
flag_methods=None,
tpu=None):
super(KerasBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods,
tpu=tpu)
def _report_benchmark(self,
stats,
wall_time_sec,
top_1_max=None,
top_1_min=None,
log_steps=None,
total_batch_size=None,
warmup=1,
start_time_sec=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from keras models with known entries.
      wall_time_sec: the duration of the benchmark execution in seconds.
top_1_max: highest passing level for top_1 accuracy.
top_1_min: lowest passing level for top_1 accuracy.
      log_steps: interval, in steps, at which entries in
        stats['step_timestamp_log'] were recorded.
total_batch_size: Global batch-size.
warmup: number of entries in stats['step_timestamp_log'] to ignore.
      start_time_sec: the start time of the program in seconds since the epoch.
"""
metrics = []
if 'accuracy_top_1' in stats:
metrics.append({
'name': 'accuracy_top_1',
'value': stats['accuracy_top_1'],
'min_value': top_1_min,
'max_value': top_1_max
})
metrics.append({
'name': 'top_1_train_accuracy',
'value': stats['training_accuracy_top_1']
})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup):
      # The first entry in the time_log is the start of step 1. The rest of
      # the entries are the end of each recorded step.
time_log = stats['step_timestamp_log']
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
if 'avg_exp_per_second' in stats:
metrics.append({
'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']
})
if start_time_sec and 'step_timestamp_log' in stats:
time_log = stats['step_timestamp_log']
# time_log[0] is recorded at the beginning of the first step.
startup_time = time_log[0].timestamp - start_time_sec
metrics.append({'name': 'startup_time', 'value': startup_time})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
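# Worked example of the exp_per_second computation in _report_benchmark above
# (hypothetical numbers, not from a real run): with log_steps=100,
# total_batch_size=1024, warmup=1 and a step_timestamp_log of 12 entries whose
# last timestamp is 220 seconds after the entry at index `warmup`:
#   num_examples = 1024 * 100 * (12 - 1 - 1) = 1,024,000
#   exp_per_second = 1,024,000 / 220 ~= 4,654 examples per second.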
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf
from official.benchmark import keras_benchmark
from official.benchmark import benchmark_wrappers
from official.benchmark.models import resnet_cifar_main
MIN_TOP_1_ACCURACY = 0.929
MAX_TOP_1_ACCURACY = 0.938
FLAGS = flags.FLAGS
CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
"""Accuracy tests for ResNet56 Keras CIFAR-10."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""A benchmark class.
Args:
      output_dir: directory where log files and other outputs are written.
      root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more named
arguments before updating the constructor.
"""
self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def _setup(self):
super(Resnet56KerasAccuracy, self)._setup()
FLAGS.use_tensor_lr = False
def benchmark_graph_1_gpu(self):
"""Test keras based model with Keras fit and distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
def benchmark_cpu(self):
"""Test keras based model on CPU."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat(self):
"""Test keras based model on CPU without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_run_eagerly(self):
"""Test keras based model on CPU w/forced eager and no dist_strat."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat(self):
"""Test keras based model with eager and no dist strat."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test keras based model w/forced eager and no dist_strat."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_graph_1_gpu_no_dist_strat(self):
"""Test keras based model with Keras fit but not distribution strategies."""
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
def benchmark_graph_2_gpu(self):
"""Test keras based model with Keras fit and distribution strategies."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY,
total_batch_size=FLAGS.batch_size,
log_steps=100)
class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Short performance tests for ResNet56 via Keras and CIFAR-10."""
def __init__(self, output_dir=None, default_flags=None):
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu(self):
"""Test 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_xla(self):
"""Test 1 gpu with xla enabled."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_graph_1_gpu(self):
"""Test 1 gpu graph."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat(self):
"""Test 1 gpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_graph_1_gpu_no_dist_strat(self):
"""Test 1 gpu graph mode without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test 1 gpu without distribution strategy and forced eager."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test 2 gpu."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
def benchmark_graph_2_gpu(self):
"""Test 2 gpu graph mode."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
def benchmark_cpu(self):
"""Test cpu."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_graph_cpu(self):
"""Test cpu graph mode."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_run_eagerly(self):
"""Test cpu without distribution strategy and forced eager."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.distribution_strategy = 'off'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_run_eagerly')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat(self):
"""Test cpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_graph_cpu_no_dist_strat(self):
"""Test cpu graph mode without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = False
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu_no_dist_strat')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
class Resnet56KerasBenchmarkSynth(Resnet56KerasBenchmarkBase):
"""Synthetic benchmarks for ResNet56 and Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
default_flags = {}
default_flags['skip_eval'] = True
default_flags['use_synthetic_data'] = True
default_flags['train_steps'] = 110
default_flags['log_steps'] = 10
default_flags['use_tensor_lr'] = False
super(Resnet56KerasBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=default_flags)
class Resnet56KerasBenchmarkReal(Resnet56KerasBenchmarkBase):
"""Real data benchmarks for ResNet56 and Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
default_flags = {}
default_flags['skip_eval'] = True
default_flags['data_dir'] = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
default_flags['train_steps'] = 110
default_flags['log_steps'] = 10
default_flags['use_tensor_lr'] = False
super(Resnet56KerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=default_flags)
if __name__ == '__main__':
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
# pylint: disable=line-too-long
from __future__ import print_function
import json
import os
import time
from typing import Any, MutableMapping, Optional
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark import benchmark_wrappers
from official.benchmark import keras_benchmark
from official.benchmark.models import resnet_imagenet_main
from official.legacy.image_classification import classifier_trainer
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77
MOBILENET_V1_MIN_TOP_1_ACCURACY = 0.65
MOBILENET_V1_MAX_TOP_1_ACCURACY = 0.68
# Range of top-1 accuracies for model optimization techniques.
# Each item indicates (MIN_TOP_1_ACCURACY, MAX_TOP_1_ACCURACY).
MODEL_OPTIMIZATION_TOP_1_ACCURACY = {
'RESNET50_FINETUNE_PRUNING': (0.76, 0.77),
'MOBILENET_V1_FINETUNE_PRUNING': (0.67, 0.68),
'MOBILENET_V1_FINETUNE_CLUSTERING': (0.68, 0.70)
}
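# A benchmark exercising one of these techniques would look up its accuracy
# bounds as, e.g. (illustrative only):
#   top_1_min, top_1_max = MODEL_OPTIMIZATION_TOP_1_ACCURACY[
#       'RESNET50_FINETUNE_PRUNING']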
FLAGS = flags.FLAGS
def _get_classifier_parameters(
model_variant: Optional[str] = None,
num_gpus: int = 0,
builder: str = 'records',
skip_eval: bool = False,
distribution_strategy: str = 'mirrored',
per_replica_batch_size: int = 128,
epochs: int = 90,
steps: int = 0,
epochs_between_evals: int = 1,
dtype: str = 'float32',
enable_xla: bool = False,
run_eagerly: bool = False,
gpu_thread_mode: Optional[str] = None,
dataset_num_private_threads: Optional[int] = None,
loss_scale: Optional[str] = None,
report_metrics: bool = True,
batchnorm_spatial_persistent: bool = False) -> MutableMapping[str, Any]:
"""Gets classifier trainer's ResNet parameters."""
params = {
'runtime': {
'num_gpus': num_gpus,
'distribution_strategy': distribution_strategy,
'run_eagerly': run_eagerly,
'enable_xla': enable_xla,
'dataset_num_private_threads': dataset_num_private_threads,
'gpu_thread_mode': gpu_thread_mode,
'loss_scale': loss_scale,
'batchnorm_spatial_persistent': batchnorm_spatial_persistent,
},
'train_dataset': {
'builder': builder,
'use_per_replica_batch_size': True,
'batch_size': per_replica_batch_size,
'image_size': 224,
'dtype': dtype,
},
'validation_dataset': {
'builder': builder,
'batch_size': per_replica_batch_size,
'use_per_replica_batch_size': True,
'image_size': 224,
'dtype': dtype,
},
'train': {
'epochs': epochs,
'steps': steps,
'callbacks': {
'enable_tensorboard': False,
'enable_checkpoint_and_export': False,
'enable_time_history': True,
},
'metrics': ['accuracy'] if report_metrics else [],
},
'model': {
'loss': {
'label_smoothing': 0.1,
},
},
'evaluation': {
'epochs_between_evals': epochs_between_evals,
'skip_eval': skip_eval,
},
}
if model_variant is not None:
params['model']['model_params'] = {
'model_name': model_variant,
}
return params
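# Illustrative use of the helper above (a sketch mirroring how the
# _run_and_report_benchmark methods below wire it up; the argument values here
# are hypothetical):
#   params = _get_classifier_parameters(num_gpus=8, dtype='float16',
#                                       per_replica_batch_size=256)
#   FLAGS.params_override = json.dumps(params)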
class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for ResNet50 in Keras."""
def __init__(self,
output_dir: Optional[str] = None,
root_data_dir: Optional[str] = None,
**kwargs):
"""A benchmark class.
Args:
      output_dir: directory where log files and other outputs are written.
      root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [classifier_trainer.define_classifier_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(Resnet50KerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(
self,
experiment_name: str,
top_1_min: float = MIN_TOP_1_ACCURACY,
top_1_max: float = MAX_TOP_1_ACCURACY,
num_gpus: int = 0,
distribution_strategy: str = 'mirrored',
per_replica_batch_size: int = 128,
epochs: int = 90,
steps: int = 0,
epochs_between_evals: int = 1,
dtype: str = 'float32',
enable_xla: bool = False,
run_eagerly: bool = False,
gpu_thread_mode: Optional[str] = None,
dataset_num_private_threads: Optional[int] = None,
loss_scale: Optional[str] = None):
"""Runs and reports the benchmark given the provided configuration."""
FLAGS.model_type = 'resnet'
FLAGS.dataset = 'imagenet'
FLAGS.mode = 'train_and_eval'
FLAGS.data_dir = self.data_dir
FLAGS.model_dir = self._get_model_dir(experiment_name)
parameters = _get_classifier_parameters(
num_gpus=num_gpus,
distribution_strategy=distribution_strategy,
per_replica_batch_size=per_replica_batch_size,
epochs=epochs,
steps=steps,
epochs_between_evals=epochs_between_evals,
dtype=dtype,
enable_xla=enable_xla,
run_eagerly=run_eagerly,
gpu_thread_mode=gpu_thread_mode,
dataset_num_private_threads=dataset_num_private_threads,
report_metrics=True,
loss_scale=loss_scale,
batchnorm_spatial_persistent=True)
FLAGS.params_override = json.dumps(parameters)
total_batch_size = num_gpus * per_replica_batch_size
start_time_sec = time.time()
stats = classifier_trainer.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet50KerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=total_batch_size,
log_steps=100)
def benchmark_8_gpu(self):
"""Tests Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu',
num_gpus=8,
per_replica_batch_size=128,
epochs=90,
epochs_between_evals=10,
dtype='float32')
def benchmark_8_gpu_fp16(self):
"""Tests Keras model with eager, dist_strat, 8 GPUs, and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16',
num_gpus=8,
per_replica_batch_size=256,
epochs=90,
epochs_between_evals=10,
dtype='float16')
def benchmark_xla_8_gpu_fp16(self):
"""Tests Keras model with XLA, eager, dist_strat, 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16',
num_gpus=8,
per_replica_batch_size=256,
epochs=90,
epochs_between_evals=10,
dtype='float16',
enable_xla=True)
def benchmark_xla_8_gpu_fp16_dynamic(self):
"""Tests Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_dynamic',
top_1_min=0.736,
num_gpus=8,
per_replica_batch_size=256,
epochs=90,
epochs_between_evals=10,
dtype='float16',
loss_scale='dynamic')
def _get_model_dir(self, folder_name):
return os.path.join(self.output_dir, folder_name)
class MobilenetV1KerasAccuracy(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for MobilenetV1 in Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""A benchmark class.
Args:
      output_dir: directory where log files and other outputs are written.
      root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(MobilenetV1KerasAccuracy, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags={
'model': 'mobilenet',
'optimizer': 'mobilenet_default',
'initial_learning_rate_per_sample': 0.00039,
})
def benchmark_8_gpu(self):
"""Test Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_min=MOBILENET_V1_MIN_TOP_1_ACCURACY,
top_1_max=MOBILENET_V1_MAX_TOP_1_ACCURACY):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(MobilenetV1KerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=FLAGS.batch_size,
log_steps=100)
def _get_model_dir(self, folder_name):
return os.path.join(self.output_dir, folder_name)
class KerasClassifierBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Classifier Trainer benchmarks."""
def __init__(self, model, output_dir=None, default_flags=None,
tpu=None, dataset_builder='records', train_epochs=1,
train_steps=110, data_dir=None):
flag_methods = [classifier_trainer.define_classifier_flags]
self.model = model
self.dataset_builder = dataset_builder
self.train_epochs = train_epochs
self.train_steps = train_steps
self.data_dir = data_dir
super(KerasClassifierBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(
self,
experiment_name: str,
model_variant: Optional[str] = None,
skip_steps: Optional[int] = None,
top_1_min: float = MIN_TOP_1_ACCURACY,
top_1_max: float = MAX_TOP_1_ACCURACY,
num_gpus: int = 0,
num_tpus: int = 0,
distribution_strategy: str = 'mirrored',
per_replica_batch_size: int = 128,
epochs_between_evals: int = 1,
dtype: str = 'float32',
enable_xla: bool = False,
run_eagerly: bool = False,
gpu_thread_mode: Optional[str] = None,
dataset_num_private_threads: Optional[int] = None,
loss_scale: Optional[str] = None):
"""Runs and reports the benchmark given the provided configuration."""
FLAGS.model_type = self.model
FLAGS.dataset = 'imagenet'
FLAGS.mode = 'train_and_eval'
FLAGS.data_dir = self.data_dir
FLAGS.model_dir = self._get_model_dir(experiment_name)
parameters = _get_classifier_parameters(
model_variant=model_variant,
builder=self.dataset_builder,
skip_eval=True,
num_gpus=num_gpus,
distribution_strategy=distribution_strategy,
per_replica_batch_size=per_replica_batch_size,
epochs=self.train_epochs,
steps=self.train_steps,
epochs_between_evals=epochs_between_evals,
dtype=dtype,
enable_xla=enable_xla,
gpu_thread_mode=gpu_thread_mode,
dataset_num_private_threads=dataset_num_private_threads,
loss_scale=loss_scale,
report_metrics=False,
batchnorm_spatial_persistent=True)
FLAGS.params_override = json.dumps(parameters)
if distribution_strategy == 'tpu':
total_batch_size = num_tpus * per_replica_batch_size
else:
total_batch_size = num_gpus * per_replica_batch_size
start_time_sec = time.time()
stats = classifier_trainer.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
    # Number of logged step time entries to exclude from the performance
    # report. We keep results from the last 100 batches, or skip steps based
    # on the input skip_steps.
warmup = (skip_steps or (self.train_steps - 100)) // FLAGS.log_steps
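    # For example, with train_steps=110 and log_steps=10 (the defaults used by
    # the Synth/Real subclasses below) and no skip_steps, this gives
    # warmup = (110 - 100) // 10 = 1, so the first logged interval is dropped
    # and exp_per_second reflects only the last 100 steps.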
super(KerasClassifierBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=total_batch_size,
log_steps=FLAGS.log_steps,
warmup=warmup,
start_time_sec=start_time_sec)
def benchmark_1_gpu_no_dist_strat(self):
"""Tests Keras model with 1 GPU, no distribution strategy."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_no_dist_strat',
num_gpus=1,
distribution_strategy='off',
per_replica_batch_size=128)
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Tests Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly',
num_gpus=1,
run_eagerly=True,
distribution_strategy='off',
per_replica_batch_size=64)
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
"""Tests with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly_fp16',
num_gpus=1,
run_eagerly=True,
distribution_strategy='off',
dtype='float16',
per_replica_batch_size=128)
def benchmark_1_gpu(self):
"""Tests Keras model with 1 GPU."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu',
num_gpus=1,
distribution_strategy='one_device',
per_replica_batch_size=128)
def benchmark_xla_1_gpu(self):
"""Tests Keras model with XLA and 1 GPU."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
per_replica_batch_size=128)
def benchmark_1_gpu_fp16(self):
"""Tests Keras model with 1 GPU and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_fp16',
num_gpus=1,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256)
def benchmark_1_gpu_fp16_dynamic(self):
"""Tests Keras model with 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_1_gpu_fp16_dynamic',
num_gpus=1,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256,
loss_scale='dynamic')
def benchmark_xla_1_gpu_fp16(self):
"""Tests Keras model with XLA, 1 GPU and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu_fp16',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256)
def benchmark_xla_1_gpu_fp16_tweaked(self):
"""Tests Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu_fp16_tweaked',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private')
def benchmark_xla_1_gpu_fp16_dynamic(self):
"""Tests Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_1_gpu_fp16_dynamic',
num_gpus=1,
enable_xla=True,
distribution_strategy='one_device',
dtype='float16',
per_replica_batch_size=256,
loss_scale='dynamic')
def benchmark_8_gpu(self):
"""Tests Keras model with 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu',
num_gpus=8,
distribution_strategy='mirrored',
per_replica_batch_size=128)
def benchmark_8_gpu_tweaked(self):
"""Tests Keras model with manual config tuning and 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_tweaked',
num_gpus=8,
distribution_strategy='mirrored',
per_replica_batch_size=128,
dataset_num_private_threads=14)
def benchmark_xla_8_gpu(self):
"""Tests Keras model with XLA and 8 GPUs."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=128)
def benchmark_xla_8_gpu_tweaked(self):
"""Tests Keras model with manual config tuning, 8 GPUs, and XLA."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_tweaked',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=128,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=24)
def benchmark_8_gpu_fp16(self):
"""Tests Keras model with 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16',
num_gpus=8,
dtype='float16',
distribution_strategy='mirrored',
per_replica_batch_size=256)
def benchmark_8_gpu_fp16_tweaked(self):
"""Tests Keras model with 8 GPUs, fp16, and manual config tuning."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16_tweaked',
num_gpus=8,
dtype='float16',
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=40)
def benchmark_8_gpu_fp16_dynamic_tweaked(self):
"""Tests Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8_gpu_fp16_dynamic_tweaked',
num_gpus=8,
dtype='float16',
distribution_strategy='mirrored',
per_replica_batch_size=256,
loss_scale='dynamic',
gpu_thread_mode='gpu_private',
dataset_num_private_threads=40)
def benchmark_xla_8_gpu_fp16(self):
"""Tests Keras model with XLA, 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256)
def benchmark_xla_8_gpu_fp16_tweaked(self):
"""Test Keras model with manual config tuning, XLA, 8 GPUs and fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_tweaked',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=48)
def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
"""Tests with manual config tuning, XLA, 8 GPUs and fp16.
Delay performance measurement for stable performance on 96 vCPU platforms.
"""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_tweaked_delay_measure',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
dataset_num_private_threads=48)
def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
"""Tests Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_xla_8_gpu_fp16_dynamic_tweaked',
dtype='float16',
num_gpus=8,
enable_xla=True,
distribution_strategy='mirrored',
per_replica_batch_size=256,
gpu_thread_mode='gpu_private',
loss_scale='dynamic',
dataset_num_private_threads=48)
def benchmark_2x2_tpu_bf16(self):
"""Test Keras model with 2x2 TPU, bf16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_2x2_tpu_bf16',
dtype='bfloat16',
num_tpus=8,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_2x2_tpu(self):
"""Test Keras model with 2x2 TPU."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_2x2_tpu',
num_tpus=8,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_4x4_tpu_bf16(self):
"""Test Keras model with 4x4 TPU, bf16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_4x4_tpu_bf16',
dtype='bfloat16',
num_tpus=32,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_4x4_tpu(self):
"""Test Keras model with 4x4 TPU."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_4x4_tpu',
num_tpus=32,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_2x2_tpu_bf16_mlir(self):
"""Test Keras model with 2x2 TPU, bf16."""
self._setup()
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(
experiment_name='benchmark_2x2_tpu_bf16_mlir',
dtype='bfloat16',
num_tpus=8,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_4x4_tpu_bf16_mlir(self):
"""Test Keras model with 4x4 TPU, bf16."""
self._setup()
tf.config.experimental.enable_mlir_bridge()
self._run_and_report_benchmark(
experiment_name='benchmark_4x4_tpu_bf16_mlir',
dtype='bfloat16',
num_tpus=32,
distribution_strategy='tpu',
per_replica_batch_size=128)
def benchmark_8x8_tpu_bf16(self):
"""Test Keras model with 8x8 TPU, bf16."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8x8_tpu_bf16',
dtype='bfloat16',
num_tpus=128,
distribution_strategy='tpu',
per_replica_batch_size=64)
def benchmark_8x8_tpu(self):
"""Test Keras model with 8x8 TPU."""
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_8x8_tpu',
num_tpus=128,
distribution_strategy='tpu',
per_replica_batch_size=64)
def fill_report_object(self, stats):
super(KerasClassifierBenchmarkBase, self).fill_report_object(
stats,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Resnet50 benchmarks."""
def __init__(self, output_dir=None, default_flags=None, tpu=None):
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
super(Resnet50KerasBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, skip_steps=None):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
    # Number of logged step time entries to exclude from the performance
    # report. We keep results from the last 100 batches, or skip steps based
    # on the input skip_steps.
warmup = (skip_steps or (FLAGS.train_steps - 100)) // FLAGS.log_steps
super(Resnet50KerasBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps,
warmup=warmup,
start_time_sec=start_time_sec)
def benchmark_1_gpu_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test Keras model with 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Test Keras model with XLA and 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16(self):
"""Test Keras model with 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_dynamic(self):
"""Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16(self):
"""Test Keras model with XLA, 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_tweaked(self):
"""Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_dynamic(self):
"""Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Test Keras model with 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_fp32_no_tf32(self):
"""Test Keras model with 8 GPUs.Runs in FP32 by disabling TF32 execution."""
self._setup()
tf.config.experimental.enable_tensor_float_32_execution(False)
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp32_no_tf32')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_tweaked(self):
"""Test Keras model with manual config tuning and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
FLAGS.batch_size = 128 * 8 # 8 GPUs
FLAGS.datasets_num_private_threads = 14
self._run_and_report_benchmark()
def benchmark_xla_8_gpu(self):
"""Test Keras model with XLA and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
FLAGS.batch_size = 128 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_tweaked(self):
"""Test Keras model with manual config tuning, 8 GPUs, and XLA."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_tweaked')
FLAGS.batch_size = 128 * 8
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 24
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16(self):
"""Test Keras model with 8 GPUs and fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16_tweaked(self):
"""Test Keras model with 8 GPUs, fp16, and manual config tuning."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 40
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16_dynamic_tweaked(self):
"""Test Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_8_gpu_fp16_dynamic_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 40
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16(self):
"""Test Keras model with XLA, 8 GPUs and fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
FLAGS.batch_size = 256 * 8 # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked(self):
"""Test Keras model with manual config tuning, XLA, 8 GPUs and fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
"""Test with manual config tuning, XLA, 8 GPUs and fp16.
Delay performance measurement for stable performance on 96 vCPU platforms.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_tweaked_delay_measure')
FLAGS.batch_size = 256 * 8
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
FLAGS.train_steps = 310
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
"""Test Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_dynamic_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
self._run_and_report_benchmark()
def benchmark_2x2_tpu_bf16(self):
"""Test Keras model with 2x2 TPU, bf16."""
self._setup()
FLAGS.dtype = 'bf16'
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16')
FLAGS.batch_size = 1024
self._run_and_report_benchmark()
def benchmark_4x4_tpu_bf16(self):
"""Test Keras model with 4x4 TPU, bf16."""
self._setup()
FLAGS.dtype = 'bf16'
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16')
FLAGS.batch_size = 4096
self._run_and_report_benchmark()
def benchmark_8x8_tpu_bf16(self):
"""Test Keras model with 8x8 TPU, bf16."""
self._setup()
FLAGS.dtype = 'bf16'
FLAGS.distribution_strategy = 'tpu'
FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_bf16')
FLAGS.batch_size = 8192
self._run_and_report_benchmark()
def fill_report_object(self, stats):
super(Resnet50KerasBenchmarkBase, self).fill_report_object(
stats,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class Resnet50KerasBenchmarkSynth(KerasClassifierBenchmarkBase):
"""Resnet50 synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
def_flags = {}
def_flags['log_steps'] = 10
super(Resnet50KerasBenchmarkSynth, self).__init__(
model='resnet', output_dir=output_dir, default_flags=def_flags, tpu=tpu,
dataset_builder='synthetic', train_epochs=1, train_steps=110)
class Resnet50KerasBenchmarkReal(KerasClassifierBenchmarkBase):
"""Resnet50 real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
data_dir = os.path.join(root_data_dir, 'imagenet')
def_flags = {}
def_flags['log_steps'] = 10
super(Resnet50KerasBenchmarkReal, self).__init__(
model='resnet', output_dir=output_dir, default_flags=def_flags, tpu=tpu,
dataset_builder='records', train_epochs=1, train_steps=110,
data_dir=data_dir)
class EfficientNetKerasBenchmarkReal(KerasClassifierBenchmarkBase):
"""EfficientNet real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
data_dir = os.path.join(root_data_dir, 'imagenet')
def_flags = {}
def_flags['log_steps'] = 10
super(EfficientNetKerasBenchmarkReal, self).__init__(
model='efficientnet', output_dir=output_dir, default_flags=def_flags,
tpu=tpu, dataset_builder='records', train_epochs=1, train_steps=110,
data_dir=data_dir)
def benchmark_2x2_tpu_b7_bf16(self):
self._setup()
self._run_and_report_benchmark(
experiment_name='benchmark_b7_2x2_tpu_bf16',
model_variant='efficientnet-b7',
dtype='bfloat16',
num_tpus=8,
distribution_strategy='tpu',
per_replica_batch_size=128)
class Resnet50KerasBenchmarkRemoteData(Resnet50KerasBenchmarkBase):
"""Resnet50 real data (stored in remote storage) benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
# Defining multiple epochs overrides the train_steps setting in benchmarks.
def_flags['train_epochs'] = 2
# Cache dataset so performance is stable after the first epoch.
def_flags['training_dataset_cache'] = True
def_flags['log_steps'] = 100
    # Note that single-GPU and pure eager tests, which are less likely to be
    # input bound and are more stable, run for a shorter time by overriding
    # FLAGS.train_epochs, train_steps and log_steps in the benchmark methods,
    # and skip_steps in _run_and_report_benchmark().
super(Resnet50KerasBenchmarkRemoteData, self).__init__(
output_dir=output_dir, default_flags=def_flags)
def _override_flags_to_run_test_shorter(self):
FLAGS.train_epochs = 1
FLAGS.train_steps = 300
FLAGS.log_steps = 10
def benchmark_1_gpu_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.batch_size = 64
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
"""Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
FLAGS.batch_size = 64
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test Keras model with 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu(self):
"""Test Keras model with XLA and 1 GPU."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
FLAGS.batch_size = 128
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16(self):
"""Test Keras model with 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_1_gpu_fp16_dynamic(self):
"""Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16(self):
"""Test Keras model with XLA, 1 GPU and fp16."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_tweaked(self):
"""Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_dynamic(self):
"""Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.loss_scale = 'dynamic'
self._override_flags_to_run_test_shorter()
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
if FLAGS.num_gpus == 1 or FLAGS.run_eagerly:
# For single GPU and pure eager tests which are less likely to be input
# bound and more stable, run for shorter time and use the default
# skip_steps.
skip_steps = None
else:
# skip the first epoch for performance measurement.
skip_steps = 600
super(Resnet50KerasBenchmarkRemoteData,
self)._run_and_report_benchmark(skip_steps=skip_steps)
class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
"""Trivial model with real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
def_flags = {}
def_flags['use_trivial_model'] = True
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['dtype'] = 'fp16'
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
def_flags['train_steps'] = 600
def_flags['log_steps'] = 100
def_flags['distribution_strategy'] = 'mirrored'
super(TrivialKerasBenchmarkReal, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=def_flags)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(TrivialKerasBenchmarkReal, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_8_gpu_warmup(self):
"""Dummy test that runs over an epoch to warmup the machine."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
FLAGS.batch_size = 256 * 8
FLAGS.train_steps = 700
self._run_and_report_benchmark()
def fill_report_object(self, stats):
super(TrivialKerasBenchmarkReal, self).fill_report_object(
stats,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
class Resnet50MultiWorkerKerasAccuracy(keras_benchmark.KerasBenchmark):
"""Resnet50 distributed accuracy tests with multiple workers."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
flag_methods = [classifier_trainer.define_imagenet_keras_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(Resnet50MultiWorkerKerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def _benchmark_common(self, eager, num_workers, all_reduce_alg):
"""Common to all benchmarks in this class."""
self._setup()
num_gpus = 8
FLAGS.num_gpus = num_gpus
FLAGS.data_dir = self.data_dir
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = eager
FLAGS.enable_xla = False
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 32
FLAGS.model_dir = self._get_model_dir(
'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
'eager' if eager else 'graph', num_workers, all_reduce_alg))
FLAGS.batch_size = 256 * num_gpus * num_workers
FLAGS.all_reduce_alg = all_reduce_alg
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY):
start_time_sec = time.time()
stats = classifier_trainer.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet50MultiWorkerKerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=FLAGS.batch_size,
log_steps=100)
def _get_model_dir(self, folder_name):
return os.path.join(self.output_dir, folder_name)
def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='ring')
def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='ring')
def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='nccl')
class Resnet50MultiWorkerKerasBenchmark(Resnet50KerasBenchmarkBase):
"""Resnet50 distributed benchmark tests with multiple workers."""
def __init__(self, output_dir=None, default_flags=None):
super(Resnet50MultiWorkerKerasBenchmark, self).__init__(
output_dir=output_dir, default_flags=default_flags)
def _benchmark_common(self, eager, num_workers, all_reduce_alg):
"""Common to all benchmarks in this class."""
self._setup()
num_gpus = 8
FLAGS.num_gpus = num_gpus
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = eager
FLAGS.enable_xla = False
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 32
FLAGS.model_dir = self._get_model_dir(
'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
'eager' if eager else 'graph', num_workers, all_reduce_alg))
FLAGS.batch_size = 256 * num_gpus * num_workers
FLAGS.all_reduce_alg = all_reduce_alg
self._run_and_report_benchmark()
def benchmark_eager_8_gpu_1_worker_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 1 worker, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='ring')
def benchmark_eager_8_gpu_1_worker_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 1 worker, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='ring')
def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='ring')
def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='nccl')
class Resnet50MultiWorkerKerasBenchmarkSynth(Resnet50MultiWorkerKerasBenchmark):
"""Resnet50 multi-worker synthetic data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['use_synthetic_data'] = True
def_flags['train_steps'] = 110
def_flags['log_steps'] = 10
super(Resnet50MultiWorkerKerasBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags)
class Resnet50MultiWorkerKerasBenchmarkReal(Resnet50MultiWorkerKerasBenchmark):
"""Resnet50 multi-worker real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
def_flags['train_steps'] = 110
def_flags['log_steps'] = 10
super(Resnet50MultiWorkerKerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags)
# TODO(kimjaehong): This should also cover other model optimization
# techniques. At that point, this class will be renamed to something like
# 'KerasModelOptimizationAccuracyBase'.
class KerasPruningAccuracyBase(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for pruning method."""
def __init__(self,
output_dir=None,
root_data_dir=None,
default_flags=None,
**kwargs):
"""A accuracy benchmark class for pruning method.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
default_flags: default flags
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
if default_flags is None:
default_flags = {}
default_flags['pruning_method'] = 'polynomial_decay'
default_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
super(KerasPruningAccuracyBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
**kwargs)
def benchmark_8_gpu(self):
"""Test Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 32 * 8
FLAGS.train_epochs = 90
FLAGS.epochs_between_evals = 10
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_min=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
'RESNET50_FINETUNE_PRUNING'][0],
top_1_max=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
'RESNET50_FINETUNE_PRUNING'][1]):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(KerasPruningAccuracyBase, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=FLAGS.batch_size,
log_steps=100)
class MobilenetV1KerasPruningAccuracy(KerasPruningAccuracyBase):
"""Benchmark accuracy tests for MobilenetV1 with pruning method."""
def __init__(self, root_data_dir=None, **kwargs):
default_flags = {
'model': 'mobilenet',
'optimizer': 'mobilenet_default',
'initial_learning_rate_per_sample': 0.00007,
'pretrained_filepath': tf.train.latest_checkpoint(
os.path.join(root_data_dir, 'mobilenet_v1')),
'pruning_begin_step': 0,
'pruning_end_step': 100000,
'pruning_initial_sparsity': 0.0,
'pruning_final_sparsity': 0.5,
'pruning_frequency': 100,
}
super(MobilenetV1KerasPruningAccuracy, self).__init__(
root_data_dir=root_data_dir,
default_flags=default_flags,
**kwargs)
def _run_and_report_benchmark(self):
super(MobilenetV1KerasPruningAccuracy, self)._run_and_report_benchmark(
top_1_min=\
MODEL_OPTIMIZATION_TOP_1_ACCURACY['MOBILENET_V1_FINETUNE_PRUNING'][0],
top_1_max=\
MODEL_OPTIMIZATION_TOP_1_ACCURACY['MOBILENET_V1_FINETUNE_PRUNING'][1])
class Resnet50KerasPruningAccuracy(KerasPruningAccuracyBase):
"""Benchmark accuracy tests for resnet50 with pruning method."""
def __init__(self, root_data_dir=None, **kwargs):
default_flags = {
'model': 'resnet50_v1.5',
'optimizer': 'mobilenet_default',
'initial_learning_rate_per_sample': 0.0000039,
'pretrained_filepath': tf.train.latest_checkpoint(
os.path.join(root_data_dir, 'resnet50')),
'pruning_begin_step': 0,
'pruning_end_step': 50000,
'pruning_initial_sparsity': 0.0,
'pruning_final_sparsity': 0.5,
'pruning_frequency': 100,
}
super(Resnet50KerasPruningAccuracy, self).__init__(
root_data_dir=root_data_dir,
default_flags=default_flags,
**kwargs)
def _run_and_report_benchmark(self):
super(Resnet50KerasPruningAccuracy, self)._run_and_report_benchmark(
top_1_min=\
MODEL_OPTIMIZATION_TOP_1_ACCURACY['RESNET50_FINETUNE_PRUNING'][0],
top_1_max=\
MODEL_OPTIMIZATION_TOP_1_ACCURACY['RESNET50_FINETUNE_PRUNING'][1])
class KerasPruningBenchmarkRealBase(Resnet50KerasBenchmarkBase):
"""Pruning method benchmarks."""
def __init__(self, root_data_dir=None, default_flags=None, **kwargs):
if default_flags is None:
default_flags = {}
default_flags.update({
'skip_eval': True,
'report_accuracy_metrics': False,
'data_dir': os.path.join(root_data_dir, 'imagenet'),
'train_steps': 110,
'log_steps': 10,
'pruning_method': 'polynomial_decay',
'pruning_begin_step': 0,
'pruning_end_step': 50000,
'pruning_initial_sparsity': 0,
'pruning_final_sparsity': 0.5,
'pruning_frequency': 100,
})
super(KerasPruningBenchmarkRealBase, self).__init__(
default_flags=default_flags, **kwargs)
class MobilenetV1KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
"""Pruning method benchmarks for MobilenetV1."""
def __init__(self, **kwargs):
default_flags = {
'model': 'mobilenet',
'optimizer': 'mobilenet_default',
}
super(MobilenetV1KerasPruningBenchmarkReal, self).__init__(
default_flags=default_flags, **kwargs)
class Resnet50KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
"""Pruning method benchmarks for resnet50."""
def __init__(self, **kwargs):
default_flags = {
'model': 'resnet50_v1.5',
'optimizer': 'mobilenet_default',
}
super(Resnet50KerasPruningBenchmarkReal, self).__init__(
default_flags=default_flags, **kwargs)
class KerasClusteringAccuracyBase(keras_benchmark.KerasBenchmark):
"""Benchmark accuracy tests for clustering method."""
def __init__(self,
output_dir=None,
root_data_dir=None,
default_flags=None,
**kwargs):
"""An accuracy benchmark class for clustering method.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
default_flags: default flags
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
if default_flags is None:
default_flags = {}
default_flags['clustering_method'] = 'selective_clustering'
default_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
default_flags['model'] = 'mobilenet_pretrained'
default_flags['optimizer'] = 'mobilenet_fine_tune'
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
super(KerasClusteringAccuracyBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags,
**kwargs)
def benchmark_8_gpu(self):
"""Test Keras model with eager, dist_strat and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 32 * 8
FLAGS.train_epochs = 1
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
top_1_min=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
'MOBILENET_V1_FINETUNE_CLUSTERING'][0],
top_1_max=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
'MOBILENET_V1_FINETUNE_CLUSTERING'][1]):
start_time_sec = time.time()
stats = resnet_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(KerasClusteringAccuracyBase, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=top_1_min,
top_1_max=top_1_max,
total_batch_size=FLAGS.batch_size,
log_steps=100)
class MobilenetV1KerasClusteringAccuracy(KerasClusteringAccuracyBase):
"""Benchmark accuracy tests for MobilenetV1 with clustering method."""
def __init__(self, root_data_dir=None, **kwargs):
default_flags = {
'model': 'mobilenet_pretrained',
'optimizer': 'mobilenet_fine_tune',
}
super(MobilenetV1KerasClusteringAccuracy, self).__init__(
root_data_dir=root_data_dir,
default_flags=default_flags,
**kwargs)
def _run_and_report_benchmark(self):
super(MobilenetV1KerasClusteringAccuracy, self)._run_and_report_benchmark(
top_1_min=\
MODEL_OPTIMIZATION_TOP_1_ACCURACY['MOBILENET_V1_FINETUNE_CLUSTERING'][0],
top_1_max=\
MODEL_OPTIMIZATION_TOP_1_ACCURACY['MOBILENET_V1_FINETUNE_CLUSTERING'][1])
class KerasClusteringBenchmarkRealBase(Resnet50KerasBenchmarkBase):
"""Clustering method benchmarks."""
def __init__(self, root_data_dir=None, default_flags=None, **kwargs):
if default_flags is None:
default_flags = {}
default_flags.update({
'skip_eval': True,
'report_accuracy_metrics': False,
'data_dir': os.path.join(root_data_dir, 'imagenet'),
'clustering_method': 'selective_clustering',
'train_steps': 110,
'log_steps': 10,
})
super(KerasClusteringBenchmarkRealBase, self).__init__(
default_flags=default_flags, **kwargs)
class MobilenetV1KerasClusteringBenchmarkReal(KerasClusteringBenchmarkRealBase):
"""Clustering method benchmarks for MobilenetV1."""
def __init__(self, **kwargs):
default_flags = {
'model': 'mobilenet_pretrained',
'optimizer': 'mobilenet_fine_tune',
}
super(MobilenetV1KerasClusteringBenchmarkReal, self).__init__(
default_flags=default_flags, **kwargs)
if __name__ == '__main__':
tf.test.main()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to Cifar-10 dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
from official.legacy.image_classification.resnet import imagenet_preprocessing
HEIGHT = 32
WIDTH = 32
NUM_CHANNELS = 3
_DEFAULT_IMAGE_BYTES = HEIGHT * WIDTH * NUM_CHANNELS
# The record is the image plus a one-byte label
_RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1
# TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
NUM_IMAGES = {
'train': 50000,
'validation': 10000,
}
_NUM_DATA_FILES = 5
NUM_CLASSES = 10
def parse_record(raw_record, is_training, dtype):
"""Parses a record containing a training example of an image.
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
  The label is returned as a scalar int32 tensor to match the sparse loss
  function used by the model.
  Args:
    raw_record: scalar Tensor of type tf.string containing a raw CIFAR-10
      record (one label byte followed by the image bytes).
    is_training: A boolean denoting whether the input is for training.
    dtype: Data type to use for input images.
  Returns:
    Tuple with processed image tensor and scalar label tensor.
"""
# Convert bytes to a vector of uint8 that is record_bytes long.
record_vector = tf.io.decode_raw(raw_record, tf.uint8)
# The first byte represents the label, which we convert from uint8 to int32
# and then to one-hot.
label = tf.cast(record_vector[0], tf.int32)
# The remaining bytes after the label represent the image, which we reshape
# from [depth * height * width] to [depth, height, width].
depth_major = tf.reshape(record_vector[1:_RECORD_BYTES],
[NUM_CHANNELS, HEIGHT, WIDTH])
# Convert from [depth, height, width] to [height, width, depth], and cast as
# float32.
image = tf.cast(tf.transpose(a=depth_major, perm=[1, 2, 0]), tf.float32)
image = preprocess_image(image, is_training)
image = tf.cast(image, dtype)
return image, label
def preprocess_image(image, is_training):
"""Preprocess a single image of layout [height, width, depth]."""
if is_training:
# Resize the image to add four extra pixels on each side.
image = tf.image.resize_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8)
# Randomly crop a [HEIGHT, WIDTH] section of the image.
image = tf.image.random_crop(image, [HEIGHT, WIDTH, NUM_CHANNELS])
# Randomly flip the image horizontally.
image = tf.image.random_flip_left_right(image)
# Subtract off the mean and divide by the variance of the pixels.
image = tf.image.per_image_standardization(image)
return image
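def _example_parse_blank_record():
  """A minimal sketch (illustration only, not used by the input pipeline).
  Builds a fake CIFAR-10 record -- one label byte followed by
  _DEFAULT_IMAGE_BYTES image bytes in [channel, height, width] order -- and
  parses it with parse_record above (assumes TF2 eager execution).
  """
  raw = bytes([7]) + bytes(_DEFAULT_IMAGE_BYTES)  # Label 7, all-zero image.
  image, label = parse_record(
      tf.constant(raw), is_training=False, dtype=tf.float32)
  return image.shape, int(label)  # ((32, 32, 3), 7)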
def get_filenames(is_training, data_dir):
"""Returns a list of filenames."""
assert tf.io.gfile.exists(data_dir), (
'Run cifar10_download_and_extract.py first to download and extract the '
'CIFAR-10 data.')
if is_training:
return [
os.path.join(data_dir, 'data_batch_%d.bin' % i)
for i in range(1, _NUM_DATA_FILES + 1)
]
else:
return [os.path.join(data_dir, 'test_batch.bin')]
def input_fn(is_training,
data_dir,
batch_size,
dtype=tf.float32,
datasets_num_private_threads=None,
parse_record_fn=parse_record,
input_context=None,
drop_remainder=False):
"""Input function which provides batches for train or eval.
Args:
is_training: A boolean denoting whether the input is for training.
data_dir: The directory containing the input data.
batch_size: The number of samples per batch.
dtype: Data type to use for images/features
datasets_num_private_threads: Number of private threads for tf.data.
parse_record_fn: Function to use for parsing the records.
input_context: A `tf.distribute.InputContext` object passed in by
`tf.distribute.Strategy`.
    drop_remainder: A boolean indicating whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
Returns:
A dataset that can be used for iteration.
"""
filenames = get_filenames(is_training, data_dir)
dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
if input_context:
logging.info(
'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
input_context.input_pipeline_id, input_context.num_input_pipelines)
dataset = dataset.shard(input_context.num_input_pipelines,
input_context.input_pipeline_id)
return imagenet_preprocessing.process_record_dataset(
dataset=dataset,
is_training=is_training,
batch_size=batch_size,
shuffle_buffer=NUM_IMAGES['train'],
parse_record_fn=parse_record_fn,
dtype=dtype,
datasets_num_private_threads=datasets_num_private_threads,
drop_remainder=drop_remainder)
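def _example_sharded_train_dataset(data_dir):
  """A minimal sketch (illustration only): builds the training dataset for one
  of two input pipelines, exercising the input_context sharding path in
  input_fn above. Assumes `data_dir` holds the extracted CIFAR-10 binaries.
  """
  context = tf.distribute.InputContext(
      num_input_pipelines=2, input_pipeline_id=0)
  return input_fn(
      is_training=True,
      data_dir=data_dir,
      batch_size=128,
      input_context=context,
      drop_remainder=True)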
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the Cifar-10 dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Import libraries
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
from official.benchmark.models import cifar_preprocessing
from official.benchmark.models import resnet_cifar_model
from official.benchmark.models import synthetic_util
from official.common import distribute_utils
from official.legacy.image_classification.resnet import common
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(0.1, 91), (0.01, 136), (0.001, 182)
]
def learning_rate_schedule(current_epoch,
current_batch,
batches_per_epoch,
batch_size):
"""Handles linear scaling rule and LR decay.
Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
provided scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
current_batch: integer, current batch in the current epoch, indexed from 0.
batches_per_epoch: integer, number of steps in an epoch.
    batch_size: integer, total batch size.
Returns:
Adjusted learning rate.
"""
del current_batch, batches_per_epoch # not used
initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128
learning_rate = initial_learning_rate
for mult, start_epoch in LR_SCHEDULE:
if current_epoch >= start_epoch:
learning_rate = initial_learning_rate * mult
else:
break
return learning_rate
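def _example_learning_rates():
  """A minimal sketch (illustration only) of the schedule above for
  batch_size=256: the linear scaling rule doubles the base learning rate
  (256 / 128), then each LR_SCHEDULE multiplier applies from its start epoch.
  Returns the learning rate relative to the scaled initial rate at epochs
  0, 91, 136 and 182, i.e. [1.0, 0.1, 0.01, 0.001].
  """
  base = common.BASE_LEARNING_RATE * 256 / 128
  return [
      learning_rate_schedule(epoch, 0, 390, 256) / base
      for epoch in (0, 91, 136, 182)
  ]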
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
"""Callback to update learning rate on every batch (not epoch boundaries).
  N.B. Only supports Keras optimizers, not TF optimizers.
Attributes:
schedule: a function that takes an epoch index and a batch index as input
(both integer, indexed from 0) and returns a new learning rate as
output (float).
"""
def __init__(self, schedule, batch_size, steps_per_epoch):
super(LearningRateBatchScheduler, self).__init__()
self.schedule = schedule
self.steps_per_epoch = steps_per_epoch
self.batch_size = batch_size
self.epochs = -1
self.prev_lr = -1
def on_epoch_begin(self, epoch, logs=None):
if not hasattr(self.model.optimizer, 'learning_rate'):
raise ValueError('Optimizer must have a "learning_rate" attribute.')
self.epochs += 1
def on_batch_begin(self, batch, logs=None):
"""Executes before step begins."""
lr = self.schedule(self.epochs,
batch,
self.steps_per_epoch,
self.batch_size)
if not isinstance(lr, (float, np.float32, np.float64)):
raise ValueError('The output of the "schedule" function should be float.')
if lr != self.prev_lr:
self.model.optimizer.learning_rate = lr # lr should be a float here
self.prev_lr = lr
logging.debug(
'Epoch %05d Batch %05d: LearningRateBatchScheduler '
'change learning rate to %s.', self.epochs, batch, lr)
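def _example_scheduler_callback():
  """A minimal sketch (illustration only): wires learning_rate_schedule into
  the callback above, as run() below does when --use_tensor_lr is false.
  The batch_size of 128 is a placeholder value.
  """
  return LearningRateBatchScheduler(
      schedule=learning_rate_schedule,
      batch_size=128,
      steps_per_epoch=cifar_preprocessing.NUM_IMAGES['train'] // 128)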
def run(flags_obj):
"""Run ResNet Cifar-10 training and eval loop using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
keras_utils.set_session_config(
enable_xla=flags_obj.enable_xla)
# Execute flag override logic for better model performance
if flags_obj.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus,
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
dtype = flags_core.get_tf_dtype(flags_obj)
if dtype == 'fp16':
    raise ValueError('dtype fp16 is not supported in Keras. Use the default '
                     'value (fp32).')
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs)
if strategy:
    # flags_obj.enable_get_next_as_optional controls whether to enable the
    # get_next_as_optional behavior in DistributedIterator. If true, the last
    # partial batch can be supported.
strategy.extended.experimental_enable_get_next_as_optional = (
flags_obj.enable_get_next_as_optional
)
strategy_scope = distribute_utils.get_strategy_scope(strategy)
if flags_obj.use_synthetic_data:
synthetic_util.set_up_synthetic_data()
input_fn = common.get_synth_input_fn(
height=cifar_preprocessing.HEIGHT,
width=cifar_preprocessing.WIDTH,
num_channels=cifar_preprocessing.NUM_CHANNELS,
num_classes=cifar_preprocessing.NUM_CLASSES,
dtype=flags_core.get_tf_dtype(flags_obj),
drop_remainder=True)
else:
synthetic_util.undo_set_up_synthetic_data()
input_fn = cifar_preprocessing.input_fn
train_input_dataset = input_fn(
is_training=True,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=cifar_preprocessing.parse_record,
datasets_num_private_threads=flags_obj.datasets_num_private_threads,
dtype=dtype,
# Setting drop_remainder to avoid the partial batch logic in normalization
# layer, which triggers tf.where and leads to extra memory copy of input
# sizes between host and GPU.
drop_remainder=(not flags_obj.enable_get_next_as_optional))
eval_input_dataset = None
if not flags_obj.skip_eval:
eval_input_dataset = input_fn(
is_training=False,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=cifar_preprocessing.parse_record)
steps_per_epoch = (
cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
lr_schedule = 0.1
if flags_obj.use_tensor_lr:
initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE),
values=[initial_learning_rate] +
list(p[0] * initial_learning_rate for p in LR_SCHEDULE))
with strategy_scope:
optimizer = common.get_optimizer(lr_schedule)
model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=optimizer,
metrics=(['sparse_categorical_accuracy']
if flags_obj.report_accuracy_metrics else None),
run_eagerly=flags_obj.run_eagerly)
train_epochs = flags_obj.train_epochs
callbacks = common.get_callbacks()
if not flags_obj.use_tensor_lr:
lr_callback = LearningRateBatchScheduler(
schedule=learning_rate_schedule,
batch_size=flags_obj.batch_size,
steps_per_epoch=steps_per_epoch)
callbacks.append(lr_callback)
  # If multiple epochs, ignore the train_steps flag.
if train_epochs <= 1 and flags_obj.train_steps:
steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
train_epochs = 1
num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
if flags_obj.set_learning_phase_to_train:
# TODO(haoyuzhang): Understand slowdown of setting learning phase when
# not using distribution strategy.
tf.keras.backend.set_learning_phase(1)
num_eval_steps = None
validation_data = None
if not strategy and flags_obj.explicit_gpu_placement:
    # TODO(b/135607227): Add device scope automatically in Keras training loop
    # when not using distribution strategy.
no_dist_strat_device = tf.device('/device:GPU:0')
no_dist_strat_device.__enter__()
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=steps_per_epoch,
callbacks=callbacks,
validation_steps=num_eval_steps,
validation_data=validation_data,
validation_freq=flags_obj.epochs_between_evals,
verbose=2)
eval_output = None
if not flags_obj.skip_eval:
eval_output = model.evaluate(eval_input_dataset,
steps=num_eval_steps,
verbose=2)
if not strategy and flags_obj.explicit_gpu_placement:
no_dist_strat_device.__exit__()
stats = common.build_stats(history, eval_output, callbacks)
return stats
def define_cifar_flags():
common.define_keras_flags()
flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
model_dir='/tmp/cifar10_model',
epochs_between_evals=10,
batch_size=128)
def main(_):
return run(flags.FLAGS)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_cifar_flags()
app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
# Reference:
- [Deep Residual Learning for Image Recognition](
https://arxiv.org/abs/1512.03385)
Adapted from code contributed by BigMoyan.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
BATCH_NORM_DECAY = 0.997
BATCH_NORM_EPSILON = 1e-5
L2_WEIGHT_DECAY = 2e-4
def identity_building_block(input_tensor,
kernel_size,
filters,
stage,
block,
training=None):
"""The identity block is the block that has no conv layer at shortcut.
Arguments:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: current block label, used for generating layer names
training: Only used if training keras model with Estimator. In other
scenarios it is handled automatically.
Returns:
Output tensor for the block.
"""
filters1, filters2 = filters
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = tf.keras.layers.Conv2D(
filters1,
kernel_size,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
name=conv_name_base + '2a')(
input_tensor)
x = tf.keras.layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(
x, training=training)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.Conv2D(
filters2,
kernel_size,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
name=conv_name_base + '2b')(
x)
x = tf.keras.layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(
x, training=training)
x = tf.keras.layers.add([x, input_tensor])
x = tf.keras.layers.Activation('relu')(x)
return x
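def _example_identity_block_model():
  """A minimal sketch (illustration only): applies one identity building block
  to a 32x32x16 feature map and wraps it in a Keras model. The stage/block
  labels are placeholders used only for layer naming.
  """
  inputs = tf.keras.Input(shape=(32, 32, 16))
  outputs = identity_building_block(
      inputs, kernel_size=3, filters=[16, 16], stage=2, block='example')
  return tf.keras.Model(inputs, outputs)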
def conv_building_block(input_tensor,
kernel_size,
filters,
stage,
block,
strides=(2, 2),
training=None):
"""A block that has a conv layer at shortcut.
Arguments:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: current block label, used for generating layer names
strides: Strides for the first conv layer in the block.
training: Only used if training keras model with Estimator. In other
scenarios it is handled automatically.
Returns:
Output tensor for the block.
  Note that from stage 3, the first conv layer at the main path has
  strides=(2, 2), and the shortcut should have strides=(2, 2) as well.
"""
filters1, filters2 = filters
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = tf.keras.layers.Conv2D(
filters1,
kernel_size,
strides=strides,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
name=conv_name_base + '2a')(
input_tensor)
x = tf.keras.layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(
x, training=training)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.Conv2D(
filters2,
kernel_size,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
name=conv_name_base + '2b')(
x)
x = tf.keras.layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(
x, training=training)
shortcut = tf.keras.layers.Conv2D(
filters2, (1, 1),
strides=strides,
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
name=conv_name_base + '1')(
input_tensor)
shortcut = tf.keras.layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '1')(
shortcut, training=training)
x = tf.keras.layers.add([x, shortcut])
x = tf.keras.layers.Activation('relu')(x)
return x
def resnet_block(input_tensor,
size,
kernel_size,
filters,
stage,
conv_strides=(2, 2),
training=None):
"""A block which applies conv followed by multiple identity blocks.
Arguments:
input_tensor: input tensor
size: integer, number of constituent conv/identity building blocks. A conv
block is applied once, followed by (size - 1) identity blocks.
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
conv_strides: Strides for the first conv layer in the block.
training: Only used if training keras model with Estimator. In other
scenarios it is handled automatically.
Returns:
Output tensor after applying conv and identity blocks.
"""
x = conv_building_block(
input_tensor,
kernel_size,
filters,
stage=stage,
strides=conv_strides,
block='block_0',
training=training)
for i in range(size - 1):
x = identity_building_block(
x,
kernel_size,
filters,
stage=stage,
block='block_%d' % (i + 1),
training=training)
return x
def resnet(num_blocks, classes=10, training=None):
"""Instantiates the ResNet architecture.
Arguments:
    num_blocks: integer, the number of conv/identity blocks in each block. The
      ResNet contains 3 blocks with each block containing one conv block
      followed by (num_blocks - 1) identity blocks. Each conv/identity block
      has 2 convolutional layers. With the input convolutional layer and the
      pooling layer towards the end, this brings the total size of the network
      to (6 * num_blocks + 2).
classes: optional number of classes to classify images into
training: Only used if training keras model with Estimator. In other
scenarios it is handled automatically.
Returns:
A Keras model instance.
"""
input_shape = (32, 32, 3)
img_input = tf.keras.Input(shape=input_shape)
if tf.keras.backend.image_data_format() == 'channels_first':
x = tf.keras.layers.Lambda(
lambda x: tf.keras.backend.permute_dimensions(x, (0, 3, 1, 2)),
name='transpose')(
img_input)
bn_axis = 1
else: # channel_last
x = img_input
bn_axis = 3
x = tf.keras.layers.ZeroPadding2D(padding=(1, 1), name='conv1_pad')(x)
x = tf.keras.layers.Conv2D(
16, (3, 3),
strides=(1, 1),
padding='valid',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
name='conv1')(
x)
x = tf.keras.layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name='bn_conv1',
)(x, training=training)
x = tf.keras.layers.Activation('relu')(x)
x = resnet_block(
x,
size=num_blocks,
kernel_size=3,
filters=[16, 16],
stage=2,
conv_strides=(1, 1),
training=training)
x = resnet_block(
x,
size=num_blocks,
kernel_size=3,
filters=[32, 32],
stage=3,
conv_strides=(2, 2),
training=training)
x = resnet_block(
x,
size=num_blocks,
kernel_size=3,
filters=[64, 64],
stage=4,
conv_strides=(2, 2),
training=training)
if tf.keras.backend.image_data_format() == 'channels_last':
rm_axes = [1, 2]
else:
rm_axes = [2, 3]
x = tf.keras.layers.Lambda(
lambda x: tf.keras.backend.mean(x, rm_axes), name='reduce_mean')(x)
x = tf.keras.layers.Dense(
classes,
activation='softmax',
kernel_initializer=tf.keras.initializers.RandomNormal(
stddev=0.01),
kernel_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
bias_regularizer=tf.keras.regularizers.L2(L2_WEIGHT_DECAY),
name='fc10')(
x)
inputs = img_input
# Create model.
model = tf.keras.models.Model(inputs, x, name='resnet56')
return model
resnet20 = functools.partial(resnet, num_blocks=3)
resnet32 = functools.partial(resnet, num_blocks=5)
resnet56 = functools.partial(resnet, num_blocks=9)
resnet110 = functools.partial(resnet, num_blocks=18)
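# Depth check for the partials above (a sketch of the docstring's
# 6 * num_blocks + 2 formula): num_blocks of 3, 5, 9 and 18 give networks of
# 20, 32, 56 and 110 layers respectively.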
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with Cifar data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tempfile
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.platform import googletest
from official.benchmark.models import cifar_preprocessing
from official.benchmark.models import resnet_cifar_main
from official.utils.testing import integration
class KerasCifarTest(googletest.TestCase):
"""Unit tests for Keras ResNet with Cifar."""
_extra_flags = [
"-batch_size", "4", "-train_steps", "1", "-use_synthetic_data", "true"
]
_tempdir = None
def get_temp_dir(self):
if not self._tempdir:
self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
return self._tempdir
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasCifarTest, cls).setUpClass()
resnet_cifar_main.define_cifar_flags()
def setUp(self):
super(KerasCifarTest, self).setUp()
cifar_preprocessing.NUM_IMAGES["validation"] = 4
def tearDown(self):
super(KerasCifarTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
def test_end_to_end_no_dist_strat(self):
"""Test Keras model with 1 GPU, no distribution strategy."""
extra_flags = [
"-distribution_strategy",
"off",
"-model_dir",
"keras_cifar_no_dist_strat",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_no_dist_strat(self):
"""Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
extra_flags = [
"-enable_eager",
"false",
"-distribution_strategy",
"off",
"-model_dir",
"keras_cifar_graph_no_dist_strat",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_1_gpu(self):
"""Test Keras model with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_1_gpu",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_1_gpu(self):
"""Test Keras model in legacy graph mode with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-noenable_eager",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_graph_1_gpu",
"-data_format",
"channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_2_gpu(self):
"""Test Keras model with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_2_gpu(self):
"""Test Keras model in legacy graph mode with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-enable_eager",
"false",
"-distribution_strategy",
"mirrored",
"-model_dir",
"keras_cifar_graph_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_cifar_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
if __name__ == "__main__":
googletest.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset."""
import os
# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from official.common import distribute_utils
from official.legacy.image_classification import test_utils
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.legacy.image_classification.resnet import resnet_model
from official.modeling import performance
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers
def _cluster_last_three_conv2d_layers(model):
"""Helper method to cluster last three conv2d layers."""
import tensorflow_model_optimization as tfmot # pylint: disable=g-import-not-at-top
last_three_conv2d_layers = [
layer for layer in model.layers
if isinstance(layer, tf.keras.layers.Conv2D)
][-3:]
cluster_weights = tfmot.clustering.keras.cluster_weights
centroid_initialization = tfmot.clustering.keras.CentroidInitialization
def cluster_fn(layer):
if layer not in last_three_conv2d_layers:
return layer
if layer == last_three_conv2d_layers[0] or \
layer == last_three_conv2d_layers[1]:
clustered = cluster_weights(layer, number_of_clusters=256, \
cluster_centroids_init=centroid_initialization.LINEAR)
print('Clustered {} with 256 clusters'.format(layer.name))
else:
clustered = cluster_weights(layer, number_of_clusters=32, \
cluster_centroids_init=centroid_initialization.LINEAR)
print('Clustered {} with 32 clusters'.format(layer.name))
return clustered
return tf.keras.models.clone_model(model, clone_function=cluster_fn)
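def _example_cluster_mobilenet():
  """A minimal sketch (illustration only, not called by run()): builds an
  untrained MobileNet and applies the selective clustering helper above,
  mirroring what run() does when --clustering_method=selective_clustering.
  """
  model = tf.keras.applications.mobilenet.MobileNet(weights=None, classes=10)
  return _cluster_last_three_conv2d_layers(model)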
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
NotImplementedError: If some features are not currently supported.
Returns:
Dictionary of training and eval stats.
"""
# Execute flag override logic for better model performance
if flags_obj.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus,
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
dtype = flags_core.get_tf_dtype(flags_obj)
performance.set_mixed_precision_policy(
flags_core.get_tf_dtype(flags_obj))
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
# Configures cluster spec for distribution strategy.
_ = distribute_utils.configure_cluster(flags_obj.worker_hosts,
flags_obj.task_index)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu)
if strategy:
    # flags_obj.enable_get_next_as_optional controls whether to enable the
    # get_next_as_optional behavior in DistributedIterator. If true, the last
    # partial batch can be supported.
strategy.extended.experimental_enable_get_next_as_optional = (
flags_obj.enable_get_next_as_optional
)
strategy_scope = distribute_utils.get_strategy_scope(strategy)
# pylint: disable=protected-access
if flags_obj.use_synthetic_data:
input_fn = common.get_synth_input_fn(
height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
num_channels=imagenet_preprocessing.NUM_CHANNELS,
num_classes=imagenet_preprocessing.NUM_CLASSES,
dtype=dtype,
drop_remainder=True)
else:
input_fn = imagenet_preprocessing.input_fn
# When `enable_xla` is True, we always drop the remainder of the batches
# in the dataset, as XLA-GPU doesn't support dynamic shapes.
drop_remainder = flags_obj.enable_xla
  # Current resnet_model.resnet50 input format is always channels-last.
  # We use the keras_applications MobileNet model, whose input format depends
  # on the Keras backend image data format.
  # The use_keras_image_data_format flag indicates whether the image
  # preprocessor output format should match the Keras backend image data
  # format or simply be channels-last.
  use_keras_image_data_format = (
      flags_obj.model in ('mobilenet', 'mobilenet_pretrained'))
train_input_dataset = input_fn(
is_training=True,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=imagenet_preprocessing.get_parse_record_fn(
use_keras_image_data_format=use_keras_image_data_format),
datasets_num_private_threads=flags_obj.datasets_num_private_threads,
dtype=dtype,
drop_remainder=drop_remainder,
tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
training_dataset_cache=flags_obj.training_dataset_cache,
)
eval_input_dataset = None
if not flags_obj.skip_eval:
eval_input_dataset = input_fn(
is_training=False,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=imagenet_preprocessing.get_parse_record_fn(
use_keras_image_data_format=use_keras_image_data_format),
dtype=dtype,
drop_remainder=drop_remainder)
lr_schedule = common.PiecewiseConstantDecayWithWarmup(
batch_size=flags_obj.batch_size,
epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
warmup_epochs=common.LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in common.LR_SCHEDULE),
compute_lr_on_cpu=True)
steps_per_epoch = (
imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
with strategy_scope:
if flags_obj.optimizer == 'resnet50_default':
optimizer = common.get_optimizer(lr_schedule)
elif flags_obj.optimizer == 'mobilenet_default' or flags_obj.optimizer == 'mobilenet_fine_tune':
initial_learning_rate = \
flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size
if flags_obj.optimizer == 'mobilenet_fine_tune':
initial_learning_rate = 1e-5
optimizer = tf.keras.optimizers.SGD(
learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate,
decay_steps=steps_per_epoch * flags_obj.num_epochs_per_decay,
decay_rate=flags_obj.lr_decay_factor,
staircase=True),
momentum=0.9)
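# With staircase=True, ExponentialDecay gives
#   lr(step) = initial_learning_rate * decay_rate ** floor(step / decay_steps),
# i.e. the learning rate drops by a factor of lr_decay_factor every
# num_epochs_per_decay epochs.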
optimizer = performance.configure_optimizer(
optimizer,
use_float16=flags_core.get_tf_dtype(flags_obj) == tf.float16,
loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128),)
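# For fp16 runs, configure_optimizer is expected to wrap the optimizer with
# loss scaling (128 by default for fp16 here): the loss is multiplied by the
# scale before gradients are computed and the gradients are divided by it
# afterwards, which avoids underflow of small fp16 gradient values. This is a
# general sketch of the mechanism, not a description of model-specific details.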
# TODO(hongkuny): Remove trivial model usage and move it to benchmark.
if flags_obj.use_trivial_model:
model = test_utils.trivial_model(imagenet_preprocessing.NUM_CLASSES)
elif flags_obj.model == 'resnet50_v1.5':
model = resnet_model.resnet50(
num_classes=imagenet_preprocessing.NUM_CLASSES)
elif flags_obj.model in ('mobilenet', 'mobilenet_pretrained'):
# TODO(kimjaehong): Remove layers attribute when minimum TF version
# support 2.0 layers by default.
if flags_obj.model == 'mobilenet_pretrained':
classes_labels = 1000
initial_weights = 'imagenet'
else:
classes_labels = imagenet_preprocessing.NUM_CLASSES
initial_weights = None
model = tf.keras.applications.mobilenet.MobileNet(
weights=initial_weights,
classes=classes_labels,
layers=tf.keras.layers)
if flags_obj.pretrained_filepath:
model.load_weights(flags_obj.pretrained_filepath)
if flags_obj.pruning_method == 'polynomial_decay':
import tensorflow_model_optimization as tfmot # pylint: disable=g-import-not-at-top
if dtype != tf.float32:
raise NotImplementedError(
'Pruning is currently only supported on dtype=tf.float32.')
pruning_params = {
'pruning_schedule':
tfmot.sparsity.keras.PolynomialDecay(
initial_sparsity=flags_obj.pruning_initial_sparsity,
final_sparsity=flags_obj.pruning_final_sparsity,
begin_step=flags_obj.pruning_begin_step,
end_step=flags_obj.pruning_end_step,
frequency=flags_obj.pruning_frequency),
}
model = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params)
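# Sketch of the pruning schedule, assuming tfmot's default exponent of 3:
# for begin_step <= t <= end_step,
#   sparsity(t) = final_sparsity + (initial_sparsity - final_sparsity) *
#                 (1 - (t - begin_step) / (end_step - begin_step)) ** 3
# so sparsity ramps from initial_sparsity to final_sparsity and then stays
# constant.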
elif flags_obj.pruning_method:
raise NotImplementedError('Only polynomial_decay is currently supported.')
if flags_obj.clustering_method == 'selective_clustering':
import tensorflow_model_optimization as tfmot # pylint: disable=g-import-not-at-top
if dtype != tf.float32:
raise NotImplementedError(
'Clustering is currently only supported on dtype=tf.float32.')
model = _cluster_last_three_conv2d_layers(model)
elif flags_obj.clustering_method:
raise NotImplementedError(
'Only selective_clustering is implemented.')
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=optimizer,
metrics=(['sparse_categorical_accuracy']
if flags_obj.report_accuracy_metrics else None),
run_eagerly=flags_obj.run_eagerly,
jit_compile=flags_obj.enable_xla)
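# Note: sparse_categorical_crossentropy expects integer class labels (not
# one-hot vectors), which matches the labels produced by the ImageNet
# preprocessing; jit_compile additionally compiles the train step with XLA
# when --enable_xla is set.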
train_epochs = flags_obj.train_epochs
callbacks = common.get_callbacks(
pruning_method=flags_obj.pruning_method,
enable_checkpoint_and_export=flags_obj.enable_checkpoint_and_export,
model_dir=flags_obj.model_dir)
# If training for multiple epochs, ignore the train_steps flag.
if train_epochs <= 1 and flags_obj.train_steps:
steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
train_epochs = 1
num_eval_steps = (
imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size)
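# Worked example: with the standard 50,000-image ImageNet validation split and
# --batch_size=256, num_eval_steps = 50000 // 256 = 195.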
validation_data = eval_input_dataset
if flags_obj.skip_eval:
# Only build the training graph. This reduces memory usage introduced by
# control flow ops in layers that have different implementations for
# training and inference (e.g., batch norm).
if flags_obj.set_learning_phase_to_train:
# TODO(haoyuzhang): Understand slowdown of setting learning phase when
# not using distribution strategy.
tf.keras.backend.set_learning_phase(1)
num_eval_steps = None
validation_data = None
if not strategy and flags_obj.explicit_gpu_placement:
# TODO(b/135607227): Add device scope automatically in Keras training loop
# when not using distribution strategy.
no_dist_strat_device = tf.device('/device:GPU:0')
no_dist_strat_device.__enter__()
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=steps_per_epoch,
callbacks=callbacks,
validation_steps=num_eval_steps,
validation_data=validation_data,
validation_freq=flags_obj.epochs_between_evals,
verbose=2)
eval_output = None
if not flags_obj.skip_eval:
eval_output = model.evaluate(eval_input_dataset,
steps=num_eval_steps,
verbose=2)
if flags_obj.pruning_method:
model = tfmot.sparsity.keras.strip_pruning(model)
if flags_obj.clustering_method:
model = tfmot.clustering.keras.strip_clustering(model)
if flags_obj.enable_checkpoint_and_export:
if dtype == tf.bfloat16:
logging.warning('Keras model.save does not support bfloat16 dtype.')
else:
# Keras model.save assumes a float32 input signature.
export_path = os.path.join(flags_obj.model_dir, 'saved_model')
model.save(export_path, include_optimizer=False)
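# A minimal sketch of reloading the exported SavedModel for inference
# (illustrative only; `image_batch` is a hypothetical float32 batch):
#   reloaded = tf.keras.models.load_model(export_path, compile=False)
#   predictions = reloaded.predict(image_batch)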
if not strategy and flags_obj.explicit_gpu_placement:
no_dist_strat_device.__exit__()
stats = common.build_stats(history, eval_output, callbacks)
return stats
def define_imagenet_keras_flags():
common.define_keras_flags(
model=True,
optimizer=True,
pretrained_filepath=True)
common.define_pruning_flags()
common.define_clustering_flags()
flags_core.set_defaults()
flags.adopt_module_key_flags(common)
def main(_):
model_helpers.apply_clean(flags.FLAGS)
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_imagenet_keras_flags()
app.run(main)
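# Illustrative invocation (hypothetical paths; flags are defined above and in
# common.define_keras_flags):
#   python -m official.benchmark.models.resnet_imagenet_main \
#     --model=mobilenet_pretrained --optimizer=mobilenet_fine_tune \
#     --clustering_method=selective_clustering \
#     --data_dir=/path/to/imagenet --model_dir=/tmp/model --batch_size=128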
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with ImageNet data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.eager import context
from official.benchmark.models import resnet_imagenet_main
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.utils.testing import integration
@parameterized.parameters(
"resnet",
# "resnet_polynomial_decay", b/151854314
"mobilenet",
# "mobilenet_polynomial_decay", b/151854314
"mobilenet_selective_clustering",
)
class KerasImagenetTest(tf.test.TestCase):
"""Unit tests for Keras Models with ImageNet."""
_default_flags_dict = [
"-batch_size",
"4",
"-train_steps",
"1",
"-use_synthetic_data",
"true",
"-data_format",
"channels_last",
]
_extra_flags_dict = {
"resnet": [
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
],
"resnet_polynomial_decay": [
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
"-pruning_method",
"polynomial_decay",
],
"mobilenet": [
"-model",
"mobilenet",
"-optimizer",
"mobilenet_default",
],
"mobilenet_polynomial_decay": [
"-model",
"mobilenet",
"-optimizer",
"mobilenet_default",
"-pruning_method",
"polynomial_decay",
],
"mobilenet_selective_clustering": [
"-model", "mobilenet_pretrained",
"-optimizer", "mobilenet_fine_tune",
"-clustering_method", "selective_clustering",
]
}
_tempdir = None
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasImagenetTest, cls).setUpClass()
resnet_imagenet_main.define_imagenet_keras_flags()
def setUp(self):
super(KerasImagenetTest, self).setUp()
imagenet_preprocessing.NUM_IMAGES["validation"] = 4
self.policy = tf.keras.mixed_precision.global_policy()
def tearDown(self):
super(KerasImagenetTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
tf.keras.mixed_precision.set_global_policy(self.policy)
def get_extra_flags_dict(self, flags_key):
return self._extra_flags_dict[flags_key] + self._default_flags_dict
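# For example, get_extra_flags_dict("mobilenet") returns the model/optimizer
# flags above plus the defaults, i.e. a synthetic-data run with batch size 4
# for a single training step in channels-last format.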
def test_end_to_end_no_dist_strat(self, flags_key):
"""Test Keras model with 1 GPU, no distribution strategy."""
extra_flags = [
"-distribution_strategy",
"off",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_graph_no_dist_strat(self, flags_key):
"""Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
extra_flags = [
"-enable_eager",
"false",
"-distribution_strategy",
"off",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_1_gpu(self, flags_key):
"""Test Keras model with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-distribution_strategy",
"mirrored",
"-enable_checkpoint_and_export",
"1",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_1_gpu_fp16(self, flags_key):
"""Test Keras model with 1 GPU and fp16."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(1, context.num_gpus()))
extra_flags = [
"-num_gpus",
"1",
"-dtype",
"fp16",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
if "polynomial_decay" in extra_flags:
self.skipTest("Pruning with fp16 is currently not supported.")
if "selective_clustering" in extra_flags:
self.skipTest("Clustering with fp16 is currently not supported.")
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_2_gpu(self, flags_key):
"""Test Keras model with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_xla_2_gpu(self, flags_key):
"""Test Keras model with XLA and 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-enable_xla",
"true",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_2_gpu_fp16(self, flags_key):
"""Test Keras model with 2 GPUs and fp16."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-dtype",
"fp16",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
if "polynomial_decay" in extra_flags:
self.skipTest("Pruning with fp16 is currently not supported.")
if "selective_clustering" in extra_flags:
self.skipTest("Clustering with fp16 is currently not supported.")
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
def test_end_to_end_xla_2_gpu_fp16(self, flags_key):
"""Test Keras model with XLA, 2 GPUs and fp16."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available"
.format(2, context.num_gpus()))
extra_flags = [
"-num_gpus",
"2",
"-dtype",
"fp16",
"-enable_xla",
"true",
"-distribution_strategy",
"mirrored",
]
extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
if "polynomial_decay" in extra_flags:
self.skipTest("Pruning with fp16 is currently not supported.")
if "selective_clustering" in extra_flags:
self.skipTest("Clustering with fp16 is currently not supported.")
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
if __name__ == "__main__":
tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with ImageNet data on TPU."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from official.benchmark.models import resnet_imagenet_main
from official.legacy.image_classification.resnet import imagenet_preprocessing
from official.utils.testing import integration
class KerasImagenetTest(tf.test.TestCase, parameterized.TestCase):
"""Unit tests for Keras Models with ImageNet."""
_extra_flags_dict = {
"resnet": [
"-batch_size",
"4",
"-train_steps",
"1",
"-use_synthetic_data",
"true",
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
],
"resnet_polynomial_decay": [
"-batch_size",
"4",
"-train_steps",
"1",
"-use_synthetic_data",
"true",
"-model",
"resnet50_v1.5",
"-optimizer",
"resnet50_default",
"-pruning_method",
"polynomial_decay",
],
}
_tempdir = None
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasImagenetTest, cls).setUpClass()
resnet_imagenet_main.define_imagenet_keras_flags()
def setUp(self):
super(KerasImagenetTest, self).setUp()
imagenet_preprocessing.NUM_IMAGES["validation"] = 4
self.policy = tf.keras.mixed_precision.global_policy()
def tearDown(self):
super(KerasImagenetTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
tf.keras.mixed_precision.set_global_policy(self.policy)
@parameterized.parameters([
"resnet",
# "resnet_polynomial_decay" b/151854314
])
def test_end_to_end_tpu(self, flags_key):
"""Test Keras model with TPU distribution strategy."""
extra_flags = [
"-distribution_strategy",
"tpu",
"-data_format",
"channels_last",
"-enable_checkpoint_and_export",
"1",
]
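# Note: outside of this synthetic test, a TPU run typically also sets
# --tpu=<tpu-address>, which run() forwards as tpu_address to
# get_distribution_strategy.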
extra_flags = extra_flags + self._extra_flags_dict[flags_key]
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
@parameterized.parameters(["resnet"])
def test_end_to_end_tpu_bf16(self, flags_key):
"""Test Keras model with TPU and bfloat16 activation."""
extra_flags = [
"-distribution_strategy",
"tpu",
"-data_format",
"channels_last",
"-dtype",
"bf16",
]
extra_flags = extra_flags + self._extra_flags_dict[flags_key]
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags)
if __name__ == "__main__":
tf.test.main()