Unverified Commit 23f75313 authored by Toby Boyd, committed by GitHub

Add Transformer Big Benchmarks + FP16 for other tests. (#6838)

* Add big tests.

* fix super

* Add fp16, increase 8xGPU batch-sizes

* Adding the rest of the fp16 tests.

* Big accuracy test batch_perf_gpu

* fix docstrings

* add _run_and_report

* Edited docstrings
parent 80444539
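
For orientation, the sketch below condenses the pattern this change introduces: each new Big/FP16 benchmark method scales a per-GPU batch size by the GPU count, switches FLAGS.dtype to 'fp16' where applicable, and hands timing and reporting off to a shared _run_and_report_benchmark helper. This is a minimal sketch, not the file itself; the import path for transformer_main is an assumption, and the EstimatorBenchmark base class with its _setup, _get_model_dir, and _report_benchmark helpers is the one defined in the diff below.

import time

from absl import flags
from official.transformer import transformer_main  # assumed import path

FLAGS = flags.FLAGS


class TransformerBigBenchmarkSketch(EstimatorBenchmark):  # base class from the diff below
  """Condensed sketch of the new Big/FP16 benchmark pattern (not the real class)."""

  def benchmark_graph_fp16_8_gpu(self):
    """8-GPU mixed-precision run; per-GPU batch size is scaled by GPU count."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'           # fp16 variants added for the other tests.
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8    # 3072 per GPU for 'big'; the 'base' tests use 4096.
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self):
    """Times transformer_main.run_transformer and forwards stats to the reporter."""
    start_time_sec = time.time()
    stats = transformer_main.run_transformer(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec
    self._report_benchmark(stats, wall_time_sec)
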
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Estimator benchmarks and accuracy tests."""
"""Executes Transformer w/Estimator benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -33,10 +33,10 @@ FLAGS = flags.FLAGS
class EstimatorBenchmark(tf.test.Benchmark):
"""Base class to hold methods common to test classes in the module.
"""Methods common to executing transformer w/Estimator tests.
Code under test for the Transformer Estimator models that report mostly the
same data and require the same FLAG setup.
Code under test for the Transformer Estimator models reports the same data
and requires the same FLAG setup.
"""
local_flags = None
@@ -76,7 +76,7 @@ class EstimatorBenchmark(tf.test.Benchmark):
Args:
stats: dict returned from estimator models with known entries.
wall_time_sec: the during of the benchmark execution in seconds
wall_time_sec: the duration of the benchmark execution in seconds.
bleu_max: highest passing level for bleu score.
bleu_min: lowest passing level for bleu score.
"""
@@ -106,15 +106,15 @@ class EstimatorBenchmark(tf.test.Benchmark):
metrics=metrics)
class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
"""Benchmark accuracy tests for ResNet50 w/ Estimator."""
class TransformerBigEstimatorAccuracy(EstimatorBenchmark):
"""Benchmark accuracy tests for Transformer Big model w/Estimator."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Benchmark accuracy tests for ResNet50 w/ Estimator.
"""Benchmark accuracy tests for Transformer Big model w/Estimator.
Args:
output_dir: directory where to output e.g. log files
root_data_dir: directory under which to look for dataset
output_dir: directory where output such as log files is written.
root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
@@ -136,42 +136,86 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.de')
super(TransformerBaseEstimatorAccuracy, self).__init__(
super(TransformerBigEstimatorAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def benchmark_graph_2_gpu(self):
"""Benchmark graph mode 2 gpus.
def benchmark_graph_8_gpu(self):
"""Benchmark graph mode 8 gpus.
The paper uses 8 GPUs and a much larger effective batch size; this will
not converge to the 27.3 BLEU (uncased) SOTA.
SOTA is 28.4 BLEU (uncased).
"""
self._setup()
FLAGS.num_gpus = 2
FLAGS.num_gpus = 8
FLAGS.data_dir = self.train_data_dir
FLAGS.vocab_file = self.vocab_file
# Sets values directly to avoid validation check.
FLAGS['bleu_source'].value = self.bleu_source
FLAGS['bleu_ref'].value = self.bleu_ref
FLAGS.param_set = 'base'
FLAGS.batch_size = 4096 * 2
FLAGS.param_set = 'big'
FLAGS.batch_size = 3072 * 8
FLAGS.train_steps = 100000
FLAGS.steps_between_evals = 5000
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
FLAGS.hooks = ['ExamplesPerSecondHook']
# These bleu scores are based on test runs at this limited number of
# steps and batch size, after verifying SOTA at 8xV100s.
self._run_and_report_benchmark(bleu_min=25.3, bleu_max=26)
self._run_and_report_benchmark()
def benchmark_graph_fp16_2_gpu(self):
"""Benchmark 2 gpu with fp16 mixed-precision.
def _run_and_report_benchmark(self, bleu_min=28.3, bleu_max=29):
"""Run benchmark and report results.
Args:
bleu_min: minimum expected uncased BLEU. Default is SOTA.
bleu_max: maximum expected uncased BLEU. Default is a high number.
"""
start_time_sec = time.time()
stats = transformer_main.run_transformer(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
self._report_benchmark(stats,
wall_time_sec,
bleu_min=bleu_min,
bleu_max=bleu_max)
class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
"""Benchmark accuracy tests for Transformer Base model w/ Estimator."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""Benchmark accuracy tests for Transformer Base model w/ Estimator.
Args:
output_dir: directory where output such as log files is written.
root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
flag_methods = [transformer_main.define_transformer_flags]
self.train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
self.vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
self.bleu_source = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.en')
self.bleu_ref = os.path.join(root_data_dir,
EN2DE_2014_BLEU_DATA_DIR_NAME,
'newstest2014.de')
super(TransformerBaseEstimatorAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
The paper uses 8 GPUs and a much larger effective batch size;
this is unlikely to hit the target BLEU score regardless of the
number of steps.
def benchmark_graph_2_gpu(self):
"""Benchmark graph mode 2 gpus.
The paper uses 8 GPUs and a much larger effective batch size; this will
not converge to the 27.3 BLEU (uncased) SOTA.
"""
self._setup()
FLAGS.num_gpus = 2
FLAGS.dtype = 'fp16'
FLAGS.data_dir = self.train_data_dir
FLAGS.vocab_file = self.vocab_file
# Sets values directly to avoid validation check.
@@ -181,7 +225,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
FLAGS.batch_size = 4096 * 2
FLAGS.train_steps = 100000
FLAGS.steps_between_evals = 5000
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.hooks = ['ExamplesPerSecondHook']
# These bleu scores are based on test runs at this limited number of
# steps and batch size, after verifying SOTA at 8xV100s.
@@ -190,7 +234,9 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
def benchmark_graph_8_gpu(self):
"""Benchmark graph mode 8 gpus.
Best so far is 27.2 with 4048 * 8 at 75,000 steps.
SOTA is 27.3 BLEU (uncased).
Best so far is 27.2 with 4048*8 at 75,000 steps.
27.009 with 4096*8 at 100,000 steps and earlier.
Other test: 2024 * 8 peaked at 26.66 at 100,000 steps.
"""
self._setup()
@@ -201,7 +247,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
FLAGS['bleu_source'].value = self.bleu_source
FLAGS['bleu_ref'].value = self.bleu_ref
FLAGS.param_set = 'base'
FLAGS.batch_size = 3072 * 8
FLAGS.batch_size = 4096 * 8
FLAGS.train_steps = 100000
FLAGS.steps_between_evals = 5000
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
@@ -222,7 +268,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
FLAGS['bleu_source'].value = self.bleu_source
FLAGS['bleu_ref'].value = self.bleu_ref
FLAGS.param_set = 'base'
FLAGS.batch_size = 3072 * 8
FLAGS.batch_size = 4096 * 8
FLAGS.train_steps = 100000
FLAGS.steps_between_evals = 5000
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
@@ -245,14 +291,22 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
bleu_max=bleu_max)
class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
"""Benchmarks for ResNet50 using Estimator."""
local_flags = None
class TransformerEstimatorBenchmark(EstimatorBenchmark):
"""Benchmarks for Transformer (Base and Big) using Estimator."""
def __init__(self, output_dir=None, default_flags=None, batch_per_gpu=4096):
"""Initialize.
Args:
output_dir: Base directory for saving artifacts, e.g. checkpoints.
default_flags: default flags to use for all tests.
batch_per_gpu: batch size to use per gpu.
"""
def __init__(self, output_dir=None, default_flags=None):
flag_methods = [transformer_main.define_transformer_flags]
self.batch_per_gpu = batch_per_gpu
super(TransformerBaseEstimatorBenchmark, self).__init__(
super(TransformerEstimatorBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods)
@@ -261,7 +315,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
"""Benchmark graph 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 4096
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
self._run_and_report_benchmark()
@@ -270,7 +324,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
self._setup()
FLAGS.num_gpus = 1
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 4096
FLAGS.batch_size = self.batch_per_gpu
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
self._run_and_report_benchmark()
@@ -278,7 +332,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
"""Benchmark graph 2 gpus."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.batch_size = 4096 * 2
FLAGS.batch_size = self.batch_per_gpu * 2
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
self._run_and_report_benchmark()
@@ -287,7 +341,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
self._setup()
FLAGS.num_gpus = 2
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 4096 * 2
FLAGS.batch_size = self.batch_per_gpu * 2
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
self._run_and_report_benchmark()
@@ -295,7 +349,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
"""Benchmark graph 4 gpus."""
self._setup()
FLAGS.num_gpus = 4
FLAGS.batch_size = 4096 * 4
FLAGS.batch_size = self.batch_per_gpu * 4
FLAGS.model_dir = self._get_model_dir('benchmark_graph_4_gpu')
self._run_and_report_benchmark()
@@ -304,7 +358,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
self._setup()
FLAGS.num_gpus = 4
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 4096 * 4
FLAGS.batch_size = self.batch_per_gpu * 4
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_4_gpu')
self._run_and_report_benchmark()
@@ -312,7 +366,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
"""Benchmark graph 8 gpus."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.batch_size = 4096 * 8
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
self._run_and_report_benchmark()
@@ -321,7 +375,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 4096 * 8
FLAGS.batch_size = self.batch_per_gpu * 8
FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
self._run_and_report_benchmark()
@@ -332,7 +386,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
self._report_benchmark(stats, wall_time_sec)
class TransformerBaseEstimatorBenchmarkSynth(TransformerBaseEstimatorBenchmark):
class TransformerBaseEstimatorBenchmarkSynth(TransformerEstimatorBenchmark):
"""Transformer based version synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
@@ -347,7 +401,7 @@ class TransformerBaseEstimatorBenchmarkSynth(TransformerBaseEstimatorBenchmark):
output_dir=output_dir, default_flags=def_flags)
class TransformerBaseEstimatorBenchmarkReal(TransformerBaseEstimatorBenchmark):
class TransformerBaseEstimatorBenchmarkReal(TransformerEstimatorBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
@@ -367,3 +421,40 @@ class TransformerBaseEstimatorBenchmarkReal(TransformerBaseEstimatorBenchmark):
super(TransformerBaseEstimatorBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags)
class TransformerBigEstimatorBenchmarkReal(TransformerEstimatorBenchmark):
"""Transformer based version real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
train_data_dir = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME)
vocab_file = os.path.join(root_data_dir,
TRANSFORMER_EN2DE_DATA_DIR_NAME,
'vocab.ende.32768')
def_flags = {}
def_flags['param_set'] = 'big'
def_flags['vocab_file'] = vocab_file
def_flags['data_dir'] = train_data_dir
def_flags['train_steps'] = 200
def_flags['steps_between_evals'] = 200
def_flags['hooks'] = ['ExamplesPerSecondHook']
super(TransformerBigEstimatorBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=def_flags, batch_per_gpu=3072)
class TransformerBigEstimatorBenchmarkSynth(TransformerEstimatorBenchmark):
"""Transformer based version synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['param_set'] = 'big'
def_flags['use_synthetic_data'] = True
def_flags['train_steps'] = 200
def_flags['steps_between_evals'] = 200
def_flags['hooks'] = ['ExamplesPerSecondHook']
super(TransformerBigEstimatorBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags, batch_per_gpu=3072)
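
For reference, one plausible way to drive the new classes outside a harness such as PerfZero is to instantiate a benchmark class and call one of its benchmark methods directly. This is a hedged usage sketch: the module name in the import and the output directory are placeholders, not part of this change.

from official.transformer import transformer_estimator_benchmark  # assumed module name

# Placeholder output directory; root_data_dir is not needed for the synthetic test.
benchmark = transformer_estimator_benchmark.TransformerBigEstimatorBenchmarkSynth(
    output_dir='/tmp/transformer_big_synth')
# Runs the synthetic-data, 8-GPU, mixed-precision throughput test with
# batch_per_gpu=3072, as configured by the constructor above.
benchmark.benchmark_graph_fp16_8_gpu()
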