Unverified Commit 23f75313 authored by Toby Boyd, committed by GitHub

Add Transformer Big Benchmarks + FP16 for other tests. (#6838)

* Add big tests.

* fix super

* Add fp16, increase 8xGPU batch-sizes

* Adding the rest of the fp16 tests.

* Big accuracy test batch_perf_gpu

* fix docstrings

* add _run_and_report

* Edited docstrings
parent 80444539
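
For reference, the classes in this file are plain tf.test.Benchmark subclasses: a PerfZero-style harness (or a quick local run) drives them by instantiating a class and calling one of its benchmark_* methods, which set FLAGS and then train via transformer_main. A minimal sketch of such an invocation, assuming the official/transformer package from this repository is on PYTHONPATH; the output directory is hypothetical:

    # Sketch only, not part of this commit. Uses the synthetic-data Big
    # benchmark added below, so no dataset download is required.
    from official.transformer import transformer_benchmark  # assumed module path

    bench = transformer_benchmark.TransformerBigEstimatorBenchmarkSynth(
        output_dir='/tmp/transformer_bench')  # hypothetical output dir
    # Runs 200 synthetic-data steps with param_set='big' (see def_flags below)
    # and reports examples/sec via the ExamplesPerSecondHook.
    bench.benchmark_graph_fp16_1_gpu()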
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Transformer w/Estimator benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -33,10 +33,10 @@ FLAGS = flags.FLAGS

class EstimatorBenchmark(tf.test.Benchmark):
  """Methods common to executing transformer w/Estimator tests.

  Code under test for the Transformer Estimator models reports the same data
  and requires the same FLAG setup.
  """
  local_flags = None
@@ -76,7 +76,7 @@ class EstimatorBenchmark(tf.test.Benchmark):
    Args:
      stats: dict returned from estimator models with known entries.
      wall_time_sec: the duration of the benchmark execution in seconds.
      bleu_max: highest passing level for bleu score.
      bleu_min: lowest passing level for bleu score.
    """
@@ -106,15 +106,15 @@ class EstimatorBenchmark(tf.test.Benchmark):
        metrics=metrics)
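
The body of _report_benchmark is collapsed in this diff; only its Args block and the trailing report_benchmark(... metrics=metrics) call are visible. As a rough sketch of that reporting path (the 'bleu_uncased' stats key is an assumption, not confirmed by this diff; only the metrics list shape of tf.test.Benchmark.report_benchmark is standard):

    # Hedged sketch of the elided EstimatorBenchmark._report_benchmark,
    # not the committed implementation.
    def _report_benchmark(self, stats, wall_time_sec, bleu_max=None, bleu_min=None):
      metrics = []
      if 'bleu_uncased' in stats:                    # assumed key name
        metrics.append({'name': 'bleu_uncased',
                        'value': stats['bleu_uncased'],
                        'min_value': bleu_min,       # passing band consumed by
                        'max_value': bleu_max})      # the benchmark reporting
      self.report_benchmark(wall_time=wall_time_sec, metrics=metrics)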

class TransformerBigEstimatorAccuracy(EstimatorBenchmark):
  """Benchmark accuracy tests for Transformer Big model w/Estimator."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Benchmark accuracy tests for Transformer Big model w/Estimator.

    Args:
      output_dir: directory where to output, e.g. log files.
      root_data_dir: directory under which to look for dataset.
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
@@ -136,42 +136,86 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
                                 EN2DE_2014_BLEU_DATA_DIR_NAME,
                                 'newstest2014.de')

    super(TransformerBigEstimatorAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flag_methods)
  def benchmark_graph_8_gpu(self):
    """Benchmark graph mode 8 gpus.

    SOTA is 28.4 BLEU (uncased).
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()
  def _run_and_report_benchmark(self, bleu_min=28.3, bleu_max=29):
    """Run benchmark and report results.

    Args:
      bleu_min: minimum expected uncased bleu. default is SOTA.
      bleu_max: max expected uncased bleu. default is a high number.
    """
    start_time_sec = time.time()
    stats = transformer_main.run_transformer(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec
    self._report_benchmark(stats,
                           wall_time_sec,
                           bleu_min=bleu_min,
                           bleu_max=bleu_max)

class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
  """Benchmark accuracy tests for Transformer Base model w/ Estimator."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Benchmark accuracy tests for Transformer Base model w/ Estimator.

    Args:
      output_dir: directory where to output, e.g. log files.
      root_data_dir: directory under which to look for dataset.
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    flag_methods = [transformer_main.define_transformer_flags]

    self.train_data_dir = os.path.join(root_data_dir,
                                       TRANSFORMER_EN2DE_DATA_DIR_NAME)

    self.vocab_file = os.path.join(root_data_dir,
                                   TRANSFORMER_EN2DE_DATA_DIR_NAME,
                                   'vocab.ende.32768')

    self.bleu_source = os.path.join(root_data_dir,
                                    EN2DE_2014_BLEU_DATA_DIR_NAME,
                                    'newstest2014.en')

    self.bleu_ref = os.path.join(root_data_dir,
                                 EN2DE_2014_BLEU_DATA_DIR_NAME,
                                 'newstest2014.de')

    super(TransformerBaseEstimatorAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flag_methods)
  def benchmark_graph_2_gpu(self):
    """Benchmark graph mode 2 gpus.

    The paper uses 8 GPUs and a much larger effective batch size; this will
    not converge to the 27.3 BLEU (uncased) SOTA.
    """
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
@@ -181,7 +225,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
    FLAGS.batch_size = 4096 * 2
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
    FLAGS.hooks = ['ExamplesPerSecondHook']
    # These bleu scores are based on test runs at this limited number of
    # steps and batch size, after verifying SOTA at 8xV100s.
@@ -190,7 +234,9 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
  def benchmark_graph_8_gpu(self):
    """Benchmark graph mode 8 gpus.

    SOTA is 27.3 BLEU (uncased).
    Best so far is 27.2 with 4048*8 at 75,000 steps.
    27.009 with 4096*8 at 100,000 steps and earlier.
    Other test: 2024 * 8 peaked at 26.66 at 100,000 steps.
    """
    self._setup()
@@ -201,7 +247,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 4096 * 8
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
@@ -222,7 +268,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 4096 * 8
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
@@ -245,14 +291,22 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
                           bleu_max=bleu_max)

class TransformerEstimatorBenchmark(EstimatorBenchmark):
  """Benchmarks for Transformer (Base and Big) using Estimator."""
  local_flags = None

  def __init__(self, output_dir=None, default_flags=None, batch_per_gpu=4096):
    """Initialize.

    Args:
      output_dir: Base directory for saving artifacts, e.g. checkpoints.
      default_flags: default flags to use for all tests.
      batch_per_gpu: batch size to use per gpu.
    """
    flag_methods = [transformer_main.define_transformer_flags]
    self.batch_per_gpu = batch_per_gpu

    super(TransformerEstimatorBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags,
        flag_methods=flag_methods)
@@ -261,7 +315,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 1 gpu."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
    self._run_and_report_benchmark()
@@ -270,7 +324,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
    self._run_and_report_benchmark()
@@ -278,7 +332,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 2 gpus."""
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.batch_size = self.batch_per_gpu * 2
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
    self._run_and_report_benchmark()
@@ -287,7 +341,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 2
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
    self._run_and_report_benchmark()
@@ -295,7 +349,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 4 gpus."""
    self._setup()
    FLAGS.num_gpus = 4
    FLAGS.batch_size = self.batch_per_gpu * 4
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_4_gpu')
    self._run_and_report_benchmark()
@@ -304,7 +358,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 4
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 4
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_4_gpu')
    self._run_and_report_benchmark()
@@ -312,7 +366,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 8 gpus."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
    self._run_and_report_benchmark()
@@ -321,7 +375,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
    self._run_and_report_benchmark()
@@ -332,7 +386,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._report_benchmark(stats, wall_time_sec)


class TransformerBaseEstimatorBenchmarkSynth(TransformerEstimatorBenchmark):
  """Transformer based version synthetic benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
@@ -347,7 +401,7 @@ class TransformerBaseEstimatorBenchmarkSynth(TransformerBaseEstimatorBenchmark):
        output_dir=output_dir, default_flags=def_flags)

class TransformerBaseEstimatorBenchmarkReal(TransformerEstimatorBenchmark):
  """Transformer based version real data benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
@@ -367,3 +421,40 @@ class TransformerBaseEstimatorBenchmarkReal(TransformerBaseEstimatorBenchmark):
    super(TransformerBaseEstimatorBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=def_flags)


class TransformerBigEstimatorBenchmarkReal(TransformerEstimatorBenchmark):
  """Transformer Big version real data benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    train_data_dir = os.path.join(root_data_dir,
                                  TRANSFORMER_EN2DE_DATA_DIR_NAME)
    vocab_file = os.path.join(root_data_dir,
                              TRANSFORMER_EN2DE_DATA_DIR_NAME,
                              'vocab.ende.32768')

    def_flags = {}
    def_flags['param_set'] = 'big'
    def_flags['vocab_file'] = vocab_file
    def_flags['data_dir'] = train_data_dir
    def_flags['train_steps'] = 200
    def_flags['steps_between_evals'] = 200
    def_flags['hooks'] = ['ExamplesPerSecondHook']

    super(TransformerBigEstimatorBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=def_flags, batch_per_gpu=3072)


class TransformerBigEstimatorBenchmarkSynth(TransformerEstimatorBenchmark):
  """Transformer Big version synthetic benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    def_flags = {}
    def_flags['param_set'] = 'big'
    def_flags['use_synthetic_data'] = True
    def_flags['train_steps'] = 200
    def_flags['steps_between_evals'] = 200
    def_flags['hooks'] = ['ExamplesPerSecondHook']

    super(TransformerBigEstimatorBenchmarkSynth, self).__init__(
        output_dir=output_dir, default_flags=def_flags, batch_per_gpu=3072)
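
The batch_per_gpu constructor argument added above is what lets the Base and Big throughput classes share every benchmark_graph_*_gpu method: Base keeps the 4096 default while the Big subclasses pass 3072, and each method multiplies that value by the GPU count. A quick illustration, not part of the commit; the paths are hypothetical:

    # Base vs. Big effective (global) batch size on 8 GPUs.
    base = TransformerBaseEstimatorBenchmarkReal(output_dir='/tmp/base',
                                                 root_data_dir='/data')  # 4096 per GPU
    big = TransformerBigEstimatorBenchmarkReal(output_dir='/tmp/big',
                                               root_data_dir='/data')    # 3072 per GPU
    # benchmark_graph_8_gpu() sets FLAGS.batch_size = batch_per_gpu * 8:
    #   base -> 4096 * 8 = 32768
    #   big  -> 3072 * 8 = 24576
    base.benchmark_graph_8_gpu()
    big.benchmark_graph_8_gpu()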