Unverified Commit 23f75313 authored by Toby Boyd, committed by GitHub

Add Transformer Big Benchmarks + FP16 for other tests. (#6838)

* Add big tests.

* fix super

* Add fp16, increase 8xGPU batch-sizes

* Adding the rest of the fp16 tests.

* Big accuracy test batch_perf_gpu

* fix docstrings

* add _run_and_report

* Edited docstrings
parent 80444539
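
For reference, the classes in this file are plain tf.test.Benchmark subclasses: a PerfZero-style harness (or a quick local run) drives them by instantiating a class and calling one of its benchmark_* methods, which set FLAGS and then train via transformer_main. A minimal sketch of such an invocation, assuming the official/transformer package from this repository is on PYTHONPATH; the output directory is hypothetical:

    # Sketch only, not part of this commit. Uses the synthetic-data Big
    # benchmark added below, so no dataset download is required.
    from official.transformer import transformer_benchmark  # assumed module path

    bench = transformer_benchmark.TransformerBigEstimatorBenchmarkSynth(
        output_dir='/tmp/transformer_bench')  # hypothetical output dir
    # Runs 200 synthetic-data steps with param_set='big' (see def_flags below)
    # and reports examples/sec via the ExamplesPerSecondHook.
    bench.benchmark_graph_fp16_1_gpu()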
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Transformer w/Estimator benchmark and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -33,10 +33,10 @@ FLAGS = flags.FLAGS

class EstimatorBenchmark(tf.test.Benchmark):
  """Methods common to executing transformer w/Estimator tests.

  Code under test for the Transformer Estimator models reports the same data
  and requires the same FLAG setup.
  """
  local_flags = None
@@ -76,7 +76,7 @@ class EstimatorBenchmark(tf.test.Benchmark):
    Args:
      stats: dict returned from estimator models with known entries.
      wall_time_sec: the duration of the benchmark execution in seconds.
      bleu_max: highest passing level for bleu score.
      bleu_min: lowest passing level for bleu score.
    """
@@ -106,15 +106,15 @@ class EstimatorBenchmark(tf.test.Benchmark):
        metrics=metrics)
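
The body of _report_benchmark is collapsed in this diff; only its Args block and the trailing report_benchmark(... metrics=metrics) call are visible. As a rough sketch of that reporting path (the 'bleu_uncased' stats key is an assumption, not confirmed by this diff; only the metrics list shape of tf.test.Benchmark.report_benchmark is standard):

    # Hedged sketch of the elided EstimatorBenchmark._report_benchmark,
    # not the committed implementation.
    def _report_benchmark(self, stats, wall_time_sec, bleu_max=None, bleu_min=None):
      metrics = []
      if 'bleu_uncased' in stats:                    # assumed key name
        metrics.append({'name': 'bleu_uncased',
                        'value': stats['bleu_uncased'],
                        'min_value': bleu_min,       # passing band consumed by
                        'max_value': bleu_max})      # the benchmark reporting
      self.report_benchmark(wall_time=wall_time_sec, metrics=metrics)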

class TransformerBigEstimatorAccuracy(EstimatorBenchmark):
  """Benchmark accuracy tests for Transformer Big model w/Estimator."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Benchmark accuracy tests for Transformer Big model w/Estimator.

    Args:
      output_dir: directory where to output, e.g. log files.
      root_data_dir: directory under which to look for dataset.
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
@@ -136,42 +136,86 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
                                 EN2DE_2014_BLEU_DATA_DIR_NAME,
                                 'newstest2014.de')

    super(TransformerBigEstimatorAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flag_methods)
  def benchmark_graph_8_gpu(self):
    """Benchmark graph mode 8 gpus.

    SOTA is 28.4 BLEU (uncased).
    """
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'big'
    FLAGS.batch_size = 3072 * 8
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()
  def _run_and_report_benchmark(self, bleu_min=28.3, bleu_max=29):
    """Run benchmark and report results.

    Args:
      bleu_min: minimum expected uncased bleu. default is SOTA.
      bleu_max: max expected uncased bleu. default is a high number.
    """
    start_time_sec = time.time()
    stats = transformer_main.run_transformer(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec
    self._report_benchmark(stats,
                           wall_time_sec,
                           bleu_min=bleu_min,
                           bleu_max=bleu_max)

class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
  """Benchmark accuracy tests for Transformer Base model w/ Estimator."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Benchmark accuracy tests for Transformer Base model w/ Estimator.

    Args:
      output_dir: directory where to output, e.g. log files.
      root_data_dir: directory under which to look for dataset.
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    flag_methods = [transformer_main.define_transformer_flags]

    self.train_data_dir = os.path.join(root_data_dir,
                                       TRANSFORMER_EN2DE_DATA_DIR_NAME)

    self.vocab_file = os.path.join(root_data_dir,
                                   TRANSFORMER_EN2DE_DATA_DIR_NAME,
                                   'vocab.ende.32768')

    self.bleu_source = os.path.join(root_data_dir,
                                    EN2DE_2014_BLEU_DATA_DIR_NAME,
                                    'newstest2014.en')

    self.bleu_ref = os.path.join(root_data_dir,
                                 EN2DE_2014_BLEU_DATA_DIR_NAME,
                                 'newstest2014.de')

    super(TransformerBaseEstimatorAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flag_methods)
  def benchmark_graph_2_gpu(self):
    """Benchmark graph mode 2 gpus.

    The paper uses 8 GPUs and a much larger effective batch size; this will
    not converge to the 27.3 BLEU (uncased) SOTA.
    """
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.data_dir = self.train_data_dir
    FLAGS.vocab_file = self.vocab_file
    # Sets values directly to avoid validation check.
@@ -181,7 +225,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
    FLAGS.batch_size = 4096 * 2
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
    FLAGS.hooks = ['ExamplesPerSecondHook']
    # These bleu scores are based on test runs at this limited number of
    # steps and batch size, after verifying SOTA at 8xV100s.
@@ -190,7 +234,9 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
  def benchmark_graph_8_gpu(self):
    """Benchmark graph mode 8 gpus.

    SOTA is 27.3 BLEU (uncased).
    Best so far is 27.2 with 4048*8 at 75,000 steps.
    27.009 with 4096*8 at 100,000 steps and earlier.
    Other test: 2024 * 8 peaked at 26.66 at 100,000 steps.
    """
    self._setup()
@@ -201,7 +247,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 4096 * 8
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
@@ -222,7 +268,7 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
    FLAGS['bleu_source'].value = self.bleu_source
    FLAGS['bleu_ref'].value = self.bleu_ref
    FLAGS.param_set = 'base'
    FLAGS.batch_size = 4096 * 8
    FLAGS.train_steps = 100000
    FLAGS.steps_between_evals = 5000
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
@@ -245,14 +291,22 @@ class TransformerBaseEstimatorAccuracy(EstimatorBenchmark):
                           bleu_max=bleu_max)

class TransformerEstimatorBenchmark(EstimatorBenchmark):
  """Benchmarks for Transformer (Base and Big) using Estimator."""
  local_flags = None

  def __init__(self, output_dir=None, default_flags=None, batch_per_gpu=4096):
    """Initialize.

    Args:
      output_dir: Base directory for saving artifacts, e.g. checkpoints.
      default_flags: default flags to use for all tests.
      batch_per_gpu: batch size to use per gpu.
    """
    flag_methods = [transformer_main.define_transformer_flags]
    self.batch_per_gpu = batch_per_gpu

    super(TransformerEstimatorBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags,
        flag_methods=flag_methods)
@@ -261,7 +315,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 1 gpu."""
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
    self._run_and_report_benchmark()
@@ -270,7 +324,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 1
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
    self._run_and_report_benchmark()
@@ -278,7 +332,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 2 gpus."""
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.batch_size = self.batch_per_gpu * 2
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
    self._run_and_report_benchmark()
@@ -287,7 +341,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 2
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
    self._run_and_report_benchmark()
@@ -295,7 +349,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 4 gpus."""
    self._setup()
    FLAGS.num_gpus = 4
    FLAGS.batch_size = self.batch_per_gpu * 4
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_4_gpu')
    self._run_and_report_benchmark()
@@ -304,7 +358,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 4
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 4
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_4_gpu')
    self._run_and_report_benchmark()
@@ -312,7 +366,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    """Benchmark graph 8 gpus."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
    self._run_and_report_benchmark()
@@ -321,7 +375,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.dtype = 'fp16'
    FLAGS.batch_size = self.batch_per_gpu * 8
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
    self._run_and_report_benchmark()
@@ -332,7 +386,7 @@ class TransformerBaseEstimatorBenchmark(EstimatorBenchmark):
    self._report_benchmark(stats, wall_time_sec)


class TransformerBaseEstimatorBenchmarkSynth(TransformerEstimatorBenchmark):
  """Transformer based version synthetic benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
@@ -347,7 +401,7 @@ class TransformerBaseEstimatorBenchmarkSynth(TransformerBaseEstimatorBenchmark):
        output_dir=output_dir, default_flags=def_flags)

class TransformerBaseEstimatorBenchmarkReal(TransformerEstimatorBenchmark):
  """Transformer based version real data benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
@@ -367,3 +421,40 @@ class TransformerBaseEstimatorBenchmarkReal(TransformerBaseEstimatorBenchmark):
    super(TransformerBaseEstimatorBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=def_flags)


class TransformerBigEstimatorBenchmarkReal(TransformerEstimatorBenchmark):
  """Transformer Big version real data benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    train_data_dir = os.path.join(root_data_dir,
                                  TRANSFORMER_EN2DE_DATA_DIR_NAME)
    vocab_file = os.path.join(root_data_dir,
                              TRANSFORMER_EN2DE_DATA_DIR_NAME,
                              'vocab.ende.32768')

    def_flags = {}
    def_flags['param_set'] = 'big'
    def_flags['vocab_file'] = vocab_file
    def_flags['data_dir'] = train_data_dir
    def_flags['train_steps'] = 200
    def_flags['steps_between_evals'] = 200
    def_flags['hooks'] = ['ExamplesPerSecondHook']

    super(TransformerBigEstimatorBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=def_flags, batch_per_gpu=3072)


class TransformerBigEstimatorBenchmarkSynth(TransformerEstimatorBenchmark):
  """Transformer Big version synthetic benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    def_flags = {}
    def_flags['param_set'] = 'big'
    def_flags['use_synthetic_data'] = True
    def_flags['train_steps'] = 200
    def_flags['steps_between_evals'] = 200
    def_flags['hooks'] = ['ExamplesPerSecondHook']

    super(TransformerBigEstimatorBenchmarkSynth, self).__init__(
        output_dir=output_dir, default_flags=def_flags, batch_per_gpu=3072)
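
The batch_per_gpu constructor argument added above is what lets the Base and Big throughput classes share every benchmark_graph_*_gpu method: Base keeps the 4096 default while the Big subclasses pass 3072, and each method multiplies that value by the GPU count. A quick illustration, not part of the commit; the paths are hypothetical:

    # Base vs. Big effective (global) batch size on 8 GPUs.
    base = TransformerBaseEstimatorBenchmarkReal(output_dir='/tmp/base',
                                                 root_data_dir='/data')  # 4096 per GPU
    big = TransformerBigEstimatorBenchmarkReal(output_dir='/tmp/big',
                                               root_data_dir='/data')    # 3072 per GPU
    # benchmark_graph_8_gpu() sets FLAGS.batch_size = batch_per_gpu * 8:
    #   base -> 4096 * 8 = 32768
    #   big  -> 3072 * 8 = 24576
    base.benchmark_graph_8_gpu()
    big.benchmark_graph_8_gpu()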