Unverified Commit 53e3adb8 authored by Hongkun Yu, committed by GitHub

Merged commit includes the following changes: (#7301)

259889221  by hongkuny<hongkuny@google.com>:

    Add no ds / xla / eager perfzero tests

--

PiperOrigin-RevId: 259889221
parent 3c5330d8
@@ -33,6 +33,7 @@ from official.bert import modeling
 from official.bert import run_classifier
 from official.bert.benchmark import benchmark_utils
 from official.utils.misc import distribution_utils
+from official.utils.misc import keras_utils

 # pylint: disable=line-too-long
 PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/tf_20/uncased_L-24_H-1024_A-16/bert_model.ckpt'
@@ -54,7 +55,7 @@ class BertClassifyBenchmarkBase(benchmark_utils.BertBenchmarkBase):
     self.num_steps_per_epoch = None

   @flagsaver.flagsaver
-  def _run_bert_classifier(self, callbacks=None):
+  def _run_bert_classifier(self, callbacks=None, use_ds=True, enable_xla=False):
     """Starts BERT classification task."""
     with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
       input_meta_data = json.loads(reader.read().decode('utf-8'))
@@ -70,7 +71,11 @@ class BertClassifyBenchmarkBase(benchmark_utils.BertBenchmarkBase):
     eval_steps = int(
         math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

     strategy = distribution_utils.get_distribution_strategy(
-        distribution_strategy='mirrored', num_gpus=self.num_gpus)
+        distribution_strategy='mirrored' if use_ds else 'off',
+        num_gpus=self.num_gpus)
+    # TODO(hongkuny): Enable XLA once we are confident with its performance.
+    keras_utils.set_config_v2(enable_xla)

     steps_per_loop = 1
     run_classifier.run_customized_training(
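Two things happen in the added lines above: passing 'off' instead of 'mirrored' asks distribution_utils.get_distribution_strategy for no strategy at all, and keras_utils.set_config_v2(enable_xla) switches XLA compilation on or off before the model is built. Neither helper's body is part of this diff; the following is only a hedged sketch of what they presumably amount to (the real implementations live under official/utils/misc/ and may differ):

    import tensorflow as tf

    def get_distribution_strategy(distribution_strategy='mirrored', num_gpus=0):
      """Sketch of the 'mirrored'/'off' dispatch assumed by the benchmarks."""
      distribution_strategy = distribution_strategy.lower()
      if distribution_strategy == 'off':
        # No tf.distribute strategy: callers receive None and must handle it
        # (see the get_strategy_scope change further down in this commit).
        return None
      if distribution_strategy == 'mirrored':
        devices = ['/gpu:%d' % i for i in range(num_gpus)] if num_gpus else None
        return tf.distribute.MirroredStrategy(devices=devices)
      raise ValueError('Unknown distribution strategy: %s' % distribution_strategy)

    def set_config_v2(enable_xla=False):
      """Sketch: assumed to toggle XLA JIT compilation for the TF 2.x runtime."""
      tf.config.optimizer.set_jit(enable_xla)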
@@ -113,11 +118,14 @@ class BertClassifyBenchmarkReal(BertClassifyBenchmarkBase):
   def _run_and_report_benchmark(self,
                                 training_summary_path,
                                 min_accuracy=0,
-                                max_accuracy=1):
+                                max_accuracy=1,
+                                use_ds=True,
+                                enable_xla=False):
     """Starts BERT performance benchmark test."""
     start_time_sec = time.time()
-    self._run_bert_classifier(callbacks=[self.timer_callback])
+    self._run_bert_classifier(
+        callbacks=[self.timer_callback], use_ds=use_ds, enable_xla=enable_xla)
     wall_time_sec = time.time() - start_time_sec

     with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
@@ -148,6 +156,38 @@ class BertClassifyBenchmarkReal(BertClassifyBenchmarkBase):
     summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
     self._run_and_report_benchmark(summary_path)

+  def benchmark_1_gpu_mrpc_xla(self):
+    """Test BERT model performance with 1 GPU with XLA."""
+    self._setup()
+    self.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_xla')
+    FLAGS.train_data_path = self.train_data_path
+    FLAGS.eval_data_path = self.eval_data_path
+    FLAGS.input_meta_data_path = self.input_meta_data_path
+    FLAGS.bert_config_file = self.bert_config_file
+    FLAGS.train_batch_size = 4
+    FLAGS.eval_batch_size = 4
+
+    summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
+    self._run_and_report_benchmark(summary_path, enable_xla=True)
+
+  def benchmark_1_gpu_mrpc_no_dist_strat(self):
+    """Test BERT model performance with 1 GPU, no distribution strategy."""
+    self._setup()
+    self.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_no_dist_strat')
+    FLAGS.train_data_path = self.train_data_path
+    FLAGS.eval_data_path = self.eval_data_path
+    FLAGS.input_meta_data_path = self.input_meta_data_path
+    FLAGS.bert_config_file = self.bert_config_file
+    FLAGS.train_batch_size = 4
+    FLAGS.eval_batch_size = 4
+
+    summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
+    self._run_and_report_benchmark(summary_path, use_ds=False)
+
   def benchmark_2_gpu_mrpc(self):
     """Test BERT model performance with 2 GPUs."""
@@ -213,11 +253,13 @@ class BertClassifyAccuracy(BertClassifyBenchmarkBase):
   def _run_and_report_benchmark(self,
                                 training_summary_path,
                                 min_accuracy=0.84,
-                                max_accuracy=0.88):
+                                max_accuracy=0.88,
+                                enable_xla=False):
     """Starts BERT accuracy benchmark test."""
     start_time_sec = time.time()
-    self._run_bert_classifier(callbacks=[self.timer_callback])
+    self._run_bert_classifier(
+        callbacks=[self.timer_callback], enable_xla=enable_xla)
     wall_time_sec = time.time() - start_time_sec

     with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
@@ -229,6 +271,14 @@ class BertClassifyAccuracy(BertClassifyBenchmarkBase):
         min_accuracy=min_accuracy,
         max_accuracy=max_accuracy)

+  def _setup(self):
+    super(BertClassifyAccuracy, self)._setup()
+    FLAGS.train_data_path = self.train_data_path
+    FLAGS.eval_data_path = self.eval_data_path
+    FLAGS.input_meta_data_path = self.input_meta_data_path
+    FLAGS.bert_config_file = self.bert_config_file
+    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
+
   def benchmark_8_gpu_mrpc(self):
     """Run BERT model accuracy test with 8 GPUs.
@@ -236,18 +286,20 @@ class BertClassifyAccuracy(BertClassifyBenchmarkBase):
     accuracy metric has high variance between trainings. As such, we
     set the wide range of allowed accuracy (84% to 88%).
     """
     self._setup()
     FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
-    FLAGS.train_data_path = self.train_data_path
-    FLAGS.eval_data_path = self.eval_data_path
-    FLAGS.input_meta_data_path = self.input_meta_data_path
-    FLAGS.bert_config_file = self.bert_config_file
-    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
     summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
     self._run_and_report_benchmark(summary_path)

+  def benchmark_8_gpu_mrpc_xla(self):
+    """Run BERT model accuracy test with 8 GPUs with XLA."""
+    self._setup()
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc_xla')
+
+    summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
+    self._run_and_report_benchmark(summary_path, enable_xla=True)
+
 if __name__ == '__main__':
   tf.test.main()
@@ -32,6 +32,7 @@ from official.bert import run_squad
 from official.bert.benchmark import benchmark_utils
 from official.bert.benchmark import squad_evaluate_v1_1
 from official.utils.misc import distribution_utils
+from official.utils.misc import keras_utils

 # pylint: disable=line-too-long
 PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/tf_20/uncased_L-24_H-1024_A-16/bert_model.ckpt'
@@ -72,27 +73,29 @@ class BertSquadBenchmarkBase(benchmark_utils.BertBenchmarkBase):
     with tf.io.gfile.GFile(predictions_file, 'r') as reader:
       return json.load(reader)

-  def _get_distribution_strategy(self):
+  def _get_distribution_strategy(self, use_ds=True):
     """Gets the distribution strategy."""
     return distribution_utils.get_distribution_strategy(
-        distribution_strategy='mirrored', num_gpus=self.num_gpus)
+        distribution_strategy='mirrored' if use_ds else 'off',
+        num_gpus=self.num_gpus)

   @flagsaver.flagsaver
-  def _train_squad(self):
+  def _train_squad(self, use_ds=True, run_eagerly=False):
     """Runs BERT SQuAD training."""
     input_meta_data = self._read_input_meta_data_from_file()
-    strategy = self._get_distribution_strategy()
+    strategy = self._get_distribution_strategy(use_ds)

     run_squad.train_squad(
         strategy=strategy,
         input_meta_data=input_meta_data,
+        run_eagerly=run_eagerly,
         custom_callbacks=[self.timer_callback])

   @flagsaver.flagsaver
-  def _evaluate_squad(self):
+  def _evaluate_squad(self, use_ds=True):
     """Runs BERT SQuAD evaluation."""
     input_meta_data = self._read_input_meta_data_from_file()
-    strategy = self._get_distribution_strategy()
+    strategy = self._get_distribution_strategy(use_ds)

     run_squad.predict_squad(strategy=strategy, input_meta_data=input_meta_data)
@@ -126,10 +129,14 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     FLAGS.num_train_epochs = 1
     FLAGS.steps_per_loop = 1

-  def _run_and_report_benchmark(self):
+  def _run_and_report_benchmark(self,
+                                use_ds=True,
+                                enable_xla=False,
+                                run_eagerly=False):
     """Runs the benchmark and reports various metrics."""
+    keras_utils.set_config_v2(enable_xla)
     start_time_sec = time.time()
-    self._train_squad()
+    self._train_squad(use_ds=use_ds, run_eagerly=run_eagerly)
     wall_time_sec = time.time() - start_time_sec

     summary = self._read_training_summary_from_file()
@@ -150,6 +157,37 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self._run_and_report_benchmark()

+  def benchmark_1_gpu_xla(self):
+    """Tests BERT SQuAD model performance with 1 GPU with XLA."""
+    self._setup()
+    self.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla_squad')
+    FLAGS.train_batch_size = 4
+
+    self._run_and_report_benchmark(enable_xla=True)
+
+  def benchmark_1_gpu_no_dist_strat(self):
+    """Tests BERT SQuAD model performance with 1 GPU without DS."""
+    self._setup()
+    self.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat_squad')
+    FLAGS.train_batch_size = 4
+
+    self._run_and_report_benchmark(use_ds=False)
+
+  def benchmark_1_gpu_eager_no_dist_strat(self):
+    """Tests BERT SQuAD model performance with 1 GPU with eager execution."""
+    self._setup()
+    self.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_eager_no_dist_strat_squad')
+    FLAGS.train_batch_size = 4
+
+    self._run_and_report_benchmark(use_ds=False, run_eagerly=True)
+
   def benchmark_2_gpu(self):
     """Tests BERT SQuAD model performance with 2 GPUs."""
@@ -203,10 +241,14 @@ class BertSquadAccuracy(BertSquadBenchmarkBase):
     FLAGS.num_train_epochs = 2
     FLAGS.steps_per_loop = 1

-  def _run_and_report_benchmark(self):
+  def _run_and_report_benchmark(self,
+                                use_ds=True,
+                                enable_xla=False,
+                                run_eagerly=False):
     """Runs the benchmark and reports various metrics."""
+    keras_utils.set_config_v2(enable_xla)
     start_time_sec = time.time()
-    self._train_squad()
+    self._train_squad(use_ds=use_ds, run_eagerly=run_eagerly)
     self._evaluate_squad()
     wall_time_sec = time.time() - start_time_sec
@@ -219,6 +261,16 @@ class BertSquadAccuracy(BertSquadBenchmarkBase):
         min_accuracy=0.900,
         max_accuracy=0.908)

+  def benchmark_1_gpu_eager(self):
+    """Tests BERT SQuAD model accuracy with 1 GPU with eager execution."""
+    self._setup()
+    self.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_eager')
+    FLAGS.train_batch_size = 4
+
+    self._run_and_report_benchmark(use_ds=False, run_eagerly=True)
+
   def benchmark_8_gpu(self):
     """Tests BERT SQuAD model accuracy with 8 GPUs."""
@@ -229,6 +281,16 @@ class BertSquadAccuracy(BertSquadBenchmarkBase):
     self._run_and_report_benchmark()

+  def benchmark_8_gpu_xla(self):
+    """Tests BERT SQuAD model accuracy with 8 GPUs with XLA."""
+    self._setup()
+    self.num_gpus = 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_xla')
+    FLAGS.train_batch_size = 32
+
+    self._run_and_report_benchmark(enable_xla=True)
+
 if __name__ == '__main__':
   tf.test.main()
@@ -23,6 +23,7 @@ import os
 from absl import logging
 import tensorflow as tf

+from official.utils.misc import distribution_utils

 _SUMMARY_TXT = 'training_summary.txt'
 _MIN_SUMMARY_STEPS = 10
@@ -196,7 +197,7 @@ def run_customized_training_loop(
   with tf.device(get_primary_cpu_task(use_remote_tpu)):
     train_iterator = _get_input_iterator(train_input_fn, strategy)

-    with strategy.scope():
+    with distribution_utils.get_strategy_scope(strategy):
       # To correctly place the model weights on accelerators,
       # model and optimizer should be created in scope.
       model, sub_model = model_fn()
......
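Replacing strategy.scope() with distribution_utils.get_strategy_scope(strategy) is what lets the training loop run when no distribution strategy is configured: the new no-DS benchmarks end up passing strategy=None. The helper itself is not shown in this diff; a minimal sketch of the behavior it presumably provides (the real code in official/utils/misc/distribution_utils.py may differ):

    import contextlib

    def get_strategy_scope(strategy):
      """Returns strategy.scope() if a strategy is given, else a no-op scope.

      Sketch only: the contract assumed here is that
      `with get_strategy_scope(s):` works whether or not `s` is None.
      """
      if strategy:
        return strategy.scope()
      # Fall back to a context manager that does nothing, so callers can use a
      # single `with` statement regardless of whether a strategy is in play.
      return contextlib.nullcontext()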
@@ -173,11 +173,12 @@ def predict_squad_customized(strategy, input_meta_data, bert_config,
   return all_results


-def train_squad(strategy, input_meta_data, custom_callbacks=None):
+def train_squad(strategy,
+                input_meta_data,
+                custom_callbacks=None,
+                run_eagerly=False):
   """Run bert squad training."""
-  if not strategy:
-    raise ValueError('Distribution strategy cannot be None.')
+  if strategy:
+    logging.info('Training using customized training loop with distribution'
+                 ' strategy.')
@@ -219,6 +220,7 @@ def train_squad(strategy, input_meta_data, custom_callbacks=None):
       train_input_fn=train_input_fn,
       init_checkpoint=FLAGS.init_checkpoint,
       use_remote_tpu=use_remote_tpu,
+      run_eagerly=run_eagerly,
      custom_callbacks=custom_callbacks)
......
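run_eagerly is threaded from the benchmark through run_squad.train_squad into run_customized_training_loop, but this diff does not show how the loop consumes the flag. In TF 2.x custom training loops the usual pattern is to skip the tf.function wrapping of the train step when running eagerly. A hedged sketch of that pattern (the names below are illustrative, not the repository's actual code):

    import tensorflow as tf

    def make_train_step(model, optimizer, loss_fn, run_eagerly=False):
      """Builds a per-batch train step, compiled with tf.function unless eager."""

      def train_step(inputs, labels):
        with tf.GradientTape() as tape:
          predictions = model(inputs, training=True)
          loss = loss_fn(labels, predictions)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss

      # Eager mode keeps the plain Python function, which is slower but much
      # easier to debug; graph mode wraps it in tf.function for performance.
      return train_step if run_eagerly else tf.function(train_step)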