"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "534e86b38db748fbdfd71e2bf6970968b39f8357"
Commit a52564cb authored by Hongkun Yu, committed by A. Unique TensorFlower

PerfZero XLNet classifier IMDB accuracy test on 8 GPUs.

PiperOrigin-RevId: 270817869
parent 08bb9eb5
@@ -26,7 +26,7 @@ import time
 # pylint: disable=g-bad-import-order
 from absl import flags
 from absl.testing import flagsaver
-import tensorflow.compat.v2 as tf
+import tensorflow as tf
 # pylint: enable=g-bad-import-order
 from official.benchmark import bert_benchmark_utils as benchmark_utils
...
@@ -100,12 +100,19 @@ class BertBenchmarkBase(tf.test.Benchmark):
     metrics = [{
         'name': 'training_loss',
         'value': stats['train_loss'],
-    }, {
-        'name':
-            'exp_per_second',
-        'value':
-            self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size)
     }]
+    if self.timer_callback:
+      metrics.append({
+          'name':
+              'exp_per_second',
+          'value':
+              self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size)
+      })
+    else:
+      metrics.append({
+          'name': 'exp_per_second',
+          'value': 0.0,
+      })
     if 'eval_metrics' in stats:
       metrics.append({
...
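With this change, benchmarks that set `self.timer_callback = None` (as the new XLNet IMDB accuracy test added below does) still report a well-formed metrics list: `exp_per_second` simply falls back to 0.0. A minimal sketch of the resulting list, in the `{'name': ..., 'value': ...}` format used throughout these benchmarks; the numeric values are made up for illustration:

```python
# Illustrative only: metrics reported when timer_callback is None.
metrics = [
    {'name': 'training_loss', 'value': 0.31},
    {'name': 'exp_per_second', 'value': 0.0},  # fallback added by this change
    {'name': 'eval_metrics', 'value': 0.96},   # appended only if present in stats
]
```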
@@ -25,7 +25,7 @@ import time
 # pylint: disable=g-bad-import-order
 from absl import flags
 from absl.testing import flagsaver
-import tensorflow.compat.v2 as tf
+import tensorflow as tf
 # pylint: enable=g-bad-import-order
 from official.benchmark import bert_benchmark_utils as benchmark_utils
...
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes XLNet benchmarks and accuracy tests."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os
import time

# pylint: disable=g-bad-import-order
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
# pylint: enable=g-bad-import-order

from official.benchmark import bert_benchmark_utils as benchmark_utils
from official.nlp.xlnet import run_classifier

# pylint: disable=line-too-long
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/xlnet/large/xlnet_model-1'
CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.train.tf_record'
CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/xlnet/imdb/spiece.model.len-512.dev.eval.tf_record'
# pylint: enable=line-too-long

FLAGS = flags.FLAGS


class XLNetClassifyBenchmarkBase(benchmark_utils.BertBenchmarkBase):
  """Base class to hold methods common to test classes in the module."""

  def __init__(self, output_dir=None):
    super(XLNetClassifyBenchmarkBase, self).__init__(output_dir)
    self.num_epochs = None
    self.num_steps_per_epoch = None

  @flagsaver.flagsaver
  def _run_xlnet_classifier(self):
    """Starts XLNet classification task."""
    run_classifier.main(unused_argv=None)


class XLNetClassifyAccuracy(XLNetClassifyBenchmarkBase):
  """Short accuracy test for XLNet model.

  Tests XLNet classification task model accuracy. The naming
  convention of below test cases follow
  `benchmark_(number of gpus)_gpu_(dataset type)` format.
  """

  def __init__(self, output_dir=None, **kwargs):
    self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
    self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH

    super(XLNetClassifyAccuracy, self).__init__(output_dir=output_dir)

  def _run_and_report_benchmark(self,
                                training_summary_path,
                                min_accuracy=0.95,
                                max_accuracy=0.97):
    """Starts XLNet accuracy benchmark test."""
    start_time_sec = time.time()
    self._run_xlnet_classifier()
    wall_time_sec = time.time() - start_time_sec

    with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
      summary = json.loads(reader.read().decode('utf-8'))

    super(XLNetClassifyAccuracy, self)._report_benchmark(
        stats=summary,
        wall_time_sec=wall_time_sec,
        min_accuracy=min_accuracy,
        max_accuracy=max_accuracy)

  def _setup(self):
    super(XLNetClassifyAccuracy, self)._setup()
    FLAGS.train_data_size = 25000
    FLAGS.test_data_size = 25024
    FLAGS.train_batch_size = 16
    FLAGS.seq_len = 512
    FLAGS.reuse_len = 256
    FLAGS.mem_len = 0
    FLAGS.n_layer = 24
    FLAGS.d_model = 1024
    FLAGS.d_embed = 1024
    FLAGS.n_head = 16
    FLAGS.d_head = 64
    FLAGS.d_inner = 4096
    FLAGS.untie_r = True
    FLAGS.n_class = 2
    FLAGS.ff_activation = 'gelu'
    FLAGS.strategy_type = 'mirror'
    FLAGS.learning_rate = 2e-5
    FLAGS.train_steps = 4000
    FLAGS.warmup_steps = 500
    FLAGS.iterations = 200
    FLAGS.bi_data = False
    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
    FLAGS.train_tfrecord_path = self.train_data_path
    FLAGS.test_tfrecord_path = self.eval_data_path

  def benchmark_8_gpu_imdb(self):
    """Run XLNet model accuracy test with 8 GPUs."""
    self._setup()
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_imdb')
    # Sets timer_callback to None as we do not use it now.
    self.timer_callback = None
    summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
    self._run_and_report_benchmark(summary_path)


if __name__ == '__main__':
  tf.test.main()
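For reference, a minimal sketch of how the accuracy test above could be driven directly; in practice PerfZero discovers and invokes the `benchmark_*` methods itself, and the output directory here is purely illustrative:

```python
# Hypothetical direct invocation of the test defined above; assumes the
# XLNet flags have already been defined and parsed by absl, as they are
# when run under the benchmark harness.
benchmark = XLNetClassifyAccuracy(output_dir='/tmp/xlnet_imdb_benchmark')
benchmark.benchmark_8_gpu_imdb()
```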
@@ -72,7 +72,7 @@ def _steps_to_run(current_step, steps_per_epoch, steps_per_loop):
   return steps_per_loop


-def _write_txt_summary(training_summary, model_dir):
+def write_txt_summary(training_summary, model_dir):
   """Writes a summary text file to record stats."""
   summary_path = os.path.join(model_dir, _SUMMARY_TXT)
   with tf.io.gfile.GFile(summary_path, 'wb') as f:
@@ -415,6 +415,6 @@ def run_customized_training_loop(
                      train_metrics[0])
     training_summary['eval_metrics'] = _float_metric_value(eval_metrics[0])

-  _write_txt_summary(training_summary, model_dir)
+  write_txt_summary(training_summary, model_dir)

   return model
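Renaming `_write_txt_summary` to `write_txt_summary` makes the helper public so the XLNet training loop (see the `training_utils` hunk further down) can reuse it. Its body is truncated above; as a rough sketch of the behavior callers rely on, it JSON-encodes the summary dict into `_SUMMARY_TXT` (presumably `training_summary.txt`, which is what the XLNet benchmark reads back) under the model directory. This is an assumed reimplementation, not the exact source:

```python
import json
import os

import tensorflow as tf

_SUMMARY_TXT = 'training_summary.txt'  # assumed value of the module constant


def write_txt_summary(training_summary, model_dir):
  """Writes a JSON-encoded summary of training stats (illustrative sketch)."""
  summary_path = os.path.join(model_dir, _SUMMARY_TXT)
  with tf.io.gfile.GFile(summary_path, 'wb') as f:
    f.write(json.dumps(training_summary, indent=4))
```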
@@ -65,6 +65,8 @@ def run_evaluation(strategy,
       them when calculating the accuracy. For the reason that there will be
       dynamic-shape tensor, we first collect logits, labels and masks from TPU
       and calculate the accuracy via numpy locally.
+  Returns:
+    A float metric, accuracy.
   """

   def _test_step_fn(inputs):
@@ -108,12 +110,14 @@ def run_evaluation(strategy,
             np.argmax(merged_logits[real_index], axis=-1),
             merged_labels[real_index]))
     total += np.shape(real_index)[-1]
+  accuracy = float(correct) / float(total)
   logging.info("Train step: %d / acc = %d/%d = %f", step, correct, total,
-               float(correct) / float(total))
+               accuracy)
   if eval_summary_writer:
     with eval_summary_writer.as_default():
       tf.summary.scalar("eval_acc", float(correct) / float(total), step=step)
       eval_summary_writer.flush()
+  return accuracy


 def get_metric_fn():
@@ -191,7 +195,8 @@ def main(unused_argv):
       steps_per_loop=steps_per_loop,
       optimizer=optimizer,
       learning_rate_fn=learning_rate_fn,
-      model_dir=FLAGS.model_dir)
+      model_dir=FLAGS.model_dir,
+      save_steps=1000)


 if __name__ == "__main__":
...
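The evaluation loop above gathers logits, labels and padding masks from the accelerator and computes accuracy locally with numpy; the new `accuracy` variable and `return accuracy` expose that value to callers. A self-contained sketch of the masked-accuracy computation — the helper name and the mask convention (1 = real example, 0 = padding) are assumptions for illustration, not the exact code in `run_classifier`:

```python
import numpy as np


def masked_accuracy(merged_logits, merged_labels, merged_masks):
  """Accuracy over non-padded examples only (illustrative sketch)."""
  real_index = np.where(merged_masks == 1)[0]        # indices of real examples
  predictions = np.argmax(merged_logits[real_index], axis=-1)
  correct = np.sum(np.equal(predictions, merged_labels[real_index]))
  total = np.shape(real_index)[-1]
  return float(correct) / float(total)


# Example with two real examples and one padded slot:
logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.5, 0.5]])
labels = np.array([1, 0, 0])
masks = np.array([1, 1, 0])
print(masked_accuracy(logits, labels, masks))  # 1.0
```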
@@ -116,7 +116,8 @@ def run_evaluation(strategy,
     model: keras model object.
     step: current training step.
     eval_summary_writer: summary writer used to record evaluation metrics.
+  Returns:
+    A float metric, F1 score.
   """

   def _test_step_fn(inputs):
@@ -192,23 +193,23 @@ def run_evaluation(strategy,
   output_null_log_odds_file = os.path.join(input_meta_data["predict_dir"],
                                            "null_odds.json")

-  ret = squad_utils.write_predictions(
+  results = squad_utils.write_predictions(
       eval_examples, input_meta_data["eval_features"], cur_results,
       input_meta_data["n_best_size"], input_meta_data["max_answer_length"],
       output_prediction_file, output_nbest_file, output_null_log_odds_file,
       orig_data, input_meta_data["start_n_top"], input_meta_data["end_n_top"])

-  # Log current result
+  # Log current results.
   log_str = "Result | "
-  for key, val in ret.items():
+  for key, val in results.items():
     log_str += "{} {} | ".format(key, val)
   logging.info(log_str)

   if eval_summary_writer:
     with eval_summary_writer.as_default():
-      tf.summary.scalar("best_f1", ret["best_f1"], step=step)
-      tf.summary.scalar("best_exact", ret["best_exact"], step=step)
+      tf.summary.scalar("best_f1", results["best_f1"], step=step)
+      tf.summary.scalar("best_exact", results["best_exact"], step=step)
       eval_summary_writer.flush()
+  return results["best_f1"]


 def get_qaxlnet_model(model_config, run_config, start_n_top, end_n_top):
...
@@ -27,7 +27,7 @@ from absl import logging
 # pytype: disable=attribute-error
 # pylint: disable=g-bare-generic,unused-import
 import tensorflow as tf
-# Initialize TPU System.
+from official.modeling import model_training_utils
 from official.nlp.xlnet import data_utils
 from official.nlp import xlnet_modeling as modeling
 from typing import Any, Callable, Dict, Text, Optional
@@ -304,6 +304,18 @@ def train(
                             checkpoint_name.format(step=current_step))
   if test_input_fn:
     logging.info("Running final evaluation after training is complete.")
-    eval_fn(model, current_step, eval_summary_writer)
+    eval_metric = eval_fn(model, current_step, eval_summary_writer)
+
+  training_summary = {
+      "total_training_steps": total_training_steps,
+      "train_loss": _float_metric_value(train_loss_metric),
+  }
+  if train_metric:
+    training_summary["last_train_metrics"] = _float_metric_value(train_metric)
+  if test_input_fn:
+    # eval_metric is supposed to be a float.
+    training_summary["eval_metrics"] = eval_metric
+  model_training_utils.write_txt_summary(training_summary, model_dir)
+
   return model
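With this addition the XLNet training loop writes the same JSON-encoded `training_summary.txt` that the BERT training loop does, which is what lets `XLNetClassifyAccuracy._run_and_report_benchmark` read the stats back and check them against the accuracy bounds. A small sketch of that read path, with an illustrative model directory:

```python
import json
import os

import tensorflow as tf

model_dir = '/tmp/xlnet_imdb_model'  # illustrative path
summary_path = os.path.join(model_dir, 'training_summary.txt')

# Mirrors the read in _run_and_report_benchmark above.
with tf.io.gfile.GFile(summary_path, 'rb') as reader:
  summary = json.loads(reader.read().decode('utf-8'))
print(summary['train_loss'], summary.get('eval_metrics'))
```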