Unverified commit 6fc642d4, authored by Hongjun Choi and committed by GitHub

Merged commit includes the following changes: (#6885)

250009207  by A. Unique TensorFlower<gardener@tensorflow.org>:

    Add feature in BERT to write training metrics to a summary file.

--

PiperOrigin-RevId: 250009207
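For orientation: the change below makes the BERT custom training loop write a small JSON summary, training_summary.txt, into the model directory, and the benchmark reads it back to report metrics. A sketch of that summary as a Python dict (the key names come from the diff; the numbers are hypothetical):

# Illustrative training_summary dict (and hence file contents, since it is
# dumped as JSON). Values are made up; the metric keys appear only when the
# corresponding Keras metric was configured.
training_summary = {
    'total_training_steps': 1000,
    'train_loss': 0.35,
    'train_metrics': 0.91,   # training accuracy
    'eval_metrics': 0.84,    # evaluation accuracy
}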
parent fa10031d
@@ -29,6 +29,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
from official.bert import run_classifier
# pylint: disable=line-too-long
PRETRAINED_CHECKPOINT_PATH = 'gs://tf-perfzero-data/bert/bert_model.ckpt'
CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_train.tf_record'
CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_eval.tf_record'
CLASSIFIER_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_meta_data'
@@ -55,7 +56,7 @@ class BertBenchmarkBase(tf.test.Benchmark):
    """Sets up and resets flags before each test."""
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
-    if BertBenchmark.local_flags is None:
+    if BertBenchmarkBase.local_flags is None:
      # Loads flags to get defaults to then override. List cannot be empty.
      flags.FLAGS(['foo'])
      saved_flag_values = flagsaver.save_flag_values()
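As an aside, the local_flags bookkeeping above relies on absl's flagsaver to snapshot and restore flag state between benchmark runs. A minimal standalone sketch of that pattern, assuming only absl-py (the model_dir flag here is defined purely for illustration; in the benchmark the real flags come from run_classifier):

# Sketch of the save/restore flag pattern used by BertBenchmarkBase.
from absl import flags
from absl.testing import flagsaver

flags.DEFINE_string('model_dir', None, 'Illustrative flag.')

flags.FLAGS(['prog'])                              # parse once so defaults are populated
saved_flag_values = flagsaver.save_flag_values()   # snapshot the defaults

flags.FLAGS.model_dir = '/tmp/benchmark_run'       # per-benchmark override
# ... run one benchmark ...
flagsaver.restore_flag_values(saved_flag_values)   # reset before the next run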
@@ -70,20 +71,39 @@ class BertBenchmarkBase(tf.test.Benchmark):
      stats: dict returned from BERT models with known entries.
      wall_time_sec: the duration of the benchmark execution in seconds
    """
-    del stats
-    self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=[])
-
-
-class BertBenchmark(BertBenchmarkBase):
-  """Short performance tests for BERT model."""
+    metrics = [{
+        'name': 'training_loss',
+        'value': stats['train_loss'],
+    }]
+    if 'train_metrics' in stats:
+      metrics.append({
+          'name': 'train_accuracy',
+          'value': stats['train_metrics'],
+      })
+    if 'eval_metrics' in stats:
+      metrics.append({
+          'name': 'eval_accuracy',
+          'value': stats['eval_metrics'],
+      })
+    self.report_benchmark(
+        iters=stats['total_training_steps'],
+        wall_time=wall_time_sec,
+        metrics=metrics)
+
+
+class BertBenchmarkAccuracyTest(BertBenchmarkBase):
+  """Short benchmark tests for BERT model."""

  def __init__(self, output_dir=None, **kwargs):
    self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
    self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
    self.bert_config_file = MODEL_CONFIG_FILE_PATH
    self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
-    super(BertBenchmark, self).__init__(output_dir=output_dir)
+    super(BertBenchmarkAccuracyTest, self).__init__(output_dir=output_dir)

  @flagsaver.flagsaver
  def _run_bert_classifier(self):
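The metrics argument that the reworked _report_benchmark builds follows the tf.test.Benchmark convention of name/value dicts. A small standalone sketch of that API, with a hypothetical benchmark name and invented values:

# Illustrative tf.test.Benchmark subclass showing the metrics format used above.
import tensorflow as tf

class _SketchBenchmark(tf.test.Benchmark):

  def benchmark_example(self):
    # Reports one iteration count, a wall time, and named metric entries.
    self.report_benchmark(
        iters=1000,
        wall_time=12.3,
        metrics=[{'name': 'training_loss', 'value': 0.35},
                 {'name': 'eval_accuracy', 'value': 0.84}])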
@@ -93,23 +113,28 @@ class BertBenchmark(BertBenchmarkBase):
    strategy = tf.distribute.MirroredStrategy()
    run_classifier.run_bert(strategy, input_meta_data)

-  def _run_and_report_benchmark(self):
+  def _run_and_report_benchmark(self, training_summary_path):
    start_time_sec = time.time()
    self._run_bert_classifier()
    wall_time_sec = time.time() - start_time_sec
-    super(BertBenchmark, self)._report_benchmark(
-        stats=None, wall_time_sec=wall_time_sec)
+    with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
+      summary = json.loads(reader.read().decode('utf-8'))
+    super(BertBenchmarkAccuracyTest, self)._report_benchmark(
+        stats=summary, wall_time_sec=wall_time_sec)

-  def benchmark_1_gpu(self):
+  def benchmark_8_gpu(self):
    self._setup()
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
    FLAGS.train_data_path = self.train_data_path
    FLAGS.eval_data_path = self.eval_data_path
    FLAGS.input_meta_data_path = self.input_meta_data_path
    FLAGS.bert_config_file = self.bert_config_file
    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
-    self._run_and_report_benchmark()
+    summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
+    self._run_and_report_benchmark(summary_path)
if __name__ == '__main__':
......
@@ -18,11 +18,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

+import json
+import os

from absl import logging
import tensorflow as tf

+SUMMARY_TXT = 'training_summary.txt'


def get_primary_cpu_task(use_remote_tpu=False):
  """Returns primary CPU task to which input pipeline Ops are put."""
@@ -182,11 +185,13 @@ def run_customized_training_loop(
  def _run_evaluation(current_training_step, test_iterator):
    """Runs validation steps and aggregates metrics."""
    for _ in range(eval_steps):
      test_step(test_iterator)
+    metric_result = metric.result().numpy().astype(float)
    logging.info('Step: [%d] Validation metric = %f', current_training_step,
-                 metric.result())
+                 metric_result)
+    return metric_result

  # Training loop starts here.
  checkpoint = tf.train.Checkpoint(model=model)
@@ -202,18 +207,21 @@ def run_customized_training_loop(
  current_step = optimizer.iterations.numpy()
  checkpoint_name = 'ctl_step_{step}.ckpt'

+  train_metric_result = None
+  eval_metric_result = None
+  train_loss = None
  while current_step < total_training_steps:
-    loss = train_step(train_iterator)
+    train_loss = train_step(train_iterator).numpy().astype(float)
    current_step += 1
    if train_metric:
-      logging.info(
-          'Train Step: %d/%d / loss = %s / training metric = %s',
-          current_step, total_training_steps, loss.numpy(),
-          train_metric.result())
+      train_metric_result = train_metric.result().numpy().astype(float)
+      logging.info('Train Step: %d/%d / loss = %s / training metric = %s',
+                   current_step, total_training_steps, train_loss,
+                   train_metric_result)
    else:
      logging.info('Train Step: %d/%d / loss = %s', current_step,
-                   total_training_steps, loss.numpy())
+                   total_training_steps, train_loss)

    # Saves model checkpoints and runs validation steps at every epoch end.
    if current_step % steps_per_epoch == 0:
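A note on the .numpy().astype(float) casts introduced above: the per-step loss and Keras metric results come back as numpy float32 scalars, which json.dumps rejects, so they are converted to plain floats before being placed in the summary written below. A tiny sketch of the difference, assuming only numpy:

# Why the casts matter for the JSON summary: numpy float32 is not JSON
# serializable, while a built-in float (or the float64 that .astype(float)
# yields) is.
import json

import numpy as np

raw = np.float32(0.8732)        # e.g. what metric.result().numpy() returns
try:
  json.dumps(raw)
except TypeError:
  pass                          # float32 cannot be serialized directly
print(json.dumps(float(raw)))   # works once cast to a plain float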
@@ -238,7 +246,20 @@ def run_customized_training_loop(
  if eval_input_fn:
    logging.info('Running final evaluation after training is complete.')
-    _run_evaluation(current_step,
-                    strategy.make_dataset_iterator(eval_input_fn()))
+    eval_metric_result = _run_evaluation(
+        current_step, strategy.make_dataset_iterator(eval_input_fn()))

+  training_summary = {
+      'total_training_steps': total_training_steps,
+      'train_loss': train_loss
+  }
+  if train_metric_result is not None:
+    training_summary['train_metrics'] = train_metric_result
+  if eval_metric_result is not None:
+    training_summary['eval_metrics'] = eval_metric_result

+  summary_path = os.path.join(model_dir, SUMMARY_TXT)
+  with tf.io.gfile.GFile(summary_path, 'wb') as f:
+    f.write(json.dumps(training_summary, indent=4))

  return model
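One design note on the writer above: tf.io.gfile is used instead of the built-in open so the same code can write the summary to remote destinations such as the gs:// model directories used by the benchmarks, as well as to local disk. A small local-path sketch (path and values hypothetical):

# Writing and reading a JSON summary through tf.io.gfile.
import json

import tensorflow as tf

path = '/tmp/training_summary.txt'
with tf.io.gfile.GFile(path, 'w') as f:
  f.write(json.dumps({'train_loss': 0.35}, indent=4))
with tf.io.gfile.GFile(path, 'r') as f:
  print(json.loads(f.read()))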
@@ -21,7 +21,6 @@ from __future__ import print_function

import functools
import json
import math
-import os

from absl import app
from absl import flags
@@ -82,21 +81,6 @@ flags.DEFINE_float('learning_rate', 5e-5, 'The initial learning rate for Adam.')
FLAGS = flags.FLAGS


-def write_eval_results(output_dir, results):
-  """Writes and prints evaluation results.
-
-  Args:
-    output_dir: string, the path to the output directory.
-    results: a dictionary of evaluation metrics.
-  """
-  output_eval_file = os.path.join(output_dir, 'eval_results.txt')
-  with tf.io.gfile.GFile(output_eval_file, 'w') as writer:
-    logging.info('***** Eval results *****')
-    for key, val in results.items():
-      logging.info(' %s = %s', key, str(val))
-      writer.write('%s = %s\n' % (key, str(val)))
-
-
def get_loss_fn(num_classes, loss_scale=1.0):
  """Gets the classification loss function."""
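Since write_eval_results (and with it eval_results.txt) is removed here, anything that previously consumed that file would read the JSON training summary instead. A hedged sketch: the helper name is invented, and the key names are taken from the diff above.

# Hypothetical replacement for parsing eval_results.txt: pull the evaluation
# metric out of the training summary written by the training loop.
import json
import os

import tensorflow as tf

def read_eval_metric(model_dir):
  summary_path = os.path.join(model_dir, 'training_summary.txt')
  with tf.io.gfile.GFile(summary_path, 'rb') as reader:
    summary = json.loads(reader.read().decode('utf-8'))
  return summary.get('eval_metrics')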