"tests/vscode:/vscode.git/clone" did not exist on "a4f304a5f5667f9439fd767d8db09a0ae4bb79cd"
Unverified commit 6fc642d4 authored by Hongjun Choi, committed by GitHub

Merged commit includes the following changes: (#6885)

250009207  by A. Unique TensorFlower<gardener@tensorflow.org>:

    Add feature in BERT to write training metrics to a summary file.

--

PiperOrigin-RevId: 250009207
parent fa10031d
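
For orientation: the change below makes the custom training loop dump its final metrics as JSON into a training_summary.txt file in the model directory, and the benchmark then reads that file back to report results. A rough sketch of what the written summary looks like, with key names taken from the diff and the numeric values purely illustrative:

# Illustrative sketch only: approximate JSON content of training_summary.txt
# after a run. Key names come from the diff below; the values are made up.
import json

example_summary = {
    'total_training_steps': 300,   # total steps the training loop executed
    'train_loss': 0.35,            # final training loss, cast to float
    'train_metrics': 0.87,         # written only if a training metric is set
    'eval_metrics': 0.84,          # written only if eval_input_fn is provided
}
print(json.dumps(example_summary, indent=4))
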
@@ -29,6 +29,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.bert import run_classifier

 # pylint: disable=line-too-long
+PRETRAINED_CHECKPOINT_PATH = 'gs://tf-perfzero-data/bert/bert_model.ckpt'
 CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_train.tf_record'
 CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_eval.tf_record'
 CLASSIFIER_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_meta_data'
@@ -55,7 +56,7 @@ class BertBenchmarkBase(tf.test.Benchmark):
     """Sets up and resets flags before each test."""
     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
-    if BertBenchmark.local_flags is None:
+    if BertBenchmarkBase.local_flags is None:
       # Loads flags to get defaults to then override. List cannot be empty.
       flags.FLAGS(['foo'])
       saved_flag_values = flagsaver.save_flag_values()
@@ -70,20 +71,39 @@ class BertBenchmarkBase(tf.test.Benchmark):
       stats: dict returned from BERT models with known entries.
       wall_time_sec: the during of the benchmark execution in seconds
     """
-    del stats
-    self.report_benchmark(iters=-1, wall_time=wall_time_sec, metrics=[])
+    metrics = [{
+        'name': 'training_loss',
+        'value': stats['train_loss'],
+    }]
+
+    if 'train_metrics' in stats:
+      metrics.append({
+          'name': 'train_accuracy',
+          'value': stats['train_metrics'],
+      })
+    if 'eval_metric' in stats:
+      metrics.append({
+          'name': 'eval_accuracy',
+          'value': stats['eval_metrics'],
+      })
+    self.report_benchmark(
+        iters=stats['total_training_steps'],
+        wall_time=wall_time_sec,
+        metrics=metrics)


-class BertBenchmark(BertBenchmarkBase):
-  """Short performance tests for BERT model."""
+class BertBenchmarkAccuracyTest(BertBenchmarkBase):
+  """Short benchmark tests for BERT model."""

   def __init__(self, output_dir=None, **kwargs):
     self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
     self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
     self.bert_config_file = MODEL_CONFIG_FILE_PATH
     self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
-    super(BertBenchmark, self).__init__(output_dir=output_dir)
+    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
+    super(BertBenchmarkAccuracyTest, self).__init__(output_dir=output_dir)

   @flagsaver.flagsaver
   def _run_bert_classifier(self):
@@ -93,23 +113,28 @@ class BertBenchmark(BertBenchmarkBase):
     strategy = tf.distribute.MirroredStrategy()
     run_classifier.run_bert(strategy, input_meta_data)

-  def _run_and_report_benchmark(self):
+  def _run_and_report_benchmark(self, training_summary_path):
     start_time_sec = time.time()
     self._run_bert_classifier()
     wall_time_sec = time.time() - start_time_sec

-    super(BertBenchmark, self)._report_benchmark(
-        stats=None, wall_time_sec=wall_time_sec)
+    with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
+      summary = json.loads(reader.read().decode('utf-8'))
+    super(BertBenchmarkAccuracyTest, self)._report_benchmark(
+        stats=summary, wall_time_sec=wall_time_sec)

-  def benchmark_1_gpu(self):
+  def benchmark_8_gpu(self):
     self._setup()
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
     FLAGS.train_data_path = self.train_data_path
     FLAGS.eval_data_path = self.eval_data_path
     FLAGS.input_meta_data_path = self.input_meta_data_path
     FLAGS.bert_config_file = self.bert_config_file
+    FLAGS.init_checkpoint = self.pretrained_checkpoint_path

-    self._run_and_report_benchmark()
+    summary_path = os.path.join(FLAGS.model_dir, 'training_summary.txt')
+    self._run_and_report_benchmark(summary_path)


 if __name__ == '__main__':
...
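
Pieced together from the hunks above, the benchmark's read-and-report path amounts to the self-contained sketch below. report_fn is a stand-in for tf.test.Benchmark.report_benchmark; also note that the committed code checks 'eval_metric' in stats while reading stats['eval_metrics'], whereas this sketch uses 'eval_metrics' for both.

# Minimal sketch of the benchmark-side logic added above: read
# training_summary.txt and forward its contents as benchmark metrics.
# report_fn stands in for tf.test.Benchmark.report_benchmark.
import json

import tensorflow as tf


def report_from_summary(training_summary_path, wall_time_sec, report_fn):
  with tf.io.gfile.GFile(training_summary_path, 'rb') as reader:
    stats = json.loads(reader.read().decode('utf-8'))

  metrics = [{'name': 'training_loss', 'value': stats['train_loss']}]
  if 'train_metrics' in stats:
    metrics.append({'name': 'train_accuracy', 'value': stats['train_metrics']})
  if 'eval_metrics' in stats:
    metrics.append({'name': 'eval_accuracy', 'value': stats['eval_metrics']})

  report_fn(
      iters=stats['total_training_steps'],
      wall_time=wall_time_sec,
      metrics=metrics)
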
@@ -18,11 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import json
 import os

 from absl import logging
 import tensorflow as tf

+SUMMARY_TXT = 'training_summary.txt'
+

 def get_primary_cpu_task(use_remote_tpu=False):
   """Returns primary CPU task to which input pipeline Ops are put."""
@@ -182,11 +185,13 @@ def run_customized_training_loop(
   def _run_evaluation(current_training_step, test_iterator):
     """Runs validation steps and aggregate metrics."""
     for _ in range(eval_steps):
       test_step(test_iterator)
+    metric_result = metric.result().numpy().astype(float)
     logging.info('Step: [%d] Validation metric = %f', current_training_step,
-                 metric.result())
+                 metric_result)
+    return metric_result

   # Training loop starts here.
   checkpoint = tf.train.Checkpoint(model=model)
@@ -202,18 +207,21 @@
   current_step = optimizer.iterations.numpy()
   checkpoint_name = 'ctl_step_{step}.ckpt'

+  train_metric_result = None
+  eval_metric_result = None
+  train_loss = None
   while current_step < total_training_steps:
-    loss = train_step(train_iterator)
+    train_loss = train_step(train_iterator).numpy().astype(float)
     current_step += 1
     if train_metric:
-      logging.info(
-          'Train Step: %d/%d / loss = %s / training metric = %s',
-          current_step, total_training_steps, loss.numpy(),
-          train_metric.result())
+      train_metric_result = train_metric.result().numpy().astype(float)
+      logging.info('Train Step: %d/%d / loss = %s / training metric = %s',
+                   current_step, total_training_steps, train_loss,
+                   train_metric_result)
     else:
       logging.info('Train Step: %d/%d / loss = %s', current_step,
-                   total_training_steps, loss.numpy())
+                   total_training_steps, train_loss)

     # Saves model checkpoints and run validation steps at every epoch end.
     if current_step % steps_per_epoch == 0:
@@ -238,7 +246,20 @@
   if eval_input_fn:
     logging.info('Running final evaluation after training is complete.')
-    _run_evaluation(current_step,
-                    strategy.make_dataset_iterator(eval_input_fn()))
+    eval_metric_result = _run_evaluation(
+        current_step, strategy.make_dataset_iterator(eval_input_fn()))
+
+  training_summary = {
+      'total_training_steps': total_training_steps,
+      'train_loss': train_loss
+  }
+  if train_metric_result:
+    training_summary['train_metrics'] = train_metric_result
+  if eval_metric_result:
+    training_summary['eval_metrics'] = eval_metric_result
+
+  summary_path = os.path.join(model_dir, SUMMARY_TXT)
+  with tf.io.gfile.GFile(summary_path, 'wb') as f:
+    f.write(json.dumps(training_summary, indent=4))

   return model
@@ -21,7 +21,6 @@ from __future__ import print_function
 import functools
 import json
 import math
-import os

 from absl import app
 from absl import flags
@@ -82,21 +81,6 @@ flags.DEFINE_float('learning_rate', 5e-5, 'The initial learning rate for Adam.')
 FLAGS = flags.FLAGS

-def write_eval_results(output_dir, results):
-  """Writes and prints evaluation results.
-
-  Args:
-    output_dir: string, the path to the output directory.
-    results: a dictionary of evaluation metrics.
-  """
-  output_eval_file = os.path.join(output_dir, 'eval_results.txt')
-  with tf.io.gfile.GFile(output_eval_file, 'w') as writer:
-    logging.info('***** Eval results *****')
-    for key, val in results.items():
-      logging.info('  %s = %s', key, str(val))
-      writer.write('%s = %s\n' % (key, str(val)))
-
-
 def get_loss_fn(num_classes, loss_scale=1.0):
   """Gets the classification loss function."""
...