"src/geometry/vscode:/vscode.git/clone" did not exist on "d6eecf90a1fc258de3c494209ea89141c2f4bfbe"
Commit 669b0f18 authored by Zongwei Zhou, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 328390293
parent f42b8392
@@ -41,6 +41,16 @@ MAX_MLM_ACCURACY = 0.645
 MIN_NSP_ACCURACY = 0.94
 MAX_NSP_ACCURACY = 0.96
 
+# Pretrain masked language modeling accuracy range on GPU:
+MIN_MLM_ACCURACY_GPU = 0.378
+MAX_MLM_ACCURACY_GPU = 0.388
+
+# Pretrain next sentence prediction accuracy range on GPU:
+MIN_NSP_ACCURACY_GPU = 0.82
+MAX_NSP_ACCURACY_GPU = 0.84
+
 BERT_PRETRAIN_FILES_SEQ128 = 'gs://mlcompass-data/bert/pretraining_data/seq_128/wikipedia.tfrecord*,gs://mlcompass-data/bert/pretraining_data/seq_128/books.tfrecord*'
 BERT_BASE_CONFIG_FILE = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_config.json'
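The GPU ranges added above are consumed the same way as the existing TPU ranges: each accuracy metric is reported together with a min/max bound, and a value outside that bound marks the run as failing the accuracy check. A minimal sketch of that gating, using a hypothetical check_in_range helper that is not part of this commit (the real check lives in the benchmark reporting harness):

# Hypothetical illustration only; constants copied from the values above.
MIN_MLM_ACCURACY_GPU, MAX_MLM_ACCURACY_GPU = 0.378, 0.388
MIN_NSP_ACCURACY_GPU, MAX_NSP_ACCURACY_GPU = 0.82, 0.84


def check_in_range(name, value, min_value, max_value):
  """Returns True if a reported metric falls inside its expected range."""
  ok = min_value <= value <= max_value
  print('%s=%.3f expected [%.3f, %.3f] -> %s' %
        (name, value, min_value, max_value, 'OK' if ok else 'OUT OF RANGE'))
  return ok

# Example usage; the metric values here are made up.
check_in_range('masked_lm_accuracy', 0.381,
               MIN_MLM_ACCURACY_GPU, MAX_MLM_ACCURACY_GPU)
check_in_range('next_sentence_accuracy', 0.83,
               MIN_NSP_ACCURACY_GPU, MAX_NSP_ACCURACY_GPU)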
@@ -65,10 +75,11 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
         output_dir=output_dir, tpu=tpu, **kwargs)
 
   @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self, summary_path: str, report_accuracy: bool):
+  def _run_and_report_benchmark(self, summary_path: str, report_accuracy: bool,
+                                ds_type: str):
     """Runs and reports the benchmark given the provided configuration."""
     distribution = distribution_utils.get_distribution_strategy(
-        distribution_strategy='tpu', tpu_address=self.tpu)
+        distribution_strategy=ds_type, tpu_address=self.tpu)
     logging.info('Flags: %s', flags_core.get_nondefault_flags_as_str())
     start_time_sec = time.time()
     run_pretraining.run_bert_pretrain(
@@ -78,10 +89,10 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
     with tf.io.gfile.GFile(summary_path, 'rb') as reader:
       summary = json.loads(reader.read().decode('utf-8'))
     self._report_benchmark(summary, start_time_sec, wall_time_sec,
-                           report_accuracy)
+                           report_accuracy, ds_type)
 
   def _report_benchmark(self, summary, start_time_sec, wall_time_sec,
-                        report_accuracy):
+                        report_accuracy, ds_type):
     metrics = [{
         'name': 'train_loss',
         'value': summary['train_loss'],
@@ -96,16 +107,26 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
         'value': self.timer_callback.get_startup_time(start_time_sec)
     }]
     if report_accuracy:
+      if ds_type == 'tpu':
+        min_mlm_acc = MIN_MLM_ACCURACY
+        max_mlm_acc = MAX_MLM_ACCURACY
+        min_nsp_acc = MIN_NSP_ACCURACY
+        max_nsp_acc = MAX_NSP_ACCURACY
+      else:
+        min_mlm_acc = MIN_MLM_ACCURACY_GPU
+        max_mlm_acc = MAX_MLM_ACCURACY_GPU
+        min_nsp_acc = MIN_NSP_ACCURACY_GPU
+        max_nsp_acc = MAX_NSP_ACCURACY_GPU
       metrics.extend([{
           'name': 'masked_lm_accuracy',
           'value': summary['masked_lm_accuracy'],
-          'min_value': MIN_MLM_ACCURACY,
-          'max_value': MAX_MLM_ACCURACY,
+          'min_value': min_mlm_acc,
+          'max_value': max_mlm_acc,
       }, {
           'name': 'next_sentence_accuracy',
           'value': summary['next_sentence_accuracy'],
-          'min_value': MIN_NSP_ACCURACY,
-          'max_value': MAX_NSP_ACCURACY,
+          'min_value': min_nsp_acc,
+          'max_value': max_nsp_acc,
       }])
     self.report_benchmark(
         iters=summary['total_training_steps'],
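The if/else above keys the bound selection entirely off ds_type: 'tpu' keeps the original ranges, while any other strategy string (for example 'mirrored', used by the GPU benchmarks below) falls through to the *_GPU ranges. Purely as a sketch of the same idea, and not the code in this commit, the selection could also be table-driven:

# Sketch only (not this commit's code): table-driven selection of accuracy
# bounds. The names refer to the module-level constants defined earlier in
# this file.
_ACCURACY_BOUNDS = {
    'tpu': {
        'masked_lm_accuracy': (MIN_MLM_ACCURACY, MAX_MLM_ACCURACY),
        'next_sentence_accuracy': (MIN_NSP_ACCURACY, MAX_NSP_ACCURACY),
    },
    'gpu': {
        'masked_lm_accuracy': (MIN_MLM_ACCURACY_GPU, MAX_MLM_ACCURACY_GPU),
        'next_sentence_accuracy': (MIN_NSP_ACCURACY_GPU, MAX_NSP_ACCURACY_GPU),
    },
}


def accuracy_bounds(ds_type, metric_name):
  """Returns (min_value, max_value) for a metric given the strategy type."""
  # Anything other than 'tpu' (e.g. 'mirrored' for GPU runs) uses the GPU
  # bounds, matching the if/else in _report_benchmark.
  return _ACCURACY_BOUNDS['tpu' if ds_type == 'tpu' else 'gpu'][metric_name]

# Example: accuracy_bounds('mirrored', 'masked_lm_accuracy') -> (0.378, 0.388)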
@@ -115,22 +136,30 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
 
   def _specify_common_flags(self):
     FLAGS.bert_config_file = BERT_BASE_CONFIG_FILE
-    FLAGS.train_batch_size = 512
     FLAGS.learning_rate = 1e-4
     FLAGS.warmup_steps = 10000
     FLAGS.steps_per_loop = 10000
-    FLAGS.distribution_strategy = 'tpu'
     FLAGS.input_files = BERT_PRETRAIN_FILES_SEQ128
     FLAGS.max_seq_length = 128
     FLAGS.max_predictions_per_seq = 20
+
+  def _specify_tpu_common_flags(self):
+    FLAGS.distribution_strategy = 'tpu'
     FLAGS.dtype = 'bf16'
 
+  def _specify_gpu_common_flags(self):
+    FLAGS.distribution_strategy = 'mirrored'
+    FLAGS.dtype = 'fp16'
+    FLAGS.loss_scale = 'dynamic'
+
   @owner_utils.Owner('tf-model-garden')
   def benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps(self):
     """Test bert pretraining with 8x8 TPU for 500k steps."""
     # This is used for accuracy test.
     self._setup()
     self._specify_common_flags()
+    self._specify_tpu_common_flags()
+    FLAGS.train_batch_size = 512
     FLAGS.num_steps_per_epoch = 500000
     FLAGS.num_train_epochs = 1
     FLAGS.model_dir = self._get_model_dir(
@@ -142,13 +171,16 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
     # accuracy benchmark test.
     FLAGS.train_summary_interval = -1
     self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=True)
+        summary_path=summary_path,
+        report_accuracy=True,
+        ds_type=FLAGS.distribution_strategy)
 
   @owner_utils.Owner('tf-model-garden')
   def benchmark_perf_2x2_tpu_bf16_seq128_10k_steps(self):
     """Test bert pretraining with 2x2 TPU for 10000 steps."""
     self._setup()
     self._specify_common_flags()
+    self._specify_tpu_common_flags()
     FLAGS.num_steps_per_epoch = 5000
     FLAGS.num_train_epochs = 2
     FLAGS.train_batch_size = 128
@@ -158,13 +190,16 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
                                 'summaries/training_summary.txt')
     # Disable accuracy check.
     self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=False)
+        summary_path=summary_path,
+        report_accuracy=False,
+        ds_type=FLAGS.distribution_strategy)
 
   @owner_utils.Owner('tf-model-garden')
   def benchmark_perf_2x2_tpu_bf16_seq128_10k_steps_mlir(self):
     """Test bert pretraining with 2x2 TPU with MLIR for 10000 steps."""
     self._setup()
     self._specify_common_flags()
+    self._specify_tpu_common_flags()
     FLAGS.num_steps_per_epoch = 5000
     FLAGS.num_train_epochs = 2
     FLAGS.train_batch_size = 128
@@ -175,13 +210,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
     tf.config.experimental.enable_mlir_bridge()
     # Disable accuracy check.
     self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=False)
+        summary_path=summary_path,
+        report_accuracy=False,
+        ds_type=FLAGS.distribution_strategy)
 
   @owner_utils.Owner('tf-model-garden')
   def benchmark_perf_4x4_tpu_bf16_seq128_10k_steps(self):
     """Test bert pretraining with 4x4 TPU for 10000 steps."""
     self._setup()
     self._specify_common_flags()
+    self._specify_tpu_common_flags()
+    FLAGS.train_batch_size = 512
     FLAGS.num_steps_per_epoch = 5000
     FLAGS.num_train_epochs = 2
     FLAGS.model_dir = self._get_model_dir(
@@ -190,13 +229,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
                                 'summaries/training_summary.txt')
     # Disable accuracy check.
     self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=False)
+        summary_path=summary_path,
+        report_accuracy=False,
+        ds_type=FLAGS.distribution_strategy)
 
   @owner_utils.Owner('tf-model-garden')
   def benchmark_perf_4x4_tpu_bf16_seq128_10k_steps_mlir(self):
     """Test bert pretraining with 4x4 TPU with MLIR for 10000 steps."""
     self._setup()
     self._specify_common_flags()
+    self._specify_tpu_common_flags()
+    FLAGS.train_batch_size = 512
     FLAGS.num_steps_per_epoch = 5000
     FLAGS.num_train_epochs = 2
     FLAGS.model_dir = self._get_model_dir(
@@ -206,13 +249,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
     tf.config.experimental.enable_mlir_bridge()
     # Disable accuracy check.
     self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=False)
+        summary_path=summary_path,
+        report_accuracy=False,
+        ds_type=FLAGS.distribution_strategy)
 
   @owner_utils.Owner('tf-model-garden')
   def benchmark_perf_8x8_tpu_bf16_seq128_10k_steps(self):
     """Test bert pretraining with 8x8 TPU for 10000 steps."""
     self._setup()
     self._specify_common_flags()
+    self._specify_tpu_common_flags()
+    FLAGS.train_batch_size = 512
     FLAGS.num_steps_per_epoch = 5000
     FLAGS.num_train_epochs = 2
     FLAGS.model_dir = self._get_model_dir(
@@ -221,7 +268,75 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
                                 'summaries/training_summary.txt')
     # Disable accuracy check.
     self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=False)
+        summary_path=summary_path,
+        report_accuracy=False,
+        ds_type=FLAGS.distribution_strategy)
+
+  @owner_utils.Owner('tf-dist-strat')
+  def benchmark_accuracy_1x8_gpu_fp16_seq128_15k_steps(self):
+    """Test bert pretraining with 8 GPUs for 15k steps."""
+    # This is used for accuracy test.
+    self._setup()
+    self._specify_common_flags()
+    self._specify_gpu_common_flags()
+    FLAGS.train_batch_size = 96
+    FLAGS.num_steps_per_epoch = 5000
+    FLAGS.num_train_epochs = 3
+    FLAGS.steps_per_loop = 5000
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_accuracy_1x8_gpu_fp16_seq128_15k_steps')
+    summary_path = os.path.join(FLAGS.model_dir,
+                                'summaries/training_summary.txt')
+    # Set train_summary_interval to -1 to disable training summary, because
+    # writing summary to gcs may fail and summaries are not needed for this
+    # accuracy benchmark test.
+    FLAGS.train_summary_interval = -1
+    self._run_and_report_benchmark(
+        summary_path=summary_path,
+        report_accuracy=True,
+        ds_type=FLAGS.distribution_strategy)
+
+  @owner_utils.Owner('tf-dist-strat')
+  def benchmark_perf_1x1_gpu_fp16_seq128_200_steps(self):
+    """Test bert pretraining with 1 GPU for 200 steps."""
+    self._setup()
+    self._specify_common_flags()
+    self._specify_gpu_common_flags()
+    FLAGS.num_steps_per_epoch = 200
+    FLAGS.num_train_epochs = 1
+    FLAGS.num_gpus = 1
+    FLAGS.train_batch_size = 12
+    FLAGS.steps_per_loop = 100
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_perf_1x1_gpu_fp16_seq128_200_steps')
+    summary_path = os.path.join(FLAGS.model_dir,
+                                'summaries/training_summary.txt')
+    # Disable accuracy check.
+    self._run_and_report_benchmark(
+        summary_path=summary_path,
+        report_accuracy=False,
+        ds_type=FLAGS.distribution_strategy)
+
+  @owner_utils.Owner('tf-dist-strat')
+  def benchmark_perf_1x8_gpu_fp16_seq128_200_steps(self):
+    """Test bert pretraining with 8 GPUs for 200 steps."""
+    self._setup()
+    self._specify_common_flags()
+    self._specify_gpu_common_flags()
+    FLAGS.num_steps_per_epoch = 200
+    FLAGS.num_train_epochs = 1
+    FLAGS.num_gpus = 8
+    FLAGS.train_batch_size = 96
+    FLAGS.steps_per_loop = 100
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_perf_1x8_gpu_fp16_seq128_200_steps')
+    summary_path = os.path.join(FLAGS.model_dir,
+                                'summaries/training_summary.txt')
+    # Disable accuracy check.
+    self._run_and_report_benchmark(
+        summary_path=summary_path,
+        report_accuracy=False,
+        ds_type=FLAGS.distribution_strategy)
 
 
 if __name__ == '__main__':
...