Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
669b0f18
Commit
669b0f18
authored
Aug 25, 2020
by
Zongwei Zhou
Committed by
A. Unique TensorFlower
Aug 25, 2020
Browse files
Internal change
PiperOrigin-RevId: 328390293
parent
f42b8392
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
131 additions
and
16 deletions
+131
-16
official/benchmark/bert_pretrain_benchmark.py
official/benchmark/bert_pretrain_benchmark.py
+131
-16
No files found.
official/benchmark/bert_pretrain_benchmark.py
View file @
669b0f18
...
# Pretrain next sentence prediction accuracy range. The unsuffixed constants
# are the TPU thresholds; _report_benchmark selects them when ds_type == 'tpu'.
MIN_NSP_ACCURACY = 0.94
MAX_NSP_ACCURACY = 0.96
# Pretrain masked language modeling accuracy range (GPU runs):
MIN_MLM_ACCURACY_GPU = 0.378
MAX_MLM_ACCURACY_GPU = 0.388
# Pretrain next sentence prediction accuracy range (GPU runs):
MIN_NSP_ACCURACY_GPU = 0.82
MAX_NSP_ACCURACY_GPU = 0.84

# Comma-separated glob patterns for the seq-length-128 pretraining shards
# (Wikipedia + BooksCorpus TFRecords on GCS).
BERT_PRETRAIN_FILES_SEQ128 = 'gs://mlcompass-data/bert/pretraining_data/seq_128/wikipedia.tfrecord*,gs://mlcompass-data/bert/pretraining_data/seq_128/books.tfrecord*'
# BERT-base (uncased, L-12/H-768/A-12) model configuration file on GCS.
BERT_BASE_CONFIG_FILE = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_config.json'
...
@@ -65,10 +75,11 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -65,10 +75,11 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
output_dir
=
output_dir
,
tpu
=
tpu
,
**
kwargs
)
output_dir
=
output_dir
,
tpu
=
tpu
,
**
kwargs
)
@
benchmark_wrappers
.
enable_runtime_flags
@
benchmark_wrappers
.
enable_runtime_flags
def
_run_and_report_benchmark
(
self
,
summary_path
:
str
,
report_accuracy
:
bool
):
def
_run_and_report_benchmark
(
self
,
summary_path
:
str
,
report_accuracy
:
bool
,
ds_type
:
str
):
"""Runs and reports the benchmark given the provided configuration."""
"""Runs and reports the benchmark given the provided configuration."""
distribution
=
distribution_utils
.
get_distribution_strategy
(
distribution
=
distribution_utils
.
get_distribution_strategy
(
distribution_strategy
=
'tpu'
,
tpu_address
=
self
.
tpu
)
distribution_strategy
=
ds_type
,
tpu_address
=
self
.
tpu
)
logging
.
info
(
'Flags: %s'
,
flags_core
.
get_nondefault_flags_as_str
())
logging
.
info
(
'Flags: %s'
,
flags_core
.
get_nondefault_flags_as_str
())
start_time_sec
=
time
.
time
()
start_time_sec
=
time
.
time
()
run_pretraining
.
run_bert_pretrain
(
run_pretraining
.
run_bert_pretrain
(
...
@@ -78,10 +89,10 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -78,10 +89,10 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
with
tf
.
io
.
gfile
.
GFile
(
summary_path
,
'rb'
)
as
reader
:
with
tf
.
io
.
gfile
.
GFile
(
summary_path
,
'rb'
)
as
reader
:
summary
=
json
.
loads
(
reader
.
read
().
decode
(
'utf-8'
))
summary
=
json
.
loads
(
reader
.
read
().
decode
(
'utf-8'
))
self
.
_report_benchmark
(
summary
,
start_time_sec
,
wall_time_sec
,
self
.
_report_benchmark
(
summary
,
start_time_sec
,
wall_time_sec
,
report_accuracy
)
report_accuracy
,
ds_type
)
def
_report_benchmark
(
self
,
summary
,
start_time_sec
,
wall_time_sec
,
def
_report_benchmark
(
self
,
summary
,
start_time_sec
,
wall_time_sec
,
report_accuracy
):
report_accuracy
,
ds_type
):
metrics
=
[{
metrics
=
[{
'name'
:
'train_loss'
,
'name'
:
'train_loss'
,
'value'
:
summary
[
'train_loss'
],
'value'
:
summary
[
'train_loss'
],
...
@@ -96,16 +107,26 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -96,16 +107,26 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
'value'
:
self
.
timer_callback
.
get_startup_time
(
start_time_sec
)
'value'
:
self
.
timer_callback
.
get_startup_time
(
start_time_sec
)
}]
}]
if
report_accuracy
:
if
report_accuracy
:
if
ds_type
==
'tpu'
:
min_mlm_acc
=
MIN_MLM_ACCURACY
max_mlm_acc
=
MAX_MLM_ACCURACY
min_nsp_acc
=
MIN_NSP_ACCURACY
max_nsp_acc
=
MAX_NSP_ACCURACY
else
:
min_mlm_acc
=
MIN_MLM_ACCURACY_GPU
max_mlm_acc
=
MAX_MLM_ACCURACY_GPU
min_nsp_acc
=
MIN_NSP_ACCURACY_GPU
max_nsp_acc
=
MAX_NSP_ACCURACY_GPU
metrics
.
extend
([{
metrics
.
extend
([{
'name'
:
'masked_lm_accuracy'
,
'name'
:
'masked_lm_accuracy'
,
'value'
:
summary
[
'masked_lm_accuracy'
],
'value'
:
summary
[
'masked_lm_accuracy'
],
'min_value'
:
MIN_MLM_ACCURACY
,
'min_value'
:
min_mlm_acc
,
'max_value'
:
MAX_MLM_ACCURACY
,
'max_value'
:
max_mlm_acc
,
},
{
},
{
'name'
:
'next_sentence_accuracy'
,
'name'
:
'next_sentence_accuracy'
,
'value'
:
summary
[
'next_sentence_accuracy'
],
'value'
:
summary
[
'next_sentence_accuracy'
],
'min_value'
:
MIN_NSP_ACCURACY
,
'min_value'
:
min_nsp_acc
,
'max_value'
:
MAX_NSP_ACCURACY
,
'max_value'
:
max_nsp_acc
,
}])
}])
self
.
report_benchmark
(
self
.
report_benchmark
(
iters
=
summary
[
'total_training_steps'
],
iters
=
summary
[
'total_training_steps'
],
...
@@ -115,22 +136,30 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -115,22 +136,30 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
def _specify_common_flags(self):
  """Sets the flag values shared by every pretraining benchmark variant.

  Device-specific settings (distribution strategy, dtype, batch size) are
  applied separately by the TPU/GPU-specific helpers.
  """
  shared_flag_values = {
      'bert_config_file': BERT_BASE_CONFIG_FILE,
      'learning_rate': 1e-4,
      'warmup_steps': 10000,
      'steps_per_loop': 10000,
      'input_files': BERT_PRETRAIN_FILES_SEQ128,
      'max_seq_length': 128,
      'max_predictions_per_seq': 20,
  }
  for flag_name, flag_value in shared_flag_values.items():
    setattr(FLAGS, flag_name, flag_value)
def _specify_tpu_common_flags(self):
  """Sets the flag values common to all TPU benchmark variants."""
  # bf16 is the reduced-precision dtype used for the TPU runs.
  FLAGS.dtype = 'bf16'
  FLAGS.distribution_strategy = 'tpu'
def _specify_gpu_common_flags(self):
  """Sets the flag values common to all GPU benchmark variants."""
  FLAGS.distribution_strategy = 'mirrored'
  # fp16 with dynamic loss scaling for GPU mixed-precision training.
  FLAGS.loss_scale = 'dynamic'
  FLAGS.dtype = 'fp16'
@
owner_utils
.
Owner
(
'tf-model-garden'
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
def
benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps
(
self
):
def
benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps
(
self
):
"""Test bert pretraining with 8x8 TPU for 500k steps."""
"""Test bert pretraining with 8x8 TPU for 500k steps."""
# This is used for accuracy test.
# This is used for accuracy test.
self
.
_setup
()
self
.
_setup
()
self
.
_specify_common_flags
()
self
.
_specify_common_flags
()
self
.
_specify_tpu_common_flags
()
FLAGS
.
train_batch_size
=
512
FLAGS
.
num_steps_per_epoch
=
500000
FLAGS
.
num_steps_per_epoch
=
500000
FLAGS
.
num_train_epochs
=
1
FLAGS
.
num_train_epochs
=
1
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
...
@@ -142,13 +171,16 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -142,13 +171,16 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
# accuracy benchmark test.
# accuracy benchmark test.
FLAGS
.
train_summary_interval
=
-
1
FLAGS
.
train_summary_interval
=
-
1
self
.
_run_and_report_benchmark
(
self
.
_run_and_report_benchmark
(
summary_path
=
summary_path
,
report_accuracy
=
True
)
summary_path
=
summary_path
,
report_accuracy
=
True
,
ds_type
=
FLAGS
.
distribution_strategy
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
def
benchmark_perf_2x2_tpu_bf16_seq128_10k_steps
(
self
):
def
benchmark_perf_2x2_tpu_bf16_seq128_10k_steps
(
self
):
"""Test bert pretraining with 2x2 TPU for 10000 steps."""
"""Test bert pretraining with 2x2 TPU for 10000 steps."""
self
.
_setup
()
self
.
_setup
()
self
.
_specify_common_flags
()
self
.
_specify_common_flags
()
self
.
_specify_tpu_common_flags
()
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_train_epochs
=
2
FLAGS
.
num_train_epochs
=
2
FLAGS
.
train_batch_size
=
128
FLAGS
.
train_batch_size
=
128
...
@@ -158,13 +190,16 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -158,13 +190,16 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
'summaries/training_summary.txt'
)
'summaries/training_summary.txt'
)
# Disable accuracy check.
# Disable accuracy check.
self
.
_run_and_report_benchmark
(
self
.
_run_and_report_benchmark
(
summary_path
=
summary_path
,
report_accuracy
=
False
)
summary_path
=
summary_path
,
report_accuracy
=
False
,
ds_type
=
FLAGS
.
distribution_strategy
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
def
benchmark_perf_2x2_tpu_bf16_seq128_10k_steps_mlir
(
self
):
def
benchmark_perf_2x2_tpu_bf16_seq128_10k_steps_mlir
(
self
):
"""Test bert pretraining with 2x2 TPU with MLIR for 10000 steps."""
"""Test bert pretraining with 2x2 TPU with MLIR for 10000 steps."""
self
.
_setup
()
self
.
_setup
()
self
.
_specify_common_flags
()
self
.
_specify_common_flags
()
self
.
_specify_tpu_common_flags
()
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_train_epochs
=
2
FLAGS
.
num_train_epochs
=
2
FLAGS
.
train_batch_size
=
128
FLAGS
.
train_batch_size
=
128
...
@@ -175,13 +210,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -175,13 +210,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
tf
.
config
.
experimental
.
enable_mlir_bridge
()
tf
.
config
.
experimental
.
enable_mlir_bridge
()
# Disable accuracy check.
# Disable accuracy check.
self
.
_run_and_report_benchmark
(
self
.
_run_and_report_benchmark
(
summary_path
=
summary_path
,
report_accuracy
=
False
)
summary_path
=
summary_path
,
report_accuracy
=
False
,
ds_type
=
FLAGS
.
distribution_strategy
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
def
benchmark_perf_4x4_tpu_bf16_seq128_10k_steps
(
self
):
def
benchmark_perf_4x4_tpu_bf16_seq128_10k_steps
(
self
):
"""Test bert pretraining with 4x4 TPU for 10000 steps."""
"""Test bert pretraining with 4x4 TPU for 10000 steps."""
self
.
_setup
()
self
.
_setup
()
self
.
_specify_common_flags
()
self
.
_specify_common_flags
()
self
.
_specify_tpu_common_flags
()
FLAGS
.
train_batch_size
=
512
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_train_epochs
=
2
FLAGS
.
num_train_epochs
=
2
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
...
@@ -190,13 +229,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -190,13 +229,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
'summaries/training_summary.txt'
)
'summaries/training_summary.txt'
)
# Disable accuracy check.
# Disable accuracy check.
self
.
_run_and_report_benchmark
(
self
.
_run_and_report_benchmark
(
summary_path
=
summary_path
,
report_accuracy
=
False
)
summary_path
=
summary_path
,
report_accuracy
=
False
,
ds_type
=
FLAGS
.
distribution_strategy
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
def
benchmark_perf_4x4_tpu_bf16_seq128_10k_steps_mlir
(
self
):
def
benchmark_perf_4x4_tpu_bf16_seq128_10k_steps_mlir
(
self
):
"""Test bert pretraining with 4x4 TPU with MLIR for 10000 steps."""
"""Test bert pretraining with 4x4 TPU with MLIR for 10000 steps."""
self
.
_setup
()
self
.
_setup
()
self
.
_specify_common_flags
()
self
.
_specify_common_flags
()
self
.
_specify_tpu_common_flags
()
FLAGS
.
train_batch_size
=
512
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_train_epochs
=
2
FLAGS
.
num_train_epochs
=
2
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
...
@@ -206,13 +249,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -206,13 +249,17 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
tf
.
config
.
experimental
.
enable_mlir_bridge
()
tf
.
config
.
experimental
.
enable_mlir_bridge
()
# Disable accuracy check.
# Disable accuracy check.
self
.
_run_and_report_benchmark
(
self
.
_run_and_report_benchmark
(
summary_path
=
summary_path
,
report_accuracy
=
False
)
summary_path
=
summary_path
,
report_accuracy
=
False
,
ds_type
=
FLAGS
.
distribution_strategy
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
@
owner_utils
.
Owner
(
'tf-model-garden'
)
def
benchmark_perf_8x8_tpu_bf16_seq128_10k_steps
(
self
):
def
benchmark_perf_8x8_tpu_bf16_seq128_10k_steps
(
self
):
"""Test bert pretraining with 8x8 TPU for 10000 steps."""
"""Test bert pretraining with 8x8 TPU for 10000 steps."""
self
.
_setup
()
self
.
_setup
()
self
.
_specify_common_flags
()
self
.
_specify_common_flags
()
self
.
_specify_tpu_common_flags
()
FLAGS
.
train_batch_size
=
512
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_steps_per_epoch
=
5000
FLAGS
.
num_train_epochs
=
2
FLAGS
.
num_train_epochs
=
2
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
FLAGS
.
model_dir
=
self
.
_get_model_dir
(
...
@@ -221,7 +268,75 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
...
@@ -221,7 +268,75 @@ class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
'summaries/training_summary.txt'
)
'summaries/training_summary.txt'
)
# Disable accuracy check.
# Disable accuracy check.
self
.
_run_and_report_benchmark
(
self
.
_run_and_report_benchmark
(
summary_path
=
summary_path
,
report_accuracy
=
False
)
summary_path
=
summary_path
,
report_accuracy
=
False
,
ds_type
=
FLAGS
.
distribution_strategy
)
@owner_utils.Owner('tf-dist-strat')
def benchmark_accuracy_1x8_gpu_fp16_seq128_15k_steps(self):
  """Test bert pretraining with 8 GPU for 15k steps."""
  # This is used for accuracy test.
  self._setup()
  self._specify_common_flags()
  self._specify_gpu_common_flags()
  # 3 epochs x 5000 steps = 15k total training steps.
  FLAGS.train_batch_size = 96
  FLAGS.num_steps_per_epoch = 5000
  FLAGS.num_train_epochs = 3
  FLAGS.steps_per_loop = 5000
  FLAGS.model_dir = self._get_model_dir(
      'benchmark_accuracy_1x8_gpu_fp16_seq128_15k_steps')
  summary_path = os.path.join(FLAGS.model_dir,
                              'summaries/training_summary.txt')
  # Set train_summary_interval to -1 to disable training summary, because
  # writing summary to gcs may fail and summaries are not needed for this
  # accuracy benchmark test.
  FLAGS.train_summary_interval = -1
  self._run_and_report_benchmark(
      summary_path=summary_path,
      report_accuracy=True,
      ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_perf_1x1_gpu_fp16_seq128_200_steps(self):
  """Test bert pretraining with 1 GPU for 200 steps."""
  self._setup()
  self._specify_common_flags()
  self._specify_gpu_common_flags()
  # Short single-GPU run: 1 epoch of 200 steps, small per-replica batch.
  FLAGS.num_steps_per_epoch = 200
  FLAGS.num_train_epochs = 1
  FLAGS.num_gpus = 1
  FLAGS.train_batch_size = 12
  FLAGS.steps_per_loop = 100
  FLAGS.model_dir = self._get_model_dir(
      'benchmark_perf_1x1_gpu_fp16_seq128_200_steps')
  summary_path = os.path.join(FLAGS.model_dir,
                              'summaries/training_summary.txt')
  # Disable accuracy check.
  self._run_and_report_benchmark(
      summary_path=summary_path,
      report_accuracy=False,
      ds_type=FLAGS.distribution_strategy)
@owner_utils.Owner('tf-dist-strat')
def benchmark_perf_1x8_gpu_fp16_seq128_200_steps(self):
  """Test bert pretraining with 8 GPU for 200 steps."""
  self._setup()
  self._specify_common_flags()
  self._specify_gpu_common_flags()
  # Short 8-GPU run: 1 epoch of 200 steps; batch 96 = 12 per replica,
  # matching the single-GPU perf variant.
  FLAGS.num_steps_per_epoch = 200
  FLAGS.num_train_epochs = 1
  FLAGS.num_gpus = 8
  FLAGS.train_batch_size = 96
  FLAGS.steps_per_loop = 100
  FLAGS.model_dir = self._get_model_dir(
      'benchmark_perf_1x8_gpu_fp16_seq128_200_steps')
  summary_path = os.path.join(FLAGS.model_dir,
                              'summaries/training_summary.txt')
  # Disable accuracy check.
  self._run_and_report_benchmark(
      summary_path=summary_path,
      report_accuracy=False,
      ds_type=FLAGS.distribution_strategy)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment