ModelZoo / ResNet50_tensorflow / Commits / 2d05fc8a

Commit 2d05fc8a, authored Apr 22, 2020 by Jose Baiocchi, committed by A. Unique TensorFlower on Apr 22, 2020
Parent: 05feb2be

Internal change

PiperOrigin-RevId: 307878379
Showing 1 changed file with 90 additions and 99 deletions.

official/benchmark/retinanet_benchmark.py (+90, -99)
@@ -19,23 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=g-bad-import-order
-import copy
 import json
-import os
 import time
 
 from absl import flags
-from absl import logging
 from absl.testing import flagsaver
 import tensorflow as tf
 # pylint: enable=g-bad-import-order
 
-from official.benchmark import bert_benchmark_utils as benchmark_utils
-from official.utils.flags import core as flags_core
 from official.benchmark import benchmark_wrappers
+from official.benchmark import perfzero_benchmark
+from official.utils.flags import core as flags_core
+from official.utils.misc import keras_utils
 from official.vision.detection import main as detection
+from official.vision.detection.configs import base_config
 
-TMP_DIR = os.getenv('TMPDIR')
 FLAGS = flags.FLAGS
 
 # pylint: disable=line-too-long
@@ -46,51 +44,41 @@ RESNET_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/retinanet/resnet50-checkpoi
 # pylint: enable=line-too-long
 
-class DetectionBenchmarkBase(tf.test.Benchmark):
-  """Base class to hold methods common to test classes."""
-  local_flags = None
-
-  def __init__(self, output_dir=None):
-    self.num_gpus = 8
-
-    if not output_dir:
-      output_dir = '/tmp'
-    self.output_dir = output_dir
-    self.timer_callback = None
-
-  def _get_model_dir(self, folder_name):
-    """Returns directory to store info, e.g. saved model and event log."""
-    return os.path.join(self.output_dir, folder_name)
-
-  def _setup(self):
-    """Sets up and resets flags before each test."""
-    self.timer_callback = benchmark_utils.BenchmarkTimerCallback()
-
-    if DetectionBenchmarkBase.local_flags is None:
-      # Loads flags to get defaults to then override. List cannot be empty.
-      flags.FLAGS(['foo'])
-      saved_flag_values = flagsaver.save_flag_values()
-      DetectionBenchmarkBase.local_flags = saved_flag_values
-    else:
-      flagsaver.restore_flag_values(DetectionBenchmarkBase.local_flags)
-
-  def _report_benchmark(self, stats, wall_time_sec, min_ap, max_ap,
-                        train_batch_size=None):
+class TimerCallback(keras_utils.TimeHistory):
+  """TimeHistory subclass for benchmark reporting."""
+
+  def get_examples_per_sec(self, warmup=1):
+    # First entry in timestamp_log is the start of the step 1. The rest of the
+    # entries are the end of each step recorded.
+    time_log = self.timestamp_log
+    seconds = time_log[-1].timestamp - time_log[warmup].timestamp
+    steps = time_log[-1].batch_index - time_log[warmup].batch_index
+    return self.batch_size * steps / seconds
+
+  def get_startup_time(self, start_time_sec):
+    return self.timestamp_log[0].timestamp - start_time_sec
+
+
+class DetectionBenchmarkBase(perfzero_benchmark.PerfZeroBenchmark):
+  """Base class to hold methods common to test classes."""
+
+  def __init__(self, **kwargs):
+    super(DetectionBenchmarkBase, self).__init__(**kwargs)
+    self.timer_callback = None
+
+  def _report_benchmark(self, stats, start_time_sec, wall_time_sec, min_ap,
+                        max_ap, warmup):
     """Report benchmark results by writing to local protobuf file.
 
     Args:
       stats: dict returned from Detection models with known entries.
-      wall_time_sec: the during of the benchmark execution in seconds
+      start_time_sec: the start of the benchmark execution in seconds
+      wall_time_sec: the duration of the benchmark execution in seconds
       min_ap: Minimum detection AP constraint to verify correctness of the
        model.
      max_ap: Maximum detection AP accuracy constraint to verify correctness of
        the model.
-      train_batch_size: Train batch size. It is needed for computing
-        exp_per_second.
+      warmup: Number of time log entries to ignore when computing examples/sec.
     """
     metrics = [{
         'name': 'total_loss',
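The throughput math in TimerCallback.get_examples_per_sec above is simply (batch size × steps covered) / elapsed seconds, skipping the first `warmup` entries of the timestamp log. A minimal sketch with made-up numbers, assuming each log entry exposes batch_index and timestamp as the code implies:

    import collections

    # Stand-in for the entries keras_utils.TimeHistory records in timestamp_log.
    BatchTimestamp = collections.namedtuple('BatchTimestamp', ['batch_index', 'timestamp'])

    batch_size = 64
    timestamp_log = [
        BatchTimestamp(batch_index=1, timestamp=100.0),    # start of step 1
        BatchTimestamp(batch_index=100, timestamp=130.0),  # end of step 100
        BatchTimestamp(batch_index=200, timestamp=155.0),  # end of step 200
    ]
    warmup = 1
    seconds = timestamp_log[-1].timestamp - timestamp_log[warmup].timestamp    # 25.0
    steps = timestamp_log[-1].batch_index - timestamp_log[warmup].batch_index  # 100
    print(batch_size * steps / seconds)  # 256.0 examples per second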
@@ -99,7 +87,11 @@ class DetectionBenchmarkBase(tf.test.Benchmark):
     if self.timer_callback:
       metrics.append({
           'name': 'exp_per_second',
-          'value': self.timer_callback.get_examples_per_sec(train_batch_size)
+          'value': self.timer_callback.get_examples_per_sec(warmup)
+      })
+      metrics.append({
+          'name': 'startup_time',
+          'value': self.timer_callback.get_startup_time(start_time_sec)
       })
     else:
       metrics.append({
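For reference, the metrics list assembled above is a list of name/value dicts; with the timer callback present it would look roughly like this (values are illustrative, not from a real run):

    metrics = [
        {'name': 'total_loss', 'value': 2.31},
        {'name': 'exp_per_second', 'value': 95.7},  # timer_callback.get_examples_per_sec(warmup)
        {'name': 'startup_time', 'value': 41.2},    # timer_callback.get_startup_time(start_time_sec)
    ]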
@@ -125,17 +117,17 @@ class DetectionBenchmarkBase(tf.test.Benchmark):
 class RetinanetBenchmarkBase(DetectionBenchmarkBase):
   """Base class to hold methods common to test classes in the module."""
 
-  def __init__(self, output_dir=None, **kwargs):
+  def __init__(self, **kwargs):
     self.train_data_path = COCO_TRAIN_DATA
     self.eval_data_path = COCO_EVAL_DATA
     self.eval_json_path = COCO_EVAL_JSON
     self.resnet_checkpoint_path = RESNET_CHECKPOINT_PATH
+    super(RetinanetBenchmarkBase, self).__init__(**kwargs)
 
-    super(RetinanetBenchmarkBase, self).__init__(output_dir=output_dir)
-
   def _run_detection_main(self):
     """Starts detection job."""
     if self.timer_callback:
+      FLAGS.log_steps = 0  # prevent detection.run from adding the same callback
       return detection.run(callbacks=[self.timer_callback])
     else:
       return detection.run()
@@ -149,37 +141,41 @@ class RetinanetAccuracy(RetinanetBenchmarkBase):
   `benchmark_(number of gpus)_gpu_(dataset type)` format.
   """
 
-  def __init__(self, output_dir=TMP_DIR, **kwargs):
-    super(RetinanetAccuracy, self).__init__(output_dir=output_dir)
-
   @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self, min_ap=0.325, max_ap=0.35):
+  def _run_and_report_benchmark(self, params, min_ap=0.325, max_ap=0.35,
+                                do_eval=True, warmup=1):
     """Starts RetinaNet accuracy benchmark test."""
+    FLAGS.params_override = json.dumps(params)
+    # Need timer callback to measure performance
+    self.timer_callback = TimerCallback(
+        batch_size=params['train']['batch_size'],
+        log_steps=FLAGS.log_steps,
+    )
 
     start_time_sec = time.time()
     FLAGS.mode = 'train'
     summary, _ = self._run_detection_main()
     wall_time_sec = time.time() - start_time_sec
 
-    FLAGS.mode = 'eval'
-    eval_metrics = self._run_detection_main()
-    summary.update(eval_metrics)
+    if do_eval:
+      FLAGS.mode = 'eval'
+      eval_metrics = self._run_detection_main()
+      summary.update(eval_metrics)
 
-    summary['train_batch_size'] = self.params_override['train']['batch_size']
-    summary['total_steps'] = self.params_override['train']['total_steps']
-    super(RetinanetAccuracy, self)._report_benchmark(
-        stats=summary,
-        wall_time_sec=wall_time_sec,
-        min_ap=min_ap,
-        max_ap=max_ap,
-        train_batch_size=self.params_override['train']['batch_size'])
+    summary['total_steps'] = params['train']['total_steps']
+    self._report_benchmark(summary, start_time_sec, wall_time_sec, min_ap,
+                           max_ap, warmup)
 
   def _setup(self):
     super(RetinanetAccuracy, self)._setup()
-    FLAGS.strategy_type = 'mirrored'
     FLAGS.model = 'retinanet'
-    self.params_override = {
+
+  def _params(self):
+    return {
         'train': {
             'batch_size': 64,
            'iterations_per_loop': 100,
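The dict returned by _params() reaches the detection binary through FLAGS.params_override as a JSON string. A small sketch of that handoff, assuming detection.run() merges the override over the model's default config as the code above suggests (values are placeholders):

    import json

    params = {
        'train': {'batch_size': 8, 'total_steps': 200, 'iterations_per_loop': 1},
        'eval': {'eval_samples': 8},
    }
    # This is what _run_and_report_benchmark assigns to FLAGS.params_override.
    print(json.dumps(params))
    # {"train": {"batch_size": 8, "total_steps": 200, "iterations_per_loop": 1}, "eval": {"eval_samples": 8}}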
@@ -189,6 +185,8 @@ class RetinanetAccuracy(RetinanetBenchmarkBase):
                 'path': self.resnet_checkpoint_path,
                 'prefix': 'resnet50/'
             },
+            # Speed up ResNet training when loading from the checkpoint.
+            'frozen_variable_prefix': base_config.RESNET_FROZEN_VAR_PREFIX,
         },
         'eval': {
             'batch_size': 8,
@@ -202,13 +200,11 @@ class RetinanetAccuracy(RetinanetBenchmarkBase):
   def benchmark_8_gpu_coco(self):
     """Run RetinaNet model accuracy test with 8 GPUs."""
     self._setup()
-    params = copy.deepcopy(self.params_override)
-    FLAGS.params_override = json.dumps(params)
+    params = self._params()
+    FLAGS.num_gpus = 8
     FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_coco')
-    # Sets timer_callback to None as we do not use it now.
-    self.timer_callback = None
-    self._run_and_report_benchmark()
+    FLAGS.strategy_type = 'mirrored'
+    self._run_and_report_benchmark(params)
 
 
 class RetinanetBenchmarkReal(RetinanetAccuracy):
@@ -219,15 +215,16 @@ class RetinanetBenchmarkReal(RetinanetAccuracy):
   `benchmark_(number of gpus)_gpu` format.
   """
 
-  def __init__(self, output_dir=TMP_DIR, **kwargs):
-    super(RetinanetBenchmarkReal, self).__init__(output_dir=output_dir)
+  def _setup(self):
+    super(RetinanetBenchmarkReal, self)._setup()
+    # Use negative value to avoid saving checkpoints.
+    FLAGS.save_checkpoint_freq = -1
 
   @flagsaver.flagsaver
   def benchmark_8_gpu_coco(self):
     """Run RetinaNet model accuracy test with 8 GPUs."""
-    self.num_gpus = 8
     self._setup()
-    params = copy.deepcopy(self.params_override)
+    params = self._params()
     params['train']['total_steps'] = 1875  # One epoch.
     # The iterations_per_loop must be one, otherwise the number of examples per
     # second would be wrong. Currently only support calling callback per batch
@@ -237,58 +234,52 @@ class RetinanetBenchmarkReal(RetinanetAccuracy):
     # Related bug: b/135933080
     params['train']['iterations_per_loop'] = 1
     params['eval']['eval_samples'] = 8
-    FLAGS.num_gpus = self.num_gpus
-    FLAGS.params_override = json.dumps(params)
+    FLAGS.num_gpus = 8
     FLAGS.model_dir = self._get_model_dir('real_benchmark_8_gpu_coco')
-    # Use negative value to avoid saving checkpoints.
-    FLAGS.save_checkpoint_freq = -1
-    if self.timer_callback is None:
-      logging.error('Cannot measure performance without timer callback')
-    else:
-      self._run_and_report_benchmark()
+    FLAGS.strategy_type = 'mirrored'
+    self._run_and_report_benchmark(params)
 
   @flagsaver.flagsaver
   def benchmark_1_gpu_coco(self):
     """Run RetinaNet model accuracy test with 1 GPU."""
-    self.num_gpus = 1
     self._setup()
-    params = copy.deepcopy(self.params_override)
+    params = self._params()
     params['train']['batch_size'] = 8
     params['train']['total_steps'] = 200
     params['train']['iterations_per_loop'] = 1
     params['eval']['eval_samples'] = 8
-    FLAGS.num_gpus = self.num_gpus
-    FLAGS.params_override = json.dumps(params)
+    FLAGS.num_gpus = 1
     FLAGS.model_dir = self._get_model_dir('real_benchmark_1_gpu_coco')
     FLAGS.strategy_type = 'one_device'
-    # Use negative value to avoid saving checkpoints.
-    FLAGS.save_checkpoint_freq = -1
-    if self.timer_callback is None:
-      logging.error('Cannot measure performance without timer callback')
-    else:
-      self._run_and_report_benchmark()
+    self._run_and_report_benchmark(params)
 
   @flagsaver.flagsaver
   def benchmark_xla_1_gpu_coco(self):
     """Run RetinaNet model accuracy test with 1 GPU and XLA enabled."""
-    self.num_gpus = 1
     self._setup()
-    params = copy.deepcopy(self.params_override)
+    params = self._params()
     params['train']['batch_size'] = 8
     params['train']['total_steps'] = 200
     params['train']['iterations_per_loop'] = 1
     params['eval']['eval_samples'] = 8
-    FLAGS.num_gpus = self.num_gpus
-    FLAGS.params_override = json.dumps(params)
-    FLAGS.model_dir = self._get_model_dir('real_benchmark_1_gpu_coco')
+    FLAGS.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('real_benchmark_xla_1_gpu_coco')
     FLAGS.strategy_type = 'one_device'
     FLAGS.enable_xla = True
-    # Use negative value to avoid saving checkpoints.
-    FLAGS.save_checkpoint_freq = -1
-    if self.timer_callback is None:
-      logging.error('Cannot measure performance without timer callback')
-    else:
-      self._run_and_report_benchmark()
+    self._run_and_report_benchmark(params)
+
+  @flagsaver.flagsaver
+  def benchmark_2x2_tpu_coco(self):
+    """Run RetinaNet model accuracy test with 4 TPUs."""
+    self._setup()
+    params = self._params()
+    params['train']['batch_size'] = 64
+    params['train']['total_steps'] = 1875  # One epoch.
+    params['train']['iterations_per_loop'] = 500
+    FLAGS.model_dir = self._get_model_dir('real_benchmark_2x2_tpu_coco')
+    FLAGS.strategy_type = 'tpu'
+    self._run_and_report_benchmark(params, do_eval=False, warmup=0)
 
 
 if __name__ == '__main__':
   tf.test.main()
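These benchmark methods are normally discovered and driven by the PerfZero harness; a hypothetical direct invocation would look like the sketch below, assuming the gs:// COCO data behind COCO_TRAIN_DATA/COCO_EVAL_DATA is reachable and that PerfZeroBenchmark accepts an output_dir keyword:

    # Hypothetical: run one benchmark case by hand and write results under /tmp.
    from official.benchmark.retinanet_benchmark import RetinanetBenchmarkReal

    benchmark = RetinanetBenchmarkReal(output_dir='/tmp/retinanet_benchmark')
    benchmark.benchmark_1_gpu_coco()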