Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
a67c28c8
Commit
a67c28c8
authored
May 06, 2021
by
A. Unique TensorFlower
Committed by
saberkun
May 07, 2021
Browse files
Internal change
PiperOrigin-RevId: 372471631
parent
027e5dc6
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
477 additions
and
0 deletions
+477
-0
official/benchmark/base_benchmark.py
official/benchmark/base_benchmark.py
+173
-0
official/benchmark/benchmark_definitions.py
official/benchmark/benchmark_definitions.py
+53
-0
official/benchmark/benchmark_lib.py
official/benchmark/benchmark_lib.py
+126
-0
official/benchmark/benchmark_lib_test.py
official/benchmark/benchmark_lib_test.py
+89
-0
official/benchmark/config_utils.py
official/benchmark/config_utils.py
+24
-0
official/benchmark/resnet_ctl_imagenet_benchmark.py
official/benchmark/resnet_ctl_imagenet_benchmark.py
+12
-0
No files found.
official/benchmark/base_benchmark.py
0 → 100644
View file @
a67c28c8
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common benchmark class for model garden models."""
import
os
import
pprint
# Import libraries
from
absl
import
logging
import
tensorflow
as
tf
from
tensorflow.python.platform
import
benchmark
# pylint: disable=unused-import
from
official.common
import
registry_imports
# pylint: disable=unused-import
from
official.benchmark
import
benchmark_lib
from
official.benchmark
import
benchmark_definitions
from
official.benchmark
import
config_utils
from
official.core
import
exp_factory
from
official.modeling
import
hyperparams
def _get_benchmark_params(benchmark_models):
  """Flattens grouped benchmark definitions into parameter tuples.

  Every benchmark definition is expanded twice, once per execution mode
  ('performance' then 'accuracy'), and named '<benchmark name>.<mode>'.

  Args:
    benchmark_models: A two-level mapping. The outer keys are ignored; each
      outer value maps a benchmark name to a dict that provides the keys
      'experiment_type', 'platform', 'precision' and 'metric_bounds', and
      optionally 'config_files' and 'params_override'.

  Returns:
    A list of 9-tuples laid out as
    (name, name, experiment_type, execution_mode, platform, precision,
    metric_bounds, config_files, params_override). A missing or empty
    'config_files' becomes [], a missing or falsy 'params_override' becomes
    None.
  """
  execution_modes = ('performance', 'accuracy')

  def _as_row(name, mode, spec):
    """Builds the parameter tuple for one benchmark in one execution mode."""
    full_name = '{}.{}'.format(name, mode)
    return (
        full_name,  # First arg is used by ParameterizedBenchmark.
        full_name,
        spec['experiment_type'],
        mode,
        spec['platform'],
        spec['precision'],
        spec['metric_bounds'],
        spec.get('config_files') or [],
        spec.get('params_override') or None,
    )

  rows = []
  for group in benchmark_models.values():
    for name, spec in group.items():
      rows.extend(_as_row(name, mode, spec) for mode in execution_modes)
  return rows
class BaseBenchmark(  # pylint: disable=undefined-variable
    tf.test.Benchmark,
    metaclass=benchmark.ParameterizedBenchmark):
  """Common Benchmark.

  benchmark.ParameterizedBenchmark is used to auto create benchmarks from
  benchmark method according to the benchmarks defined in
  benchmark_definitions. The name of the new benchmark methods is
  benchmark__{benchmark_name}. _get_benchmark_params is used to generate the
  benchmark name and args.
  """

  _benchmark_parameters = (
      _get_benchmark_params(benchmark_definitions.VISION_BENCHMARKS) +
      _get_benchmark_params(benchmark_definitions.NLP_BENCHMARKS))

  def __init__(self, output_dir=None, tpu=None):
    """Initialize class.

    The BENCHMARK_OUTPUT_DIR and BENCHMARK_TPU environment variables, when set
    to a non-empty value, take precedence over the matching arguments.

    Args:
      output_dir: Base directory to store all output for the test. Falls back
        to '/tmp' when neither the environment variable nor the argument is
        provided.
      tpu: (optional) TPU name to use in a TPU benchmark.
    """
    self.output_dir = os.getenv('BENCHMARK_OUTPUT_DIR') or output_dir or '/tmp'
    self._resolved_tpu = os.getenv('BENCHMARK_TPU') or tpu or None

  def _get_model_dir(self, folder_name):
    """Returns directory to store info, e.g. saved model and event log."""
    return os.path.join(self.output_dir, folder_name)

  def benchmark(self,
                benchmark_name,
                experiment_type,
                execution_mode,
                platform,
                precision,
                metric_bounds,
                config_files,
                params_override):
    """Builds the experiment config, runs the benchmark and reports results.

    Args:
      benchmark_name: Name of the benchmark. It names the output folder under
        `output_dir`, and the text before its first '.' is reported as
        'model_name'.
      experiment_type: Experiment name passed to exp_factory.get_exp_config.
      execution_mode: Mode forwarded to benchmark_lib.run_benchmark. In
        'accuracy' mode the metrics listed in `metric_bounds` are reported too.
      platform: Target platform, in format tpu.[n]x[n] or gpu.[n].
      precision: Value assigned to params.runtime.mixed_precision_dtype.
      metric_bounds: List of dicts with the keys 'name', 'min_value' and
        'max_value'; only read in 'accuracy' mode.
      config_files: Config file paths applied in order as strict overrides.
      params_override: Optional extra override applied after `config_files`.

    Raises:
      NotImplementedError: If `platform` contains neither 'tpu' nor 'gpu'.
    """
    params = exp_factory.get_exp_config(experiment_type)

    for config_file in config_files:
      file_path = config_utils.get_config_path(config_file)
      params = hyperparams.override_params_dict(
          params, file_path, is_strict=True)
    if params_override:
      params = hyperparams.override_params_dict(
          params, params_override, is_strict=True)

    # platform in format tpu.[n]x[n] or gpu.[n]
    if 'tpu' in platform:
      params.runtime.distribution_strategy = 'tpu'
      params.runtime.tpu = self._resolved_tpu
    elif 'gpu' in platform:
      # The text after the last '.' is the GPU count, e.g. 'gpu.8' -> 8.
      params.runtime.num_gpus = int(platform.split('.')[-1])
      params.runtime.distribution_strategy = 'mirrored'
    else:
      # The exception used to be constructed but never raised, which let an
      # unsupported platform silently run with the unmodified runtime config.
      raise NotImplementedError(
          'platform :{} is not supported'.format(platform))

    params.runtime.mixed_precision_dtype = precision

    params.validate()
    params.lock()

    # Persist the final parameters next to the benchmark output.
    model_dir = self._get_model_dir(benchmark_name)
    tf.io.gfile.makedirs(model_dir)
    hyperparams.save_params_dict_to_yaml(
        params, os.path.join(model_dir, 'params.yaml'))

    pp = pprint.PrettyPrinter()
    logging.info('Final experiment parameters: %s',
                 pp.pformat(params.as_dict()))

    benchmark_data = benchmark_lib.run_benchmark(
        execution_mode, params, model_dir)

    metrics = []
    if execution_mode == 'accuracy':
      for metric_bound in metric_bounds:
        metrics.append({
            'name': metric_bound['name'],
            'value': benchmark_data['metrics'][metric_bound['name']],
            'min_value': metric_bound['min_value'],
            'max_value': metric_bound['max_value'],
        })

    metrics.append({
        'name': 'startup_time',
        'value': benchmark_data['startup_time'],
    })
    metrics.append({
        'name': 'exp_per_second',
        'value': benchmark_data['examples_per_second'],
    })

    self.report_benchmark(
        iters=-1,
        wall_time=benchmark_data['wall_time'],
        metrics=metrics,
        extras={
            'model_name': benchmark_name.split('.')[0],
            'platform': platform,
            'implementation': 'orbit.ctl',
            'parameters': precision,
        })
# Hand control to the TensorFlow test runner when executed as a script.
if __name__ == '__main__':
  tf.test.main()
official/benchmark/benchmark_definitions.py
0 → 100644
View file @
a67c28c8
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model garden benchmark definitions."""
# tf-vision benchmarks
# Maps benchmark name -> definition. Every definition carries the experiment
# type, the target platform ('tpu.[n]x[n]' or 'gpu.[n]'), the mixed precision
# dtype, the accepted range per reported metric and the config files to apply.
IMAGE_CLASSIFICATION_BENCHMARKS = {
    'image_classification.resnet50.tpu.4x4.bf16': {
        'experiment_type': 'resnet_imagenet',
        'platform': 'tpu.4x4',
        'precision': 'bfloat16',
        'metric_bounds': [{
            'name': 'accuracy',
            'min_value': 0.76,
            'max_value': 0.77,
        }],
        'config_files': [
            'official/vision/beta/configs/experiments/'
            'image_classification/imagenet_resnet50_tpu.yaml'
        ],
    },
    'image_classification.resnet50.gpu.8.fp16': {
        'experiment_type': 'resnet_imagenet',
        'platform': 'gpu.8',
        'precision': 'float16',
        'metric_bounds': [{
            'name': 'accuracy',
            'min_value': 0.76,
            'max_value': 0.77,
        }],
        'config_files': [
            'official/vision/beta/configs/experiments/'
            'image_classification/imagenet_resnet50_gpu.yaml'
        ],
    },
}

# Vision benchmarks grouped by task name.
VISION_BENCHMARKS = {
    'image_classification': IMAGE_CLASSIFICATION_BENCHMARKS,
}

# No NLP benchmarks are defined yet.
NLP_BENCHMARKS = {}
official/benchmark/benchmark_lib.py
0 → 100644
View file @
a67c28c8
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TFM common benchmark training driver."""
import
os
import
time
from
typing
import
Any
,
Mapping
from
absl
import
logging
import
orbit
import
tensorflow
as
tf
from
official.common
import
distribute_utils
from
official.core
import
config_definitions
from
official.core
import
task_factory
from
official.core
import
train_utils
from
official.modeling
import
performance
def run_benchmark(
    execution_mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    distribution_strategy: tf.distribute.Strategy = None
) -> Mapping[str, Any]:
  """Runs benchmark for a specific experiment.

  Args:
    execution_mode: A 'str', specifying the mode. Can be 'accuracy', or
      'performance'.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    distribution_strategy: A tf.distribute.Strategy to use. If specified,
      it will be used instead of inferring the strategy from params.

  Returns:
    benchmark_data: returns benchmark data in dict format. It always holds
      'examples_per_second', 'wall_time' and 'startup_time'; in 'accuracy'
      mode it also holds 'metrics', the value returned by trainer.evaluate().

  Raises:
    NotImplementedError: If execution_mode is neither 'accuracy' nor
      'performance'. The mode is checked before any work is started.
  """
  # Fail fast: rejecting an unknown mode only after the timed training loops
  # would waste a full warmup run before reporting the error.
  if execution_mode not in ('accuracy', 'performance'):
    raise NotImplementedError(
        'The benchmark execution mode is not implemented: %s' %
        execution_mode)
  is_accuracy_mode = execution_mode == 'accuracy'

  # For GPU runs, allow option to set thread mode
  if params.runtime.gpu_thread_mode:
    os.environ['TF_GPU_THREAD_MODE'] = params.runtime.gpu_thread_mode
    logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE'])

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)

  strategy = distribution_strategy or distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  with strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)
    trainer = train_utils.create_trainer(
        params,
        task,
        train=True,
        evaluate=is_accuracy_mode)
    # Initialize the model if possible, e.g., from a pre-trained checkpoint.
    trainer.initialize()

  # Performance runs use a fixed loop length of 100 steps instead of the
  # configured one.
  steps_per_loop = params.trainer.steps_per_loop if is_accuracy_mode else 100
  controller = orbit.Controller(
      strategy=strategy,
      trainer=trainer,
      evaluator=trainer if is_accuracy_mode else None,
      global_step=trainer.global_step,
      steps_per_loop=steps_per_loop)

  logging.info('Starts to execute execution mode: %s', execution_mode)
  with strategy.scope():
    # Training for one loop, first loop time includes warmup time.
    first_loop_start_time = time.time()
    controller.train(steps=steps_per_loop)
    first_loop_time = time.time() - first_loop_start_time
    # Training for second loop.
    # NOTE(review): `steps` looks like a cumulative global-step target, so this
    # call is expected to run one more loop of steps_per_loop steps - confirm
    # against orbit.Controller.train.
    second_loop_start_time = time.time()
    controller.train(steps=2 * steps_per_loop)
    second_loop_time = time.time() - second_loop_start_time

    benchmark_data = {}
    if is_accuracy_mode:
      controller.train(steps=params.trainer.train_steps)
      eval_logs = trainer.evaluate(
          tf.convert_to_tensor(params.trainer.validation_steps))
      benchmark_data['metrics'] = eval_logs

    # First training loop time contains startup time plus training time, while
    # second training loop time is purely training time. Startup time can be
    # recovered by subtracting second training loop time from first training
    # loop time.
    startup_time = first_loop_time - second_loop_time
    wall_time = time.time() - first_loop_start_time
    examples_per_second = (
        steps_per_loop * params.task.train_data.global_batch_size /
        second_loop_time)
    benchmark_data.update(
        dict(
            examples_per_second=examples_per_second,
            wall_time=wall_time,
            startup_time=startup_time))
    return benchmark_data
official/benchmark/benchmark_lib_test.py
0 → 100644
View file @
a67c28c8
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow_models.official.benchmark.benchmark_lib."""
# pylint: disable=g-direct-tensorflow-import
from
absl.testing
import
parameterized
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.distribute
import
strategy_combinations
from
official.common
import
registry_imports
# pylint: disable=unused-import
from
official.benchmark
import
benchmark_lib
from
official.core
import
exp_factory
from
official.modeling
import
hyperparams
def all_strategy_combinations():
  """Returns the test combinations over the distribution strategies.

  The result of combinations.combine() over three strategies: the default
  strategy, the cloud TPU strategy and the one-device GPU strategy.
  """
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies)
class BenchmarkLibTest(tf.test.TestCase, parameterized.TestCase):
  """Checks the keys of the dict returned by benchmark_lib.run_benchmark."""

  def setUp(self):
    super().setUp()
    optimizer_config = {
        'optimizer': {
            'type': 'sgd'
        },
        'learning_rate': {
            'type': 'constant'
        },
    }
    # Trainer overrides applied on top of the 'mock' experiment config.
    self._test_config = {
        'trainer': {
            'steps_per_loop': 10,
            'optimizer_config': optimizer_config,
            'continuous_eval_timeout': 5,
            'train_steps': 20,
            'validation_steps': 10,
        },
    }

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          execution_mode=['performance', 'accuracy'],
      ))
  def test_benchmark(self, distribution, execution_mode):
    """Runs the mock experiment and checks the reported result keys."""
    model_dir = self.get_temp_dir()
    mock_params = exp_factory.get_exp_config('mock')
    params = hyperparams.override_params_dict(
        mock_params, self._test_config, is_strict=True)

    benchmark_data = benchmark_lib.run_benchmark(
        execution_mode, params, model_dir, distribution)

    # 'metrics' is only expected from an accuracy run.
    expected_keys = ['examples_per_second', 'wall_time', 'startup_time']
    if execution_mode == 'accuracy':
      expected_keys.append('metrics')
    for key in expected_keys:
      self.assertIn(key, benchmark_data)
# Hand control to the TensorFlow test runner when executed as a script.
if __name__ == '__main__':
  tf.test.main()
official/benchmark/config_utils.py
0 → 100644
View file @
a67c28c8
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks config utils."""
import
os
def get_config_path(config_file: str, base_dir: str = '') -> str:
  """Gets the path of the config file, resolved against `base_dir`.

  Args:
    config_file: Path of the config file.
    base_dir: Directory prepended to `config_file`. With the default empty
      string the result is `config_file` itself.

  Returns:
    `base_dir` and `config_file` combined with os.path.join. The result is
    not converted to an absolute path.
  """
  config_path = os.path.join(base_dir, config_file)
  return config_path
official/benchmark/resnet_ctl_imagenet_benchmark.py
View file @
a67c28c8
...
@@ -334,6 +334,18 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
...
@@ -334,6 +334,18 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
FLAGS
.
dtype
=
'fp16'
FLAGS
.
dtype
=
'fp16'
self
.
_run_and_report_benchmark
()
self
.
_run_and_report_benchmark
()
def benchmark_xla_8_gpu_fp16(self):
  """Test Keras model with 8 GPUs, tf.keras mixed precision and XLA.

  Uses the 'mirrored' distribution strategy with dtype 'fp16' and XLA
  enabled.
  """
  num_gpus = 8
  self._setup()

  FLAGS.num_gpus = num_gpus
  FLAGS.distribution_strategy = 'mirrored'
  FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
  FLAGS.batch_size = 256 * num_gpus  # 8 GPUs
  FLAGS.dtype = 'fp16'
  FLAGS.enable_xla = True
  self._run_and_report_benchmark()
def
benchmark_8_gpu_eager
(
self
):
def
benchmark_8_gpu_eager
(
self
):
"""Test Keras model with 8 GPUs, eager, fp32."""
"""Test Keras model with 8 GPUs, eager, fp32."""
self
.
_setup
()
self
.
_setup
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment