Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
6b6f8b0c
Commit
6b6f8b0c
authored
Apr 15, 2022
by
huchen
Browse files
del tensorflow benchmark cls
parent
4749cd5e
Changes
149
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
8858 deletions
+0
-8858
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/flags.cpython-36.pyc
...cripts/tf_cnn_benchmarks/__pycache__/flags.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/mlperf.cpython-36.pyc
...ripts/tf_cnn_benchmarks/__pycache__/mlperf.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/preprocessing.cpython-36.pyc
...f_cnn_benchmarks/__pycache__/preprocessing.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/ssd_constants.cpython-36.pyc
...f_cnn_benchmarks/__pycache__/ssd_constants.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr.cpython-36.pyc
...tf_cnn_benchmarks/__pycache__/variable_mgr.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr_util.cpython-36.pyc
...n_benchmarks/__pycache__/variable_mgr_util.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py
...nchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py
+0
-290
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py
...rk/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py
+0
-52
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce.py
...fication/benchmark/scripts/tf_cnn_benchmarks/allreduce.py
+0
-645
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce_test.py
...ion/benchmark/scripts/tf_cnn_benchmarks/allreduce_test.py
+0
-448
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/batch_allreduce.py
...on/benchmark/scripts/tf_cnn_benchmarks/batch_allreduce.py
+0
-628
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn.py
...tion/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn.py
+0
-3542
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test.py
...ripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test.py
+0
-493
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test_runner.py
...f_cnn_benchmarks/benchmark_cnn_distributed_test_runner.py
+0
-122
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_test.py
...benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_test.py
+0
-1493
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util.py
...ification/benchmark/scripts/tf_cnn_benchmarks/cnn_util.py
+0
-253
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util_test.py
...tion/benchmark/scripts/tf_cnn_benchmarks/cnn_util_test.py
+0
-129
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/coco_metric.py
...cation/benchmark/scripts/tf_cnn_benchmarks/coco_metric.py
+0
-198
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/constants.py
...fication/benchmark/scripts/tf_cnn_benchmarks/constants.py
+0
-67
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/convnet_builder.py
...on/benchmark/scripts/tf_cnn_benchmarks/convnet_builder.py
+0
-498
No files found.
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/flags.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/mlperf.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/preprocessing.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/ssd_constants.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr_util.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks.
tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is
useful for benchmarking the performance of just this gradient aggregation,
instead of the entire model. All the flags that tf_cnn_benchmarks accepts are
also accepted by this script, although many are silently ignored.
The number and shapes of the tensors all-reduced are those of the variables of
the model specified by the --model flag.
TODO(reedwm): Allow custom sizes to be specified.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
time
from
absl
import
app
from
absl
import
flags
as
absl_flags
import
tensorflow.compat.v1
as
tf
from
tensorflow.python.ops
import
control_flow_ops
import
benchmark_cnn
import
cnn_util
import
flags
from
cnn_util
import
log_fn
absl_flags
.
DEFINE_integer
(
'iters_per_step'
,
5
,
'Number of iterations to run all-reduce for, per '
'step. Every step, a session will be run on a Graph '
'that contains this many copies of the all-reduce. '
'The copies are run sequentially. Setting this above '
'1 is useful to lower the overhead of starting the '
'session run, running the VariableV2 ops at the '
'start of the step, etc.'
)
flags
.
define_flags
()
for
name
in
flags
.
param_specs
.
keys
():
absl_flags
.
declare_key_flag
(
name
)
def
get_var_shapes
(
model
):
"""Returns the list of variable shapes for a tf_cnn_benchmarks Model."""
with
tf
.
Graph
().
as_default
():
# The variable shapes do not depend on the batch size.
images
=
tf
.
placeholder
(
tf
.
float32
,
model
.
get_input_shapes
(
'train'
)[
0
])
model
.
build_network
([
images
])
return
[[
int
(
d
)
for
d
in
v
.
shape
.
dims
]
for
v
in
tf
.
trainable_variables
()]
def
all_reduce
(
all_device_tensors
,
variable_mgr
):
"""Performs a single batch all-reduce.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
variable_mgr: The VariableMgr to perform the all-reduce.
Returns:
List of list of tensors in the same form as `all_device_tensors`, except the
tensors are aggregated across towers.
"""
tower_grads
=
[[(
g
,
None
)
for
g
in
device_tensors
]
for
device_tensors
in
all_device_tensors
]
_
,
aggregated_tower_grads
=
variable_mgr
.
preprocess_device_grads
(
tower_grads
)
return
[
[
g
for
g
,
_
in
agg_device_tensors
]
for
agg_device_tensors
in
aggregated_tower_grads
]
def
build_all_reduce_iterations
(
all_device_tensors
,
tower_devices
,
variable_mgr
,
num_iters
):
"""Builds the all-reduce ops for multiple iterations to aggregate tensors.
The tensors in `all_device_tensors` are aggregated `num_iters` times. Each
iteration aggregates the results from the previous iteration. The iterations
are run sequentially, so the aggregations for an iteration do not start
running until the previous iteration has completed. Each iteration after the
first is aggregating already-aggregated values, but it does not matter because
we are only aggregating for benchmarking purposes.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
tower_devices: A list of device strings. tower_devices[t] is the device
of the tensors in all_device_tensors[t].
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that when run, causes the all-reduce ops to run.
"""
for
i
in
range
(
num_iters
):
with
tf
.
name_scope
(
'iteration_%d'
%
i
):
# Step 1: Do the aggregation.
with
tf
.
name_scope
(
'tensor_aggregation'
):
all_device_tensors
=
all_reduce
(
all_device_tensors
,
variable_mgr
)
# Step 2. Create identity ops, to bring the aggregated results back to
# each device.
new_all_device_tensors
=
[]
for
device
,
device_tensors
in
zip
(
tower_devices
,
all_device_tensors
):
with
tf
.
device
(
device
):
new_all_device_tensors
.
append
([
tf
.
identity
(
t
,
name
=
'identity_after_allreduce'
)
for
t
in
device_tensors
])
all_device_tensors
=
new_all_device_tensors
# Step 3. Add control dependencies to delay the next iteration until this
# iteration is complete. To avoid extra overhead, we do not have any
# cross-device control dependencies, which means it's possible for two
# iterations to slightly overlap.
new_all_device_tensors
=
[]
for
device_tensors
in
all_device_tensors
:
new_all_device_tensors
.
append
([
control_flow_ops
.
with_dependencies
(
device_tensors
,
t
,
name
=
'identity_after_dependencies'
)
for
t
in
device_tensors
])
all_device_tensors
=
new_all_device_tensors
# To prevent the dependency optimizer from removing every op we created,
# we store the results in variables.
ops_to_run
=
[]
for
device
,
device_tensors
in
zip
(
tower_devices
,
all_device_tensors
):
with
tf
.
device
(
device
):
for
t
in
device_tensors
:
# The placeholder initial value is never run.
var
=
tf
.
Variable
(
tf
.
placeholder
(
tf
.
float32
,
t
.
shape
),
collections
=
[])
ops_to_run
.
append
(
var
.
assign
(
t
))
return
tf
.
group
(
*
ops_to_run
)
def
build_graph
(
tower_devices
,
tensor_shapes
,
variable_mgr
,
num_iters
):
"""Builds the graph for the benchmark.
Args:
tower_devices: A list of device strings of the devices to run the all-reduce
benchmark on.
tensor_shapes: A list of shapes of the tensors that will be aggregated for
the all-reduce.
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that runs the benchmark.
"""
all_device_tensors
=
[]
for
i
,
tower_device
in
enumerate
(
tower_devices
):
with
tf
.
device
(
tower_device
):
device_tensors
=
[]
for
j
,
shape
in
enumerate
(
tensor_shapes
):
tensor
=
tf
.
Variable
(
tf
.
random_normal
(
shape
,
dtype
=
tf
.
float32
),
name
=
'tensor_%d_on_device_%d'
%
(
j
,
i
))
device_tensors
.
append
(
tensor
)
all_device_tensors
.
append
(
device_tensors
)
log_fn
(
'Building all-reduce ops'
)
benchmark_op
=
build_all_reduce_iterations
(
all_device_tensors
,
tower_devices
,
variable_mgr
,
num_iters
)
log_fn
(
'Done building all-reduce ops'
)
return
benchmark_op
def
run_graph
(
benchmark_op
,
bench_cnn
,
init_ops
,
dummy_loss_op
):
"""Runs the graph for the benchmark.
Args:
benchmark_op: An op that runs the benchmark.
bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
init_ops: A list of ops that are run before `benchmark_op` for
initialization.
dummy_loss_op: Any op. We must pass a loss op to
`benchmark_cnn.benchmark_one_step`, but the result of the op is never
actually used.
"""
config
=
benchmark_cnn
.
create_config_proto
(
bench_cnn
.
params
)
with
tf
.
Session
(
config
=
config
)
as
sess
:
for
op
in
init_ops
:
sess
.
run
(
op
)
step_train_times
=
[]
fetches
=
{
'average_loss'
:
dummy_loss_op
,
'benchmark_op'
:
benchmark_op
}
log_fn
(
'Running warmup'
)
for
i
in
range
(
-
bench_cnn
.
num_warmup_batches
,
bench_cnn
.
num_batches
):
if
i
==
0
:
log_fn
(
'Running all-reduce ops'
)
start
=
time
.
time
()
if
i
>
0
and
i
%
bench_cnn
.
params
.
display_every
==
0
:
log_fn
(
'Iteration: %d. Average time per step so far: %s'
%
(
i
,
(
time
.
time
()
-
start
)
/
i
))
# Call benchmark_one_step instead of directly calling sess.run(...), to
# potentially get a trace file, partitioned graphs, etc.
benchmark_cnn
.
benchmark_one_step
(
sess
=
sess
,
fetches
=
fetches
,
step
=
i
,
# The batch size is only used for the images/sec calculation, which is
# not actually calculated because we pass show_images_per_sec=False.
batch_size
=
None
,
step_train_times
=
step_train_times
,
trace_filename
=
bench_cnn
.
trace_filename
,
partitioned_graph_file_prefix
=
(
bench_cnn
.
params
.
partitioned_graph_file_prefix
),
profiler
=
None
,
image_producer
=
None
,
params
=
bench_cnn
.
params
,
show_images_per_sec
=
False
)
log_fn
(
'Average time per step: %s'
%
((
time
.
time
()
-
start
)
/
bench_cnn
.
num_batches
))
def
run_benchmark
(
bench_cnn
,
num_iters
):
"""Runs the all-reduce benchmark.
Args:
bench_cnn: The BenchmarkCNN where params, the variable manager, and other
attributes are obtained.
num_iters: Number of iterations to do all-reduce for for.
Raises:
ValueError: Invalid params of bench_cnn.
"""
if
bench_cnn
.
params
.
variable_update
!=
'replicated'
:
raise
ValueError
(
'--variable_update=replicated must be specified to use'
'the all-reduce benchmark'
)
if
bench_cnn
.
params
.
variable_consistency
==
'relaxed'
:
raise
ValueError
(
'--variable_consistency=relaxed is not supported'
)
benchmark_op
=
build_graph
(
bench_cnn
.
raw_devices
,
get_var_shapes
(
bench_cnn
.
model
),
bench_cnn
.
variable_mgr
,
num_iters
)
init_ops
=
[
tf
.
global_variables_initializer
(),
bench_cnn
.
variable_mgr
.
get_post_init_ops
()
]
loss_op
=
tf
.
no_op
()
if
bench_cnn
.
graph_file
:
path
,
filename
=
os
.
path
.
split
(
bench_cnn
.
graph_file
)
as_text
=
filename
.
endswith
(
'txt'
)
log_fn
(
'Writing GraphDef as %s to %s'
%
(
'text'
if
as_text
else
'binary'
,
bench_cnn
.
graph_file
))
tf
.
train
.
write_graph
(
tf
.
get_default_graph
().
as_graph_def
(
add_shapes
=
True
),
path
,
filename
,
as_text
)
run_graph
(
benchmark_op
,
bench_cnn
,
init_ops
,
loss_op
)
# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks
def
main
(
positional_arguments
):
# Command-line arguments like '--distortions False' are equivalent to
# '--distortions=True False', where False is a positional argument. To prevent
# this from silently running with distortions, we do not allow positional
# arguments.
assert
len
(
positional_arguments
)
>=
1
if
len
(
positional_arguments
)
>
1
:
raise
ValueError
(
'Received unknown positional arguments: %s'
%
positional_arguments
[
1
:])
params
=
benchmark_cnn
.
make_params_from_flags
()
params
=
benchmark_cnn
.
setup
(
params
)
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
tfversion
=
cnn_util
.
tensorflow_version_tuple
()
log_fn
(
'TensorFlow: %i.%i'
%
(
tfversion
[
0
],
tfversion
[
1
]))
run_benchmark
(
bench
,
absl_flags
.
FLAGS
.
iters_per_step
)
if
__name__
==
'__main__'
:
tf
.
disable_v2_behavior
()
app
.
run
(
main
)
# Raises error on invalid flags, unlike tf.app.run()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for all_reduce_benchmark.py."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow.compat.v1
as
tf
import
all_reduce_benchmark
import
benchmark_cnn
import
test_util
class
AllReduceBenchmarkTest
(
tf
.
test
.
TestCase
):
"""Tests the all-reduce benchmark."""
def
_test_run_benchmark
(
self
,
params
):
"""Tests that run_benchmark() runs successfully with the params."""
logs
=
[]
with
test_util
.
monkey_patch
(
all_reduce_benchmark
,
log_fn
=
test_util
.
print_and_add_to_list
(
logs
)):
bench_cnn
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
all_reduce_benchmark
.
run_benchmark
(
bench_cnn
,
num_iters
=
5
)
self
.
assertRegex
(
logs
[
-
1
],
'^Average time per step: [0-9.]+$'
)
def
test_run_benchmark
(
self
):
"""Tests that run_benchmark() runs successfully."""
params
=
benchmark_cnn
.
make_params
(
num_batches
=
10
,
variable_update
=
'replicated'
,
num_gpus
=
2
)
self
.
_test_run_benchmark
(
params
)
params
=
params
.
_replace
(
hierarchical_copy
=
True
,
gradient_repacking
=
8
,
num_gpus
=
8
)
self
.
_test_run_benchmark
(
params
)
if
__name__
==
'__main__'
:
tf
.
disable_v2_behavior
()
tf
.
test
.
main
()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for allreduce."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
as
pycoll
import
re
from
six.moves
import
xrange
# pylint: disable=redefined-builtin
import
tensorflow.compat.v1
as
tf
# pylint: disable=g-direct-tensorflow-import
from
tensorflow.python.distribute
import
all_reduce
from
tensorflow.python.framework
import
device
as
pydev
from
tensorflow.python.framework
import
ops
from
tensorflow.python.ops
import
collective_ops
AllReduceSpecTuple
=
pycoll
.
namedtuple
(
'AllReduceSpecTuple'
,
'alg shards limit'
)
def
parse_general_int
(
s
):
"""Parse integer with power-of-2 suffix eg. 32k."""
mo
=
re
.
match
(
r
'(\d+)([KkMGT]?)$'
,
s
)
if
mo
:
i
,
suffix
=
mo
.
group
(
1
,
2
)
v
=
int
(
i
)
if
suffix
:
if
suffix
==
'K'
or
suffix
==
'k'
:
v
*=
1024
elif
suffix
==
'M'
:
v
*=
(
1024
*
1024
)
elif
suffix
==
'G'
:
v
*=
(
1024
*
1024
*
1024
)
elif
suffix
==
'T'
:
v
*=
(
1024
*
1024
*
1024
*
1024
)
else
:
raise
ValueError
(
'invalid integer string %s'
%
s
)
return
v
else
:
v
=
int
(
s
)
return
v
def
parse_all_reduce_spec
(
all_reduce_spec
):
"""Parse all_reduce_spec.
Args:
all_reduce_spec: a string specifying a combination of all-reduce
algorithms to apply for gradient reduction.
Returns:
a list of AllReduceSpecTuple.
Raises:
ValueError: all_reduce_spec is not well-formed.
An all_reduce_spec has BNF form:
int ::= positive whole number
g_int ::= int[KkMGT]?
alg_spec ::= alg | alg#int
range_spec ::= alg_spec | alg_spec/alg_spec
spec ::= range_spec | range_spec:g_int:range_spec
Not all syntactically correct specifications are supported.
Examples of supported all_reduce_spec strings, with semantics explained:
'collective' == apply tf.collective_reduce operator to all tensors.
'collective#2' == apply tf.collective_reduce operator to all tensors,
requesting up to 2 simultaneous transfers at each node, if
feasible, by subdividing tensor by an additional factor of 2.
'xring' == apply ring all-reduce to all tensors
'xring#2' == apply ring all-reduce to all tensors, using two simultaneous
transfer rings, each operating on 1/2 of each tensor.
'nccl' == apply NCCL all-reduce to all tensors (only works within
a single worker process where all devices are GPUs)
'nccl/xring' == apply NCCL all-reduce to all tensors within each worker
to produce at least one full-reduced (locally) value,
then apply ring all-reduce to one such value from each
worker, then apply NCCL broadcast to propagate those globally
reduced values back to every device within each worker.
'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each
distributed tensor is reduced by copying all instances to
one of the worker CPUs, computing the reduction there, then
copying back to each participating device. Tensor reductions
are assigned to specific CPUs round-robin.
'psgpu#4' == Arrange all GPUs across all workers into groups of 4.
Each distributed tensor is shuffle reduced against one
such group of 4 GPUs, selected round-robin. That is, each
tensor is split across 4 shards for the reduction.
'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to
tensors of size <= 2048 elements, apply 2-shard pscpu to
tensors up to size 64k elements, apply xring to larger tensors.
'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on
the worker's CPU, then use 2-shard shuffle to reduce those
locally reduced tensors across workers (on the worker CPUs), then
scatter the globally reduced values locally from each worker CPU.
"""
range_parts
=
all_reduce_spec
.
split
(
':'
)
+
[
'-1'
]
if
len
(
range_parts
)
%
2
:
raise
ValueError
(
'all_reduce_spec not well formed: %s'
%
all_reduce_spec
)
limit
=
0
spec
=
[]
alg
=
None
shards
=
1
for
i
,
range_part
in
enumerate
(
range_parts
):
if
i
%
2
==
1
:
try
:
limit
=
parse_general_int
(
range_part
)
spec
.
append
(
AllReduceSpecTuple
(
alg
=
alg
,
shards
=
shards
,
limit
=
limit
))
except
ValueError
:
raise
ValueError
(
'all_reduce_spec (%s) contains non-integer range %s'
%
(
all_reduce_spec
,
range_part
))
else
:
alg
=
range_part
alg_parts
=
range_part
.
split
(
'#'
)
alg
=
alg_parts
[
0
]
if
len
(
alg_parts
)
>
1
:
try
:
shards
=
int
(
alg_parts
[
1
])
except
ValueError
:
raise
ValueError
(
'all_reduce_spec (%s) contains non-integer '
'shards %s'
%
all_reduce_spec
,
alg_parts
[
1
])
else
:
shards
=
1
if
alg
not
in
[
'nccl'
,
'nccl/xring'
,
'nccl/rechd'
,
'nccl/pscpu'
,
'xring'
,
'pscpu'
,
'psgpu'
,
'pscpu/pscpu'
,
'collective'
]:
raise
ValueError
(
'all_reduce_spec (%s) contains invalid alg %s'
%
(
all_reduce_spec
,
alg
))
return
spec
def
build_all_reduce_device_prefixes
(
job_name
,
num_tasks
):
"""Build list of device prefix names for all_reduce.
Args:
job_name: 'worker', 'ps' or 'localhost'.
num_tasks: number of jobs across which device names should be generated.
Returns:
A list of device name prefix strings. Each element spells out the full
host name without adding the device.
e.g. '/job:worker/task:0'
"""
if
job_name
!=
'localhost'
:
return
[
'/job:%s/task:%d'
%
(
job_name
,
d
)
for
d
in
range
(
0
,
num_tasks
)]
else
:
assert
num_tasks
==
1
return
[
'/job:%s'
%
job_name
]
def
group_device_names
(
devices
,
group_size
):
"""Group device names into groups of group_size.
Args:
devices: list of strings naming devices.
group_size: int >= 1
Returns:
list of lists of devices, where each inner list is group_size long,
and each device appears at least once in an inner list. If
len(devices) % group_size = 0 then each device will appear
exactly once.
Raises:
ValueError: group_size > len(devices)
"""
num_devices
=
len
(
devices
)
if
group_size
>
num_devices
:
raise
ValueError
(
'only %d devices, but group_size=%d'
%
(
num_devices
,
group_size
))
num_groups
=
(
num_devices
//
group_size
+
(
1
if
(
num_devices
%
group_size
!=
0
)
else
0
))
groups
=
[[]
for
i
in
range
(
num_groups
)]
for
i
in
range
(
0
,
num_groups
*
group_size
):
groups
[
i
%
num_groups
].
append
(
devices
[
i
%
num_devices
])
return
groups
def
split_grads_by_size
(
threshold_size
,
device_grads
):
"""Break gradients into two sets according to tensor size.
Args:
threshold_size: int size cutoff for small vs large tensor.
device_grads: List of lists of (gradient, variable) tuples. The outer
list is over devices. The inner list is over individual gradients.
Returns:
small_grads: Subset of device_grads where shape is <= theshold_size
elements.
large_grads: Subset of device_grads where shape is > threshold_size
elements.
"""
small_grads
=
[]
large_grads
=
[]
for
dl
in
device_grads
:
small_dl
=
[]
large_dl
=
[]
for
(
g
,
v
)
in
dl
:
tensor_size
=
g
.
get_shape
().
num_elements
()
if
tensor_size
<=
threshold_size
:
small_dl
.
append
([
g
,
v
])
else
:
large_dl
.
append
([
g
,
v
])
if
small_dl
:
small_grads
.
append
(
small_dl
)
if
large_dl
:
large_grads
.
append
(
large_dl
)
return
small_grads
,
large_grads
_instance_key
=
1
def
new_collective_instance_key
():
"""Returns a new instance key for use in defining a collective op."""
global
_instance_key
v
=
_instance_key
_instance_key
+=
1
return
v
_group_key
=
1
_group_key_table
=
dict
()
def
collective_group_key
(
devices
):
"""Returns a group key for the set of devices.
Args:
devices: list of strings naming devices in a collective group.
Returns:
int key uniquely identifying the set of device names.
"""
global
_group_key
global
_group_key_table
parsed
=
[
pydev
.
DeviceSpec
.
from_string
(
d
)
for
d
in
devices
]
names
=
sorted
([
'%s:%d'
%
(
d
.
device_type
,
d
.
device_index
)
for
d
in
parsed
])
concat
=
','
.
join
(
names
)
if
concat
not
in
_group_key_table
.
keys
():
new_key
=
_group_key
_group_key
+=
1
_group_key_table
[
concat
]
=
new_key
rv
=
_group_key_table
[
concat
]
return
rv
def
build_collective_reduce
(
input_tensors
,
num_workers
,
num_shards
,
red_op
=
'Add'
,
un_op
=
'Id'
):
"""Build a subgraph that does one full all-reduce, using the collective Op.
Args:
input_tensors: tensors within a single worker graph that are to be reduced
together; must be one per device.
num_workers: total number of workers with identical independent graphs that
will be doing this same reduction. The reduction will actually include
the corresponding tensors at all these workers.
num_shards: number of shards into which to divide each per-tick chunk,
normally 1 but could be higher on multi-data-path architectures.
red_op: string naming the reduction op
un_op: string naming the unary final op
Returns:
An array of final tensors, one per device, computed by the full reduction.
Raises:
ValueError: There must be at least two tensors over all the workers.
"""
group_size
=
len
(
input_tensors
)
*
num_workers
if
group_size
<
2
:
raise
ValueError
(
'num_workers * len(input_tensors) must be 2 or greater'
)
devices
=
[
t
.
device
for
t
in
input_tensors
]
num_devices
=
len
(
devices
)
group_key
=
collective_group_key
(
devices
)
instance_key
=
new_collective_instance_key
()
out_tensors
=
[]
if
num_shards
==
1
:
subdiv_offsets
=
[
0
]
elif
num_shards
==
2
:
if
num_devices
>
1
:
subdiv_offsets
=
[
0
,
-
(
num_devices
//
2
)]
else
:
subdiv_offsets
=
[
0
]
else
:
raise
ValueError
(
'Unsupported num_shards %d'
%
num_shards
)
for
d
in
range
(
num_devices
):
with
ops
.
device
(
devices
[
d
]):
reduce_op
=
collective_ops
.
all_reduce
(
input_tensors
[
d
],
group_size
,
group_key
,
instance_key
,
red_op
,
un_op
,
subdiv_offsets
)
out_tensors
.
append
(
reduce_op
)
return
out_tensors
def
broadcast_send
(
t
,
shape
,
dtype
,
group_size
,
group_key
,
instance_key
):
return
collective_ops
.
broadcast_send
(
t
,
shape
,
dtype
,
group_size
,
group_key
,
instance_key
)
def
broadcast_recv
(
shape
,
dtype
,
group_size
,
group_key
,
instance_key
):
return
collective_ops
.
broadcast_recv
(
shape
,
dtype
,
group_size
,
group_key
,
instance_key
)
def
sum_grad_and_var_all_reduce
(
single_session
,
grad_and_vars
,
num_workers
,
alg
,
gpu_indices
,
aux_devices
=
None
,
num_shards
=
1
):
"""Apply all-reduce algorithm over specified gradient tensors."""
scaled_grads
=
[
g
for
g
,
_
in
grad_and_vars
]
if
alg
==
'collective'
:
assert
not
single_session
summed_grads
=
build_collective_reduce
(
scaled_grads
,
num_workers
,
num_shards
,
'Add'
,
'Id'
)
else
:
with
tf
.
name_scope
(
'allreduce'
):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
if
alg
==
'nccl'
:
summed_grads
=
all_reduce
.
build_nccl_all_reduce
(
scaled_grads
,
tf
.
add
)
elif
alg
==
'xring'
:
summed_grads
=
all_reduce
.
build_ring_all_reduce
(
scaled_grads
,
num_workers
,
num_shards
,
gpu_indices
,
tf
.
add
)
elif
alg
==
'nccl/xring'
:
summed_grads
=
all_reduce
.
build_nccl_then_ring
(
scaled_grads
,
num_shards
,
tf
.
add
)
elif
alg
==
'nccl/rechd'
:
summed_grads
=
all_reduce
.
build_nccl_then_recursive_hd
(
scaled_grads
,
tf
.
add
)
elif
alg
==
'nccl/pscpu'
:
summed_grads
=
all_reduce
.
build_nccl_then_shuffle
(
scaled_grads
,
aux_devices
,
tf
.
add
,
tf
.
add_n
)
elif
alg
==
'pscpu/pscpu'
:
summed_grads
=
all_reduce
.
build_shuffle_then_shuffle
(
scaled_grads
,
aux_devices
,
# TODO(tucker): devise a way of better specifying the device set
# for the second level.
[
aux_devices
[
0
]],
tf
.
add_n
)
elif
alg
in
[
'pscpu'
,
'psgpu'
]:
summed_grads
=
all_reduce
.
build_shuffle_all_reduce
(
scaled_grads
,
aux_devices
,
tf
.
add_n
)
else
:
raise
ValueError
(
'unsupported all_reduce alg: '
,
alg
)
result
=
[]
for
(
_
,
v
),
g
in
zip
(
grad_and_vars
,
summed_grads
):
result
.
append
([
g
,
v
])
return
result
def
contains_any
(
haystack
,
needles
):
"""Tests if any needle is a substring of haystack.
Args:
haystack: a string
needles: list of strings
Returns:
True if any element of needles is a substring of haystack,
False otherwise.
"""
for
n
in
needles
:
if
n
in
haystack
:
return
True
return
False
def
sum_gradients_all_reduce
(
single_session
,
dev_prefixes
,
tower_grads
,
num_workers
,
alg
,
num_shards
,
gpu_indices
,
agg_small_grads_max_bytes
=
0
,
agg_small_grads_max_group
=
10
,
allreduce_merge_scope
=
1
):
"""Apply all-reduce algorithm over specified gradient tensors.
Args:
single_session: true if reduction is applied to one graph across
all workers, false if ths application is to a single-worker graph only.
dev_prefixes: list of prefix strings to use to generate PS device names.
tower_grads: the gradients to reduce.
num_workers: number of worker processes across entire job.
alg: the all-reduce algorithm to apply.
num_shards: alg-specific sharding factor.
gpu_indices: indices of local GPUs in order usable for ring-reduce.
agg_small_grads_max_bytes: largest tensor eligible for aggregation,
in number of bytes.
agg_small_grads_max_group: largest permitted aggregation of small
tensors.
allreduce_merge_scope: size of groups into which to partition consecutive
gradients grouped under a common 'allreduce' name scope for application
of ScopedAllocator optimization.
Returns:
list of reduced tensors
"""
alg_contains_shuffle
=
contains_any
(
alg
,
[
'pscpu'
,
'psgpu'
])
is_hierarchical
=
'/'
in
alg
if
'pscpu'
in
alg
:
aux_devices
=
[
prefix
+
'/cpu:0'
for
prefix
in
dev_prefixes
]
elif
'psgpu'
in
alg
:
aux_devices
=
[
prefix
+
'/gpu:%d'
%
i
for
i
in
range
(
len
(
gpu_indices
))
for
prefix
in
dev_prefixes
]
else
:
aux_devices
=
[
'/job:localhost/cpu:0'
]
aux_device_groups
=
group_device_names
(
aux_devices
,
num_shards
if
(
alg
!=
'collective'
and
alg_contains_shuffle
)
else
1
)
group_index
=
0
if
agg_small_grads_max_bytes
>
0
and
agg_small_grads_max_group
>
0
:
tower_grads
,
packing
=
pack_small_tensors
(
tower_grads
,
max_bytes
=
agg_small_grads_max_bytes
,
max_group
=
agg_small_grads_max_group
)
else
:
packing
=
None
reduced_gv_list
=
[]
gv
=
list
(
zip
(
*
tower_grads
))
merge_scope
=
allreduce_merge_scope
if
allreduce_merge_scope
>
0
else
1
chunked_gv
=
[
gv
[
x
:
x
+
merge_scope
]
for
x
in
xrange
(
0
,
len
(
gv
),
merge_scope
)]
for
chunk
in
chunked_gv
:
with
tf
.
name_scope
(
'allreduce'
):
for
grad_and_vars
in
chunk
:
reduced_gv_list
.
append
(
sum_grad_and_var_all_reduce
(
single_session
,
grad_and_vars
,
num_workers
,
alg
,
gpu_indices
,
(
aux_devices
if
is_hierarchical
else
aux_device_groups
[
group_index
]),
num_shards
))
group_index
=
(
group_index
+
1
)
%
len
(
aux_device_groups
)
new_tower_grads
=
[
list
(
x
)
for
x
in
zip
(
*
reduced_gv_list
)]
if
packing
:
new_tower_grads
=
unpack_small_tensors
(
new_tower_grads
,
packing
)
return
new_tower_grads
def
extract_ranges
(
index_list
,
range_size_limit
=
32
):
"""Extract consecutive ranges and singles from index_list.
Args:
index_list: List of monotone increasing non-negative integers.
range_size_limit: Largest size range to return. If a larger
consecutive range exists it will be returned as multiple
ranges.
Returns:
ranges, singles where ranges is a list of [first, last] pairs of
consecutive elements in index_list, and singles is all of the
other elements, in original order.
"""
if
not
index_list
:
return
[],
[]
first
=
index_list
[
0
]
last
=
first
ranges
=
[]
singles
=
[]
for
i
in
index_list
[
1
:]:
if
i
==
last
+
1
and
(
last
-
first
)
<=
range_size_limit
:
last
=
i
else
:
if
last
>
first
:
ranges
.
append
([
first
,
last
])
else
:
singles
.
append
(
first
)
first
=
i
last
=
i
if
last
>
first
:
ranges
.
append
([
first
,
last
])
else
:
singles
.
append
(
first
)
return
ranges
,
singles
GradPackTuple
=
pycoll
.
namedtuple
(
'GradPackTuple'
,
'indices vars shapes'
)
def
pack_range
(
key
,
packing
,
grad_vars
,
rng
):
"""Form the concatenation of a specified range of gradient tensors.
Args:
key: Value under which to store meta-data in packing that will be used
later to restore the grad_var list structure.
packing: Dict holding data describing packed ranges of small tensors.
grad_vars: List of (grad, var) pairs for one tower.
rng: A pair of integers giving the first, last indices of a consecutive
range of tensors to be packed.
Returns:
A tensor that is the concatenation of all the specified small tensors.
"""
to_pack
=
grad_vars
[
rng
[
0
]:
rng
[
1
]
+
1
]
members
=
[]
variables
=
[]
restore_shapes
=
[]
with
tf
.
name_scope
(
'pack'
):
for
g
,
v
in
to_pack
:
variables
.
append
(
v
)
restore_shapes
.
append
(
g
.
shape
)
with
tf
.
device
(
g
.
device
):
members
.
append
(
tf
.
reshape
(
g
,
[
-
1
]))
packing
[
key
]
=
GradPackTuple
(
indices
=
range
(
rng
[
0
],
rng
[
1
]
+
1
),
vars
=
variables
,
shapes
=
restore_shapes
)
with
tf
.
device
(
members
[
0
].
device
):
return
tf
.
concat
(
members
,
0
)
def
unpack_grad_tuple
(
gv
,
gpt
):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, maybe following subsequent operations like
reduction.
"""
elt_widths
=
[
x
.
num_elements
()
for
x
in
gpt
.
shapes
]
with
tf
.
device
(
gv
[
0
][
0
].
device
):
with
tf
.
name_scope
(
'unpack'
):
splits
=
tf
.
split
(
gv
[
0
],
elt_widths
)
unpacked_gv
=
[]
for
idx
,
s
in
enumerate
(
splits
):
unpacked_gv
.
append
((
tf
.
reshape
(
s
,
gpt
.
shapes
[
idx
]),
gpt
.
vars
[
idx
]))
return
unpacked_gv
def
pack_small_tensors
(
tower_grads
,
max_bytes
=
0
,
max_group
=
0
):
"""Concatenate small gradient tensors together for reduction.
Args:
tower_grads: List of lists of (gradient, variable) tuples.
max_bytes: Int giving max number of bytes in a tensor that
may be considered small.
max_group: Int giving max number of small tensors that may be
concatenated into one new tensor.
Returns:
new_tower_grads, packing where new_tower_grads is identical to
tower_grads except that all feasible small_tensors have been removed
from their places and concatenated into larger tensors that are
now in the front of the list for each tower, and packing contains
the data necessary to restore the tower_grads structure.
Look through the first tower for gradients of the same type (float),
and small size, that are all sequential. For each such group,
replace by a new tensor that is a flattened concatenation. Note
that the corresponding variable will be absent, which doesn't matter
because it isn't used during all-reduce.
Requires:
Every gv_list in towers must have isomorphic structure including identical
tensor sizes and types.
"""
small_indices
=
[]
large_indices
=
[]
for
idx
,
(
g
,
_
)
in
enumerate
(
tower_grads
[
0
]):
if
g
.
dtype
==
tf
.
float32
and
(
4
*
g
.
shape
.
num_elements
())
<=
max_bytes
:
small_indices
.
append
(
idx
)
else
:
large_indices
.
append
(
idx
)
small_ranges
,
small_singles
=
extract_ranges
(
small_indices
,
range_size_limit
=
max_group
)
large_indices
=
sorted
(
large_indices
+
small_singles
)
num_gv
=
len
(
tower_grads
[
0
])
packing
=
{}
if
small_ranges
:
new_tower_grads
=
[]
for
dev_idx
,
gv_list
in
enumerate
(
tower_grads
):
assert
len
(
gv_list
)
==
num_gv
new_gv_list
=
[]
for
r
in
small_ranges
:
key
=
'%d:%d'
%
(
dev_idx
,
len
(
new_gv_list
))
new_gv_list
.
append
((
pack_range
(
key
,
packing
,
gv_list
,
r
),
'packing_var_placeholder'
))
for
i
in
large_indices
:
new_gv_list
.
append
(
gv_list
[
i
])
new_tower_grads
.
append
(
new_gv_list
)
return
new_tower_grads
,
packing
else
:
return
tower_grads
,
None
def
unpack_small_tensors
(
tower_grads
,
packing
):
"""Undo the structure alterations to tower_grads done by pack_small_tensors.
Args:
tower_grads: List of List of (grad, var) tuples.
packing: A dict generated by pack_small_tensors describing the changes
it made to tower_grads.
Returns:
new_tower_grads: identical to tower_grads except that concatentations
of small tensors have been split apart and returned to their original
positions, paired with their original variables.
"""
if
not
packing
:
return
tower_grads
new_tower_grads
=
[]
num_devices
=
len
(
tower_grads
)
num_packed
=
len
(
packing
.
keys
())
//
num_devices
for
dev_idx
,
gv_list
in
enumerate
(
tower_grads
):
new_gv_list
=
gv_list
[
num_packed
:]
for
i
in
xrange
(
0
,
num_packed
):
k
=
'%d:%d'
%
(
dev_idx
,
i
)
gpt
=
packing
[
k
]
gv
=
unpack_grad_tuple
(
gv_list
[
i
],
gpt
)
for
gi
,
idx
in
enumerate
(
gpt
.
indices
):
assert
idx
==
gpt
.
indices
[
gi
]
new_gv_list
.
insert
(
idx
,
gv
[
gi
])
new_tower_grads
.
append
(
new_gv_list
)
return
new_tower_grads
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce_test.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_cnn_benchmark.allreduce."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
as
pycoll
import
numpy
as
np
import
tensorflow.compat.v1
as
tf
from
tensorflow.python.framework
import
ops
from
tensorflow.python.framework
import
test_util
from
tensorflow.python.ops
import
variables
import
allreduce
class
AllReduceTest
(
tf
.
test
.
TestCase
):
def
testGroupKey
(
self
):
d0
=
[
'/job:worker/replica:0/task:0/device:GPU:1'
,
'/job:worker/replica:0/task:0/device:GPU:0'
,
'/job:worker/replica:0/task:0/device:GPU:3'
,]
d1
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:0'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,]
d2
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,
'/job:worker/replica:0/task:1/device:GPU:0'
,]
d3
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,
'/job:worker/replica:0/task:1/device:GPU:2'
,]
d4
=
[
'/job:worker/task:0/device:GPU:1'
,
'/job:worker/task:0/device:GPU:2'
,
'/job:worker/task:0/device:GPU:3'
,]
d5
=
[
'/job:worker/task:0/device:CPU:1'
,
'/job:worker/task:0/device:CPU:2'
]
d6
=
[
'/job:worker/task:0/device:CPU:2'
,
'/job:worker/task:0/device:CPU:1'
]
g0
=
allreduce
.
collective_group_key
(
d0
)
g1
=
allreduce
.
collective_group_key
(
d1
)
g2
=
allreduce
.
collective_group_key
(
d2
)
g3
=
allreduce
.
collective_group_key
(
d3
)
g4
=
allreduce
.
collective_group_key
(
d4
)
g5
=
allreduce
.
collective_group_key
(
d5
)
g6
=
allreduce
.
collective_group_key
(
d6
)
self
.
assertEqual
(
g0
,
g1
)
self
.
assertEqual
(
g0
,
g2
)
self
.
assertTrue
(
g0
!=
g3
)
self
.
assertEqual
(
g3
,
g4
)
self
.
assertEqual
(
g5
,
g6
)
self
.
assertTrue
(
g4
!=
g5
)
def
testExtractRanges
(
self
):
x
=
[]
expected_ranges
=
[]
expected_singles
=
[]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
4
,
6
,
7
,
8
,
9
]
expected_ranges
=
[[
3
,
4
],
[
6
,
9
]]
expected_singles
=
[
1
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
2
,
3
,
4
,
6
,
7
,
8
,
9
]
expected_ranges
=
[[
1
,
4
],
[
6
,
9
]]
expected_singles
=
[]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
4
,
6
,
7
,
9
]
expected_ranges
=
[[
3
,
4
],
[
6
,
7
]]
expected_singles
=
[
1
,
9
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
6
,
9
]
expected_ranges
=
[]
expected_singles
=
[
1
,
3
,
6
,
9
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
def
testPackRange
(
self
):
packing
=
{}
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
gv
=
[(
t0
,
'v0'
),
(
t1
,
'v1'
)]
new_t
=
allreduce
.
pack_range
(
'0:0'
,
packing
,
gv
,
[
0
,
1
])
self
.
assertEqual
(
1
,
new_t
.
shape
.
ndims
)
self
.
assertEqual
(
8
,
new_t
.
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v0'
,
'v1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])])
})
t2
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
gv
=
[(
t0
,
'v0'
),
(
t1
,
'v1'
),
(
t2
,
'v2'
),
(
t3
,
'v3'
)]
packing
=
{}
new_t
=
allreduce
.
pack_range
(
'1:0'
,
packing
,
gv
,
[
0
,
3
])
self
.
assertEqual
(
1
,
new_t
.
shape
.
ndims
)
self
.
assertEqual
(
26
,
new_t
.
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v0'
,
'v1'
,
'v2'
,
'v3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
]),
tf
.
TensorShape
([
3
,
3
])
])
})
def
testUnpackGradTuple
(
self
):
packing
=
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v0'
,
'v1'
,
'v2'
,
'v3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
]),
tf
.
TensorShape
([
3
,
3
])
])
}
tc
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
],
dtype
=
tf
.
float32
)
packed_gv
=
[
tc
,
'packing_var_placeholder'
]
gv
=
allreduce
.
unpack_grad_tuple
(
packed_gv
,
packing
[
'0:0'
])
self
.
assertEqual
(
4
,
len
(
gv
))
self
.
assertEqual
(
'v0'
,
gv
[
0
][
1
])
self
.
assertEqual
(
'v1'
,
gv
[
1
][
1
])
self
.
assertEqual
(
'v2'
,
gv
[
2
][
1
])
self
.
assertEqual
(
'v3'
,
gv
[
3
][
1
])
self
.
assertEqual
(
1
,
gv
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
gv
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
gv
[
1
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
gv
[
1
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
2
,
gv
[
2
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
gv
[
2
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
gv
[
2
][
0
].
shape
.
dims
[
1
])
def
testPackSmallTensors
(
self
):
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
t2
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
tower_grads
=
[]
for
d
in
range
(
0
,
3
):
gv
=
[(
t0
,
'v_%d_0'
%
d
),
(
t1
,
'v_%d_1'
%
d
),
(
t2
,
'v_%d_2'
%
d
),
(
t3
,
'v_%d_3'
%
d
)]
tower_grads
.
append
(
gv
)
# 1) Set the size limit so small that nothing gets concatenated.
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
12
,
max_group
=
10
)
self
.
assertEqual
(
tower_grads
,
new_tower_grads
)
self
.
assertTrue
(
packing
is
None
)
# 2) Set the size limit so only the first two tensors get concatenated
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
16
,
# 16 bytes == 4 elements
max_group
=
10
)
self
.
assertEqual
(
3
,
len
(
new_tower_grads
))
self
.
assertEqual
(
4
,
len
(
tower_grads
[
0
]))
first_tower
=
new_tower_grads
[
0
]
self
.
assertEqual
(
3
,
len
(
first_tower
))
self
.
assertEqual
(
1
,
first_tower
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
8
,
first_tower
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_0_0'
,
'v_0_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_1_0'
,
'v_1_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'2:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_2_0'
,
'v_2_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])])})
# 3) Set the size limit so all tensors get concatenated
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
256
,
# bytes = 64 elements
max_group
=
10
)
self
.
assertEqual
(
3
,
len
(
new_tower_grads
))
self
.
assertEqual
(
4
,
len
(
tower_grads
[
0
]))
self
.
assertEqual
(
1
,
len
(
new_tower_grads
[
0
]))
first_tower
=
new_tower_grads
[
0
]
self
.
assertEqual
(
1
,
first_tower
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
26
,
first_tower
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_0_0'
,
'v_0_1'
,
'v_0_2'
,
'v_0_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_1_0'
,
'v_1_1'
,
'v_1_2'
,
'v_1_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'2:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_2_0'
,
'v_2_1'
,
'v_2_2'
,
'v_2_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])])})
def
testUnpackSmallTensors
(
self
):
packing
=
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_0_0'
,
'v_0_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'0:1'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
3
,
5
),
vars
=
[
'v_0_3'
,
'v_0_4'
],
shapes
=
[
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_1_0'
,
'v_1_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'1:1'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
3
,
5
),
vars
=
[
'v_1_3'
,
'v_1_4'
],
shapes
=
[
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])])}
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
17
,
17
],
dtype
=
tf
.
float32
)
t2
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([
0
],
dtype
=
tf
.
float32
)
tower_grads
=
[]
for
d
in
range
(
0
,
2
):
one_tower
=
[(
t0
,
'packing_var_placeholder'
),
(
t2
,
'packing_var_placeholder'
),
(
t1
,
'v_%d_2'
%
d
),
(
t3
,
'v_%d_5'
%
d
)]
tower_grads
.
append
(
one_tower
)
new_tower_grads
=
allreduce
.
unpack_small_tensors
(
tower_grads
,
packing
)
self
.
assertEqual
(
2
,
len
(
new_tower_grads
))
for
d
,
tg
in
enumerate
(
new_tower_grads
):
self
.
assertEqual
(
6
,
len
(
tg
))
self
.
assertEqual
(
'v_%d_0'
%
d
,
tg
[
0
][
1
])
self
.
assertEqual
(
'v_%d_1'
%
d
,
tg
[
1
][
1
])
self
.
assertEqual
(
'v_%d_2'
%
d
,
tg
[
2
][
1
])
self
.
assertEqual
(
'v_%d_3'
%
d
,
tg
[
3
][
1
])
self
.
assertEqual
(
'v_%d_4'
%
d
,
tg
[
4
][
1
])
self
.
assertEqual
(
'v_%d_5'
%
d
,
tg
[
5
][
1
])
self
.
assertEqual
(
1
,
tg
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
tg
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
tg
[
1
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
tg
[
1
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
tg
[
2
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
2
,
tg
[
2
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
2
,
tg
[
3
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
tg
[
3
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
tg
[
3
][
0
].
shape
.
dims
[
1
])
self
.
assertEqual
(
2
,
tg
[
4
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
tg
[
4
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
tg
[
4
][
0
].
shape
.
dims
[
1
])
self
.
assertEqual
(
1
,
tg
[
5
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
1
,
tg
[
5
][
0
].
shape
.
dims
[
0
])
class
DynamicPackingTest
(
test_util
.
TensorFlowTestCase
):
"""Packing/Unpacking tests that require executing a TensorFlow session."""
def
_init_tensors
(
self
,
num_towers
,
tensor_shapes
):
"""Construct a collection of tensors across multiple devices."""
num_tensors
=
len
(
tensor_shapes
)
consts
=
[]
tensors
=
[]
vrbls
=
[]
tower_grads
=
[]
tf
.
Variable
([
-
1
],
dtype
=
tf
.
int32
,
name
=
'packing_var_placeholder'
)
for
dev_idx
in
range
(
0
,
num_towers
):
devname
=
'/job:localhost/device:GPU:%d'
%
dev_idx
consts
.
append
([])
tensors
.
append
([])
vrbls
.
append
([])
with
tf
.
device
(
devname
):
base_value
=
0
gv_tuples
=
[]
for
t_idx
in
range
(
0
,
num_tensors
):
shape
=
tensor_shapes
[
t_idx
]
num_elts
=
0
for
d
in
shape
:
num_elts
=
(
num_elts
or
1
)
*
d
c
=
np
.
fromiter
(
range
(
base_value
,
base_value
+
num_elts
),
dtype
=
np
.
float32
).
reshape
(
shape
)
base_value
+=
num_elts
consts
[
dev_idx
].
append
(
c
)
tensors
[
dev_idx
].
append
(
tf
.
constant
(
c
))
vrbls
[
dev_idx
].
append
(
tf
.
Variable
(
c
,
name
=
'v_d%d_t%d'
%
(
dev_idx
,
t_idx
)))
gv_tuples
.
append
((
tensors
[
dev_idx
][
-
1
],
vrbls
[
dev_idx
][
-
1
]))
tower_grads
.
append
(
gv_tuples
)
return
tower_grads
,
consts
,
tensors
,
vrbls
_test_tuple
=
pycoll
.
namedtuple
(
'_test_tuple'
,
'num_devices, in_shapes out_shapes out_i'
)
def
_do_pack_unpack_test
(
self
,
tt
):
"""Do a single pack-unpack test.
Args:
tt: A _test_tuple defining the parameters of the test to do.
This test executes a graph that performs a pack of tower_grads
followed by an unpack and verifies that the shapes and values
of gradient tensors are unchanged, along with paired variables.
"""
with
ops
.
Graph
().
as_default
():
tower_grads
,
consts
,
_
,
vrbls
=
self
.
_init_tensors
(
tt
.
num_devices
,
tt
.
in_shapes
)
packed_tg
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
40
,
max_group
=
10
)
unpacked_tg
=
allreduce
.
unpack_small_tensors
(
packed_tg
,
packing
)
with
self
.
test_session
()
as
sess
:
sess
.
run
(
variables
.
global_variables_initializer
())
packed
=
sess
.
run
(
packed_tg
)
for
d
in
range
(
0
,
tt
.
num_devices
):
for
t
in
range
(
0
,
len
(
tt
.
out_shapes
)):
num_elts
=
0
for
dim
in
tt
.
out_shapes
[
t
]:
num_elts
=
(
num_elts
or
1
)
*
dim
self
.
assertTrue
(
np
.
array_equal
(
np
.
array
(
range
(
tt
.
out_i
[
t
],
tt
.
out_i
[
t
]
+
num_elts
),
dtype
=
np
.
float32
).
reshape
(
tt
.
out_shapes
[
t
]),
packed
[
d
][
t
][
0
]))
unpacked
=
sess
.
run
(
unpacked_tg
)
for
d
in
range
(
0
,
tt
.
num_devices
):
for
t
in
range
(
0
,
len
(
tt
.
in_shapes
)):
self
.
assertTrue
(
np
.
array_equal
(
consts
[
d
][
t
],
unpacked
[
d
][
t
][
0
]))
self
.
assertEqual
(
vrbls
[
d
][
t
],
unpacked_tg
[
d
][
t
][
1
])
def
testPackUnpack0
(
self
):
self
.
_do_pack_unpack_test
(
self
.
_test_tuple
(
num_devices
=
3
,
in_shapes
=
[[
8
],
[
3
,
3
],
[
12
],
[
5
,
5
,
5
]],
out_shapes
=
[[
17
],
[
12
],
[
5
,
5
,
5
]],
out_i
=
[
0
,
17
,
29
]))
def
testPackUnpack1
(
self
):
self
.
_do_pack_unpack_test
(
self
.
_test_tuple
(
num_devices
=
4
,
in_shapes
=
[[
5
,
5
,
5
],
[
2
,
3
],
[
5
]],
out_shapes
=
[[
11
],
[
5
,
5
,
5
]],
out_i
=
[
125
,
0
]))
def
testPackUnpack2
(
self
):
self
.
_do_pack_unpack_test
(
self
.
_test_tuple
(
num_devices
=
2
,
in_shapes
=
[[
5
,
5
,
5
],
[
2
,
3
],
[
1
,
5
],
[
7
],
[
100
]],
out_shapes
=
[[
18
],
[
5
,
5
,
5
],
[
100
]],
out_i
=
[
125
,
0
,
143
]))
def
_do_all_reduce_pack_test
(
self
,
tt
):
"""Test that all-reduce results are the same with or without packing."""
with
ops
.
Graph
().
as_default
():
tower_grads
,
consts
,
_
,
_
=
self
.
_init_tensors
(
tt
.
num_devices
,
tt
.
in_shapes
)
dev_prefixes
=
[
'/job:localhost'
]
num_workers
=
1
alg
=
'xring'
shards
=
1
single_session
=
True
gpu_indices
=
range
(
0
,
tt
.
num_devices
)
assert
len
(
gpu_indices
)
==
len
(
tower_grads
)
no_pack_all_reduce
=
allreduce
.
sum_gradients_all_reduce
(
single_session
,
dev_prefixes
,
tower_grads
,
num_workers
,
alg
,
shards
,
gpu_indices
,
agg_small_grads_max_bytes
=
0
,
agg_small_grads_max_group
=
1
)
packed_tg
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
100
,
100
)
packed_all_reduce
=
allreduce
.
sum_gradients_all_reduce
(
single_session
,
dev_prefixes
,
packed_tg
,
num_workers
,
alg
,
shards
,
gpu_indices
,
agg_small_grads_max_bytes
=
0
,
agg_small_grads_max_group
=
1
)
unpacked_tg
=
allreduce
.
unpack_small_tensors
(
packed_all_reduce
,
packing
)
with
self
.
test_session
()
as
sess
:
sess
.
run
(
variables
.
global_variables_initializer
())
no_pack_values
=
sess
.
run
(
no_pack_all_reduce
)
pack_unpack_values
=
sess
.
run
(
unpacked_tg
)
for
d
in
range
(
1
,
tt
.
num_devices
):
for
t
in
range
(
0
,
len
(
tt
.
in_shapes
)):
self
.
assertTrue
(
np
.
allclose
(
no_pack_values
[
d
][
t
][
0
],
tt
.
num_devices
*
consts
[
0
][
t
]))
self
.
assertTrue
(
np
.
array_equal
(
no_pack_values
[
d
][
t
][
0
],
pack_unpack_values
[
d
][
t
][
0
]))
def
testAllReducePacked0
(
self
):
self
.
_do_all_reduce_pack_test
(
self
.
_test_tuple
(
num_devices
=
3
,
in_shapes
=
[[
8
],
[
3
,
3
],
[
12
],
[
5
,
5
,
5
]],
out_shapes
=
[[
17
],
[
12
],
[
5
,
5
,
5
]],
out_i
=
[
0
,
17
,
29
]))
def
testAllReducePacked1
(
self
):
self
.
_do_all_reduce_pack_test
(
self
.
_test_tuple
(
num_devices
=
2
,
in_shapes
=
[[
8
],
[
3
,
3
],
[
12
],
[
5
,
5
,
5
],
[
3
],
[
4
]],
out_shapes
=
[[
17
],
[
7
],
[
12
],
[
5
,
5
,
5
]],
out_i
=
[
0
,
17
,
29
,
154
,
157
]))
if
__name__
==
'__main__'
:
tf
.
disable_v2_behavior
()
tf
.
test
.
main
()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/batch_allreduce.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes and functions for doing a single-machine batch all-reduce.
An all-reduce is taking the reduction (typically a sum) of a list of tensors,
each on a different device. The result must end up back on each device, which is
where the word "all" comes from. In summary, each device starts with a single
tensor, and ends up with the reduction of all tensors.
A batch all-reduce is doing several independent all-reduces. When doing a batch
all-reduce, care is taken to evenly distribute the reduction computations
across devices and inter-device tensor transfers across device links.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# TODO(reedwm): Support distributed all-reduces in this file.
# TODO(reedwm): Merge this code with allreduce.py, which contains some batch
# all-reduce code that this file calls. allreduce.py also supports distributed
# batch-reduce while this file only supports single-machine all-reduce.

import abc

import six
import tensorflow.compat.v1 as tf

from tensorflow.python.ops import data_flow_ops
import allreduce
import constants
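# --- Illustration (not part of the original file) ---
# Minimal sketch of the all-reduce semantics described in the module
# docstring above, using plain Python lists in place of per-device tensors.
# This is a toy model for illustration only, not the implementation below.
def _toy_batch_all_reduce(all_device_tensors):
  """all_device_tensors[i][j] is tensor j on device i; returns the same
  structure where every entry j is replaced by the sum over devices."""
  num_tensors = len(all_device_tensors[0])
  sums = [sum(dev[j] for dev in all_device_tensors)
          for j in range(num_tensors)]
  # Every device ends up with an identical copy of each reduced tensor.
  return [list(sums) for _ in all_device_tensors]

# Example: _toy_batch_all_reduce([[1.0, 2.0], [3.0, 4.0]])
#          == [[4.0, 6.0], [4.0, 6.0]]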
def _all_reduce_using_copy(tensors_across_devices, use_mean):
  """Does an all-reduce of a list of tensors by copying to the current device.

  The tensors are copied to the current device and then reduced.

  Args:
    tensors_across_devices: A list of tensors, each on a different device.
    use_mean: Whether to take the mean of the tensors instead of a sum.
  Returns:
    A reduced tensor on the current device.
  """
  reduced_tensor = tf.add_n(tensors_across_devices)
  if use_mean:
    reduced_tensor *= 1 / len(tensors_across_devices)
  return reduced_tensor
@six.add_metaclass(abc.ABCMeta)
class BatchAllReduceAlgorithm(object):
  """Represents an algorithm for performing a batch all-reduce operation."""

  def batch_all_reduce(self,
                       all_device_tensors,
                       num_splits,
                       compact_tensors,
                       defer_tensors,
                       xla_compile=False):
    """Performs a batch all-reduce.

    The reduction done is a sum.

    `all_device_tensors` is a list of list of tensors that will be batch
    all-reduced. All tensors within a single inner list must be on the same
    device. The nth element in each list, for any n, will be reduced together.
    The return value is in the same form as `all_device_tensors`, except that
    each tensor is reduced.

    For example, if `all_device_tensors` is:
    [[ A,  B  ],     # A and B are on GPU 0
     [ C,  D  ]]     # C and D are on GPU 1

    Then the return value will be:
    [[ A+C,  B+D ],  # These two tensors are on GPU 0
     [ A+C,  B+D ]]  # These two tensors are on GPU 1

    Arguments:
      all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
        is a tensor where `i` is the device index and `j` is the tensor index.
      num_splits: If not None, tensors will be concatenated and split into
        this many pieces during the all-reduce, then split back into their
        original shapes afterwards. Has no impact on correctness and can
        improve performance. Requires all tensors to be the same type.
      compact_tensors: If True, tensors are casted to fp16 before being all-
        reduced. Improves performance, but hurts numerical stability.
      defer_tensors: If True, every time the return value
        `reduced_all_device_tensors` is evaluated, the result will be the
        reduced tensors values of `all_device_tensors` from the previous
        session run instead of the current session run, or zero on the first
        session run. This can improve performance. When training neural
        networks, deferring gradients often does not harm training, so this
        can be used to improve performance.
      xla_compile: If True, use XLA to compile gradients packing and unpacking
        ops.

    Returns:
      reduced_all_device_tensors: A list in the same form as
        `all_device_tensors`, except each tensor has been reduced.
      warmup_ops: A list of ops needed to be run once before the all-reduce
        can occur.
    """

    # Before all-reducing tensors, we do several preprocessing functions that
    # can speed up the all-reduce. We undo these functions after all-reducing
    # the tensors.

    # all_device_packed_tensors is a 2-d list of tensors indexed by
    # [device_id][tensor_id], holding packed tensors from all devices involved
    # in all-reduce.
    all_device_packed_tensors = []
    # all_device_warmup_ops is a 2-d list of ops indexed by
    # [device_id][tensor_id], holding warmup_ops that need to be run once
    # before all-reduce can occur.
    all_device_warmup_ops = []
    # all_device_put_ops is a 2-d list of ops indexed by
    # [device_id][tensor_id], holding put ops for deferred tensors. They will
    # be called in each all-reduce step automatically due to control
    # dependency.
    all_device_put_ops = []
    # packers is a list of _TensorPacker, one for each device involved in
    # all-reduce.
    packers = [
        _TensorPacker(num_splits, compact_tensors)
        for _ in all_device_tensors
    ]

    for packer, device_tensors in zip(packers, all_device_tensors):
      def pack_single_device_tensors(packer=packer,
                                     device_tensors=device_tensors):
        """Pack gradient tensors of a device."""
        packed_tensors = packer.maybe_concat_tensors(device_tensors)
        packed_tensors = packer.maybe_compact_tensors(packed_tensors)
        # When xla_compile=False, defer tensors after concat for better
        # performance.
        if defer_tensors and not xla_compile:
          packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
              packed_tensors)
          all_device_put_ops.append(put_ops)
          all_device_warmup_ops.append(warmup_ops)
        packed_tensors = packer.maybe_split_tensors(packed_tensors)
        return packed_tensors

      with tf.device(device_tensors[0].device):
        if xla_compile:
          packed_tensors = tf.xla.experimental.compile(
              pack_single_device_tensors)
          # When xla_compile=True, intermediate tensors in packing process are
          # not materialized. Thus, we defer tensors after packing process is
          # completed instead of in the middle of it.
          if defer_tensors:
            packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
                packed_tensors)
            all_device_put_ops.append(put_ops)
            all_device_warmup_ops.append(warmup_ops)
        else:
          packed_tensors = pack_single_device_tensors()
        all_device_packed_tensors.append(packed_tensors)

    # Perform all-reduce on packed tensors.
    all_device_tensors = self._do_batch_all_reduce(all_device_packed_tensors)

    all_device_unpacked_tensors = []
    for packer, device_tensors in zip(packers, all_device_tensors):
      def unpack_single_device_tensors(packer=packer,
                                       device_tensors=device_tensors):
        """Unpack gradient tensors of a device."""
        unpacked_tensors = packer.undo_maybe_split_tensors(device_tensors)
        unpacked_tensors = packer.undo_maybe_compact_tensors(unpacked_tensors)
        unpacked_tensors = packer.undo_maybe_concat_tensors(unpacked_tensors)
        return unpacked_tensors

      with tf.device(device_tensors[0].device):
        if xla_compile:
          unpacked_device_tensor = tf.xla.experimental.compile(
              unpack_single_device_tensors)
        else:
          unpacked_device_tensor = unpack_single_device_tensors()
        all_device_unpacked_tensors.append(unpacked_device_tensor)

    # Note: There is no undo operation for deferring tensors. But we do need
    # to call _add_put_op_control_deps at the end if we deferred the tensors.
    if defer_tensors:
      all_device_unpacked_tensors = _add_put_op_control_deps(
          all_device_unpacked_tensors, num_splits, all_device_put_ops)

    return all_device_unpacked_tensors, all_device_warmup_ops

  @abc.abstractmethod
  def _do_batch_all_reduce(self, all_device_tensors):
    """Performs a batch all-reduce.

    Unlike `self.batch_all_reduce`, this does not do any preprocessing of the
    tensors.

    Args:
      all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
        is a tensor where `i` is the device index and `j` is the tensor index.
    Returns:
      reduced_all_device_tensors: A list in the same form as
        `all_device_tensors`, except each tensor has been reduced.
    """
    pass
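# --- Illustration (not part of the original file) ---
# Concrete instance of the example in the batch_all_reduce() docstring above,
# stated with the toy reducer sketched after the imports (illustration only;
# the real class dispatches to a concrete _do_batch_all_reduce subclass):
#   A, B on "GPU 0" and C, D on "GPU 1":
#   _toy_batch_all_reduce([[1.0, 10.0], [2.0, 20.0]])
#   == [[3.0, 30.0], [3.0, 30.0]]   # i.e. [[A+C, B+D], [A+C, B+D]]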
class CopyToDeviceAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that copies tensors to be reduced to a specific device."""

  def __init__(self, devices_to_reduce_on, use_mean=False):
    self._devices = devices_to_reduce_on
    self._use_mean = use_mean

  def _do_batch_all_reduce(self, all_device_tensors):
    reduced_tensors = []
    for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
      with tf.device(self._devices[i % len(self._devices)]):
        reduced_tensor = _all_reduce_using_copy(tensors_across_devices,
                                                self._use_mean)
        reduced_tensors.append(reduced_tensor)
    # The tensors will be brought back to each device once they are used.
    return [reduced_tensors] * len(all_device_tensors)
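# --- Illustration (not part of the original file) ---
# CopyToDeviceAlgorithm spreads the per-tensor reductions round-robin over
# `devices_to_reduce_on`. A hypothetical construction (device names assumed):
#   algo = CopyToDeviceAlgorithm(['/gpu:0', '/gpu:1'])
# tensor 0 is reduced on /gpu:0, tensor 1 on /gpu:1, tensor 2 on /gpu:0, ...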
class HierarchicalCopyAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that uses hierarchical copies.

  This is only optimized for eight devices connected in NetworkTopology.DGX1
  or NetworkTopology.GCP_V100 topology.
  """

  def __init__(self, network_topology):
    """Initializer for HierarchicalCopyAlgorithm.

    Args:
      network_topology: An instance of Enum class constants.NetworkTopology.
    """
    self._network_topology = network_topology

  def _do_batch_all_reduce(self, all_device_tensors):
    avail_devices = [device_tensors[0].device
                     for device_tensors in all_device_tensors]
    reduced_tensors = []
    num_devices = len(avail_devices)
    group_size = num_devices // 2
    for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
      group_0_main_device, group_1_main_device = self.__get_main_devices(
          i, num_devices)
      if group_0_main_device < group_size:
        group_0_begin = 0
        group_1_begin = group_size
      else:
        group_0_begin = group_size
        group_1_begin = 0

      # Reduce the first group.
      group_0_tensors = tensors_across_devices[group_0_begin:
                                               group_0_begin + group_size]
      with tf.device(avail_devices[group_0_main_device]):
        group_0_reduced_tensor = _all_reduce_using_copy(group_0_tensors, False)

      # Reduce the second group.
      group_1_tensors = tensors_across_devices[group_1_begin:
                                               group_1_begin + group_size]
      with tf.device(avail_devices[group_1_main_device]):
        group_1_reduced_tensor = _all_reduce_using_copy(group_1_tensors, False)

      # Reduce between the groups.
      with tf.device(avail_devices[group_0_main_device]):
        total_reduced_tensor = _all_reduce_using_copy(
            [group_0_reduced_tensor, group_1_reduced_tensor], False)

      # Broadcast the result back into the root of each group.
      with tf.device(avail_devices[group_0_main_device]):
        group_0_reduced_tensor_bcast = tf.identity(total_reduced_tensor)
      with tf.device(avail_devices[group_1_main_device]):
        group_1_reduced_tensor_bcast = tf.identity(total_reduced_tensor)

      reduced_tensors_bcast = []
      for j in range(len(tensors_across_devices)):
        with tf.device(avail_devices[j]):
          # Broadcast the result back to each member in the group from the
          # root.
          if (group_0_main_device < group_size) == (j < group_size):
            src_device_tensor = group_0_reduced_tensor_bcast
          else:
            src_device_tensor = group_1_reduced_tensor_bcast
          reduced_tensors_bcast.append(tf.identity(src_device_tensor))

      reduced_tensors.append(reduced_tensors_bcast)

    reduced_tensors = list(zip(*reduced_tensors))
    return reduced_tensors

  def __get_main_devices(self, tensor_index, num_devices):
    """Returns the pair of main devices to use for initial reduction.

    Args:
      tensor_index: Index of the current tensor in the list of tensors to
        copy.
      num_devices: Total number of devices.
    Returns:
      A tuple containing pair of main device indices for the initial
      reduction. Then, the first element of the tuple should be used for the
      final reduction.
    Raises:
      ValueError: Invalid input arguments.
    """
    if self._network_topology == constants.NetworkTopology.DGX1:
      return tensor_index % num_devices, (tensor_index +
                                          (num_devices // 2)) % num_devices
    elif self._network_topology == constants.NetworkTopology.GCP_V100:
      if num_devices != 8:
        raise ValueError('HierarchicalCopy only supports eight devices in %s.'
                         % self._network_topology)
      # TODO(hinsu): Generalize main device indices to handle any other
      # isomorphic connection graph that connects two cliques using
      # connections other than 0-5 and 2-7.
      main_device_pairs = [(0, 5), (2, 7), (5, 0), (7, 2)]
      return main_device_pairs[tensor_index % len(main_device_pairs)]
    else:
      # TODO(reedwm): make this logic more general for arbitrary topology.
      raise ValueError(
          'HierarchicalCopy is not supported for %s network topology.' %
          self._network_topology)
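# --- Illustration (not part of the original file) ---
# For the DGX1 branch above, the two group roots rotate with the tensor index
# so no single GPU becomes a bottleneck. With num_devices=8 the pair is
# (tensor_index % 8, (tensor_index + 4) % 8), e.g.:
#   tensor_index 0 -> roots (0, 4)
#   tensor_index 1 -> roots (1, 5)
#   tensor_index 7 -> roots (7, 3)
#   tensor_index 8 -> roots (0, 4) again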
class AllReduceSpecAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that uses an all reduce spec."""

  def __init__(self, all_reduce_spec, gpu_indices, agg_small_grads_max_bytes,
               agg_small_grads_max_group):
    spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
    if len(spec) != 1:
      raise ValueError(
          'Replicated mode does not support hybrid all-reduce strategies')
    self._all_reduce_spec = spec[0]
    self._gpu_indices = gpu_indices
    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
    self._agg_small_grads_max_group = agg_small_grads_max_group

  def _do_batch_all_reduce(self, all_device_tensors):
    # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
    # gradient aggregation code, since gradient aggregation is doing an all
    # reduce. Currently, we do gradient repacking in two different places.
    # TODO(reedwm): Change the allreduce code to reduce tensors instead of
    # tower_grads.
    tower_grads = [[(t, None) for t in device_tensors]
                   for device_tensors in all_device_tensors]
    aggregated_device_grads = allreduce.sum_gradients_all_reduce(
        False,  # single_session
        ['/job:localhost'],
        tower_grads,
        1,
        self._all_reduce_spec.alg,
        self._all_reduce_spec.shards,
        self._gpu_indices,
        agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
        agg_small_grads_max_group=self._agg_small_grads_max_group)
    return [[t for t, _ in grad_vars]
            for grad_vars in aggregated_device_grads]
def algorithm_from_params(params):
  """Returns a BatchAllReduceAlgorithm from a Params tuple."""
  if params.all_reduce_spec:
    if params.gpu_indices:
      gpu_indices = [int(x) for x in params.gpu_indices.split(',')]
    else:
      gpu_indices = [x for x in range(params.num_gpus)]
    return AllReduceSpecAlgorithm(params.all_reduce_spec, gpu_indices,
                                  params.agg_small_grads_max_bytes,
                                  params.agg_small_grads_max_group)
  elif params.hierarchical_copy:
    return HierarchicalCopyAlgorithm(params.network_topology)
  else:
    if params.local_parameter_device == 'gpu':
      devices_to_reduce_on = ['/gpu:%d' % i for i in range(params.num_gpus)]
    else:
      devices_to_reduce_on = ['/cpu:0']
    return CopyToDeviceAlgorithm(devices_to_reduce_on)
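# --- Illustration (not part of the original file) ---
# Sketch of the selection logic above for a hypothetical params namespace
# (field names taken from the function, values assumed for illustration):
#   all_reduce_spec='nccl', gpu_indices='0,1'      -> AllReduceSpecAlgorithm
#   all_reduce_spec=None, hierarchical_copy=True   -> HierarchicalCopyAlgorithm
#   all_reduce_spec=None, hierarchical_copy=False,
#   local_parameter_device='gpu', num_gpus=2
#       -> CopyToDeviceAlgorithm(['/gpu:0', '/gpu:1'])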
def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True):
  """Applies a function to each tensor in `all_device_tensors`.

  A new list of lists of tensors is returned, where every tensor in
  `all_device_tensors` has had `apply_func` called on it. `all_device_tensors`
  is not modified.

  Args:
    all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
      is a tensor where `i` is the device index and `j` is the tensor index.
    apply_func: A function taking in three arguments: tensor, device_index,
      tensor_index, and returning a modified tensor.
      `tensor` is `all_device_tensors[device_index][tensor_index]`.
    colocate: If True, apply_func will be run under a context manager
      colocated with its input tensor.

  Returns:
    A list in the same form as `all_device_tensors`, except each tensor has
    had `apply_func` called on it.
  """
  new_all_device_tensors = []
  for device_index, device_tensors in enumerate(all_device_tensors):
    new_device_tensors = []
    for tensor_index, t in enumerate(device_tensors):
      if colocate:
        with tf.colocate_with(t):
          new_t = apply_func(t, device_index, tensor_index)
      else:
        new_t = apply_func(t, device_index, tensor_index)
      new_device_tensors.append(new_t)
    new_all_device_tensors.append(new_device_tensors)
  return new_all_device_tensors
def _defer_tensor(tensor):
  """Defers the retrieval of a tensor.

  The tensor is put into a StagingArea, and the return value is the
  retrieval of the tensor from the StagingArea. The effect is that the
  tensor returned from this function is the tensor that was put in the
  StagingArea for the previous Session.run() call.

  Args:
    tensor: The tensor to defer for one step.

  Returns:
    deferred_tensor: The tensor deferred for one step.
    put_op: An op to put `tensor` in the StagingArea. Must be run every step
      that `deferred_tensor` is run.
    warmup_op: A warmup op that should be called before the first step. Puts
      a zero tensor into the StagingArea.
  """
  tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape])
  put_op = tensor_stage.put([tensor])
  warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)])

  # Fetch the next tensor to use.
  (tensor,) = tensor_stage.get()
  return tensor, put_op, warmup_op
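# --- Illustration (not part of the original file) ---
# Timeline of the staging trick above (values are illustrative):
#   step 0: warmup_op puts zeros; get() returns zeros; put_op stages x_0
#   step 1: get() returns x_0 (the previous step's value); put_op stages x_1
#   step n: get() returns x_{n-1}
# Consumers of the deferred tensor always see the value produced one session
# run earlier, which is what makes gradient deferral possible.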
def defer_single_device_tensors(device_tensors):
  """Defer tensors (gradients in this case) from a single device.

  Arguments:
    device_tensors: A list of gradients tensors from a single device to defer.

  Returns:
    deferred_tensors: A list of tensors deferred for one step.
    put_ops: A list of ops that put `tensors` in the StagingAreas. Must be run
      every step that `deferred_tensors` is run.
    warmup_ops: Warmup ops that should be called before the first step. Puts
      zero tensors into the StagingArea.
  """
  put_ops = []
  warmup_ops = []
  deferred_tensors = []

  for tensor in device_tensors:
    deferred_tensor, put_op, warmup_op = _defer_tensor(tensor)
    deferred_tensors.append(deferred_tensor)
    put_ops.append(put_op)
    warmup_ops.append(warmup_op)

  return deferred_tensors, put_ops, warmup_ops
def _add_put_op_control_deps(all_device_tensors, num_splits, put_ops):
  """Add control dependencies from `put_ops` to `all_device_tensors`.

  This should only be called when deferred tensors are being used.

  The control dependencies are added so that the put ops are run whenever
  `all_device_tensors` is run. That way, the caller does not have to
  explicitly run the put ops.

  Args:
    all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
      is a tensor where `i` is the device index and `j` is the tensor index.
    num_splits: The number of splits that were used for the all-reduce.
    put_ops: A list of put ops from deferring the tensors.

  Returns:
    A list in the same form as `all_device_tensors`, except each tensor has a
    control dependency on an op in `put_ops`.
  """
  def apply_func(tensor, device_index, tensor_index):
    if num_splits == 0:
      deps = [put_ops[device_index][tensor_index]]
    else:
      deps = put_ops[device_index]
      assert len(deps) == 1
    with tf.control_dependencies(deps):
      return tf.identity(tensor, name='control_dependency')
  return _apply_to_all_device_tensors(all_device_tensors, apply_func)
class _TensorPacker(object):
  """Packs and unpacks tensors into groups.

  This class first concatenates a set of tensors, then splits the
  concatenated tensor into a small number of chunks. This is useful for
  all-reducing tensors, as doing a small number of all-reduces on large
  tensors can be faster than doing a large number of all-reduces on small
  tensors.

  It also provides an option to compact tensors by casting them to fp16, for
  better all-reduce performance.

  This class maintains states of processed tensors like shapes and types. So
  each packer can only be used to pack and unpack one list of tensors. If you
  need to pack multiple lists of tensors (say from multiple devices), then you
  need multiple _TensorPacker objects, one for each device.
  """

  def __init__(self, num_splits, compact):
    """Initializes the _TensorPacker.

    Arguments:
      num_splits: The number of tensors to split the concatenated tensor into.
        The batch all-reduce will consist of `num_splits` all-reduces. If None
        or zero, tensors are not split or concatenated.
      compact: If True, tensors are casted to fp16 during packing and casted
        back to their original dtypes during unpacking.
    """
    self._num_splits = num_splits
    self._compact = compact
    self._before_compact_dtypes = []

  def maybe_concat_tensors(self, device_tensors):
    """Concatenate tensors into a single tensor."""
    if not self._num_splits:
      return device_tensors
    flat_tensors = [tf.reshape(t, [-1]) for t in device_tensors]
    self._orig_shapes = [t.shape for t in device_tensors]
    self._orig_sizes = [s.num_elements() for s in self._orig_shapes]
    # All shapes must be fully defined.
    assert None not in self._orig_sizes
    concatenated_grad = tf.concat(flat_tensors, 0)
    return [concatenated_grad]

  def maybe_split_tensors(self, concatenated_tensor):
    """Split concatenated tensor into `num_splits` pieces."""
    if not self._num_splits:
      return concatenated_tensor
    if len(concatenated_tensor) != 1:
      raise RuntimeError('tensors must be concatenated via '
                         'maybe_concat_tensors() before splitting')
    concatenated_tensor = concatenated_tensor[0]
    total_tensor_size = concatenated_tensor.shape.num_elements()
    split_size = total_tensor_size // self._num_splits
    split_size_last = total_tensor_size - split_size * (self._num_splits - 1)
    split_sizes = [split_size] * (self._num_splits - 1) + [split_size_last]
    tensor_packs = tf.split(concatenated_tensor, split_sizes)
    return tensor_packs
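  # --- Illustration (not part of the original file) ---
  # Split-size arithmetic used in maybe_split_tensors() (values illustrative):
  # for a concatenated tensor of 1000 elements and num_splits=3:
  #   split_size      = 1000 // 3 = 333
  #   split_size_last = 1000 - 333 * 2 = 334
  #   split_sizes     = [333, 333, 334]   # sums back to 1000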
  def undo_maybe_split_tensors(self, tensor_packs):
    """Undo maybe_split_tensors()."""
    if not self._num_splits:
      return tensor_packs
    return [tf.concat(tensor_packs, 0)]

  def undo_maybe_concat_tensors(self, concatenated_tensor):
    """Undo maybe_concat_tensors()."""
    if not self._num_splits:
      return concatenated_tensor
    if len(concatenated_tensor) != 1:
      raise RuntimeError(
          'undo_maybe_split_tensors() must be called before '
          'undo_maybe_concat_tensors when num_splits is greater than 1')
    concatenated_tensor = concatenated_tensor[0]
    tensors_with_sizes = tf.split(concatenated_tensor, self._orig_sizes)
    tensors_with_shapes = [
        tf.reshape(grad, shape)
        for grad, shape in zip(tensors_with_sizes, self._orig_shapes)
    ]
    return tensors_with_shapes

  def maybe_compact_tensors(self, device_tensors):
    """Cast tensors to fp16 and store their original types."""
    if not self._compact:
      return device_tensors
    if self._before_compact_dtypes:
      raise RuntimeError('maybe_compact_tensors can only be called once.')
    self._before_compact_dtypes = [t.dtype for t in device_tensors]
    compact_tensors = [tf.cast(t, tf.float16) for t in device_tensors]
    return compact_tensors

  def undo_maybe_compact_tensors(self, compact_tensors):
    """Undo maybe_compact_tensors()."""
    if not self._compact:
      return compact_tensors
    if not self._before_compact_dtypes:
      raise RuntimeError('maybe_compact_tensors() must be called before '
                         'undo_maybe_compact_tensors()')
    device_tensors = [
        tf.cast(t, dtype)
        for t, dtype in zip(compact_tensors, self._before_compact_dtypes)
    ]
    return device_tensors
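# --- Illustration (not part of the original file) ---
# Hedged usage sketch of the _TensorPacker pack/unpack round trip (TF1 graph
# mode; shapes and num_splits chosen arbitrarily for illustration):
#   packer = _TensorPacker(num_splits=2, compact=False)
#   tensors = [tf.ones([3, 3]), tf.zeros([5])]        # 9 + 5 = 14 elements
#   packed = packer.maybe_split_tensors(
#       packer.maybe_concat_tensors(tensors))          # two chunks: 7 + 7
#   restored = packer.undo_maybe_concat_tensors(
#       packer.undo_maybe_split_tensors(packed))       # original shapes back
# `restored` has the same shapes and values as `tensors`.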
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn.py
deleted 100644 → 0
View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow benchmark library.
See the README for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
from collections import namedtuple
import contextlib
import math
import multiprocessing
import os
import re
import threading
import time
import traceback

from absl import flags as absl_flags
import numpy as np

import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf

# pylint: disable=g-direct-tensorflow-import
import cnn_util
import constants
import datasets
import flags
import mlperf
import variable_mgr
import variable_mgr_util
from cnn_util import log_fn
from models import model_config
from platforms import util as platforms_util
from google.protobuf import text_format
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python import debug as tf_debug
from tensorflow.python.client import timeline
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_util_impl
from tensorflow.python.framework import importer
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import nest
_DEFAULT_NUM_BATCHES = 100

# GraphInfo encapsulates the tensors/ops that we care about after building a
# graph. We use them to benchmark the graph.
GraphInfo = namedtuple(  # pylint: disable=invalid-name
    'GraphInfo',
    [
        # Ops that produce the input batches (before preprocessing).
        'input_producer_op',
        # Ops that adds the preprocessed images to the staging areas
        'enqueue_ops',
        # Fetches of sess.run()
        'fetches',
        # Op that performs synchronization in distributed mode
        'execution_barrier',
        # The global step variable
        'global_step',
        # Group of ops that perform per-device initialization work
        'local_var_init_op_group',
        # Op to produce summaries
        'summary_op'
    ])

# InputProcessingInfo contains various sources of inputs which will be later
# fed into the model. If synthetic data is used, all three fields are None.
InputProcessingInfo = namedtuple(
    'InputProcessingInfo',
    [
        # The first two fields are non-None iff datasets prefetching is not
        # used.
        # Ops that produce the input batches.
        'input_producer_op',
        # A list of StagingArea for each device.
        'input_producer_stages',
        # Input produced using multi device iterator. Non-None iff datasets
        # prefetching is used
        'multi_device_iterator_input'
    ])
# TODO(reedwm): add upper_bound and lower_bound to appropriate integer and
# float flags, and change certain string flags to enum flags.

flags.DEFINE_string('model', 'trivial',
                    'Name of the model to run, the list of supported models '
                    'are defined in models/model.py')
# The code will first check if it's running under benchmarking mode
# or evaluation mode, depending on 'eval':
# Under the evaluation mode, this script will read a saved model,
# and compute the accuracy of the model against a validation dataset.
# Additional ops for accuracy and top_k predictors are only used under
# this mode.
# Under the benchmarking mode, user can specify whether or not to use
# the forward-only option, which will only compute the loss function.
# forward-only cannot be enabled with eval at the same time.
flags.DEFINE_boolean('eval', False, 'whether use eval or benchmarking')
flags.DEFINE_integer('eval_interval_secs', 0,
                     'How often to run eval on saved checkpoints. Usually the '
                     'same as save_model_secs from the corresponding training '
                     'run. Pass 0 to eval only once.')
flags.DEFINE_integer('eval_during_training_every_n_steps', None,
                     'Every n steps during training, pause training, run '
                     'evaluation, then resume training. Must not be used with '
                     '--eval, as unlike --eval, this option causes both '
                     'training and eval to be done. This may take slightly '
                     'more GPU memory than running just training or '
                     'evaluation alone. It also may slightly slow down '
                     'training, even when not taking into account the '
                     'additional time to evaluate.',
                     lower_bound=1)
flags.DEFINE_float('eval_during_training_every_n_epochs', None,
                   'After every n training epochs, pause training, run '
                   'evaluation, then resume training. See '
                   '--eval_during_training_every_n_steps for more '
                   'information.')
flags.DEFINE_list('eval_during_training_at_specified_steps', [],
                  'Specify a list of training steps, pause training at each '
                  'of these steps, run evaluation, then resume training. See '
                  '--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_epochs', [],
                  'Specify a list of training epochs, pause training after '
                  'each of these epochs, run evaluation, then resume '
                  'training. See --eval_during_training_every_n_steps for '
                  'more information.')
flags.DEFINE_boolean('forward_only', False,
                     'whether use forward-only or training for benchmarking')
flags.DEFINE_boolean('freeze_when_forward_only', False,
                     'whether to freeze the graph when in forward-only mode.')
flags.DEFINE_boolean('print_training_accuracy', False,
                     'whether to calculate and print training accuracy during '
                     'training')
flags.DEFINE_integer('batch_size', 0, 'batch size per compute device')
flags.DEFINE_integer('eval_batch_size', 0,
                     'eval batch size per compute device')
flags.DEFINE_integer('batch_group_size', 1,
                     'number of groups of batches processed in the image '
                     'producer.')
flags.DEFINE_integer('num_batches', None,
                     'number of batches to run, excluding '
                     'warmup. Defaults to %d' % _DEFAULT_NUM_BATCHES)
flags.DEFINE_integer('num_eval_batches', None,
                     'number of eval batches to run, excluding warmup. '
                     'Defaults to --num_batches')
flags.DEFINE_float('num_epochs', None,
                   'number of epochs to run, excluding warmup. '
                   'This and --num_batches cannot both be specified.')
flags.DEFINE_float('num_eval_epochs', None,
                   'number of eval epochs to run, excluding warmup. '
                   'Defaults to --num_epochs')
flags.DEFINE_float('stop_at_top_1_accuracy', None,
                   'If set, stops training after the evaluation accuracy hits '
                   'this number. Can only be used with one of the '
                   '--eval_during_training_* flags.')
flags.DEFINE_boolean('collect_eval_results_async', False,
                     'If True, start a separate process to postprocess eval '
                     'results asynchronously. This currently only works with '
                     'the SSD model.')
flags.DEFINE_integer('num_warmup_batches', None,
                     'number of batches to run before timing')
flags.DEFINE_integer('autotune_threshold', None,
                     'The autotune threshold for the models')
# TODO(tucker): change num_gpus to num_devices
flags.DEFINE_integer('num_gpus', 1, 'the number of GPUs to run on')
flags.DEFINE_string('gpu_indices', '', 'indices of worker GPUs in ring order')
flags.DEFINE_integer('display_every', 10,
                     'Number of local steps after which progress is printed '
                     'out')
flags.DEFINE_float('display_perf_ewma', None,
                   'If set, display numbers of images/sec using exponentially '
                   'weighted moving average with the specified weight, which '
                   'defines how much current value contributes to the '
                   'reported average. Increasing weight makes the reported '
                   'performance number reflect more about the real-time speed '
                   'instead of the entire history',
                   lower_bound=0, upper_bound=1)
flags.DEFINE_string('data_dir', None,
                    'Path to dataset in TFRecord format (aka Example '
                    'protobufs). If not specified, synthetic data will be '
                    'used.')
flags.DEFINE_string('data_name', None,
                    'Name of dataset: imagenet or cifar10. If not specified, '
                    'it is automatically guessed based on data_dir.')
flags.DEFINE_string('resize_method', 'bilinear',
                    'Method for resizing input images: crop, nearest, '
                    'bilinear, bicubic, area, or round_robin. The `crop` mode '
                    'requires source images to be at least as large as the '
                    'network input size. The `round_robin` mode applies '
                    'different resize methods based on position in a batch in '
                    'a round-robin fashion. Other modes support any sizes and '
                    'apply random bbox distortions before resizing (even with '
                    'distortions=False).')
flags.DEFINE_boolean('distortions', False,
                     'Enable/disable distortions during image preprocessing. '
                     'These include bbox and color distortions.')
flags.DEFINE_boolean('use_datasets', True,
                     'Enable use of datasets for input pipeline')
flags.DEFINE_string('input_preprocessor', 'default',
                    'Name of input preprocessor. The list of supported input '
                    'preprocessors are defined in preprocessing.py.')
flags.DEFINE_string('gpu_thread_mode', 'gpu_private',
                    'Methods to assign GPU host work to threads. '
                    'global: all GPUs and CPUs share the same global threads; '
                    'gpu_private: a private threadpool for each GPU; '
                    'gpu_shared: all GPUs share the same threadpool.')
flags.DEFINE_integer('per_gpu_thread_count', 0,
                     'The number of threads to use for GPU. Only valid when '
                     'gpu_thread_mode is not global.')
flags.DEFINE_boolean('hierarchical_copy', False,
                     'Use hierarchical copies. Currently only optimized for '
                     'use on a DGX-1 with 8 GPUs and may perform poorly on '
                     'other hardware. Requires --num_gpus > 1, and only '
                     'recommended when --num_gpus=8')
# TODO(hinsu): Support auto-detection of the network topology while still
# retaining the ability to specify a particular topology for debugging.
flags.DEFINE_enum('network_topology', constants.NetworkTopology.DGX1,
                  (constants.NetworkTopology.DGX1,
                   constants.NetworkTopology.GCP_V100),
                  'Network topology specifies the topology used to connect '
                  'multiple devices. Network topology is used to decide the '
                  'hierarchy to use for the hierarchical_copy.')
flags.DEFINE_integer('gradient_repacking', 0,
                     'Use gradient repacking. It currently only works with '
                     'replicated mode. At the end of each step, it repacks '
                     'the gradients for more efficient cross-device '
                     'transportation. A non-zero value specifies the number '
                     'of split packs that will be formed.',
                     lower_bound=0)
flags.DEFINE_boolean('compact_gradient_transfer', True,
                     'Compact gradient as much as possible for cross-device '
                     'transfer and aggregation.')
flags.DEFINE_enum('variable_consistency', 'strong', ('strong', 'relaxed'),
                  'The data consistency for trainable variables. With strong '
                  'consistency, the variable always has the updates from the '
                  'previous step. With relaxed consistency, all the updates '
                  'will eventually show up in the variables. Likely one step '
                  'behind.')
flags.DEFINE_boolean('datasets_repeat_cached_sample', False,
                     'Enable use of a special datasets pipeline that reads a '
                     'single TFRecord into memory and repeats it infinitely '
                     'many times. The purpose of this flag is to make it '
                     'possible to write regression tests that are not '
                     'bottlenecked by CNS throughput. '
                     'Use datasets_use_caching to cache input data.')
flags.DEFINE_enum('local_parameter_device', 'gpu',
                  ('cpu', 'gpu', 'CPU', 'GPU'),
                  'Device to use as parameter server: cpu or gpu. For '
                  'distributed training, it can affect where caching of '
                  'variables happens.')
flags.DEFINE_enum('device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
                  'Device to use for computation: cpu or gpu')
flags.DEFINE_enum('data_format', 'NCHW', ('NHWC', 'NCHW'),
                  'Data layout to use: NHWC (TF native) or NCHW (cuDNN '
                  'native, requires GPU).')
flags.DEFINE_integer('num_intra_threads', None,
                     'Number of threads to use for intra-op parallelism. If '
                     'set to 0, the system will pick an appropriate number. '
                     'None is the same as 0 except that it disables intra-op '
                     'parallelism on a GPU.')
flags.DEFINE_integer('num_inter_threads', 0,
                     'Number of threads to use for inter-op parallelism. If '
                     'set to 0, the system will pick an appropriate number.')
flags.DEFINE_boolean('use_numa_affinity', False,
                     'Whether to turn on NUMA affinity for CPU devices. '
                     'This is probably only useful when --device=cpu.')
flags.DEFINE_string('trace_file', '',
                    'Enable TensorFlow tracing and write trace to this file.')
flags.DEFINE_boolean('use_chrome_trace_format', True,
                     'If True, the trace_file, if specified, will be in a '
                     'Chrome trace format. If False, then it will be a '
                     'StepStats raw proto.')
_NUM_STEPS_TO_PROFILE = 10
_NUM_OPS_TO_PRINT = 20
flags.DEFINE_string('tfprof_file', None,
                    'If specified, write a tfprof ProfileProto to this file. '
                    'The performance and other aspects of the model can then '
                    'be analyzed with tfprof. See '
                    'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/g3doc/command_line.md '  # pylint: disable=line-too-long
                    'for more info on how to do this. The first %d steps '
                    'are profiled. Additionally, the top %d most time '
                    'consuming ops will be printed.\n'
                    'Note: profiling with tfprof is very slow, but most of '
                    'the overhead is spent between steps. So, profiling '
                    'results are more accurate than the slowdown would '
                    'suggest.' % (_NUM_STEPS_TO_PROFILE, _NUM_OPS_TO_PRINT))
flags.DEFINE_string('graph_file', None,
                    'Write the model\'s graph definition to this file. '
                    'Defaults to binary format unless filename ends in "txt".')
flags.DEFINE_string('partitioned_graph_file_prefix', None,
                    'If specified, after the graph has been partitioned and '
                    'optimized, write out each partitioned graph to a file '
                    'with the given prefix.')
flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'),
                  'Optimizer to use')
flags.DEFINE_float('init_learning_rate', None,
                   'Initial learning rate for training.')
flags.DEFINE_string('piecewise_learning_rate_schedule', None,
                    'Specifies a piecewise learning rate schedule based on '
                    'the number of epochs. This is the form '
                    'LR0;E1;LR1;...;En;LRn, where each LRi is a learning rate '
                    'and each Ei is an epoch indexed from 0. The learning '
                    'rate is LRi if E(i-1) <= current_epoch < Ei. For '
                    'example, if this parameter is 0.3;10;0.2;25;0.1, the '
                    'learning rate is 0.3 for the first 10 epochs, then is '
                    '0.2 for the next 15 epochs, then is 0.1 until training '
                    'ends.')
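# --- Illustration (not part of the original file) ---
# A minimal sketch of how a schedule string like '0.3;10;0.2;25;0.1' can be
# interpreted (hypothetical helper, not the parser this benchmark uses):
def _sketch_piecewise_lr(schedule, epoch):
  parts = [float(x) for x in schedule.split(';')]
  rates, boundaries = parts[0::2], parts[1::2]
  for boundary, rate in zip(boundaries, rates):
    if epoch < boundary:
      return rate
  return rates[-1]
# _sketch_piecewise_lr('0.3;10;0.2;25;0.1', 5)  -> 0.3
# _sketch_piecewise_lr('0.3;10;0.2;25;0.1', 12) -> 0.2
# _sketch_piecewise_lr('0.3;10;0.2;25;0.1', 30) -> 0.1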
flags.DEFINE_float('num_epochs_per_decay', 0,
                   'Steps after which learning rate decays. If 0, the '
                   'learning rate does not decay.')
flags.DEFINE_float('learning_rate_decay_factor', 0,
                   'Learning rate decay factor. Decay by this factor every '
                   '`num_epochs_per_decay` epochs. If 0, learning rate does '
                   'not decay.')
flags.DEFINE_float('num_learning_rate_warmup_epochs', 0,
                   'Slowly increase to the initial learning rate in the first '
                   'num_learning_rate_warmup_epochs linearly.')
flags.DEFINE_float('minimum_learning_rate', 0,
                   'The minimum learning rate. The learning rate will '
                   'never decay past this value. Requires `learning_rate`, '
                   '`num_epochs_per_decay` and `learning_rate_decay_factor` '
                   'to be set.')
flags.DEFINE_float('resnet_base_lr', None,
                   "Base learning rate at bs=256. Only "
                   "relevant when training ResNet and utilizing the model's "
                   "learning rate heuristic (get_learning_rate).")
flags.DEFINE_float('momentum', 0.9, 'Momentum for training.')
flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')
flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum in RMSProp.')
flags.DEFINE_float('rmsprop_epsilon', 1.0, 'Epsilon term for RMSProp.')
flags.DEFINE_float('adam_beta1', 0.9, 'Beta1 term for the Adam optimizer')
flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer')
flags.DEFINE_float('adam_epsilon', 1e-8,
                   'Epsilon term for the Adam optimizer')
flags.DEFINE_float('gradient_clip', None,
                   'Gradient clipping magnitude. Disabled by default.')
flags.DEFINE_float('weight_decay', 0.00004,
                   'Weight decay factor for training.')
flags.DEFINE_float('gpu_memory_frac_for_testing', 0,
                   'If non-zero, the fraction of GPU memory that will be '
                   'used. Useful for testing the benchmark script, as this '
                   'allows distributed mode to be run on a single machine. '
                   'For example, if there are two tasks, each can be '
                   'allocated ~40 percent of the memory on a single machine. '
                   'This is also useful for using unified memory, as this can '
                   'be set above 1 to oversubscribe the GPU using unified '
                   'memory.',
                   lower_bound=0.)
flags.DEFINE_boolean('use_unified_memory', None,
                     'If True, allocate unified memory enabling larger models '
                     'to fit in available device RAM.')
flags.DEFINE_boolean('timestamped_allocator', False,
                     'If True marks free BFCAllocator::Chunks with time '
                     'at which they are freed which can allow more efficient '
                     'memory allocation in cases like RDMA networking.')
flags.DEFINE_integer('gpu_kt_max_interval', 0,
                     'If > 0, the maximum number of GPU Ops that may be '
                     'queued in a row without also queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_bytes', 0,
                     'If > 0, the maximum number of bytes '
                     'of GPU memory that may be allocated by sequential '
                     'GPU Ops without queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_pending', 0,
                     'If > 0 no more than this many GPU tracking events may '
                     'be outstanding at any time. When this limit is reached '
                     'launch of additional kernels will stall until an '
                     'outstanding event completes.')
flags.DEFINE_boolean('use_tf_layers', True,
                     'If True, use tf.layers for neural network layers. This '
                     'should not affect performance or accuracy in any way.')
flags.DEFINE_integer('tf_random_seed', 1234,
                     'The TensorFlow random seed. Useful for debugging NaNs, '
                     'as this can be set to various values to see if the NaNs '
                     'depend on the seed.')
flags.DEFINE_string('debugger', None,
                    'If set, use the TensorFlow debugger. If set to "cli", '
                    'use the local CLI debugger. Otherwise, this must be in '
                    'the form hostname:port (e.g., localhost:7007) in which '
                    'case the experimental TensorBoard debugger will be used')
flags.DEFINE_boolean('use_python32_barrier', False,
                     'When on, use threading.Barrier at Python 3.2.')
flags.DEFINE_boolean('ml_perf', False,
                     'When True, change how the Imagenet input pipeline works '
                     'slightly to meet the MLPerf compliance rules. This '
                     'slows down the input pipeline. Without this option, at '
                     'the end of the input pipeline, the image is divided by '
                     '127.5, then 1.0 is subtracted from it, bringing the '
                     'image values from [0, 255] to [-1.0, 1.0]. With this '
                     'option, each of the three channels (red, green, blue) '
                     'have the average channel value among all images '
                     'subtracted from it, and no division is done.')
flags.DEFINE_boolean('datasets_use_prefetch', True,
                     'Enable use of prefetched datasets for input pipeline. '
                     'This option is meaningless if use_datasets=False.')
flags.DEFINE_integer('datasets_prefetch_buffer_size', 1,
                     'Prefetching op buffer size per compute device.')
flags.DEFINE_integer('datasets_num_private_threads', None,
                     'Number of threads for a private threadpool created for '
                     'all datasets computation. By default, we pick an '
                     'appropriate number. If set to 0, we use the default '
                     'tf-Compute threads for dataset operations.')
flags.DEFINE_boolean('datasets_use_caching', False,
                     'Cache the compressed input data in memory. This '
                     'improves the data input performance, at the cost of '
                     'additional memory.')
flags.DEFINE_integer('datasets_parallel_interleave_cycle_length', None,
                     'Number of parallel file readers interleaving input '
                     'data.')
flags.DEFINE_boolean('datasets_sloppy_parallel_interleave', False,
                     'Allow parallel interleave to depart from deterministic '
                     'ordering, by temporarily skipping over files whose '
                     'elements are not readily available. This can increase '
                     'throughput in particular in the presence of stragglers.')
flags.DEFINE_integer('datasets_parallel_interleave_prefetch', None,
                     'The number of input elements to fetch before they are '
                     'needed for interleaving.')
flags.DEFINE_integer('multi_device_iterator_max_buffer_size', 1,
                     'Configuration parameter for the MultiDeviceIterator '
                     'that specifies the host side buffer size for each '
                     'device.')

# Performance tuning parameters.
flags.DEFINE_boolean('winograd_nonfused', True,
                     'Enable/disable using the Winograd non-fused algorithms.')
flags.DEFINE_boolean('batchnorm_persistent', True,
                     'Enable/disable using the '
                     'CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for batchnorm.')
flags.DEFINE_boolean('sync_on_finish', False,
                     'Enable/disable whether the devices are synced after '
                     'each step.')
flags.DEFINE_boolean('staged_vars', False,
                     'whether the variables are staged from the main '
                     'computation')
flags.DEFINE_boolean('force_gpu_compatible', False,
                     'whether to enable force_gpu_compatible in GPU_Options')
flags.DEFINE_boolean('allow_growth', None,
                     'whether to enable allow_growth in GPU_Options')
flags.DEFINE_boolean('xla', False,
                     'whether to enable XLA auto-jit compilation')
flags.DEFINE_boolean('xla_compile', False,
                     'Enable xla to compile the graph. Uncompilable ops will '
                     'result in fatal errors.')
flags.DEFINE_boolean('fuse_decode_and_crop', True,
                     'Fuse decode_and_crop for image preprocessing.')
flags.DEFINE_boolean('distort_color_in_yiq', True,
                     'Distort color of input images in YIQ space.')
flags.DEFINE_boolean('enable_optimizations', True,
                     'Whether to enable grappler and other optimizations.')
flags.DEFINE_string('rewriter_config', None,
                    'Config for graph optimizers, described as a '
                    'RewriterConfig proto buffer.')
flags.DEFINE_enum('loss_type_to_report', 'total_loss',
                  ('base_loss', 'total_loss'),
                  'Which type of loss to output and to write summaries for. '
                  'The total loss includes L2 loss while the base loss does '
                  'not. Note that the total loss is always used while '
                  'computing gradients during training if weight_decay > 0, '
                  'but explicitly computing the total loss, instead of just '
                  'computing its gradients, can have a performance impact.')
flags.DEFINE_boolean('single_l2_loss_op', False,
                     'If True, instead of using an L2 loss op per variable, '
                     'concatenate the variables into a single tensor and do a '
                     'single L2 loss on the concatenated tensor.')
flags.DEFINE_boolean('use_resource_vars', False,
                     'Use resource variables instead of normal variables. '
                     'Resource variables are slower, but this option is '
                     'useful for debugging their performance.')
flags.DEFINE_boolean('compute_lr_on_cpu', False,
                     'If True, do computations related to learning rate on '
                     'the CPU instead of the GPU. This will significantly '
                     'improve XLA performance in some cases.')
flags.DEFINE_boolean('sparse_to_dense_grads', False,
                     'If True, convert all sparse gradients to dense '
                     'gradients before passing them to the optimizer to '
                     'update variables. Only affects models with sparse '
                     'gradients, which currently is only the NCF model.')
# Performance tuning specific to MKL.
flags.DEFINE_boolean('mkl', False, 'If true, set MKL environment variables.')
flags.DEFINE_integer('kmp_blocktime', 0,
                     'The time, in milliseconds, that a thread should wait, '
                     'after completing the execution of a parallel region, '
                     'before sleeping')
flags.DEFINE_string('kmp_affinity', 'granularity=fine,verbose,compact,1,0',
                    'Restricts execution of certain threads (virtual '
                    'execution units) to a subset of the physical processing '
                    'units in a multiprocessor computer.')
flags.DEFINE_integer('kmp_settings', 1,
                     'If set to 1, MKL settings will be printed.')

# fp16 parameters. If use_fp16=False, no other fp16 parameters apply.
flags.DEFINE_boolean('use_fp16', False,
                     'Use 16-bit floats for certain tensors instead of 32-bit '
                     'floats. This is currently experimental.')
# TODO(reedwm): The default loss scale of 128 causes most models to diverge
# on the second step with synthetic data. Changing the tf.set_random_seed
# call to tf.set_random_seed(1235) or most other seed values causes the
# issue not to occur.
flags.DEFINE_float('fp16_loss_scale', None,
                   'If fp16 is enabled, the loss is multiplied by this amount '
                   'right before gradients are computed, then each gradient '
                   'is divided by this amount. Mathematically, this has no '
                   'effect, but it helps avoid fp16 underflow. Set to 1 to '
                   'effectively disable. Ignored during eval.')
flags.DEFINE_boolean('fp16_vars', False,
                     'If fp16 is enabled, also use fp16 for variables. If '
                     'False, the variables are stored in fp32 and casted to '
                     'fp16 when retrieved. Recommended to leave as False.')
flags.DEFINE_boolean('fp16_enable_auto_loss_scale', False,
                     'If True and use_fp16 is True, automatically adjust the '
                     'loss scale during training.')
flags.DEFINE_integer('fp16_inc_loss_scale_every_n', 1000,
                     'If fp16 is enabled and fp16_enable_auto_loss_scale is '
                     'True, increase the loss scale every n steps.')

# The method for managing variables:
#   parameter_server: variables are stored on a parameter server that holds
#       the master copy of the variable. In local execution, a local device
#       acts as the parameter server for each variable; in distributed
#       execution, the parameter servers are separate processes in the
#       cluster.
#       For each step, each tower gets a copy of the variables from the
#       parameter server, and sends its gradients to the param server.
#   replicated: each GPU has its own copy of the variables. To apply
#       gradients, an all_reduce algorithm or regular cross-device
#       aggregation is used to replicate the combined gradients to all
#       towers (depending on all_reduce_spec parameter setting).
#   independent: each GPU has its own copy of the variables, and gradients
#       are not shared between towers. This can be used to check performance
#       when no data is moved between GPUs.
#   distributed_replicated: Distributed training only. Each GPU has a copy
#       of the variables, and updates its copy after the parameter servers
#       are all updated with the gradients from all servers. Only works with
#       cross_replica_sync=true. Unlike 'replicated', currently never uses
#       nccl all-reduce for replicating within a server.
#   distributed_all_reduce: Distributed training where all replicas run
#       in a single session, using all-reduce to mutually reduce the
#       gradients. Uses no parameter servers. When there is only one
#       worker, this is the same as replicated.
#   collective_all_reduce: Distributed training where all replicas run
#       independently except for variable initialization and for
#       gradient reduction which is done via collective all-reduce.
#       NOTE: collective_all_reduce in conjunction with use_fp16 can
#       lead to NaNs in some models (resnet50). TODO(tucker): fix it.
#   horovod: Distributed training using Horovod library. Runs workers using
#       an MPI framework (e.g. Open MPI). Each worker runs training on
#       single GPU, and averages gradients using NCCL or MPI all-reduce.
#       See https://github.com/uber/horovod for more details.
flags.DEFINE_enum('variable_update', 'parameter_server',
                  ('parameter_server', 'replicated', 'distributed_replicated',
                   'independent', 'distributed_all_reduce',
                   'collective_all_reduce', 'horovod'),
                  'The method for managing variables: parameter_server, '
                  'replicated, distributed_replicated, independent, '
                  'distributed_all_reduce, collective_all_reduce, horovod')
flags.DEFINE_string('all_reduce_spec', None,
                    'A specification of the all_reduce algorithm to be used '
                    'for reducing gradients. For more details, see '
                    'parse_all_reduce_spec in variable_mgr.py. An '
                    'all_reduce_spec has BNF form:\n'
                    'int ::= positive whole number\n'
                    'g_int ::= int[KkMGT]?\n'
                    'alg_spec ::= alg | alg#int\n'
                    'range_spec ::= alg_spec | alg_spec/alg_spec\n'
                    'spec ::= range_spec | range_spec:g_int:range_spec\n'
                    'NOTE: not all syntactically correct constructs are '
                    'supported.\n\n'
                    'Examples:\n'
                    '"xring" == use one global ring reduction for all '
                    'tensors\n'
                    '"pscpu" == use CPU at worker 0 to reduce all tensors\n'
                    '"nccl" == use NCCL to locally reduce all tensors. '
                    'Limited to 1 worker.\n'
                    '"nccl/xring" == locally (to one worker) reduce values '
                    'using NCCL then ring reduce across workers.\n'
                    '"pscpu:32k:xring" == use pscpu algorithm for tensors of '
                    'size up to 32kB, then xring for larger tensors.')
# If variable_update==distributed_all_reduce then it may be advantageous
# to aggregate small tensors into one prior to reduction. These parameters
# control that aggregation.
flags.DEFINE_integer('agg_small_grads_max_bytes', 0,
                     'If > 0, try to aggregate tensors of less than this '
                     'number of bytes prior to all-reduce.')
flags.DEFINE_integer('agg_small_grads_max_group', 10,
                     'When aggregating small tensors for all-reduce do not '
                     'aggregate more than this many into one new tensor.')
flags.DEFINE_integer('allreduce_merge_scope', 1,
                     'Establish a name scope around this many '
                     'gradients prior to creating the all-reduce operations. '
                     'It may affect the ability of the backend to merge '
                     'parallel ops.')

# Distributed training parameters.
flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''),
                  'One of "ps", "worker", "controller", "". Empty for local '
                  'training')
flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('controller_host', None, 'optional controller host')
flags.DEFINE_integer('task_index', 0, 'Index of task within the job')
flags.DEFINE_string('server_protocol', 'grpc', 'protocol for servers')
flags.DEFINE_boolean('cross_replica_sync', True, '')
flags.DEFINE_string('horovod_device', '',
                    'Device to do Horovod all-reduce on: '
                    'empty (default), cpu or gpu. Default will utilize GPU if '
                    'Horovod was compiled with the HOROVOD_GPU_ALLREDUCE '
                    'option, and CPU otherwise.')

# Summary and Save & load checkpoints.
flags.DEFINE_integer('summary_verbosity', 0,
                     'Verbosity level for summary ops. '
                     'level 0: disable any summary.\n'
                     'level 1: small and fast ops, e.g.: learning_rate, '
                     'total_loss.\n'
                     'level 2: medium-cost ops, e.g. histogram of all '
                     'gradients.\n'
                     'level 3: expensive ops: images and histogram of each '
                     'gradient.\n')
flags.DEFINE_integer('save_summaries_steps', 0,
                     'How often to save summaries for trained models. Pass 0 '
                     'to disable summaries.')
flags.DEFINE_integer('save_model_secs', 0,
                     'How often to save trained models. Pass 0 to disable '
                     'saving checkpoints every N seconds. A checkpoint is '
                     'saved after training completes regardless of this '
                     'option.')
flags.DEFINE_integer('save_model_steps', None,
                     'How often to save trained models. If specified, '
                     'save_model_secs must not be specified.')
flags.DEFINE_integer('max_ckpts_to_keep', 5,
                     'Max number of checkpoints to keep.')
flags.DEFINE_string('train_dir', None,
                    'Path to session checkpoints. Pass None to disable saving '
                    'checkpoint at the end.')
flags.DEFINE_string('eval_dir', '/tmp/tf_cnn_benchmarks/eval',
                    'Directory where to write eval event logs.')
flags.DEFINE_string('backbone_model_path', None,
                    'Path to pretrained backbone model checkpoint. Pass None '
                    'if not using a backbone model.')
flags.DEFINE_enum('trt_mode', '', ['', 'FP32', 'FP16', 'INT8'],
                  'If this is specified in forward_only mode and '
                  'freeze_when_forward_only is set to True, use TensorRT to '
                  'optimize the graph before execution.')
flags.DEFINE_integer('trt_max_workspace_size_bytes', 4 << 30,
                     'Max workspace size bytes used by the TensorRT '
                     'optimizer.')

# Benchmark logging for model garden metric
flags.DEFINE_string('benchmark_log_dir', None,
                    'The directory to place the log files containing the '
                    'results of benchmark. The logs are created by '
                    'BenchmarkFileLogger. Requires the root of the Tensorflow '
                    'models repository to be in $PYTHONPATH.')
flags.DEFINE_string('benchmark_test_id', None,
                    'The unique test ID of the benchmark run. It could be the '
                    'combination of key parameters. It is hardware '
                    'independent and could be used to compare the performance '
                    'between different test runs. This flag is designed for '
                    'human consumption, and does not have any impact within '
                    'the system.')

platforms_util.define_platform_params()
class GlobalStepWatcher(threading.Thread):
  """A helper class for global_step.

  Polls for changes in the global_step of the model, and finishes when the
  number of steps for the global run are done.
  """

  def __init__(self, sess, global_step_op, start_at_global_step,
               end_at_global_step):
    threading.Thread.__init__(self)
    self.sess = sess
    self.global_step_op = global_step_op
    self.start_at_global_step = start_at_global_step
    self.end_at_global_step = end_at_global_step

    self.start_time = 0
    self.start_step = 0
    self.finish_time = 0
    self.finish_step = 0

  def run(self):
    while self.finish_time == 0:
      time.sleep(.25)
      global_step_val, = self.sess.run([self.global_step_op])
      if self.start_time == 0 and global_step_val >= self.start_at_global_step:
        # Use tf.logging.info instead of log_fn, since print (which is log_fn)
        # is not thread safe and may interleave the outputs from two parallel
        # calls to print, which can break tests.
        tf.logging.info('Starting real work at step %s at time %s' %
                        (global_step_val, time.ctime()))
        self.start_time = time.time()
        self.start_step = global_step_val
      if self.finish_time == 0 and global_step_val >= self.end_at_global_step:
        tf.logging.info('Finishing real work at step %s at time %s' %
                        (global_step_val, time.ctime()))
        self.finish_time = time.time()
        self.finish_step = global_step_val

  def done(self):
    return self.finish_time > 0

  def num_steps(self):
    return self.finish_step - self.start_step

  def elapsed_time(self):
    return self.finish_time - self.start_time


class CheckpointNotFoundException(Exception):
  pass


def create_config_proto(params):
  """Returns session config proto.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
  """
  config = tf.ConfigProto()
  config.allow_soft_placement = True
  if params.num_intra_threads is None:
    if params.device == 'gpu':
      config.intra_op_parallelism_threads = 1
  else:
    config.intra_op_parallelism_threads = params.num_intra_threads
  config.inter_op_parallelism_threads = params.num_inter_threads
  config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
  config.gpu_options.experimental.collective_ring_order = params.gpu_indices
  config.gpu_options.force_gpu_compatible = params.force_gpu_compatible
  config.experimental.use_numa_affinity = params.use_numa_affinity
  if params.device == 'cpu':
    # TODO(tucker): change num_gpus to num_devices
    config.device_count['CPU'] = params.num_gpus
  if params.allow_growth is not None:
    config.gpu_options.allow_growth = params.allow_growth
  if params.gpu_memory_frac_for_testing > 0:
    config.gpu_options.per_process_gpu_memory_fraction = (
        params.gpu_memory_frac_for_testing)
  if params.use_unified_memory:
    config.gpu_options.experimental.use_unified_memory = (
        params.use_unified_memory)
  if params.timestamped_allocator:
    config.gpu_options.experimental.timestamped_allocator = (
        params.timestamped_allocator)
  if params.gpu_kt_max_interval > 0:
    config.gpu_options.experimental.kernel_tracker_max_interval = (
        params.gpu_kt_max_interval)
  if params.gpu_kt_max_bytes > 0:
    config.gpu_options.experimental.kernel_tracker_max_bytes = (
        params.gpu_kt_max_bytes)
  if params.gpu_kt_max_pending > 0:
    config.gpu_options.experimental.kernel_tracker_max_pending = (
        params.gpu_kt_max_pending)
  if params.xla:
    config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  if params.rewriter_config:
    rewriter_config = rewriter_config_pb2.RewriterConfig()
    text_format.Merge(params.rewriter_config, rewriter_config)
    config.graph_options.rewrite_options.CopyFrom(rewriter_config)
  elif not params.enable_optimizations:
    config.graph_options.optimizer_options.opt_level = tf.OptimizerOptions.L0
    config.graph_options.rewrite_options.disable_meta_optimizer = True
  elif params.variable_update == 'collective_all_reduce':
    rewrite_options = config.graph_options.rewrite_options
    rewrite_options.scoped_allocator_optimization = (
        rewriter_config_pb2.RewriterConfig.ON)
    rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
  if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    config.gpu_options.visible_device_list = str(hvd.local_rank())
  # For collective_all_reduce, ignore all devices except current worker.
  if params.variable_update == 'collective_all_reduce':
    del config.device_filters[:]
    config.device_filters.append(
        '/job:%s/replica:0/task:%d' % (params.job_name, params.task_index))

  # TODO(b/117324590): Re-enable PinToHostOptimizer when b/117324590 is fixed.
  # Currently we have to disable PinToHostOptimizer w/ XLA since it causes
  # OOM/perf cliffs.
  config.graph_options.rewrite_options.pin_to_host_optimization = (
      rewriter_config_pb2.RewriterConfig.OFF)
  return config
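
# Illustrative usage sketch (not part of the original file): assuming the
# flags above have been registered, a session config for a local run could be
# obtained roughly as follows. The specific parameter values are hypothetical.
#
#   params = make_params(num_gpus=1)
#   config = create_config_proto(params)
#   sess = tf.Session(config=config)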


def get_mode_from_params(params):
  """Returns the mode in which this script is running.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.

  Raises:
    ValueError: Unsupported params settings.
  """
  if params.forward_only and params.eval:
    raise ValueError('Only one of forward_only and eval parameters is true')

  if params.eval:
    return constants.BenchmarkMode.EVAL
  elif params.forward_only:
    return constants.BenchmarkMode.FORWARD_ONLY
  elif (params.eval_during_training_every_n_steps or
        params.eval_during_training_every_n_epochs or
        params.eval_during_training_at_specified_steps or
        params.eval_during_training_at_specified_epochs):
    return constants.BenchmarkMode.TRAIN_AND_EVAL
  else:
    return constants.BenchmarkMode.TRAIN


# How many digits to show for the loss and accuracies during training.
LOSS_AND_ACCURACY_DIGITS_TO_SHOW = 3


def benchmark_one_step(sess,
                       fetches,
                       step,
                       batch_size,
                       step_train_times,
                       trace_filename,
                       partitioned_graph_file_prefix,
                       profiler,
                       image_producer,
                       params,
                       summary_op=None,
                       show_images_per_sec=True,
                       benchmark_logger=None,
                       collective_graph_key=0):
  """Advance one step of benchmarking."""
  should_profile = profiler and 0 <= step < _NUM_STEPS_TO_PROFILE
  need_options_and_metadata = (
      should_profile or collective_graph_key > 0 or
      ((trace_filename or partitioned_graph_file_prefix) and step == -2)
  )
  if need_options_and_metadata:
    run_options = tf.RunOptions()
    if (trace_filename and step == -2) or should_profile:
      run_options.trace_level = tf.RunOptions.FULL_TRACE
    if partitioned_graph_file_prefix and step == -2:
      run_options.output_partition_graphs = True
    if collective_graph_key > 0:
      run_options.experimental.collective_graph_key = collective_graph_key
    run_metadata = tf.RunMetadata()
  else:
    run_options = None
    run_metadata = None
  summary_str = None
  start_time = time.time()
  if summary_op is None:
    results = sess.run(fetches, options=run_options,
                       run_metadata=run_metadata)
  else:
    (results, summary_str) = sess.run(
        [fetches, summary_op], options=run_options,
        run_metadata=run_metadata)
  if not params.forward_only:
    lossval = results['average_loss']
  else:
    lossval = 0.
  if image_producer is not None:
    image_producer.notify_image_consumption()
  train_time = time.time() - start_time
  step_train_times.append(train_time)
  if (show_images_per_sec and step >= 0 and
      (step == 0 or (step + 1) % params.display_every == 0)):
    speed_mean, speed_uncertainty, speed_jitter = get_perf_timing(
        batch_size, step_train_times, params.display_perf_ewma)
    log_str = '%i\t%s\t%.*f' % (
        step + 1,
        get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter),
        LOSS_AND_ACCURACY_DIGITS_TO_SHOW, lossval)
    if 'top_1_accuracy' in results:
      log_str += '\t%.*f\t%.*f' % (
          LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_1_accuracy'],
          LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_5_accuracy'])
    log_fn(log_str)
    if benchmark_logger:
      benchmark_logger.log_metric(
          'current_examples_per_sec', speed_mean, global_step=step + 1)
      if 'top_1_accuracy' in results:
        benchmark_logger.log_metric(
            'top_1_accuracy', results['top_1_accuracy'],
            global_step=step + 1)
        benchmark_logger.log_metric(
            'top_5_accuracy', results['top_5_accuracy'],
            global_step=step + 1)
  if need_options_and_metadata:
    if should_profile:
      profiler.add_step(step, run_metadata)
    if trace_filename and step == -2:
      log_fn('Dumping trace to %s' % trace_filename)
      trace_dir = os.path.dirname(trace_filename)
      if not gfile.Exists(trace_dir):
        gfile.MakeDirs(trace_dir)
      with gfile.Open(trace_filename, 'w') as trace_file:
        if params.use_chrome_trace_format:
          trace = timeline.Timeline(step_stats=run_metadata.step_stats)
          trace_file.write(
              trace.generate_chrome_trace_format(show_memory=True))
        else:
          trace_file.write(str(run_metadata.step_stats))
    if partitioned_graph_file_prefix and step == -2:
      path, filename = os.path.split(partitioned_graph_file_prefix)
      if '.' in filename:
        base_filename, ext = filename.rsplit('.', 1)
        ext = '.' + ext
      else:
        base_filename, ext = filename, ''
      as_text = filename.endswith('txt')
      for graph_def in run_metadata.partition_graphs:
        device = graph_def.node[0].device.replace('/', '_').replace(':', '_')
        graph_filename = '%s%s%s' % (base_filename, device, ext)
        log_fn('Writing partitioned GraphDef as %s to %s' % (
            'text' if as_text else 'binary',
            os.path.join(path, graph_filename)))
        tf.train.write_graph(graph_def, path, graph_filename, as_text)
  return (summary_str, lossval)


def get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter, scale=1):
  if scale == 1:
    # TODO(laigd): rename 'images' to maybe 'inputs', same below.
    return ('images/sec: %.1f +/- %.1f (jitter = %.1f)' %
            (speed_mean, speed_uncertainty, speed_jitter))
  else:
    return 'images/sec: %.1f' % speed_mean


def get_perf_timing(batch_size, step_train_times, ewma_alpha=None, scale=1):
  """Calculate benchmark processing speed."""
  times = np.array(step_train_times)
  speeds = batch_size / times
  if ewma_alpha:
    weights = np.logspace(len(times) - 1, 0, len(times), base=1 - ewma_alpha)
    time_mean = np.average(times, weights=weights)
  else:
    time_mean = np.mean(times)
  speed_mean = scale * batch_size / time_mean
  speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds)))
  speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds)))
  return speed_mean, speed_uncertainty, speed_jitter
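
# Worked example of the statistics above (illustrative numbers): with
# batch_size=256 and step_train_times=[0.50, 0.52, 0.48, 0.51] seconds,
# speeds = 256 / times = [512.0, 492.3, 533.3, 502.0] images/sec, so
# speed_mean = 256 / mean(times) ~= 509.5, speed_uncertainty =
# std(speeds) / sqrt(4) ~= 7.6, and speed_jitter (1.4826 times the median
# absolute deviation of the speeds) ~= 14.6.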


def load_checkpoint(saver, sess, ckpt_dir):
  """Loads checkpoint from provided directory or full path.

  Args:
    saver: Saver used to restore the checkpoint.
    sess: TensorFlow session.
    ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.

  Returns:
    Global step.
  """
  model_checkpoint_path = _get_checkpoint_to_load(ckpt_dir)
  global_step = model_checkpoint_path.split('/')[-1].split('-')[-1]
  if not global_step.isdigit():
    global_step = 0
  else:
    global_step = int(global_step)
  saver.restore(sess, model_checkpoint_path)
  log_fn('Successfully loaded model from %s.' % model_checkpoint_path)
  return global_step


def _get_checkpoint_to_load(ckpt_dir):
  """Returns which checkpoint to load.

  Args:
    ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.

  Returns:
    Full path to checkpoint to load.

  Raises:
    CheckpointNotFoundException: If checkpoint is not found.
  """
  p = re.compile(r'ckpt-\d+$')
  if p.search(ckpt_dir):
    model_checkpoint_path = ckpt_dir
  else:
    # Finds latest checkpoint in directory provided
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
      model_checkpoint_path = ckpt.model_checkpoint_path
    else:
      raise CheckpointNotFoundException(
          'No checkpoint file found in dir:{}'.format(ckpt_dir))
  return model_checkpoint_path
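
# Illustrative behavior (not part of the original file): a path ending in
# something like 'model.ckpt-12345' matches the r'ckpt-\d+$' pattern above and
# is used directly, while a directory such as '/tmp/train_dir' (hypothetical)
# is resolved through tf.train.get_checkpoint_state() to its latest
# checkpoint.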


# Params are passed to BenchmarkCNN's constructor. Params is a map from name
# to value, with one field per key in flags.param_specs.
#
# Call make_params() or make_params_from_flags() below to construct a Params
# tuple with default values from flags.param_specs, rather than constructing
# Params directly.
Params = namedtuple('Params', flags.param_specs.keys())  # pylint: disable=invalid-name


def validate_params(params):
  """Validates that the Params tuple had valid values.

  When command-line flags are defined for each ParamSpec by calling
  flags.define_flags(), calling this function is unnecessary because absl
  already does flag validation. Otherwise, this function should be called.

  Args:
    params: A Params tuple.
  Raises:
    ValueError: An element of params had an invalid value.
  """
  for name, value in params._asdict().items():
    param_spec = flags.param_specs[name]
    if param_spec.flag_type in ('integer', 'float'):
      if (value is not None and
          param_spec.kwargs['lower_bound'] is not None and
          value < param_spec.kwargs['lower_bound']):
        raise ValueError('Param %s value of %s is lower than the lower bound '
                         'of %s' %
                         (name, value, param_spec.kwargs['lower_bound']))
      if (value is not None and
          param_spec.kwargs['upper_bound'] is not None and
          param_spec.kwargs['upper_bound'] < value):
        raise ValueError('Param %s value of %s is higher than the upper bound '
                         'of %s' %
                         (name, value, param_spec.kwargs['upper_bound']))
    elif (value is not None and param_spec.flag_type == 'enum' and
          value not in param_spec.kwargs['enum_values']):
      raise ValueError('Param %s of value %s is not in %s' %
                       (name, value, param_spec.kwargs['enum_values']))


def make_params(**kwargs):
  """Create a Params tuple for BenchmarkCNN from kwargs.

  Default values are filled in from flags.param_specs.

  Args:
    **kwargs: kwarg values will override the default values.
  Returns:
    Params namedtuple for constructing BenchmarkCNN.
  """
  # Create a (name: default_value) map from flags.param_specs.
  default_kwargs = {
      name: flags.param_specs[name].default_value
      for name in flags.param_specs
  }
  params = Params(**default_kwargs)._replace(**kwargs)
  validate_params(params)
  return params


def make_params_from_flags():
  """Create a Params tuple for BenchmarkCNN from absl_flags.FLAGS.

  Returns:
    Params namedtuple for constructing BenchmarkCNN.
  """
  # Collect (name: value) pairs for absl_flags.FLAGS with matching names in
  # flags.param_specs.
  flag_values = {name: getattr(absl_flags.FLAGS, name)
                 for name in flags.param_specs.keys()}
  return Params(**flag_values)


def remove_param_fields(params, fields_to_remove):
  """Remove fields from a Params namedtuple."""
  params_dict = params._asdict()
  for field in fields_to_remove:
    assert field in params_dict, 'Invalid Params field: ' + field
  params_dict = {k: v for k, v in params_dict.items()
                 if k not in fields_to_remove}
  new_params_type = namedtuple('Params', params_dict.keys())
  return new_params_type(**params_dict)
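
# Illustrative usage sketch (not part of the original file): overriding a few
# defaults when driving the benchmark from Python rather than from flags. The
# parameter values below are hypothetical.
#
#   params = make_params(model='resnet50', batch_size=64, num_gpus=2)
#   bench = BenchmarkCNN(params)
#   bench.print_info()
#   bench.run()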


def get_num_batches_and_epochs(params, batch_size, num_examples_per_epoch):
  """Returns the number of batches and epochs to run for.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
    batch_size: The number of images per step.
    num_examples_per_epoch: The number of images in a single epoch.

  Returns:
    num_batches: The number of batches to run for.
    num_epochs: The number of epochs to run for. This might be slightly
      different from params.num_epochs if specified, because the number of
      batches must be an integer.

  Raises:
    ValueError: Invalid or unsupported params.
  """
  if params.num_batches and params.num_epochs:
    raise ValueError('At most one of --num_batches and --num_epochs may be '
                     'specified.')
  if params.num_epochs:
    num_batches = int(params.num_epochs * num_examples_per_epoch +
                      batch_size - 1) // batch_size
  else:
    num_batches = params.num_batches or _DEFAULT_NUM_BATCHES
  num_epochs = num_batches * batch_size / num_examples_per_epoch
  return (num_batches, num_epochs)
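
# Worked example of the rounding above (illustrative numbers): with
# num_epochs=1, batch_size=256 and num_examples_per_epoch=50000,
#   num_batches = (1 * 50000 + 256 - 1) // 256 = 50255 // 256 = 196
#   num_epochs  = 196 * 256 / 50000 ~= 1.004
# i.e. the batch count is rounded so that at least the requested number of
# examples is consumed.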


def get_piecewise_learning_rate(piecewise_learning_rate_schedule,
                                global_step, num_batches_per_epoch):
  """Returns a piecewise learning rate tensor.

  Args:
    piecewise_learning_rate_schedule: The --piecewise_learning_rate_schedule
      parameter
    global_step: Scalar tensor representing the global step.
    num_batches_per_epoch: float indicating the number of batches per epoch.

  Returns:
    A scalar float tensor, representing the learning rate.

  Raises:
    ValueError: piecewise_learning_rate_schedule is not formatted correctly.
  """
  pieces = piecewise_learning_rate_schedule.split(';')
  if len(pieces) % 2 == 0:
    raise ValueError('--piecewise_learning_rate_schedule must have an odd '
                     'number of components')
  values = []
  boundaries = []
  for i, piece in enumerate(pieces):
    if i % 2 == 0:
      try:
        values.append(float(piece))
      except ValueError:
        raise ValueError('Invalid learning rate: ' + piece)
    else:
      try:
        boundaries.append(int(int(piece) * num_batches_per_epoch) - 1)
      except ValueError:
        raise ValueError('Invalid epoch: ' + piece)
  return tf.train.piecewise_constant(global_step, boundaries, values,
                                     name='piecewise_learning_rate')
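
# Illustrative example of the expected schedule format (not part of the
# original file): learning-rate values occupy the even positions and epoch
# boundaries the odd positions, e.g. a hypothetical
#   --piecewise_learning_rate_schedule='0.1;30;0.01;60;0.001'
# uses a learning rate of 0.1 until epoch 30, 0.01 until epoch 60, and 0.001
# afterwards; the epoch boundaries are converted to global steps using
# num_batches_per_epoch above.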


def get_learning_rate(params, global_step, num_examples_per_epoch, model,
                      batch_size):
  """Returns a learning rate tensor based on global_step.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
    global_step: Scalar tensor representing the global step.
    num_examples_per_epoch: The number of examples per epoch.
    model: The model.Model object to obtain the default learning rate from if
      no learning rate is specified.
    batch_size: Number of examples per step

  Returns:
    A scalar float tensor, representing the learning rate. When evaluated, the
    learning rate depends on the current value of global_step.

  Raises:
    ValueError: Invalid or unsupported params.
  """
  with tf.name_scope('learning_rate'):
    num_batches_per_epoch = num_examples_per_epoch / batch_size

    if params.piecewise_learning_rate_schedule:
      if (params.init_learning_rate is not None or
          params.learning_rate_decay_factor or
          params.minimum_learning_rate or params.num_epochs_per_decay):
        raise ValueError('No other learning rate-related flags can be '
                         'specified if --piecewise_learning_rate_schedule is '
                         'specified')
      learning_rate = get_piecewise_learning_rate(
          params.piecewise_learning_rate_schedule,
          global_step, num_batches_per_epoch)
    elif params.init_learning_rate is not None:
      learning_rate = params.init_learning_rate
      if (params.num_epochs_per_decay > 0 and
          params.learning_rate_decay_factor > 0):
        decay_steps = int(num_batches_per_epoch * params.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        learning_rate = tf.train.exponential_decay(
            params.init_learning_rate,
            global_step,
            decay_steps,
            params.learning_rate_decay_factor,
            staircase=True)

        if params.minimum_learning_rate != 0.:
          learning_rate = tf.maximum(learning_rate,
                                     params.minimum_learning_rate)
    else:
      learning_rate = model.get_learning_rate(global_step, batch_size)
    if params.num_learning_rate_warmup_epochs > 0 and (
        params.init_learning_rate is not None or
        params.piecewise_learning_rate_schedule):
      warmup_steps = int(num_batches_per_epoch *
                         params.num_learning_rate_warmup_epochs)
      init_lr = params.init_learning_rate
      if init_lr is None:
        init_lr = float(params.piecewise_learning_rate_schedule.split(';')[0])
      warmup_lr = init_lr * tf.cast(global_step, tf.float32) / tf.cast(
          warmup_steps, tf.float32)
      learning_rate = tf.cond(global_step < warmup_steps,
                              lambda: warmup_lr, lambda: learning_rate)

  learning_rate = mlperf.logger.log_deferred_tensor_value(
      mlperf.tags.OPT_LR, learning_rate, global_step, every_n=100)
  return learning_rate
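
# Sketch of the linear warmup above (illustrative numbers): with
# init_learning_rate=0.1, num_learning_rate_warmup_epochs=5 and
# num_batches_per_epoch=1000, warmup_steps is 5000, and for global_step below
# 5000 the effective rate is 0.1 * global_step / 5000, after which the regular
# (exponentially decayed or piecewise) schedule takes over.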


def get_optimizer(params, learning_rate):
  """Returns the optimizer that should be used based on params."""
  if params.optimizer == 'momentum':
    mlperf.logger.log(key=mlperf.tags.OPT_NAME,
                      value=mlperf.tags.SGD_WITH_MOMENTUM)
    mlperf.logger.log(key=mlperf.tags.OPT_MOMENTUM, value=params.momentum)
    opt = tf.train.MomentumOptimizer(
        learning_rate, params.momentum, use_nesterov=True)
  elif params.optimizer == 'sgd':
    mlperf.logger.log(key=mlperf.tags.OPT_NAME, value=mlperf.tags.SGD)
    opt = tf.train.GradientDescentOptimizer(learning_rate)
  elif params.optimizer == 'rmsprop':
    opt = tf.train.RMSPropOptimizer(
        learning_rate,
        params.rmsprop_decay,
        momentum=params.rmsprop_momentum,
        epsilon=params.rmsprop_epsilon)
  elif params.optimizer == 'adam':
    opt = tf.train.AdamOptimizer(learning_rate, params.adam_beta1,
                                 params.adam_beta2, params.adam_epsilon)
  else:
    raise ValueError('Optimizer "{}" was not recognized'.
                     format(params.optimizer))
  return opt


def generate_tfprof_profile(profiler, tfprof_file):
  """Generates a tfprof profile, writing it to a file and printing top ops.

  Args:
    profiler: A tf.profiler.Profiler. `profiler.add_step` must have already
      been called.
    tfprof_file: The filename to write the ProfileProto to.
  """
  profile_proto = profiler.serialize_to_string()
  log_fn('Dumping ProfileProto to %s' % tfprof_file)
  with gfile.Open(tfprof_file, 'wb') as f:
    f.write(profile_proto)

  # Print out the execution times of the top operations. Note this
  # information can also be obtained with the dumped ProfileProto, but
  # printing it means tfprof doesn't have to be used if all the user wants
  # is the top ops.
  options = tf.profiler.ProfileOptionBuilder.time_and_memory()
  options['max_depth'] = _NUM_OPS_TO_PRINT
  options['order_by'] = 'accelerator_micros'
  profiler.profile_operations(options)
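
# Illustrative usage sketch (not part of the original file), assuming a
# profiler that has already recorded steps via profiler.add_step() as done in
# benchmark_one_step() above; the output path is hypothetical.
#
#   profiler = tf.profiler.Profiler(sess.graph)
#   # ... run steps, passing run_metadata to profiler.add_step(step, ...)
#   generate_tfprof_profile(profiler, '/tmp/tfprof.pb')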


class BenchmarkCNN(object):
  """Class for benchmarking a cnn network."""

  def __init__(self, params, dataset=None, model=None):
    """Initialize BenchmarkCNN.

    Args:
      params: Params tuple, typically created by make_params or
        make_params_from_flags.
      dataset: If not None, the dataset to use. Otherwise, params is used to
        obtain the dataset.
      model: If not None, the model to use. Otherwise, params is used to
        obtain the model.
    Raises:
      ValueError: Unsupported params settings.
    """
    mlperf.logger.log(key=mlperf.tags.RUN_START)
    self.params = params
    if params.eval:
      self._doing_eval = True
    else:
      # Note self._doing_eval can later switch to True in self._do_eval() if
      # self.params.eval_during_training_* is specified.
      self._doing_eval = False
    self.dataset = dataset or datasets.create_dataset(self.params.data_dir,
                                                      self.params.data_name)
    self.model = model or model_config.get_model_config(
        self.params.model, self.dataset, self.params)
    self.trace_filename = self.params.trace_file
    self.rewriter_config = self.params.rewriter_config
    autotune_threshold = self.params.autotune_threshold if (
        self.params.autotune_threshold) else 1
    min_autotune_warmup = 5 * autotune_threshold * autotune_threshold
    self.num_warmup_batches = self.params.num_warmup_batches if (
        self.params.num_warmup_batches is not None) else max(
            10, min_autotune_warmup)
    self.graph_file = self.params.graph_file
    self.resize_method = self.params.resize_method
    self.sync_queue_counter = 0
    self.num_gpus = self.params.num_gpus
    if self.params.gpu_indices:
      self.gpu_indices = [int(x) for x in self.params.gpu_indices.split(',')]
    else:
      self.gpu_indices = [x for x in range(self.num_gpus)]

    if (self.params.device == 'cpu' and self.params.data_format == 'NCHW' and
        not self.params.mkl):
      raise ValueError('device=cpu requires that data_format=NHWC')

    if ((self.params.num_epochs_per_decay or
         self.params.learning_rate_decay_factor) and
        not (self.params.init_learning_rate is not None and
             self.params.num_epochs_per_decay and
             self.params.learning_rate_decay_factor)):
      raise ValueError('If one of num_epochs_per_decay or '
                       'learning_rate_decay_factor is set, both must be set '
                       'and learning_rate must be set')
    if (self.params.minimum_learning_rate and
        not (self.params.init_learning_rate is not None and
             self.params.num_epochs_per_decay and
             self.params.learning_rate_decay_factor)):
      raise ValueError('minimum_learning_rate requires learning_rate, '
                       'num_epochs_per_decay, and '
                       'learning_rate_decay_factor to be set')

    if (self.params.use_fp16 and self.params.fp16_vars and
        'replicated' in self.params.variable_update and
        self.params.all_reduce_spec and
        'nccl' in self.params.all_reduce_spec):
      raise ValueError('fp16 variables are not supported with NCCL')
    if (self.params.use_fp16 and self.params.fp16_vars and
        self.params.gradient_repacking):
      raise ValueError('--fp16_vars cannot be used with --gradient_repacking')

    if self.params.variable_update == 'horovod' and self.params.num_gpus > 1:
      raise ValueError('Horovod benchmarks require num_gpus=1 on each worker')

    if self.params.variable_update == 'horovod' and self.params.job_name:
      raise ValueError('job_name should not be specified for Horovod.')

    if self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale:
      if self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec:
        raise ValueError('Automatic loss scaling is not supported with NCCL.')
      if self.params.variable_update not in ('parameter_server', 'replicated',
                                             'independent'):
        raise ValueError('Automatic loss scaling is not supported with '
                         'variable_update=%s.' % self.params.variable_update)
      if self.params.staged_vars:
        raise ValueError('Automatic loss scaling is not supported with '
                         'staged_vars.')

    if (self.params.debugger is not None and self.params.debugger != 'cli' and
        ':' not in self.params.debugger):
      raise ValueError('--debugger must be "cli" or in the form '
                       'host:port')

    if self.params.hierarchical_copy and self.params.num_gpus <= 1:
      raise ValueError('--hierarchical_copy requires --num_gpus to be greater '
                       'than 1')

    if params.save_model_secs and params.save_model_steps:
      raise ValueError('At most one of --save_model_secs and '
                       '--save_model_steps can be specified')

    eval_during_training_flags = list(map(bool, [
        params.eval_during_training_every_n_steps,
        params.eval_during_training_every_n_epochs,
        params.eval_during_training_at_specified_steps,
        params.eval_during_training_at_specified_epochs,
    ]))

    if eval_during_training_flags.count(True) > 1:
      raise ValueError('At most one flag with --eval_during_training_* prefix '
                       'must be specified.')

    eval_during_training_enabled = any(eval_during_training_flags)

    if eval_during_training_enabled:
      if params.eval:
        raise ValueError('At most one of --eval and --eval_during_training_* '
                         'must be specified')
      if params.forward_only:
        raise ValueError('At most one of --forward_only and '
                         '--eval_during_training_* must be specified')
      if params.job_name:
        raise ValueError('--eval_during_training_* is not yet supported in '
                         'distributed mode.')
      if params.staged_vars:
        raise ValueError('--eval_during_training_* is not currently '
                         'compatible with --staged_vars')

    if params.stop_at_top_1_accuracy and not eval_during_training_enabled:
      raise ValueError('--stop_at_top_1_accuracy is only supported with '
                       '--eval_during_training_*')
    if params.collect_eval_results_async and params.model != 'ssd300':
      raise ValueError('--collect_eval_results_async only works with ssd300 '
                       'model currently.')

    if self.params.forward_only and self.params.freeze_when_forward_only:
      if self.params.train_dir is not None:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True, --train_dir '
                         'should not be specified')
      if self.params.data_dir and not self.params.datasets_use_prefetch:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True and --data_dir '
                         'is set, --datasets_use_prefetch should be set to '
                         'True')
      if self.params.job_name:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True, --job_name '
                         'should not be specified and distributed running is '
                         'not supported')
      self.forward_only_and_freeze = True
    else:
      self.forward_only_and_freeze = False
      if self.params.trt_mode:
        raise ValueError('--trt_mode should not be specified if one of '
                         '--forward_only and --freeze_when_forward_only is '
                         'set to False')

    self.mode = get_mode_from_params(self.params)

    # Use the batch size from the command line if specified, otherwise use the
    # model's default batch size. Scale the benchmark's batch size by the
    # number of GPUs.
    if self.params.batch_size > 0:
      self.model.set_batch_size(self.params.batch_size)
    self.batch_size = self.model.get_batch_size() * self.num_gpus
    if self.mode in (constants.BenchmarkMode.TRAIN,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      self.train_batch_size = self.batch_size
    else:
      self.train_batch_size = None
    if self.mode in (constants.BenchmarkMode.EVAL,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      if self.params.eval_batch_size > 0:
        self.eval_batch_size = self.params.eval_batch_size * self.num_gpus
      else:
        self.eval_batch_size = self.batch_size
    else:
      self.eval_batch_size = None
    self.batch_group_size = self.params.batch_group_size
    self.enable_auto_loss_scale = (
        self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale)
    self.loss_scale = None
    self.loss_scale_normal_steps = None

    self.job_name = self.params.job_name  # "" for local training

    # PS server is used for distributed jobs not using all-reduce.
    use_ps_server = self.job_name and (
        self.params.variable_update != 'distributed_all_reduce' and
        self.params.variable_update != 'collective_all_reduce')
    # controller is used for distributed_all_reduce with > 1 worker.
    use_controller = (
        self.params.variable_update == 'distributed_all_reduce' and
        self.job_name)
    if use_controller and not params.controller_host:
      raise ValueError('When variable_update==distributed_all_reduce '
                       'controller_host must also be specified.')
    # collective_all_reduce doesn't need a controller or ps
    self.distributed_collective = (
        self.params.variable_update == 'collective_all_reduce' and
        self.job_name)

    self.local_parameter_device_flag = self.params.local_parameter_device
    if self.job_name:
      self.task_index = self.params.task_index
      self.cluster_manager = platforms_util.get_cluster_manager(
          params, create_config_proto(params))
      assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager)

      worker_prefix = '/job:worker/replica:0/task:%s' % self.task_index
      if use_ps_server:
        self.param_server_device = tf.train.replica_device_setter(
            worker_device=worker_prefix + '/cpu:0',
            cluster=self.cluster_manager.get_cluster_spec())
        # The device on which the queues for managing synchronization between
        # servers should be stored.
        self.sync_queue_devices = [
            '/job:ps/replica:0/task:%s/cpu:0' % i
            for i in range(self.cluster_manager.num_ps())
        ]
      else:
        self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0']
    else:
      self.task_index = 0
      self.cluster_manager = None
      worker_prefix = ''
      self.param_server_device = '/%s:0' % self.params.local_parameter_device
      self.sync_queue_devices = [self.param_server_device]

    if self.cluster_manager:
      self.num_workers = self.cluster_manager.num_workers()
    elif self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      self.num_workers = hvd.size()
    else:
      self.num_workers = 1
    self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0

    if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl':
      raise ValueError('--all_reduce_spec=nccl is invalid in a '
                       'multi-worker job')

    # Device to use for ops that need to always run on the local worker's CPU.
    self.cpu_device = '%s/cpu:0' % worker_prefix

    # Device to use for ops that need to always run on the local worker's
    # compute device, and never on a parameter server device.
    self.raw_devices = [
        '%s/%s:%i' % (worker_prefix, self.params.device, i)
        for i in xrange(self.num_gpus)
    ]

    subset = 'validation' if params.eval else 'train'
    self.num_batches, self.num_epochs = get_num_batches_and_epochs(
        params, self.batch_size * self.num_workers,
        self.dataset.num_examples_per_epoch(subset))
    if self.mode in (constants.BenchmarkMode.EVAL,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      # TODO(reedwm): Currently we do extra eval logic for num_eval_batches
      # and the preprocessor. We should encapsulate this logic into a shared
      # function or class.
      if params.num_eval_batches is None and params.num_eval_epochs is None:
        eval_params = self.params
      else:
        eval_params = self.params._replace(
            num_batches=self.params.num_eval_batches,
            num_epochs=self.params.num_eval_epochs)
      self.num_eval_batches, self.num_eval_epochs = (
          get_num_batches_and_epochs(
              eval_params, self.eval_batch_size * self.num_workers,
              self.dataset.num_examples_per_epoch('validation')))
    else:
      self.num_eval_batches, self.num_eval_epochs = None, None

    num_train_examples_per_epoch = self.dataset.num_examples_per_epoch('train')
    if self.params.eval_during_training_every_n_epochs:
      n_epochs = self.params.eval_during_training_every_n_epochs
      self.eval_during_training_at_specified_steps = {
          (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
           self.batch_size)
          for e in np.arange(n_epochs, self.num_epochs, n_epochs)}
    if self.params.eval_during_training_at_specified_steps:
      try:
        self.eval_during_training_at_specified_steps = set(
            map(int, self.params.eval_during_training_at_specified_steps))
      except ValueError:
        raise ValueError('Param eval_during_training_at_specified_steps value '
                         'of %s cannot be converted to a list of integers.' %
                         (self.params.eval_during_training_at_specified_steps))
    if self.params.eval_during_training_at_specified_epochs:
      try:
        n_epochs = list(
            map(float, self.params.eval_during_training_at_specified_epochs))
        offset = n_epochs[0] - 1
        if offset.is_integer():
          offset = int(offset)
        mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
        self.eval_during_training_at_specified_steps = {
            (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
             self.batch_size)
            for e in n_epochs}
      except ValueError:
        raise ValueError('Param eval_during_training_at_specified_epochs '
                         'value of %s cannot be converted to a list of '
                         'floats.' %
                         (self.params.eval_during_training_at_specified_epochs))
    if params.eval_during_training_every_n_epochs:
      offset = params.eval_during_training_every_n_epochs - 1
      if offset.is_integer():
        offset = int(offset)
      mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)

    if (self.params.staged_vars and
        self.params.variable_update != 'parameter_server'):
      raise ValueError('staged_vars for now is only supported with '
                       'variable_update=parameter_server')

    if self.params.variable_update == 'parameter_server':
      if self.job_name:
        if not self.params.staged_vars:
          self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS(
              self)
        else:
          self.variable_mgr = (
              variable_mgr.VariableMgrDistributedFetchFromStagedPS(self))
      else:
        if not self.params.staged_vars:
          self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
        else:
          self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS(
              self)
    elif self.params.variable_update == 'replicated':
      if self.job_name:
        raise ValueError('Invalid variable_update in distributed mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
          self, self.params.all_reduce_spec,
          self.params.agg_small_grads_max_bytes,
          self.params.agg_small_grads_max_group,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'distributed_all_reduce':
      assert self.params.cross_replica_sync
      self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce(
          self, self.params.all_reduce_spec,
          ('worker' if self.num_workers > 1 else 'localhost'),
          self.num_workers, self.params.agg_small_grads_max_bytes,
          self.params.agg_small_grads_max_group,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'collective_all_reduce':
      assert self.params.cross_replica_sync
      self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce(
          self, self.params.all_reduce_spec,
          self.num_workers, self.num_gpus, self.task_index,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'distributed_replicated':
      assert self.params.cross_replica_sync
      if not self.job_name:
        raise ValueError('Invalid variable_update in local mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self)
    elif self.params.variable_update in ('independent', 'horovod'):
      if self.job_name:
        raise ValueError('Invalid variable_update in distributed mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
    else:
      raise ValueError('Invalid variable_update: %s' %
                       self.params.variable_update)

    # Device to use for running on the local worker's compute device, but
    # with variables assigned to parameter server devices.
    self.devices = self.variable_mgr.get_devices()
    if self.job_name:
      if use_ps_server:
        self.global_step_device = self.param_server_device
      elif self.params.variable_update == 'collective_all_reduce':
        self.global_step_device = self.cpu_device
      else:
        self.global_step_device = '/job:worker/replica:0/task:0/cpu:0'
    else:
      self.global_step_device = self.cpu_device

    self.input_preprocessor = None
    self.eval_input_preprocessor = None
    if not self.dataset.use_synthetic_gpu_inputs():
      if not self.params.eval:
        self.input_preprocessor = self.get_input_preprocessor()
      if self.mode in (constants.BenchmarkMode.EVAL,
                       constants.BenchmarkMode.TRAIN_AND_EVAL):
        with self._do_eval():
          self.eval_input_preprocessor = self.get_input_preprocessor()
    self.datasets_use_prefetch = (
        self.params.datasets_use_prefetch and
        # TODO(rohanj): Figure out why --datasets_use_prefetch freezes on the
        # CPU.
        self.params.device.lower() != 'cpu' and
        self.input_preprocessor and
        self.input_preprocessor.supports_datasets())
    self.init_global_step = 0
    self._config_benchmark_logger()

    if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
      # Remove "eval" from params so it is not accidentally used. Since eval
      # can still occur despite params.eval being False, params.eval should
      # never be used. We cannot yet remove this unconditionally, because the
      # SSD model still uses params.eval, and hence does not work properly
      # with --eval_during_training_*.
      # TODO(b/116627045): We should also remove fields that have an eval
      # equivalent, like num_batches and num_eval_batches.
      self.params = remove_param_fields(self.params, {'eval'})

  @contextlib.contextmanager
  def _do_eval(self):
    """Context manager to switch BenchmarkCNN to eval mode.

    Any evaluation code should be put under this context manager. This context
    manager switches self._doing_eval to True. It also switches certain
    attributes, like self.num_batches and self.num_epochs, to be the number of
    batches and epochs for evaluation respectively.

    Yields:
      Nothing.
    """
    # TODO(b/116627045): Find a more general way of switching attributes to
    # the eval equivalents.
    old_doing_eval = self._doing_eval
    old_num_batches = self.num_batches
    old_num_epochs = self.num_epochs
    old_batch_size = self.batch_size
    try:
      self._doing_eval = True
      self.num_batches = self.num_eval_batches
      self.num_epochs = self.num_eval_epochs
      self.batch_size = self.eval_batch_size
      self.model.set_batch_size(self.eval_batch_size // self.num_gpus)
      yield
    finally:
      self._doing_eval = old_doing_eval
      self.num_batches = old_num_batches
      self.num_epochs = old_num_epochs
      self.batch_size = old_batch_size
      self.model.set_batch_size(old_batch_size // self.num_gpus)

  def _config_benchmark_logger(self):
    """Config the model garden benchmark logger."""
    model_benchmark_logger = None
    if self.params.benchmark_log_dir is not None:
      try:
        from official.utils.logs import logger as models_logger  # pylint: disable=g-import-not-at-top
      except ImportError:
        tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH '
                         'in order to use BenchmarkLogger. Configured '
                         'benchmark_log_dir: %s'
                         % self.params.benchmark_log_dir)
        raise
      model_benchmark_logger = models_logger.BenchmarkFileLogger(
          self.params.benchmark_log_dir)
    self.benchmark_logger = model_benchmark_logger

  # TODO(laigd): this changes the global device list which is used everywhere,
  # consider refactoring it.
  def reset_devices_for_task(self, task_num, is_local=False):
    """Used to imitate another task when building a distributed graph."""
    worker_prefix = ('/job:localhost' if is_local else
                     '/job:worker/replica:0/task:%s' % task_num)
    self.cpu_device = '%s/cpu:0' % worker_prefix
    self.raw_devices = [
        '%s/%s:%i' % (worker_prefix, self.params.device, i)
        for i in xrange(self.num_gpus)
    ]
    self.devices = self.variable_mgr.get_devices()

  def raw_devices_across_tasks(self, is_local=False):
    """Returns list of raw device names across all tasks."""
    if is_local:
      assert self.num_workers == 1
      return self.raw_devices
    else:
      return [
          'job:worker/replica:0/task%s/%s:%i' % (t, self.params.device, i)
          for t in xrange(self.num_workers)
          for i in xrange(self.num_gpus)
      ]

  def print_info(self):
    """Print basic information."""
    benchmark_info = self._get_params_info()
    log_fn('Model: %s' % self.model.get_model_name())
    log_fn('Dataset: %s' % benchmark_info['dataset_name'])
    log_fn('Mode: %s' % self.mode)
    log_fn('SingleSess: %s' % benchmark_info['single_session'])
    log_fn('Batch size: %s global' % (self.batch_size * self.num_workers))
    log_fn(' %s per device' % (self.batch_size // len(self.raw_devices)))
    if self.batch_group_size > 1:
      log_fn(' %d batches per preprocessing group' % self.batch_group_size)
    log_fn('Num batches: %d' % self.num_batches)
    log_fn('Num epochs: %.2f' % self.num_epochs)
    log_fn('Devices: %s' % benchmark_info['device_list'])
    log_fn('NUMA bind: %s' % self.params.use_numa_affinity)
    log_fn('Data format: %s' % self.params.data_format)
    if self.rewriter_config:
      log_fn('RewriterConfig: %s' % self.rewriter_config)
    log_fn('Optimizer: %s' % self.params.optimizer)
    log_fn('Variables: %s' % self.params.variable_update)
    if (self.params.variable_update == 'replicated' or
        self.params.variable_update == 'distributed_all_reduce' or
        self.params.variable_update == 'collective_all_reduce'):
      log_fn('AllReduce: %s' % self.params.all_reduce_spec)
    if self.job_name:
      log_fn('Sync: %s' % self.params.cross_replica_sync)
    if self.params.staged_vars:
      log_fn('Staged vars: %s' % self.params.staged_vars)
    if self.params.variable_update == 'horovod' and self.params.horovod_device:
      log_fn('Horovod on: %s' % self.params.horovod_device)
    log_fn('==========')

  def _get_params_info(self):
    """Get the common parameters info for the benchmark run.

    Returns:
      A dict of processed parameters.
    """
    dataset_name = self.dataset.name
    if self.dataset.use_synthetic_gpu_inputs():
      dataset_name += ' (synthetic)'
    single_session = self.params.variable_update == 'distributed_all_reduce'
    if single_session:
      device_list = self.raw_devices_across_tasks()
    elif self.params.variable_update == 'horovod':
      device_list = ['horovod/%s:%d' % (self.params.device, idx)
                     for idx in range(self.num_workers)]
    else:
      device_list = self.raw_devices
    return {
        'dataset_name': dataset_name,
        'single_session': single_session,
        'device_list': device_list,
    }

  def _log_benchmark_run(self):
    """Log the benchmark info to the logger.

    The info logged here should be similar to print_info(), but in a
    structured JSON format.
    """
    if self.benchmark_logger:
      benchmark_info = self._get_params_info()
      run_param = {
          'model': self.model.get_model_name(),
          'dataset': benchmark_info['dataset_name'],
          'mode': self.mode,
          'single_sess': benchmark_info['single_session'],
          'devices': benchmark_info['device_list'],
          'batch_size': self.batch_size,
          'batch_size_per_device': self.batch_size // len(self.raw_devices),
          'num_batches': self.num_batches,
          'num_epochs': self.num_epochs,
          'data_format': self.params.data_format,
          'rewrite_config': self.rewriter_config,
          'optimizer': self.params.optimizer,
          'session_config': create_config_proto(self.params),
      }
      # TODO(scottzhu): tf_cnn_benchmark might execute several times with
      # different param setting on the same box. This will cause the run file
      # to only contain the latest info. The benchmark_log_dir should be
      # updated for every new run.
      self.benchmark_logger.log_run_info(
          self.model.get_model_name(), benchmark_info['dataset_name'],
          run_param, test_id=self.params.benchmark_test_id)

  def run(self):
    """Run the benchmark task assigned to this process.

    Returns:
      Dictionary of statistics for training or eval.
    Raises:
      ValueError: unrecognized job name.
    """
    if self.params.job_name == 'ps':
      log_fn('Running parameter server %s' % self.task_index)
      self.cluster_manager.join_server()
      return {}

    # For distributed_all_reduce with multiple workers, drive
    # from a separate controller process.
    if self.params.variable_update == 'distributed_all_reduce':
      if self.params.job_name == 'worker':
        log_fn('Starting worker %s' % self.task_index)
        self.cluster_manager.join_server()
        return
      elif self.params.job_name and self.params.job_name != 'controller':
        raise ValueError('unrecognized job name: %s' % self.params.job_name)

    self._log_benchmark_run()
    if self._doing_eval:
      with tf.Graph().as_default():
        # TODO(laigd): freeze the graph in eval mode.
        return self._run_eval()
    else:
      return self._benchmark_train()

  def _run_eval(self):
    """Evaluate a model every self.params.eval_interval_secs.

    Returns:
      Dictionary containing eval statistics. Currently returns an empty
      dictionary.

    Raises:
      ValueError: If self.params.train_dir is unspecified.
    """
    if self.params.train_dir is None:
      raise ValueError('Trained model directory not specified')
    graph_info = self._build_eval_graph()
    saver = tf.train.Saver(self.variable_mgr.savable_variables())
    summary_writer = tf.summary.FileWriter(self.params.eval_dir,
                                           tf.get_default_graph())
    target = ''
    # TODO(huangyp): Check if checkpoints haven't updated for hours and abort.
    while True:
      with tf.Session(
          target=target, config=create_config_proto(self.params)) as sess:
        image_producer = None
        try:
          global_step = load_checkpoint(saver, sess, self.params.train_dir)
          image_producer = self._initialize_eval_graph(
              graph_info.enqueue_ops, graph_info.input_producer_op,
              graph_info.local_var_init_op_group, sess)
        except CheckpointNotFoundException:
          log_fn('Checkpoint not found in %s' % self.params.train_dir)
        else:
          # Only executes if an exception was not thrown
          self._eval_once(sess, summary_writer, graph_info.fetches,
                          graph_info.summary_op, image_producer, global_step)
        if image_producer is not None:
          image_producer.done()
      if self.params.eval_interval_secs <= 0:
        break
      time.sleep(self.params.eval_interval_secs)
    return {}

  def _build_eval_graph(self, scope_name=None):
    """Build the evaluation graph.

    Args:
      scope_name: String to filter what summaries are collected. Only summary
        ops whose name contains `scope_name` will be added, which is useful
        for only including evaluation ops.

    Returns:
      A GraphInfo named_tuple containing various useful ops and tensors of the
      evaluation graph.
    """
    with self._do_eval():
      input_producer_op, enqueue_ops, fetches = self._build_model()
      local_var_init_op = tf.local_variables_initializer()
      table_init_ops = tf.tables_initializer()
      variable_mgr_init_ops = [local_var_init_op]
      if table_init_ops:
        variable_mgr_init_ops.extend([table_init_ops])
      with tf.control_dependencies([local_var_init_op]):
        variable_mgr_init_ops.extend(self.variable_mgr.get_post_init_ops())
      local_var_init_op_group = tf.group(*variable_mgr_init_ops)

      summary_op = tf.summary.merge_all(scope=scope_name)
      # The eval graph has no execution barrier because it doesn't run in
      # distributed mode.
      execution_barrier = None
      # We do not use the global step during evaluation.
      global_step = None
      return GraphInfo(input_producer_op, enqueue_ops, fetches,
                       execution_barrier, global_step,
                       local_var_init_op_group, summary_op)

  # TODO(reedwm): For consistency, we should have a similar
  # "_initialize_train_graph" function. They can likely be the same function.
  def _initialize_eval_graph(self, enqueue_ops, input_producer_op,
                             local_var_init_op_group, sess):
    """Initializes the evaluation graph.

    Args:
      enqueue_ops: Ops that add the preprocessed images to the staging areas.
      input_producer_op: Op that produces the input batches (before
        preprocessing).
      local_var_init_op_group: Group of ops that perform per-device
        initialization work.
      sess: The session to initialize the eval graph with.

    Returns:
      An ImageProducer, or None if an ImageProducer isn't being used.
    """
    with self._do_eval():
      if local_var_init_op_group is not None:
        # We might reinitialize local variables if they were already
        # initialized during training. This is OK.
        sess.run(local_var_init_op_group)
      if self.dataset.queue_runner_required():
        tf.train.start_queue_runners(sess=sess)
      image_producer = None
      if input_producer_op is not None:
        image_producer = cnn_util.ImageProducer(
            sess, input_producer_op, self.batch_group_size,
            self.params.use_python32_barrier)
        image_producer.start()
      if enqueue_ops:
        for i in xrange(len(enqueue_ops)):
          sess.run(enqueue_ops[:(i + 1)])
        if image_producer is not None:
          image_producer.notify_image_consumption()
      return image_producer

  def _eval_once(self, sess, summary_writer, fetches, summary_op,
                 image_producer, global_step):
    """Evaluate the model using the validation dataset."""
    with self._do_eval():
      mlperf.logger.log_eval_epoch(mlperf.tags.EVAL_START, global_step,
                                   self.batch_size)
      loop_start_time = start_time = time.time()
      # TODO(laigd): refactor the part to compute/report the accuracy.
      # Currently it only works for image models.
      top_1_accuracy_sum = 0.0
      top_5_accuracy_sum = 0.0
      total_eval_count = self.num_batches * self.batch_size
      for step in xrange(self.num_batches):
        if (summary_writer and self.params.save_summaries_steps > 0 and
            (step + 1) % self.params.save_summaries_steps == 0):
          results, summary_str = sess.run([fetches, summary_op])
          summary_writer.add_summary(summary_str)
        else:
          results = sess.run(fetches)
        # Make global_step available in results for postprocessing.
        results['global_step'] = global_step
        results = self.model.postprocess(results)
        top_1_accuracy_sum += results['top_1_accuracy']
        top_5_accuracy_sum += results['top_5_accuracy']
        if (step + 1) % self.params.display_every == 0:
          duration = time.time() - start_time
          examples_per_sec = (
              self.batch_size * self.params.display_every / duration)
          log_fn('%i\t%.1f examples/sec' % (step + 1, examples_per_sec))
          start_time = time.time()
        if image_producer is not None:
          image_producer.notify_image_consumption()
      loop_end_time = time.time()
      accuracy_at_1 = top_1_accuracy_sum / self.num_batches
      accuracy_at_5 = top_5_accuracy_sum / self.num_batches
      summary = tf.Summary()
      summary.value.add(tag='eval/Accuracy@1', simple_value=accuracy_at_1)
      summary.value.add(tag='eval/Accuracy@5', simple_value=accuracy_at_5)
      for result_key, result_value in results.items():
        if result_key.startswith(constants.SIMPLE_VALUE_RESULT_PREFIX):
          prefix_len = len(constants.SIMPLE_VALUE_RESULT_PREFIX)
          summary.value.add(tag='eval/' + result_key[prefix_len:],
                            simple_value=result_value)
      if summary_writer:
        summary_writer.add_summary(summary, global_step)
      log_fn('Accuracy @ 1 = %.4f Accuracy @ 5 = %.4f [%d examples]' %
             (accuracy_at_1, accuracy_at_5, total_eval_count))
      elapsed_time = loop_end_time - loop_start_time
      images_per_sec = (self.num_batches * self.batch_size / elapsed_time)
      if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
        # Note that we compute the top 1 accuracy and top 5 accuracy for each
        # batch, which will have a slight performance impact.
        log_fn('-' * 64)
        log_fn('total images/sec: %.2f' % images_per_sec)
        log_fn('-' * 64)
      if self.benchmark_logger:
        eval_result = {
            'eval_top_1_accuracy': accuracy_at_1,
            'eval_top_5_accuracy': accuracy_at_5,
            'eval_average_examples_per_sec': images_per_sec,
            tf.GraphKeys.GLOBAL_STEP: global_step,
        }
        self.benchmark_logger.log_evaluation_result(eval_result)
      mlperf.logger.log_eval_epoch(mlperf.tags.EVAL_STOP, global_step,
                                   self.batch_size)
      mlperf.logger.log(key=mlperf.tags.EVAL_SIZE,
                        value=self.num_batches * self.batch_size)
      if self.params.model != 'ssd300':
        # ssd300 logs eval accuracy elsewhere.
        mlperf.logger.log_eval_accuracy(
            accuracy_at_1, global_step, self.train_batch_size,
            examples_per_epoch=self.dataset.num_examples_per_epoch('train'))
      if self.params.stop_at_top_1_accuracy:
        mlperf.logger.log(key=mlperf.tags.EVAL_TARGET,
                          value=self.params.stop_at_top_1_accuracy)
      return accuracy_at_1, accuracy_at_5

  def _benchmark_train(self):
    """Run cnn in benchmark mode. Skip the backward pass if forward_only is on.

    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    graph = tf.Graph()
    with graph.as_default():
      build_result = self._build_graph()
      if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
        with self.variable_mgr.reuse_variables():
          with tf.name_scope('Evaluation') as ns:
            eval_build_results = self._build_eval_graph(ns)
      else:
        eval_build_results = None
    (graph, result_to_benchmark) = self._preprocess_graph(graph, build_result)
    with graph.as_default():
      return self._benchmark_graph(result_to_benchmark, eval_build_results)

  GPU_CACHED_INPUT_VARIABLE_NAME = 'gpu_cached_inputs'

  def _unfreezable_local_variables(self, graph):
    """Get the local variables that we don't want to freeze."""
    return graph.get_collection(
        tf.GraphKeys.LOCAL_VARIABLES,
        # We don't freeze the gpu_cached_images local variable so it won't get
        # constant folded with ops which process the input.
        scope='.*' + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME)

  def _build_graph(self):
    """Build the graph.

    Returns:
      A namedtuple containing the ops/tensors that are required by
      _benchmark_graph().
    """
    if self.params.variable_update == 'distributed_all_reduce':
      self.single_session = True
      (input_producer_op, enqueue_ops, fetches) = (
          self._build_model_single_session())
    else:
      self.single_session = False
      (input_producer_op, enqueue_ops, fetches) = self._build_model()
    fetches_list = nest.flatten(list(fetches.values()))
    main_fetch_group = tf.group(*fetches_list, name='main_fetch_group')
    execution_barrier = None
    if (not self.single_session and self.job_name and
        not self.params.cross_replica_sync):
      execution_barrier = self.add_sync_queues_and_barrier(
          'execution_barrier_', [])

    global_step = tf.train.get_global_step()
    with tf.device(self.global_step_device), tf.name_scope('inc_global_step'):
      with tf.control_dependencies([main_fetch_group]):
        fetches['inc_global_step'] = global_step.assign_add(1)

    if ((not self.single_session) and (not self.distributed_collective) and
        self.job_name and self.params.cross_replica_sync):
      # Block all replicas until all replicas are ready for next step.
      fetches['sync_queues'] = self.add_sync_queues_and_barrier(
          'sync_queues_step_end_', [main_fetch_group])

    # Skips the init ops for freezable local variables in forward_only mode so
    # we can remove all the assign ops when converting variables to constants.
    with tf.name_scope('local_variable_initialization'):
      if self.forward_only_and_freeze:
        local_var_init_op = tf.variables_initializer(
            self._unfreezable_local_variables(tf.get_default_graph()))
      else:
        local_var_init_op = tf.local_variables_initializer()
      table_init_ops = tf.tables_initializer()

    variable_manager_init_ops = [local_var_init_op]
    if table_init_ops:
      variable_manager_init_ops.extend([table_init_ops])
    if not self.forward_only_and_freeze:
      with tf.control_dependencies([local_var_init_op]):
        variable_manager_init_ops.extend(
            self.variable_mgr.get_post_init_ops())
    if ((not self.single_session) and (not self.distributed_collective) and
        self.job_name and self.params.cross_replica_sync):
      # Ensure all workers execute variable_manager_init_ops before they start
      # executing the model.
      variable_manager_init_ops.append(
          self.add_sync_queues_and_barrier('init_ops_end_',
                                           variable_manager_init_ops))
    local_var_init_op_group = tf.group(*variable_manager_init_ops,
                                       name='local_var_init_op_group')
    summary_op = tf.summary.merge_all()

    return GraphInfo(
        input_producer_op=input_producer_op,
        enqueue_ops=enqueue_ops,
        fetches=fetches,
        execution_barrier=execution_barrier,
        global_step=global_step,
        local_var_init_op_group=local_var_init_op_group,
        summary_op=summary_op)
  def _benchmark_graph(self, graph_info, eval_graph_info):
    """Benchmark the training graph.

    Args:
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
      eval_graph_info: Similar to graph_info but for the eval graph if
        --eval_during_training_* is used. Otherwise, None.
    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    log_fn('Initializing graph')
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      # First worker will be 'chief' - it will write summaries and
      # save checkpoints.
      is_chief = hvd.rank() == 0
    else:
      is_chief = (not self.job_name or self.task_index == 0)

    summary_writer = None
    if (is_chief and self.params.summary_verbosity and self.params.train_dir
        and self.params.save_summaries_steps > 0):
      summary_writer = tf.summary.FileWriter(self.params.train_dir,
                                             tf.get_default_graph())

    # We want to start the benchmark timer right after an image_producer
    # barrier, to avoid undesired waiting times on barriers.
    if ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
        self.batch_group_size) != 0:
      self.num_warmup_batches = int(
          math.ceil(
              (self.num_warmup_batches + len(graph_info.enqueue_ops) - 1.0) /
              (self.batch_group_size)) * self.batch_group_size -
          len(graph_info.enqueue_ops) + 1)
      log_fn('Round up warm up steps to %d to match batch_group_size' %
             self.num_warmup_batches)
      assert ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
              self.batch_group_size) == 0

    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    if is_chief and not self.forward_only_and_freeze:
      saver = tf.train.Saver(
          self.variable_mgr.savable_variables(),
          save_relative_paths=True,
          max_to_keep=self.params.max_ckpts_to_keep)
    else:
      saver = None
    ready_for_local_init_op = None
    if self.job_name and not (self.single_session or
                              self.distributed_collective):
      # In distributed mode, we don't want to run local_var_init_op_group until
      # the global variables are initialized, because local_var_init_op_group
      # may use global variables (such as in distributed replicated mode). We
      # don't set this in non-distributed mode, because in non-distributed
      # mode, local_var_init_op_group may itself initialize global variables
      # (such as in replicated mode).
      ready_for_local_init_op = tf.report_uninitialized_variables(
          tf.global_variables())
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      bcast_global_variables_op = hvd.broadcast_global_variables(0)
    else:
      bcast_global_variables_op = None

    if self.params.variable_update == 'collective_all_reduce':
      # It doesn't matter what this collective_graph_key value is,
      # so long as it's > 0 and the same at every worker.
      init_run_options = tf.RunOptions()
      init_run_options.experimental.collective_graph_key = 6
    else:
      init_run_options = tf.RunOptions()
    local_var_init_ops = [graph_info.local_var_init_op_group]
    if eval_graph_info:
      # `eval_graph_info.local_var_init_op_group` also includes some of the
      # training initializer ops, since it's difficult to filter them out.
      # Rerunning the training initializer ops is OK, but we add a control
      # dependency since running two sets of training initializer ops at the
      # same time can cause race conditions.
      with tf.control_dependencies(local_var_init_ops):
        local_var_init_ops.append(eval_graph_info.local_var_init_op_group)
    sv = tf.train.Supervisor(
        # For the purpose of Supervisor, all Horovod workers are 'chiefs',
        # since we want session to be initialized symmetrically on all the
        # workers.
        is_chief=is_chief or (self.params.variable_update == 'horovod'
                              or self.distributed_collective),
        # Log dir should be unset on non-chief workers to prevent Horovod
        # workers from corrupting each other's checkpoints.
        logdir=self.params.train_dir if is_chief else None,
        ready_for_local_init_op=ready_for_local_init_op,
        local_init_op=local_var_init_ops,
        saver=saver,
        global_step=graph_info.global_step,
        summary_op=None,
        save_model_secs=self.params.save_model_secs,
        summary_writer=summary_writer,
        local_init_run_options=init_run_options)

    profiler = tf.profiler.Profiler() if self.params.tfprof_file else None

    if self.graph_file is not None:
      path, filename = os.path.split(self.graph_file)
      as_text = filename.endswith('txt')
      log_fn('Writing GraphDef as %s to %s' % (  # pyformat break
          'text' if as_text else 'binary', self.graph_file))
      tf.train.write_graph(
          tf.get_default_graph().as_graph_def(add_shapes=True), path, filename,
          as_text)

    start_standard_services = (
        self.params.train_dir or self.dataset.queue_runner_required())
    target = self.cluster_manager.get_target() if self.cluster_manager else ''
    with sv.managed_session(
        master=target,
        config=create_config_proto(self.params),
        start_standard_services=start_standard_services) as sess:
      # Anything that can potentially raise an OutOfRangeError with 'sess' MUST
      # be under this try block. The managed_session() context manager silently
      # ignores OutOfRangeError, so we must catch them and wrap them with
      # a different exception type so that they can be propagated up to the
      # caller.
      try:
        stats = self.benchmark_with_session(
            sess, sv, graph_info, eval_graph_info, bcast_global_variables_op,
            is_chief, summary_writer, profiler)
      except tf.errors.OutOfRangeError:
        raise RuntimeError(
            'Received OutOfRangeError. Wrapping in Runtime error to avoid '
            'Supervisor from suppressing the error. Original OutOfRangeError '
            'with traceback:\n' + traceback.format_exc())

    sv.stop()
    if profiler:
      generate_tfprof_profile(profiler, self.params.tfprof_file)
    return stats
  def benchmark_with_session(self, sess, supervisor, graph_info,
                             eval_graph_info, bcast_global_variables_op,
                             is_chief, summary_writer, profiler):
    """Benchmarks the graph with the given session.

    Args:
      sess: The session to benchmark the graph with.
      supervisor: The Supervisor that created the session.
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
      eval_graph_info: Similar to graph_info but for the eval graph if
        --eval_during_training_every_n_steps is used. Otherwise, None.
      bcast_global_variables_op: If Horovod is used, the op to broadcast the
        global variables to all the processes. None if Horovod is not used.
      is_chief: True if this is the chief process.
      summary_writer: The SummaryWriter used to write summaries, or None if
        summaries are not used.
      profiler: The tf.profiler.Profiler, or None if tfprof is not used.
    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    if self.params.backbone_model_path is not None:
      self.model.load_backbone_model(sess, self.params.backbone_model_path)
    if bcast_global_variables_op:
      sess.run(bcast_global_variables_op)
    image_producer = None
    if graph_info.input_producer_op is not None:
      image_producer = cnn_util.ImageProducer(
          sess, graph_info.input_producer_op, self.batch_group_size,
          self.params.use_python32_barrier)
      image_producer.start()
    if graph_info.enqueue_ops:
      for i in xrange(len(graph_info.enqueue_ops)):
        sess.run(graph_info.enqueue_ops[:(i + 1)])
        if image_producer is not None:
          image_producer.notify_image_consumption()
    self.init_global_step, = sess.run([graph_info.global_step])
    if self.job_name and not self.params.cross_replica_sync:
      # TODO(zhengxq): Do we need to use a global step watcher at all?
      global_step_watcher = GlobalStepWatcher(
          sess, graph_info.global_step,
          self.num_workers * self.num_warmup_batches + self.init_global_step,
          self.num_workers * (self.num_warmup_batches + self.num_batches) - 1)
      global_step_watcher.start()
    else:
      global_step_watcher = None
    eval_image_producer = None
    if eval_graph_info:
      # We pass local_var_init_op_group=None because the Supervisor already
      # initialized local variables above. We need to have the Supervisor
      # initialize the local variables, because otherwise it throws an error
      # complaining that not all variables were initialized.
      eval_image_producer = self._initialize_eval_graph(
          eval_graph_info.enqueue_ops, eval_graph_info.input_producer_op,
          local_var_init_op_group=None, sess=sess)

    step_train_times = []
    log_fn('Running warm up')
    local_step = -1 * self.num_warmup_batches
    if self.single_session:
      # In single session mode, each step, the global_step is incremented by
      # 1. In non-single session mode, each step, the global_step is
      # incremented once per worker. This means we need to divide
      # init_global_step by num_workers only in non-single session mode.
      end_local_step = self.num_batches - self.init_global_step
    else:
      end_local_step = self.num_batches - (self.init_global_step //
                                           self.num_workers)
    if not global_step_watcher:
      # In cross-replica sync mode, all workers must run the same number of
      # local steps, or else the workers running the extra step will block.
      done_fn = lambda: local_step >= end_local_step
    else:
      done_fn = global_step_watcher.done
    if self.params.debugger is not None:
      if self.params.debugger == 'cli':
        log_fn('The CLI TensorFlow debugger will be used.')
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)
      else:
        log_fn('The TensorBoard debugger plugin will be used.')
        sess = tf_debug.TensorBoardDebugWrapperSession(sess,
                                                       self.params.debugger)
    mlperf.logger.log(key=mlperf.tags.TRAIN_LOOP)
    skip_final_eval = False
    accuracy_at_1 = None
    accuracy_at_5 = None
    last_eval_step = local_step
    loop_start_time = time.time()
    last_average_loss = None
    while not done_fn():
      if local_step == 0:
        log_fn('Done warm up')
        if graph_info.execution_barrier:
          log_fn('Waiting for other replicas to finish warm up')
          sess.run([graph_info.execution_barrier])

        # TODO(laigd): rename 'Img' to maybe 'Input'.
        header_str = ('Step\tImg/sec\t' +
                      self.params.loss_type_to_report.replace('/', ' '))
        if self.params.print_training_accuracy or self.params.forward_only:
          # TODO(laigd): use the actual accuracy op names of the model.
          header_str += '\ttop_1_accuracy\ttop_5_accuracy'
        log_fn(header_str)
        assert len(step_train_times) == self.num_warmup_batches
        # reset times to ignore warm up batch
        step_train_times = []
        loop_start_time = time.time()
      if (summary_writer and
          (local_step + 1) % self.params.save_summaries_steps == 0):
        fetch_summary = graph_info.summary_op
      else:
        fetch_summary = None
      collective_graph_key = 7 if (
          self.params.variable_update == 'collective_all_reduce') else 0
      (summary_str, last_average_loss) = benchmark_one_step(
          sess, graph_info.fetches, local_step,
          self.batch_size * (self.num_workers if self.single_session else 1),
          step_train_times, self.trace_filename,
          self.params.partitioned_graph_file_prefix, profiler, image_producer,
          self.params, fetch_summary,
          benchmark_logger=self.benchmark_logger,
          collective_graph_key=collective_graph_key)
      if summary_str is not None and is_chief:
        supervisor.summary_computed(sess, summary_str)
      local_step += 1
      if (self.params.save_model_steps and
          local_step % self.params.save_model_steps == 0 and
          local_step > 0 and is_chief):
        supervisor.saver.save(sess, supervisor.save_path,
                              supervisor.global_step)
      if (eval_graph_info and local_step > 0 and not done_fn() and
          self._should_eval_during_training(local_step)):
        python_global_step = sess.run(graph_info.global_step)
        num_steps_since_last_eval = local_step - last_eval_step
        # The INPUT_SIZE tag value might not match the
        # PREPROC_NUM_TRAIN_EXAMPLES tag value, because the number of examples
        # run, which is INPUT_SIZE, is rounded up to the nearest multiple of
        # self.batch_size.
        mlperf.logger.log(key=mlperf.tags.INPUT_SIZE,
                          value=num_steps_since_last_eval * self.batch_size)
        log_fn('Running evaluation at global_step {}'.format(
            python_global_step))
        accuracy_at_1, accuracy_at_5 = self._eval_once(
            sess, summary_writer, eval_graph_info.fetches,
            eval_graph_info.summary_op, eval_image_producer,
            python_global_step)
        last_eval_step = local_step
        if (self.params.stop_at_top_1_accuracy and
            accuracy_at_1 >= self.params.stop_at_top_1_accuracy):
          log_fn('Stopping, as eval accuracy at least %s was reached' %
                 self.params.stop_at_top_1_accuracy)
          skip_final_eval = True
          break
        else:
          log_fn('Resuming training')
      if eval_graph_info and self.model.reached_target():
        log_fn('Stopping, as the model indicates its custom goal was reached')
        skip_final_eval = True
        break
    loop_end_time = time.time()
    # Waits for the global step to be done, regardless of done_fn.
    if global_step_watcher:
      while not global_step_watcher.done():
        time.sleep(.25)
    if not global_step_watcher:
      elapsed_time = loop_end_time - loop_start_time
      average_wall_time = elapsed_time / local_step if local_step > 0 else 0
      images_per_sec = (self.num_workers * local_step * self.batch_size /
                        elapsed_time)
      num_steps = local_step * self.num_workers
    else:
      # NOTE: Each worker independently increases the global step. So,
      # num_steps will be the sum of the local_steps from each worker.
      num_steps = global_step_watcher.num_steps()
      elapsed_time = global_step_watcher.elapsed_time()
      average_wall_time = (elapsed_time * self.num_workers / num_steps
                           if num_steps > 0 else 0)
      images_per_sec = num_steps * self.batch_size / elapsed_time

    # We skip printing images/sec if --eval_during_training_* is specified,
    # because we are both processing training and evaluation images, so a
    # singular "images/sec" value is meaningless.
    if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
      log_fn('-' * 64)
      # TODO(laigd): rename 'images' to maybe 'inputs'.
      log_fn('total images/sec: %.2f' % images_per_sec)
      log_fn('-' * 64)
    else:
      log_fn('Done with training')
    num_steps_since_last_eval = local_step - last_eval_step
    mlperf.logger.log(key=mlperf.tags.INPUT_SIZE,
                      value=num_steps_since_last_eval * self.batch_size)
    python_global_step = sess.run(graph_info.global_step)
    if eval_graph_info and not skip_final_eval:
      log_fn('Running final evaluation at global_step {}'.format(
          python_global_step))
      accuracy_at_1, accuracy_at_5 = self._eval_once(
          sess, summary_writer, eval_graph_info.fetches,
          eval_graph_info.summary_op, eval_image_producer, python_global_step)
    num_epochs_ran = (python_global_step * self.batch_size /
                      self.dataset.num_examples_per_epoch('train'))
    mlperf.logger.log_train_epochs(num_epochs_ran)
    if image_producer is not None:
      image_producer.done()
    if eval_image_producer is not None:
      eval_image_producer.done()
    if is_chief:
      if self.benchmark_logger:
        self.benchmark_logger.log_metric(
            'average_examples_per_sec', images_per_sec, global_step=num_steps)

    # Save the model checkpoint.
    if self.params.train_dir is not None and is_chief:
      checkpoint_path = os.path.join(self.params.train_dir, 'model.ckpt')
      if not gfile.Exists(self.params.train_dir):
        gfile.MakeDirs(self.params.train_dir)
      supervisor.saver.save(sess, checkpoint_path, graph_info.global_step)
    if graph_info.execution_barrier:
      # Wait for other workers to reach the end, so this worker doesn't
      # go away underneath them.
      sess.run([graph_info.execution_barrier])
    stats = {
        'num_workers': self.num_workers,
        'num_steps': num_steps,
        'average_wall_time': average_wall_time,
        'images_per_sec': images_per_sec
    }
    if last_average_loss is not None:
      stats['last_average_loss'] = last_average_loss
    if accuracy_at_1 is not None:
      stats['top_1_accuracy'] = accuracy_at_1
    if accuracy_at_5 is not None:
      stats['top_5_accuracy'] = accuracy_at_5
    success = bool(self.model.reached_target() or
                   (accuracy_at_1 and self.params.stop_at_top_1_accuracy and
                    accuracy_at_1 >= self.params.stop_at_top_1_accuracy))
    mlperf.logger.log(key=mlperf.tags.RUN_STOP, value={'success': success})
    mlperf.logger.log(key=mlperf.tags.RUN_FINAL)
    return stats
  def _should_eval_during_training(self, step):
    """Return True iff should run eval during training at current step."""
    assert self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL
    if self.params.eval_during_training_every_n_steps:
      return step % self.params.eval_during_training_every_n_steps == 0
    # All other --eval_during_training_* flags are converted to step numbers
    # at which the model should run evaluation during training.
    return step in self.eval_during_training_at_specified_steps
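
  # Illustrative note (not part of the original file): with
  # --eval_during_training_every_n_steps=100 the check above reduces to
  # step % 100 == 0, so the training loop in benchmark_with_session() runs
  # _eval_once() at local steps 100, 200, 300, and so on. All other
  # --eval_during_training_* flags are pre-converted to an explicit set of
  # step numbers checked by the membership test.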
  def _preprocess_graph(self, graph, graph_info):
    """Preprocess the graph before executing.

    Depending on the params, it runs various preprocessing on the graph,
    including freezing, TensorRT conversion, etc.

    Args:
      graph: the graph to preprocess.
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
    Returns:
      The updated graph and graph_info with the ops/tensors/fetches updated
      according to the imported graph.
    """
    assert isinstance(graph_info.fetches, dict)
    assert isinstance(graph_info.global_step, tf.Variable)
    if not self.forward_only_and_freeze:
      return (graph, graph_info)

    # Get the names of the ops that need to be kept during conversion.
    flattened_op_names = list(
        set([
            v.name.split(':')[0]
            for v in nest.flatten(graph_info)
            if v is not None
        ]))
    # Get variables that we don't want to freeze.
    # Only keep unfreezable variables in forward_only_and_freeze mode.
    # TODO(laigd): consider making global_step a constant.
    variables_to_keep = {graph_info.global_step: tf.GraphKeys.GLOBAL_VARIABLES}
    variables_to_keep.update({
        local_variable: tf.GraphKeys.LOCAL_VARIABLES
        for local_variable in self._unfreezable_local_variables(graph)
    })
    variable_initializers = [
        variable.initializer.name for variable in variables_to_keep
    ]
    output_node_names = (
        flattened_op_names +
        # Add variable initializer and read ops to the output list, so
        # convert_variables_to_constants() will keep them.
        variable_initializers +
        [variable.value().op.name for variable in variables_to_keep])
    graphdef = graph.as_graph_def(add_shapes=True)

    # Freeze the graph.
    with graph.as_default():
      with tf.Session(config=create_config_proto(self.params)) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        graphdef = graph_util.convert_variables_to_constants(
            sess,
            graphdef,
            output_node_names,
            variable_names_blacklist=[
                variable.op.name for variable in variables_to_keep
            ])

    # Run TensorRT conversion.
    if self.params.trt_mode:
      # Import here instead of at top, because this will crash if TensorRT is
      # not installed
      from tensorflow.python.compiler.tensorrt import trt_convert  # pylint: disable=g-import-not-at-top
      # Prevent the TF-TRT bridge from touching all variable initializer ops
      # and their dependencies, since they can directly be fetched by
      # sess.run()s that initialize the variables.
      # pylint: disable=protected-access
      name_to_input_name, _, _ = graph_util_impl._extract_graph_summary(
          graphdef)
      initializer_subgraph_ops = graph_util_impl._bfs_for_reachable_nodes(
          variable_initializers, name_to_input_name)
      # pylint: enable=protected-access

      graphdef = trt_convert.create_inference_graph(
          graphdef,
          outputs=output_node_names + list(initializer_subgraph_ops),
          max_batch_size=self.model.get_batch_size(),
          max_workspace_size_bytes=self.params.trt_max_workspace_size_bytes,
          precision_mode=self.params.trt_mode)

    # Creates a new graph as the default and imports the converted graph back.
    updated_graph = tf.Graph()

    def _get_tensors_or_ops(inputs):
      """Gets the updated tensors or ops from 'updated_graph'."""

      def _get_fn(element):
        if element is None:
          return None
        if ':' in element.name:
          return updated_graph.get_tensor_by_name(element.name)
        return updated_graph.get_operation_by_name(element.name)

      if isinstance(inputs, (list, dict, tuple)):
        return nest.map_structure(_get_fn, inputs)
      else:
        return _get_fn(inputs)

    with updated_graph.as_default():
      importer.import_graph_def(graph_def=graphdef, name='')

      # Update the variables
      for variable in variables_to_keep:
        updated_variable = tf.Variable.from_proto(variable.to_proto())
        tf.add_to_collection(variables_to_keep[variable], updated_variable)
        if variable is graph_info.global_step:
          updated_global_step = updated_variable

    updated_graph_info = GraphInfo(
        input_producer_op=_get_tensors_or_ops(graph_info.input_producer_op),
        enqueue_ops=_get_tensors_or_ops(graph_info.enqueue_ops),
        execution_barrier=_get_tensors_or_ops(graph_info.execution_barrier),
        local_var_init_op_group=_get_tensors_or_ops(
            graph_info.local_var_init_op_group),
        fetches=_get_tensors_or_ops(graph_info.fetches),
        global_step=updated_global_step,
        summary_op=None)
    return (updated_graph, updated_graph_info)
  def _build_input_processing(self, shift_ratio=0):
    """Build the image (pre)processing portion of the model graph.

    Args:
      shift_ratio: shift_ratio for data_flow_ops.RecordInput.

    Returns:
      An InputProcessingInfo containing all the input sources to the model.
    """
    input_processing_info = InputProcessingInfo(
        input_producer_op=None,
        input_producer_stages=None,
        multi_device_iterator_input=None)

    mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
    if not self._doing_eval:
      mlperf.logger.log(key=mlperf.tags.INPUT_BATCH_SIZE,
                        value=self.batch_size)

    # If using synthetic gpu inputs, do nothing on the cpu side.
    if self.dataset.use_synthetic_gpu_inputs():
      assert not self.datasets_use_prefetch
      return input_processing_info

    if self._doing_eval:
      input_preprocessor = self.eval_input_preprocessor
      mlperf.logger.log(
          key=mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES,
          value=self.dataset.num_examples_per_epoch('validation'))
    else:
      input_preprocessor = self.input_preprocessor
      mlperf.logger.log(
          key=mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES,
          value=self.dataset.num_examples_per_epoch('train'))

    # Use prefetching mechanism provided by dataset input pipeline.
    if self.datasets_use_prefetch:
      multi_device_iterator = (
          input_preprocessor.build_multi_device_iterator(
              self.batch_size, len(self.devices), self.cpu_device, self.params,
              self.raw_devices, self.dataset, self._doing_eval))
      return input_processing_info._replace(
          multi_device_iterator_input=multi_device_iterator.get_next())

    # Not using dataset prefetching. Use a staging area to mimic the
    # prefetching behavior instead.
    with tf.device(self.cpu_device):
      if self._doing_eval:
        subset = 'validation'
      else:
        subset = 'train'
      input_list = input_preprocessor.minibatch(
          self.dataset,
          subset=subset,
          params=self.params,
          shift_ratio=shift_ratio)

      input_producer_op = []
      input_producer_stages = []
      for device_num in range(len(self.devices)):
        staging_area = data_flow_ops.StagingArea(
            [parts[0].dtype for parts in input_list],
            shapes=[parts[0].get_shape() for parts in input_list],
            shared_name='input_producer_staging_area_%d_eval_%s' %
            (device_num, self._doing_eval))
        input_producer_stages.append(staging_area)
        for group_index in xrange(self.batch_group_size):
          batch_index = group_index + device_num * self.batch_group_size
          put_op = staging_area.put(
              [parts[batch_index] for parts in input_list])
          input_producer_op.append(put_op)
      assert input_producer_op

    return input_processing_info._replace(
        input_producer_op=input_producer_op,
        input_producer_stages=input_producer_stages)
  def _maybe_initialize_fp16(self):
    """Initialize fp16 settings."""
    if self.params.use_fp16 and not self._doing_eval:
      init_loss_scale_val = float(self.params.fp16_loss_scale or
                                  self.model.get_fp16_loss_scale())
      self.loss_scale = None
      self.loss_scale_normal_steps = None
      if self.enable_auto_loss_scale or init_loss_scale_val != 1:
        self.loss_scale = tf.get_variable(
            name='loss_scale',
            initializer=init_loss_scale_val,
            dtype=tf.float32,
            trainable=False)
      if self.enable_auto_loss_scale:
        self.loss_scale_normal_steps = tf.get_variable(
            name='loss_scale_normal_steps', initializer=0, trainable=False)
  def _build_model(self):
    """Build the TensorFlow graph."""
    if self.datasets_use_prefetch:
      assert not self.params.staged_vars
      assert not self.variable_mgr.supports_staged_vars()

    # Adjust seed so different workers start reading different input files.
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      seed_adjustment = hvd.rank()
    else:
      seed_adjustment = 0
    mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
                      value=self.params.tf_random_seed + seed_adjustment)
    tf.set_random_seed(self.params.tf_random_seed + seed_adjustment)
    mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
                      value=4321 + seed_adjustment)
    np.random.seed(4321 + seed_adjustment)
    phase_train = not (self._doing_eval or self.params.forward_only)

    if self._doing_eval:
      mode_string = 'evaluation'
    else:
      mode_string = 'training'

    log_fn('Generating {} model'.format(mode_string))
    losses = []
    device_grads = []
    all_logits = []
    all_accuracy_ops = {}
    gpu_compute_stage_ops = []
    gpu_grad_stage_ops = []

    with tf.device(self.global_step_device):
      global_step = tf.train.get_or_create_global_step()
      self._maybe_initialize_fp16()

    # Build the processing and model for the worker.
    input_producer_op = None
    with tf.name_scope('input_processing'):
      input_processing_info = self._build_input_processing(shift_ratio=0)
      if input_processing_info.input_producer_op is not None:
        input_producer_op = tf.group(*input_processing_info.input_producer_op)
    update_ops = None
    staging_delta_ops = []

    for device_num in range(len(self.devices)):
      with tf.name_scope('tower_%i' % device_num) as name_scope, (
          self.variable_mgr.create_outer_variable_scope(device_num)):
        results = self.add_forward_pass_and_gradients(
            phase_train, device_num, device_num, input_processing_info,
            gpu_compute_stage_ops, gpu_grad_stage_ops)
        if self.params.backbone_model_path:
          self.model.add_backbone_saver()

        if phase_train:
          losses.append(results['loss'])
          device_grads.append(results['gradvars'])
        else:
          all_logits.append(results['logits'])
        if not phase_train or self.params.print_training_accuracy:
          for name, op in results.items():
            if name.startswith('accuracy:'):
              key = name[9:]
              if key not in all_accuracy_ops:
                all_accuracy_ops[key] = []
              all_accuracy_ops[key].append(op)

        if device_num == 0:
          # Retain the Batch Normalization updates operations only from the
          # first tower. These operations update the moving mean and moving
          # variance variables, which are updated (but not used) during
          # training, and used during evaluation. The moving mean and variance
          # approximate the true mean and variance across all images in the
          # dataset. Therefore, in replicated mode, these moving averages would
          # be almost identical for each tower, and so we only update and save
          # the moving averages for one tower. In parameter server mode, all
          # towers share a copy of the variables so we also only need to update
          # and save the moving averages once.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
          if self.datasets_use_prefetch:
            assert not self.variable_mgr.staging_delta_ops
          else:
            staging_delta_ops = list(self.variable_mgr.staging_delta_ops)

    enqueue_ops = []
    if not self.datasets_use_prefetch:
      if self.variable_mgr.supports_staged_vars():
        for staging_ops in self.variable_mgr.staging_vars_on_devices:
          gpu_compute_stage_ops.extend(
              [put_op for _, (put_op, _) in six.iteritems(staging_ops)])
      enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
                                  name='gpu_compute_stage_ops_group'))
      if gpu_grad_stage_ops:
        staging_delta_ops += gpu_grad_stage_ops
      if staging_delta_ops:
        enqueue_ops.append(tf.group(*(staging_delta_ops)))

    if (self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL and
        self.params.variable_update == 'replicated'):
      # We need to get all the update ops instead of only those for the first
      # tower. This is because during evaluation, each tower will read from its
      # own tower's moving averages instead of the first tower's moving
      # averages.
      # TODO(reedwm): Have each tower read from the first tower's moving
      # averages for a slight performance gain.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    mlperf.logger.log(key=mlperf.tags.INPUT_BN_SPAN,
                      value=self.batch_size // len(self.raw_devices))
    fetches = self._build_fetches(global_step, all_logits, losses,
                                  device_grads, enqueue_ops, update_ops,
                                  all_accuracy_ops, phase_train)
    return (input_producer_op, enqueue_ops, fetches)
  def _build_fetches(self, global_step, all_logits, losses, device_grads,
                     enqueue_ops, update_ops, all_accuracy_ops, phase_train):
    """Complete construction of model graph, populating the fetches map."""
    fetches = {}
    if enqueue_ops:
      fetches['enqueue_ops'] = enqueue_ops
    for name, ops in all_accuracy_ops.items():
      # For fetches that start with 'tensor:', keep their dimensions and skip
      # reducing them to scalars.
      if name.startswith(constants.UNREDUCED_ACCURACY_OP_PREFIX):
        key = name[len(constants.UNREDUCED_ACCURACY_OP_PREFIX):]
        fetches[key] = tf.concat(ops, 0)
      else:
        fetches[name] = tf.reduce_sum(ops) / self.batch_size
        if self.task_index == 0 and self.params.summary_verbosity >= 1:
          tf.summary.scalar(name, fetches[name])

    if not phase_train:
      if self.params.forward_only:
        fetches['all_logits'] = tf.concat(all_logits, 0)
      return fetches
    apply_gradient_devices, gradient_state = (
        self.variable_mgr.preprocess_device_grads(device_grads))

    # TODO(reedwm): Greatly simplify the learning rate code.
    if (self.params.variable_update == 'horovod' or
        self.params.variable_update == 'collective_all_reduce'):
      # Each worker independently increments global_step.
      examples_per_step = self.batch_size * self.num_workers
    else:
      # global_step is shared by all workers, and so every iteration
      # global_step is incremented by num_workers.
      examples_per_step = self.batch_size
    if self.params.compute_lr_on_cpu:
      with tf.device(self.cpu_device):
        learning_rate = get_learning_rate(
            self.params, global_step, self.dataset.num_examples_per_epoch(),
            self.model, examples_per_step)

    training_ops = []
    for d, device in enumerate(apply_gradient_devices):
      with tf.device(device):
        with tf.name_scope('average_loss'):
          average_loss = tf.reduce_mean(losses)
        with tf.name_scope('get_gradients_to_apply'):
          avg_grads = self.variable_mgr.get_gradients_to_apply(
              d, gradient_state)

        if not self.params.compute_lr_on_cpu:
          # We compute the learning rate once for each device in
          # `apply_gradient_devices`.
          learning_rate = get_learning_rate(
              self.params, global_step, self.dataset.num_examples_per_epoch(),
              self.model, examples_per_step)
        gradient_clip = self.params.gradient_clip
        if gradient_clip is not None:
          with tf.name_scope('clip_gradients'):
            clipped_grads = [(tf.clip_by_value(grad, -gradient_clip,
                                               +gradient_clip), var)
                             for grad, var in avg_grads]
        else:
          clipped_grads = avg_grads

        learning_rate = tf.identity(learning_rate,
                                    name='learning_rate_tensor')
        opt = get_optimizer(self.params, learning_rate)
        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
            enable_auto_loss_scale=self.enable_auto_loss_scale,
            loss_scale=self.loss_scale,
            loss_scale_normal_steps=self.loss_scale_normal_steps,
            inc_loss_scale_every_n=self.params.fp16_inc_loss_scale_every_n,
            is_chief=not self.job_name or self.task_index == 0)

        with tf.name_scope('append_apply_gradient_ops'):
          self.variable_mgr.append_apply_gradients_ops(
              gradient_state, opt, clipped_grads, training_ops,
              loss_scale_params)
    train_op = tf.group(*(training_ops + update_ops), name='train_ops_group')

    with tf.device(self.cpu_device):
      if self.task_index == 0 and self.params.summary_verbosity >= 1:
        tf.summary.scalar('learning_rate', learning_rate)
        tf.summary.scalar(self.params.loss_type_to_report, average_loss)
        if self.loss_scale is not None:
          tf.summary.scalar('loss_scale', self.loss_scale)
        if self.loss_scale_normal_steps:
          tf.summary.scalar('loss_scale_normal_steps',
                            self.loss_scale_normal_steps)

        if self.params.summary_verbosity >= 2:
          self.gradient_histogram_summary(avg_grads)

        if self.params.summary_verbosity >= 3:
          for grad, var in avg_grads:
            if grad is not None:
              tf.summary.histogram(var.op.name + '/gradients', grad)
          for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

    fetches['train_op'] = train_op
    fetches['average_loss'] = average_loss
    return fetches
  def gradient_histogram_summary(self, avg_grads):
    """Create histogram of log values of all non-zero gradients."""
    with tf.name_scope('log_gradients_summary'):
      all_grads = []
      for grad, _ in avg_grads:
        all_grads.append(tf.reshape(grad, [-1]))
      grads = tf.abs(tf.concat(all_grads, 0))
      # exclude grads with zero values.
      indices_for_non_zero_grads = tf.where(tf.not_equal(grads, 0))
      log_grads = tf.reshape(
          tf.log(tf.gather(grads, indices_for_non_zero_grads)), [-1])
      tf.summary.histogram('log_gradients', log_grads)
  def _build_model_single_session(self):
    """Build the TensorFlow graph for multiple replicas in a single_session.

    Returns:
      input_producer_op:
      enqueue_ops:
      fetches:

    Raises:
      ValueError: optimizer not recognized.

    Single session runs multiple model replicas as part of one large
    distributed graph, whose global execution is always step-synchronized.
    """
    # verify assumptions
    assert self.params.task_index == 0
    assert not self._doing_eval
    assert not self.params.forward_only
    assert not self.params.staged_vars

    tf.set_random_seed(self.params.tf_random_seed)
    np.random.seed(4321)
    phase_train = True

    log_fn('Generating training model')
    losses = []
    device_grads = []
    all_logits = []
    all_accuracy_ops = {}
    gpu_compute_stage_ops = []
    gpu_grad_stage_ops = []

    with tf.device(self.global_step_device):
      global_step = tf.train.get_or_create_global_step()

    update_ops = []
    global_input_producer_op = []

    is_local = not self.job_name
    if is_local:
      assert self.num_workers == 1
    for task_num in range(self.num_workers):
      # Reset the devices that self.variable_mgr knows about to those
      # belonging to the next worker (task).
      self.reset_devices_for_task(task_num, is_local)
      # Build the per-worker image processing
      with tf.name_scope('input_processing'):
        input_processing_info = self._build_input_processing(
            shift_ratio=(task_num / self.num_workers))
        if input_processing_info.input_producer_op is not None:
          global_input_producer_op.extend(
              input_processing_info.input_producer_op)
      # Build the per-worker model replica.
      for rel_device_num in range(len(self.devices)):
        abs_device_num = task_num * len(self.devices) + rel_device_num
        with self.variable_mgr.create_outer_variable_scope(
            abs_device_num), tf.name_scope(
                'task_%i_tower_%i' % (task_num, rel_device_num)) as name_scope:
          task_results = self.add_forward_pass_and_gradients(
              phase_train, rel_device_num, abs_device_num,
              input_processing_info, gpu_compute_stage_ops,
              gpu_grad_stage_ops)
          if self.params.backbone_model_path:
            self.model.add_backbone_saver()

          if phase_train:
            losses.append(task_results['loss'])
            device_grads.append(task_results['gradvars'])
          else:
            all_logits.append(task_results['logits'])
          if not phase_train or self.params.print_training_accuracy:
            for name, op in task_results.items():
              if name.startswith('accuracy:'):
                key = name[9:]
                if key not in all_accuracy_ops:
                  all_accuracy_ops[key] = []
                all_accuracy_ops[key].append(op)

          if rel_device_num == 0:
            # Retain the Batch Normalization updates operations only
            # from the first tower. These operations update the moving
            # mean and moving variance variables, which are updated
            # (but not used) during training, and used during
            # evaluation. The moving mean and variance approximate the
            # true mean and variance across all images in the
            # dataset. Therefore, in replicated mode, these moving
            # averages would be almost identical for each tower, and
            # so we only update and save the moving averages for one
            # tower. In parameter server mode, all towers share a copy
            # of the variables so we also only need to update and save
            # the moving averages once.
            update_ops.extend(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope))
            assert not self.variable_mgr.staging_delta_ops

    enqueue_ops = []
    if gpu_compute_stage_ops:
      enqueue_ops.append(
          tf.group(*gpu_compute_stage_ops, name='gpu_compute_stage_ops'))
    assert not self.variable_mgr.supports_staged_vars()
    assert not gpu_grad_stage_ops

    fetches = self._build_fetches(global_step, all_logits, losses,
                                  device_grads, enqueue_ops, update_ops,
                                  all_accuracy_ops, phase_train)
    if global_input_producer_op:
      global_input_producer_op = tf.group(*global_input_producer_op)
    else:
      global_input_producer_op = None
    return (global_input_producer_op, enqueue_ops, fetches)
  def add_forward_pass_and_gradients(self, phase_train, rel_device_num,
                                     abs_device_num, input_processing_info,
                                     gpu_compute_stage_ops,
                                     gpu_grad_stage_ops):
    """Add ops for forward-pass and gradient computations."""
    nclass = self.dataset.num_classes
    if self.datasets_use_prefetch:
      assert input_processing_info.multi_device_iterator_input, (
          'multi_device_iterator_input cannot be None if '
          'datasets_use_prefetch=True')
      input_list = (
          input_processing_info.multi_device_iterator_input[rel_device_num])
    else:
      if not self.dataset.use_synthetic_gpu_inputs():
        input_producer_stage = input_processing_info.input_producer_stages[
            rel_device_num]
        with tf.device(self.cpu_device):
          host_input_list = input_producer_stage.get()
        with tf.device(self.raw_devices[rel_device_num]):
          gpu_compute_stage = data_flow_ops.StagingArea(
              [inp.dtype for inp in host_input_list],
              shapes=[inp.get_shape() for inp in host_input_list])
          # The CPU-to-GPU copy is triggered here.
          gpu_compute_stage_op = gpu_compute_stage.put(host_input_list)
          input_list = gpu_compute_stage.get()
          gpu_compute_stage_ops.append(gpu_compute_stage_op)
      else:
        with tf.device(self.raw_devices[rel_device_num]):
          # Minor hack to avoid H2D copy when using synthetic data
          input_list = self.model.get_synthetic_inputs(
              BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME, nclass)

    # Labels reshaping happens all on gpu:0. Reshaping synthetic labels on
    # multiple devices slows down XLA computation for an unknown reason.
    # TODO(b/116875203): Find/address root cause of XLA slow down.
    labels_device_placement_hack = (
        self.dataset.use_synthetic_gpu_inputs() and self.params.xla_compile)

    def device_aware_reshape(tensor, shape):
      device = self.devices[rel_device_num]
      # Labels are int32, place reshapes on gpu:0 (no device placement) when
      # the hack is enabled.
      if labels_device_placement_hack and tensor.dtype == tf.int32:
        device = ''
      with tf.device(device):
        return tf.reshape(tensor, shape=shape)

    subset = 'validation' if self._doing_eval else 'train'
    input_shapes = self.model.get_input_shapes(subset)
    input_list = [
        device_aware_reshape(input_list[i], shape=input_shapes[i])
        for i in range(len(input_list))
    ]

    def forward_pass_and_gradients():
      """Builds forward pass and gradient computation network.

      When phase_train=True and print_training_accuracy=False:
        return [loss] + grads

      When phase_train=True and print_training_accuracy=True:
        return [logits, loss] + grads

      When phase_train=False,
        return [logits]

      Its output can always be unpacked by
      ```
        outputs = forward_pass_and_gradients()
        logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
      ```

      Returns:
        outputs: A list of tensors depending on different modes.
      """
      build_network_result = self.model.build_network(
          input_list, phase_train, nclass)
      logits = build_network_result.logits
      if not phase_train:
        return [logits]

      base_loss = self.model.loss_function(input_list, build_network_result)
      params = self.variable_mgr.trainable_variables_on_device(
          rel_device_num, abs_device_num)
      l2_loss = None
      total_loss = base_loss
      with tf.name_scope('l2_loss'):
        fp32_params = params
        if self.model.data_type == tf.float16 and self.params.fp16_vars:
          # fp16 reductions are very slow on GPUs, so cast to fp32 before
          # calling tf.nn.l2_loss and tf.add_n.
          # TODO(b/36217816): Once the bug is fixed, investigate if we should
          # do this reduction in fp16.
          fp32_params = (tf.cast(p, tf.float32) for p in params)

        filtered_params = self.model.filter_l2_loss_vars(fp32_params)
        if rel_device_num == len(self.devices) - 1:
          # We compute the L2 loss for only one device instead of all of them,
          # because the L2 loss for each device is the same. To adjust for
          # this, we multiply the L2 loss by the number of devices. We choose
          # the last device because for some reason, on a Volta DGX1, the first
          # four GPUs take slightly longer to complete a step than the last
          # four.
          # TODO(reedwm): Shard the L2 loss computations across GPUs.
          if self.params.single_l2_loss_op:
            # TODO(reedwm): If faster, create a fused op that does the L2 loss
            # on multiple tensors, and use that instead of concatenating
            # tensors.
            reshaped_params = [tf.reshape(p, (-1,)) for p in filtered_params]
            l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0))
          else:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in filtered_params])
      weight_decay = self.params.weight_decay
      mlperf.logger.log(key=mlperf.tags.OPT_WEIGHT_DECAY, value=weight_decay)
      if (weight_decay is not None and weight_decay != 0. and
          l2_loss is not None):
        mlperf.logger.log(key=mlperf.tags.MODEL_L2_REGULARIZATION,
                          value=weight_decay)
        total_loss += len(self.devices) * weight_decay * l2_loss

      aggmeth = tf.AggregationMethod.DEFAULT
      scaled_loss = (total_loss if self.loss_scale is None
                     else total_loss * self.loss_scale)
      grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth)
      if self.params.sparse_to_dense_grads:
        # Passing a sparse gradient to convert_to_tensor turns it into a dense
        # gradient. A sparse gradient is an instance of tf.IndexedSlices.
        # convert_to_tensor does not modify dense tensors.
        grads = [tf.convert_to_tensor(g) for g in grads]
      if self.loss_scale is not None:
        # TODO(reedwm): If automatic loss scaling is not used, we could avoid
        # these multiplications by directly modifying the learning rate
        # instead. If this is done, care must be taken to ensure that this
        # scaling method is correct, as some optimizers square gradients and do
        # other operations which might not be compatible with modifying both
        # the gradients and the learning rate.
        grads = [
            grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads
        ]

      if self.params.variable_update == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        if self.params.horovod_device:
          horovod_device = '/%s:0' % self.params.horovod_device
        else:
          horovod_device = ''
        # All-reduce gradients using Horovod.
        grads = [
            hvd.allreduce(grad, average=False, device_dense=horovod_device)
            for grad in grads
        ]

      if self.params.staged_vars:
        grad_dtypes = [grad.dtype for grad in grads]
        grad_shapes = [grad.shape for grad in grads]
        grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes)
        grad_stage_op = grad_stage.put(grads)
        # In general, this decouples the computation of the gradients and
        # the updates of the weights.
        # During the pipeline warm up, this runs enough training to produce
        # the first set of gradients.
        gpu_grad_stage_ops.append(grad_stage_op)
        grads = grad_stage.get()

      if self.params.loss_type_to_report == 'total_loss':
        loss = total_loss
      else:
        loss = base_loss

      if self.params.print_training_accuracy:
        return [logits, loss] + grads
      else:
        return [loss] + grads

    def unpack_forward_pass_and_gradients_output(forward_pass_and_grad_outputs):
      """Unpacks outputs from forward_pass_and_gradients.

      Args:
        forward_pass_and_grad_outputs: Output from forward_pass_and_gradients.

      Returns:
        logits: Unscaled probability distribution from forward pass.
          If unavailable, None is returned.
        loss: Loss function result from logits.
          If unavailable, None is returned.
        grads: Gradients for all trainable variables.
          If unavailable, None is returned.
      """
      logits = None
      # logits is only fetched in non-train mode or when
      # print_training_accuracy is set.
      if not phase_train or self.params.print_training_accuracy:
        logits = forward_pass_and_grad_outputs.pop(0)

      loss = (forward_pass_and_grad_outputs[0]
              if forward_pass_and_grad_outputs else None)
      grads = (forward_pass_and_grad_outputs[1:]
               if forward_pass_and_grad_outputs else None)

      return logits, loss, grads

    def make_results(logits, loss, grads):
      """Generate results based on logits, loss and grads."""
      results = {}  # The return value

      if logits is not None:
        results['logits'] = logits
        accuracy_ops = self.model.accuracy_function(input_list, logits)
        for name, op in accuracy_ops.items():
          results['accuracy:' + name] = op

      if loss is not None:
        results['loss'] = loss

      if grads is not None:
        param_refs = self.variable_mgr.trainable_variables_on_device(
            rel_device_num, abs_device_num, writable=True)
        results['gradvars'] = list(zip(grads, param_refs))

      return results

    with tf.device(self.devices[rel_device_num]):
      outputs = maybe_compile(forward_pass_and_gradients, self.params)
      logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
      return make_results(logits, loss, grads)
  def get_input_preprocessor(self):
    """Returns the image preprocessor to be used, based on the model.

    Returns:
      The image preprocessor, or None if synthetic data should be used.
    """
    shift_ratio = 0
    if self.job_name:
      # shift_ratio prevents multiple workers from processing the same batch
      # during a step
      shift_ratio = self.task_index / self.num_workers

    processor_class = self.dataset.get_input_preprocessor(
        self.params.input_preprocessor)
    assert processor_class
    subset = 'validation' if self._doing_eval else 'train'
    return processor_class(
        self.batch_size * self.batch_group_size,
        self.model.get_input_shapes(subset),
        len(self.devices) * self.batch_group_size,
        dtype=self.model.data_type,
        train=(not self._doing_eval),
        # TODO(laigd): refactor away image model specific parameters.
        distortions=self.params.distortions,
        resize_method=self.resize_method,
        shift_ratio=shift_ratio,
        summary_verbosity=self.params.summary_verbosity,
        distort_color_in_yiq=self.params.distort_color_in_yiq,
        fuse_decode_and_crop=self.params.fuse_decode_and_crop,
        match_mlperf=self.params.ml_perf)
  def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list):
    """Adds ops to enqueue on all worker queues.

    Args:
      name_prefix: prefix used for the shared_name of ops.
      enqueue_after_list: control dependency from ops.

    Returns:
      An op that should be used as control dependency before starting next
      step.
    """
    self.sync_queue_counter += 1
    with tf.device(self.sync_queue_devices[(
        self.sync_queue_counter % len(self.sync_queue_devices))]):
      sync_queues = [
          tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]],
                       shared_name='%s%s' % (name_prefix, i))
          for i in range(self.num_workers)]
      queue_ops = []
      # For each other worker, add an entry in a queue, signaling that it can
      # finish this step.
      token = tf.constant(False)
      with tf.control_dependencies(enqueue_after_list):
        for i, q in enumerate(sync_queues):
          if i == self.task_index:
            queue_ops.append(tf.no_op())
          else:
            queue_ops.append(q.enqueue(token))

      # Drain tokens off queue for this worker, one for each other worker.
      queue_ops.append(
          sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1))

      return tf.group(*queue_ops)
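
  # Illustrative note (not part of the original file): a minimal worked
  # example of the barrier protocol above, assuming num_workers=3 and
  # task_index=1. This worker enqueues one token into the queues of workers 0
  # and 2, then dequeues num_workers - 1 = 2 tokens from its own queue, so the
  # returned group op completes only after every worker has reached the
  # barrier.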
def _is_mkl_flag_absent(mkl_flag):
  return not (absl_flags.FLAGS.is_parsed() and mkl_flag in absl_flags.FLAGS and
              absl_flags.FLAGS[mkl_flag].present)


def _print_os_env_ignored_warning(mkl_flag, flag_default_val, os_env_var):
  tf.logging.warn(
      ('OS ENV variable %s=%s is ignored and script default: '
       '%s is used. Use --%s to override.') %
      (os_env_var, os.environ[os_env_var], flag_default_val, mkl_flag))
def set_default_param_values_and_env_vars(params):
  """Sets up the default param values and environment variables."""
  if params.batchnorm_persistent:
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
  else:
    os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
  if params.winograd_nonfused:
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
  else:
    os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None)
  if params.autotune_threshold:
    os.environ['TF_AUTOTUNE_THRESHOLD'] = str(params.autotune_threshold)
  os.environ['TF_SYNC_ON_FINISH'] = str(int(params.sync_on_finish))
  argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)

  # Sets environment variables for MKL
  # If OS ENV vars are overridden by script defaults, a warning msg is printed.
  if params.mkl:
    mkl_flags = ['kmp_blocktime', 'kmp_settings', 'kmp_affinity',
                 'num_intra_threads']
    for mkl_flag in mkl_flags:
      os_env_var = mkl_flag.upper()
      if mkl_flag == 'num_intra_threads':
        os_env_var = 'OMP_NUM_THREADS'
      flag_val = str(getattr(params, mkl_flag))
      if _is_mkl_flag_absent(mkl_flag) and os_env_var in os.environ:
        _print_os_env_ignored_warning(mkl_flag, flag_val, os_env_var)
      os.environ[os_env_var] = flag_val
      if mkl_flag == 'num_intra_threads' and not params.num_intra_threads:
        os.environ.pop(os_env_var, None)

  # Sets GPU thread settings
  if params.device.lower() == 'gpu':
    params = params._replace(gpu_thread_mode=params.gpu_thread_mode.lower())
    if params.gpu_thread_mode not in ['global', 'gpu_shared', 'gpu_private']:
      raise ValueError('Invalid gpu_thread_mode: %s' % params.gpu_thread_mode)
    os.environ['TF_GPU_THREAD_MODE'] = params.gpu_thread_mode

    if params.per_gpu_thread_count and params.gpu_thread_mode == 'global':
      raise ValueError(
          'Invalid per_gpu_thread_count with gpu_thread_mode=global: %s' %
          params.per_gpu_thread_count)
    # Default to two threads. One for the device compute and the other for
    # memory copies.
    per_gpu_thread_count = params.per_gpu_thread_count or 2
    total_gpu_thread_count = per_gpu_thread_count * params.num_gpus

    if params.gpu_thread_mode == 'gpu_private':
      os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
    elif params.gpu_thread_mode == 'gpu_shared':
      os.environ['TF_GPU_THREAD_COUNT'] = str(total_gpu_thread_count)

    cpu_count = multiprocessing.cpu_count()
    if not params.num_inter_threads and params.gpu_thread_mode in [
        'gpu_private', 'gpu_shared'
    ]:
      main_thread_count = max(cpu_count - total_gpu_thread_count, 1)
      params = params._replace(num_inter_threads=main_thread_count)

    if (params.datasets_use_prefetch and
        params.datasets_num_private_threads is None):
      # From the total cpu thread count, subtract the total_gpu_thread_count,
      # and then 2 threads per GPU device for event monitoring and sending /
      # receiving tensors
      num_monitoring_threads = 2 * params.num_gpus
      num_private_threads = max(
          cpu_count - total_gpu_thread_count - num_monitoring_threads, 1)
      params = params._replace(
          datasets_num_private_threads=num_private_threads)
  return params
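

# Illustrative note (not part of the original file): a small worked example of
# the GPU thread arithmetic above, assuming num_gpus=8 and the default
# per_gpu_thread_count of 2:
#
#   total_gpu_thread_count = 2 * 8 = 16
#   gpu_thread_mode='gpu_private' -> TF_GPU_THREAD_COUNT='2'
#   gpu_thread_mode='gpu_shared'  -> TF_GPU_THREAD_COUNT='16'
#   num_inter_threads, if unset, defaults to max(cpu_count - 16, 1)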
def setup(params):
  """Sets up the environment that BenchmarkCNN should run in.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.

  Returns:
    A potentially modified params.
  Raises:
    ValueError: invalid params combinations.
  """
  # Set up environment variables before doing any other global initialization
  # to make sure it uses the appropriate environment variables.
  params = set_default_param_values_and_env_vars(params)

  # horovod needs to be initialized before create_config_proto() call since
  # it will be used in config generation if enabled.
  if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    hvd.init()

  platforms_util.initialize(params, create_config_proto(params))

  if not params.job_name:
    # Create a dummy session to initialize TF global variables using the input
    # params. Otherwise, ListDevices function may create global devices using
    # the default config instead of using the user provided config.
    #
    # TODO(hinsu): Find a way to achieve the same for distributed benchmark. It
    # is not legal to create distributed session after local session. It is
    # also not possible to create distributed session here as that results in
    # multiple creation of ClusterManager and Server.
    with tf.Session(config=create_config_proto(params)) as sess:
      del sess

  return params
def maybe_compile(computation, params):
  if params and params.xla_compile:
    return tf.xla.experimental.compile(computation)
  else:
    return computation()
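

# Illustrative note (not part of the original file): a minimal usage sketch of
# maybe_compile(), assuming `params` came from benchmark_cnn.make_params().
# With xla_compile=True the callable is routed through
# tf.xla.experimental.compile(); otherwise it is invoked as a plain Python
# call:
#
#   outputs = maybe_compile(forward_pass_and_gradients, params)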
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test.py
deleted 100644 → 0
View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests running benchmark_cnn in distributed mode.

This is done by spawning one process per task. Each process runs
benchmark_cnn_distributed_test_runner.py.

The output for each process is written to disk and can be viewed to debug tests.
See get_test_output_dir() in platforms/default/util.py for more info.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import os
import subprocess
import time
import unittest

from absl import flags as absl_flags
import portpicker
import six
import tensorflow.compat.v1 as tf

import flags
import test_util
from platforms import util as platforms_util

FLAGS = absl_flags.FLAGS


def _convert_params_to_flags_list(params):
  """Converts Params to a list of flags. Skips default-valued parameters.

  E.g., converts
    benchmark_cnn.make_params(batch_size=32, model='resnet50')
  to
    ['--batch_size=32', '--model=resnet50']

  Args:
    params: Params for BenchmarkCNN.

  Returns:
    A list of flags.
  """
  return [
      '--%s=%s' % (k, str(v)) for k, v in six.iteritems(params._asdict())
      if v != flags.param_specs[k].default_value
  ]


# When outputting a process's output in the log, maximum number of characters
# to output. The log system does not allow us to output more than this in a
# single log message, but this limit is also useful to avoid the logs from
# becoming too large (the full process output is written to disk).
MAX_OUTPUT_CHARS = 15000


# A process. name is a string identifying the process in logs. stdout and
# stderr are file objects of the process's stdout and stderr, respectively.
_ProcessInfo = namedtuple('_ProcessInfo', ['name', 'popen', 'stdout', 'stderr'])


def _create_task_process(job_name, task_index, args, env, output_dir):
  """Creates a process for a single task for benchmark_cnn.

  Args:
    job_name: 'worker' or 'ps' or ''. Empty string used for non-distributed
      mode.
    task_index: The index of the task within the cluster.
    args: A list of arguments to pass to the task. This function additionally
      sets --task_index and --job_name
    env: The environment to use for the task.
    output_dir: Where to place the output files, storing the task's stdout and
      stderr.

  Returns:
    A _ProcessInfo namedtuple of the running process. The stdout and stderr
    fields of this tuple must be closed by the caller once the process ends.
  """
  args = args[:]
  args += ['--task_index=%s' % task_index, '--job_name=%s' % job_name]
  name_prefix = job_name or 'local'
  process_name = '%s_%s' % (name_prefix, task_index)
  tf.logging.info('Spawning %s process: %s' % (process_name, ' '.join(args)))
  stdout_filename = os.path.join(output_dir, '%s_stdout.txt' % process_name)
  stderr_filename = os.path.join(output_dir, '%s_stderr.txt' % process_name)
  stdout_file = open(stdout_filename, 'w+')
  stderr_file = open(stderr_filename, 'w+')
  popen = subprocess.Popen(
      args, stdout=stdout_file, stderr=stderr_file, env=env)
  return _ProcessInfo(process_name, popen, stdout_file, stderr_file)


def _wait_for_processes(wait_processes, kill_processes):
  """Waits until all `wait_processes` finish, then kills `kill_processes`.

  Fails an assert if a process in `wait_processes` finishes unsuccessfully.
  The processes in `kill_processes` are assumed to never finish so they are
  killed.

  Args:
    wait_processes: A list of _ProcessInfo tuples. This function will wait
      for each to finish.
    kill_processes: A list of _ProcessInfo tuples. Each will be killed once
      every process in `wait_processes` is finished.

  Returns:
    A list of strings, each which is a string of the stdout of a wait process.
  """
  wait_process_stdouts = [None] * len(wait_processes)
  finished_wait_processes = set()
  while len(finished_wait_processes) < len(wait_processes):
    for i, wait_process in enumerate(wait_processes):
      if i in finished_wait_processes:
        continue
      ret_code = wait_process.popen.poll()
      if ret_code is None:
        continue
      tf.logging.info('{} finished'.format(wait_process.name))
      wait_process.stdout.seek(0)
      wait_process_stdouts[i] = wait_process.stdout.read()
      tf.logging.info('stdout for {} (last {} chars): {}\n'.format(
          wait_process.name, MAX_OUTPUT_CHARS,
          wait_process_stdouts[i][-MAX_OUTPUT_CHARS:]))
      wait_process.stderr.seek(0)
      tf.logging.info('stderr for {} (last {} chars): {}\n'.format(
          wait_process.name, MAX_OUTPUT_CHARS,
          wait_process.stderr.read()[-MAX_OUTPUT_CHARS:]))
      assert ret_code == 0, 'Process failed with return code %d' % ret_code
      finished_wait_processes.add(i)
    for kill_process in kill_processes:
      ret_code = kill_process.popen.poll()
      # kill processes should not end until we kill them.
      assert ret_code is None, 'Process returned early with code %d' % ret_code
    time.sleep(0.25)
  tf.logging.info('All wait processes finished')
  for i, kill_process in enumerate(kill_processes):
    # Kill each kill process.
    kill_process.popen.kill()
    kill_process.popen.wait()
    kill_process.stdout.seek(0)
    tf.logging.info('stdout for {} (last {} chars): {}\n'.format(
        kill_process.name, MAX_OUTPUT_CHARS,
        kill_process.stdout.read()[-MAX_OUTPUT_CHARS:]))
    kill_process.stderr.seek(0)
    tf.logging.info('stderr for {} (last {} chars): {}\n'.format(
        kill_process.name, MAX_OUTPUT_CHARS,
        kill_process.stderr.read()[-MAX_OUTPUT_CHARS:]))
  return wait_process_stdouts


def _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
                               num_controllers, params):
  """Run training or evaluation in spawned processes.

  Runs locally if num_workers == 1, num_ps == 0, and num_controllers == 0,
  otherwise runs in distributed mode. In either case, one process is spawned
  per worker and ps. Waits for training/evaluation to finish before returning.

  Args:
    output_dir_path: Relative path where stdout and stderr files will be
      placed.
    num_workers: Number of workers to spawn.
    num_ps: Number of ps processes to spawn.
    num_controllers: Number of controller processes to spawn (must be 0 or 1).
    params: Params for BenchmarkCNN in each subprocess.

  Returns:
    A list output_list of outputs from all processes that output the
    images/sec and accuracy. This process is the controller host in
    distributed_all_reduce, and the workers otherwise. output_list[i] is a
    list of lines from the ith worker's stdout.
  """
  run_distributed = num_workers != 1 or num_ps != 0 or num_controllers != 0
  if params.variable_update == 'distributed_all_reduce':
    assert num_controllers == 1 or not run_distributed
    assert num_ps == 0
  else:
    assert num_controllers == 0
  output_base_dir = platforms_util.get_test_output_dir()
  output_dir = os.path.join(output_base_dir, output_dir_path)
  os.makedirs(output_dir)
  tf.logging.info('Outputs of processes will be outputted to: %s' % output_dir)

  args = platforms_util.get_command_to_run_python_module(
      'benchmark_cnn_distributed_test_runner')
  args += _convert_params_to_flags_list(params)
  if run_distributed:
    worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
    ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
    controller_ports = [
        portpicker.pick_unused_port() for _ in range(num_controllers)
    ]
    # The numerator is 0.7 instead of 1 to leave some memory for the Cuda
    # runtime, etc.
    gpu_memory_frac = 0.7 / num_workers
    args += [
        '--gpu_memory_frac_for_testing=%f' % gpu_memory_frac,
        '--worker_hosts=' + ','.join('localhost:%d' % p for p in worker_ports)
    ]
    if num_ps > 0:
      ps_hosts_str = ','.join('localhost:%d' % p for p in ps_ports)
      args.append('--ps_hosts=' + ps_hosts_str)
    else:
      controller_host_str = ','.join('localhost:%d' % p
                                     for p in controller_ports)
      args.append('--controller_host=' + controller_host_str)
  env = os.environ.copy()
  # Allow stdout to be viewed before the process ends.
  env['PYTHONUNBUFFERED'] = '1'

  worker_processes = []
  ps_processes = []
  controller_processes = []
  try:
    for i in range(num_workers):
      job_name = 'worker' if run_distributed else ''
      process = _create_task_process(job_name, i, args, env, output_dir)
      worker_processes.append(process)
    # Don't let ps or controller processes use the gpu.
    env['CUDA_VISIBLE_DEVICES'] = ''
    for i in range(num_ps):
      process = _create_task_process('ps', i, args, env, output_dir)
      ps_processes.append(process)
    for i in range(num_controllers):
      process = _create_task_process('controller', i, args, env, output_dir)
      controller_processes.append(process)
    # If all distributed all reduce mode is being used, the controller process
    # finishes and the worker processes block forever. Otherwise, the worker
    # processes finish and the ps processes block forever. We set
    # wait_processes and kill_processes accordingly.
    if controller_processes:
      wait_processes = controller_processes
      kill_processes = worker_processes
    else:
      wait_processes = worker_processes
      kill_processes = ps_processes
    outputs = _wait_for_processes(wait_processes, kill_processes)
  finally:
    for process in worker_processes + ps_processes + controller_processes:
      try:
        process.popen.kill()
      except OSError:
        pass  # It's OK (and expected) if the process already exited.
      process.stdout.close()
      process.stderr.close()
  return [output.splitlines() for output in outputs]


# When this test class is run, a method will fail about 0.3% of the time with a
# gRPC error. It is not clear why this occurs.
# TODO(reedwm): Fix this test class.
class TfCnnBenchmarksDistributedTest(tf.test.TestCase):
  """Tests running benchmark_cnn in distributed mode."""

  # We cannot check for a GPU via tf.test.is_gpu_available() before the tests in
  # this class because it allocates all the GPU memory which would cause the
  # spawned processes to run out of GPU memory.

  def _test_distributed(self,
                        test_name,
                        num_workers,
                        num_ps,
                        params,
                        num_controllers=0,
                        check_output_values=False,
                        skip=None):
    # TODO(reedwm): check_output_values should default to True and be enabled
    # on every test. See the TODO in benchmark_cnn_test.py.
    def run_fn(run_type, inner_params):
      output_dir_path = os.path.join(test_name, run_type)
      if run_type == 'Evaluation':
        # Distributed evaluation is not supported, so we use a single process.
        # We still must spawn another process, because if we evaluate in the
        # current process, it would allocate the GPU memory causing future test
        # methods to fail.
        if inner_params.variable_update == 'distributed_replicated':
          inner_params = inner_params._replace(variable_update='replicated')
        return _spawn_benchmark_processes(
            output_dir_path, num_workers=1, num_ps=0, num_controllers=0,
            params=inner_params)
      else:
        return _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
                                          num_controllers, inner_params)

    return test_util.train_and_eval(self, run_fn, params,
                                    check_output_values=check_output_values,
                                    skip=skip)

  def testParameterServer(self):
    test_name = 'testParameterServer'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 2, 2, params)

  def testParameterServerStaged(self):
    test_name = 'testParameterServerStaged'
    params = test_util.get_params(test_name)._replace(staged_vars=True)
    self._test_distributed(test_name, 2, 2, params)

  def testReplicated(self):
    test_name = 'testReplicated'
    params = test_util.get_params(test_name)._replace(
        variable_update='distributed_replicated')
    self._test_distributed(test_name, 2, 2, params)

  def testAllReducePsgpu(self):
    test_name = 'testAllReducePsgpu'
    flags_dict = test_util.get_params(test_name)._replace(
        variable_update='distributed_all_reduce',
        all_reduce_spec='psgpu#4')
    self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1)

  def testAllReducePscpuXring(self):
    test_name = 'testAllReducePscpuXring'
    flags_dict = test_util.get_params(test_name)._replace(
        variable_update='distributed_all_reduce',
        all_reduce_spec='pscpu:2k:xring')
    self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1)

  def testForwardOnly(self):
    test_name = 'testForwardOnly'
    params = test_util.get_params(test_name)._replace(forward_only=True)
    # Evaluation is not supported with --forward_only, so we set skip='eval'.
    self._test_distributed(test_name, 2, 2, params, skip='eval')

  def testSingleWorkerAndPs(self):
    test_name = 'testSingleWorkerAndPs'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 1, 1, params)

  def testThreeWorkersAndPses(self):
    test_name = 'testThreeWorkersAndPses'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 3, 3, params)

  def testOneWorkerThreePses(self):
    test_name = 'testOneWorkerThreePses'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 1, 3, params)

  def testThreeWorkersOnePs(self):
    test_name = 'testThreeWorkersOnePs'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 3, 1, params)

  def testNoPrintTrainingAccuracy(self):
    test_name = 'testNoPrintTrainingAccuracy'
    params = test_util.get_params(test_name)._replace(
        print_training_accuracy=False)
    self._test_distributed(test_name, 2, 2, params)

  def testRmspropParameterServer(self):
    test_name = 'testRmspropParameterServer'
    params = test_util.get_params(test_name)._replace(optimizer='rmsprop')
    self._test_distributed(test_name, 2, 2, params)

  def testMomentumReplicated(self):
    test_name = 'testMomentumReplicated'
    params = test_util.get_params(test_name)._replace(
        optimizer='momentum', variable_update='distributed_replicated')
    self._test_distributed(test_name, 2, 2, params)

  def testNoCrossReplicaSyncParameterServerStaged(self):
    test_name = 'testNoCrossReplicaSyncParameterServerStaged'
    params = test_util.get_params(test_name)._replace(
        staged_vars=True, cross_replica_sync=False)
    self._test_distributed(test_name, 2, 2, params)

  def testSingleGpu(self):
    test_name = 'testSingleGpu'
    params = test_util.get_params(test_name)._replace(num_gpus=1)
    self._test_distributed(test_name, 2, 2, params)

  def testBatchGroupSize(self):
    test_name = 'testBatchGroupSize'
    params = test_util.get_params(test_name)._replace(
        batch_group_size=4, num_batches=100, num_warmup_batches=5)
    self._test_distributed(test_name, 2, 2, params)

  def testFp16WithFp32Vars(self):
    test_name = 'testFp16WithFp32Vars'
    params = test_util.get_params(test_name)._replace(
        use_fp16=True, fp16_vars=False)
    self._test_distributed(test_name, 2, 2, params)

  def testFp16WithFp16Vars(self):
    test_name = 'testFp16WithFp16Vars'
    params = test_util.get_params(test_name)._replace(
        use_fp16=True, fp16_vars=True, fp16_loss_scale=1.)
    self._test_distributed(test_name, 2, 2, params)

  def testFp16Replicated(self):
    test_name = 'testFp16Replicated'
    params = test_util.get_params(test_name)._replace(
        use_fp16=True, variable_update='distributed_replicated')
    self._test_distributed(test_name, 2, 2, params)

  @unittest.skip('b/147310862: Fails for unknown reason')
  def testReplicatedRealData(self):
    test_name = 'testReplicatedRealData'
    imagenet_dir = os.path.join(platforms_util.get_test_data_dir(),
                                'fake_tf_record_data')
    params = test_util.get_params(test_name)._replace(
        variable_update='distributed_replicated',
        data_dir=imagenet_dir,
        data_name='imagenet')
    self._test_distributed(test_name, 2, 2, params)


class DistributedVariableUpdateTest(tf.test.TestCase):
  """Tests that variables are updated correctly in distributed mode."""

  def _test_variable_update(self,
                            test_name,
                            num_workers,
                            num_ps,
                            params,
                            num_controllers=0):
    """Tests variables are updated correctly when the given params are used."""
    output_dir_path = os.path.join(test_name, 'variable_update')
    logs = _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
                                      num_controllers, params)
    actual_losses = []
    for worker_logs in logs:
      outputs = test_util.get_training_outputs_from_logs(
          worker_logs, params.print_training_accuracy)
      actual_losses.append([x.loss for x in outputs])

    inputs = test_util.get_fake_var_update_inputs()
    expected_losses = test_util.TestCNNModel().manually_compute_losses(
        inputs, num_workers, params)
    if params.variable_update == 'distributed_all_reduce':
      # In distributed all reduce, each step, the controller outputs the average
      # of the loss from each worker. So we modify expected losses accordingly.
      # E.g, we change [[1, 2], [4, 5]] to [[2.5, 3.5]]
      expected_losses = [[
          sum(losses) / num_workers for losses in zip(*expected_losses)
      ]]
    rtol = 3e-2 if params.use_fp16 else 1e-5
    for worker_actual_losses, worker_expected_losses in zip(
        actual_losses, expected_losses):
      self.assertAllClose(worker_actual_losses[:len(worker_expected_losses)],
                          worker_expected_losses, rtol=rtol, atol=0.)

  def _test_variable_updates(self, test_name, params):
    """Tests variables are updated correctly with various variable updates."""

    # Unfortunately, distributed parameter server is non-deterministic with
    # multiple workers, because one worker may write to a variable before
    # another worker reads it. This probably does not harm training, but it
    # does mean we cannot easily test that case. So, we use one worker.
    self._test_variable_update(
        test_name + '_ps', num_workers=1, num_ps=2, num_controllers=0,
        params=params._replace(variable_update='parameter_server'))

    self._test_variable_update(
        test_name + '_rep', num_workers=2, num_ps=1, num_controllers=0,
        params=params._replace(variable_update='distributed_replicated'))

    self._test_variable_update(
        test_name + '_allreduce', num_workers=2, num_ps=0, num_controllers=1,
        params=params._replace(variable_update='distributed_all_reduce',
                               all_reduce_spec='psgpu#%d' % params.num_gpus))

  def testVarUpdateDefault(self):
    params = test_util.get_var_update_params()
    self._test_variable_updates('testVarUpdateDefault', params)

  def testVarUpdateCpuAsLocalParamDevice(self):
    params = test_util.get_var_update_params()._replace(
        local_parameter_device='cpu')
    self._test_variable_updates('testVarUpdateCpuAsLocalParamDevice', params)

  def testVarUpdateFp16(self):
    params = test_util.get_var_update_params()._replace(use_fp16=True)
    self._test_variable_updates('testVarUpdateFp16', params)

  def testVarUpdateResourceVars(self):
    params = test_util.get_var_update_params()._replace(use_resource_vars=True)
    self._test_variable_updates('testVarUpdateResourceVars', params)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
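The helpers above are only exercised through the test methods, but a single local (non-distributed) run can be reproduced directly with _spawn_benchmark_processes, since one worker with no ps or controller tasks falls back to local mode per its docstring. A hedged sketch, assuming the scripts directory is importable and platforms_util.get_test_output_dir() points somewhere writable; the output subdirectory name is a placeholder:

# Sketch of a manual local run using the helpers defined in this file.
import test_util

outputs = _spawn_benchmark_processes(
    output_dir_path='manual_local_run',  # placeholder subdirectory name
    num_workers=1, num_ps=0, num_controllers=0,
    params=test_util.get_params('manual_local_run'))
print(outputs[0][-5:])  # last few stdout lines from the single worker process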
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test_runner.py deleted 100644 → 0
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Used to run benchmark_cnn for distributed tests.

In distributed tests, we spawn processes to run tf_cnn_benchmark tasks. We could
directly spawn tf_cnn_benchmark processes, but we want some added functionality,
such as being able to inject custom images during training. So instead, this
file is spawned as a Python process, which supports the added functionality.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import flags as absl_flags
import numpy as np
import tensorflow.compat.v1 as tf

import benchmark_cnn
import flags
import preprocessing
import test_util


absl_flags.DEFINE_string('fake_input', 'none',
                         """What fake input to inject into benchmark_cnn. This
                         is ignored if --model=test_model.
                         Options are:
                         none: Do not inject any fake input.
                         zeros_and_ones: Half the images will be all 0s with
                         a label of 0. Half the images will be all 1s with a
                         label of 1.""")

flags.define_flags()
FLAGS = flags.FLAGS


def get_test_image_preprocessor(batch_size, params):
  """Returns the preprocessing.TestImagePreprocessor that should be injected.

  Returns None if no preprocessor should be injected.

  Args:
    batch_size: The batch size across all GPUs.
    params: BenchmarkCNN's parameters.

  Returns:
    Returns the preprocessing.TestImagePreprocessor that should be injected.

  Raises:
    ValueError: Flag --fake_input is an invalid value.
  """
  if FLAGS.fake_input == 'none':
    return None
  elif FLAGS.fake_input == 'zeros_and_ones':
    half_batch_size = batch_size // 2
    images = np.zeros((batch_size, 227, 227, 3), dtype=np.float32)
    images[half_batch_size:, :, :, :] = 1
    labels = np.array([0] * half_batch_size + [1] * half_batch_size,
                      dtype=np.int32)
    preprocessor = preprocessing.TestImagePreprocessor(
        batch_size, [227, 227, 3], params.num_gpus,
        benchmark_cnn.get_data_type(params))
    preprocessor.set_fake_data(images, labels)
    preprocessor.expected_subset = 'validation' if params.eval else 'train'
    return preprocessor
  else:
    raise ValueError('Invalid --fake_input: %s' % FLAGS.fake_input)


def run_with_real_model(params):
  """Runs tf_cnn_benchmarks with a real model."""
  bench = benchmark_cnn.BenchmarkCNN(params)
  bench.print_info()
  preprocessor = get_test_image_preprocessor(bench.batch_size, params)
  if preprocessor is not None:
    # The test image preprocessor requires queue runners. Since this file is
    # used for testing, it is OK to access protected members.
    # pylint: disable=protected-access
    bench.dataset._queue_runner_required = True
    # pylint: enable=protected-access
    bench.input_preprocessor = preprocessor
  bench.run()


def run_with_test_model(params):
  """Runs tf_cnn_benchmarks with a test model."""
  model = test_util.TestCNNModel()
  inputs = test_util.get_fake_var_update_inputs()
  with test_util.monkey_patch(benchmark_cnn,
                              LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
    bench = benchmark_cnn.BenchmarkCNN(params, dataset=test_util.TestDataSet(),
                                       model=model)
    # The test model does not use labels when computing loss, so the label
    # values do not matter as long as it's the right shape.
    labels = np.array([1] * inputs.shape[0])
    bench.input_preprocessor.set_fake_data(inputs, labels)
    bench.run()


def main(_):
  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  if params.model == 'test_model':
    run_with_test_model(params)
  else:
    run_with_real_model(params)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.app.run()
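As a quick sanity check of the zeros_and_ones branch above, the fake batch it builds can be reproduced with NumPy alone; the batch size here is a placeholder and the TestImagePreprocessor wiring is intentionally omitted:

# Standalone sketch of the zeros_and_ones fake input (placeholder batch size of 4).
import numpy as np

batch_size = 4
half = batch_size // 2
images = np.zeros((batch_size, 227, 227, 3), dtype=np.float32)
images[half:, :, :, :] = 1  # second half of the batch is all-ones images
labels = np.array([0] * half + [1] * half, dtype=np.int32)
assert images[0].max() == 0.0 and images[-1].min() == 1.0
assert labels.tolist() == [0, 0, 1, 1]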
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_test.py deleted 100644 → 0
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark_cnn."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
glob
import
os
import
re
import
unittest
import
mock
import
numpy
as
np
import
tensorflow.compat.v1
as
tf
from
google.protobuf
import
text_format
from
tensorflow.core.framework
import
step_stats_pb2
from
tensorflow.core.profiler
import
tfprof_log_pb2
from
tensorflow.python.platform
import
test
import
benchmark_cnn
import
datasets
import
flags
import
preprocessing
import
test_util
import
variable_mgr_util
from
platforms
import
util
as
platforms_util
def
_check_has_gpu
():
if
not
test
.
is_gpu_available
(
cuda_only
=
True
):
raise
ValueError
(
"""You have asked to run part or all of this on GPU, but it appears
that no GPU is available. If your machine has GPUs it is possible you
do not have a version of TensorFlow with GPU support. To build with GPU
support, add --config=cuda to the build flags.
\n
"""
)
class
TfCnnBenchmarksModelTest
(
tf
.
test
.
TestCase
):
"""Tests which are run with multiple models."""
def
setUp
(
self
):
super
(
TfCnnBenchmarksModelTest
,
self
).
setUp
()
benchmark_cnn
.
setup
(
benchmark_cnn
.
make_params
())
def
get_model_name
(
self
):
return
None
# Return true to run tests that don't need to be run on every model.
# This should be done for one or two cheap models.
def
extended_tests
(
self
):
return
False
# Return false to suppress actually running the model; this is useful
# for tests that are large.
def
model_execution_test
(
self
):
return
False
# Return false to suppress actually saving and loading the model.
def
model_save_load_test
(
self
):
return
False
def
testSaveLoadModel
(
self
):
_check_has_gpu
()
if
not
self
.
get_model_name
()
or
not
self
.
model_save_load_test
():
return
params
=
benchmark_cnn
.
make_params
(
model
=
self
.
get_model_name
(),
num_batches
=
1
,
num_intra_threads
=
0
,
num_inter_threads
=
0
,
distortions
=
False
,
batch_size
=
2
,
variable_update
=
'replicated'
,
num_warmup_batches
=
0
,
num_gpus
=
2
,
train_dir
=
test_util
.
get_temp_dir
(
'testSaveLoadModel_'
+
self
.
get_model_name
()))
# Run one batch and save the model.
# Note that this uses a non-test session.
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
run
()
self
.
assertEqual
(
bench
.
init_global_step
,
0
)
# Clear the default graph.
tf
.
reset_default_graph
()
# Test if checkpoint had been saved.
ckpt
=
tf
.
train
.
get_checkpoint_state
(
params
.
train_dir
)
match
=
re
.
match
(
os
.
path
.
join
(
params
.
train_dir
,
r
'model.ckpt-(\d+).index'
),
ckpt
.
model_checkpoint_path
+
'.index'
)
self
.
assertTrue
(
match
)
self
.
assertGreaterEqual
(
int
(
match
.
group
(
1
)),
params
.
num_batches
)
params
=
params
.
_replace
(
num_batches
=
2
)
# Reload the model
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
run
()
# Check if global step has been restored.
self
.
assertNotEqual
(
bench
.
init_global_step
,
0
)
ckpt
=
tf
.
train
.
get_checkpoint_state
(
params
.
train_dir
)
match
=
re
.
match
(
os
.
path
.
join
(
params
.
train_dir
,
r
'model.ckpt-(\d+).index'
),
ckpt
.
model_checkpoint_path
+
'.index'
)
self
.
assertTrue
(
match
)
self
.
assertGreaterEqual
(
int
(
match
.
group
(
1
)),
params
.
num_batches
)
# Check that the batch norm moving averages are restored from checkpoints
with
tf
.
Graph
().
as_default
():
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
_build_model
()
saver
=
tf
.
train
.
Saver
(
bench
.
variable_mgr
.
savable_variables
())
with
tf
.
Session
(
config
=
benchmark_cnn
.
create_config_proto
(
params
))
as
sess
:
benchmark_cnn
.
load_checkpoint
(
saver
,
sess
,
params
.
train_dir
)
sess
.
run
(
bench
.
variable_mgr
.
get_post_init_ops
())
bn_moving_vars
=
[
v
for
v
in
tf
.
global_variables
()
if
'/batchnorm'
in
v
.
name
and
'/moving'
in
v
.
name
]
self
.
assertGreater
(
len
(
bn_moving_vars
),
0
)
for
moving_var
in
bn_moving_vars
:
moving_var_value
=
sess
.
run
(
moving_var
)
# Check that the moving means and moving variances have been restored
# by asserting they are not their default values of 0 and 1,
# respectively
if
'/moving_mean'
in
moving_var
.
name
:
self
.
assertFalse
(
np
.
array_equal
(
moving_var_value
,
np
.
zeros
(
moving_var_value
.
shape
,
moving_var_value
.
dtype
)))
else
:
self
.
assertIn
(
'/moving_variance'
,
moving_var
.
name
)
self
.
assertFalse
(
np
.
array_equal
(
moving_var_value
,
np
.
ones
(
moving_var_value
.
shape
,
moving_var_value
.
dtype
)))
def
testModel
(
self
):
_check_has_gpu
()
if
not
self
.
get_model_name
()
or
not
self
.
model_execution_test
():
return
params
=
benchmark_cnn
.
make_params
(
model
=
self
.
get_model_name
(),
num_batches
=
1
,
num_intra_threads
=
1
,
num_inter_threads
=
12
,
batch_size
=
2
,
distortions
=
False
)
# Run this one; note that this uses a non-test session.
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
run
()
def
testSendRecvVariables
(
self
):
self
.
_testVariables
(
'parameter_server'
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'parameter_server'
,
local_parameter_device
=
'CPU'
)
self
.
_testVariables
(
'parameter_server'
,
optimizer
=
'sgd'
)
def
testReplicatedVariables
(
self
):
self
.
_testVariables
(
'replicated'
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'replicated'
,
all_reduce_spec
=
None
)
self
.
_testVariables
(
'replicated'
,
use_fp16
=
True
,
fp16_vars
=
False
)
self
.
_testVariables
(
'replicated'
,
all_reduce_spec
=
None
,
use_fp16
=
True
,
fp16_vars
=
False
,
fp16_enable_auto_loss_scale
=
True
,
fp16_inc_loss_scale_every_n
=
4
)
def
testIndependentVariables
(
self
):
self
.
_testVariables
(
'independent'
)
self
.
_testVariables
(
'independent'
,
all_reduce_spec
=
None
,
use_fp16
=
True
,
fp16_vars
=
False
,
fp16_enable_auto_loss_scale
=
True
,
fp16_inc_loss_scale_every_n
=
4
)
def
testSummaryVerbosity
(
self
):
self
.
_testVariables
(
'parameter_server'
,
summary_verbosity
=
1
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'parameter_server'
,
summary_verbosity
=
2
)
self
.
_testVariables
(
'parameter_server'
,
summary_verbosity
=
3
)
def
testStagedVariables
(
self
):
self
.
_testVariables
(
'parameter_server'
,
staged_vars
=
True
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'parameter_server'
,
staged_vars
=
True
,
local_parameter_device
=
'CPU'
)
self
.
_testVariables
(
'parameter_server'
,
staged_vars
=
True
,
use_fp16
=
True
,
fp16_vars
=
True
)
def
_assert_correct_var_type
(
self
,
var
,
params
):
if
'gpu_cached_inputs'
not
in
var
.
name
:
if
params
.
use_fp16
and
params
.
fp16_vars
and
'batchnorm'
not
in
var
.
name
:
expected_type
=
tf
.
float16
else
:
expected_type
=
tf
.
float32
self
.
assertEqual
(
var
.
dtype
.
base_dtype
,
expected_type
)
def
_testVariables
(
self
,
variable_update
,
summary_verbosity
=
0
,
local_parameter_device
=
'GPU'
,
staged_vars
=
False
,
optimizer
=
'momentum'
,
# TODO(b/80125832): Enable nccl in tests
# all_reduce_spec='nccl',
all_reduce_spec
=
''
,
use_fp16
=
False
,
fp16_vars
=
False
,
fp16_enable_auto_loss_scale
=
False
,
fp16_inc_loss_scale_every_n
=
10
):
if
not
self
.
get_model_name
():
return
_check_has_gpu
()
params
=
benchmark_cnn
.
make_params
(
model
=
self
.
get_model_name
(),
num_batches
=
1
,
num_intra_threads
=
1
,
num_inter_threads
=
12
,
distortions
=
False
,
variable_update
=
variable_update
,
local_parameter_device
=
local_parameter_device
,
num_gpus
=
2
,
summary_verbosity
=
summary_verbosity
,
staged_vars
=
staged_vars
,
optimizer
=
optimizer
,
all_reduce_spec
=
all_reduce_spec
,
compact_gradient_transfer
=
False
if
all_reduce_spec
==
'nccl'
else
True
,
use_fp16
=
use_fp16
,
fp16_loss_scale
=
2.
,
fp16_vars
=
fp16_vars
,
fp16_enable_auto_loss_scale
=
fp16_enable_auto_loss_scale
,
fp16_inc_loss_scale_every_n
=
fp16_inc_loss_scale_every_n
,
)
# Test building models using multiple GPUs, but don't
# run them.
with
self
.
test_session
(
graph
=
tf
.
Graph
()):
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
_build_model
()
# Rough validation of variable type and placement, depending on mode.
all_vars
=
tf
.
global_variables
()
+
tf
.
local_variables
()
if
params
.
variable_update
==
'parameter_server'
:
for
v
in
all_vars
:
tf
.
logging
.
debug
(
'var: %s'
%
v
.
name
)
match
=
re
.
match
(
r
'tower_(\d+)/v/gpu_cached_inputs:0'
,
v
.
name
)
if
match
:
self
.
assertEqual
(
v
.
device
,
'/device:GPU:%s'
%
match
.
group
(
1
))
elif
v
.
name
.
startswith
(
'v/'
):
self
.
assertEqual
(
v
.
device
,
'/device:%s:0'
%
local_parameter_device
)
self
.
_assert_correct_var_type
(
v
,
params
)
elif
v
.
name
in
(
'input_processing/images:0'
,
'input_processing/labels:0'
,
'init_learning_rate:0'
,
'global_step:0'
,
'loss_scale:0'
,
'loss_scale_normal_steps:0'
):
self
.
assertEqual
(
v
.
device
,
'/device:CPU:0'
)
else
:
raise
ValueError
(
'Unexpected variable %s'
%
v
.
name
)
else
:
v0_count
=
0
v1_count
=
0
for
v
in
all_vars
:
if
v
.
name
.
startswith
(
'tower_0/v0/'
):
self
.
assertEqual
(
v
.
name
,
'tower_0/v0/gpu_cached_inputs:0'
)
self
.
assertEqual
(
v
.
device
,
'/device:GPU:0'
)
elif
v
.
name
.
startswith
(
'tower_1/v1/'
):
self
.
assertEqual
(
v
.
name
,
'tower_1/v1/gpu_cached_inputs:0'
)
self
.
assertEqual
(
v
.
device
,
'/device:GPU:1'
)
elif
v
.
name
.
startswith
(
'v0/'
):
v0_count
+=
1
self
.
assertEqual
(
v
.
device
,
'/device:GPU:0'
)
self
.
_assert_correct_var_type
(
v
,
params
)
elif
v
.
name
.
startswith
(
'v1/'
):
v1_count
+=
1
self
.
assertEqual
(
v
.
device
,
'/device:GPU:1'
)
self
.
_assert_correct_var_type
(
v
,
params
)
elif
v
.
name
in
(
'input_processing/images:0'
,
'input_processing/labels:0'
,
'init_learning_rate:0'
,
'global_step:0'
,
'loss_scale:0'
,
'loss_scale_normal_steps:0'
):
self
.
assertEqual
(
v
.
device
,
'/device:CPU:0'
)
else
:
raise
ValueError
(
'Unexpected variable %s'
%
v
.
name
)
self
.
assertEqual
(
v0_count
,
v1_count
)
# Validate summary ops in the model depending on verbosity level
summary_ops
=
tf
.
get_collection
(
tf
.
GraphKeys
.
SUMMARIES
)
num_summary_ops
=
len
(
summary_ops
)
self
.
assertEqual
(
num_summary_ops
>
0
,
summary_verbosity
>
0
)
if
summary_verbosity
>
0
:
has_affine_histogram
=
False
has_gradient_histogram
=
False
has_log_gradients_histogram
=
False
for
op
in
summary_ops
:
if
'/gradients'
in
op
.
name
:
has_gradient_histogram
=
True
elif
'/affine'
in
op
.
name
:
has_affine_histogram
=
True
elif
'log_gradients'
in
op
.
name
:
has_log_gradients_histogram
=
True
self
.
assertEqual
(
summary_verbosity
>=
3
,
has_affine_histogram
)
self
.
assertEqual
(
summary_verbosity
>=
3
,
has_gradient_histogram
)
self
.
assertEqual
(
summary_verbosity
>=
2
,
has_log_gradients_histogram
)
if
summary_verbosity
==
1
:
self
.
assertLess
(
num_summary_ops
,
10
)
class
TrivialModelTest
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'trivial'
class
TestVgg1Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'vgg11'
class
TestVgg19Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'vgg19'
class
TestLenet5Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'lenet'
class
TestGooglenetModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'googlenet'
class
TestOverfeatModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'overfeat'
class
TestAlexnetModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'alexnet'
def
extended_tests
(
self
):
return
True
class
TestTrivialModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'trivial'
class
TestInceptionv3Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'inception3'
def
extended_tests
(
self
):
return
True
class
TestInceptionv4Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'inception4'
class
TestResnet50Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet50'
def
model_save_load_test
(
self
):
return
True
class
TestResnet101Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet101'
class
TestResnet152Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet152'
class
TestResnet50V2Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet50_v2'
class
TestResnet101V2Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet101_v2'
class
TestResnet152V2Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet152_v2'
class
TfCnnBenchmarksTest
(
tf
.
test
.
TestCase
):
"""Tests that benchmark_cnn runs correctly."""
def
setUp
(
self
):
super
(
TfCnnBenchmarksTest
,
self
).
setUp
()
_check_has_gpu
()
benchmark_cnn
.
setup
(
benchmark_cnn
.
make_params
())
def
_run_benchmark_cnn
(
self
,
params
):
logs
=
[]
benchmark_cnn
.
log_fn
=
test_util
.
print_and_add_to_list
(
logs
)
benchmark_cnn
.
BenchmarkCNN
(
params
).
run
()
return
logs
def
_run_benchmark_cnn_with_fake_images
(
self
,
params
,
images
,
labels
):
logs
=
[]
benchmark_cnn
.
log_fn
=
test_util
.
print_and_add_to_list
(
logs
)
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
input_preprocessor
=
preprocessing
.
TestImagePreprocessor
(
params
.
batch_size
*
params
.
num_gpus
,
[[
params
.
batch_size
,
227
,
227
,
3
],
[
params
.
batch_size
]],
params
.
num_gpus
,
bench
.
model
.
data_type
)
bench
.
dataset
.
_queue_runner_required
=
True
bench
.
input_preprocessor
.
set_fake_data
(
images
,
labels
)
bench
.
input_preprocessor
.
expected_subset
=
(
'validation'
if
params
.
eval
else
'train'
)
bench
.
run
()
return
logs
def
_run_benchmark_cnn_with_black_and_white_images
(
self
,
params
):
"""Runs BenchmarkCNN with black and white images.
A BenchmarkCNN is created and run with black and white images as input. Half
the images are black (i.e., filled with 0s) and half are white (i.e., filled
with 255s).
Args:
params: Params for BenchmarkCNN.
Returns:
A list of lines from the output of BenchmarkCNN.
"""
# TODO(reedwm): Instead of generating images here, use black and white
# tfrecords by calling test_util.create_black_and_white_images().
effective_batch_size
=
params
.
batch_size
*
params
.
num_gpus
half_batch_size
=
effective_batch_size
//
2
images
=
np
.
zeros
((
effective_batch_size
,
227
,
227
,
3
),
dtype
=
np
.
float32
)
images
[
half_batch_size
:,
:,
:,
:]
=
255
labels
=
np
.
array
([
0
]
*
half_batch_size
+
[
1
]
*
half_batch_size
,
dtype
=
np
.
int32
)
return
self
.
_run_benchmark_cnn_with_fake_images
(
params
,
images
,
labels
)
def
_train_and_eval_local
(
self
,
params
,
check_output_values
=
False
,
max_final_loss
=
10.
,
skip
=
None
,
use_test_preprocessor
=
True
):
# TODO(reedwm): check_output_values should default to True and be enabled
# on every test. Currently, if check_output_values=True and the calls to
# tf.set_random_seed(...) and np.seed(...) are passed certain seed values in
# benchmark_cnn.py, then most tests will fail. This indicates the tests
# are brittle and could fail with small changes when
# check_output_values=True, so check_output_values defaults to False for
# now.
def
run_fn
(
run_type
,
inner_params
):
del
run_type
if
use_test_preprocessor
:
return
[
self
.
_run_benchmark_cnn_with_black_and_white_images
(
inner_params
)
]
else
:
return
[
self
.
_run_benchmark_cnn
(
inner_params
)]
return
test_util
.
train_and_eval
(
self
,
run_fn
,
params
,
check_output_values
=
check_output_values
,
max_final_loss
=
max_final_loss
,
skip
=
skip
)
def
testAlexnet
(
self
):
params
=
test_util
.
get_params
(
'testAlexnet'
).
_replace
(
num_batches
=
30
,
init_learning_rate
=
0.01
,
model
=
'alexnet'
)
self
.
_train_and_eval_local
(
params
)
def
testNoPrintAccuracy
(
self
):
params
=
test_util
.
get_params
(
'testNoPrintAccuracy'
).
_replace
(
print_training_accuracy
=
False
)
self
.
_train_and_eval_local
(
params
)
def
testLowAccuracy
(
self
):
params
=
test_util
.
get_params
(
'testLowAccuracy'
).
_replace
(
print_training_accuracy
=
True
,
batch_size
=
5
,
num_batches
=
10
)
# We force low accuracy by having each batch containing 10 identical images,
# each with a different label. This guarantees a top-1 accuracy of exactly
# 0.1 and a top-5 accuracy of exactly 0.5.
images
=
np
.
zeros
((
10
,
227
,
227
,
3
),
dtype
=
np
.
float32
)
labels
=
np
.
arange
(
10
,
dtype
=
np
.
int32
)
logs
=
self
.
_run_benchmark_cnn_with_fake_images
(
params
,
images
,
labels
)
training_outputs
=
test_util
.
get_training_outputs_from_logs
(
logs
,
params
.
print_training_accuracy
)
last_output
=
training_outputs
[
-
1
]
# TODO(reedwm): These should be assertEqual but for some reason,
# occasionally the accuracies are lower (Running this test 500 times, these
# asserts failed twice). Investigate this problem.
self
.
assertLessEqual
(
last_output
.
top_1_accuracy
,
0.1
)
self
.
assertLessEqual
(
last_output
.
top_5_accuracy
,
0.5
)
def
testParameterServer
(
self
):
params
=
test_util
.
get_params
(
'testParameterServer'
)
self
.
_train_and_eval_local
(
params
)
def
testParameterServerStaged
(
self
):
params
=
test_util
.
get_params
(
'testParameterServerStaged'
).
_replace
(
staged_vars
=
True
)
self
.
_train_and_eval_local
(
params
)
def
testReplicated
(
self
):
params
=
test_util
.
get_params
(
'testReplicated'
).
_replace
(
variable_update
=
'replicated'
)
self
.
_train_and_eval_local
(
params
)
def
testIndependent
(
self
):
params
=
test_util
.
get_params
(
'testIndependent'
).
_replace
(
variable_update
=
'independent'
)
self
.
_train_and_eval_local
(
params
)
def
testForwardOnly
(
self
):
params
=
test_util
.
get_params
(
'testForwardOnly'
).
_replace
(
forward_only
=
True
)
# Evaluation is not supported with --forward_only, so we set skip='eval'.
self
.
_train_and_eval_local
(
params
,
skip
=
'eval'
)
def
testForwardOnlyAndFreeze
(
self
):
params
=
test_util
.
get_params
(
'testForwardOnlyAndFreeze'
).
_replace
(
forward_only
=
True
,
freeze_when_forward_only
=
True
,
train_dir
=
None
)
# Training is not supported with --freeze_when_forward_only.
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
def
testNoDistortions
(
self
):
params
=
test_util
.
get_params
(
'testNoDistortions'
).
_replace
(
distortions
=
False
)
self
.
_train_and_eval_local
(
params
)
def
testCpuAsLocalParamDevice
(
self
):
params
=
test_util
.
get_params
(
'testCpuAsLocalParamDevice'
).
_replace
(
local_parameter_device
=
'cpu'
)
self
.
_train_and_eval_local
(
params
)
def
testNHWC
(
self
):
params
=
test_util
.
get_params
(
'testNHWC'
).
_replace
(
data_format
=
'NHWC'
)
self
.
_train_and_eval_local
(
params
)
def
testCpuAsDevice
(
self
):
params
=
test_util
.
get_params
(
'testCpuAsDevice'
).
_replace
(
device
=
'cpu'
,
data_format
=
'NHWC'
)
# NHWC required when --device=cpu
self
.
_train_and_eval_local
(
params
)
def
testMomentumParameterServer
(
self
):
params
=
test_util
.
get_params
(
'testMomentumParameterServer'
).
_replace
(
optimizer
=
'momentum'
,
momentum
=
0.8
)
self
.
_train_and_eval_local
(
params
)
def
testRmspropReplicated
(
self
):
params
=
test_util
.
get_params
(
'testRmspropReplicated'
).
_replace
(
variable_update
=
'replicated'
,
optimizer
=
'rmsprop'
,
rmsprop_decay
=
0.8
,
rmsprop_momentum
=
0.6
,
rmsprop_epsilon
=
0.7
,
init_learning_rate
=
0.01
)
self
.
_train_and_eval_local
(
params
)
def
testBatchGroupSize
(
self
):
params
=
test_util
.
get_params
(
'testBatchGroupSize'
).
_replace
(
batch_group_size
=
4
,
num_batches
=
100
,
num_warmup_batches
=
5
)
self
.
_train_and_eval_local
(
params
)
def
testGradientClip
(
self
):
params
=
test_util
.
get_params
(
'testGradientClip'
).
_replace
(
gradient_clip
=
100.0
)
self
.
_train_and_eval_local
(
params
)
def
testWeightDecay
(
self
):
params
=
test_util
.
get_params
(
'testWeightDecay'
).
_replace
(
weight_decay
=
0.0001
)
self
.
_train_and_eval_local
(
params
)
def
testNoLayers
(
self
):
params
=
test_util
.
get_params
(
'testNoLayers'
).
_replace
(
use_tf_layers
=
False
)
self
.
_train_and_eval_local
(
params
)
def
testSaveModelSteps
(
self
):
params
=
test_util
.
get_params
(
'testSaveModelSteps'
).
_replace
(
save_model_steps
=
2
,
num_warmup_batches
=
0
,
num_batches
=
10
,
max_ckpts_to_keep
=
3
)
self
.
_train_and_eval_local
(
params
)
for
i
in
range
(
1
,
20
+
1
):
# We train for 20 steps, since self._train_and_eval_local() does two
# training runs of 10 steps each. We save a checkpoint every 2 steps and
# keep the last 3 checkpoints, so at the end, we should have checkpoints
# for steps 16, 18, and 20.
matches
=
glob
.
glob
(
os
.
path
.
join
(
params
.
train_dir
,
'model.ckpt-{}.*'
.
format
(
i
)))
if
i
in
(
16
,
18
,
20
):
self
.
assertTrue
(
matches
)
else
:
self
.
assertFalse
(
matches
)
def
testFp16WithFp32Vars
(
self
):
params
=
test_util
.
get_params
(
'testFp16WithFp32Vars'
).
_replace
(
use_fp16
=
True
,
fp16_vars
=
False
,
fp16_loss_scale
=
1.
)
self
.
_train_and_eval_local
(
params
)
def
testFp16WithFp16Vars
(
self
):
params
=
test_util
.
get_params
(
'testFp16WithFp16Vars'
).
_replace
(
use_fp16
=
True
,
fp16_vars
=
True
)
self
.
_train_and_eval_local
(
params
)
def
testXlaCompile
(
self
):
params
=
test_util
.
get_params
(
'testXlaCompile'
).
_replace
(
xla_compile
=
True
)
self
.
_train_and_eval_local
(
params
)
@
unittest
.
skip
(
'Fails for unknown reason'
)
def
testXlaCompileWithFp16
(
self
):
params
=
test_util
.
get_params
(
'testXlaCompileWithFp16'
).
_replace
(
use_fp16
=
True
,
xla_compile
=
True
)
self
.
_train_and_eval_local
(
params
)
def
testGradientRepacking
(
self
):
params
=
test_util
.
get_params
(
'testGradientRepacking1'
).
_replace
(
gradient_repacking
=
2
)
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
params
=
test_util
.
get_params
(
'testGradientRepacking2'
).
_replace
(
gradient_repacking
=
2
,
use_fp16
=
True
)
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
def
testTraceFileChromeTraceFormat
(
self
):
trace_file
=
os
.
path
.
join
(
self
.
get_temp_dir
(),
'testTraceFileChromeTraceFormat_tracefile'
)
params
=
test_util
.
get_params
(
'testTraceFileChromeTraceFormat'
).
_replace
(
trace_file
=
trace_file
,
use_chrome_trace_format
=
True
)
self
.
_train_and_eval_local
(
params
)
self
.
assertGreater
(
os
.
stat
(
trace_file
).
st_size
,
0
)
def
testTraceFileStepStatsProto
(
self
):
trace_file
=
os
.
path
.
join
(
self
.
get_temp_dir
(),
'testTraceFileStepStatsProto_tracefile'
)
params
=
test_util
.
get_params
(
'testTraceFileStepStatsProto'
).
_replace
(
trace_file
=
trace_file
,
use_chrome_trace_format
=
False
)
self
.
_train_and_eval_local
(
params
)
self
.
assertGreater
(
os
.
stat
(
trace_file
).
st_size
,
0
)
with
open
(
trace_file
)
as
f
:
step_stats
=
step_stats_pb2
.
StepStats
()
# The following statement should not raise an exception.
contents
=
f
.
read
()
text_format
.
Merge
(
contents
,
step_stats
)
def
testTfprofFile
(
self
):
tfprof_file
=
os
.
path
.
join
(
self
.
get_temp_dir
(),
'testTfprofFile_tfproffile'
)
params
=
test_util
.
get_params
(
'testTfprofFile'
).
_replace
(
tfprof_file
=
tfprof_file
)
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
self
.
assertGreater
(
os
.
stat
(
tfprof_file
).
st_size
,
0
)
with
open
(
tfprof_file
,
'rb'
)
as
f
:
profile_proto
=
tfprof_log_pb2
.
ProfileProto
()
# The following statement should not raise an exception.
profile_proto
.
ParseFromString
(
f
.
read
())
@
unittest
.
skip
(
'Fails for unknown reason'
)
def
testMoveTrainDir
(
self
):
params
=
test_util
.
get_params
(
'testMoveTrainDir'
)
self
.
_train_and_eval_local
(
params
)
new_train_dir
=
params
.
train_dir
+
'_moved'
os
.
rename
(
params
.
train_dir
,
new_train_dir
)
params
=
params
.
_replace
(
train_dir
=
new_train_dir
,
eval
=
True
)
self
.
_run_benchmark_cnn_with_black_and_white_images
(
params
)
@
mock
.
patch
(
'tensorflow.compat.v1.train.Saver'
)
@
mock
.
patch
(
'benchmark_cnn._get_checkpoint_to_load'
)
def
testLoadCheckpoint
(
self
,
mock_checkpoint_to_load
,
mock_saver
):
"""Tests load checkpoint with full path to checkpoint."""
expected_checkpoint
=
'/path/to/checkpoints/model.ckpt-1243'
mock_checkpoint_to_load
.
return_value
=
expected_checkpoint
global_batch
=
benchmark_cnn
.
load_checkpoint
(
mock_saver
,
None
,
expected_checkpoint
)
self
.
assertEqual
(
global_batch
,
1243
)
def
testGetCheckpointToLoadFullPath
(
self
):
"""Tests passing full path."""
ckpt_path
=
'/foo/bar/model.ckpt-189'
full_path
=
benchmark_cnn
.
_get_checkpoint_to_load
(
ckpt_path
)
self
.
assertEqual
(
full_path
,
ckpt_path
)
def
testGetCheckpointToLoadException
(
self
):
"""Tests exception for directory without a checkpoint."""
ckpt_path
=
'/foo/bar/checkpoints'
self
.
assertRaises
(
benchmark_cnn
.
CheckpointNotFoundException
,
benchmark_cnn
.
_get_checkpoint_to_load
,
ckpt_path
)
@
mock
.
patch
(
'tensorflow.compat.v1.train.get_checkpoint_state'
)
def
testGetCheckpointToLoad
(
self
,
mock_checkpoint_state
):
"""Tests passing path to checkpoint folder."""
expected_checkpoint
=
'/path/to/checkpoints/model.ckpt-1243'
mock_checkpoint_state
.
return_value
=
mock
.
Mock
(
model_checkpoint_path
=
expected_checkpoint
)
ckpt_path
=
'/path/to/checkpoints/'
full_path
=
benchmark_cnn
.
_get_checkpoint_to_load
(
ckpt_path
)
self
.
assertEqual
(
full_path
,
expected_checkpoint
)
def
testImagenetPreprocessor
(
self
):
imagenet_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
)
params
=
test_util
.
get_params
(
'testImagenetPreprocessor'
).
_replace
(
data_dir
=
imagenet_dir
,
data_name
=
'imagenet'
)
self
.
_train_and_eval_local
(
params
,
use_test_preprocessor
=
False
)
def
testImagenetPreprocessorNoDistortions
(
self
):
imagenet_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
)
params
=
test_util
.
get_params
(
'testImagenetPreprocessorNoDistortions'
).
_replace
(
data_dir
=
imagenet_dir
,
data_name
=
'imagenet'
,
distortions
=
False
)
self
.
_train_and_eval_local
(
params
,
use_test_preprocessor
=
False
)
def
testImagenetPreprocessorVerboseSummary
(
self
):
imagenet_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
)
params
=
test_util
.
get_params
(
'testImagenetPreprocessorVerboseSummary'
).
_replace
(
data_dir
=
imagenet_dir
,
data_name
=
'imagenet'
,
distortions
=
False
,
summary_verbosity
=
2
)
self
.
_train_and_eval_local
(
params
,
use_test_preprocessor
=
False
)
def
testCifar10SyntheticData
(
self
):
params
=
test_util
.
get_params
(
'testCifar10SyntheticData'
).
_replace
(
data_name
=
'cifar10'
)
self
.
_train_and_eval_local
(
params
)
def
testShiftRatio
(
self
):
test_util
.
monkey_patch_base_cluster_manager
()
params
=
benchmark_cnn
.
make_params
(
data_name
=
'imagenet'
,
data_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
),
job_name
=
'worker'
,
worker_hosts
=
'w1,w2,w3,w4'
,
ps_hosts
=
'p1'
,
task_index
=
0
)
self
.
assertEqual
(
benchmark_cnn
.
BenchmarkCNN
(
params
).
input_preprocessor
.
shift_ratio
,
0.0
)
params
=
params
.
_replace
(
task_index
=
3
)
self
.
assertEqual
(
benchmark_cnn
.
BenchmarkCNN
(
params
).
input_preprocessor
.
shift_ratio
,
0.75
)
def
testDistributedReplicatedSavableVars
(
self
):
test_util
.
monkey_patch_base_cluster_manager
()
params
=
benchmark_cnn
.
make_params
(
variable_update
=
'distributed_replicated'
,
model
=
'inception4'
,
data_name
=
'imagenet'
,
data_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
),
job_name
=
'worker'
,
worker_hosts
=
'w1,w2,w3,w4'
,
ps_hosts
=
'p1'
,
datasets_use_prefetch
=
False
)
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
with
tf
.
Graph
().
as_default
():
bench
.
_build_model
()
savable_vars
=
bench
.
variable_mgr
.
savable_variables
()
# Assert all global variables are in savable_vars
for
v
in
tf
.
global_variables
():
if
not
v
.
name
.
startswith
(
variable_mgr_util
.
PS_SHADOW_VAR_PREFIX
+
'/v0'
):
self
.
assertEqual
(
v
.
name
,
'global_step:0'
)
name
=
bench
.
variable_mgr
.
_strip_port
(
v
.
name
)
if
name
.
startswith
(
variable_mgr_util
.
PS_SHADOW_VAR_PREFIX
):
name
=
name
[
len
(
variable_mgr_util
.
PS_SHADOW_VAR_PREFIX
+
'/'
):]
self
.
assertIn
(
name
,
savable_vars
)
self
.
assertIn
(
savable_vars
[
name
],
tf
.
global_variables
())
# Assert all local variables on the first tower are in savable_vars
for
v
in
tf
.
local_variables
():
if
v
.
name
.
startswith
(
'v0/'
):
name
=
bench
.
variable_mgr
.
_strip_port
(
v
.
name
)
self
.
assertIn
(
name
,
savable_vars
)
def
_test_preprocessing_eval
(
self
,
image_height
,
image_width
,
output_height
,
output_width
):
image
=
tf
.
fill
((
image_height
,
image_width
,
3
),
tf
.
constant
(
128
,
dtype
=
tf
.
uint8
))
params
=
benchmark_cnn
.
make_params
()
new_image
=
preprocessing
.
eval_image
(
image
,
output_height
,
output_width
,
0
,
'bilinear'
,
params
.
summary_verbosity
)
with
self
.
test_session
()
as
sess
:
new_image_value
=
sess
.
run
(
new_image
)
self
.
assertAllEqual
(
new_image_value
,
np
.
full
((
output_height
,
output_width
,
3
),
128
,
dtype
=
np
.
uint8
))
  def testPreprocessingEval(self):
    self._test_preprocessing_eval(10, 10, 4, 4)
    self._test_preprocessing_eval(4, 4, 10, 10)
    self._test_preprocessing_eval(1, 100, 100, 1)
    self._test_preprocessing_eval(100, 1, 1, 100)
    self._test_preprocessing_eval(1, 100, 1, 100)

  def _test_preprocessing_traing(self, image_buf, image_color, output_height,
                                 output_width, bbox, batch_position,
                                 resize_method, distortions,
                                 summary_verbosity, fuse_decode_and_crop):
    new_image = preprocessing.train_image(
        image_buf,
        output_height,
        output_width,
        bbox,
        batch_position,
        resize_method,
        distortions,
        summary_verbosity=summary_verbosity,
        fuse_decode_and_crop=fuse_decode_and_crop)
    self.assertEqual(new_image.shape, [output_height, output_width, 3])
    with self.test_session(use_gpu=True) as sess:
      new_image_value = sess.run(new_image)
    self.assertAllClose(
        new_image_value,
        np.full([output_height, output_width, 3], image_color,
                dtype=np.float32),
        atol=50.,
        rtol=0.)

  def testPreprocessingTrain(self):
    test_data_dir = os.path.join(platforms_util.get_test_data_dir(), 'images')
    black_file = os.path.join(test_data_dir, 'black_image.jpg')
    with open(black_file, 'rb') as f:
      black_jpg_buffer = f.read()
    white_file = os.path.join(test_data_dir, 'white_image.jpg')
    with open(white_file, 'rb') as f:
      white_jpg_buffer = f.read()
    bbox = tf.zeros((1, 0, 4), dtype=tf.float32)
    batch_position = 0
    # Each size config is (output_height, output_width, resize_method)
    size_configs = [(100, 100, 'round_robin'), (150, 10, 'bilinear'),
                    (10, 150, 'nearest')]
    # Each image config is (image_buf, image_color)
    image_configs = [(white_jpg_buffer, 255), (black_jpg_buffer, 0)]
    for (image_buf, image_color) in image_configs:
      for output_height, output_width, resize_method in size_configs:
        for distortions in [True, False]:
          for summary_verbosity in [0, 2]:
            for fuse_decode_and_crop in [True, False]:
              self._test_preprocessing_traing(
                  image_buf, image_color, output_height, output_width, bbox,
                  batch_position, resize_method, distortions,
                  summary_verbosity, fuse_decode_and_crop)

  def _test_learning_rate(self, params, global_step_to_expected_learning_rate):
    self.longMessage = True  # pylint: disable=invalid-name
    bench = benchmark_cnn.BenchmarkCNN(params)
    with tf.Graph().as_default() as graph:
      bench._build_model()
      global_step = graph.get_tensor_by_name('global_step:0')
      learning_rate = graph.get_tensor_by_name('learning_rate_tensor:0')
      with self.test_session(graph=graph, use_gpu=True) as sess:
        items = global_step_to_expected_learning_rate.items()
        for global_step_val, expected_learning_rate in items:
          self.assertAlmostEqual(
              sess.run(learning_rate, {global_step: global_step_val}),
              expected_learning_rate,
              msg='at global_step:{}'.format(global_step_val))

  def testLearningRateModelSpecificResNet(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       batch_size=256,
                                       variable_update='parameter_server',
                                       num_gpus=1)
    self._test_learning_rate(params, {
        0: 0,
        150136: 0.128,
        150137: 0.0128,
        300273: 0.0128,
        300274: 0.00128,
        10000000: 0.0000128
    })

  def testLearningRateUserProvidedInitLr(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       batch_size=256,
                                       variable_update='replicated',
                                       init_learning_rate=1.)
    self._test_learning_rate(params, {
        0: 1.,
        10000000: 1.
    })

  def testLearningRateUserProvidedInitLrAndWarmup(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       batch_size=256,
                                       variable_update='replicated',
                                       init_learning_rate=1.,
                                       num_learning_rate_warmup_epochs=5)
    self._test_learning_rate(params, {
        0: 0.,
        12511: 0.5,
        25022: 1.,
        10000000: 1.
    })

  def testLearningRateUserProvidedDecayInfo(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       init_learning_rate=1.,
                                       learning_rate_decay_factor=0.5,
                                       num_epochs_per_decay=2,
                                       minimum_learning_rate=0.3750,
                                       batch_size=32)
    self._test_learning_rate(params, {
        0: 1.,
        80071: 1.,
        80072: 0.5,
        160143: 0.5,
        160144: 0.375,
        10000000: 0.375
    })

  def testLearningRateUserProvidedZeroDecay(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       num_learning_rate_warmup_epochs=0,
                                       learning_rate_decay_factor=0.5,
                                       num_epochs_per_decay=0,
                                       minimum_learning_rate=0.3750,
                                       batch_size=32)
    with self.assertRaises(ValueError):
      with tf.Graph().as_default():
        # This will fail because params.learning_rate_decay_factor cannot be
        # nonzero if params.num_epochs_per_decay is zero.
        benchmark_cnn.BenchmarkCNN(params)._build_model()

  def testLearningRateUserProvidedSchedule(self):
    params = benchmark_cnn.make_params(
        model='trivial',
        batch_size=32,
        piecewise_learning_rate_schedule='1;3;.1;5;.01')
    self._test_learning_rate(params, {
        0: 1.,
        120108: 1.,
        120109: 0.1,
        200181: 0.1,
        200182: 0.01,
        100000000: 0.01
    })

  def testNumBatchesAndEpochs(self):
    params = benchmark_cnn.make_params()
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 10, 100)
    self.assertEqual(batches, benchmark_cnn._DEFAULT_NUM_BATCHES)
    self.assertAlmostEqual(epochs,
                           float(benchmark_cnn._DEFAULT_NUM_BATCHES) / 10)
    params = benchmark_cnn.make_params(num_batches=21)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 25, 50)
    self.assertEqual(batches, 21)
    self.assertAlmostEqual(epochs, 10.5)
    params = benchmark_cnn.make_params(num_epochs=3)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3)
    self.assertEqual(batches, 5)
    self.assertAlmostEqual(epochs, 10. / 3.)
    params = benchmark_cnn.make_params(num_epochs=4)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3)
    self.assertEqual(batches, 6)
    self.assertAlmostEqual(epochs, 4)
    with self.assertRaises(ValueError):
      params = benchmark_cnn.make_params(num_batches=100, num_epochs=100)
      benchmark_cnn.get_num_batches_and_epochs(params, 1, 1)

  def _testEvalDuringTraining(self, params, expected_num_eval_batches_found):
    # The idea of this test is that all train images are black and all eval
    # images are white. We pass the images through the TestModel, and ensure
    # the outputs are as expected.
    batch_size = params.batch_size
    eval_batch_size = params.eval_batch_size or params.batch_size

    class TestModel(test_util.TestCNNModel):

      def __init__(self):
        super(TestModel, self).__init__()
        self.depth = 3

      def add_inference(self, cnn):
        if cnn.phase_train:
          # This will allow us to test that 100 is only added during training
          # and not during eval.
          cnn.top_layer += 100
          assert cnn.top_layer.shape[0] == batch_size
        else:
          assert cnn.top_layer.shape[0] == eval_batch_size
        # Reduce the image to a single number. The number should be (-1 + 100)
        # during training and 1 during testing.
        cnn.top_layer = tf.reshape(cnn.top_layer, (cnn.top_layer.shape[0], -1))
        cnn.top_layer = tf.reduce_mean(cnn.top_layer, axis=1)
        cnn.top_layer = tf.reshape(cnn.top_layer,
                                   (cnn.top_layer.shape[0], 1, 1, 1))
        cnn.top_size = 1
        trainable_vars = tf.trainable_variables()
        # The super method will compute image*A*B, where A=1 and B=2.
        super(TestModel, self).add_inference(cnn)
        if not cnn.phase_train:
          # Assert no new variables were added, since they should be reused from
          # training.
          assert len(trainable_vars) == len(tf.trainable_variables())

    model = TestModel()
    dataset = datasets.ImagenetDataset(params.data_dir)
    logs = []
    bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=model, dataset=dataset)
    with test_util.monkey_patch(benchmark_cnn,
                                log_fn=test_util.print_and_add_to_list(logs)):
      bench_cnn.run()
    training_outputs = test_util.get_training_outputs_from_logs(
        logs, print_training_accuracy=False)
    self.assertEqual(len(training_outputs), params.num_batches)
    expected_training_output = (-1 + 100) * 1 * 2
    for training_output in training_outputs:
      self.assertEqual(training_output.loss, expected_training_output)
    eval_outputs = test_util.get_evaluation_outputs_from_logs(logs)
    self.assertTrue(eval_outputs)
    expected_eval_output = 1 * 1 * 2
    for eval_output in eval_outputs:
      self.assertEqual(eval_output.top_1_accuracy, expected_eval_output)
      self.assertEqual(eval_output.top_5_accuracy, expected_eval_output)
    num_eval_batches_found = 0
    eval_batch_regex = re.compile(r'^\d+\t[0-9.]+ examples/sec$')
    for log in logs:
      if eval_batch_regex.match(log):
        num_eval_batches_found += 1
    self.assertEqual(num_eval_batches_found, expected_num_eval_batches_found)

  def testEvalDuringTraining(self):
    data_dir = test_util.create_black_and_white_images()
    base_params = test_util.get_params('testEvalDuringTraining')
    train_dir = base_params.train_dir
    base_params = base_params._replace(
        train_dir=None, print_training_accuracy=False, num_warmup_batches=0,
        num_batches=7, num_eval_batches=2, display_every=1,
        init_learning_rate=0, weight_decay=0, distortions=False,
        data_dir=data_dir)
    expected_num_eval_batches_found = (
        base_params.num_eval_batches * (base_params.num_batches // 2 + 1))

    # Test --eval_during_training_every_n_steps
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='parameter_server'),
        expected_num_eval_batches_found)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated'),
        expected_num_eval_batches_found)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated',
                             summary_verbosity=2,
                             save_summaries_steps=2,
                             datasets_use_prefetch=False),
        expected_num_eval_batches_found)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated',
                             use_fp16=True,
                             train_dir=train_dir,
                             eval_batch_size=base_params.batch_size + 2),
        expected_num_eval_batches_found)

    # Test --eval_during_training_every_n_epochs
    every_n_epochs = (2 * base_params.batch_size * base_params.num_gpus /
                      datasets.IMAGENET_NUM_TRAIN_IMAGES)
    self._testEvalDuringTraining(
        base_params._replace(
            eval_during_training_every_n_epochs=every_n_epochs,
            variable_update='replicated'),
        expected_num_eval_batches_found)

    # Test --eval_during_training_at_specified_steps
    list_steps = [2, 3, 5, 7, 1000]
    num_eval_steps = 1 + sum(
        1 for step in list_steps if step < base_params.num_batches)
    expected_num_eval_batches_found = (
        base_params.num_eval_batches * num_eval_steps)
    self._testEvalDuringTraining(
        base_params._replace(
            eval_during_training_at_specified_steps=list_steps,
            variable_update='replicated'),
        expected_num_eval_batches_found)

    # Test --eval_during_training_at_specified_epochs
    list_epochs = [(step * base_params.batch_size * base_params.num_gpus /
                    datasets.IMAGENET_NUM_TRAIN_IMAGES)
                   for step in list_steps]
    self._testEvalDuringTraining(
        base_params._replace(
            eval_during_training_at_specified_epochs=list_epochs,
            variable_update='replicated'),
        expected_num_eval_batches_found)

    # Test --eval_during_training_every_n_steps runs with synthetic data.
    params = base_params._replace(variable_update='replicated', data_dir=None,
                                  eval_during_training_every_n_steps=2,
                                  num_batches=2)
    benchmark_cnn.BenchmarkCNN(params).run()

  def testEvalDuringTrainingNumEpochs(self):
    params = benchmark_cnn.make_params(
        batch_size=1, eval_batch_size=2, eval_during_training_every_n_steps=1,
        num_batches=30,
        num_eval_epochs=100 / datasets.IMAGENET_NUM_VAL_IMAGES)
    bench_cnn = benchmark_cnn.BenchmarkCNN(params)
    self.assertEqual(bench_cnn.num_batches, 30)
    self.assertAlmostEqual(bench_cnn.num_epochs,
                           30 / datasets.IMAGENET_NUM_TRAIN_IMAGES)
    self.assertAlmostEqual(bench_cnn.num_eval_batches, 50)
    self.assertAlmostEqual(bench_cnn.num_eval_epochs,
                           100 / datasets.IMAGENET_NUM_VAL_IMAGES)

  def testEarlyStopping(self):
    params = benchmark_cnn.make_params(
        batch_size=2,
        display_every=1,
        num_batches=100,
        eval_during_training_every_n_steps=2,
        stop_at_top_1_accuracy=0.4,
    )
    with mock.patch.object(benchmark_cnn.BenchmarkCNN, '_eval_once',
                           side_effect=[(0.1, 0.1), (0.5, 0.5), (0.2, 0.2)]
                          ) as mock_eval_once:
      logs = []
      bench_cnn = benchmark_cnn.BenchmarkCNN(params)
      with test_util.monkey_patch(
          benchmark_cnn, log_fn=test_util.print_and_add_to_list(logs)):
        bench_cnn.run()
      training_outputs = test_util.get_training_outputs_from_logs(
          logs, print_training_accuracy=False)
      # We should stop after the second evaluation, and we evaluate every 2
      # steps. So there should be 2 * 2 = 4 training outputs.
      self.assertEqual(len(training_outputs), 4)
      self.assertEqual(mock_eval_once.call_count, 2)

  def testOutOfRangeErrorsAreNotIgnored(self):
    error_msg = 'Fake OutOfRangeError error message'
    with mock.patch.object(
        benchmark_cnn.BenchmarkCNN, 'benchmark_with_session',
        side_effect=tf.errors.OutOfRangeError(None, None, error_msg)):
      with self.assertRaisesRegex(RuntimeError, error_msg):
        benchmark_cnn.BenchmarkCNN(benchmark_cnn.make_params()).run()

  def testInvalidFlags(self):
    params = benchmark_cnn.make_params(device='cpu', data_format='NCHW')
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)
    params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                       variable_update='replicated',
                                       all_reduce_spec='nccl')
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)
    # Automatic loss scaling is only supported for 'replicated', 'ps',
    # and 'independent' variable_updates.
    invalid_variable_updates = [
        'distributed_replicated', 'distributed_all_reduce'
    ]
    for variable_update in invalid_variable_updates:
      params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                         fp16_enable_auto_loss_scale=True,
                                         variable_update=variable_update)
      with self.assertRaises(ValueError):
        benchmark_cnn.BenchmarkCNN(params)
    # Automatic loss scaling is not supported for 'nccl'.
    params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                       fp16_enable_auto_loss_scale=True,
                                       all_reduce_spec='nccl')
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)
    # Automatic loss scaling is not supported for 'staged_vars'.
    params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                       fp16_enable_auto_loss_scale=True,
                                       staged_vars=True)
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)

  def testMakeParams(self):
    default_params = benchmark_cnn.make_params()
    self.assertEqual(default_params.model,
                     flags.param_specs['model'].default_value)
    params = benchmark_cnn.make_params(model='foo')
    self.assertEqual(params.model, 'foo')
    with self.assertRaises(ValueError):
      benchmark_cnn.make_params(job_name='foo')
    with self.assertRaises(ValueError):
      benchmark_cnn.make_params(gpu_memory_frac_for_testing=-1.)


class VariableUpdateTest(tf.test.TestCase):
  """Tests that variables are updated correctly.

  These tests use a very simple deterministic model. For example, some tests use
  the model

    loss = image * A * B

  where image is a 1x1 images (with a single scalar value), and A and B are
  scalar variables. Tests will run tf_cnn_benchmarks with such a model, on a
  sequence of scalar images, and assert that the losses are the correct value.
  Since the losses depend on the variables, this indirectly tests variables are
  updated correctly.
  """

  def setUp(self):
    super(VariableUpdateTest, self).setUp()
    _check_has_gpu()
    benchmark_cnn.setup(benchmark_cnn.make_params())

  def _get_benchmark_cnn_losses(self, inputs, params):
    """Returns the losses of BenchmarkCNN on the given inputs and params."""
    logs = []
    model = test_util.TestCNNModel()
    with test_util.monkey_patch(
        benchmark_cnn,
        log_fn=test_util.print_and_add_to_list(logs),
        LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
      bench = benchmark_cnn.BenchmarkCNN(
          params, dataset=test_util.TestDataSet(), model=model)
      # The test model does not use labels when computing loss, so the label
      # values do not matter as long as it's the right shape.
      labels = np.array([1] * inputs.shape[0])
      bench.input_preprocessor.set_fake_data(inputs, labels)
      if bench.eval_input_preprocessor:
        bench.eval_input_preprocessor.set_fake_data(inputs, labels)
      bench.run()
    outputs = test_util.get_training_outputs_from_logs(
        logs, params.print_training_accuracy)
    return [x.loss for x in outputs]

  def _test_variable_update(self, params):
    """Tests variables are updated correctly when the given params are used.

    A BenchmarkCNN is created with a TestCNNModel, and is run with some scalar
    images. The losses are then compared with the losses obtained with
    TestCNNModel().manually_compute_losses()

    Args:
      params: a Params tuple used to create BenchmarkCNN.
    """
    inputs = test_util.get_fake_var_update_inputs()
    actual_losses = self._get_benchmark_cnn_losses(inputs, params)
    expected_losses, = test_util.TestCNNModel().manually_compute_losses(
        inputs, 1, params)
    rtol = 3e-2 if params.use_fp16 else 1e-5
    self.assertAllClose(actual_losses[:len(expected_losses)], expected_losses,
                        rtol=rtol, atol=0.)

  def _test_variable_updates(self, params,
                             var_updates=('parameter_server', 'replicated')):
    for var_update in var_updates:
      self._test_variable_update(params._replace(variable_update=var_update))

  def testDefault(self):
    params = test_util.get_var_update_params()
    self._test_variable_updates(params)

  # For some reason, this test doesn't always pass
  # def testCpuAsDevice(self):
  #   params = test_util.get_var_update_params()._replace(
  #       device='cpu',
  #       data_format='NHWC')  # NHWC required when --device=cpu
  #   self._test_variable_updates(params)

  def testCpuAsLocalParamDevice(self):
    params = test_util.get_var_update_params()._replace(
        local_parameter_device='cpu')
    self._test_variable_updates(params)

  def testFp16(self):
    params = test_util.get_var_update_params()._replace(use_fp16=True)
    self._test_variable_updates(params)

  def testMomentum(self):
    params = test_util.get_var_update_params()._replace(optimizer='momentum')
    self._test_variable_updates(params)

  def testRmsprop(self):
    params = test_util.get_var_update_params()._replace(optimizer='rmsprop')
    self._test_variable_updates(params)

  def testNoLayers(self):
    params = test_util.get_var_update_params()._replace(use_tf_layers=False)
    self._test_variable_updates(params)

  def testVariousAllReduceSpecs(self):
    # We do not test xring, because it requires all Variables to have at least
    # two elements.
    params = test_util.get_var_update_params()._replace(
        all_reduce_spec='pscpu')
    self._test_variable_updates(params, var_updates=('replicated',))
    params = params._replace(all_reduce_spec='psgpu')
    self._test_variable_updates(params, var_updates=('replicated',))
    # TODO(b/80125832): Enable nccl in tests
    # params = params._replace(all_reduce_spec='nccl',
    #                          compact_gradient_transfer=False)
    # self._test_variable_updates(params, var_updates=('replicated',))

  def testPrintBaseLoss(self):
    params = test_util.get_var_update_params()._replace(
        loss_type_to_report='base_loss')
    self._test_variable_updates(params)

  def testSingleL2LossOp(self):
    params = test_util.get_var_update_params()._replace(
        single_l2_loss_op=True)
    self._test_variable_updates(params)

  def testResourceVars(self):
    params = test_util.get_var_update_params()._replace(
        use_resource_vars=True)
    self._test_variable_updates(params)

  def testEvalDuringTrainingEveryNSteps(self):
    # TODO(reedwm): Test that the eval results are correct. This only tests that
    # training results are correct.
    params = test_util.get_var_update_params()._replace(
        eval_during_training_every_n_steps=1)
    self._test_variable_updates(params, var_updates=('replicated',))


class VariableMgrLocalReplicatedTest(tf.test.TestCase):

  def _test_grad_aggregation_with_var_mgr(self, variable_mgr, num_towers,
                                          num_vars, deferred_grads):
    tower_devices = ['/gpu:%d' % i for i in range(num_towers)]
    tower_grads = []
    expected_sums = [0.] * num_vars
    for i, tower_device in enumerate(tower_devices):
      with tf.device(tower_device):
        grad_vars = []
        for j in range(num_vars):
          n = num_towers * i + j
          grad_vars.append((tf.constant(n, dtype=tf.float32),
                            tf.Variable(n, dtype=tf.float32)))
          expected_sums[j] += n
      tower_grads.append(grad_vars)

    _, agg_device_grads = variable_mgr.preprocess_device_grads(tower_grads)
    expected_device_grads = []
    for i in range(num_towers):
      expected_grad_vars = []
      for j in range(num_vars):
        expected_grad_and_var = [expected_sums[j], num_towers * i + j]
        if isinstance(agg_device_grads[i][j], tuple):
          # agg_device_grads[i][j] can be a list or tuple.
          expected_grad_and_var = tuple(expected_grad_and_var)
        expected_grad_vars.append(expected_grad_and_var)
      if isinstance(agg_device_grads[i], tuple):
        # agg_device_grads[i] can be a list or tuple.
        expected_grad_vars = tuple(expected_grad_vars)
      expected_device_grads.append(expected_grad_vars)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
      sess.run(tf.initialize_all_variables())
      sess.run(variable_mgr._warmup_ops)
      if deferred_grads:
        # With deferred grads, the result of a session run is always the summed
        # gradients from the previous session run.
        sess.run(agg_device_grads)
        feed_dict = {g: 0 for grad_vars in tower_grads for g, _ in grad_vars}
        agg_device_grads_ = sess.run(agg_device_grads, feed_dict)
      else:
        agg_device_grads_ = sess.run(agg_device_grads)
      self.assertEqual(agg_device_grads_, expected_device_grads)

  def _test_grad_aggregation(self, params, num_vars):
    bench = benchmark_cnn.BenchmarkCNN(params)
    deferred_grads = (params.variable_consistency == 'relaxed')
    self._test_grad_aggregation_with_var_mgr(bench.variable_mgr,
                                             bench.num_gpus, num_vars,
                                             deferred_grads)

  def test_grad_aggregation(self):
    base_params = benchmark_cnn.make_params(num_gpus=10,
                                            variable_update='replicated',
                                            use_fp16=True)
    params = base_params
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(variable_consistency='relaxed')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(variable_consistency='relaxed',
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(num_gpus=8, hierarchical_copy=True)
    self._test_grad_aggregation(params, 10)
    # TODO(b/80125832): Enable nccl in tests
    # params = base_params._replace(all_reduce_spec='nccl',
    #                               compact_gradient_transfer=False,
    #                               # For some reason, this test freezes when
    #                               # num_gpus=10
    #                               num_gpus=8)
    # self._test_grad_aggregation(params, 10)
    params = base_params._replace(all_reduce_spec='pscpu')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(num_gpus=8,
                                  gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  hierarchical_copy=True)
    self._test_grad_aggregation(params, 10)
    # TODO(b/80125832): Enable nccl in tests
    # params = base_params._replace(num_gpus=8,
    #                               gradient_repacking=3,
    #                               variable_consistency='relaxed',
    #                               all_reduce_spec='nccl',
    #                               compact_gradient_transfer=False)
    # self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  all_reduce_spec='pscpu')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  all_reduce_spec='xring')
    self._test_grad_aggregation(params, 10)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
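The VariableUpdateTest docstring above describes the deterministic model `loss = image * A * B`, where A and B are scalar variables, and compares the benchmark's reported losses against manually computed ones. As a minimal sketch of what "manually computing" such losses means, the snippet below replays the model under plain SGD with NumPy; the initial values A=1, B=2 come from the TestModel comment above, but the learning rate and the helper name `manually_compute_losses_sketch` are illustrative assumptions, not the actual `test_util` API.

```python
import numpy as np

def manually_compute_losses_sketch(images, lr=0.005, a=1.0, b=2.0):
  """Replays loss = image * A * B under plain SGD, one scalar image per step.

  Illustrative only: the learning rate is an assumption, not the value used
  by test_util.TestCNNModel.manually_compute_losses.
  """
  losses = []
  for image in images:
    loss = image * a * b
    losses.append(loss)
    # Gradients of loss with respect to the scalar variables A and B.
    grad_a = image * b
    grad_b = image * a
    a -= lr * grad_a
    b -= lr * grad_b
  return losses

print(manually_compute_losses_sketch(np.array([-1.0, 0.5, 2.0])))
```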
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Utilities for CNN benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import threading

import numpy as np
import tensorflow.compat.v1 as tf


def tensorflow_version_tuple():
  v = tf.__version__
  major, minor, patch = v.split('.')
  return (int(major), int(minor), patch)


def tensorflow_version():
  vt = tensorflow_version_tuple()
  return vt[0] * 1000 + vt[1]


def log_fn(log):
  print(log)


def roll_numpy_batches(array, batch_size, shift_ratio):
  """Moves a proportion of batches from start to the end of the array.

  This function moves a proportion of batches, specified by `shift_ratio`, from
  the starts of the array to the end. The number of batches moved is rounded
  down to the nearest integer. For example,

  ```
  roll_numpy_batches([1, 2, 3, 4, 5, 6], 2, 0.34) == [3, 4, 5, 6, 1, 2]
  ```

  Args:
    array: A Numpy array whose first dimension is the batch dimension.
    batch_size: The batch size.
    shift_ratio: Proportion of batches to move from the start of the array to
      the end of the array.

  Returns:
    A new Numpy array, with a proportion of the batches at the start of `array`
    moved to the end.
  """
  num_items = array.shape[0]
  assert num_items % batch_size == 0
  num_batches = num_items // batch_size
  starting_batch = int(num_batches * shift_ratio)
  starting_item = starting_batch * batch_size
  return np.roll(array, -starting_item, axis=0)


# For Python 2.7 compatibility, we do not use threading.Barrier.
class Barrier(object):
  """Implements a lightweight Barrier.

  Useful for synchronizing a fixed number of threads at known synchronization
  points. Threads block on 'wait()' and simultaneously return once they have
  all made that call.

  # Implementation adopted from boost/thread/barrier.hpp
  """

  def __init__(self, parties):
    """Create a barrier, initialised to 'parties' threads."""
    self.cond = threading.Condition(threading.Lock())
    self.parties = parties
    # Indicates the number of waiting parties.
    self.waiting = 0
    # generation is needed to deal with spurious wakeups. If self.cond.wait()
    # wakes up for other reasons, generation will force it go back to wait().
    self.generation = 0
    self.broken = False

  def wait(self):
    """Wait for the barrier."""
    with self.cond:
      # Check if the barrier has been disabled or not.
      if self.broken:
        return
      gen = self.generation
      self.waiting += 1
      if self.waiting == self.parties:
        self.waiting = 0
        self.generation += 1
        self.cond.notify_all()
      # loop because of spurious wakeups
      while gen == self.generation:
        self.cond.wait()

  # TODO(huangyp): Remove this method once we find a way to know which step
  # is the last barrier.
  def abort(self):
    """Clear existing barrier and disable this barrier."""
    with self.cond:
      if self.waiting > 0:
        self.generation += 1
        self.cond.notify_all()
      self.broken = True


class ImageProducer(object):
  """An image producer that puts images into a staging area periodically.

  This class is useful for periodically running a set of ops, `put_ops` on a
  different thread every `batch_group_size` steps.

  The notify_image_consumption() method is used to increment an internal counter
  so that every `batch_group_size` times it is called, `put_ops` is executed. A
  barrier is placed so that notify_image_consumption() will block until
  the previous call to `put_ops` has been executed.

  The start() method is used to start the thread that runs `put_ops`.

  The done() method waits until the last put_ops is executed and stops the
  thread.

  The purpose of this class is to fill an image input pipeline every
  `batch_group_size` steps. Suppose `put_ops` supplies `batch_group_size` images
  to the input pipeline when run, and that every step, 1 batch of images is
  consumed. Then, by calling notify_image_consumption() every step, images are
  supplied to the input pipeline at the same amount they are consumed.

  Example usage:
  ```
  put_ops = ...  # Enqueues `batch_group_size` batches to a StagingArea
  get_op = ...   # Dequeues 1 batch, and does some operations on it
  batch_group_size = 4
  with tf.Session() as sess:
    image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size)
    image_producer.start()
    for _ in range(100):
      sess.run(get_op)
      image_producer.notify_image_consumption()
  ```
  """

  def __init__(self, sess, put_ops, batch_group_size, use_python32_barrier):
    self.sess = sess
    self.num_gets = 0
    self.put_ops = put_ops
    self.batch_group_size = batch_group_size
    self.done_event = threading.Event()
    if (use_python32_barrier and
        sys.version_info[0] == 3 and sys.version_info[1] >= 2):
      self.put_barrier = threading.Barrier(2)
    else:
      self.put_barrier = Barrier(2)

  def _should_put(self):
    return (self.num_gets + 1) % self.batch_group_size == 0

  def done(self):
    """Stop the image producer."""
    self.done_event.set()
    self.put_barrier.abort()
    self.thread.join()

  def start(self):
    """Start the image producer."""
    self.sess.run([self.put_ops])
    self.thread = threading.Thread(target=self._loop_producer)
    # Set daemon to true to allow Ctrl + C to terminate all threads.
    self.thread.daemon = True
    self.thread.start()

  def notify_image_consumption(self):
    """Increment the counter of image_producer by 1.

    This should only be called by the main thread that consumes images and runs
    the model computation. One batch of images should be consumed between
    calling start() and the first call to this method. Then, one batch of images
    should be consumed between any two successive calls to this method.
    """
    if self._should_put():
      self.put_barrier.wait()
    self.num_gets += 1

  def _loop_producer(self):
    while not self.done_event.isSet():
      self.sess.run([self.put_ops])
      self.put_barrier.wait()


class BaseClusterManager(object):
  """The manager for the cluster of servers running the benchmark."""

  def __init__(self, params):
    worker_hosts = params.worker_hosts.split(',')
    ps_hosts = params.ps_hosts.split(',') if params.ps_hosts else []
    cluster = {'worker': worker_hosts}
    if ps_hosts:
      cluster['ps'] = ps_hosts
    self._cluster_spec = tf.train.ClusterSpec(cluster)

  def get_target(self):
    """Returns a target to be passed to tf.Session()."""
    raise NotImplementedError('get_target must be implemented by subclass')

  def join_server(self):
    raise NotImplementedError('join must be implemented by subclass')

  def get_cluster_spec(self):
    return self._cluster_spec

  def num_workers(self):
    return len(self._cluster_spec.job_tasks('worker'))

  def num_ps(self):
    if 'ps' in self._cluster_spec.jobs:
      return len(self._cluster_spec.job_tasks('ps'))
    else:
      return 0


class GrpcClusterManager(BaseClusterManager):
  """A cluster manager for a cluster networked with gRPC."""

  def __init__(self, params, config_proto):
    super(GrpcClusterManager, self).__init__(params)
    if params.job_name == 'controller':
      self._target = 'grpc://%s' % self._cluster_spec.job_tasks('worker')[0]
    else:
      self._server = tf.train.Server(self._cluster_spec,
                                     job_name=params.job_name,
                                     task_index=params.task_index,
                                     config=config_proto,
                                     protocol=params.server_protocol)
      self._target = self._server.target

  def get_target(self):
    return self._target

  def join_server(self):
    return self._server.join()
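The Barrier class above blocks each thread in wait() until all `parties` threads have arrived, then releases them together; ImageProducer uses a two-party barrier to pace its producer thread against the consumer. A minimal sketch of exercising the barrier directly with two threads (the thread count, names, and sleep are arbitrary, and `cnn_util` here is assumed to be the module shown above on the import path):

```python
import threading
import time

import cnn_util  # the module shown above

barrier = cnn_util.Barrier(parties=2)

def worker(name):
  time.sleep(0.1)     # simulate some work before the sync point
  print(name, 'reached the barrier')
  barrier.wait()      # blocks until both threads have called wait()
  print(name, 'released')

threads = [threading.Thread(target=worker, args=('t%d' % i,)) for i in range(2)]
for t in threads:
  t.start()
for t in threads:
  t.join()
```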
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util_test.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for tf_cnn_benchmarks.cnn_util."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import threading
import time

import tensorflow.compat.v1 as tf

import cnn_util


class CnnUtilBarrierTest(tf.test.TestCase):

  def testBarrier(self):
    num_tasks = 20
    num_waits = 4
    barrier = cnn_util.Barrier(num_tasks)
    threads = []
    sync_matrix = []
    for i in range(num_tasks):
      sync_times = [0] * num_waits
      thread = threading.Thread(
          target=self._run_task, args=(barrier, sync_times))
      thread.start()
      threads.append(thread)
      sync_matrix.append(sync_times)
    for thread in threads:
      thread.join()
    for wait_index in range(num_waits - 1):
      # Max of times at iteration i < min of times at iteration i + 1
      self.assertLessEqual(
          max([sync_matrix[i][wait_index] for i in range(num_tasks)]),
          min([sync_matrix[i][wait_index + 1] for i in range(num_tasks)]))

  def _run_task(self, barrier, sync_times):
    for wait_index in range(len(sync_times)):
      sync_times[wait_index] = time.time()
      barrier.wait()

  def testBarrierAbort(self):
    num_tasks = 2
    num_waits = 1
    sync_times = [0] * num_waits
    barrier = cnn_util.Barrier(num_tasks)
    thread = threading.Thread(
        target=self._run_task, args=(barrier, sync_times))
    thread.start()
    barrier.abort()
    # thread won't be blocked by done barrier.
    thread.join()


class ImageProducerTest(tf.test.TestCase):

  def _slow_tensorflow_op(self):
    """Returns a TensorFlow op that takes approximately 0.1s to complete."""
    def slow_func(v):
      time.sleep(0.1)
      return v
    return tf.py_func(slow_func, [tf.constant(0.)], tf.float32).op

  def _test_image_producer(self, batch_group_size, put_slower_than_get):
    # We use the variable x to simulate a staging area of images. x represents
    # the number of batches in the staging area.
    x = tf.Variable(0, dtype=tf.int32)
    if put_slower_than_get:
      put_dep = self._slow_tensorflow_op()
      get_dep = tf.no_op()
    else:
      put_dep = tf.no_op()
      get_dep = self._slow_tensorflow_op()
    with tf.control_dependencies([put_dep]):
      put_op = x.assign_add(batch_group_size, use_locking=True)
    with tf.control_dependencies([get_dep]):
      get_op = x.assign_sub(1, use_locking=True)
    with self.test_session() as sess:
      sess.run(tf.variables_initializer([x]))
      image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size,
                                              use_python32_barrier=False)
      image_producer.start()
      for _ in range(5 * batch_group_size):
        sess.run(get_op)
        # We assert x is nonnegative, to ensure image_producer never causes
        # an unstage op to block. We assert x is at most 2 * batch_group_size,
        # to ensure it doesn't use too much memory by storing too many batches
        # in the staging area.
        self.assertGreaterEqual(sess.run(x), 0)
        self.assertLessEqual(sess.run(x), 2 * batch_group_size)
        image_producer.notify_image_consumption()
        self.assertGreaterEqual(sess.run(x), 0)
        self.assertLessEqual(sess.run(x), 2 * batch_group_size)
      image_producer.done()
      time.sleep(0.1)
      self.assertGreaterEqual(sess.run(x), 0)
      self.assertLessEqual(sess.run(x), 2 * batch_group_size)

  def test_image_producer(self):
    self._test_image_producer(1, False)
    self._test_image_producer(1, True)
    self._test_image_producer(2, False)
    self._test_image_producer(2, True)
    self._test_image_producer(3, False)
    self._test_image_producer(3, True)
    self._test_image_producer(8, False)
    self._test_image_producer(8, True)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/coco_metric.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""COCO-style evaluation metrics.

Forked from reference model implementation.

COCO API: github.com/cocodataset/cocoapi/
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import atexit
import tempfile

from absl import flags
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import six
import tensorflow.compat.v1 as tf

import mlperf
import ssd_constants

FLAGS = flags.FLAGS

# https://github.com/cocodataset/cocoapi/issues/49
if six.PY3:
  import pycocotools.coco
  pycocotools.coco.unicode = str


def async_eval_runner(queue_predictions, queue_results, val_json_file):
  """Load intermediate eval results and get COCO metrics."""
  while True:
    message = queue_predictions.get()
    if message == 'STOP':  # poison pill
      break
    step, predictions = message
    results = compute_map(predictions, val_json_file)
    queue_results.put((step, results))


def compute_map(predictions, val_json_file):
  """Use model predictions to compute mAP.

  Args:
    predictions: a list of tuples returned by decoded_predictions function,
      each containing the following elements:
      image source_id, box coordinates in XYWH order, probability score, label
    val_json_file: path to COCO annotation file

  Returns:
    A dictionary that maps all COCO metrics (keys) to their values
  """

  if val_json_file.startswith("gs://"):
    _, local_val_json = tempfile.mkstemp(suffix=".json")
    tf.gfile.Remove(local_val_json)
    tf.gfile.Copy(val_json_file, local_val_json)
    atexit.register(tf.gfile.Remove, local_val_json)
  else:
    local_val_json = val_json_file

  cocoGt = COCO(local_val_json)
  cocoDt = cocoGt.loadRes(np.array(predictions))
  E = COCOeval(cocoGt, cocoDt, iouType='bbox')
  E.evaluate()
  E.accumulate()
  E.summarize()
  print("Current AP: {:.5f}".format(E.stats[0]))
  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']

  # Prefix with "COCO" to group in TensorBoard.
  return {"COCO/" + key: value for key, value in zip(metric_names, E.stats)}


def calc_iou(target, candidates):
  target_tiled = np.tile(target[np.newaxis, :], (candidates.shape[0], 1))
  # Left Top & Right Bottom
  lt = np.maximum(target_tiled[:, :2], candidates[:, :2])
  rb = np.minimum(target_tiled[:, 2:], candidates[:, 2:])

  delta = np.maximum(rb - lt, 0)
  intersect = delta[:, 0] * delta[:, 1]

  delta1 = target_tiled[:, 2:] - candidates[:, :2]
  area1 = delta1[:, 0] * delta1[:, 1]
  delta2 = target_tiled[:, 2:] - candidates[:, :2]
  area2 = delta2[:, 0] * delta2[:, 1]

  iou = intersect / (area1 + area2 - intersect)
  return iou


# TODO(haoyuzhang): Rewrite this NumPy based implementation to TensorFlow based
# implementation under ssd_model.py accuracy_function.
def decode_predictions(labels_and_predictions):
  """Decode predictions and remove unused boxes and labels."""
  predictions = []
  for example in labels_and_predictions:
    source_id = int(example[ssd_constants.SOURCE_ID])
    pred_box = example[ssd_constants.PRED_BOXES]
    pred_scores = example[ssd_constants.PRED_SCORES]

    locs, labels, probs = decode_single(
        pred_box, pred_scores, ssd_constants.OVERLAP_CRITERIA,
        ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)

    raw_height, raw_width, _ = example[ssd_constants.RAW_SHAPE]
    for loc, label, prob in zip(locs, labels, probs):
      # Ordering convention differs, hence [1], [0] rather than [0], [1]
      x, y = loc[1] * raw_width, loc[0] * raw_height
      w, h = (loc[3] - loc[1]) * raw_width, (loc[2] - loc[0]) * raw_height
      predictions.append(
          [source_id, x, y, w, h, prob, ssd_constants.CLASS_INV_MAP[label]])

  mlperf.logger.log(key=mlperf.tags.NMS_THRESHOLD,
                    value=ssd_constants.OVERLAP_CRITERIA)
  mlperf.logger.log(key=mlperf.tags.NMS_MAX_DETECTIONS,
                    value=ssd_constants.MAX_NUM_EVAL_BOXES)
  return predictions


def decode_single(bboxes_in, scores_in, criteria, max_output, max_num=200):
  # Reference to https://github.com/amdegroot/ssd.pytorch
  bboxes_out = []
  scores_out = []
  labels_out = []

  for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)):
    score = np.squeeze(score, 1)

    # skip background
    if i == 0:
      continue

    mask = score > ssd_constants.MIN_SCORE
    if not np.any(mask):
      continue

    bboxes, score = bboxes_in[mask, :], score[mask]

    score_idx_sorted = np.argsort(score)
    score_sorted = score[score_idx_sorted]

    score_idx_sorted = score_idx_sorted[-max_num:]
    candidates = []

    # perform non-maximum suppression
    while len(score_idx_sorted):
      idx = score_idx_sorted[-1]
      bboxes_sorted = bboxes[score_idx_sorted, :]
      bboxes_idx = bboxes[idx, :]
      iou = calc_iou(bboxes_idx, bboxes_sorted)

      score_idx_sorted = score_idx_sorted[iou < criteria]
      candidates.append(idx)

    bboxes_out.append(bboxes[candidates, :])
    scores_out.append(score[candidates])
    labels_out.extend([i] * len(candidates))

  if len(scores_out) == 0:
    tf.logging.info("No objects detected. Returning dummy values.")
    return (
        np.zeros(shape=(1, 4), dtype=np.float32),
        np.zeros(shape=(1,), dtype=np.int32),
        np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE,
    )

  bboxes_out = np.concatenate(bboxes_out, axis=0)
  scores_out = np.concatenate(scores_out, axis=0)
  labels_out = np.array(labels_out)

  max_ids = np.argsort(scores_out)[-max_output:]

  return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/constants.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Constants used in tf_cnn_benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from enum import Enum

# Results fetched with this prefix will not be reduced. Instead, they will be
# passed as matrices to model's postprocess function.
UNREDUCED_ACCURACY_OP_PREFIX = "tensor:"

# Eval result values with this name prefix will be included in summary.
SIMPLE_VALUE_RESULT_PREFIX = "simple_value:"


class BenchmarkMode(object):
  """Benchmark running mode."""
  TRAIN = "training"
  EVAL = "evaluation"
  TRAIN_AND_EVAL = "training + evaluation"
  FORWARD_ONLY = "forward only"


class NetworkTopology(str, Enum):
  """Network topology describes how multiple GPUs are inter-connected.
  """

  # DGX-1 uses hybrid cube mesh topology with the following device peer to peer
  # matrix:
  # DMA: 0 1 2 3 4 5 6 7
  # 0:   Y Y Y Y Y N N N
  # 1:   Y Y Y Y N Y N N
  # 2:   Y Y Y Y N N Y N
  # 3:   Y Y Y Y N N N Y
  # 4:   Y N N N Y Y Y Y
  # 5:   N Y N N Y Y Y Y
  # 6:   N N Y N Y Y Y Y
  # 7:   N N N Y Y Y Y Y
  DGX1 = "dgx1"

  # V100 in GCP are connected with the following device peer to peer matrix.
  # In this topology, bandwidth of the connection depends on if it uses NVLink
  # or PCIe link.
  # DMA: 0 1 2 3 4 5 6 7
  # 0:   Y Y Y Y N Y N N
  # 1:   Y Y Y Y N N N N
  # 2:   Y Y Y Y N N N Y
  # 3:   Y Y Y Y N N N N
  # 4:   N N N N Y Y Y Y
  # 5:   Y N N N Y Y Y Y
  # 6:   N N N N Y Y Y Y
  # 7:   N N Y N Y Y Y Y
  GCP_V100 = "gcp_v100"
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/convnet_builder.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN builder."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
collections
import
defaultdict
import
contextlib
import
numpy
as
np
import
tensorflow.compat.v1
as
tf
# pylint: disable=g-direct-tensorflow-import
import
mlperf
from
tensorflow.python.layers
import
convolutional
as
conv_layers
from
tensorflow.python.layers
import
core
as
core_layers
from
tensorflow.python.layers
import
normalization
as
normalization_layers
from
tensorflow.python.layers
import
pooling
as
pooling_layers
from
tensorflow.python.training
import
moving_averages
_data_format_to_channel_axis
=
{
'NCHW'
:
1
,
'NHWC'
:
3
}
class
ConvNetBuilder
(
object
):
"""Builder of cnn net."""
def
__init__
(
self
,
input_op
,
input_nchan
,
phase_train
,
use_tf_layers
,
data_format
=
'NCHW'
,
dtype
=
tf
.
float32
,
variable_dtype
=
tf
.
float32
):
self
.
top_layer
=
input_op
self
.
top_size
=
input_nchan
self
.
phase_train
=
phase_train
self
.
use_tf_layers
=
use_tf_layers
self
.
data_format
=
data_format
self
.
dtype
=
dtype
self
.
variable_dtype
=
variable_dtype
self
.
counts
=
defaultdict
(
lambda
:
0
)
self
.
use_batch_norm
=
False
self
.
batch_norm_config
=
{}
# 'decay': 0.997, 'scale': True}
self
.
channel_pos
=
(
'channels_last'
if
data_format
==
'NHWC'
else
'channels_first'
)
self
.
aux_top_layer
=
None
self
.
aux_top_size
=
0
def
get_custom_getter
(
self
):
"""Returns a custom getter that this class's methods must be called under.
All methods of this class must be called under a variable scope that was
passed this custom getter. Example:
```python
network = ConvNetBuilder(...)
with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
network.conv(...)
# Call more methods of network here
```
Currently, this custom getter only does anything if self.use_tf_layers is
True. In that case, it causes variables to be stored as dtype
self.variable_type, then casted to the requested dtype, instead of directly
storing the variable as the requested dtype.
"""
def
inner_custom_getter
(
getter
,
*
args
,
**
kwargs
):
"""Custom getter that forces variables to have type self.variable_type."""
if
not
self
.
use_tf_layers
:
return
getter
(
*
args
,
**
kwargs
)
requested_dtype
=
kwargs
[
'dtype'
]
if
not
(
requested_dtype
==
tf
.
float32
and
self
.
variable_dtype
==
tf
.
float16
):
# Only change the variable dtype if doing so does not decrease variable
# precision.
kwargs
[
'dtype'
]
=
self
.
variable_dtype
var
=
getter
(
*
args
,
**
kwargs
)
# This if statement is needed to guard the cast, because batch norm
# assigns directly to the return value of this custom getter. The cast
# makes the return value not a variable so it cannot be assigned. Batch
# norm variables are always in fp32 so this if statement is never
# triggered for them.
if
var
.
dtype
.
base_dtype
!=
requested_dtype
:
var
=
tf
.
cast
(
var
,
requested_dtype
)
return
var
return
inner_custom_getter
@
contextlib
.
contextmanager
def
switch_to_aux_top_layer
(
self
):
"""Context that construct cnn in the auxiliary arm."""
if
self
.
aux_top_layer
is
None
:
raise
RuntimeError
(
'Empty auxiliary top layer in the network.'
)
saved_top_layer
=
self
.
top_layer
saved_top_size
=
self
.
top_size
self
.
top_layer
=
self
.
aux_top_layer
self
.
top_size
=
self
.
aux_top_size
yield
self
.
aux_top_layer
=
self
.
top_layer
self
.
aux_top_size
=
self
.
top_size
self
.
top_layer
=
saved_top_layer
self
.
top_size
=
saved_top_size
def
get_variable
(
self
,
name
,
shape
,
dtype
,
cast_dtype
,
*
args
,
**
kwargs
):
# TODO(reedwm): Currently variables and gradients are transferred to other
# devices and machines as type `dtype`, not `cast_dtype`. In particular,
# this means in fp16 mode, variables are transferred as fp32 values, not
# fp16 values, which uses extra bandwidth.
var
=
tf
.
get_variable
(
name
,
shape
,
dtype
,
*
args
,
**
kwargs
)
return
tf
.
cast
(
var
,
cast_dtype
)
def
_conv2d_impl
(
self
,
input_layer
,
num_channels_in
,
filters
,
kernel_size
,
strides
,
padding
,
kernel_initializer
):
if
self
.
use_tf_layers
:
return
conv_layers
.
conv2d
(
input_layer
,
filters
,
kernel_size
,
strides
,
padding
,
self
.
channel_pos
,
kernel_initializer
=
kernel_initializer
,
use_bias
=
False
)
else
:
weights_shape
=
[
kernel_size
[
0
],
kernel_size
[
1
],
num_channels_in
,
filters
]
# We use the name 'conv2d/kernel' so the variable has the same name as its
# tf.layers equivalent. This way, if a checkpoint is written when
# self.use_tf_layers == True, it can be loaded when
# self.use_tf_layers == False, and vice versa.
weights
=
self
.
get_variable
(
'conv2d/kernel'
,
weights_shape
,
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
kernel_initializer
)
if
self
.
data_format
==
'NHWC'
:
strides
=
[
1
]
+
strides
+
[
1
]
else
:
strides
=
[
1
,
1
]
+
strides
return
tf
.
nn
.
conv2d
(
input_layer
,
weights
,
strides
,
padding
,
data_format
=
self
.
data_format
)
def
conv
(
self
,
num_out_channels
,
k_height
,
k_width
,
d_height
=
1
,
d_width
=
1
,
mode
=
'SAME'
,
input_layer
=
None
,
num_channels_in
=
None
,
use_batch_norm
=
None
,
stddev
=
None
,
activation
=
'relu'
,
bias
=
0.0
,
kernel_initializer
=
None
):
"""Construct a conv2d layer on top of cnn."""
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
if
num_channels_in
is
None
:
num_channels_in
=
self
.
top_size
if
stddev
is
not
None
and
kernel_initializer
is
None
:
kernel_initializer
=
tf
.
truncated_normal_initializer
(
stddev
=
stddev
)
if
kernel_initializer
is
None
:
kernel_initializer
=
tf
.
variance_scaling_initializer
()
name
=
'conv'
+
str
(
self
.
counts
[
'conv'
])
self
.
counts
[
'conv'
]
+=
1
with
tf
.
variable_scope
(
name
):
strides
=
[
1
,
d_height
,
d_width
,
1
]
if
self
.
data_format
==
'NCHW'
:
strides
=
[
strides
[
0
],
strides
[
3
],
strides
[
1
],
strides
[
2
]]
if
mode
!=
'SAME_RESNET'
:
conv
=
self
.
_conv2d_impl
(
input_layer
,
num_channels_in
,
num_out_channels
,
kernel_size
=
[
k_height
,
k_width
],
strides
=
[
d_height
,
d_width
],
padding
=
mode
,
kernel_initializer
=
kernel_initializer
)
else
:
# Special padding mode for ResNet models
if
d_height
==
1
and
d_width
==
1
:
conv
=
self
.
_conv2d_impl
(
input_layer
,
num_channels_in
,
num_out_channels
,
kernel_size
=
[
k_height
,
k_width
],
strides
=
[
d_height
,
d_width
],
padding
=
'SAME'
,
kernel_initializer
=
kernel_initializer
)
else
:
rate
=
1
# Unused (for 'a trous' convolutions)
kernel_height_effective
=
k_height
+
(
k_height
-
1
)
*
(
rate
-
1
)
pad_h_beg
=
(
kernel_height_effective
-
1
)
//
2
pad_h_end
=
kernel_height_effective
-
1
-
pad_h_beg
kernel_width_effective
=
k_width
+
(
k_width
-
1
)
*
(
rate
-
1
)
pad_w_beg
=
(
kernel_width_effective
-
1
)
//
2
pad_w_end
=
kernel_width_effective
-
1
-
pad_w_beg
padding
=
[[
0
,
0
],
[
pad_h_beg
,
pad_h_end
],
[
pad_w_beg
,
pad_w_end
],
[
0
,
0
]]
if
self
.
data_format
==
'NCHW'
:
padding
=
[
padding
[
0
],
padding
[
3
],
padding
[
1
],
padding
[
2
]]
padded_input_layer
=
tf
.
pad
(
input_layer
,
padding
)
conv
=
self
.
_conv2d_impl
(
padded_input_layer
,
num_channels_in
,
num_out_channels
,
kernel_size
=
[
k_height
,
k_width
],
strides
=
[
d_height
,
d_width
],
padding
=
'VALID'
,
kernel_initializer
=
kernel_initializer
)
if
use_batch_norm
is
None
:
use_batch_norm
=
self
.
use_batch_norm
mlperf
.
logger
.
log_conv2d
(
input_tensor
=
input_layer
,
output_tensor
=
conv
,
stride_height
=
d_height
,
stride_width
=
d_width
,
filters
=
num_out_channels
,
initializer
=
kernel_initializer
,
use_bias
=
not
use_batch_norm
and
bias
is
not
None
)
if
not
use_batch_norm
:
if
bias
is
not
None
:
biases
=
self
.
get_variable
(
'biases'
,
[
num_out_channels
],
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
tf
.
constant_initializer
(
bias
))
biased
=
tf
.
reshape
(
tf
.
nn
.
bias_add
(
conv
,
biases
,
data_format
=
self
.
data_format
),
conv
.
get_shape
())
else
:
biased
=
conv
else
:
self
.
top_layer
=
conv
self
.
top_size
=
num_out_channels
biased
=
self
.
batch_norm
(
**
self
.
batch_norm_config
)
if
activation
==
'relu'
:
mlperf
.
logger
.
log
(
key
=
mlperf
.
tags
.
MODEL_HP_RELU
)
conv1
=
tf
.
nn
.
relu
(
biased
)
elif
activation
==
'linear'
or
activation
is
None
:
conv1
=
biased
elif
activation
==
'tanh'
:
conv1
=
tf
.
nn
.
tanh
(
biased
)
else
:
raise
KeyError
(
'Invalid activation type
\'
%s
\'
'
%
activation
)
self
.
top_layer
=
conv1
self
.
top_size
=
num_out_channels
return
conv1
def
_pool
(
self
,
pool_name
,
pool_function
,
k_height
,
k_width
,
d_height
,
d_width
,
mode
,
input_layer
,
num_channels_in
):
"""Construct a pooling layer."""
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
else
:
self
.
top_size
=
num_channels_in
name
=
pool_name
+
str
(
self
.
counts
[
pool_name
])
self
.
counts
[
pool_name
]
+=
1
if
self
.
use_tf_layers
:
pool
=
pool_function
(
input_layer
,
[
k_height
,
k_width
],
[
d_height
,
d_width
],
padding
=
mode
,
data_format
=
self
.
channel_pos
,
name
=
name
)
else
:
if
self
.
data_format
==
'NHWC'
:
ksize
=
[
1
,
k_height
,
k_width
,
1
]
strides
=
[
1
,
d_height
,
d_width
,
1
]
else
:
ksize
=
[
1
,
1
,
k_height
,
k_width
]
strides
=
[
1
,
1
,
d_height
,
d_width
]
pool
=
tf
.
nn
.
max_pool
(
input_layer
,
ksize
,
strides
,
padding
=
mode
,
data_format
=
self
.
data_format
,
name
=
name
)
if
pool_name
==
'mpool'
:
mlperf
.
logger
.
log_max_pool
(
input_tensor
=
input_layer
,
output_tensor
=
pool
)
self
.
top_layer
=
pool
return
pool
def
mpool
(
self
,
k_height
,
k_width
,
d_height
=
2
,
d_width
=
2
,
mode
=
'VALID'
,
input_layer
=
None
,
num_channels_in
=
None
):
"""Construct a max pooling layer."""
return
self
.
_pool
(
'mpool'
,
pooling_layers
.
max_pooling2d
,
k_height
,
k_width
,
d_height
,
d_width
,
mode
,
input_layer
,
num_channels_in
)
def
apool
(
self
,
k_height
,
k_width
,
d_height
=
2
,
d_width
=
2
,
mode
=
'VALID'
,
input_layer
=
None
,
num_channels_in
=
None
):
"""Construct an average pooling layer."""
return
self
.
_pool
(
'apool'
,
pooling_layers
.
average_pooling2d
,
k_height
,
k_width
,
d_height
,
d_width
,
mode
,
input_layer
,
num_channels_in
)
def
reshape
(
self
,
shape
,
input_layer
=
None
):
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
self
.
top_layer
=
tf
.
reshape
(
input_layer
,
shape
)
self
.
top_size
=
shape
[
-
1
]
# HACK This may not always work
return
self
.
top_layer
def
affine
(
self
,
num_out_channels
,
input_layer
=
None
,
num_channels_in
=
None
,
bias
=
0.0
,
stddev
=
None
,
activation
=
'relu'
):
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
if
num_channels_in
is
None
:
num_channels_in
=
self
.
top_size
name
=
'affine'
+
str
(
self
.
counts
[
'affine'
])
self
.
counts
[
'affine'
]
+=
1
with
tf
.
variable_scope
(
name
):
init_factor
=
2.
if
activation
==
'relu'
else
1.
stddev
=
stddev
or
np
.
sqrt
(
init_factor
/
num_channels_in
)
kernel
=
self
.
get_variable
(
'weights'
,
[
num_channels_in
,
num_out_channels
],
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
tf
.
truncated_normal_initializer
(
stddev
=
stddev
))
biases
=
self
.
get_variable
(
'biases'
,
[
num_out_channels
],
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
tf
.
constant_initializer
(
bias
))
mlperf
.
logger
.
log
(
key
=
mlperf
.
tags
.
MODEL_HP_DENSE
,
value
=
num_out_channels
)
logits
=
tf
.
nn
.
xw_plus_b
(
input_layer
,
kernel
,
biases
)
if
activation
==
'relu'
:
mlperf
.
logger
.
log
(
key
=
mlperf
.
tags
.
MODEL_HP_RELU
)
affine1
=
tf
.
nn
.
relu
(
logits
,
name
=
name
)
elif
activation
==
'linear'
or
activation
is
None
:
affine1
=
logits
else
:
raise
KeyError
(
'Invalid activation type
\'
%s
\'
'
%
activation
)
self
.
top_layer
=
affine1
self
.
top_size
=
num_out_channels
return
affine1
  def inception_module(self, name, cols, input_layer=None, in_size=None):
    if input_layer is None:
      input_layer = self.top_layer
    if in_size is None:
      in_size = self.top_size
    name += str(self.counts[name])
    self.counts[name] += 1
    with tf.variable_scope(name):
      col_layers = []
      col_layer_sizes = []
      for c, col in enumerate(cols):
        col_layers.append([])
        col_layer_sizes.append([])
        for l, layer in enumerate(col):
          ltype, args = layer[0], layer[1:]
          kwargs = {
              'input_layer': input_layer,
              'num_channels_in': in_size
          } if l == 0 else {}
          if ltype == 'conv':
            self.conv(*args, **kwargs)
          elif ltype == 'mpool':
            self.mpool(*args, **kwargs)
          elif ltype == 'apool':
            self.apool(*args, **kwargs)
          elif ltype == 'share':  # Share matching layer from previous column
            self.top_layer = col_layers[c - 1][l]
            self.top_size = col_layer_sizes[c - 1][l]
          else:
            raise KeyError(
                'Invalid layer type for inception module: \'%s\'' % ltype)
          col_layers[c].append(self.top_layer)
          col_layer_sizes[c].append(self.top_size)
      catdim = 3 if self.data_format == 'NHWC' else 1
      self.top_layer = tf.concat([layers[-1] for layers in col_layers], catdim)
      self.top_size = sum([sizes[-1] for sizes in col_layer_sizes])
      return self.top_layer
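  # Hypothetical usage sketch (assumed column spec, not from the original
  # file): `cols` is a list of columns, each a list of (layer_type, *args)
  # tuples; the first layer of each column reads from the shared input and the
  # last outputs are concatenated along the channel axis, e.g.:
  #
  #   cnn.inception_module('incept_v1', cols=[
  #       [('conv', 64, 1, 1)],
  #       [('conv', 96, 1, 1), ('conv', 128, 3, 3)],
  #       [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', 32, 1, 1)],
  #   ])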
  def spatial_mean(self, keep_dims=False):
    name = 'spatial_mean' + str(self.counts['spatial_mean'])
    self.counts['spatial_mean'] += 1
    axes = [1, 2] if self.data_format == 'NHWC' else [2, 3]
    self.top_layer = tf.reduce_mean(
        self.top_layer, axes, keepdims=keep_dims, name=name)
    return self.top_layer
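  # Hypothetical usage sketch (assumed, not from the original file):
  # spatial_mean() is global average pooling over the spatial axes, e.g. as
  # the last feature layer of a ResNet-style model:
  #
  #   cnn.spatial_mean()                      # [N, H, W, C] -> [N, C] (NHWC)
  #   cnn.affine(1000, activation='linear')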
  def dropout(self, keep_prob=0.5, input_layer=None):
    if input_layer is None:
      input_layer = self.top_layer
    else:
      self.top_size = None
    name = 'dropout' + str(self.counts['dropout'])
    with tf.variable_scope(name):
      if not self.phase_train:
        keep_prob = 1.0
      if self.use_tf_layers:
        dropout = core_layers.dropout(input_layer, 1. - keep_prob,
                                      training=self.phase_train)
      else:
        dropout = tf.nn.dropout(input_layer, keep_prob)
      self.top_layer = dropout
      return dropout
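  # Hypothetical usage sketch (assumed, not from the original file): keep_prob
  # only applies during training; at eval time the code above forces keep_prob
  # to 1.0, so dropout becomes a no-op:
  #
  #   cnn.dropout(keep_prob=0.5)   # drops ~50% of activations when training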
  def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Batch normalization on `input_layer` without tf.layers."""
    # We make this function as similar as possible to the
    # tf.contrib.layers.batch_norm, to minimize the differences between using
    # layers and not using layers.
    shape = input_layer.shape
    num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
    beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                             initializer=tf.zeros_initializer())
    if use_scale:
      gamma = self.get_variable('gamma', [num_channels], tf.float32,
                                tf.float32, initializer=tf.ones_initializer())
    else:
      gamma = tf.constant(1.0, tf.float32, [num_channels])
    # For moving variables, we use tf.get_variable instead of self.get_variable,
    # since self.get_variable returns the result of tf.cast which we cannot
    # assign to.
    moving_mean = tf.get_variable('moving_mean', [num_channels],
                                  tf.float32,
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
    moving_variance = tf.get_variable('moving_variance', [num_channels],
                                      tf.float32,
                                      initializer=tf.ones_initializer(),
                                      trainable=False)
    if self.phase_train:
      bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, epsilon=epsilon,
          data_format=self.data_format, is_training=True)
      mean_update = moving_averages.assign_moving_average(
          moving_mean, batch_mean, decay=decay, zero_debias=False)
      variance_update = moving_averages.assign_moving_average(
          moving_variance, batch_variance, decay=decay, zero_debias=False)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
    else:
      bn, _, _ = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, mean=moving_mean,
          variance=moving_variance, epsilon=epsilon,
          data_format=self.data_format, is_training=False)
    return bn
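  # Note (added comment, describing typical TF1 usage rather than anything in
  # the original file): the moving-mean and moving-variance updates above are
  # only added to the UPDATE_OPS collection; a training loop built on this
  # path has to run them explicitly, e.g.:
  #
  #   update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  #   with tf.control_dependencies(update_ops):
  #     train_op = optimizer.minimize(loss)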
  def batch_norm(self, input_layer=None, decay=0.999, scale=False,
                 epsilon=0.001):
    """Adds a Batch Normalization layer."""
    if input_layer is None:
      input_layer = self.top_layer
    else:
      self.top_size = None
    name = 'batchnorm' + str(self.counts['batchnorm'])
    self.counts['batchnorm'] += 1
    center = True
    with tf.variable_scope(name) as scope:
      if self.use_tf_layers:
        layer_obj = normalization_layers.BatchNormalization(
            momentum=decay,
            scale=scale,
            epsilon=epsilon,
            fused=True,
            axis=_data_format_to_channel_axis[self.data_format],
            # We pass this 'scope' argument for compatibility with checkpoints
            # created with the contrib version of batch norm. tf_cnn_benchmarks
            # used to use the contrib version.
            _scope=scope,
            center=center,
            name=scope.name)
        bn = layer_obj.apply(input_layer, training=self.phase_train)
      else:
        bn = self._batch_norm_without_layers(input_layer, decay, scale,
                                             epsilon)
    self.top_layer = bn
    self.top_size = bn.shape[3] if self.data_format == 'NHWC' else bn.shape[1]
    self.top_size = int(self.top_size)
    mlperf.logger.log_batch_norm(
        input_tensor=input_layer, output_tensor=bn, momentum=decay,
        epsilon=epsilon, center=center, scale=scale,
        training=self.phase_train)
    return bn
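  # Hypothetical usage sketch (assumed hyperparameters, not from the original
  # file):
  #
  #   cnn.conv(64, 3, 3)
  #   cnn.batch_norm(decay=0.997, scale=True, epsilon=1e-5)
  #
  # With use_tf_layers=True this builds a fused tf.layers BatchNormalization;
  # otherwise it falls back to _batch_norm_without_layers() above.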
  def lrn(self, depth_radius, bias, alpha, beta):
    """Adds a local response normalization layer."""
    name = 'lrn' + str(self.counts['lrn'])
    self.counts['lrn'] += 1
    self.top_layer = tf.nn.lrn(
        self.top_layer, depth_radius, bias, alpha, beta, name=name)
    return self.top_layer
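  # Hypothetical usage sketch (assumed AlexNet-style parameters, not from the
  # original file):
  #
  #   cnn.lrn(depth_radius=2, bias=2.0, alpha=2e-05, beta=0.75)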