ModelZoo / ResNet50_tensorflow / Commits
"tests/git@developer.sourcefind.cn:wangsen/mineru.git" did not exist on "a09d9e8ca6a3c243a0b474576414b9fa31452933"
Commit 7bea23de, authored Jul 28, 2017 by Marianne Linhares Monteiro, committed by GitHub on Jul 28, 2017

Adding hook for images/sec and other small changes

Parent: f24c44d5
Showing 1 changed file with 104 additions and 24 deletions.

tutorials/image/cifar10_estimator/cifar10_main.py  (+104, -24)  View file @ 7bea23de
@@ -36,6 +36,10 @@ import os
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf

+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
 import cifar10
 import cifar10_model
@@ -50,7 +54,7 @@ tf.flags.DEFINE_string('data_dir', '',
 tf.flags.DEFINE_string('model_dir', '',
                        'The directory where the model will be stored.')
-tf.flags.DEFINE_boolean('is_cpu_ps', False,
+tf.flags.DEFINE_boolean('is_cpu_ps', True,
                         'If using CPU as the parameter server.')
 tf.flags.DEFINE_integer('num_gpus', 1,
@@ -58,7 +62,7 @@ tf.flags.DEFINE_integer('num_gpus', 1,
 tf.flags.DEFINE_integer('num_layers', 44, 'The number of layers of the model.')
-tf.flags.DEFINE_integer('train_steps', 10000,
+tf.flags.DEFINE_integer('train_steps', 80000,
                         'The number of steps to use for training.')
 tf.flags.DEFINE_integer('train_batch_size', 128, 'Batch size for training.')
@@ -75,14 +79,16 @@ tf.flags.DEFINE_boolean('use_distortion_for_training', True,
 tf.flags.DEFINE_boolean('run_experiment', False,
                         'If True will run an experiment,'
                         'otherwise will run training and evaluation'
-                        'using the estimator interface')
+                        'using the estimator interface.'
+                        'Experiments perform training on several workers in parallel'
+                        ', in other words experiments know how to invoke train and'
+                        ' eval in a sensible fashion for distributed training.')
 tf.flags.DEFINE_boolean('sync', False,
                         'If true when running in a distributed environment'
                         'will run on sync mode')
 tf.flags.DEFINE_integer('num_workers', 1, 'Number of workers')

 # Perf flags
 tf.flags.DEFINE_integer('num_intra_threads', 1,
@@ -110,16 +116,80 @@ tf.flags.DEFINE_boolean('log_device_placement', False,
                         'Whether to log device placement.')


-class ParamServerDeviceSetter(object):
-  """Helper class to assign variables on the least loaded ps-device."""
-
-  def __init__(self, worker_device, ps_devices):
-    """Initializer for ParamServerDeviceSetter.
+# TODO(jamesqin): Replace with fix in b/62239022
+class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
+  """Hook to print out examples per second
+
+    Total time is tracked and then divided by the total number of steps
+    to get the average step time and then batch_size is used to determine
+    the running average of examples per second. The examples per second for
+    the most recent interval is also logged.
+  """
+
+  def __init__(self, batch_size, every_n_steps=100, every_n_secs=None,):
+    """Initializer for ExamplesPerSecondHook.
+
+    Args:
+      batch_size: Total batch size used to calculate examples/second from
+        global time.
+      every_n_steps: Log stats every n steps.
+      every_n_secs: Log stats every n seconds.
+    """
+    if (every_n_steps is None) == (every_n_secs is None):
+      raise ValueError(
+          'exactly one of every_n_steps and every_n_secs should be provided.')
+    self._timer = basic_session_run_hooks.SecondOrStepTimer(
+        every_steps=every_n_steps, every_secs=every_n_secs)
+
+    self._step_train_time = 0
+    self._total_steps = 0
+    self._batch_size = batch_size
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          'Global step should be created to use StepCounterHook.')
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+
+    global_step = run_values.results
+    if self._timer.should_trigger_for_step(global_step):
+      elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
+          global_step)
+      if elapsed_time is not None:
+        steps_per_sec = elapsed_steps / elapsed_time
+        self._step_train_time += elapsed_time
+        self._total_steps += elapsed_steps
+
+        average_examples_per_sec = self._batch_size * (
+            self._total_steps / self._step_train_time)
+        current_examples_per_sec = steps_per_sec * self._batch_size
+        # Average examples/sec followed by current examples/sec
+        logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
+                     average_examples_per_sec, current_examples_per_sec,
+                     self._total_steps)
+
+
+class GpuParamServerDeviceSetter(object):
+  """Used with tf.device() to place variables on the least loaded GPU.
+
+    A common use for this class is to pass a list of GPU devices, e.g. ['gpu:0',
+    'gpu:1','gpu:2'], as ps_devices. When each variable is placed, it will be
+    placed on the least loaded gpu. All other Ops, which will be the computation
+    Ops, will be placed on the worker_device.
+  """
+
+  def __init__(self, worker_device, ps_devices):
+    """Initializer for GpuParamServerDeviceSetter.

     Args:
-      worker_device: the device to use for computer ops.
-      ps_devices: a list of devices to use for Variable ops. Each variable is
+      worker_device: the device to use for computation Ops.
+      ps_devices: a list of devices to use for Variable Ops. Each variable is
        assigned to the least loaded device.
     """
     self.ps_devices = ps_devices
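Note: the two quantities the new hook logs can be checked by hand. The snippet below is a standalone sketch with made-up numbers (the batch size, step counts and timings are assumptions, not values from this commit): the average rate uses the accumulated step time, while the current rate uses only the most recent logging interval.

batch_size = 128
elapsed_steps, elapsed_time = 10, 0.5       # most recent logging interval (assumed)
total_steps, total_time = 1000, 52.0        # accumulated over the whole run (assumed)

steps_per_sec = elapsed_steps / elapsed_time
current_examples_per_sec = steps_per_sec * batch_size                # 2560.0
average_examples_per_sec = batch_size * (total_steps / total_time)   # ~2461.5
print(average_examples_per_sec, current_examples_per_sec)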
@@ -132,6 +202,7 @@ class ParamServerDeviceSetter(object):
     if op.type not in ['Variable', 'VariableV2', 'VarHandleOp']:
       return self.worker_device

+    # Gets the least loaded ps_device
     device_index, _ = min(enumerate(self.ps_sizes), key=operator.itemgetter(1))
     device_name = self.ps_devices[device_index]
     var_size = op.outputs[0].get_shape().num_elements()
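Note: the placement rule above (pick the ps device whose accumulated variable size is smallest, then add the new variable's size to it) can be illustrated without TensorFlow. This is a simplified standalone sketch of the same greedy policy; the variable sizes and device names are made up.

def assign_least_loaded(var_sizes, ps_devices):
  # Greedy "least loaded" assignment: each variable goes to the device
  # whose accumulated size is currently smallest.
  loads = [0] * len(ps_devices)
  placement = []
  for size in var_sizes:
    idx = min(range(len(loads)), key=lambda i: loads[i])
    loads[idx] += size
    placement.append(ps_devices[idx])
  return placement

print(assign_least_loaded([100, 80, 60, 40], ['/gpu:0', '/gpu:1']))
# ['/gpu:0', '/gpu:1', '/gpu:1', '/gpu:0']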
@@ -139,15 +210,16 @@ class ParamServerDeviceSetter(object):
     return device_name


-def _create_device_setter(is_cpu_ps, worker):
+def _create_device_setter(is_cpu_ps, worker, num_gpus):
   """Create device setter object."""
   if is_cpu_ps:
+    # tf.train.replica_device_setter supports placing variables on the CPU, all
+    # on one GPU, or on ps_servers defined in a cluster_spec.
     return tf.train.replica_device_setter(
         worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
   else:
-    gpus = ['/gpu:%d' % i for i in range(FLAGS.num_gpus)]
-    return ParamServerDeviceSetter(worker, gpus)
+    gpus = ['/gpu:%d' % i for i in range(num_gpus)]
+    return GpuParamServerDeviceSetter(worker, gpus)


 def _resnet_model_fn(features, labels, mode):
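Note: for the CPU path, tf.train.replica_device_setter returns a device function that pins variable ops to the ps device and leaves other ops on the worker device. A minimal graph-mode check, assuming TensorFlow 1.x (the device strings and shapes below are illustrative only):

import tensorflow as tf

setter = tf.train.replica_device_setter(
    worker_device='/gpu:0', ps_device='/cpu:0', ps_tasks=1)
with tf.device(setter):
  v = tf.get_variable('v', [10])   # variable op -> CPU parameter device
  y = v * 2.0                      # compute op  -> GPU worker device
print(v.device, y.device)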
@@ -181,7 +253,7 @@ def _resnet_model_fn(features, labels, mode):
   if num_gpus != 0:
     for i in range(num_gpus):
       worker = '/gpu:%d' % i
-      device_setter = _create_device_setter(is_cpu_ps, worker)
+      device_setter = _create_device_setter(is_cpu_ps, worker, FLAGS.num_gpus)
       with tf.variable_scope('resnet', reuse=bool(i != 0)):
         with tf.name_scope('tower_%d' % i) as name_scope:
           with tf.device(device_setter):
@@ -210,7 +282,7 @@ def _resnet_model_fn(features, labels, mode):
   ps_device = '/cpu:0' if is_cpu_ps else '/gpu:0'
   with tf.device(ps_device):
     with tf.name_scope('gradient_averaging'):
-      loss = tf.reduce_mean(tower_losses)
+      loss = tf.reduce_mean(tower_losses, name='loss')
       for zipped_gradvars in zip(*tower_gradvars):
         # Averaging one var's gradients computed from multiple towers
         var = zipped_gradvars[0][1]
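Note: naming the averaged loss matters because main() (further down in this diff) refers to it by its graph name. In TF 1.x graph mode an op created inside a name_scope gets the scope as a prefix, which is exactly the string handed to LoggingTensorHook in tensors_to_log. A tiny standalone illustration (the constant values are arbitrary):

import tensorflow as tf

with tf.name_scope('gradient_averaging'):
  loss = tf.reduce_mean([1.0, 2.0, 3.0], name='loss')
print(loss.op.name)  # gradient_averaging/loss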
@@ -403,7 +475,7 @@ def main(unused_argv):
   train_steps = FLAGS.train_steps
   eval_steps = num_eval_examples // FLAGS.eval_batch_size

-  # session configuration
+  # Session configuration.
   sess_config = tf.ConfigProto()
   sess_config.allow_soft_placement = True
   sess_config.log_device_placement = FLAGS.log_device_placement
@@ -411,18 +483,26 @@ def main(unused_argv):
   sess_config.inter_op_parallelism_threads = FLAGS.num_inter_threads
   sess_config.gpu_options.force_gpu_compatible = FLAGS.force_gpu_compatible

-  # log learning_rate
-  tensors_to_log = {'learning_rate': 'learning_rate'}
+  # Hooks that add extra logging that is useful to see the loss more often in
+  # the console as well as examples per second.
+  tensors_to_log = {'learning_rate': 'learning_rate',
+                    'loss': 'gradient_averaging/loss'}

   logging_hook = tf.train.LoggingTensorHook(
       tensors=tensors_to_log, every_n_iter=100)
+  examples_sec_hook = ExamplesPerSecondHook(
+      FLAGS.train_batch_size, every_n_steps=10)
+
+  hooks = [logging_hook, examples_sec_hook]

   if FLAGS.run_experiment:
     config = tf.contrib.learn.RunConfig(model_dir=FLAGS.model_dir)
     config = config.replace(session_config=sess_config)

     tf.contrib.learn.learn_runner.run(
         get_experiment_fn(train_input_fn, eval_input_fn,
-                          train_steps, eval_steps, [logging_hook]),
+                          train_steps, eval_steps, hooks),
         run_config=config)
   else:
     config = tf.estimator.RunConfig()
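Note: to see the two hooks working together outside the tutorial, here is a self-contained toy sketch. It assumes TensorFlow 1.x and the ExamplesPerSecondHook defined earlier in this file; the model, input function, batch size and step counts are invented for illustration and are not part of the commit.

import numpy as np
import tensorflow as tf

def _toy_model_fn(features, labels, mode):
  # One trainable scalar fit by gradient descent; just enough to drive training.
  w = tf.get_variable('w', [], initializer=tf.zeros_initializer())
  loss = tf.reduce_mean(tf.square(features * w - labels), name='loss')
  train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
      loss, global_step=tf.train.get_or_create_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

def _toy_input_fn():
  x = tf.constant(np.ones(128, dtype=np.float32))
  return x, 3.0 * x

estimator = tf.estimator.Estimator(model_fn=_toy_model_fn)
logging_hook = tf.train.LoggingTensorHook({'loss': 'loss'}, every_n_iter=10)
examples_sec_hook = ExamplesPerSecondHook(128, every_n_steps=10)
estimator.train(input_fn=_toy_input_fn, steps=30,
                hooks=[logging_hook, examples_sec_hook])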
@@ -433,7 +513,7 @@ def main(unused_argv):
     print('Starting to train...')
     classifier.train(input_fn=train_input_fn,
                      steps=train_steps,
-                     hooks=[logging_hook])
+                     hooks=hooks)
     print('Starting to evaluate...')
     eval_results = classifier.evaluate(