ModelZoo / ResNet50_tensorflow
"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "63cdf9c0ba20d11f30c07c6b73a3e80ae9eb99dd"
Commit 2164c8db, authored Aug 17, 2017 by Eli Bixby

    Move device and hook to utils. Fix device stuff

Parent: 8b829873
Showing 4 changed files with 162 additions and 178 deletions:
tutorials/image/cifar10_estimator/README.md                      +1   -7
tutorials/image/cifar10_estimator/cifar10_main.py                +54  -159
tutorials/image/cifar10_estimator/cifar10_utils.py               +100 -0
tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py  +7   -12
tutorials/image/cifar10_estimator/README.md
@@ -59,7 +59,6 @@ $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-
 # Run the model on 2 GPUs using CPU as parameter server. After training, it runs the evaluation.
 $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
     --job-dir=/tmp/cifar10 \
-    --force-gpu-compatible \
     --num-gpus=2 \
     --train-steps=1000
@@ -68,8 +67,7 @@ $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-
 # a couple of times to perform evaluation.
 $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
     --job-dir=/tmp/cifar10 \
-    --avg-on-gpu \
-    --force-gpu-compatible \
+    --variable-strategy GPU \
     --num-gpus=2 \
@@ -102,7 +100,6 @@ gcloud ml-engine jobs submit training cifarmultigpu \
     --module-name cifar10_estimator.cifar10_main \
     -- \
     --data-dir=$MY_BUCKET/cifar-10-batches-py \
-    --force-gpu-compatible \
     --num-gpus=4 \
     --train-steps=1000
 ```
@@ -183,11 +180,9 @@ Once you have a `TF_CONFIG` configured properly on each host you're ready to run
 # Make sure the model_dir is the same as defined on the TF_CONFIG.
 $ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
     --job-dir=gs://path/model_dir/ \
-    --force-gpu-compatible \
     --num-gpus=4 \
     --train-steps=40000 \
     --sync \
-    \
     --num-workers=2
 ```
@@ -325,7 +320,6 @@ INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step =
 # Make sure the model_dir is the same as defined on the TF_CONFIG.
 $ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
     --job-dir=gs://path/model_dir/ \
-    --force-gpu-compatible \
     --num-gpus=4 \
     --train-steps=40000 \
     --sync
tutorials/image/cifar10_estimator/cifar10_main.py
@@ -30,138 +30,28 @@ from __future__ import print_function

 import argparse
 import functools
-import operator
+import itertools
 import os

+import six
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util

 import cifar10
 import cifar10_model
+import cifar10_utils

 tf.logging.set_verbosity(tf.logging.INFO)


-class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
-  """Hook to print out examples per second.
-
-  Total time is tracked and then divided by the total number of steps
-  to get the average step time and then batch_size is used to determine
-  the running average of examples per second. The examples per second for the
-  most recent interval is also logged.
-  """
-
-  def __init__(self, batch_size, every_n_steps=100, every_n_secs=None,):
-    """Initializer for ExamplesPerSecondHook.
-
-    Args:
-      batch_size: Total batch size used to calculate examples/second from
-        global time.
-      every_n_steps: Log stats every n steps.
-      every_n_secs: Log stats every n seconds.
-    """
-    if (every_n_steps is None) == (every_n_secs is None):
-      raise ValueError('exactly one of every_n_steps'
-                       ' and every_n_secs should be provided.')
-    self._timer = basic_session_run_hooks.SecondOrStepTimer(
-        every_steps=every_n_steps, every_secs=every_n_secs)
-
-    self._step_train_time = 0
-    self._total_steps = 0
-    self._batch_size = batch_size
-
-  def begin(self):
-    self._global_step_tensor = training_util.get_global_step()
-    if self._global_step_tensor is None:
-      raise RuntimeError(
-          'Global step should be created to use StepCounterHook.')
-
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
-
-  def after_run(self, run_context, run_values):
-    _ = run_context
-    global_step = run_values.results
-    if self._timer.should_trigger_for_step(global_step):
-      elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
-          global_step)
-      if elapsed_time is not None:
-        steps_per_sec = elapsed_steps / elapsed_time
-        self._step_train_time += elapsed_time
-        self._total_steps += elapsed_steps
-        average_examples_per_sec = self._batch_size * (
-            self._total_steps / self._step_train_time)
-        current_examples_per_sec = steps_per_sec * self._batch_size
-        # Average examples/sec followed by current examples/sec
-        logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
-                     average_examples_per_sec, current_examples_per_sec,
-                     self._total_steps)
-
-
-class GpuParamServerDeviceSetter(object):
-  """Used with tf.device() to place variables on the least loaded GPU.
-
-  A common use for this class is to pass a list of GPU devices, e.g. ['gpu:0',
-  'gpu:1','gpu:2'], as ps_devices.  When each variable is placed, it will be
-  placed on the least loaded gpu. All other Ops, which will be the computation
-  Ops, will be placed on the worker_device.
-  """
-
-  def __init__(self, worker_device, ps_devices):
-    """Initializer for GpuParamServerDeviceSetter.
-
-    Args:
-      worker_device: the device to use for computation Ops.
-      ps_devices: a list of devices to use for Variable Ops. Each variable is
-        assigned to the least loaded device.
-    """
-    self.ps_devices = ps_devices
-    self.worker_device = worker_device
-    self.ps_sizes = [0] * len(self.ps_devices)
-
-  def __call__(self, op):
-    if op.device:
-      return op.device
-    if op.type not in ['Variable', 'VariableV2', 'VarHandleOp']:
-      return self.worker_device
-
-    # Gets the least loaded ps_device
-    device_index, _ = min(enumerate(self.ps_sizes), key=operator.itemgetter(1))
-    device_name = self.ps_devices[device_index]
-    var_size = op.outputs[0].get_shape().num_elements()
-    self.ps_sizes[device_index] += var_size
-
-    return device_name
-
-
-def _create_device_setter(avg_on_gpu, worker, num_gpus):
-  """Create device setter object."""
-  if avg_on_gpu:
-    gpus = ['/gpu:%d' % i for i in range(num_gpus)]
-    return GpuParamServerDeviceSetter(worker, gpus)
-  else:
-    # tf.train.replica_device_setter supports placing variables on the CPU, all
-    # on one GPU, or on ps_servers defined in a cluster_spec.
-    return tf.train.replica_device_setter(
-        worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
-
-
-def get_model_fn(num_gpus, avg_on_gpu, num_workers):
+def get_model_fn(num_gpus, variable_strategy, num_workers):
   def _resnet_model_fn(features, labels, mode, params):
     """Resnet model body.

     Support single host, one or more GPU training. Parameter distribution can
     be either one of the following scheme.
     1. CPU is the parameter server and manages gradient updates.
     2. Parameters are distributed evenly across all GPUs, and the first GPU
        manages gradient updates.
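
The GpuParamServerDeviceSetter removed above (and the local_device_setter that replaces it in cifar10_utils.py) both rely on the fact that tf.device() accepts a function, not just a device string: the function is called for every op created inside the block and returns the device to place it on. A minimal sketch of that mechanism, with illustrative device names that are not part of this commit:

    import tensorflow as tf

    def toy_device_setter(op):
      # Route variable ops to one device and every compute op to another.
      if op.type in ['Variable', 'VariableV2', 'VarHandleOp']:
        return '/gpu:0'
      return '/gpu:1'

    with tf.device(toy_device_setter):
      v = tf.get_variable('v', shape=[10])  # variable op -> placed on /gpu:0
      out = v * 2.0                         # mul op      -> placed on /gpu:1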
@@ -186,8 +76,19 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
     if num_gpus != 0:
       for i in range(num_gpus):
-        worker = '/gpu:%d' % i
-        device_setter = _create_device_setter(avg_on_gpu, worker, num_gpus)
+        worker_device = '/gpu:{}'.format(i)
+        if variable_strategy == 'CPU':
+          device_setter = cifar10_utils.local_device_setter(
+              worker_device=worker_device)
+        elif variable_strategy == 'GPU':
+          device_setter = cifar10_utils.local_device_setter(
+              ps_device_type='gpu',
+              worker_device=worker_device,
+              ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
+                  num_gpus, tf.contrib.training.byte_size_load_fn))
         with tf.variable_scope('resnet', reuse=bool(i != 0)):
           with tf.name_scope('tower_%d' % i) as name_scope:
             with tf.device(device_setter):
@@ -231,22 +132,25 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
     # Now compute global loss and gradients.
     gradvars = []
-    # Server that runs the ops to apply global gradient updates.
-    avg_device = '/gpu:0' if avg_on_gpu else '/cpu:0'
-    with tf.device(avg_device):
-      with tf.name_scope('gradient_averaging'):
-        loss = tf.reduce_mean(tower_losses, name='loss')
-        for zipped_gradvars in zip(*tower_gradvars):
-          # Averaging one var's gradients computed from multiple towers
-          var = zipped_gradvars[0][1]
-          grads = [gv[0] for gv in zipped_gradvars]
-          with tf.device(var.device):
-            if len(grads) == 1:
-              avg_grad = grads[0]
-            else:
-              avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
-            gradvars.append((avg_grad, var))
+    with tf.name_scope('gradient_averaging'):
+      all_grads = {}
+      for grad, var in itertools.chain(*tower_gradvars):
+        if grad is not None:
+          all_grads.setdefault(var, []).append(grad)
+      for var, grads in six.iteritems(all_grads):
+        # Average gradients on the same device as the variables
+        # to which they apply.
+        with tf.device(var.device):
+          if len(grads) == 1:
+            avg_grad = grads[0]
+          else:
+            avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
+        gradvars.append((avg_grad, var))
+
+    # Device that runs the ops to apply global gradient updates.
+    consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
+    with tf.device(consolidation_device):
       # Suggested learning rate scheduling from
       # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
       # users could apply other scheduling.
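
The rewritten block above groups each variable's gradients from all towers and averages them on the device where that variable lives, instead of averaging everything on a single avg_device. The grouping logic, stripped of TensorFlow, is just this (plain numbers stand in for gradient tensors; a sketch, not part of the commit):

    import itertools

    tower_gradvars = [
        [(1.0, 'w'), (2.0, 'b')],   # tower 0
        [(3.0, 'w'), (4.0, 'b')],   # tower 1
    ]

    all_grads = {}
    for grad, var in itertools.chain(*tower_gradvars):
      if grad is not None:
        all_grads.setdefault(var, []).append(grad)

    gradvars = [(sum(grads) / len(grads), var)
                for var, grads in all_grads.items()]
    # gradvars == [(2.0, 'w'), (3.0, 'b')]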
@@ -292,6 +196,7 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
     metrics = {
         'accuracy': tf.metrics.accuracy(stacked_labels, predictions['classes'])
     }
+    loss = tf.reduce_mean(tower_losses, name='loss')
     return tf.estimator.EstimatorSpec(
         mode=mode,
@@ -345,7 +250,7 @@ def _tower_fn(is_training,
   tower_grad = tf.gradients(tower_loss, model_params)

-  return tower_loss, tower_grad, tower_pred
+  return tower_loss, zip(tower_grad, model_params), tower_pred


 def input_fn(data_dir, subset, num_shards, batch_size,
@@ -433,11 +338,11 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
     train_steps = hparams.train_steps
     eval_steps = num_eval_examples // hparams.eval_batch_size

-    examples_sec_hook = ExamplesPerSecondHook(
+    examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
         hparams.train_batch_size, every_n_steps=10)

     tensors_to_log = {'learning_rate': 'learning_rate',
-                      'loss': 'gradient_averaging/loss'}
+                      'loss': 'loss'}

     logging_hook = tf.train.LoggingTensorHook(
         tensors=tensors_to_log, every_n_iter=100)
@@ -446,7 +351,7 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
     classifier = tf.estimator.Estimator(
         model_fn=get_model_fn(
-            num_gpus, is_gpu_ps, run_config.num_worker_replicas),
+            num_gpus, is_gpu_ps, run_config.num_worker_replicas or 1),
         config=run_config,
         params=vars(hparams)
     )
@@ -467,11 +372,10 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
 def main(job_dir,
          data_dir,
          num_gpus,
-         avg_on_gpu,
+         variable_strategy,
          use_distortion_for_training,
          log_device_placement,
          num_intra_threads,
-         force_gpu_compatible,
          **hparams):
   # The env variable is on deprecation path, default is set to off.
   os.environ['TF_SYNC_ON_FINISH'] = '0'
@@ -482,7 +386,7 @@ def main(job_dir,
       log_device_placement=log_device_placement,
       intra_op_parallelism_threads=num_intra_threads,
       gpu_options=tf.GPUOptions(
-          force_gpu_compatible=force_gpu_compatible
+          force_gpu_compatible=True
       )
   )
@@ -493,7 +397,7 @@ def main(job_dir,
       get_experiment_fn(
           data_dir,
           num_gpus,
-          avg_on_gpu,
+          variable_strategy,
           use_distortion_for_training
       ),
       run_config=config,
@@ -516,10 +420,11 @@ if __name__ == '__main__':
       help='The directory where the model will be stored.'
   )
   parser.add_argument(
-      '--avg-on-gpu',
-      action='store_true',
-      default=False,
-      help='If present, use GPU to average gradients.'
+      '--variable_strategy',
+      choices=['CPU', 'GPU'],
+      type=str,
+      default='CPU',
+      help='Where to locate variable operations'
   )
   parser.add_argument(
       '--num-gpus',
@@ -562,8 +467,7 @@ if __name__ == '__main__':
       type=float,
       default=2e-4,
       help='Weight decay for convolutions.'
   )
   parser.add_argument(
       '--learning-rate',
       type=float,
@@ -609,15 +513,6 @@ if __name__ == '__main__':
       system will pick an appropriate number.\
       """
   )
-  parser.add_argument(
-      '--force-gpu-compatible',
-      action='store_true',
-      default=False,
-      help="""\
-      Whether to enable force_gpu_compatible in GPU_Options. Check
-      tensorflow/core/protobuf/config.proto#L69 for details.\
-      """
-  )
   parser.add_argument(
       '--log-device-placement',
       action='store_true',
@@ -641,7 +536,7 @@ if __name__ == '__main__':
   if args.num_gpus < 0:
     raise ValueError(
         'Invalid GPU count: \"num_gpus\" must be 0 or a positive integer.')
-  if args.num_gpus == 0 and args.avg_on_gpu:
+  if args.num_gpus == 0 and args.variable_strategy == 'GPU':
     raise ValueError(
         'No GPU available for use, must use CPU to average gradients.')
   if (args.num_layers - 2) % 6 != 0:
tutorials/image/cifar10_estimator/cifar10_utils.py  (new file, mode 100644)
+import six
+
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+from tensorflow.python.training import device_setter
+
+
+class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
+  """Hook to print out examples per second.
+
+  Total time is tracked and then divided by the total number of steps
+  to get the average step time and then batch_size is used to determine
+  the running average of examples per second. The examples per second for the
+  most recent interval is also logged.
+  """
+
+  def __init__(self, batch_size, every_n_steps=100, every_n_secs=None,):
+    """Initializer for ExamplesPerSecondHook.
+
+    Args:
+      batch_size: Total batch size used to calculate examples/second from
+        global time.
+      every_n_steps: Log stats every n steps.
+      every_n_secs: Log stats every n seconds.
+    """
+    if (every_n_steps is None) == (every_n_secs is None):
+      raise ValueError('exactly one of every_n_steps'
+                       ' and every_n_secs should be provided.')
+    self._timer = basic_session_run_hooks.SecondOrStepTimer(
+        every_steps=every_n_steps, every_secs=every_n_secs)
+
+    self._step_train_time = 0
+    self._total_steps = 0
+    self._batch_size = batch_size
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          'Global step should be created to use StepCounterHook.')
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+    global_step = run_values.results
+    if self._timer.should_trigger_for_step(global_step):
+      elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
+          global_step)
+      if elapsed_time is not None:
+        steps_per_sec = elapsed_steps / elapsed_time
+        self._step_train_time += elapsed_time
+        self._total_steps += elapsed_steps
+        average_examples_per_sec = self._batch_size * (
+            self._total_steps / self._step_train_time)
+        current_examples_per_sec = steps_per_sec * self._batch_size
+        # Average examples/sec followed by current examples/sec
+        logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
+                     average_examples_per_sec, current_examples_per_sec,
+                     self._total_steps)
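
A sketch of how the relocated hook is consumed: construct it with the global (all-tower) batch size and pass it wherever the Estimator accepts hooks. cifar10_main.py wires it up through get_experiment_fn; the estimator and input_fn below are placeholders, not part of this commit:

    import tensorflow as tf
    import cifar10_utils

    examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
        batch_size=128, every_n_steps=10)
    # estimator = tf.estimator.Estimator(model_fn=..., model_dir=...)
    # estimator.train(input_fn=..., hooks=[examples_sec_hook])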
+def local_device_setter(num_devices=1,
+                        ps_device_type='cpu',
+                        worker_device='/cpu:0',
+                        ps_ops=None,
+                        ps_strategy=None):
+  if ps_ops == None:
+    ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
+
+  if ps_strategy is None:
+    ps_strategy = device_setter._RoundRobinStrategy(num_devices)
+  if not six.callable(ps_strategy):
+    raise TypeError("ps_strategy must be callable")
+
+  def _local_device_chooser(op):
+    current_device = pydev.DeviceSpec.from_string(op.device or "")
+
+    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
+    if node_def.op in ps_ops:
+      ps_device_spec = pydev.DeviceSpec.from_string(
+          '/{}:{}'.format(ps_device_type, ps_strategy(op)))
+
+      ps_device_spec.merge_from(current_device)
+      return ps_device_spec.to_string()
+    else:
+      worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
+      worker_device_spec.merge_from(current_device)
+      return worker_device_spec.to_string()
+  return _local_device_chooser
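
A sketch of the intended use, mirroring the new code in cifar10_main.py: under tf.device(device_setter), variable ops are routed to a parameter device chosen by ps_strategy (round-robin by default, greedy byte-size balancing below), while every other op stays on the worker device. TF 1.x contrib API; the shapes and names are illustrative:

    import tensorflow as tf
    import cifar10_utils

    device_setter = cifar10_utils.local_device_setter(
        ps_device_type='gpu',
        worker_device='/gpu:0',
        ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
            2, tf.contrib.training.byte_size_load_fn))

    with tf.device(device_setter):
      w = tf.get_variable('w', shape=[1024, 1024])  # -> least-loaded /gpu:N
      y = tf.matmul(tf.ones([8, 1024]), w)          # -> stays on /gpu:0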
tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py
@@ -28,10 +28,6 @@ import os
 import tensorflow as tf

-FLAGS = None

 def _int64_feature(value):
   return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
@@ -75,12 +71,12 @@ def convert_to_tfrecord(input_files, output_file):
       record_writer.write(example.SerializeToString())


-def main(unused_argv):
+def main(input_dir, output_dir):
   file_names = _get_file_names()
   for mode, files in file_names.items():
     input_files = [
-        os.path.join(FLAGS.input_dir, f) for f in files]
-    output_file = os.path.join(FLAGS.output_dir, mode + '.tfrecords')
+        os.path.join(input_dir, f) for f in files]
+    output_file = os.path.join(output_dir, mode + '.tfrecords')
     # Convert to Examples and write the result to TFRecords.
     convert_to_tfrecord(input_files, output_file)
   print('Done!')
@@ -89,13 +85,13 @@ def main(unused_argv):
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
-      '--input_dir',
+      '--input-dir',
       type=str,
       default='',
       help='Directory where CIFAR10 data is located.'
   )
   parser.add_argument(
-      '--output_dir',
+      '--output-dir',
       type=str,
       default='',
       help="""\
@@ -103,6 +99,5 @@ if __name__ == '__main__':
       name as the CIFAR10 inputs + .tfrecords.\
       """
   )
-  FLAGS = parser.parse_args()
-  tf.app.run(main)
+  args = parser.parse_args()
+  main(args.input_dir, args.output_dir)
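
With FLAGS gone, the script's entry point is an ordinary function, so it can be driven from the shell or called directly from Python. A hedged example of both (the paths are illustrative):

    # Shell:
    #   python generate_cifar10_tfrecords.py --input-dir=./cifar-10-batches-py \
    #       --output-dir=./cifar10_tfrecords
    # Python equivalent of what the argparse block above ends up calling:
    import generate_cifar10_tfrecords as gen

    gen.main('./cifar-10-batches-py', './cifar10_tfrecords')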