ModelZoo / ResNet50_tensorflow / Commits / 74ecc048

Commit 74ecc048 authored Jul 24, 2017 by Marianne Linhares Monteiro, committed by GitHub Jul 24, 2017

Refactoring and adding sync mode

parent 28328ae3

Showing 1 changed file with 72 additions and 98 deletions.

tutorials/image/cifar10_estimator/cifar10_main.py  (+72, -98)
@@ -36,6 +36,7 @@ import os
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
+from tensorflow.contrib.learn.python.learn import learn_runner  # run the experiment

 import cifar10
 import cifar10_model
@@ -44,13 +45,13 @@ tf.logging.set_verbosity(tf.logging.INFO)
 FLAGS = tf.flags.FLAGS

-tf.flags.DEFINE_string('data_dir', '',
+tf.flags.DEFINE_string('data_dir', 'cifar10',
                        'The directory where the CIFAR-10 input data is stored.')

-tf.flags.DEFINE_string('model_dir', '',
+tf.flags.DEFINE_string('model_dir', 'output2_2',
                        'The directory where the model will be stored.')

-tf.flags.DEFINE_boolean('is_cpu_ps', False,
+tf.flags.DEFINE_boolean('is_cpu_ps', True,
                         'If using CPU as the parameter server.')

 tf.flags.DEFINE_integer('num_gpus', 1,
@@ -58,12 +59,12 @@ tf.flags.DEFINE_integer('num_gpus', 1,
 tf.flags.DEFINE_integer('num_layers', 44, 'The number of layers of the model.')

-tf.flags.DEFINE_integer('train_steps', 10000,
-                        'The number of steps to use for training.')
-tf.flags.DEFINE_integer('train_batch_size', 128, 'Batch size for training.')
+tf.flags.DEFINE_integer('train_batch_size', 1024,
+                        'Batch size for training.')
+tf.flags.DEFINE_integer('train_steps', (50000.0 / FLAGS.train_batch_size) * 40,
+                        'The number of steps to use for training.')  # 40 epochs

-tf.flags.DEFINE_integer('eval_batch_size', 100, 'Batch size for validation.')
+tf.flags.DEFINE_integer('eval_batch_size', 200, 'Batch size for validation.')

 tf.flags.DEFINE_float('momentum', 0.9, 'Momentum for MomentumOptimizer.')
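The new train_steps default encodes an epoch budget instead of a hard-coded step count: CIFAR-10 has 50,000 training images, so the number of steps follows from the batch size. A quick sketch of the arithmetic behind the values above (plain Python, for illustration only):

# Values taken from the flags above; 50000 is the CIFAR-10 training-set size.
train_batch_size = 1024
epochs = 40

steps_per_epoch = 50000.0 / train_batch_size  # ~48.8 batches per pass over the data
train_steps = steps_per_epoch * epochs        # ~1953 steps for a 40-epoch budget
print(int(train_steps))                       # 1953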
@@ -71,10 +72,6 @@ tf.flags.DEFINE_float('weight_decay', 1e-4, 'Weight decay for convolutions.')
 tf.flags.DEFINE_boolean('use_distortion_for_training', True,
                         'If doing image distortion for training.')

-tf.flags.DEFINE_boolean('run_experiment', False,
-                        'If True will run an experiment,'
-                        'otherwise will run training and evaluation'
-                        'using the estimator interface')
-
 # Perf flags
 tf.flags.DEFINE_integer('num_intra_threads', 1,
@@ -141,7 +138,6 @@ def _create_device_setter(is_cpu_ps, worker):
   gpus = ['/gpu:%d' % i for i in range(FLAGS.num_gpus)]
   return ParamServerDeviceSetter(worker, gpus)


 def _resnet_model_fn(features, labels, mode):
   """Resnet model body.
@@ -175,24 +171,24 @@ def _resnet_model_fn(features, labels, mode):
       worker = '/gpu:%d' % i
       device_setter = _create_device_setter(is_cpu_ps, worker)
       with tf.variable_scope('resnet', reuse=bool(i != 0)):
         with tf.name_scope('tower_%d' % i) as name_scope:
           with tf.device(device_setter):
             _tower_fn(is_training, weight_decay, tower_features[i],
                       tower_labels[i], tower_losses, tower_gradvars,
                       tower_preds, False)
             if i == 0:
               # Only trigger batch_norm moving mean and variance update from the
               # 1st tower. Ideally, we should grab the updates from all towers
               # but these stats accumulate extremely fast so we can ignore the
               # other stats from the other towers without significant detriment.
               update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                              name_scope)
   else:
     with tf.variable_scope('resnet'), tf.device('/cpu:0'):
       with tf.name_scope('tower_cpu') as name_scope:
         _tower_fn(is_training, weight_decay, tower_features[0], tower_labels[0],
                   tower_losses, tower_gradvars, tower_preds, True)
         update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)

   # Now compute global loss and gradients.
   gradvars = []
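The tower loop above builds one model replica per GPU but shares a single set of weights by re-entering the 'resnet' variable scope with reuse set after the first tower. A minimal sketch of that reuse pattern, assuming TF1 graph-mode variable scopes (the variable names here are illustrative, not from this file):

import tensorflow as tf

# The first iteration creates 'resnet/w'; later iterations reuse the same variable.
for i in range(2):
  with tf.variable_scope('resnet', reuse=bool(i != 0)):
    w = tf.get_variable('w', shape=[3, 3], initializer=tf.zeros_initializer())
    print(w.name)  # prints 'resnet/w:0' on both iterations -> one shared variable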
@@ -204,51 +200,52 @@ def _resnet_model_fn(features, labels, mode):
   with tf.name_scope('gradient_averaging'):
     loss = tf.reduce_mean(tower_losses)
     for zipped_gradvars in zip(*tower_gradvars):
       # Averaging one var's gradients computed from multiple towers
       var = zipped_gradvars[0][1]
       grads = [gv[0] for gv in zipped_gradvars]
       with tf.device(var.device):
         if len(grads) == 1:
           avg_grad = grads[0]
         else:
           avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
         gradvars.append((avg_grad, var))

   # Suggested learning rate scheduling from
   # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
   # users could apply other scheduling.
   num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
       'train') // FLAGS.train_batch_size
   boundaries = [
       num_batches_per_epoch * x
       for x in np.array([82, 123, 300], dtype=np.int64)
   ]
   staged_lr = [0.1, 0.01, 0.001, 0.0002]

-  learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
-                                              boundaries, staged_lr)
+  global_step = tf.train.get_global_step()
+  learning_rate = tf.train.piecewise_constant(global_step,
+                                              boundaries, staged_lr)

   # Create a nicely-named tensor for logging
   learning_rate = tf.identity(learning_rate, name='learning_rate')

   optimizer = tf.train.MomentumOptimizer(
       learning_rate=learning_rate,
       momentum=momentum)

   # Create single grouped train op
   train_op = [
       optimizer.apply_gradients(
-          gradvars, global_step=tf.train.get_global_step())
+          gradvars, global_step=global_step)
   ]
   train_op.extend(update_ops)
   train_op = tf.group(*train_op)

   predictions = {
       'classes':
           tf.concat([p['classes'] for p in tower_preds], axis=0),
       'probabilities':
           tf.concat([p['probabilities'] for p in tower_preds], axis=0)
   }
   stacked_labels = tf.concat(labels, axis=0)
   metrics = {
       'accuracy': tf.metrics.accuracy(stacked_labels, predictions['classes'])
   }

   return tf.estimator.EstimatorSpec(
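The gradient_averaging block above reduces the per-tower (gradient, variable) pairs to a single averaged gradient per variable before apply_gradients. A standalone sketch of that reduction with toy constants standing in for real tower gradients (names and values are illustrative, not from this file):

import tensorflow as tf

# Two towers, each contributing a (gradient, variable-name) pair for 'w' and 'b'.
tower_gradvars = [
    [(tf.constant([1.0, 2.0]), 'w'), (tf.constant([0.5]), 'b')],  # tower 0
    [(tf.constant([3.0, 4.0]), 'w'), (tf.constant([1.5]), 'b')],  # tower 1
]

gradvars = []
for zipped_gradvars in zip(*tower_gradvars):
  # Every entry in zipped_gradvars refers to the same variable.
  var = zipped_gradvars[0][1]
  grads = [gv[0] for gv in zipped_gradvars]
  if len(grads) == 1:
    avg_grad = grads[0]
  else:
    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
  gradvars.append((avg_grad, var))

# gradvars now holds one averaged gradient tensor per variable:
# 'w' -> [2.0, 3.0] and 'b' -> [1.0].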
@@ -363,23 +360,21 @@ def input_fn(subset, num_shards):
   label_shards = [tf.parallel_stack(x) for x in label_shards]
   return feature_shards, label_shards


-def get_experiment_fn(train_input_fn, eval_input_fn, train_steps, eval_steps):
+def create_experiment_fn(train_input, test_input, hooks):
   def _experiment_fn(run_config, hparams):
-    del hparams  # unused arg
-    # create estimator
-    classifier = tf.estimator.Estimator(model_fn=_resnet_model_fn,
-                                        config=run_config)
-    return tf.contrib.learn.Experiment(
-        classifier,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        train_steps=train_steps,
-        eval_steps=eval_steps)
+    estimator = tf.estimator.Estimator(model_fn=_resnet_model_fn,
+                                       config=run_config,
+                                       model_dir=FLAGS.model_dir)
+    # create experiment
+    experiment = tf.contrib.learn.Experiment(
+        estimator,
+        train_input_fn=train_input,
+        eval_input_fn=test_input,
+        train_steps=FLAGS.train_steps)
+    experiment.extend_train_hooks(hooks)
+    return experiment
   return _experiment_fn


 def main(unused_argv):
   # The env variable is on deprecation path, default is set to off.
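The refactor attaches training hooks through Experiment.extend_train_hooks instead of passing them to classifier.train; anything implementing the tf.train.SessionRunHook interface can be threaded in this way. A minimal, hypothetical hook for illustration (not part of this file):

import tensorflow as tf

class ToyCounterHook(tf.train.SessionRunHook):
  """Counts how many times the training loop calls session.run."""

  def begin(self):
    self._calls = 0

  def after_run(self, run_context, run_values):
    self._calls += 1

  def end(self, session):
    print('training made %d run calls' % self._calls)

Such a hook could be appended to the hooks list that create_experiment_fn receives, alongside the LoggingTensorHook built in main() below.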
@@ -411,38 +406,17 @@ def main(unused_argv):
   sess_config.gpu_options.force_gpu_compatible = FLAGS.force_gpu_compatible
   config = config.replace(session_config=sess_config)

-  train_input_fn = functools.partial(input_fn, subset='train',
-                                     num_shards=FLAGS.num_gpus)
-  eval_input_fn = functools.partial(input_fn, subset='eval',
-                                    num_shards=FLAGS.num_gpus)
-
-  train_steps = FLAGS.train_steps
-  eval_steps = num_eval_examples // FLAGS.eval_batch_size
-
-  if FLAGS.run_experiment:
-    tf.contrib.learn.learn_runner.run(
-        get_experiment_fn(train_input_fn, eval_input_fn, train_steps,
-                          eval_steps),
-        run_config=config)
-  else:
-    classifier = tf.estimator.Estimator(model_fn=_resnet_model_fn,
-                                        config=config)
-
-    tensors_to_log = {'learning_rate': 'learning_rate'}
-    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
-                                              every_n_iter=100)
-
-    print('Starting to train...')
-    classifier.train(input_fn=train_input_fn, steps=train_steps,
-                     hooks=[logging_hook])
-
-    print('Starting to evaluate...')
-    eval_results = classifier.evaluate(input_fn=eval_input_fn,
-                                       steps=eval_steps)
-    print(eval_results)
+  train_input = functools.partial(input_fn, subset='train',
+                                  num_shards=FLAGS.num_gpus)
+  test_input = functools.partial(input_fn, subset='eval',
+                                 num_shards=FLAGS.num_gpus)
+
+  tensors_to_log = {'learning_rate': 'learning_rate'}
+  logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
+                                            every_n_iter=100)
+  hooks = [logging_hook]
+
+  # run experiment
+  learn_runner.run(
+      create_experiment_fn(train_input, test_input, hooks),
+      run_config=config)


 if __name__ == '__main__':
   tf.app.run()
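The refactored main() binds the dataset subset and shard count into the input functions with functools.partial, so the Experiment can call them with no arguments. A tiny standalone sketch of that pattern (the toy function below is illustrative, not the real input_fn):

import functools

def toy_input_fn(subset, num_shards):
  # Stand-in for the real input_fn(subset, num_shards).
  return '%s split across %d shards' % (subset, num_shards)

train_input = functools.partial(toy_input_fn, subset='train', num_shards=2)
test_input = functools.partial(toy_input_fn, subset='eval', num_shards=2)

print(train_input())  # train split across 2 shards
print(test_input())   # eval split across 2 shards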