ModelZoo / ResNet50_tensorflow · Commits · 74ecc048
Commit 74ecc048
Authored Jul 24, 2017 by Marianne Linhares Monteiro; committed by GitHub on Jul 24, 2017

Refactoring and adding sync mode

parent 28328ae3
Showing 1 changed file with 72 additions and 98 deletions.

tutorials/image/cifar10_estimator/cifar10_main.py  (+72 −98)
@@ -36,6 +36,7 @@ import os
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
+from tensorflow.contrib.learn.python.learn import learn_runner  # run the experiment
 
 import cifar10
 import cifar10_model
@@ -44,13 +45,13 @@ tf.logging.set_verbosity(tf.logging.INFO)
 FLAGS = tf.flags.FLAGS
 
-tf.flags.DEFINE_string('data_dir', '',
+tf.flags.DEFINE_string('data_dir', 'cifar10',
                        'The directory where the CIFAR-10 input data is stored.')
 
-tf.flags.DEFINE_string('model_dir', '',
+tf.flags.DEFINE_string('model_dir', 'output2_2',
                        'The directory where the model will be stored.')
 
-tf.flags.DEFINE_boolean('is_cpu_ps', False,
+tf.flags.DEFINE_boolean('is_cpu_ps', True,
                         'If using CPU as the parameter server.')
 
 tf.flags.DEFINE_integer('num_gpus', 1,
@@ -58,12 +59,12 @@ tf.flags.DEFINE_integer('num_gpus', 1,
 tf.flags.DEFINE_integer('num_layers', 44, 'The number of layers of the model.')
 
-tf.flags.DEFINE_integer('train_steps', 10000,
-                        'The number of steps to use for training.')
+tf.flags.DEFINE_integer('train_batch_size', 1024,
+                        'Batch size for training.')
 
-tf.flags.DEFINE_integer('train_batch_size', 128, 'Batch size for training.')
+tf.flags.DEFINE_integer('train_steps', (50000.0 / FLAGS.train_batch_size) * 40,
+                        'The number of steps to use for training.')  # 40 epochs
 
-tf.flags.DEFINE_integer('eval_batch_size', 100, 'Batch size for validation.')
+tf.flags.DEFINE_integer('eval_batch_size', 200, 'Batch size for validation.')
 
 tf.flags.DEFINE_float('momentum', 0.9, 'Momentum for MomentumOptimizer.')
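A quick sanity check of the new train_steps default above (plain Python, not part of the diff; it assumes CIFAR-10's 50,000 training images):

train_batch_size = 1024
train_steps = (50000.0 / train_batch_size) * 40    # 40 epochs
print(train_steps)                                  # 1953.125
print(50000 // train_batch_size * 40)               # 1920, the integer-division equivalent

Since 50,000 is not divisible by 1024, the expression evaluates to a float (1953.125); whether DEFINE_integer accepts a non-integer default depends on the flags implementation, so rounding or integer division would keep the default a true integer.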
@@ -71,10 +72,6 @@ tf.flags.DEFINE_float('weight_decay', 1e-4, 'Weight decay for convolutions.')
 tf.flags.DEFINE_boolean('use_distortion_for_training', True,
                         'If doing image distortion for training.')
 
-tf.flags.DEFINE_boolean('run_experiment', False,
-                        'If True will run an experiment,'
-                        'otherwise will run training and evaluation'
-                        'using the estimator interface')
 
 # Perf flags
 tf.flags.DEFINE_integer('num_intra_threads', 1,
@@ -141,7 +138,6 @@ def _create_device_setter(is_cpu_ps, worker):
   gpus = ['/gpu:%d' % i for i in range(FLAGS.num_gpus)]
   return ParamServerDeviceSetter(worker, gpus)
 
-
 
 def _resnet_model_fn(features, labels, mode):
   """Resnet model body.
@@ -175,24 +171,24 @@ def _resnet_model_fn(features, labels, mode):
(Every line in this hunk appears with identical content on both sides of the diff, so the change here looks like indentation or line-wrapping only. The shared content:)
       worker = '/gpu:%d' % i
       device_setter = _create_device_setter(is_cpu_ps, worker)
       with tf.variable_scope('resnet', reuse=bool(i != 0)):
         with tf.name_scope('tower_%d' % i) as name_scope:
           with tf.device(device_setter):
             _tower_fn(is_training, weight_decay, tower_features[i],
                       tower_labels[i], tower_losses, tower_gradvars,
                       tower_preds, False)
             if i == 0:
               # Only trigger batch_norm moving mean and variance update from the
               # 1st tower. Ideally, we should grab the updates from all towers
               # but these stats accumulate extremely fast so we can ignore the
               # other stats from the other towers without significant detriment.
               update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                              name_scope)
   else:
     with tf.variable_scope('resnet'), tf.device('/cpu:0'):
       with tf.name_scope('tower_cpu') as name_scope:
         _tower_fn(is_training, weight_decay, tower_features[0], tower_labels[0],
                   tower_losses, tower_gradvars, tower_preds, True)
         update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
 
   # Now compute global loss and gradients.
   gradvars = []
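The gradient-averaging loop in the next hunk relies on zip(*tower_gradvars) to transpose a per-tower list of (gradient, variable) pairs into per-variable groups. A toy, framework-free illustration of that transpose and the averaging (the values and variable names below are made up):

tower_gradvars = [
    [(0.2, 'w'), (0.4, 'b')],   # tower 0: (gradient, variable) pairs
    [(0.6, 'w'), (0.0, 'b')],   # tower 1
]
for zipped_gradvars in zip(*tower_gradvars):
    var = zipped_gradvars[0][1]                  # the shared variable
    grads = [gv[0] for gv in zipped_gradvars]    # that variable's gradient from each tower
    avg_grad = sum(grads) / len(grads)
    print(var, avg_grad)                         # w 0.4, then b 0.2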
@@ -204,51 +200,52 @@ def _resnet_model_fn(features, labels, mode):
   with tf.name_scope('gradient_averaging'):
     loss = tf.reduce_mean(tower_losses)
     for zipped_gradvars in zip(*tower_gradvars):
       # Averaging one var's gradients computed from multiple towers
       var = zipped_gradvars[0][1]
       grads = [gv[0] for gv in zipped_gradvars]
       with tf.device(var.device):
         if len(grads) == 1:
           avg_grad = grads[0]
         else:
           avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
         gradvars.append((avg_grad, var))
 
   # Suggested learning rate scheduling from
   # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
   # users could apply other scheduling.
   num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
       'train') // FLAGS.train_batch_size
   boundaries = [
       num_batches_per_epoch * x
       for x in np.array([82, 123, 300], dtype=np.int64)
   ]
   staged_lr = [0.1, 0.01, 0.001, 0.0002]
-  learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
-                                              boundaries, staged_lr)
+  global_step = tf.train.get_global_step()
+  learning_rate = tf.train.piecewise_constant(global_step,
+                                              boundaries, staged_lr)
 
   # Create a nicely-named tensor for logging
   learning_rate = tf.identity(learning_rate, name='learning_rate')
 
   optimizer = tf.train.MomentumOptimizer(
       learning_rate=learning_rate,
       momentum=momentum)
 
   # Create single grouped train op
   train_op = [
       optimizer.apply_gradients(
-          gradvars, global_step=tf.train.get_global_step())
+          gradvars, global_step=global_step)
   ]
   train_op.extend(update_ops)
   train_op = tf.group(*train_op)
 
   predictions = {
       'classes':
           tf.concat([p['classes'] for p in tower_preds], axis=0),
       'probabilities':
           tf.concat([p['probabilities'] for p in tower_preds], axis=0)
   }
   stacked_labels = tf.concat(labels, axis=0)
   metrics = {
       'accuracy': tf.metrics.accuracy(stacked_labels, predictions['classes'])
   }
   return tf.estimator.EstimatorSpec(
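To get a feel for the schedule above: assuming cifar10.Cifar10DataSet.num_examples_per_epoch('train') returns CIFAR-10's 50,000 training examples, the new train_batch_size of 1024 gives num_batches_per_epoch = 50000 // 1024 = 48, so the boundaries fall at steps 48*82 = 3936, 48*123 = 5904 and 48*300 = 14400. A framework-free sketch of what tf.train.piecewise_constant then computes (lr_at is a hypothetical helper, not from the script):

def lr_at(step, boundaries=(3936, 5904, 14400),
          staged_lr=(0.1, 0.01, 0.001, 0.0002)):
    # staged_lr[i] applies while step <= boundaries[i]; the last value applies afterwards
    for boundary, lr in zip(boundaries, staged_lr):
        if step <= boundary:
            return lr
    return staged_lr[-1]

print(lr_at(1000), lr_at(5000), lr_at(20000))   # 0.1 0.01 0.0002

Note that with the new default train_steps of roughly 1953, training under the defaults stops well before the first boundary at step 3936, so only the initial rate of 0.1 would ever be used.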
@@ -363,23 +360,21 @@ def input_fn(subset, num_shards):
   label_shards = [tf.parallel_stack(x) for x in label_shards]
   return feature_shards, label_shards
 
 
-def get_experiment_fn(train_input_fn, eval_input_fn, train_steps, eval_steps):
-  def _experiment_fn(run_config, hparams):
-    del hparams  # unused arg
-    # create estimator
-    classifier = tf.estimator.Estimator(model_fn=_resnet_model_fn,
-                                        config=run_config)
-    return tf.contrib.learn.Experiment(
-        classifier,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        train_steps=train_steps,
-        eval_steps=eval_steps)
+def create_experiment_fn(train_input, test_input, hooks):
+  def _experiment_fn(run_config, hparams):
+    estimator = tf.estimator.Estimator(model_fn=_resnet_model_fn,
+                                       config=run_config,
+                                       model_dir=FLAGS.model_dir)
+    experiment = tf.contrib.learn.Experiment(estimator,
+                                             train_input_fn=train_input,
+                                             eval_input_fn=test_input,
+                                             train_steps=FLAGS.train_steps)
+    # create experiment
+    experiment.extend_train_hooks(hooks)
+    return experiment
   return _experiment_fn
 
 
 def main(unused_argv):
   # The env variable is on deprecation path, default is set to off.
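The hunk above swaps get_experiment_fn for create_experiment_fn. Both return an inner _experiment_fn(run_config, hparams), which is the callable shape learn_runner.run expects; the new version simply closes over the input functions and the hooks so they can be attached with extend_train_hooks. A minimal, framework-free sketch of that closure pattern (the names and the returned dict are illustrative only; the real function returns a tf.contrib.learn.Experiment):

def create_experiment_fn(train_input, test_input, hooks):
    def _experiment_fn(run_config, hparams):
        # the real implementation builds an Estimator and an Experiment here
        return {'config': run_config, 'train_input': train_input,
                'eval_input': test_input, 'hooks': list(hooks)}
    return _experiment_fn

experiment_fn = create_experiment_fn('train_fn', 'eval_fn', hooks=['logging_hook'])
print(experiment_fn(run_config='run_config', hparams=None))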
@@ -411,38 +406,17 @@ def main(unused_argv):
   sess_config.gpu_options.force_gpu_compatible = FLAGS.force_gpu_compatible
   config = config.replace(session_config=sess_config)
 
-  train_input_fn = functools.partial(input_fn, subset='train',
-                                     num_shards=FLAGS.num_gpus)
-  eval_input_fn = functools.partial(input_fn, subset='eval',
-                                    num_shards=FLAGS.num_gpus)
-
-  train_steps = FLAGS.train_steps
-  eval_steps = num_eval_examples // FLAGS.eval_batch_size
-
-  if FLAGS.run_experiment:
-    tf.contrib.learn.learn_runner.run(
-        get_experiment_fn(train_input_fn, eval_input_fn, train_steps, eval_steps),
-        run_config=config)
-  else:
-    classifier = tf.estimator.Estimator(model_fn=_resnet_model_fn,
-                                        config=config)
-
-    tensors_to_log = {'learning_rate': 'learning_rate'}
-    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
-                                              every_n_iter=100)
-
-    print('Starting to train...')
-    classifier.train(input_fn=train_input_fn, steps=train_steps,
-                     hooks=[logging_hook])
-
-    print('Starting to evaluate...')
-    eval_results = classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)
-    print(eval_results)
+  train_input = functools.partial(input_fn, subset='train',
+                                  num_shards=FLAGS.num_gpus)
+  test_input = functools.partial(input_fn, subset='eval', num_shards=FLAGS.num_gpus)
+
+  tensors_to_log = {'learning_rate': 'learning_rate'}
+  logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100)
+  hooks = [logging_hook]
+
+  # run experiment
+  learn_runner.run(create_experiment_fn(train_input, test_input, hooks),
+                   run_config=config)
 
 
 if __name__ == '__main__':
   tf.app.run()