ModelZoo / ResNet50_tensorflow · Commits · add2845a

Commit add2845a, authored Aug 20, 2017 by Toby Boyd
Parent: a7531875

    Style cleanup
Showing 2 changed files with 105 additions and 119 deletions (+105 / -119):

    tutorials/image/cifar10_estimator/cifar10.py          +2    -2
    tutorials/image/cifar10_estimator/cifar10_main.py     +103  -117
tutorials/image/cifar10_estimator/cifar10.py

@@ -74,8 +74,8 @@ class Cifar10DataSet(object):
     dataset = tf.contrib.data.TFRecordDataset(filenames).repeat()

     # Parse records.
-    dataset = dataset.map(self.parser, num_threads=batch_size,
-                          output_buffer_size=2 * batch_size)
+    dataset = dataset.map(
+        self.parser, num_threads=batch_size, output_buffer_size=2 * batch_size)

     # Potentially shuffle records.
     if self.subset == 'train':
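For context, the contrib.data pipeline this hunk reflows is typically built up and consumed as in the following minimal TF 1.x sketch. The feature spec, parser, and file name below are illustrative stand-ins, not the tutorial's actual parser:

    import tensorflow as tf

    def parse_example(serialized):
      # Illustrative parser: decode one TFRecord into (image, label).
      features = tf.parse_single_example(
          serialized,
          features={'image': tf.FixedLenFeature([], tf.string),
                    'label': tf.FixedLenFeature([], tf.int64)})
      image = tf.decode_raw(features['image'], tf.uint8)
      return image, features['label']

    batch_size = 128
    dataset = tf.contrib.data.TFRecordDataset(['train.tfrecords']).repeat()
    # The call the hunk reformats: parse in parallel, keep a 2x-batch buffer.
    dataset = dataset.map(parse_example, num_threads=batch_size,
                          output_buffer_size=2 * batch_size)
    dataset = dataset.shuffle(buffer_size=4 * batch_size).batch(batch_size)
    images, labels = dataset.make_one_shot_iterator().get_next()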
tutorials/image/cifar10_estimator/cifar10_main.py

@@ -29,25 +29,23 @@ from __future__ import division
 from __future__ import print_function

 import argparse
 import collections
 import functools
 import itertools
 import os

-import six
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
 import cifar10
 import cifar10_model
 import cifar10_utils
+import numpy as np
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf

 tf.logging.set_verbosity(tf.logging.INFO)


-def get_model_fn(num_gpus, variable_strategy, num_workers):
+def get_model_fn(num_gpus, variable_strategy, data_format, num_workers):
   """Returns a function that will build the resnet model."""

   def _resnet_model_fn(features, labels, mode, params):
     """Resnet model body.
@@ -92,21 +90,13 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
             ps_device_type='gpu',
             worker_device=worker_device,
             ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
-                num_gpus,
-                tf.contrib.training.byte_size_load_fn)
-        )
+                num_gpus, tf.contrib.training.byte_size_load_fn))
       with tf.variable_scope('resnet', reuse=bool(i != 0)):
         with tf.name_scope('tower_%d' % i) as name_scope:
           with tf.device(device_setter):
             loss, gradvars, preds = _tower_fn(
-                is_training, weight_decay, tower_features[i], tower_labels[i],
-                (device_type == 'cpu'), params['num_layers'],
-                params['batch_norm_decay'],
+                is_training, weight_decay, tower_features[i], tower_labels[i],
+                data_format, params['num_layers'], params['batch_norm_decay'],
                 params['batch_norm_epsilon'])
             tower_losses.append(loss)
             tower_gradvars.append(gradvars)
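The ps_strategy being reformatted here is what spreads the model's variables across the GPUs when they act as parameter devices. A hedged sketch of the same strategy applied through the stock tf.train.replica_device_setter follows; the cluster addresses and variable shapes are made up, and the tutorial's cifar10_utils.local_device_setter applies the strategy to local GPUs rather than ps tasks:

    import tensorflow as tf

    # GreedyLoadBalancingStrategy assigns each new variable to the parameter
    # task carrying the fewest bytes so far, sized by byte_size_load_fn.
    cluster = tf.train.ClusterSpec({
        'ps': ['ps0:2222', 'ps1:2222'],      # illustrative addresses
        'worker': ['worker0:2222']})
    device_setter = tf.train.replica_device_setter(
        cluster=cluster,
        ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
            num_tasks=2, load_fn=tf.contrib.training.byte_size_load_fn))

    with tf.device(device_setter):
      big = tf.get_variable('big', shape=[1024, 1024])  # lands on one ps task
      small = tf.get_variable('small', shape=[64])      # balances to the other
    print(big.device, small.device)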
@@ -137,7 +127,6 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
           avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
         gradvars.append((avg_grad, var))
-
     # Device that runs the ops to apply global gradient updates.
     consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
     with tf.device(consolidation_device):
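The avg_grad line in this hunk is the entire cross-tower gradient average: sum the per-tower gradients for one variable, then scale by one over the tower count. A tiny sketch with made-up values:

    import tensorflow as tf

    grads = [tf.constant([2.0, 4.0]),   # gradient from tower 0
             tf.constant([4.0, 8.0])]   # gradient from tower 1
    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))

    with tf.Session() as sess:
      print(sess.run(avg_grad))  # [3. 6.]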
@@ -163,8 +152,7 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
     chief_hooks = []
     if params['sync']:
       optimizer = tf.train.SyncReplicasOptimizer(
-          optimizer,
-          replicas_to_aggregate=num_workers)
+          optimizer, replicas_to_aggregate=num_workers)
       sync_replicas_hook = optimizer.make_session_run_hook(True)
       chief_hooks.append(sync_replicas_hook)
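For the --sync path shown here, SyncReplicasOptimizer wraps the base optimizer so each update waits for gradients from replicas_to_aggregate workers. A minimal sketch, assuming a MomentumOptimizer base as in this tutorial and a toy loss (note the hook must be created after minimize/apply_gradients):

    import tensorflow as tf

    num_workers = 4
    w = tf.get_variable('w', shape=[2], initializer=tf.zeros_initializer())
    loss = tf.reduce_sum(tf.square(w - 1.0))

    base_opt = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
    optimizer = tf.train.SyncReplicasOptimizer(
        base_opt, replicas_to_aggregate=num_workers)
    train_op = optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step())
    # The chief runs the extra aggregation bookkeeping; pass is_chief here.
    sync_replicas_hook = optimizer.make_session_run_hook(True)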
@@ -184,7 +172,8 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
       }
       stacked_labels = tf.concat(labels, axis=0)
       metrics = {
-          'accuracy': tf.metrics.accuracy(stacked_labels, predictions['classes'])
+          'accuracy':
+              tf.metrics.accuracy(stacked_labels, predictions['classes'])
       }

     loss = tf.reduce_mean(tower_losses, name='loss')
@@ -195,35 +184,35 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
         train_op=train_op,
         training_chief_hooks=chief_hooks,
         eval_metric_ops=metrics)

   return _resnet_model_fn


-def _tower_fn(is_training, weight_decay, feature, label, is_cpu, num_layers,
-              batch_norm_decay, batch_norm_epsilon):
-  """Build computation tower for each device (CPU or GPU).
+def _tower_fn(is_training, weight_decay, feature, label, data_format,
+              num_layers, batch_norm_decay, batch_norm_epsilon):
+  """Build computation tower (Resnet).

   Args:
     is_training: true if is training graph.
     weight_decay: weight regularization strength, a float.
     feature: a Tensor.
     label: a Tensor.
-    tower_losses: a list to be appended with current tower's loss.
-    tower_gradvars: a list to be appended with current tower's gradients.
-    tower_preds: a list to be appended with current tower's predictions.
-    is_cpu: true if build tower on CPU.
+    data_format: channels_last (NHWC) or channels_first (NCHW).
     num_layers: number of layers, an int.
     batch_norm_decay: decay for batch normalization, a float.
     batch_norm_epsilon: epsilon for batch normalization, a float.

   Returns:
     A tuple with the loss for the tower, the gradients and parameters, and
     predictions.
   """
-  data_format = 'channels_last' if is_cpu else 'channels_first'
   model = cifar10_model.ResNetCifar10(
       num_layers,
       batch_norm_decay=batch_norm_decay,
       batch_norm_epsilon=batch_norm_epsilon,
-      is_training=is_training, data_format=data_format)
+      is_training=is_training,
+      data_format=data_format)
   logits = model.forward_pass(feature, input_data_format='channels_last')
   tower_pred = {
       'classes': tf.argmax(input=logits, axis=1),
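The data_format argument documented above only changes the in-memory layout the tower computes in; input batches still arrive channels_last, and forward_pass transposes them when channels_first is requested. A small sketch with illustrative shapes:

    import tensorflow as tf

    feature = tf.zeros([128, 32, 32, 3])  # NHWC: batch, height, width, channels
    data_format = 'channels_first'
    if data_format == 'channels_first':
      # Reorder to NCHW: batch, channels, height, width.
      feature = tf.transpose(feature, [0, 3, 1, 2])
    print(feature.shape)  # (128, 3, 32, 32)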
@@ -243,13 +232,20 @@ def _tower_fn(is_training,
   return tower_loss, zip(tower_grad, model_params), tower_pred


-def input_fn(data_dir, subset, num_shards, batch_size,
+def input_fn(data_dir,
+             subset,
+             num_shards,
+             batch_size,
              use_distortion_for_training=True):
   """Create input graph for model.

   Args:
     data_dir: Directory where TFRecords representing the dataset are located.
     subset: one of 'train', 'validate' and 'eval'.
     num_shards: num of towers participating in data-parallel training.
     batch_size: total batch size for training to be divided by the number of
         shards.
     use_distortion_for_training: True to use distortions.
   Returns:
     two lists of tensors for features and labels, each of num_shards length.
   """
@@ -279,7 +275,10 @@ def input_fn(data_dir, subset, num_shards, batch_size,
 # create experiment
-def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
+def get_experiment_fn(data_dir,
+                      num_gpus,
+                      variable_strategy,
+                      data_format,
                       use_distortion_for_training=True):
   """Returns an Experiment function.
@@ -292,7 +291,9 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
   Args:
     data_dir: str. Location of the data for input_fns.
     num_gpus: int. Number of GPUs on each worker.
-    is_gpu_ps: bool. If true, average gradients on GPUs.
+    variable_strategy: String. CPU to use CPU as the parameter server
+        and GPU to use the GPUs as the parameter server.
+    data_format: String. channels_first or channels_last.
     use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
   Returns:
     A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
@@ -302,6 +303,7 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
       methods on Experiment (train, evaluate) based on information
       about the current runner in `run_config`.
   """
+
   def _experiment_fn(run_config, hparams):
     """Returns an Experiment."""
     # Create estimator.
@@ -311,28 +313,26 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
         subset='train',
         num_shards=num_gpus,
         batch_size=hparams.train_batch_size,
-        use_distortion_for_training=use_distortion_for_training
-    )
+        use_distortion_for_training=use_distortion_for_training)
     eval_input_fn = functools.partial(
         input_fn,
         data_dir,
         subset='eval',
         batch_size=hparams.eval_batch_size,
-        num_shards=num_gpus
-    )
+        num_shards=num_gpus)

     num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
     if num_eval_examples % hparams.eval_batch_size != 0:
-      raise ValueError('validation set size must be multiple of eval_batch_size')
+      raise ValueError(
+          'validation set size must be multiple of eval_batch_size')

     train_steps = hparams.train_steps
     eval_steps = num_eval_examples // hparams.eval_batch_size

     examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
         hparams.train_batch_size, every_n_steps=10)

-    tensors_to_log = {'learning_rate': 'learning_rate',
-                      'loss': 'loss'}
+    tensors_to_log = {'learning_rate': 'learning_rate', 'loss': 'loss'}

     logging_hook = tf.train.LoggingTensorHook(
         tensors=tensors_to_log, every_n_iter=100)
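The functools.partial calls being reflowed here bind input_fn's arguments up front so the Experiment can later invoke a zero-argument callable. The same pattern with an illustrative toy function:

    import functools

    def make_batch(data_dir, subset, num_shards, batch_size):
      # Stand-in for input_fn; returns a description instead of tensors.
      return '%s/%s: %d shards of %d' % (data_dir, subset, num_shards,
                                         batch_size)

    train_input_fn = functools.partial(
        make_batch, '/tmp/cifar10', subset='train', num_shards=2,
        batch_size=128)
    print(train_input_fn())  # /tmp/cifar10/train: 2 shards of 128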
@@ -340,11 +340,10 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
     hooks = [logging_hook, examples_sec_hook]

     classifier = tf.estimator.Estimator(
-        model_fn=get_model_fn(num_gpus, is_gpu_ps,
-                              run_config.num_worker_replicas or 1),
+        model_fn=get_model_fn(num_gpus, variable_strategy, data_format,
+                              run_config.num_worker_replicas or 1),
         config=run_config,
-        params=vars(hparams)
-    )
+        params=vars(hparams))

     # Create experiment.
     experiment = tf.contrib.learn.Experiment(
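get_model_fn, as re-plumbed in this hunk, is a factory: it bakes configuration into a closure with the (features, labels, mode, params) signature that tf.estimator.Estimator expects. A self-contained toy version of the pattern; the dense model and names are illustrative, not the tutorial's ResNet:

    import tensorflow as tf

    def get_toy_model_fn(learning_rate):
      def _model_fn(features, labels, mode, params):
        # Configuration arrives two ways: closed over (learning_rate)
        # and via the Estimator's params dict (num_classes).
        logits = tf.layers.dense(features, params['num_classes'])
        loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
      return _model_fn

    classifier = tf.estimator.Estimator(
        model_fn=get_toy_model_fn(0.1), params={'num_classes': 10})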
@@ -356,43 +355,40 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
     # Adding hooks to be used by the estimator on training modes
     experiment.extend_train_hooks(hooks)

     return experiment

   return _experiment_fn


-def main(job_dir, data_dir, num_gpus, variable_strategy,
-         use_distortion_for_training, log_device_placement, num_intra_threads,
+def main(job_dir, data_dir, num_gpus, variable_strategy, data_format,
+         use_distortion_for_training, log_device_placement, num_intra_threads,
          **hparams):
   # The env variable is on deprecation path, default is set to off.
   os.environ['TF_SYNC_ON_FINISH'] = '0'
   os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

+  # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
+  # on CPU. The exception is Intel MKL on CPU which is optimal with
+  # channels_last.
+  if not data_format:
+    if num_gpus == 0:
+      data_format = 'channels_last'
+    else:
+      data_format = 'channels_first'
+
   # Session configuration.
   sess_config = tf.ConfigProto(
       allow_soft_placement=True,
       log_device_placement=log_device_placement,
       intra_op_parallelism_threads=num_intra_threads,
-      gpu_options=tf.GPUOptions(force_gpu_compatible=True)
-  )
+      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

-  config = cifar10_utils.RunConfig(session_config=sess_config,
-                                   model_dir=job_dir)
+  config = cifar10_utils.RunConfig(
+      session_config=sess_config, model_dir=job_dir)
   tf.contrib.learn.learn_runner.run(
-      get_experiment_fn(data_dir, num_gpus, variable_strategy,
-                        use_distortion_for_training),
+      get_experiment_fn(data_dir, num_gpus, variable_strategy, data_format,
+                        use_distortion_for_training),
       run_config=config,
-      hparams=tf.contrib.training.HParams(**hparams)
-  )
+      hparams=tf.contrib.training.HParams(**hparams))


 if __name__ == '__main__':
@@ -401,63 +397,53 @@ if __name__ == '__main__':
       '--data-dir',
       type=str,
       required=True,
-      help='The directory where the CIFAR-10 input data is stored.'
-  )
+      help='The directory where the CIFAR-10 input data is stored.')
   parser.add_argument(
       '--job-dir',
       type=str,
       required=True,
-      help='The directory where the model will be stored.'
-  )
+      help='The directory where the model will be stored.')
   parser.add_argument(
       '--variable-strategy',
       choices=['CPU', 'GPU'],
       type=str,
       default='CPU',
-      help='Where to locate variable operations'
-  )
+      help='Where to locate variable operations')
   parser.add_argument(
       '--num-gpus',
       type=int,
       default=1,
-      help='The number of gpus used. Uses only CPU if set to 0.'
-  )
+      help='The number of gpus used. Uses only CPU if set to 0.')
   parser.add_argument(
       '--num-layers',
       type=int,
       default=44,
-      help='The number of layers of the model.'
-  )
+      help='The number of layers of the model.')
   parser.add_argument(
       '--train-steps',
       type=int,
       default=80000,
-      help='The number of steps to use for training.'
-  )
+      help='The number of steps to use for training.')
   parser.add_argument(
       '--train-batch-size',
       type=int,
       default=128,
-      help='Batch size for training.'
-  )
+      help='Batch size for training.')
   parser.add_argument(
       '--eval-batch-size',
       type=int,
       default=100,
-      help='Batch size for validation.'
-  )
+      help='Batch size for validation.')
   parser.add_argument(
       '--momentum',
       type=float,
       default=0.9,
-      help='Momentum for MomentumOptimizer.'
-  )
+      help='Momentum for MomentumOptimizer.')
   parser.add_argument(
       '--weight-decay',
       type=float,
       default=2e-4,
-      help='Weight decay for convolutions.'
-  )
+      help='Weight decay for convolutions.')
   parser.add_argument(
       '--learning-rate',
       type=float,
@@ -466,22 +452,19 @@ if __name__ == '__main__':
       This is the inital learning rate value. The learning rate will decrease
       during training. For more details check the model_fn implementation in
       this file.\
-      """
-  )
+      """)
   parser.add_argument(
       '--use-distortion-for-training',
       type=bool,
       default=True,
-      help='If doing image distortion for training.'
-  )
+      help='If doing image distortion for training.')
   parser.add_argument(
       '--sync',
       action='store_true',
       default=False,
       help="""\
       If present when running in a distributed environment will run on sync mode.\
-      """
-  )
+      """)
   parser.add_argument(
       '--num-intra-threads',
       type=int,
@@ -492,8 +475,7 @@ if __name__ == '__main__':
       example CPU only handles the input pipeline and gradient aggregation
       (when --is-cpu-ps). Ops that could potentially benefit from intra-op
       parallelism are scheduled to run on GPUs.\
-      """
-  )
+      """)
   parser.add_argument(
       '--num-inter-threads',
       type=int,
@@ -501,26 +483,30 @@ if __name__ == '__main__':
       help="""\
       Number of threads to use for inter-op parallelism. If set to 0, the
       system will pick an appropriate number.\
-      """
-  )
+      """)
+  parser.add_argument(
+      '--data-format',
+      type=str,
+      default=None,
+      help="""\
+      If not set, the data format best for the training device is used.
+      Allowed values: channels_first (NCHW) channels_last (NHWC).\
+      """)
   parser.add_argument(
       '--log-device-placement',
       action='store_true',
       default=False,
-      help='Whether to log device placement.'
-  )
+      help='Whether to log device placement.')
   parser.add_argument(
       '--batch-norm-decay',
       type=float,
       default=0.997,
-      help='Decay for batch norm.'
-  )
+      help='Decay for batch norm.')
   parser.add_argument(
       '--batch-norm-epsilon',
       type=float,
       default=1e-5,
-      help='Epsilon for batch norm.'
-  )
+      help='Epsilon for batch norm.')
   args = parser.parse_args()

   if args.num_gpus < 0:
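These parser.add_argument blocks feed the main(...) entry point further up, presumably via the usual main(**vars(args)) dispatch; argparse turns the dashed flag names into underscored keyword arguments. A hedged sketch of that idiom (the script's actual tail is collapsed out of this diff, and the validation message below is illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--num-gpus', type=int, default=1,
        help='The number of gpus used. Uses only CPU if set to 0.')
    parser.add_argument('--data-format', type=str, default=None)
    args = parser.parse_args(['--num-gpus', '0'])

    if args.num_gpus < 0:
      raise ValueError('Invalid GPU count')  # illustrative check
    # Dashes become underscores: {'num_gpus': 0, 'data_format': None}
    print(vars(args))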