Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
8b829873
Commit
8b829873
authored
Aug 16, 2017
by
Eli Bixby
Browse files
Move to argparse some other modifications
parent
d067ce0a
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
83 additions
and
41 deletions
+83
-41
tutorials/image/cifar10_estimator/README.md
tutorials/image/cifar10_estimator/README.md
+9
-14
tutorials/image/cifar10_estimator/cifar10_main.py
tutorials/image/cifar10_estimator/cifar10_main.py
+56
-15
tutorials/image/cifar10_estimator/cifar10_model.py
tutorials/image/cifar10_estimator/cifar10_model.py
+13
-4
tutorials/image/cifar10_estimator/model_base.py
tutorials/image/cifar10_estimator/model_base.py
+5
-8
No files found.
tutorials/image/cifar10_estimator/README.md
View file @
8b829873
...
...
@@ -53,15 +53,13 @@ train.tfrecords validation.tfrecords eval.tfrecords
# Run the model on CPU only. After training, it runs the evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=True \
--num-gpus=0 \
--train-steps=1000
# Run the model on 2 GPUs using CPU as parameter server. After training, it runs the evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=2 \
--train-steps=1000
...
...
@@ -70,8 +68,8 @@ $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-
# a couple of times to perform evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
--job-dir=/tmp/cifar10 \
--
is-cpu-ps=False
\
--force-gpu-compatible
=True
\
--
avg-on-gpu
\
--force-gpu-compatible \
--num-gpus=2 \
...
...
@@ -104,8 +102,7 @@ gcloud ml-engine jobs submit training cifarmultigpu \
--module-name cifar10_estimator.cifar10_main \
-- \
--data-dir=$MY_BUCKET/cifar-10-batches-py \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=1000
```
...
...
@@ -186,11 +183,10 @@ Once you have a `TF_CONFIG` configured properly on each host you're ready to run
# Make sure the model_dir is the same as defined on the TF_CONFIG.
$
python cifar10_main.py
--data-dir
=
gs://path/cifar-10-batches-py
\
--job-dir
=
gs://path/model_dir/
\
--is-cpu-ps
=
True
\
--force-gpu-compatible
=
True
\
--force-gpu-compatible
\
--num-gpus
=
4
\
--train-steps
=
40000
\
--sync
=
True
\
--sync
\
\
--num-workers
=
2
```
...
...
@@ -329,11 +325,10 @@ INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step =
# Make sure the model_dir is the same as defined on the TF_CONFIG.
$
python cifar10_main.py
--data-dir
=
gs://path/cifar-10-batches-py
\
--job-dir
=
gs://path/model_dir/
\
--is-cpu-ps
=
True
\
--force-gpu-compatible
=
True
\
--force-gpu-compatible
\
--num-gpus
=
4
\
--train-steps
=
40000
\
--sync
=
True
--sync
```
*Output:*
...
...
@@ -480,7 +475,7 @@ $ tensorboard --log-dir="sentiment_analysis_output"
## Warnings
When running
`cifar10_main.py`
with
`--sync
=True
`
argument you may see an error similar to:
When running
`cifar10_main.py`
with
`--sync`
argument you may see an error similar to:
```
python
File
"cifar10_main.py"
,
line
538
,
in
<
module
>
...
...
tutorials/image/cifar10_estimator/cifar10_main.py
View file @
8b829873
...
...
@@ -25,7 +25,6 @@ http://www.cs.toronto.edu/~kriz/cifar.html
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
...
...
@@ -42,8 +41,8 @@ from tensorflow.python.training import basic_session_run_hooks
from
tensorflow.python.training
import
session_run_hook
from
tensorflow.python.training
import
training_util
from
.
import
cifar10
from
.
import
cifar10_model
import
cifar10
import
cifar10_model
tf
.
logging
.
set_verbosity
(
tf
.
logging
.
INFO
)
...
...
@@ -192,9 +191,18 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
with
tf
.
variable_scope
(
'resnet'
,
reuse
=
bool
(
i
!=
0
)):
with
tf
.
name_scope
(
'tower_%d'
%
i
)
as
name_scope
:
with
tf
.
device
(
device_setter
):
_tower_fn
(
is_training
,
weight_decay
,
tower_features
[
i
],
tower_labels
[
i
],
tower_losses
,
tower_gradvars
,
tower_preds
,
False
,
params
[
'num_layers'
])
loss
,
gradvars
,
preds
=
_tower_fn
(
is_training
,
weight_decay
,
tower_features
[
i
],
tower_labels
[
i
],
False
,
params
[
'num_layers'
],
params
[
'batch_norm_decay'
],
params
[
'batch_norm_epsilon'
])
tower_losses
.
append
(
loss
)
tower_gradvars
.
append
(
gradvars
)
tower_preds
.
append
(
preds
)
if
i
==
0
:
# Only trigger batch_norm moving mean and variance update from
# the 1st tower. Ideally, we should grab the updates from all
...
...
@@ -206,8 +214,19 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
else
:
with
tf
.
variable_scope
(
'resnet'
),
tf
.
device
(
'/cpu:0'
):
with
tf
.
name_scope
(
'tower_cpu'
)
as
name_scope
:
_tower_fn
(
is_training
,
weight_decay
,
tower_features
[
0
],
tower_labels
[
0
],
tower_losses
,
tower_gradvars
,
tower_preds
,
True
)
loss
,
gradvars
,
preds
=
_tower_fn
(
is_training
,
weight_decay
,
tower_features
[
0
],
tower_labels
[
0
],
True
,
params
[
'num_layers'
],
params
[
'batch_norm_decay'
],
params
[
'batch_norm_epsilon'
])
tower_losses
.
append
(
loss
)
tower_gradvars
.
append
(
gradvars
)
tower_preds
.
append
(
preds
)
update_ops
=
tf
.
get_collection
(
tf
.
GraphKeys
.
UPDATE_OPS
,
name_scope
)
# Now compute global loss and gradients.
...
...
@@ -281,10 +300,17 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
train_op
=
train_op
,
training_chief_hooks
=
chief_hooks
,
eval_metric_ops
=
metrics
)
return
_resnet_model_fn
def
_tower_fn
(
is_training
,
weight_decay
,
feature
,
label
,
tower_losses
,
tower_gradvars
,
tower_preds
,
is_cpu
,
num_layers
):
def
_tower_fn
(
is_training
,
weight_decay
,
feature
,
label
,
is_cpu
,
num_layers
,
batch_norm_decay
,
batch_norm_epsilon
):
"""Build computation tower for each device (CPU or GPU).
Args:
...
...
@@ -299,13 +325,15 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
"""
data_format
=
'channels_last'
if
is_cpu
else
'channels_first'
model
=
cifar10_model
.
ResNetCifar10
(
num_layers
,
is_training
=
is_training
,
data_format
=
data_format
)
num_layers
,
batch_norm_decay
=
batch_norm_decay
,
batch_norm_epsilon
=
batch_norm_epsilon
,
is_training
=
is_training
,
data_format
=
data_format
)
logits
=
model
.
forward_pass
(
feature
,
input_data_format
=
'channels_last'
)
tower_pred
=
{
'classes'
:
tf
.
argmax
(
input
=
logits
,
axis
=
1
),
'probabilities'
:
tf
.
nn
.
softmax
(
logits
)
}
tower_preds
.
append
(
tower_pred
)
tower_loss
=
tf
.
losses
.
sparse_softmax_cross_entropy
(
logits
=
logits
,
labels
=
label
)
...
...
@@ -314,10 +342,10 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
model_params
=
tf
.
trainable_variables
()
tower_loss
+=
weight_decay
*
tf
.
add_n
(
[
tf
.
nn
.
l2_loss
(
v
)
for
v
in
model_params
])
tower_losses
.
append
(
tower_loss
)
tower_grad
=
tf
.
gradients
(
tower_loss
,
model_params
)
tower_gradvars
.
append
(
zip
(
tower_grad
,
model_params
))
return
tower_loss
,
tower_grad
,
tower_pred
def
input_fn
(
data_dir
,
subset
,
num_shards
,
batch_size
,
...
...
@@ -535,6 +563,7 @@ if __name__ == '__main__':
default
=
2e-4
,
help
=
'Weight decay for convolutions.'
)
parser
.
add_argument
(
'--learning-rate'
,
type
=
float
,
...
...
@@ -595,12 +624,24 @@ if __name__ == '__main__':
default
=
False
,
help
=
'Whether to log device placement.'
)
parser
.
add_argument
(
'--batch_norm_decay'
,
type
=
float
,
default
=
0.997
,
help
=
'Decay for batch norm.'
)
parser
.
add_argument
(
'--batch_norm_epsilon'
,
type
=
float
,
default
=
1e-5
,
help
=
'Epsilon for batch norm.'
)
args
=
parser
.
parse_args
()
if
args
.
num_gpus
<
0
:
raise
ValueError
(
'Invalid GPU count:
\"
num_gpus
\"
must be 0 or a positive integer.'
)
if
args
.
num_gpus
==
0
and
not
args
.
avg_on_gpu
:
if
args
.
num_gpus
==
0
and
args
.
avg_on_gpu
:
raise
ValueError
(
'No GPU available for use, must use CPU to average gradients.'
)
if
(
args
.
num_layers
-
2
)
%
6
!=
0
:
...
...
tutorials/image/cifar10_estimator/cifar10_model.py
View file @
8b829873
...
...
@@ -13,20 +13,29 @@
# limitations under the License.
# ==============================================================================
"""Model class for Cifar10 Dataset."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow
as
tf
from
.
import
model_base
import
model_base
class
ResNetCifar10
(
model_base
.
ResNet
):
"""Cifar10 model with ResNetV1 and basic residual block."""
def
__init__
(
self
,
num_layers
,
is_training
,
data_format
=
'channels_first'
):
super
(
ResNetCifar10
,
self
).
__init__
(
is_training
,
data_format
)
def
__init__
(
self
,
num_layers
,
is_training
,
batch_norm_decay
,
batch_norm_epsilon
,
data_format
=
'channels_first'
):
super
(
ResNetCifar10
,
self
).
__init__
(
is_training
,
data_format
,
batch_norm_decay
,
batch_norm_epsilon
)
self
.
n
=
(
num_layers
-
2
)
//
6
# Add one in case label starts with 1. No impact if label starts with 0.
self
.
num_classes
=
10
+
1
...
...
tutorials/image/cifar10_estimator/model_base.py
View file @
8b829873
...
...
@@ -23,18 +23,13 @@ from __future__ import absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
argparse
import
tensorflow
as
tf
FLAGS
=
None
class
ResNet
(
object
):
"""ResNet model."""
def
__init__
(
self
,
is_training
,
data_format
):
def
__init__
(
self
,
is_training
,
data_format
,
batch_norm_decay
,
batch_norm_epsilon
):
"""ResNet constructor.
Args:
...
...
@@ -42,6 +37,8 @@ class ResNet(object):
data_format: the data_format used during computation.
one of 'channels_first' or 'channels_last'.
"""
self
.
_batch_norm_decay
=
batch_norm_decay
self
.
_batch_norm_epsilon
=
batch_norm_epsilon
self
.
_is_training
=
is_training
assert
data_format
in
(
'channels_first'
,
'channels_last'
)
self
.
_data_format
=
data_format
...
...
@@ -185,10 +182,10 @@ class ResNet(object):
data_format
=
'NHWC'
return
tf
.
contrib
.
layers
.
batch_norm
(
x
,
decay
=
FLAGS
.
batch_norm_decay
,
decay
=
self
.
_
batch_norm_decay
,
center
=
True
,
scale
=
True
,
epsilon
=
FLAGS
.
batch_norm_epsilon
,
epsilon
=
self
.
_
batch_norm_epsilon
,
is_training
=
self
.
_is_training
,
fused
=
True
,
data_format
=
data_format
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment