ModelZoo / ResNet50_tensorflow · Commits

Commit c923a420, authored Dec 21, 2018 by Shining Sun

    bug fixes

Parent: 80dcd27c

Showing 7 changed files with 87 additions and 107 deletions:
- official/resnet/keras/keras_cifar_main.py (+24, -23)
- official/resnet/keras/keras_common.py (+13, -31)
- official/resnet/keras/keras_imagenet_main.py (+25, -31)
- official/resnet/keras/resnet_cifar_model.py (+2, -2)
- official/resnet/keras/resnet_model.py (+2, -1)
- official/resnet/resnet_run_loop.py (+2, -5)
- official/utils/misc/distribution_utils.py (+19, -14)
official/resnet/keras/keras_cifar_main.py

```diff
@@ -25,7 +25,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.resnet import cifar10_main as cifar_main
 from official.resnet import resnet_run_loop
 from official.resnet.keras import keras_common
-from official.resnet.keras import resnet56
+from official.resnet.keras import resnet_cifar_model
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
```
```diff
@@ -39,8 +39,8 @@ LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
 def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch,
                            batch_size):
   """Handles linear scaling rule, gradual warmup, and LR decay.
 
-  The learning rate starts at base learning_rate, then after 91, 136 and
-  182 epochs, the learning rate is divided by 10.
+  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
+  provided scaling factor.
 
   Args:
     current_epoch: integer, current epoch indexed from 0.
```
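For reference, the "linear scaling rule" this docstring names scales the base learning rate linearly with the global batch size (the same arithmetic appears in keras_common.get_optimizer below, which defines the rate for a reference batch size of 256). A one-line sketch with hypothetical numbers:

```python
# Linear scaling rule (hypothetical values): the base rate is defined for a
# reference batch size of 256 and scaled proportionally for other sizes.
BASE_LEARNING_RATE = 0.1
batch_size = 1024
initial_lr = BASE_LEARNING_RATE * batch_size / 256
print(initial_lr)  # -> 0.4
```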
```diff
@@ -65,7 +65,7 @@ def parse_record_keras(raw_record, is_training, dtype):
   The input record is parsed into a label and image, and the image is passed
   through preprocessing steps (cropping, flipping, and so on).
-  This method converts the label to onhot to fit the loss function.
+  This method converts the label to one hot to fit the loss function.
 
   Args:
     raw_record: scalar Tensor tf.string containing a serialized
```
```diff
@@ -127,9 +127,9 @@ def run(flags_obj):
   optimizer = keras_common.get_optimizer()
   strategy = distribution_utils.get_distribution_strategy(
-      flags_obj.num_gpus, flags_obj.use_one_device_strategy)
+      flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)
 
-  model = resnet56.ResNet56(input_shape=(32, 32, 3),
+  model = resnet_cifar_model.resnet56(input_shape=(32, 32, 3),
                             classes=cifar_main.NUM_CLASSES)
 
   model.compile(loss='categorical_crossentropy',
```
```diff
@@ -150,6 +150,11 @@ def run(flags_obj):
   num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
                     flags_obj.batch_size)
 
+  validation_data = eval_input_dataset
+  if flags_obj.skip_eval:
+    num_eval_steps = None
+    validation_data = None
+
   history = model.fit(train_input_dataset,
                       epochs=train_epochs,
                       steps_per_epoch=train_steps,
```
```diff
@@ -159,7 +164,7 @@ def run(flags_obj):
                                  tensorboard_callback],
                       validation_steps=num_eval_steps,
-                      validation_data=eval_input_dataset,
+                      validation_data=validation_data,
                       verbose=1)
 
   if not flags_obj.skip_eval:
```
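The fix above matters because Keras runs no validation pass at all when validation_data is None, whereas the old code always handed fit() the eval dataset even under --skip_eval. A standalone sketch of the guard, with hypothetical stand-ins for the flag and dataset (NUM_VALIDATION_IMAGES mirrors cifar_main.NUM_IMAGES['validation']):

```python
# Sketch of the skip_eval wiring; values are hypothetical stand-ins.
NUM_VALIDATION_IMAGES = 10000   # CIFAR-10 validation set size
batch_size = 128
skip_eval = True                # hypothetical --skip_eval value
eval_input_dataset = 'dataset'  # stands in for the real tf.data.Dataset

num_eval_steps = NUM_VALIDATION_IMAGES // batch_size
validation_data = eval_input_dataset
if skip_eval:
  # With validation_data=None, model.fit() skips validation entirely.
  num_eval_steps = None
  validation_data = None
print(num_eval_steps, validation_data)  # -> None None
```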
```diff
@@ -167,10 +172,6 @@ def run(flags_obj):
                                steps=num_eval_steps,
                                verbose=1)
   stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
   return stats
 
 
 def main(_):
   with logger.benchmark_context(flags.FLAGS):
```
official/resnet/keras/keras_common.py

```diff
@@ -42,28 +42,26 @@ class TimeHistory(tf.keras.callbacks.Callback):
     """
     self._batch_size = batch_size
     super(TimeHistory, self).__init__()
-    self.log_batch_size = 100
+    self.log_steps = 100
 
   def on_train_begin(self, logs=None):
-    self.batch_times_secs = []
     self.record_batch = True
 
   def on_batch_begin(self, batch, logs=None):
     if self.record_batch:
-      self.batch_time_start = time.time()
+      self.start_time = time.time()
       self.record_batch = False
 
   def on_batch_end(self, batch, logs=None):
-    if batch % self.log_batch_size == 0:
-      last_n_batches = time.time() - self.batch_time_start
-      examples_per_second = (self._batch_size * self.log_batch_size) / last_n_batches
-      self.batch_times_secs.append(last_n_batches)
+    if batch % self.log_steps == 0:
+      elapsed_time = time.time() - self.start_time
+      examples_per_second = (self._batch_size * self.log_steps) / elapsed_time
       self.record_batch = True
       # TODO(anjalisridhar): add timestamp as well.
       if batch != 0:
         tf.logging.info(
             "BenchmarkMetric: {'num_batches':%d, 'time_taken': %f,"
-            "'images_per_second': %f}" % (batch, last_n_batches, examples_per_second))
+            "'images_per_second': %f}" % (batch, elapsed_time, examples_per_second))
 
 
 class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
   """Callback to update learning rate on every batch (not epoch boundaries).
```
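The reworked callback measures throughput over fixed windows of log_steps batches: it stamps the wall clock at the start of a window and divides the examples processed by the elapsed time at the window's end. A self-contained sketch of the same bookkeeping, with tf.logging.info swapped for print and a sleep standing in for a training step so it runs without TensorFlow:

```python
import time

class TimeHistorySketch:
  """Standalone sketch of the TimeHistory bookkeeping above (no TF needed)."""

  def __init__(self, batch_size, log_steps=100):
    self._batch_size = batch_size
    self.log_steps = log_steps
    self.record_batch = True

  def on_batch_begin(self, batch):
    if self.record_batch:
      self.start_time = time.time()  # start of a log_steps-sized window
      self.record_batch = False

  def on_batch_end(self, batch):
    if batch % self.log_steps == 0:
      elapsed_time = time.time() - self.start_time
      examples_per_second = (self._batch_size * self.log_steps) / elapsed_time
      self.record_batch = True       # re-arm the timer for the next window
      if batch != 0:
        print("BenchmarkMetric: {'num_batches':%d, 'time_taken': %f,"
              "'images_per_second': %f}"
              % (batch, elapsed_time, examples_per_second))

cb = TimeHistorySketch(batch_size=128)
for step in range(201):
  cb.on_batch_begin(step)
  time.sleep(0.001)  # simulate a training step
  cb.on_batch_end(step)
```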
```diff
@@ -95,20 +93,14 @@ class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
       raise ValueError('The output of the "schedule" function should be float.')
     if lr != self.prev_lr:
       self.model.optimizer.learning_rate = lr  # lr should be a float here
       # tf.keras.backend.set_value(self.model.optimizer.learning_rate, lr)
       self.prev_lr = lr
       tf.logging.debug('Epoch %05d Batch %05d: LearningRateBatchScheduler change '
                        'learning rate to %s.', self.epochs, batch, lr)
 
 
 def get_optimizer():
   if FLAGS.use_tf_momentum_optimizer:
     learning_rate = BASE_LEARNING_RATE * FLAGS.batch_size / 256
     optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                            momentum=0.9)
-  else:
-    # optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
-    optimizer = gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)
-  return optimizer
+    return optimizer
+  # The learning rate set here is a placeholder and not used. It will be
+  # overwritten at the beginning of each batch by callback.
+  return gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)
 
 
 def get_callbacks(learning_rate_schedule_fn, num_images):
```
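After this change, get_optimizer hands back an SGD whose learning rate is only a placeholder; LearningRateBatchScheduler is responsible for overwriting it on every batch. A hedged sketch of that division of labor (the class name and schedule below are illustrative, not the file's exact code; the assignment line is the one the file itself uses):

```python
import tensorflow as tf

class PerBatchLR(tf.keras.callbacks.Callback):
  """Illustrative per-batch LR override, in the style of
  LearningRateBatchScheduler (not the file's exact code)."""

  def __init__(self, schedule):
    super(PerBatchLR, self).__init__()
    self.schedule = schedule  # maps (epoch, batch) -> float learning rate
    self.epochs = -1
    self.prev_lr = -1.0

  def on_epoch_begin(self, epoch, logs=None):
    self.epochs += 1

  def on_batch_begin(self, batch, logs=None):
    lr = self.schedule(self.epochs, batch)
    if lr != self.prev_lr:
      # Overwrites the placeholder rate that get_optimizer() set.
      self.model.optimizer.learning_rate = lr  # lr should be a float here
      self.prev_lr = lr

# Hypothetical usage: model.fit(..., callbacks=[PerBatchLR(lambda e, b: 0.1)])
```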
```diff
@@ -124,25 +116,15 @@ def get_callbacks(learning_rate_schedule_fn, num_images):
   return time_callback, tensorboard_callback, lr_callback
 
 
 def analyze_fit_and_eval_result(history, eval_output):
   stats = {}
   stats['accuracy_top_1'] = eval_output[1]
   stats['eval_loss'] = eval_output[0]
   stats['training_loss'] = history.history['loss'][-1]
   stats['training_accuracy_top_1'] = history.history['categorical_accuracy'][-1]
 
   print('Test loss:{}'.format(stats['eval_loss']))
   print('top_1 accuracy:{}'.format(stats['accuracy_top_1']))
   print('top_1_training_accuracy:{}'.format(stats['training_accuracy_top_1']))
   return stats
 
 
 def define_keras_flags():
   flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
   flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
   flags.DEFINE_integer(
       name="train_steps", default=None,
-      help="The number of steps to run for training")
+      help="The number of steps to run for training. If it is larger than "
+           "# batches per epoch, then use # batches per epoch. When this flag "
+           "is set, only one epoch is going to run for training.")
 
 
 def get_synth_input_fn(height, width, num_channels, num_classes,
```
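analyze_fit_and_eval_result only indexes into the History object returned by model.fit() and the [loss, metric] list returned by model.evaluate(), so its behavior can be checked without a real model. A sketch with stand-in values (the numbers are hypothetical):

```python
class FakeHistory:
  """Stand-in for the tf.keras History object returned by model.fit()."""
  history = {'loss': [2.3, 1.1, 0.7],
             'categorical_accuracy': [0.35, 0.62, 0.78]}

# model.evaluate() returns [loss, metric] when compiled with one metric.
eval_output = [0.81, 0.74]  # hypothetical [eval_loss, top-1 accuracy]

stats = {}
stats['accuracy_top_1'] = eval_output[1]
stats['eval_loss'] = eval_output[0]
stats['training_loss'] = FakeHistory.history['loss'][-1]
stats['training_accuracy_top_1'] = FakeHistory.history['categorical_accuracy'][-1]
print(stats)
```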
```diff
@@ -152,7 +134,7 @@ def get_synth_input_fn(height, width, num_channels, num_classes,
   This input_fn returns a data set that iterates over a set of random data and
   bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
   copy is still included. This used to find the upper throughput bound when
-  tunning the full input pipeline.
+  tuning the full input pipeline.
 
   Args:
     height: Integer height that will be used to create a fake image tensor.
```
official/resnet/keras/keras_imagenet_main.py

```diff
@@ -26,13 +26,11 @@ from official.resnet import imagenet_main
 from official.resnet import imagenet_preprocessing
 from official.resnet import resnet_run_loop
 from official.resnet.keras import keras_common
-from official.resnet.keras import resnet50
+from official.resnet.keras import resnet_model
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
 
-# import os
-# os.environ['TF2_BEHAVIOR'] = 'enabled'
 
 LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
     (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
```
```diff
@@ -42,12 +40,8 @@ LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
 def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch,
                            batch_size):
   """Handles linear scaling rule, gradual warmup, and LR decay.
 
-  The learning rate starts at 0, then it increases linearly per step.
-  After 5 epochs we reach the base learning rate (scaled to account
-  for batch size).
-  After 30, 60 and 80 epochs the learning rate is divided by 10.
-  After 90 epochs training stops and the LR is set to 0. This ensures
-  that we train for exactly 90 epochs for reproducibility.
+  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
+  provided scaling factor.
 
   Args:
     current_epoch: integer, current epoch indexed from 0.
```
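As a rough illustration of what a schedule driven by (multiplier, epoch-to-start) tuples like LR_SCHEDULE computes: warm up linearly to the scaled base rate over the first interval, then apply each multiplier from its start epoch onward. This is a hedged sketch consistent with the old docstring, not a copy of the function body, and BASE_LEARNING_RATE here is a hypothetical stand-in:

```python
# Hypothetical sketch of warmup + step decay driven by LR_SCHEDULE tuples.
BASE_LEARNING_RATE = 0.1  # assumed reference rate for batch size 256
LR_SCHEDULE = [(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)]

def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch,
                           batch_size):
  """Linear scaling rule with gradual warmup, then step decay."""
  initial_lr = BASE_LEARNING_RATE * batch_size / 256  # linear scaling rule
  epoch = current_epoch + float(current_batch) / batches_per_epoch
  warmup_end_epoch = LR_SCHEDULE[0][1]
  if epoch < warmup_end_epoch:
    # Ramp linearly from 0 to the scaled base rate over the warmup epochs.
    return initial_lr * epoch / warmup_end_epoch
  lr = initial_lr
  for multiplier, start_epoch in LR_SCHEDULE:
    if epoch >= start_epoch:
      lr = initial_lr * multiplier
  return lr

print(learning_rate_schedule(current_epoch=31, current_batch=0,
                             batches_per_epoch=100, batch_size=256))  # ~0.01
```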
```diff
@@ -81,7 +75,7 @@ def parse_record_keras(raw_record, is_training, dtype):
   return image, label
 
 
-def run_imagenet_with_keras(flags_obj):
+def run(flags_obj):
   """Run ResNet ImageNet training and eval loop using native Keras APIs.
 
   Args:
```
```diff
@@ -128,9 +122,9 @@ def run_imagenet_with_keras(flags_obj):
   optimizer = keras_common.get_optimizer()
   strategy = distribution_utils.get_distribution_strategy(
-      flags_obj.num_gpus, flags_obj.use_one_device_strategy)
+      flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)
 
-  model = resnet50.ResNet50(num_classes=imagenet_main.NUM_CLASSES)
+  model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES)
 
   model.compile(loss='sparse_categorical_crossentropy',
                 optimizer=optimizer,
```
```diff
@@ -140,10 +134,6 @@ def run_imagenet_with_keras(flags_obj):
   time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
       learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])
 
-  steps_per_epoch = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
-  num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
-                    flags_obj.batch_size)
-
   train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
   train_epochs = flags_obj.train_epochs
```
```diff
@@ -151,6 +141,14 @@ def run_imagenet_with_keras(flags_obj):
     train_steps = min(flags_obj.train_steps, train_steps)
     train_epochs = 1
 
+  num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
+                    flags_obj.batch_size)
+
+  validation_data = eval_input_dataset
+  if flags_obj.skip_eval:
+    num_eval_steps = None
+    validation_data = None
+
   history = model.fit(train_input_dataset,
                       epochs=train_epochs,
                       steps_per_epoch=train_steps,
```
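The clamp in the context above is what the new train_steps help text (in keras_common.define_keras_flags) describes: a set flag is capped at one epoch's worth of batches, and training then runs for a single epoch. A sketch with hypothetical flag values (the image count is imagenet_main's known training-set size):

```python
# Sketch of the train_steps clamp, with hypothetical flag values.
NUM_TRAIN_IMAGES = 1281167   # imagenet_main.NUM_IMAGES['train']
batch_size = 256
flag_train_steps = 100       # hypothetical --train_steps value

train_steps = NUM_TRAIN_IMAGES // batch_size  # batches in one full epoch
train_epochs = 90                             # hypothetical --train_epochs
if flag_train_steps:
  # Cap at one epoch's worth of batches and run a single epoch only.
  train_steps = min(flag_train_steps, train_steps)
  train_epochs = 1
print(train_steps, train_epochs)  # -> 100 1
```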
```diff
@@ -160,7 +158,7 @@ def run_imagenet_with_keras(flags_obj):
                                  tensorboard_callback],
                       validation_steps=num_eval_steps,
-                      validation_data=eval_input_dataset,
+                      validation_data=validation_data,
                       verbose=1)
 
   if not flags_obj.skip_eval:
```
```diff
@@ -168,14 +166,10 @@ def run_imagenet_with_keras(flags_obj):
                                steps=num_eval_steps,
                                verbose=1)
   stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
   return stats
 
 
 def main(_):
   with logger.benchmark_context(flags.FLAGS):
-    run_imagenet_with_keras(flags.FLAGS)
+    run(flags.FLAGS)
 
 
 if __name__ == '__main__':
```
official/resnet/keras/resnet56.py → official/resnet/keras/resnet_cifar_model.py

```diff
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""ResNet50 model for Keras adapted from tf.keras.applications.ResNet50.
+"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
 
 # Reference:
 - [Deep Residual Learning for Image Recognition](
@@ -200,7 +200,7 @@ def conv_building_block(input_tensor,
   return x
 
 
-def ResNet56(input_shape=None, classes=1000):
+def resnet56(input_shape=None, classes=1000):
   """Instantiates the ResNet56 architecture.
 
   Arguments:
```
official/resnet/keras/resnet50.py → official/resnet/keras/resnet_model.py

```diff
@@ -15,6 +15,7 @@
 """ResNet50 model for Keras.
 
 Adapted from tf.keras.applications.resnet50.ResNet50().
+This is ResNet model version 1.5.
 
 Related papers/blogs:
 - https://arxiv.org/abs/1512.03385
@@ -179,7 +180,7 @@ def conv_block(input_tensor,
   return x
 
 
-def ResNet50(num_classes):
+def resnet50(num_classes):
   """Instantiates the ResNet50 architecture.
 
   Args:
```
official/resnet/resnet_run_loop.py

```diff
@@ -629,12 +629,9 @@ def define_resnet_flags(resnet_size_choices=None):
         'inference. Note, this flag only applies to ImageNet and cannot '
         'be used for CIFAR.'))
   flags.DEFINE_boolean(
-      name='use_one_device_strategy', default=True,
-      help=flags_core.help_wrap('Set to False to not use distribution '
+      name='turn_off_distribution_strategy', default=False,
+      help=flags_core.help_wrap('Set to True to not use distribution '
                                 'strategies.'))
   flags.DEFINE_boolean(name='use_tf_momentum_optimizer', default=False,
                        help='Use tf MomentumOptimizer.')
 
   choice_kwargs = dict(
       name='resnet_size', short_name='rs', default='50',
       help=flags_core.help_wrap('The size of the ResNet model to use.'))
```
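The rename also flips the flag's polarity: the old use_one_device_strategy defaulted to True, while turn_off_distribution_strategy defaults to False, so the out-of-the-box behavior (use a strategy) is unchanged. A minimal absl.flags sketch of the new definition, with help_wrap replaced by a plain string so it runs outside the repo:

```python
from absl import flags

FLAGS = flags.FLAGS

# Same shape as the definition above; help text inlined for the sketch.
flags.DEFINE_boolean(
    name='turn_off_distribution_strategy', default=False,
    help='Set to True to not use distribution strategies.')

FLAGS(['prog'])  # parse an empty command line, keeping defaults
print(FLAGS.turn_off_distribution_strategy)  # -> False
```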
official/utils/misc/distribution_utils.py

```diff
@@ -22,7 +22,7 @@ import tensorflow as tf
 def get_distribution_strategy(
-    num_gpus, all_reduce_alg=None, use_one_device_strategy=True):
+    num_gpus, all_reduce_alg=None, turn_off_distribution_strategy=False):
   """Return a DistributionStrategy for running the model.
 
   Args:
@@ -31,25 +31,30 @@ def get_distribution_strategy(
       See tf.contrib.distribute.AllReduceCrossDeviceOps for available
       algorithms. If None, DistributionStrategy will choose based on device
       topology.
-    use_one_device_strategy: Should only be set to True when num_gpus is 1.
-      If True, then use OneDeviceStrategy; otherwise, do not use any
-      distribution strategy.
+    turn_off_distribution_strategy: When set to True, do not use any
+      distribution strategy. Note that when it is True, and num_gpus is
+      larger than 1, it will raise a ValueError.
 
   Returns:
     tf.contrib.distribute.DistibutionStrategy object.
+
+  Raises:
+    ValueError: if turn_off_distribution_strategy is True and num_gpus is
+      larger than 1.
   """
-  if num_gpus == 0 and use_one_device_strategy:
-    return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
-  elif num_gpus == 0:
-    return None
-  elif num_gpus == 1 and use_one_device_strategy:
-    return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
-  elif num_gpus == 1:
-    return None
-  elif use_one_device_strategy:
-    raise ValueError("When %d GPUs are specified, use_one_device_strategy"
+  if num_gpus == 0:
+    if turn_off_distribution_strategy:
+      return None
+    else:
+      return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
+  elif num_gpus == 1:
+    if turn_off_distribution_strategy:
+      return None
+    else:
+      return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
+  elif turn_off_distribution_strategy:
+    raise ValueError("When %d GPUs are specified, turn_off_distribution_strategy"
                      " flag cannot be set to True.".format(num_gpus))
-  else:  # num_gpus > 1 and not use_one_device_strategy
+  else:  # num_gpus > 1 and not turn_off_distribution_strategy
     if all_reduce_alg:
       return tf.contrib.distribute.MirroredStrategy(
           num_gpus=num_gpus,
```
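The rewritten function is easiest to read as a decision table over (num_gpus, turn_off_distribution_strategy). A TF-free sketch that mirrors the branching and asserts the documented behavior; the returned strings are stand-ins for the real strategy objects:

```python
def pick_strategy(num_gpus, turn_off_distribution_strategy=False):
  """Pure-Python mirror of get_distribution_strategy's branching."""
  if num_gpus == 0:
    if turn_off_distribution_strategy:
      return None
    return 'OneDeviceStrategy("device:CPU:0")'
  elif num_gpus == 1:
    if turn_off_distribution_strategy:
      return None
    return 'OneDeviceStrategy("device:GPU:0")'
  elif turn_off_distribution_strategy:
    raise ValueError(
        "When %d GPUs are specified, turn_off_distribution_strategy "
        "flag cannot be set to True." % num_gpus)
  return 'MirroredStrategy(num_gpus=%d)' % num_gpus

assert pick_strategy(0) == 'OneDeviceStrategy("device:CPU:0")'
assert pick_strategy(1, turn_off_distribution_strategy=True) is None
assert pick_strategy(2) == 'MirroredStrategy(num_gpus=2)'
```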