Commit c923a420 authored by Shining Sun

bug fixes

parent 80dcd27c
@@ -25,7 +25,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.resnet import cifar10_main as cifar_main
 from official.resnet import resnet_run_loop
 from official.resnet.keras import keras_common
-from official.resnet.keras import resnet56
+from official.resnet.keras import resnet_cifar_model
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
@@ -39,8 +39,8 @@ LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
 def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch, batch_size):
   """Handles linear scaling rule, gradual warmup, and LR decay.

-  The learning rate starts at base learning_rate, then after 91, 136 and
-  182 epochs, the learning rate is divided by 10.
+  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
+  provided scaling factor.

   Args:
     current_epoch: integer, current epoch indexed from 0.
@@ -65,7 +65,7 @@ def parse_record_keras(raw_record, is_training, dtype):
   The input record is parsed into a label and image, and the image is passed
   through preprocessing steps (cropping, flipping, and so on).
-  This method converts the label to onhot to fit the loss function.
+  This method converts the label to one-hot to fit the loss function.

   Args:
     raw_record: scalar Tensor tf.string containing a serialized
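To illustrate the one-hot step in isolation, a minimal sketch (NUM_CLASSES is an assumption here, 10 as for CIFAR-10; not this file's exact code):

    import tensorflow as tf

    NUM_CLASSES = 10  # assumption for this sketch (CIFAR-10)

    # A scalar integer class label...
    label = tf.constant(3)
    # ...becomes a length-NUM_CLASSES one-hot vector, matching the shape
    # that the categorical_crossentropy loss expects.
    label = tf.one_hot(tf.cast(label, tf.int32), NUM_CLASSES)
    # -> [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]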
@@ -127,9 +127,9 @@ def run(flags_obj):
   optimizer = keras_common.get_optimizer()

   strategy = distribution_utils.get_distribution_strategy(
-      flags_obj.num_gpus, flags_obj.use_one_device_strategy)
+      flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)

-  model = resnet56.ResNet56(input_shape=(32, 32, 3),
-                            classes=cifar_main.NUM_CLASSES)
+  model = resnet_cifar_model.resnet56(input_shape=(32, 32, 3),
+                                      classes=cifar_main.NUM_CLASSES)

   model.compile(loss='categorical_crossentropy',
@@ -150,26 +150,27 @@ def run(flags_obj):
   num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
                     flags_obj.batch_size)

+  validation_data = eval_input_dataset
+  if flags_obj.skip_eval:
+    num_eval_steps = None
+    validation_data = None
+
   history = model.fit(train_input_dataset,
                       epochs=train_epochs,
                       steps_per_epoch=train_steps,
                       callbacks=[
                           time_callback,
                           lr_callback,
                           tensorboard_callback
                       ],
                       validation_steps=num_eval_steps,
-                      validation_data=eval_input_dataset,
+                      validation_data=validation_data,
                       verbose=1)

   if not flags_obj.skip_eval:
     eval_output = model.evaluate(eval_input_dataset,
                                  steps=num_eval_steps,
                                  verbose=1)
+    stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
+    return stats
 def main(_):
...
@@ -42,28 +42,26 @@ class TimeHistory(tf.keras.callbacks.Callback):
     """
     self._batch_size = batch_size
     super(TimeHistory, self).__init__()
-    self.log_batch_size = 100
+    self.log_steps = 100

   def on_train_begin(self, logs=None):
-    self.batch_times_secs = []
     self.record_batch = True

   def on_batch_begin(self, batch, logs=None):
     if self.record_batch:
-      self.batch_time_start = time.time()
+      self.start_time = time.time()
       self.record_batch = False

   def on_batch_end(self, batch, logs=None):
-    if batch % self.log_batch_size == 0:
-      last_n_batches = time.time() - self.batch_time_start
-      examples_per_second = (self._batch_size * self.log_batch_size) / last_n_batches
-      self.batch_times_secs.append(last_n_batches)
+    if batch % self.log_steps == 0:
+      elapsed_time = time.time() - self.start_time
+      examples_per_second = (self._batch_size * self.log_steps) / elapsed_time
       self.record_batch = True
       # TODO(anjalisridhar): add timestamp as well.
       if batch != 0:
         tf.logging.info("BenchmarkMetric: {'num_batches':%d, 'time_taken': %f,"
                         "'images_per_second': %f}" %
-                        (batch, last_n_batches, examples_per_second))
+                        (batch, elapsed_time, examples_per_second))
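A quick usage sketch of the callback (illustrative; the keyword argument mirrors the batch_size attribute set in the constructor above, and model/train_dataset are assumed to exist). With log_steps = 100, throughput is logged once every 100 batches:

    time_callback = TimeHistory(batch_size=128)
    model.fit(train_dataset, epochs=1, steps_per_epoch=1000,
              callbacks=[time_callback])
    # -> BenchmarkMetric: {'num_batches':100, 'time_taken': ..., 'images_per_second': ...}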
 class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
   """Callback to update learning rate on every batch (not epoch boundaries).

@@ -95,20 +93,14 @@ class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
       raise ValueError('The output of the "schedule" function should be float.')
     if lr != self.prev_lr:
       self.model.optimizer.learning_rate = lr  # lr should be a float here
-      # tf.keras.backend.set_value(self.model.optimizer.learning_rate, lr)
       self.prev_lr = lr
       tf.logging.debug('Epoch %05d Batch %05d: LearningRateBatchScheduler change '
                        'learning rate to %s.', self.epochs, batch, lr)
 def get_optimizer():
-  if FLAGS.use_tf_momentum_optimizer:
-    learning_rate = BASE_LEARNING_RATE * FLAGS.batch_size / 256
-    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
-  else:
-    # optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
-    optimizer = gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)
-  return optimizer
+  # The learning rate set here is a placeholder and not used. It will be
+  # overwritten at the beginning of each batch by the callback.
+  return gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)

 def get_callbacks(learning_rate_schedule_fn, num_images):
@@ -124,25 +116,15 @@ def get_callbacks(learning_rate_schedule_fn, num_images):
   return time_callback, tensorboard_callback, lr_callback

+def analyze_fit_and_eval_result(history, eval_output):
+  stats = {}
+  stats['accuracy_top_1'] = eval_output[1]
+  stats['eval_loss'] = eval_output[0]
+  stats['training_loss'] = history.history['loss'][-1]
+  stats['training_accuracy_top_1'] = history.history['categorical_accuracy'][-1]
+
+  print('Test loss:{}'.format(stats['eval_loss']))
+  print('top_1 accuracy:{}'.format(stats['accuracy_top_1']))
+  print('top_1_training_accuracy:{}'.format(stats['training_accuracy_top_1']))
+  return stats
 def define_keras_flags():
   flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
   flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
   flags.DEFINE_integer(
       name="train_steps", default=None,
-      help="The number of steps to run for training")
+      help="The number of steps to run for training. If it is larger than "
+           "# batches per epoch, then use # batches per epoch. When this "
+           "flag is set, only one epoch is going to run for training.")
 def get_synth_input_fn(height, width, num_channels, num_classes,
@@ -152,7 +134,7 @@ def get_synth_input_fn(height, width, num_channels, num_classes,
   This input_fn returns a data set that iterates over a set of random data and
   bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
   copy is still included. This used to find the upper throughput bound when
-  tunning the full input pipeline.
+  tuning the full input pipeline.

   Args:
     height: Integer height that will be used to create a fake image tensor.
...
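For context, a minimal sketch of what such a synthetic input_fn can look like (illustrative, TF1-style; the argument names mirror the signature above, but the body is not the repo's exact implementation):

    import tensorflow as tf

    def get_synth_input_fn(height, width, num_channels, num_classes):
      def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
        # Fixed random tensors, repeated forever: decode/augment is bypassed,
        # so only the host-to-device copy and the training step are measured.
        images = tf.random_uniform([batch_size, height, width, num_channels])
        labels = tf.random_uniform(
            [batch_size], maxval=num_classes, dtype=tf.int32)
        return tf.data.Dataset.from_tensors((images, labels)).repeat()
      return input_fn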
@@ -26,13 +26,11 @@ from official.resnet import imagenet_main
 from official.resnet import imagenet_preprocessing
 from official.resnet import resnet_run_loop
 from official.resnet.keras import keras_common
-from official.resnet.keras import resnet50
+from official.resnet.keras import resnet_model
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
-# import os
-# os.environ['TF2_BEHAVIOR'] = 'enabled'
 LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
     (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)

@@ -42,12 +40,8 @@ LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
 def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch, batch_size):
   """Handles linear scaling rule, gradual warmup, and LR decay.

-  The learning rate starts at 0, then it increases linearly per step.
-  After 5 epochs we reach the base learning rate (scaled to account
-  for batch size).
-  After 30, 60 and 80 epochs the learning rate is divided by 10.
-  After 90 epochs training stops and the LR is set to 0. This ensures
-  that we train for exactly 90 epochs for reproducibility.
+  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
+  provided scaling factor.

   Args:
     current_epoch: integer, current epoch indexed from 0.
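The deleted docstring spelled out what the (multiplier, start-epoch) tuples encode. As a hedged sketch of a schedule of this shape (only LR_SCHEDULE is taken from the diff above; BASE_LEARNING_RATE and the 5-epoch warmup are assumptions chosen to match the deleted prose):

    # Illustrative schedule driven by (multiplier, start_epoch) tuples.
    BASE_LEARNING_RATE = 0.1  # assumed constant, not shown on this page
    WARMUP_EPOCHS = 5         # assumed warmup length
    LR_SCHEDULE = [(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)]

    def learning_rate_schedule(current_epoch, current_batch,
                               batches_per_epoch, batch_size):
      # Linear scaling rule: scale the base LR with the global batch size.
      initial_lr = BASE_LEARNING_RATE * batch_size / 256
      epoch = current_epoch + float(current_batch) / batches_per_epoch
      if epoch < WARMUP_EPOCHS:
        # Gradual warmup: ramp linearly from 0 up to the scaled base LR.
        return initial_lr * epoch / WARMUP_EPOCHS
      lr = initial_lr
      for multiplier, start_epoch in LR_SCHEDULE:
        if epoch >= start_epoch:
          lr = initial_lr * multiplier
      return lr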
@@ -81,7 +75,7 @@ def parse_record_keras(raw_record, is_training, dtype):
   return image, label

-def run_imagenet_with_keras(flags_obj):
+def run(flags_obj):
   """Run ResNet ImageNet training and eval loop using native Keras APIs.

   Args:
@@ -128,9 +122,9 @@ def run_imagenet_with_keras(flags_obj):
   optimizer = keras_common.get_optimizer()

   strategy = distribution_utils.get_distribution_strategy(
-      flags_obj.num_gpus, flags_obj.use_one_device_strategy)
+      flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)

-  model = resnet50.ResNet50(num_classes=imagenet_main.NUM_CLASSES)
+  model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES)

   model.compile(loss='sparse_categorical_crossentropy',
                 optimizer=optimizer,
@@ -140,10 +134,6 @@ def run_imagenet_with_keras(flags_obj):
   time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
       learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])

-  steps_per_epoch = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
-  num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
-                    flags_obj.batch_size)
-
   train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
   train_epochs = flags_obj.train_epochs

@@ -151,31 +141,35 @@ def run_imagenet_with_keras(flags_obj):
     train_steps = min(flags_obj.train_steps, train_steps)
     train_epochs = 1

+  num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
+                    flags_obj.batch_size)
+
+  validation_data = eval_input_dataset
+  if flags_obj.skip_eval:
+    num_eval_steps = None
+    validation_data = None
+
   history = model.fit(train_input_dataset,
                       epochs=train_epochs,
                       steps_per_epoch=train_steps,
                       callbacks=[
                           time_callback,
                           lr_callback,
                           tensorboard_callback
                       ],
                       validation_steps=num_eval_steps,
-                      validation_data=eval_input_dataset,
+                      validation_data=validation_data,
                       verbose=1)

   if not flags_obj.skip_eval:
     eval_output = model.evaluate(eval_input_dataset,
                                  steps=num_eval_steps,
                                  verbose=1)
+    stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
+    return stats
 def main(_):
   with logger.benchmark_context(flags.FLAGS):
-    run_imagenet_with_keras(flags.FLAGS)
+    run(flags.FLAGS)

 if __name__ == '__main__':
...
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""ResNet50 model for Keras adapted from tf.keras.applications.ResNet50.
+"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.

 # Reference:
 - [Deep Residual Learning for Image Recognition](
@@ -200,7 +200,7 @@ def conv_building_block(input_tensor,
   return x

-def ResNet56(input_shape=None, classes=1000):
+def resnet56(input_shape=None, classes=1000):
   """Instantiates the ResNet56 architecture.

   Arguments:
...
@@ -15,6 +15,7 @@
 """ResNet50 model for Keras.

 Adapted from tf.keras.applications.resnet50.ResNet50().
+This is ResNet model version 1.5.

 Related papers/blogs:
 - https://arxiv.org/abs/1512.03385
@@ -179,7 +180,7 @@ def conv_block(input_tensor,
   return x

-def ResNet50(num_classes):
+def resnet50(num_classes):
   """Instantiates the ResNet50 architecture.

   Args:
...
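The added "version 1.5" line refers to a widely used variant of ResNet v1: in the downsampling bottleneck blocks, the stride-2 convolution is the middle 3x3 rather than the first 1x1. A minimal sketch of that distinction (illustrative Keras functional style, with batch norm and most activations omitted for brevity; not this file's exact code):

    from tensorflow.keras import layers

    def bottleneck_block_v1_5(x, filters, stride):
      # Projection shortcut matches the changed spatial size / channel count.
      shortcut = layers.Conv2D(4 * filters, 1, strides=stride)(x)
      y = layers.Conv2D(filters, 1, strides=1)(x)  # v1 put the stride here
      y = layers.Conv2D(filters, 3, strides=stride, padding='same')(y)  # v1.5
      y = layers.Conv2D(4 * filters, 1, strides=1)(y)
      return layers.Activation('relu')(layers.add([shortcut, y]))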
@@ -629,12 +629,9 @@ def define_resnet_flags(resnet_size_choices=None):
           'inference. Note, this flag only applies to ImageNet and cannot '
           'be used for CIFAR.'))
   flags.DEFINE_boolean(
-      name='use_one_device_strategy', default=True,
-      help=flags_core.help_wrap('Set to False to not use distribution '
+      name='turn_off_distribution_strategy', default=False,
+      help=flags_core.help_wrap('Set to True to not use distribution '
                                 'strategies.'))
-  flags.DEFINE_boolean(name='use_tf_momentum_optimizer', default=False,
-                       help='Use tf MomentumOptimizer.')

   choice_kwargs = dict(
       name='resnet_size', short_name='rs', default='50',
       help=flags_core.help_wrap('The size of the ResNet model to use.'))
...
@@ -22,7 +22,7 @@ import tensorflow as tf

 def get_distribution_strategy(
-    num_gpus, all_reduce_alg=None, use_one_device_strategy=True):
+    num_gpus, all_reduce_alg=None, turn_off_distribution_strategy=False):
   """Return a DistributionStrategy for running the model.

   Args:
@@ -31,25 +31,30 @@ def get_distribution_strategy(
       See tf.contrib.distribute.AllReduceCrossDeviceOps for available
       algorithms. If None, DistributionStrategy will choose based on device
       topology.
-    use_one_device_strategy: Should only be set to Truen when num_gpus is 1.
-      If True, then use OneDeviceStrategy; otherwise, do not use any
-      distribution strategy.
+    turn_off_distribution_strategy: when set to True, do not use any
+      distribution strategy. Note that when it is True, and num_gpus is
+      larger than 1, it will raise a ValueError.

   Returns:
     tf.contrib.distribute.DistibutionStrategy object.
+
+  Raises:
+    ValueError: if turn_off_distribution_strategy is True and num_gpus is
+      larger than 1.
   """
-  if num_gpus == 0 and use_one_device_strategy:
-    return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
-  elif num_gpus == 0:
-    return None
-  elif num_gpus == 1 and use_one_device_strategy:
-    return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
+  if num_gpus == 0:
+    if turn_off_distribution_strategy:
+      return None
+    else:
+      return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
   elif num_gpus == 1:
-    return None
-  elif use_one_device_strategy:
-    raise ValueError("When %d GPUs are specified, use_one_device_strategy"
-                     " flag cannot be set to True.".format(num_gpus))
-  else:  # num_gpus > 1 and not use_one_device_strategy
+    if turn_off_distribution_strategy:
+      return None
+    else:
+      return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
+  elif turn_off_distribution_strategy:
+    raise ValueError("When %d GPUs are specified, turn_off_distribution_strategy"
+                     " flag cannot be set to True." % num_gpus)
+  else:  # num_gpus > 1 and not turn_off_distribution_strategy
     if all_reduce_alg:
       return tf.contrib.distribute.MirroredStrategy(
           num_gpus=num_gpus,
...
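To make the selection logic concrete, the outcomes for a few calls (illustrative; assumes the module is imported as distribution_utils, with the signature shown in the hunk above):

    from official.utils.misc import distribution_utils

    distribution_utils.get_distribution_strategy(num_gpus=0)
    # -> OneDeviceStrategy("device:CPU:0")
    distribution_utils.get_distribution_strategy(num_gpus=1)
    # -> OneDeviceStrategy("device:GPU:0")
    distribution_utils.get_distribution_strategy(num_gpus=4)
    # -> MirroredStrategy over 4 GPUs
    distribution_utils.get_distribution_strategy(
        num_gpus=1, turn_off_distribution_strategy=True)
    # -> None
    distribution_utils.get_distribution_strategy(
        num_gpus=4, turn_off_distribution_strategy=True)
    # -> raises ValueError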