Commit c923a420 authored by Shining Sun's avatar Shining Sun

bug fixes

parent 80dcd27c
......@@ -25,7 +25,7 @@ import tensorflow as tf # pylint: disable=g-bad-import-order
from official.resnet import cifar10_main as cifar_main
from official.resnet import resnet_run_loop
from official.resnet.keras import keras_common
from official.resnet.keras import resnet56
from official.resnet.keras import resnet_cifar_model
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.utils.misc import distribution_utils
......@@ -39,8 +39,8 @@ LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch, batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
The learning rate starts at base learning_rate, then after 91, 136 and
182 epochs, the learning rate is divided by 10.
Scale the learning rate at the epoch boundaries given in LR_SCHEDULE by the
corresponding scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
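For readers skimming the diff, a minimal sketch of how a list of (multiplier, epoch to start) tuples such as LR_SCHEDULE can be turned into a per-epoch learning rate. This is illustrative only; the BASE_LEARNING_RATE value and the 128 scaling divisor are assumptions, while the boundary epochs and 10x drops come from the old docstring above.

# Illustrative sketch only; BASE_LEARNING_RATE and the 128 divisor are assumed.
BASE_LEARNING_RATE = 0.1
CIFAR_LR_SCHEDULE = [(0.1, 91), (0.01, 136), (0.001, 182)]  # (multiplier, start epoch)

def lookup_learning_rate(current_epoch, batch_size, schedule=CIFAR_LR_SCHEDULE):
  """Returns the base rate scaled by the last multiplier whose start epoch has passed."""
  scaled_base = BASE_LEARNING_RATE * batch_size / 128.0  # linear scaling rule
  lr = scaled_base
  for multiplier, start_epoch in schedule:
    if current_epoch >= start_epoch:
      lr = scaled_base * multiplier
  return lr

# With this schedule the rate drops by 10x after epochs 91, 136 and 182,
# matching the behaviour described in the removed docstring.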
......@@ -65,7 +65,7 @@ def parse_record_keras(raw_record, is_training, dtype):
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
This method converts the label to onhot to fit the loss function.
This method converts the label to a one-hot vector to match the loss function.
Args:
raw_record: scalar Tensor tf.string containing a serialized
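As a hedged illustration of the one-hot conversion mentioned above (not the exact code in parse_record_keras), an integer CIFAR-10 label can be expanded with tf.one_hot so it matches a categorical_crossentropy loss:

import tensorflow as tf

NUM_CLASSES = 10  # CIFAR-10

def to_one_hot(label, num_classes=NUM_CLASSES):
  # e.g. label 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], the dense vector expected
  # by categorical_crossentropy.
  return tf.one_hot(tf.cast(label, tf.int32), num_classes)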
......@@ -127,9 +127,9 @@ def run(flags_obj):
optimizer = keras_common.get_optimizer()
strategy = distribution_utils.get_distribution_strategy(
flags_obj.num_gpus, flags_obj.use_one_device_strategy)
flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)
model = resnet56.ResNet56(input_shape=(32, 32, 3),
model = resnet_cifar_model.resnet56(input_shape=(32, 32, 3),
classes=cifar_main.NUM_CLASSES)
model.compile(loss='categorical_crossentropy',
......@@ -150,6 +150,11 @@ def run(flags_obj):
num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
num_eval_steps = None
validation_data = None
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
......@@ -159,7 +164,7 @@ def run(flags_obj):
tensorboard_callback
],
validation_steps=num_eval_steps,
validation_data=eval_input_dataset,
validation_data=validation_data,
verbose=1)
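The skip_eval branch above works because Keras simply skips the validation pass when validation_data is None. A small, self-contained sketch of the same pattern; the model and data here are toy placeholders, not the ResNet pipeline:

import numpy as np
import tensorflow as tf

skip_eval = True  # stands in for flags_obj.skip_eval

x_train = np.random.rand(64, 8).astype("float32")
y_train = np.random.randint(0, 2, size=(64, 1)).astype("float32")

model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation="sigmoid")])
model.compile(optimizer="sgd", loss="binary_crossentropy")

validation_data = (x_train[:16], y_train[:16])
if skip_eval:
  validation_data = None  # no validation pass is run when this is None

model.fit(x_train, y_train, epochs=1, validation_data=validation_data, verbose=1)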
if not flags_obj.skip_eval:
......@@ -167,10 +172,6 @@ def run(flags_obj):
steps=num_eval_steps,
verbose=1)
stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
return stats
def main(_):
with logger.benchmark_context(flags.FLAGS):
......
......@@ -42,28 +42,26 @@ class TimeHistory(tf.keras.callbacks.Callback):
"""
self._batch_size = batch_size
super(TimeHistory, self).__init__()
self.log_batch_size = 100
self.log_steps = 100
def on_train_begin(self, logs=None):
self.batch_times_secs = []
self.record_batch = True
def on_batch_begin(self, batch, logs=None):
if self.record_batch:
self.batch_time_start = time.time()
self.start_time= time.time()
self.record_batch = False
def on_batch_end(self, batch, logs=None):
if batch % self.log_batch_size == 0:
last_n_batches = time.time() - self.batch_time_start
examples_per_second = (self._batch_size * self.log_batch_size) / last_n_batches
self.batch_times_secs.append(last_n_batches)
if batch % self.log_steps == 0:
elapsed_time = time.time() - self.start_time
examples_per_second = (self._batch_size * self.log_steps) / elapsed_time
self.record_batch = True
# TODO(anjalisridhar): add timestamp as well.
if batch != 0:
tf.logging.info("BenchmarkMetric: {'num_batches':%d, 'time_taken': %f,"
"'images_per_second': %f}" %
(batch, last_n_batches, examples_per_second))
(batch, elapsed_time, examples_per_second))
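The images_per_second figure logged above is simply batch_size * log_steps divided by the wall-clock time of that window. A minimal, self-contained callback in the same spirit; names and logging format are illustrative, not the repo's exact class:

import time
import tensorflow as tf

class SimpleThroughputLogger(tf.keras.callbacks.Callback):
  """Logs examples/sec over a window of log_steps batches (illustrative)."""

  def __init__(self, batch_size, log_steps=100):
    super(SimpleThroughputLogger, self).__init__()
    self.batch_size = batch_size
    self.log_steps = log_steps
    self.start_time = None

  def on_batch_begin(self, batch, logs=None):
    if self.start_time is None:
      self.start_time = time.time()  # start timing a new window

  def on_batch_end(self, batch, logs=None):
    if batch != 0 and batch % self.log_steps == 0:
      elapsed_time = time.time() - self.start_time
      examples_per_second = (self.batch_size * self.log_steps) / elapsed_time
      print("num_batches=%d time_taken=%.3fs images_per_second=%.1f"
            % (batch, elapsed_time, examples_per_second))
      self.start_time = None  # reset for the next window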
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
"""Callback to update learning rate on every batch (not epoch boundaries).
......@@ -95,20 +93,14 @@ class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
raise ValueError('The output of the "schedule" function should be float.')
if lr != self.prev_lr:
self.model.optimizer.learning_rate = lr # lr should be a float here
# tf.keras.backend.set_value(self.model.optimizer.learning_rate, lr)
self.prev_lr = lr
tf.logging.debug('Epoch %05d Batch %05d: LearningRateBatchScheduler changing '
'learning rate to %s.', self.epochs, batch, lr)
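Putting the pieces above together, a hedged sketch of a per-batch learning-rate callback; the schedule argument is any function (epoch, batch) -> float, and the direct assignment to optimizer.learning_rate mirrors the line in this diff (tf.keras.backend.set_value is a common alternative):

import tensorflow as tf

class PerBatchLRScheduler(tf.keras.callbacks.Callback):
  """Illustrative per-batch scheduler; schedule(epoch, batch) returns a float."""

  def __init__(self, schedule):
    super(PerBatchLRScheduler, self).__init__()
    self.schedule = schedule
    self.epoch = 0
    self.prev_lr = None

  def on_epoch_begin(self, epoch, logs=None):
    self.epoch = epoch

  def on_batch_begin(self, batch, logs=None):
    lr = float(self.schedule(self.epoch, batch))
    if lr != self.prev_lr:
      # Assign directly, as in the diff above; set_value on the optimizer's
      # learning rate variable is an alternative depending on optimizer version.
      self.model.optimizer.learning_rate = lr
      self.prev_lr = lr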
def get_optimizer():
if FLAGS.use_tf_momentum_optimizer:
learning_rate = BASE_LEARNING_RATE * FLAGS.batch_size / 256
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
else:
# optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
optimizer = gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)
return optimizer
# The learning rate set here is a placeholder and is not used. It will be overwritten
# at the beginning of each batch by the LearningRateBatchScheduler callback.
return gradient_descent_v2.SGD(learning_rate=0.1, momentum=0.9)
def get_callbacks(learning_rate_schedule_fn, num_images):
......@@ -124,25 +116,15 @@ def get_callbacks(learning_rate_schedule_fn, num_images):
return time_callback, tensorboard_callback, lr_callback
def analyze_fit_and_eval_result(history, eval_output):
stats = {}
stats['accuracy_top_1'] = eval_output[1]
stats['eval_loss'] = eval_output[0]
stats['training_loss'] = history.history['loss'][-1]
stats['training_accuracy_top_1'] = history.history['categorical_accuracy'][-1]
print('Test loss:{}'.format(stats['eval_loss']))
print('top_1 accuracy:{}'.format(stats['accuracy_top_1']))
print('top_1_training_accuracy:{}'.format(stats['training_accuracy_top_1']))
return stats
def define_keras_flags():
flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
flags.DEFINE_integer(
name="train_steps", default=None,
help="The number of steps to run for training")
help="The number of steps to run for training. If it is larger than "
"# batches per epoch, then use # bathes per epoch. When this flag is "
"set, only one epoch is going to run for training.")
def get_synth_input_fn(height, width, num_channels, num_classes,
......@@ -152,7 +134,7 @@ def get_synth_input_fn(height, width, num_channels, num_classes,
This input_fn returns a data set that iterates over a set of random data and
bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
copy is still included. This is used to find the upper throughput bound when
tunning the full input pipeline.
tuning the full input pipeline.
Args:
height: Integer height that will be used to create a fake image tensor.
......
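A hedged sketch of the synthetic-input idea described above (not the repo's get_synth_input_fn): a tf.data pipeline built from random tensors that skips JPEG decode and augmentation, so it approximates the input pipeline's upper throughput bound:

import tensorflow as tf

def synthetic_dataset(height=32, width=32, num_channels=3, num_classes=10,
                      batch_size=128):
  # One random batch repeated forever; the host-to-device copy still happens,
  # but all preprocessing is bypassed.
  images = tf.random.uniform([batch_size, height, width, num_channels])
  labels = tf.random.uniform([batch_size], maxval=num_classes, dtype=tf.int32)
  return tf.data.Dataset.from_tensors((images, labels)).repeat()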
......@@ -26,13 +26,11 @@ from official.resnet import imagenet_main
from official.resnet import imagenet_preprocessing
from official.resnet import resnet_run_loop
from official.resnet.keras import keras_common
from official.resnet.keras import resnet50
from official.resnet.keras import resnet_model
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.utils.misc import distribution_utils
# import os
# os.environ['TF2_BEHAVIOR'] = 'enabled'
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
......@@ -42,12 +40,8 @@ LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
def learning_rate_schedule(current_epoch, current_batch, batches_per_epoch, batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
The learning rate starts at 0, then it increases linearly per step.
After 5 epochs we reach the base learning rate (scaled to account
for batch size).
After 30, 60 and 80 epochs the learning rate is divided by 10.
After 90 epochs training stops and the LR is set to 0. This ensures
that we train for exactly 90 epochs for reproducibility.
Scale the learning rate at the epoch boundaries given in LR_SCHEDULE by the
corresponding scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
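For reference, a hedged reconstruction of the warm-up-plus-decay behaviour that the removed docstring describes for ImageNet; BASE_LEARNING_RATE and the 256 scaling divisor are assumptions, while the LR_SCHEDULE tuples come from this file:

BASE_LEARNING_RATE = 0.1  # assumed
LR_SCHEDULE = [(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)]

def warmup_then_decay(current_epoch, current_batch, batches_per_epoch, batch_size):
  """Linear warmup over the first 5 epochs, then step decay at LR_SCHEDULE epochs."""
  scaled_lr = BASE_LEARNING_RATE * batch_size / 256.0  # assumed linear scaling rule
  warmup_end_epoch = LR_SCHEDULE[0][1]  # 5
  epoch = current_epoch + float(current_batch) / batches_per_epoch
  if epoch < warmup_end_epoch:
    return scaled_lr * epoch / warmup_end_epoch  # ramps up from 0 per step
  lr = scaled_lr
  for multiplier, start_epoch in LR_SCHEDULE:
    if epoch >= start_epoch:
      lr = scaled_lr * multiplier
  return lr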
......@@ -81,7 +75,7 @@ def parse_record_keras(raw_record, is_training, dtype):
return image, label
def run_imagenet_with_keras(flags_obj):
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using native Keras APIs.
Args:
......@@ -128,9 +122,9 @@ def run_imagenet_with_keras(flags_obj):
optimizer = keras_common.get_optimizer()
strategy = distribution_utils.get_distribution_strategy(
flags_obj.num_gpus, flags_obj.use_one_device_strategy)
flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy)
model = resnet50.ResNet50(num_classes=imagenet_main.NUM_CLASSES)
model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES)
model.compile(loss='sparse_categorical_crossentropy',
optimizer=optimizer,
......@@ -140,10 +134,6 @@ def run_imagenet_with_keras(flags_obj):
time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])
steps_per_epoch = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
flags_obj.batch_size)
train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
train_epochs = flags_obj.train_epochs
......@@ -151,6 +141,14 @@ def run_imagenet_with_keras(flags_obj):
train_steps = min(flags_obj.train_steps, train_steps)
train_epochs = 1
num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
num_eval_steps = None
validation_data = None
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
......@@ -160,7 +158,7 @@ def run_imagenet_with_keras(flags_obj):
tensorboard_callback
],
validation_steps=num_eval_steps,
validation_data=eval_input_dataset,
validation_data=validation_data,
verbose=1)
if not flags_obj.skip_eval:
......@@ -168,14 +166,10 @@ def run_imagenet_with_keras(flags_obj):
steps=num_eval_steps,
verbose=1)
stats = keras_common.analyze_fit_and_eval_result(history, eval_output)
return stats
def main(_):
with logger.benchmark_context(flags.FLAGS):
run_imagenet_with_keras(flags.FLAGS)
run(flags.FLAGS)
if __name__ == '__main__':
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet50 model for Keras adapted from tf.keras.applications.ResNet50.
"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
# Reference:
- [Deep Residual Learning for Image Recognition](
......@@ -200,7 +200,7 @@ def conv_building_block(input_tensor,
return x
def ResNet56(input_shape=None, classes=1000):
def resnet56(input_shape=None, classes=1000):
"""Instantiates the ResNet56 architecture.
Arguments:
......
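A hedged usage sketch of the renamed constructor; the module path and call signature follow the import and call sites earlier in this diff, while the optimizer choice here is a placeholder:

from official.resnet.keras import resnet_cifar_model

model = resnet_cifar_model.resnet56(input_shape=(32, 32, 3), classes=10)
model.compile(loss='categorical_crossentropy', optimizer='sgd',
              metrics=['categorical_accuracy'])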
......@@ -15,6 +15,7 @@
"""ResNet50 model for Keras.
Adapted from tf.keras.applications.resnet50.ResNet50().
This is ResNet model version 1.5.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
......@@ -179,7 +180,7 @@ def conv_block(input_tensor,
return x
def ResNet50(num_classes):
def resnet50(num_classes):
"""Instantiates the ResNet50 architecture.
Args:
......
......@@ -629,12 +629,9 @@ def define_resnet_flags(resnet_size_choices=None):
'inference. Note, this flag only applies to ImageNet and cannot '
'be used for CIFAR.'))
flags.DEFINE_boolean(
name='use_one_device_strategy', default=True,
help=flags_core.help_wrap('Set to False to not use distribution '
name='turn_off_distribution_strategy', default=False,
help=flags_core.help_wrap('Set to True to not use distribution '
'strategies.'))
flags.DEFINE_boolean(name='use_tf_momentum_optimizer', default=False,
help='Use tf MomentumOptimizer.')
choice_kwargs = dict(
name='resnet_size', short_name='rs', default='50',
help=flags_core.help_wrap('The size of the ResNet model to use.'))
......
......@@ -22,7 +22,7 @@ import tensorflow as tf
def get_distribution_strategy(
num_gpus, all_reduce_alg=None, use_one_device_strategy=True):
num_gpus, all_reduce_alg=None, turn_off_distribution_strategy=False):
"""Return a DistributionStrategy for running the model.
Args:
......@@ -31,25 +31,30 @@ def get_distribution_strategy(
See tf.contrib.distribute.AllReduceCrossDeviceOps for available
algorithms. If None, DistributionStrategy will choose based on device
topology.
use_one_device_strategy: Should only be set to Truen when num_gpus is 1.
If True, then use OneDeviceStrategy; otherwise, do not use any
distribution strategy.
turn_off_distribution_strategy: When set to True, do not use any
distribution strategy. Note that when this is True and num_gpus is
larger than 1, a ValueError is raised.
Returns:
tf.contrib.distribute.DistributionStrategy object.
Raises:
ValueError: if turn_off_distribution_strategy is True and num_gpus is
larger than 1.
"""
if num_gpus == 0 and use_one_device_strategy:
return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
elif num_gpus == 0:
if num_gpus == 0:
if turn_off_distribution_strategy:
return None
elif num_gpus == 1 and use_one_device_strategy:
return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
else:
return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
elif num_gpus == 1:
if turn_off_distribution_strategy:
return None
elif use_one_device_strategy:
raise ValueError("When %d GPUs are specified, use_one_device_strategy"
else:
return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
elif turn_off_distribution_strategy:
raise ValueError("When %d GPUs are specified, turn_off_distribution_strategy"
" flag cannot be set to True.".format(num_gpus))
else: # num_gpus > 1 and not use_one_device_strategy
else: # num_gpus > 1 and not turn_off_distribution_strategy
if all_reduce_alg:
return tf.contrib.distribute.MirroredStrategy(
num_gpus=num_gpus,
......
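To summarize the branching above in one place, an illustrative table of what get_distribution_strategy is expected to return, followed by a usage sketch that matches the signature shown in this diff:

#   num_gpus   turn_off_distribution_strategy   result
#   --------   ------------------------------   ----------------------------------
#   0          True                              None (no strategy)
#   0          False                             OneDeviceStrategy("device:CPU:0")
#   1          True                              None (no strategy)
#   1          False                             OneDeviceStrategy("device:GPU:0")
#   >1         True                              ValueError
#   >1         False                             MirroredStrategy(num_gpus=num_gpus)

from official.utils.misc import distribution_utils

strategy = distribution_utils.get_distribution_strategy(
    num_gpus=1, turn_off_distribution_strategy=False)  # OneDeviceStrategy on GPU:0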