Commit 6f881f77 authored by Shining Sun

bug fixes and clean ups

parent b1b4c805
@@ -39,7 +39,7 @@ NUM_CLASSES = 10
 _NUM_DATA_FILES = 5
 # TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
-_NUM_IMAGES = {
+NUM_IMAGES = {
     'train': 50000,
     'validation': 10000,
 }
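Note: dropping the leading underscore makes `NUM_IMAGES` part of the module's public surface, which the Keras runners below depend on via `cifar_main.NUM_IMAGES`.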
@@ -134,7 +134,7 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
       dataset=dataset,
       is_training=is_training,
       batch_size=batch_size,
-      shuffle_buffer=_NUM_IMAGES['train'],
+      shuffle_buffer=NUM_IMAGES['train'],
       parse_record_fn=parse_record_fn,
       num_epochs=num_epochs,
       dtype=dtype,
@@ -200,7 +200,7 @@ def cifar10_model_fn(features, labels, mode, params):
   # Learning rate schedule follows arXiv:1512.03385 for ResNet-56 and under.
   learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
       batch_size=params['batch_size'], batch_denom=128,
-      num_images=_NUM_IMAGES['train'], boundary_epochs=[91, 136, 182],
+      num_images=NUM_IMAGES['train'], boundary_epochs=[91, 136, 182],
       decay_rates=[1, 0.1, 0.01, 0.001])
   # Weight decay of 2e-4 diverges from 1e-4 decay used in the ResNet paper
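For context on the hunk above: `learning_rate_with_decay` builds a piecewise-constant schedule in which the base rate (scaled by `batch_size / batch_denom`) is multiplied by successive `decay_rates` entries as training crosses each `boundary_epochs` entry. A minimal illustrative sketch of that behavior (`piecewise_lr` is a made-up name; the real helper computes this per global step, not per epoch):

def piecewise_lr(epoch, base_lr=0.1, batch_size=128, batch_denom=128,
                 boundary_epochs=(91, 136, 182),
                 decay_rates=(1, 0.1, 0.01, 0.001)):
  """Return the learning rate for a given epoch under the schedule above."""
  initial_lr = base_lr * batch_size / batch_denom
  rate = decay_rates[0]
  # Each boundary crossed switches to the next decay rate.
  for boundary, decay in zip(boundary_epochs, decay_rates[1:]):
    if epoch >= boundary:
      rate = decay
  return initial_lr * rate

print(piecewise_lr(50))   # 0.1     (before epoch 91)
print(piecewise_lr(100))  # ~0.01   (epochs 91-135)
print(piecewise_lr(200))  # ~0.0001 (epoch 182 onwards)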
@@ -152,18 +152,24 @@ def run(flags_obj):
   model.compile(loss='categorical_crossentropy',
                 optimizer=optimizer,
                 metrics=['categorical_accuracy'],
-                strategy=strategy)
+                distribute=strategy)

   time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks(
       learning_rate_schedule, cifar_main.NUM_IMAGES['train'])

-  steps_per_epoch = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size
+  train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size
+  train_epochs = flags_obj.train_epochs
+
+  if flags_obj.train_steps:
+    train_steps = min(flags_obj.train_steps, train_steps)
+    train_epochs = 1
+
   num_eval_steps = (cifar_main.NUM_IMAGES['validation'] //
                     flags_obj.batch_size)

   history = model.fit(train_input_dataset,
-                      epochs=flags_obj.train_epochs,
-                      steps_per_epoch=steps_per_epoch,
+                      epochs=train_epochs,
+                      steps_per_epoch=train_steps,
                       callbacks=[
                           time_callback,
                           lr_callback,
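The renaming from `steps_per_epoch` to `train_steps` goes with the new `--train_steps` flag (defined in `keras_common` further down): when set, it caps the step count and collapses training to a single epoch, which is handy for quick benchmarking. A sketch with made-up numbers of how the two flags interact:

NUM_TRAIN_IMAGES = 50000  # CIFAR-10 training-set size
batch_size = 128
flag_train_steps = 100    # e.g. --train_steps=100
flag_train_epochs = 10    # e.g. --train_epochs=10

train_steps = NUM_TRAIN_IMAGES // batch_size  # 390 steps in a full epoch
train_epochs = flag_train_epochs

if flag_train_steps:
  train_steps = min(flag_train_steps, train_steps)  # capped at 100
  train_epochs = 1                                  # one short epoch

print(train_steps, train_epochs)  # 100 1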
@@ -190,4 +196,5 @@ def main(_):
 if __name__ == '__main__':
   tf.logging.set_verbosity(tf.logging.INFO)
   cifar_main.define_cifar_flags()
+  keras_common.define_keras_flags()
   absl_app.run(main)
@@ -56,8 +56,7 @@ class TimeHistory(tf.keras.callbacks.Callback):
   def on_batch_end(self, batch, logs=None):
     if batch % self.log_batch_size == 0:
       last_n_batches = time.time() - self.batch_time_start
-      examples_per_second =
-          (self._batch_size * self.log_batch_size) / last_n_batches
+      examples_per_second = (self._batch_size * self.log_batch_size) / last_n_batches
       self.batch_times_secs.append(last_n_batches)
       self.record_batch = True
       # TODO(anjalisridhar): add timestamp as well.
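The removed form was a hard `SyntaxError`: Python will not accept a line ending in a bare `=` without an explicit continuation. Joining the lines fixes it; if line length is a concern, an equally valid style is to open the parentheses on the assignment line:

examples_per_second = (
    self._batch_size * self.log_batch_size) / last_n_batches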
@@ -131,8 +130,14 @@ def analyze_fit_and_eval_result(history, eval_output):
   stats['training_loss'] = history.history['loss'][-1]
   stats['training_accuracy_top_1'] = history.history['categorical_accuracy'][-1]

-  print('Test loss:{}'.format(stats['']))
+  print('Test loss:{}'.format(stats['eval_loss']))
   print('top_1 accuracy:{}'.format(stats['accuracy_top_1']))
   print('top_1_training_accuracy:{}'.format(stats['training_accuracy_top_1']))
   return stats
+
+
+def define_keras_flags():
+  flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
+  flags.DEFINE_integer(
+      name="train_steps", default=None,
+      help="The number of steps to run for training")
@@ -153,9 +153,16 @@ def run_imagenet_with_keras(flags_obj):
   num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] //
                     flags_obj.batch_size)

   train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size
+  train_epochs = flags_obj.train_epochs
+
+  if flags_obj.train_steps:
+    train_steps = min(flags_obj.train_steps, train_steps)
+    train_epochs = 1
+
   history = model.fit(train_input_dataset,
-                      epochs=flags_obj.train_epochs,
-                      steps_per_epoch=steps_per_epoch,
+                      epochs=train_epochs,
+                      steps_per_epoch=train_steps,
                       callbacks=[
                           time_callback,
                           lr_callback,
@@ -182,4 +189,5 @@ def main(_):
 if __name__ == '__main__':
   tf.logging.set_verbosity(tf.logging.INFO)
   imagenet_main.define_imagenet_flags()
+  keras_common.define_keras_flags()
   absl_app.run(main)
@@ -632,7 +632,6 @@ def define_resnet_flags(resnet_size_choices=None):
       name='use_one_device_strategy', default=True,
       help=flags_core.help_wrap('Set to False to not use distribution '
                                 'strategies.'))
-  flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
   flags.DEFINE_boolean(name='use_tf_momentum_optimizer', default=False,
                        help='Use tf MomentumOptimizer.')
@@ -22,7 +22,7 @@ import tensorflow as tf


 def get_distribution_strategy(
-    num_gpus, all_reduce_alg=None, use_one_device_strategy):
+    num_gpus, all_reduce_alg=None, use_one_device_strategy=True):
   """Return a DistributionStrategy for running the model.

   Args:
@@ -47,7 +47,7 @@ def get_distribution_strategy(
   elif num_gpus == 1:
     return None
   elif use_one_device_strategy:
-    rase ValueError("When %d GPUs are specified, use_one_device_strategy"
+    raise ValueError("When %d GPUs are specified, use_one_device_strategy"
                     " flag cannot be set to True.".format(num_gpus))
   else:  # num_gpus > 1 and not use_one_device_strategy
     if all_reduce_alg:
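Two clean-ups worth noting here: the old signature placed the non-default `use_one_device_strategy` after the defaulted `all_reduce_alg`, which is itself a `SyntaxError`, and `rase` becomes `raise`. One bug survives the fix, though: the message mixes a `%d` placeholder with `str.format`, so `num_gpus` is never interpolated. A corrected form would look like this (a suggestion, not part of the commit):

raise ValueError(
    'When {} GPUs are specified, the use_one_device_strategy '
    'flag cannot be set to True.'.format(num_gpus))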