Commit afa7954d authored by Marianne Linhares Monteiro, committed by GitHub

Refactoring and adding sync mode

parent 74ecc048
@@ -36,7 +36,6 @@ import os
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
-from tensorflow.contrib.learn.python.learn import learn_runner  # run the experiment
 import cifar10
 import cifar10_model
@@ -45,13 +44,13 @@ tf.logging.set_verbosity(tf.logging.INFO)
 FLAGS = tf.flags.FLAGS
 
-tf.flags.DEFINE_string('data_dir', 'cifar10',
+tf.flags.DEFINE_string('data_dir', '',
                        'The directory where the CIFAR-10 input data is stored.')
 
-tf.flags.DEFINE_string('model_dir', 'output2_2',
+tf.flags.DEFINE_string('model_dir', '',
                        'The directory where the model will be stored.')
 
-tf.flags.DEFINE_boolean('is_cpu_ps', True,
+tf.flags.DEFINE_boolean('is_cpu_ps', False,
                         'If using CPU as the parameter server.')
 
 tf.flags.DEFINE_integer('num_gpus', 1,
@@ -59,12 +58,12 @@ tf.flags.DEFINE_integer('num_gpus', 1,
 tf.flags.DEFINE_integer('num_layers', 44, 'The number of layers of the model.')
 
-tf.flags.DEFINE_integer('train_batch_size', 1024, 'Batch size for training.')
-
-tf.flags.DEFINE_integer('train_steps', (50000.0/FLAGS.train_batch_size) * 40,
-                        'The number of steps to use for training.')  # 40 epochs
-
-tf.flags.DEFINE_integer('eval_batch_size', 200, 'Batch size for validation.')
+tf.flags.DEFINE_integer('train_steps', 10000,
+                        'The number of steps to use for training.')
+
+tf.flags.DEFINE_integer('train_batch_size', 128, 'Batch size for training.')
+
+tf.flags.DEFINE_integer('eval_batch_size', 100, 'Batch size for validation.')
 
 tf.flags.DEFINE_float('momentum', 0.9, 'Momentum for MomentumOptimizer.')
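
Note on the new defaults: the old train_steps default derived the step count from the batch size, (50000.0 / train_batch_size) * 40, i.e. roughly 40 epochs, while the new default fixes train_steps at 10000. A quick sketch of the arithmetic, assuming the standard 50,000-image CIFAR-10 training set:

train_steps = 10000
train_batch_size = 128
num_train_examples = 50000  # CIFAR-10 training set size

# Approximate number of passes over the training data with the new defaults.
epochs = train_steps * train_batch_size / float(num_train_examples)
print(epochs)  # 25.6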
@@ -73,6 +72,18 @@ tf.flags.DEFINE_float('weight_decay', 1e-4, 'Weight decay for convolutions.')
 tf.flags.DEFINE_boolean('use_distortion_for_training', True,
                         'If doing image distortion for training.')
 
+tf.flags.DEFINE_boolean('run_experiment', False,
+                        'If True, will run an experiment; '
+                        'otherwise will run training and evaluation '
+                        'using the estimator interface.')
+
+tf.flags.DEFINE_boolean('sync', False,
+                        'If True, when running in a distributed environment '
+                        'the workers will run in sync mode.')
+
+tf.flags.DEFINE_integer('num_workers', 1,
+                        'Number of workers.')
+
 # Perf flags
 tf.flags.DEFINE_integer('num_intra_threads', 1,
                         """Number of threads to use for intra-op parallelism.
@@ -138,6 +149,7 @@ def _create_device_setter(is_cpu_ps, worker):
   gpus = ['/gpu:%d' % i for i in range(FLAGS.num_gpus)]
   return ParamServerDeviceSetter(worker, gpus)
 
+
 def _resnet_model_fn(features, labels, mode):
   """Resnet model body.
@@ -220,19 +232,26 @@ def _resnet_model_fn(features, labels, mode):
         for x in np.array([82, 123, 300], dtype=np.int64)
     ]
     staged_lr = [0.1, 0.01, 0.001, 0.0002]
 
-    learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
+    global_step = tf.train.get_global_step()
+    learning_rate = tf.train.piecewise_constant(global_step,
                                                 boundaries, staged_lr)
 
     # Create a nicely-named tensor for logging
     learning_rate = tf.identity(learning_rate, name='learning_rate')
 
-    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
-                                           momentum=momentum)
+    optimizer = tf.train.MomentumOptimizer(
+        learning_rate=learning_rate, momentum=momentum)
+
+    chief_hooks = []
+    if FLAGS.sync:
+      optimizer = tf.train.SyncReplicasOptimizer(
+          optimizer,
+          replicas_to_aggregate=FLAGS.num_workers)
+      sync_replicas_hook = optimizer.make_session_run_hook(True)
+      chief_hooks.append(sync_replicas_hook)
 
     # Create single grouped train op
     train_op = [
         optimizer.apply_gradients(
-            gradvars, global_step=global_step)
+            gradvars, global_step=tf.train.get_global_step())
     ]
     train_op.extend(update_ops)
     train_op = tf.group(*train_op)
@@ -253,6 +272,7 @@ def _resnet_model_fn(features, labels, mode):
         predictions=predictions,
         loss=loss,
         train_op=train_op,
+        training_chief_hooks=chief_hooks,
         eval_metric_ops=metrics)
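
For readers unfamiliar with the sync-mode pieces added above, here is a minimal, self-contained sketch (not this file's _resnet_model_fn) of how the TF 1.x API fits together: wrap the base optimizer in tf.train.SyncReplicasOptimizer, create its session-run hook on the chief, and hand the hook to the EstimatorSpec through training_chief_hooks. The toy_model_fn name, the linear model and the params keys below are placeholders, not code from this change.

import tensorflow as tf

def toy_model_fn(features, labels, mode, params):
  # Placeholder model standing in for the ResNet towers.
  logits = tf.layers.dense(features['x'], 10)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

  optimizer = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
  chief_hooks = []
  if params.get('sync'):
    # Wait for gradients from `num_workers` replicas before applying them.
    optimizer = tf.train.SyncReplicasOptimizer(
        optimizer, replicas_to_aggregate=params['num_workers'])
    # The chief runs the hook that sets up the sync queues and token pool.
    chief_hooks.append(optimizer.make_session_run_hook(is_chief=True))

  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      training_chief_hooks=chief_hooks)

Note that in typical SyncReplicasOptimizer usage every worker registers the hook (with is_chief=False on non-chief workers); the change above registers it only on the chief.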
@@ -283,7 +303,6 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
   tower_loss = tf.losses.sparse_softmax_cross_entropy(
       logits=logits, labels=label)
   tower_loss = tf.reduce_mean(tower_loss)
-  tower_losses.append(tower_loss)
 
   model_params = tf.trainable_variables()
   tower_loss += weight_decay * tf.add_n(
@@ -303,43 +322,16 @@ def input_fn(subset, num_shards):
   Returns:
     two lists of tensors for features and labels, each of num_shards length.
   """
-  dataset = cifar10.Cifar10DataSet(FLAGS.data_dir)
-  is_training = (subset == 'train')
-  if is_training:
+  if subset == 'train':
     batch_size = FLAGS.train_batch_size
-  else:
+  elif subset == 'validate' or subset == 'eval':
     batch_size = FLAGS.eval_batch_size
-  with tf.device('/cpu:0'), tf.name_scope('batching'):
-    # CPU loads all data from disk since there're only 60k 32*32 RGB images.
-    all_images, all_labels = dataset.read_all_data(subset)
-    dataset = tf.contrib.data.Dataset.from_tensor_slices(
-        (all_images, all_labels))
-    dataset = dataset.map(
-        lambda x, y: (tf.cast(x, tf.float32), tf.cast(y, tf.int32)),
-        num_threads=2,
-        output_buffer_size=batch_size)
-
-    # Image preprocessing.
-    def _preprocess(image, label):
-      # If GPU is available, NHWC to NCHW transpose is done in ResNetCifar10
-      # class, not included in preprocessing.
-      return cifar10.Cifar10DataSet.preprocess(
-          image, is_training, FLAGS.use_distortion_for_training), label
-    dataset = dataset.map(
-        _preprocess, num_threads=batch_size, output_buffer_size=2 * batch_size)
-
-    # Repeat infinitely.
-    dataset = dataset.repeat()
-
-    if is_training:
-      min_fraction_of_examples_in_queue = 0.4
-      min_queue_examples = int(
-          cifar10.Cifar10DataSet.num_examples_per_epoch(subset) *
-          min_fraction_of_examples_in_queue)
-      # Ensure that the capacity is sufficiently large to provide good random
-      # shuffling
-      dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)
-
-    dataset = dataset.batch(batch_size)
-    iterator = dataset.make_one_shot_iterator()
-    image_batch, label_batch = iterator.get_next()
+  else:
+    raise ValueError('Subset must be one of \'train\', \'validate\' and \'eval\'')
+
+  with tf.device('/cpu:0'):
+    use_distortion = subset == 'train' and FLAGS.use_distortion_for_training
+    dataset = cifar10.Cifar10DataSet(FLAGS.data_dir, subset, use_distortion)
+    image_batch, label_batch = dataset.make_batch(batch_size)
 
   if num_shards <= 1:
     # No GPU available or only 1 GPU.
     return [image_batch], [label_batch]
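
The rewritten input_fn delegates reading, preprocessing, shuffling and batching to cifar10.Cifar10DataSet, whose changes are not part of this file. Purely for illustration, a hypothetical sketch of what a make_batch() method with that interface could look like, built on the same tf.contrib.data API the removed code used; the TFRecord layout, feature names and preprocessing below are assumptions, not the actual cifar10.py:

import tensorflow as tf

class Cifar10DataSet(object):
  """Hypothetical sketch of a dataset class exposing make_batch()."""

  def __init__(self, data_dir, subset='train', use_distortion=True):
    self.data_dir = data_dir
    self.subset = subset
    self.use_distortion = use_distortion

  def make_batch(self, batch_size):
    # Assumed layout: one TFRecord file per subset, e.g. train.tfrecords.
    filenames = ['%s/%s.tfrecords' % (self.data_dir, self.subset)]
    dataset = tf.contrib.data.TFRecordDataset(filenames).repeat()

    def _parse(serialized):
      features = tf.parse_single_example(
          serialized,
          features={'image': tf.FixedLenFeature([], tf.string),
                    'label': tf.FixedLenFeature([], tf.int64)})
      image = tf.decode_raw(features['image'], tf.uint8)
      image = tf.cast(tf.reshape(image, [32, 32, 3]), tf.float32)
      label = tf.cast(features['label'], tf.int32)
      if self.subset == 'train' and self.use_distortion:
        image = tf.image.random_flip_left_right(image)
      return image, label

    dataset = dataset.map(_parse, num_threads=batch_size,
                          output_buffer_size=2 * batch_size)
    if self.subset == 'train':
      dataset = dataset.shuffle(buffer_size=4 * batch_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()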
@@ -360,22 +352,27 @@ def input_fn(subset, num_shards):
   label_shards = [tf.parallel_stack(x) for x in label_shards]
   return feature_shards, label_shards
 
-def create_experiment_fn(train_input, test_input, hooks):
+# create experiment
+def get_experiment_fn(train_input_fn, eval_input_fn, train_steps, eval_steps,
+                      train_hooks):
   def _experiment_fn(run_config, hparams):
-    estimator = tf.estimator.Estimator(model_fn=_resnet_model_fn,
-                                       config=run_config,
-                                       model_dir=FLAGS.model_dir)
+    del hparams  # unused arg
+    # create estimator
+    classifier = tf.estimator.Estimator(model_fn=_resnet_model_fn,
+                                        config=run_config)
 
     experiment = tf.contrib.learn.Experiment(
-        estimator,
-        train_input_fn=train_input,
-        eval_input_fn=test_input,
-        train_steps=FLAGS.train_steps)
+        classifier,
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        train_steps=train_steps,
+        eval_steps=eval_steps)
 
-    experiment.extend_train_hooks(hooks)
+    # adding hooks to estimator on training mode
+    experiment.extend_train_hooks(train_hooks)
     return experiment
+
   return _experiment_fn
 
 
 def main(unused_argv):
   # The env variable is on deprecation path, default is set to off.
   os.environ['TF_SYNC_ON_FINISH'] = '0'
@@ -397,26 +394,52 @@ def main(unused_argv):
   if num_eval_examples % FLAGS.eval_batch_size != 0:
     raise ValueError('validation set size must be multiple of eval_batch_size')
 
-  config = tf.contrib.learn.RunConfig(model_dir=FLAGS.model_dir)
+  train_input_fn = functools.partial(input_fn, subset='train',
+                                     num_shards=FLAGS.num_gpus)
+  eval_input_fn = functools.partial(input_fn, subset='eval',
+                                    num_shards=FLAGS.num_gpus)
+
+  train_steps = FLAGS.train_steps
+  eval_steps = num_eval_examples // FLAGS.eval_batch_size
 
+  # session configuration
   sess_config = tf.ConfigProto()
   sess_config.allow_soft_placement = True
   sess_config.log_device_placement = FLAGS.log_device_placement
   sess_config.intra_op_parallelism_threads = FLAGS.num_intra_threads
   sess_config.inter_op_parallelism_threads = FLAGS.num_inter_threads
   sess_config.gpu_options.force_gpu_compatible = FLAGS.force_gpu_compatible
-  config = config.replace(session_config=sess_config)
-
-  train_input = functools.partial(input_fn, subset='train', num_shards=FLAGS.num_gpus)
-  test_input = functools.partial(input_fn, subset='eval', num_shards=FLAGS.num_gpus)
 
+  # log learning_rate
   tensors_to_log = {'learning_rate': 'learning_rate'}
   logging_hook = tf.train.LoggingTensorHook(
       tensors=tensors_to_log, every_n_iter=100)
-  hooks = [logging_hook]
 
-  # run experiment
-  learn_runner.run(create_experiment_fn(train_input, test_input, hooks), run_config=config)
+  if FLAGS.run_experiment:
+    config = tf.contrib.learn.RunConfig(model_dir=FLAGS.model_dir)
+    config = config.replace(session_config=sess_config)
+
+    tf.contrib.learn.learn_runner.run(
+        get_experiment_fn(train_input_fn, eval_input_fn,
+                          train_steps, eval_steps,
+                          [logging_hook]), run_config=config)
+  else:
+    config = tf.estimator.RunConfig()
+    config = config.replace(session_config=sess_config)
+    classifier = tf.estimator.Estimator(
+        model_fn=_resnet_model_fn, model_dir=FLAGS.model_dir, config=config)
+
+    print('Starting to train...')
+    classifier.train(input_fn=train_input_fn,
+                     steps=train_steps,
+                     hooks=[logging_hook])
+
+    print('Starting to evaluate...')
+    eval_results = classifier.evaluate(
+        input_fn=eval_input_fn,
+        steps=eval_steps)
+    print(eval_results)
 
 
 if __name__ == '__main__':
   tf.app.run()
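
The new --sync and --num_workers flags only matter in a distributed run, where each process learns its role from the TF_CONFIG environment variable that RunConfig parses. A minimal sketch of the environment for the first worker of a small cluster; host names, ports and the cluster shape are placeholders:

import json
import os

# Placeholder cluster: one parameter server, one master (chief) and two workers.
cluster = {
    'ps': ['ps0.example.com:2222'],
    'master': ['master0.example.com:2222'],
    'worker': ['worker0.example.com:2222', 'worker1.example.com:2222'],
}

# Role of this particular process; set before tf.app.run() is invoked.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster,
    'task': {'type': 'worker', 'index': 0},
})

With --sync enabled, --num_workers would normally be set to the number of gradient-contributing replicas so that replicas_to_aggregate matches the cluster.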