Add multi-GPU to MNIST, take two (#3199)

Add multi-GPU flag to MNIST and allow for setting of replicated optimizer and model_fn

Add multi-GPU to MNIST, take two (#3199)
Add multi-GPU flag to MNIST and allow for setting of replicated optimizer and model_fn
e8726907 · Karmel Allison · GitHub · 3cb798fe · e8726907 · e8726907
Unverified Commit e8726907 authored Jan 22, 2018 by Karmel Allison Committed by GitHub Jan 22, 2018
Show whitespace changes
Inline Side-by-side

Showing with 95 additions and 36 deletions

official/mnist/mnist.py official/mnist/mnist.py +89 -34

official/mnist/mnist_test.py official/mnist/mnist_test.py +6 -2

No files found.
--- a/official/mnist/mnist.py
+++ b/official/mnist/mnist.py
@@ -96,6 +96,11 @@ def model_fn(features, labels, mode, params):
        })
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
+
+    # If we are running multi-GPU, we need to wrap the optimizer.
+    if params.get('multi_gpu'):
+      optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
+
    logits = model(image, training=True)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
    accuracy = tf.metrics.accuracy(
@@ -122,16 +127,53 @@ def model_fn(features, labels, mode, params):
        })


+def validate_batch_size_for_multi_gpu(batch_size):
+  """For multi-gpu, batch-size must be a multiple of the number of
+  available GPUs.
+
+  Note that this should eventually be handled by replicate_model_fn
+  directly. Multi-GPU support is currently experimental, however,
+  so doing the work here until that feature is in place.
+  """
+  from tensorflow.python.client import device_lib
+
+  local_device_protos = device_lib.list_local_devices()
+  num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
+  if not num_gpus:
+    raise ValueError('Multi-GPU mode was specified, but no GPUs '
+      'were found. To use CPU, run without --multi_gpu.')
+    
+  remainder = batch_size % num_gpus
+  if remainder:
+    err = ('When running with multiple GPUs, batch size '
+      'must be a multiple of the number of available GPUs. '
+      'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
+      ).format(num_gpus, batch_size, batch_size - remainder)
+    raise ValueError(err)
+
+
 def main(unused_argv):
+  model_function = model_fn
+
+  if FLAGS.multi_gpu:
+    validate_batch_size_for_multi_gpu(FLAGS.batch_size)
+
+    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
+    # and (2) wrap the optimizer. The first happens here, and (2) happens
+    # in the model_fn itself when the optimizer is defined.
+    model_function = tf.contrib.estimator.replicate_model_fn(
+        model_fn, loss_reduction=tf.losses.Reduction.MEAN)
+
  data_format = FLAGS.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  mnist_classifier = tf.estimator.Estimator(
-      model_fn=model_fn,
+      model_fn=model_function,
      model_dir=FLAGS.model_dir,
      params={
-          'data_format': data_format
+          'data_format': data_format,
+          'multi_gpu': FLAGS.multi_gpu
      })

  # Train the model
@@ -169,39 +211,52 @@ def main(unused_argv):
    mnist_classifier.export_savedmodel(FLAGS.export_dir, input_fn)


-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
+class MNISTArgParser(argparse.ArgumentParser):
+
+  def __init__(self):
+    super(MNISTArgParser, self).__init__()
+
+    self.add_argument(
+        '--multi_gpu', action='store_true',
+        help='If set, run across all available GPUs.')
+    self.add_argument(
        '--batch_size',
        type=int,
        default=100,
        help='Number of images to process in a batch')
-  parser.add_argument(
+    self.add_argument(
        '--data_dir',
        type=str,
        default='/tmp/mnist_data',
        help='Path to directory containing the MNIST dataset')
-  parser.add_argument(
+    self.add_argument(
        '--model_dir',
        type=str,
        default='/tmp/mnist_model',
        help='The directory where the model will be stored.')
-  parser.add_argument(
-      '--train_epochs', type=int, default=40, help='Number of epochs to train.')
-  parser.add_argument(
+    self.add_argument(
+        '--train_epochs',
+        type=int,
+        default=40,
+        help='Number of epochs to train.')
+    self.add_argument(
        '--data_format',
        type=str,
        default=None,
        choices=['channels_first', 'channels_last'],
-      help='A flag to override the data format used in the model. channels_first '
-      'provides a performance boost on GPU but is not always compatible '
-      'with CPU. If left unspecified, the data format will be chosen '
-      'automatically based on whether TensorFlow was built for CPU or GPU.')
-  parser.add_argument(
+        help='A flag to override the data format used in the model. '
+        'channels_first provides a performance boost on GPU but is not always '
+        'compatible with CPU. If left unspecified, the data format will be '
+        'chosen automatically based on whether TensorFlow was built for CPU or '
+        'GPU.')
+    self.add_argument(
        '--export_dir',
        type=str,
        help='The directory where the exported SavedModel will be stored.')

+
+if __name__ == '__main__':
+  parser = MNISTArgParser()
  tf.logging.set_verbosity(tf.logging.INFO)
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
--- a/official/mnist/mnist_test.py
+++ b/official/mnist/mnist_test.py
@@ -62,11 +62,12 @@ class Tests(tf.test.TestCase):
      self.assertEqual(predictions['probabilities'].shape, (10,))
      self.assertEqual(predictions['classes'].shape, ())

-  def mnist_model_fn_helper(self, mode):
+  def mnist_model_fn_helper(self, mode, multi_gpu=False):
    features, labels = dummy_input_fn()
    image_count = features.shape[0]
    spec = mnist.model_fn(features, labels, mode, {
-        'data_format': 'channels_last'
+        'data_format': 'channels_last',
+        'multi_gpu': multi_gpu
    })

    if mode == tf.estimator.ModeKeys.PREDICT:
@@ -91,6 +92,9 @@ class Tests(tf.test.TestCase):
  def test_mnist_model_fn_train_mode(self):
    self.mnist_model_fn_helper(tf.estimator.ModeKeys.TRAIN)

+  def test_mnist_model_fn_train_mode_multi_gpu(self):
+    self.mnist_model_fn_helper(tf.estimator.ModeKeys.TRAIN, multi_gpu=True)
+
  def test_mnist_model_fn_eval_mode(self):
    self.mnist_model_fn_helper(tf.estimator.ModeKeys.EVAL)