"git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "0e38297f87f7f6af1a52962955d20244bf8365cb"
Unverified commit b99cba09, authored by cclauss, committed by GitHub

Merge branch 'master' into fix-five-undefined-names

parents 020efa74 fc37f117
# TensorFlow Official Models
The TensorFlow official models are a collection of example models that use TensorFlow's high-level APIs. They are intended to be well-maintained, tested, and kept up to date with the latest TensorFlow API. They should also be reasonably optimized for fast performance while still being easy to read.

The master branch of the models is **in development**, and it targets the [nightly binaries](https://github.com/tensorflow/tensorflow#installation) built from the [master branch of TensorFlow](https://github.com/tensorflow/tensorflow/tree/master).

**Stable versions** of the official models targeting releases of TensorFlow are available as tagged branches or [downloadable releases](https://github.com/tensorflow/models/releases). Model repository version numbers match the target TensorFlow release, such that [branch r1.4.0](https://github.com/tensorflow/models/tree/r1.4.0) and [release v1.4.0](https://github.com/tensorflow/models/releases/tag/v1.4.0) are compatible with [TensorFlow v1.4.0](https://github.com/tensorflow/tensorflow/releases/tag/v1.4.0).

Currently the models are compatible with TensorFlow 1.4. If you are on an earlier version, please [update your installation](https://www.tensorflow.org/install/).
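To confirm which TensorFlow you have before running the models, you can print the installed version string from Python (a minimal check; `tf.__version__` is the standard version attribute):

```python
import tensorflow as tf

# Prints the installed version, e.g. '1.4.0'. Anything earlier than 1.4
# should be upgraded per the installation link above.
print(tf.__version__)
```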
---
@@ -96,6 +96,11 @@ def model_fn(features, labels, mode, params):
       })
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
+
+    # If we are running multi-GPU, we need to wrap the optimizer.
+    if params.get('multi_gpu'):
+      optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
+
     logits = model(image, training=True)
     loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
     accuracy = tf.metrics.accuracy(
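The wrapped optimizer only does something useful once the model function itself is wrapped by `tf.contrib.estimator.replicate_model_fn`, which happens later in `main` (next hunk). A condensed sketch of how the two contrib pieces fit together; the one-layer model here is invented for illustration, and these `tf.contrib` APIs were later removed in TF 2.x:

```python
import tensorflow as tf

def toy_model_fn(features, labels, mode, params):
  # Stand-in for the real MNIST network: one dense layer over flat features.
  # (TRAIN mode only, to keep the sketch short.)
  logits = tf.layers.dense(features, 10)
  loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
  optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
  if params.get('multi_gpu'):
    # TowerOptimizer defers gradient application so that replicate_model_fn
    # can aggregate gradients across the per-GPU towers.
    optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
  train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step())
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

# replicate_model_fn clones toy_model_fn onto each visible GPU and
# averages the per-tower losses.
replicated = tf.contrib.estimator.replicate_model_fn(
    toy_model_fn, loss_reduction=tf.losses.Reduction.MEAN)
```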
@@ -122,16 +127,53 @@ def model_fn(features, labels, mode, params):
       })
+
+
+def validate_batch_size_for_multi_gpu(batch_size):
+  """For multi-gpu, batch-size must be a multiple of the number of
+  available GPUs.
+
+  Note that this should eventually be handled by replicate_model_fn
+  directly. Multi-GPU support is currently experimental, however,
+  so doing the work here until that feature is in place.
+  """
+  from tensorflow.python.client import device_lib
+
+  local_device_protos = device_lib.list_local_devices()
+  num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
+  if not num_gpus:
+    raise ValueError('Multi-GPU mode was specified, but no GPUs '
+                     'were found. To use CPU, run without --multi_gpu.')
+
+  remainder = batch_size % num_gpus
+  if remainder:
+    err = ('When running with multiple GPUs, batch size '
+           'must be a multiple of the number of available GPUs. '
+           'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
+           ).format(num_gpus, batch_size, batch_size - remainder)
+    raise ValueError(err)


 def main(unused_argv):
+  model_function = model_fn
+
+  if FLAGS.multi_gpu:
+    validate_batch_size_for_multi_gpu(FLAGS.batch_size)
+
+    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
+    # and (2) wrap the optimizer. The first happens here, and (2) happens
+    # in the model_fn itself when the optimizer is defined.
+    model_function = tf.contrib.estimator.replicate_model_fn(
+        model_fn, loss_reduction=tf.losses.Reduction.MEAN)
+
   data_format = FLAGS.data_format
   if data_format is None:
     data_format = ('channels_first'
                    if tf.test.is_built_with_cuda() else 'channels_last')
   mnist_classifier = tf.estimator.Estimator(
-      model_fn=model_fn,
+      model_fn=model_function,
       model_dir=FLAGS.model_dir,
       params={
-          'data_format': data_format
+          'data_format': data_format,
+          'multi_gpu': FLAGS.multi_gpu
       })

   # Train the model
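To make the modulo check in `validate_batch_size_for_multi_gpu` concrete, here is a worked example with an assumed GPU count (3 GPUs is purely illustrative):

```python
num_gpus = 3                         # assumed for illustration
batch_size = 100                     # the script's default
remainder = batch_size % num_gpus    # 100 % 3 == 1, so the check fails
suggested = batch_size - remainder   # 99, the nearest multiple of 3 below 100
# The resulting ValueError would therefore suggest --batch_size=99.
```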
@@ -169,39 +211,52 @@ def main(unused_argv):
   mnist_classifier.export_savedmodel(FLAGS.export_dir, input_fn)

-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--batch_size',
-      type=int,
-      default=100,
-      help='Number of images to process in a batch')
-  parser.add_argument(
-      '--data_dir',
-      type=str,
-      default='/tmp/mnist_data',
-      help='Path to directory containing the MNIST dataset')
-  parser.add_argument(
-      '--model_dir',
-      type=str,
-      default='/tmp/mnist_model',
-      help='The directory where the model will be stored.')
-  parser.add_argument(
-      '--train_epochs', type=int, default=40, help='Number of epochs to train.')
-  parser.add_argument(
-      '--data_format',
-      type=str,
-      default=None,
-      choices=['channels_first', 'channels_last'],
-      help='A flag to override the data format used in the model. channels_first '
-      'provides a performance boost on GPU but is not always compatible '
-      'with CPU. If left unspecified, the data format will be chosen '
-      'automatically based on whether TensorFlow was built for CPU or GPU.')
-  parser.add_argument(
-      '--export_dir',
-      type=str,
-      help='The directory where the exported SavedModel will be stored.')
+class MNISTArgParser(argparse.ArgumentParser):
+
+  def __init__(self):
+    super(MNISTArgParser, self).__init__()
+
+    self.add_argument(
+        '--multi_gpu', action='store_true',
+        help='If set, run across all available GPUs.')
+    self.add_argument(
+        '--batch_size',
+        type=int,
+        default=100,
+        help='Number of images to process in a batch')
+    self.add_argument(
+        '--data_dir',
+        type=str,
+        default='/tmp/mnist_data',
+        help='Path to directory containing the MNIST dataset')
+    self.add_argument(
+        '--model_dir',
+        type=str,
+        default='/tmp/mnist_model',
+        help='The directory where the model will be stored.')
+    self.add_argument(
+        '--train_epochs',
+        type=int,
+        default=40,
+        help='Number of epochs to train.')
+    self.add_argument(
+        '--data_format',
+        type=str,
+        default=None,
+        choices=['channels_first', 'channels_last'],
+        help='A flag to override the data format used in the model. '
+        'channels_first provides a performance boost on GPU but is not always '
+        'compatible with CPU. If left unspecified, the data format will be '
+        'chosen automatically based on whether TensorFlow was built for CPU or '
+        'GPU.')
+    self.add_argument(
+        '--export_dir',
+        type=str,
+        help='The directory where the exported SavedModel will be stored.')
+
+
+if __name__ == '__main__':
+  parser = MNISTArgParser()
+  tf.logging.set_verbosity(tf.logging.INFO)
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
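`parse_known_args` is what lets the script tolerate flags it does not define: known flags land on `FLAGS`, and everything else is forwarded untouched to `tf.app.run` via `argv`. A self-contained illustration using a hypothetical stray flag:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--multi_gpu', action='store_true')
parser.add_argument('--batch_size', type=int, default=100)

# '--some_other_flag' stands in for any flag this parser doesn't know about.
flags, unparsed = parser.parse_known_args(['--multi_gpu', '--some_other_flag'])
assert flags.multi_gpu is True
assert flags.batch_size == 100
assert unparsed == ['--some_other_flag']  # passed through, not an error
```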
@@ -62,11 +62,12 @@ class Tests(tf.test.TestCase):
     self.assertEqual(predictions['probabilities'].shape, (10,))
     self.assertEqual(predictions['classes'].shape, ())

-  def mnist_model_fn_helper(self, mode):
+  def mnist_model_fn_helper(self, mode, multi_gpu=False):
     features, labels = dummy_input_fn()
     image_count = features.shape[0]
     spec = mnist.model_fn(features, labels, mode, {
-        'data_format': 'channels_last'
+        'data_format': 'channels_last',
+        'multi_gpu': multi_gpu
     })

     if mode == tf.estimator.ModeKeys.PREDICT:
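`dummy_input_fn` is defined elsewhere in the test file and is not shown in this diff; a plausible minimal version (an assumption for illustration only) would just fabricate a small batch of flat images with one-hot labels:

```python
import tensorflow as tf

def dummy_input_fn():
  # Hypothetical stand-in: 100 fake 28*28 images and random one-hot labels.
  image = tf.random_uniform([100, 784])
  labels = tf.one_hot(
      tf.random_uniform([100], maxval=10, dtype=tf.int32), depth=10)
  return image, labels
```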
@@ -91,6 +92,9 @@ class Tests(tf.test.TestCase):
   def test_mnist_model_fn_train_mode(self):
     self.mnist_model_fn_helper(tf.estimator.ModeKeys.TRAIN)

+  def test_mnist_model_fn_train_mode_multi_gpu(self):
+    self.mnist_model_fn_helper(tf.estimator.ModeKeys.TRAIN, multi_gpu=True)
+
   def test_mnist_model_fn_eval_mode(self):
     self.mnist_model_fn_helper(tf.estimator.ModeKeys.EVAL)
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 #
 # DO NOT MODIFY THIS FILE. Add tests to be executed in test_models.sh
 # Usage: docker_test.sh [--docker-image <DOCKER_IMG_NAME>]
 #
@@ -22,7 +22,7 @@
 # --docker-image flag), the default latest tensorflow docker
 # will be used.
 #
 # The script obeys the following required environment variables unless superseded by
 # the docker image flag:
 # PYTHON_VERSION: (PYTHON2 | PYTHON3)
@@ -35,9 +35,9 @@ EXIT=0
 export WORKSPACE=${PWD}

 if [ "$PYTHON_VERSION" = "PYTHON3" ]; then
-  DOCKER_IMG_NAME="tensorflow/tensorflow:1.4.0-py3"
+  DOCKER_IMG_NAME="tensorflow/tensorflow:nightly-py3"
 else
-  DOCKER_IMG_NAME="tensorflow/tensorflow:1.4.0"
+  DOCKER_IMG_NAME="tensorflow/tensorflow:nightly"
   if [ "$PYTHON_VERSION" != "PYTHON2" ]; then
     echo "WARNING: Python version was not specified. Using Python2 by default."
     sleep 5
@@ -56,6 +56,9 @@ fi
 # Specify which test is to be run
 COMMAND="./official/testing/test_models.sh"

+# Check the recency of the desired image
+${DOCKER_BINARY} pull ${DOCKER_IMG_NAME}
+
 # RUN
 ${DOCKER_BINARY} run \
     -v ${WORKSPACE}:/workspace \
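The added `docker pull` keeps the `nightly` tag fresh: unlike a pinned release tag, `nightly` is re-pushed regularly, so a locally cached copy can silently go stale. A rough Python rendering of the two commands, just to make the flow concrete (the mount path is hypothetical, and the real script passes additional flags):

```python
import subprocess

IMAGE = "tensorflow/tensorflow:nightly"

# Re-pull so a locally cached 'nightly' tag is updated to the latest build.
subprocess.check_call(["docker", "pull", IMAGE])

# Mount the models checkout into the container and run the test driver.
subprocess.check_call([
    "docker", "run",
    "-v", "/path/to/models:/workspace",  # hypothetical host path
    IMAGE,
    "./official/testing/test_models.sh",
])
```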