Sync to latest master.

b668f594 · pkulzc · d5fc3ef0 · 32aa6563 · b668f594 · b668f594
Commit b668f594 authored Apr 13, 2018 by pkulzc
20 changed files
--- a/CODEOWNERS
+++ b/CODEOWNERS
-/official/ @nealwu @k-w-w @karmel
+* @tensorflow/tf-garden-team
+/official/ @tensorflow/tf-garden-team @karmel
 /research/adversarial_crypto/ @dave-andersen
 /research/adversarial_text/ @rsepassi @a-dai
 /research/adv_imagenet_models/ @AlexeyKurakin
@@ -38,6 +39,7 @@
 /research/swivel/ @waterson
 /research/syntaxnet/ @calberti @andorardo @bogatyy @markomernick
 /research/tcn/ @coreylynch @sermanet
+/research/tensorrt/ @karmel
 /research/textsum/ @panyx0718 @peterjliu
 /research/transformer/ @daviddao
 /research/video_prediction/ @cbfinn

--- a/official/mnist/dataset.py
+++ b/official/mnist/dataset.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import gzip
 import os
 import shutil
+import tempfile

 import numpy as np
 from six.moves import urllib
@@ -67,10 +68,11 @@ def download(directory, filename):
    tf.gfile.MakeDirs(directory)
  # CVDF mirror of http://yann.lecun.com/exdb/mnist/
  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'
-  zipped_filepath = filepath + '.gz'
+  _, zipped_filepath = tempfile.mkstemp(suffix='.gz')
  print('Downloading %s to %s' % (url, zipped_filepath))
  urllib.request.urlretrieve(url, zipped_filepath)
-  with gzip.open(zipped_filepath, 'rb') as f_in, open(filepath, 'wb') as f_out:
+  with gzip.open(zipped_filepath, 'rb') as f_in, \
+      tf.gfile.Open(filepath, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
  os.remove(zipped_filepath)
  return filepath

--- a/official/mnist/mnist.py
+++ b/official/mnist/mnist.py
@@ -25,11 +25,12 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.mnist import dataset
 from official.utils.arg_parsers import parsers
 from official.utils.logs import hooks_helper
+from official.utils.misc import model_helpers

 LEARNING_RATE = 1e-4


-class Model(tf.keras.Model):
+def create_model(data_format):
  """Model to recognize digits in the MNIST dataset.

  Network structure is equivalent to:
@@ -37,60 +38,55 @@ class Model(tf.keras.Model):
  and
  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py

-  But written as a tf.keras.Model using the tf.layers API.
-  """
+  But uses the tf.keras API.

-  def __init__(self, data_format):
-    """Creates a model for classifying a hand-written digit.
-
-    Args:
-      data_format: Either 'channels_first' or 'channels_last'.
-        'channels_first' is typically faster on GPUs while 'channels_last' is
-        typically faster on CPUs. See
-        https://www.tensorflow.org/performance/performance_guide#data_formats
-    """
-    super(Model, self).__init__()
-    if data_format == 'channels_first':
-      self._input_shape = [-1, 1, 28, 28]
-    else:
-      assert data_format == 'channels_last'
-      self._input_shape = [-1, 28, 28, 1]
-
-    self.conv1 = tf.layers.Conv2D(
-        32, 5, padding='same', data_format=data_format, activation=tf.nn.relu)
-    self.conv2 = tf.layers.Conv2D(
-        64, 5, padding='same', data_format=data_format, activation=tf.nn.relu)
-    self.fc1 = tf.layers.Dense(1024, activation=tf.nn.relu)
-    self.fc2 = tf.layers.Dense(10)
-    self.dropout = tf.layers.Dropout(0.4)
-    self.max_pool2d = tf.layers.MaxPooling2D(
-        (2, 2), (2, 2), padding='same', data_format=data_format)
-
-  def __call__(self, inputs, training):
-    """Add operations to classify a batch of input images.
-
-    Args:
-      inputs: A Tensor representing a batch of input images.
-      training: A boolean. Set to True to add operations required only when
-        training the classifier.
-
-    Returns:
-      A logits Tensor with shape [<batch_size>, 10].
-    """
-    y = tf.reshape(inputs, self._input_shape)
-    y = self.conv1(y)
-    y = self.max_pool2d(y)
-    y = self.conv2(y)
-    y = self.max_pool2d(y)
-    y = tf.layers.flatten(y)
-    y = self.fc1(y)
-    y = self.dropout(y, training=training)
-    return self.fc2(y)
+  Args:
+    data_format: Either 'channels_first' or 'channels_last'. 'channels_first' is
+      typically faster on GPUs while 'channels_last' is typically faster on
+      CPUs. See
+      https://www.tensorflow.org/performance/performance_guide#data_formats
+
+  Returns:
+    A tf.keras.Model.
+  """
+  if data_format == 'channels_first':
+    input_shape = [1, 28, 28]
+  else:
+    assert data_format == 'channels_last'
+    input_shape = [28, 28, 1]
+
+  l = tf.keras.layers
+  max_pool = l.MaxPooling2D(
+      (2, 2), (2, 2), padding='same', data_format=data_format)
+  # The model consists of a sequential chain of layers, so tf.keras.Sequential
+  # (a subclass of tf.keras.Model) makes for a compact description.
+  return tf.keras.Sequential(
+      [
+          l.Reshape(input_shape),
+          l.Conv2D(
+              32,
+              5,
+              padding='same',
+              data_format=data_format,
+              activation=tf.nn.relu),
+          max_pool,
+          l.Conv2D(
+              64,
+              5,
+              padding='same',
+              data_format=data_format,
+              activation=tf.nn.relu),
+          max_pool,
+          l.Flatten(),
+          l.Dense(1024, activation=tf.nn.relu),
+          l.Dropout(0.4),
+          l.Dense(10)
+      ])


 def model_fn(features, labels, mode, params):
  """The model_fn argument for creating an Estimator."""
-  model = Model(params['data_format'])
+  model = create_model(params['data_format'])
  image = features
  if isinstance(image, dict):
    image = features['image']
@@ -140,8 +136,7 @@ def model_fn(features, labels, mode, params):
        eval_metric_ops={
            'accuracy':
                tf.metrics.accuracy(
-                    labels=labels,
-                    predictions=tf.argmax(logits, axis=1)),
+                    labels=labels, predictions=tf.argmax(logits, axis=1)),
        })


@@ -231,6 +226,10 @@ def main(argv):
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('\nEvaluation results:\n\t%s\n' % eval_results)

+    if model_helpers.past_stop_threshold(flags.stop_threshold,
+                                         eval_results['accuracy']):
+      break
+
  # Export the model
  if flags.export_dir is not None:
    image = tf.placeholder(tf.float32, [None, 28, 28])
@@ -245,7 +244,7 @@ class MNISTArgParser(argparse.ArgumentParser):

  def __init__(self):
    super(MNISTArgParser, self).__init__(parents=[
-        parsers.BaseParser(),
+        parsers.BaseParser(multi_gpu=True, num_gpu=False),
        parsers.ImageModelParser(),
        parsers.ExportParser(),
    ])

--- a/official/mnist/mnist_eager.py
+++ b/official/mnist/mnist_eager.py
@@ -116,7 +116,7 @@ def main(argv):
  test_ds = mnist_dataset.test(flags.data_dir).batch(flags.batch_size)

  # Create the model and optimizer
-  model = mnist.Model(data_format)
+  model = mnist.create_model(data_format)
  optimizer = tf.train.MomentumOptimizer(flags.lr, flags.momentum)

  # Create file writers for writing TensorBoard summaries.
@@ -164,8 +164,7 @@ class MNISTEagerArgParser(argparse.ArgumentParser):

  def __init__(self):
    super(MNISTEagerArgParser, self).__init__(parents=[
-        parsers.BaseParser(
-            epochs_between_evals=False, multi_gpu=False, hooks=False),
+        parsers.EagerParser(),
        parsers.ImageModelParser()])

    self.add_argument(

--- a/official/mnist/mnist_eager_test.py
+++ b/official/mnist/mnist_eager_test.py
@@ -40,7 +40,7 @@ def random_dataset():


 def train(defun=False):
-  model = mnist.Model(data_format())
+  model = mnist.create_model(data_format())
  if defun:
    model.call = tfe.defun(model.call)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
@@ -51,7 +51,7 @@ def train(defun=False):


 def evaluate(defun=False):
-  model = mnist.Model(data_format())
+  model = mnist.create_model(data_format())
  dataset = random_dataset()
  if defun:
    model.call = tfe.defun(model.call)

--- a/official/mnist/mnist_tpu.py
+++ b/official/mnist/mnist_tpu.py
@@ -86,7 +86,7 @@ def model_fn(features, labels, mode, params):
  if isinstance(image, dict):
    image = features["image"]

-  model = mnist.Model("channels_last")
+  model = mnist.create_model("channels_last")
  logits = model(image, training=(mode == tf.estimator.ModeKeys.TRAIN))
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)


--- a/official/resnet/README.md
+++ b/official/resnet/README.md
@@ -55,9 +55,17 @@ You can download 190 MB pre-trained versions of ResNet-50 achieving 76.3% and 75

 Other versions and formats:

-* [ResNet-v2-ImageNet Checkpoint](http://download.tensorflow.org/models/official/resnetv2_imagenet_checkpoint.tar.gz)
-* [ResNet-v2-ImageNet SavedModel](http://download.tensorflow.org/models/official/resnetv2_imagenet_savedmodel.tar.gz)
-* [ResNet-v2-ImageNet Frozen Graph](http://download.tensorflow.org/models/official/resnetv2_imagenet_frozen_graph.pb)
-* [ResNet-v1-ImageNet Checkpoint](http://download.tensorflow.org/models/official/resnetv1_imagenet_checkpoint.tar.gz)
-* [ResNet-v1-ImageNet SavedModel](http://download.tensorflow.org/models/official/resnetv1_imagenet_savedmodel.tar.gz)
-* [ResNet-v1-ImageNet Frozen Graph](http://download.tensorflow.org/models/official/resnetv1_imagenet_frozen_graph.pb)
+* [ResNet-v2-ImageNet Checkpoint](http://download.tensorflow.org/models/official/resnet_v2_imagenet_checkpoint.tar.gz)
+* [ResNet-v2-ImageNet SavedModel](http://download.tensorflow.org/models/official/resnet_v2_imagenet_savedmodel.tar.gz)
+* [ResNet-v1-ImageNet Checkpoint](http://download.tensorflow.org/models/official/resnet_v1_imagenet_checkpoint.tar.gz)
+* [ResNet-v1-ImageNet SavedModel](http://download.tensorflow.org/models/official/resnet_v1_imagenet_savedmodel.tar.gz)
+
+## Compute Devices
+Training is accomplished using the DistributionStrategies API. (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/distribute/README.md)
+
+The appropriate distribution strategy is chosen based on the `--num_gpus` flag. By default this flag is one if TensorFlow is compiled with CUDA, and zero otherwise.
+
+num_gpus:
+ 0:  Use OneDeviceStrategy and train on CPU.
+ 1:  Use OneDeviceStrategy and train on GPU.
+ 2+: Use MirroredStrategy (data parallelism) to distribute a batch between devices.
--- a/official/resnet/cifar10_main.py
+++ b/official/resnet/cifar10_main.py
@@ -103,8 +103,7 @@ def preprocess_image(image, is_training):
  return image


-def input_fn(is_training, data_dir, batch_size, num_epochs=1,
-             num_parallel_calls=1, multi_gpu=False):
+def input_fn(is_training, data_dir, batch_size, num_epochs=1):
  """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

  Args:
@@ -112,12 +111,6 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
-    num_parallel_calls: The number of records that are processed in parallel.
-      This can be optimized per data set but for generally homogeneous data
-      sets, should be approximately the number of available CPU cores.
-    multi_gpu: Whether this is run multi-GPU. Note that this is only required
-      currently to handle the batch leftovers, and can be removed
-      when that is handled directly by Estimator.

  Returns:
    A dataset that can be used for iteration.
@@ -125,12 +118,10 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
  filenames = get_filenames(is_training, data_dir)
  dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)

-  num_images = is_training and _NUM_IMAGES['train'] or _NUM_IMAGES['validation']
-
  return resnet_run_loop.process_record_dataset(
      dataset, is_training, batch_size, _NUM_IMAGES['train'],
-      parse_record, num_epochs, num_parallel_calls,
-      examples_per_epoch=num_images, multi_gpu=multi_gpu)
+      parse_record, num_epochs,
+  )


 def get_synth_input_fn():
@@ -145,7 +136,8 @@ class Cifar10Model(resnet_model.Model):
  """Model class with appropriate defaults for CIFAR-10 data."""

  def __init__(self, resnet_size, data_format=None, num_classes=_NUM_CLASSES,
-               version=resnet_model.DEFAULT_VERSION):
+               version=resnet_model.DEFAULT_VERSION,
+               dtype=resnet_model.DEFAULT_DTYPE):
    """These are the parameters that work for CIFAR-10 data.

    Args:
@@ -156,6 +148,7 @@ class Cifar10Model(resnet_model.Model):
        enables users to extend the same model to their own datasets.
      version: Integer representing which version of the ResNet network to use.
        See README for details. Valid values: [1, 2]
+      dtype: The TensorFlow dtype to use for calculations.

    Raises:
      ValueError: if invalid resnet_size is chosen
@@ -180,7 +173,9 @@ class Cifar10Model(resnet_model.Model):
        block_strides=[1, 2, 2],
        final_size=64,
        version=version,
-        data_format=data_format)
+        data_format=data_format,
+        dtype=dtype
+    )


 def cifar10_model_fn(features, labels, mode, params):
@@ -204,15 +199,21 @@ def cifar10_model_fn(features, labels, mode, params):
  def loss_filter_fn(_):
    return True

-  return resnet_run_loop.resnet_model_fn(features, labels, mode, Cifar10Model,
-                                         resnet_size=params['resnet_size'],
-                                         weight_decay=weight_decay,
-                                         learning_rate_fn=learning_rate_fn,
-                                         momentum=0.9,
-                                         data_format=params['data_format'],
-                                         version=params['version'],
-                                         loss_filter_fn=loss_filter_fn,
-                                         multi_gpu=params['multi_gpu'])
+  return resnet_run_loop.resnet_model_fn(
+      features=features,
+      labels=labels,
+      mode=mode,
+      model_class=Cifar10Model,
+      resnet_size=params['resnet_size'],
+      weight_decay=weight_decay,
+      learning_rate_fn=learning_rate_fn,
+      momentum=0.9,
+      data_format=params['data_format'],
+      version=params['version'],
+      loss_scale=params['loss_scale'],
+      loss_filter_fn=loss_filter_fn,
+      dtype=params['dtype']
+  )


 def main(argv):

--- a/official/resnet/cifar10_test.py
+++ b/official/resnet/cifar10_test.py
@@ -71,18 +71,19 @@ class BaseTest(tf.test.TestCase):
        for pixel in row:
          self.assertAllClose(pixel, np.array([-1.225, 0., 1.225]), rtol=1e-3)

-  def cifar10_model_fn_helper(self, mode, version, multi_gpu=False):
+  def cifar10_model_fn_helper(self, mode, version, dtype):
    input_fn = cifar10_main.get_synth_input_fn()
    dataset = input_fn(True, '', _BATCH_SIZE)
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    spec = cifar10_main.cifar10_model_fn(
        features, labels, mode, {
+            'dtype': dtype,
            'resnet_size': 32,
            'data_format': 'channels_last',
            'batch_size': _BATCH_SIZE,
            'version': version,
-            'multi_gpu': multi_gpu
+            'loss_scale': 128 if dtype == tf.float16 else 1,
        })

    predictions = spec.predictions
@@ -105,44 +106,45 @@ class BaseTest(tf.test.TestCase):
      self.assertEqual(eval_metric_ops['accuracy'][1].dtype, tf.float32)

  def test_cifar10_model_fn_train_mode_v1(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=1)
-
-  def test_cifar10_model_fn_trainmode__v2(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=2)
-
-  def test_cifar10_model_fn_train_mode_multi_gpu_v1(self):
    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=1,
-                                 multi_gpu=True)
+                                 dtype=tf.float32)

-  def test_cifar10_model_fn_train_mode_multi_gpu_v2(self):
+  def test_cifar10_model_fn_trainmode__v2(self):
    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=2,
-                                 multi_gpu=True)
+                                 dtype=tf.float32)

  def test_cifar10_model_fn_eval_mode_v1(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=1)
+    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=1,
+                                 dtype=tf.float32)

  def test_cifar10_model_fn_eval_mode_v2(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=2)
+    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=2,
+                                 dtype=tf.float32)

  def test_cifar10_model_fn_predict_mode_v1(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=1)
+    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=1,
+                                 dtype=tf.float32)

  def test_cifar10_model_fn_predict_mode_v2(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=2)
+    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=2,
+                                 dtype=tf.float32)

-  def test_cifar10model_shape(self):
+  def _test_cifar10model_shape(self, version):
    batch_size = 135
    num_classes = 246

-    for version in (1, 2):
-      model = cifar10_main.Cifar10Model(
-          32, data_format='channels_last', num_classes=num_classes,
-          version=version)
-      fake_input = tf.random_uniform(
-          [batch_size, _HEIGHT, _WIDTH, _NUM_CHANNELS])
-      output = model(fake_input, training=True)
+    model = cifar10_main.Cifar10Model(32, data_format='channels_last',
+                                      num_classes=num_classes, version=version)
+    fake_input = tf.random_uniform([batch_size, _HEIGHT, _WIDTH, _NUM_CHANNELS])
+    output = model(fake_input, training=True)
+
+    self.assertAllEqual(output.shape, (batch_size, num_classes))
+
+  def test_cifar10model_shape_v1(self):
+    self._test_cifar10model_shape(version=1)

-      self.assertAllEqual(output.shape, (batch_size, num_classes))
+  def test_cifar10model_shape_v2(self):
+    self._test_cifar10model_shape(version=2)

  def test_cifar10_end_to_end_synthetic_v1(self):
    integration.run_synthetic(

--- a/official/resnet/imagenet_main.py
+++ b/official/resnet/imagenet_main.py
@@ -154,8 +154,7 @@ def parse_record(raw_record, is_training):
  return image, label


-def input_fn(is_training, data_dir, batch_size, num_epochs=1,
-             num_parallel_calls=1, multi_gpu=False):
+def input_fn(is_training, data_dir, batch_size, num_epochs=1):
  """Input function which provides batches for train or eval.

  Args:
@@ -163,12 +162,6 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
-    num_parallel_calls: The number of records that are processed in parallel.
-      This can be optimized per data set but for generally homogeneous data
-      sets, should be approximately the number of available CPU cores.
-    multi_gpu: Whether this is run multi-GPU. Note that this is only required
-      currently to handle the batch leftovers, and can be removed
-      when that is handled directly by Estimator.

  Returns:
    A dataset that can be used for iteration.
@@ -180,15 +173,13 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
    # Shuffle the input files
    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

-  num_images = is_training and _NUM_IMAGES['train'] or _NUM_IMAGES['validation']
-
  # Convert to individual records
  dataset = dataset.flat_map(tf.data.TFRecordDataset)

  return resnet_run_loop.process_record_dataset(
      dataset, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
-      num_epochs, num_parallel_calls, examples_per_epoch=num_images,
-      multi_gpu=multi_gpu)
+      num_epochs
+  )


 def get_synth_input_fn():
@@ -203,7 +194,8 @@ class ImagenetModel(resnet_model.Model):
  """Model class with appropriate defaults for Imagenet data."""

  def __init__(self, resnet_size, data_format=None, num_classes=_NUM_CLASSES,
-               version=resnet_model.DEFAULT_VERSION):
+               version=resnet_model.DEFAULT_VERSION,
+               dtype=resnet_model.DEFAULT_DTYPE):
    """These are the parameters that work for Imagenet data.

    Args:
@@ -214,6 +206,7 @@ class ImagenetModel(resnet_model.Model):
        enables users to extend the same model to their own datasets.
      version: Integer representing which version of the ResNet network to use.
        See README for details. Valid values: [1, 2]
+      dtype: The TensorFlow dtype to use for calculations.
    """

    # For bigger models, we want to use "bottleneck" layers
@@ -239,7 +232,9 @@ class ImagenetModel(resnet_model.Model):
        block_strides=[1, 2, 2, 2],
        final_size=final_size,
        version=version,
-        data_format=data_format)
+        data_format=data_format,
+        dtype=dtype
+    )


 def _get_block_sizes(resnet_size):
@@ -283,15 +278,21 @@ def imagenet_model_fn(features, labels, mode, params):
      num_images=_NUM_IMAGES['train'], boundary_epochs=[30, 60, 80, 90],
      decay_rates=[1, 0.1, 0.01, 0.001, 1e-4])

-  return resnet_run_loop.resnet_model_fn(features, labels, mode, ImagenetModel,
-                                         resnet_size=params['resnet_size'],
-                                         weight_decay=1e-4,
-                                         learning_rate_fn=learning_rate_fn,
-                                         momentum=0.9,
-                                         data_format=params['data_format'],
-                                         version=params['version'],
-                                         loss_filter_fn=None,
-                                         multi_gpu=params['multi_gpu'])
+  return resnet_run_loop.resnet_model_fn(
+      features=features,
+      labels=labels,
+      mode=mode,
+      model_class=ImagenetModel,
+      resnet_size=params['resnet_size'],
+      weight_decay=1e-4,
+      learning_rate_fn=learning_rate_fn,
+      momentum=0.9,
+      data_format=params['data_format'],
+      version=params['version'],
+      loss_scale=params['loss_scale'],
+      loss_filter_fn=None,
+      dtype=params['dtype']
+  )


 def main(argv):

--- a/official/resnet/imagenet_test.py
+++ b/official/resnet/imagenet_test.py
@@ -36,7 +36,7 @@ class BaseTest(tf.test.TestCase):
    super(BaseTest, self).tearDown()
    tf.gfile.DeleteRecursively(self.get_temp_dir())

-  def tensor_shapes_helper(self, resnet_size, version, with_gpu=False):
+  def _tensor_shapes_helper(self, resnet_size, version, dtype, with_gpu):
    """Checks the tensor shapes after each phase of the ResNet model."""
    def reshape(shape):
      """Returns the expected dimensions depending on if a GPU is being used."""
@@ -50,22 +50,24 @@ class BaseTest(tf.test.TestCase):
    graph = tf.Graph()

    with graph.as_default(), self.test_session(
-        use_gpu=with_gpu, force_gpu=with_gpu):
+        graph=graph, use_gpu=with_gpu, force_gpu=with_gpu):
      model = imagenet_main.ImagenetModel(
-          resnet_size,
+          resnet_size=resnet_size,
          data_format='channels_first' if with_gpu else 'channels_last',
-          version=version)
+          version=version,
+          dtype=dtype
+      )
      inputs = tf.random_uniform([1, 224, 224, 3])
      output = model(inputs, training=True)

-      initial_conv = graph.get_tensor_by_name('initial_conv:0')
-      max_pool = graph.get_tensor_by_name('initial_max_pool:0')
-      block_layer1 = graph.get_tensor_by_name('block_layer1:0')
-      block_layer2 = graph.get_tensor_by_name('block_layer2:0')
-      block_layer3 = graph.get_tensor_by_name('block_layer3:0')
-      block_layer4 = graph.get_tensor_by_name('block_layer4:0')
-      reduce_mean = graph.get_tensor_by_name('final_reduce_mean:0')
-      dense = graph.get_tensor_by_name('final_dense:0')
+      initial_conv = graph.get_tensor_by_name('resnet_model/initial_conv:0')
+      max_pool = graph.get_tensor_by_name('resnet_model/initial_max_pool:0')
+      block_layer1 = graph.get_tensor_by_name('resnet_model/block_layer1:0')
+      block_layer2 = graph.get_tensor_by_name('resnet_model/block_layer2:0')
+      block_layer3 = graph.get_tensor_by_name('resnet_model/block_layer3:0')
+      block_layer4 = graph.get_tensor_by_name('resnet_model/block_layer4:0')
+      reduce_mean = graph.get_tensor_by_name('resnet_model/final_reduce_mean:0')
+      dense = graph.get_tensor_by_name('resnet_model/final_dense:0')

      self.assertAllEqual(initial_conv.shape, reshape((1, 64, 112, 112)))
      self.assertAllEqual(max_pool.shape, reshape((1, 64, 56, 56)))
@@ -88,6 +90,12 @@ class BaseTest(tf.test.TestCase):
      self.assertAllEqual(dense.shape, (1, _LABEL_CLASSES))
      self.assertAllEqual(output.shape, (1, _LABEL_CLASSES))

+  def tensor_shapes_helper(self, resnet_size, version, with_gpu=False):
+    self._tensor_shapes_helper(resnet_size=resnet_size, version=version,
+                               dtype=tf.float32, with_gpu=with_gpu)
+    self._tensor_shapes_helper(resnet_size=resnet_size, version=version,
+                               dtype=tf.float16, with_gpu=with_gpu)
+
  def test_tensor_shapes_resnet_18_v1(self):
    self.tensor_shapes_helper(18, version=1)

@@ -172,7 +180,7 @@ class BaseTest(tf.test.TestCase):
  def test_tensor_shapes_resnet_200_with_gpu_v2(self):
    self.tensor_shapes_helper(200, version=2, with_gpu=True)

-  def resnet_model_fn_helper(self, mode, version, multi_gpu=False):
+  def resnet_model_fn_helper(self, mode, version, dtype):
    """Tests that the EstimatorSpec is given the appropriate arguments."""
    tf.train.create_global_step()

@@ -182,11 +190,12 @@ class BaseTest(tf.test.TestCase):
    features, labels = iterator.get_next()
    spec = imagenet_main.imagenet_model_fn(
        features, labels, mode, {
+            'dtype': dtype,
            'resnet_size': 50,
            'data_format': 'channels_last',
            'batch_size': _BATCH_SIZE,
            'version': version,
-            'multi_gpu': multi_gpu,
+            'loss_scale': 128 if dtype == tf.float16 else 1,
        })

    predictions = spec.predictions
@@ -209,43 +218,47 @@ class BaseTest(tf.test.TestCase):
      self.assertEqual(eval_metric_ops['accuracy'][1].dtype, tf.float32)

  def test_resnet_model_fn_train_mode_v1(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=1)
-
-  def test_resnet_model_fn_train_mode_v2(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=2)
-
-  def test_resnet_model_fn_train_mode_multi_gpu_v1(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=1,
-                                multi_gpu=True)
+                                dtype=tf.float32)

-  def test_resnet_model_fn_train_mode_multi_gpu_v2(self):
+  def test_resnet_model_fn_train_mode_v2(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, version=2,
-                                multi_gpu=True)
+                                dtype=tf.float32)

  def test_resnet_model_fn_eval_mode_v1(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=1)
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=1,
+                                dtype=tf.float32)

  def test_resnet_model_fn_eval_mode_v2(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=2)
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, version=2,
+                                dtype=tf.float32)

  def test_resnet_model_fn_predict_mode_v1(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=1)
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=1,
+                                dtype=tf.float32)

  def test_resnet_model_fn_predict_mode_v2(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=2)
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, version=2,
+                                dtype=tf.float32)

-  def test_imagenetmodel_shape(self):
+  def _test_imagenetmodel_shape(self, version):
    batch_size = 135
    num_classes = 246

-    for version in (1, 2):
-      model = imagenet_main.ImagenetModel(
-          50, data_format='channels_last', num_classes=num_classes,
-          version=version)
-      fake_input = tf.random_uniform([batch_size, 224, 224, 3])
-      output = model(fake_input, training=True)
+    model = imagenet_main.ImagenetModel(
+        50, data_format='channels_last', num_classes=num_classes,
+        version=version)
+
+    fake_input = tf.random_uniform([batch_size, 224, 224, 3])
+    output = model(fake_input, training=True)

-      self.assertAllEqual(output.shape, (batch_size, num_classes))
+    self.assertAllEqual(output.shape, (batch_size, num_classes))
+
+  def test_imagenetmodel_shape_v1(self):
+    self._test_imagenetmodel_shape(version=1)
+
+  def test_imagenetmodel_shape_v2(self):
+    self._test_imagenetmodel_shape(version=2)

  def test_imagenet_end_to_end_synthetic_v1(self):
    integration.run_synthetic(
@@ -283,5 +296,6 @@ class BaseTest(tf.test.TestCase):
        extra_flags=['-v', '2', '-rs', '200']
    )

+
 if __name__ == '__main__':
  tf.test.main()
--- a/official/resnet/resnet_model.py
+++ b/official/resnet/resnet_model.py
@@ -36,6 +36,9 @@ import tensorflow as tf
 _BATCH_NORM_DECAY = 0.997
 _BATCH_NORM_EPSILON = 1e-5
 DEFAULT_VERSION = 2
+DEFAULT_DTYPE = tf.float32
+CASTABLE_TYPES = (tf.float16,)
+ALLOWED_TYPES = (DEFAULT_DTYPE,) + CASTABLE_TYPES


 ################################################################################
@@ -351,7 +354,8 @@ class Model(object):
               kernel_size,
               conv_stride, first_pool_size, first_pool_stride,
               second_pool_size, second_pool_stride, block_sizes, block_strides,
-               final_size, version=DEFAULT_VERSION, data_format=None):
+               final_size, version=DEFAULT_VERSION, data_format=None,
+               dtype=DEFAULT_DTYPE):
    """Creates a model for classifying an image.

    Args:
@@ -379,6 +383,8 @@ class Model(object):
        See README for details. Valid values: [1, 2]
      data_format: Input format ('channels_last', 'channels_first', or None).
        If set to None, the format is dependent on whether a GPU is available.
+      dtype: The TensorFlow dtype to use for calculations. If not specified
+        tf.float32 is used.

    Raises:
      ValueError: if invalid version is selected.
@@ -406,6 +412,9 @@ class Model(object):
      else:
        self.block_fn = _building_block_v2

+    if dtype not in ALLOWED_TYPES:
+      raise ValueError('dtype must be one of: {}'.format(ALLOWED_TYPES))
+
    self.data_format = data_format
    self.num_classes = num_classes
    self.num_filters = num_filters
@@ -418,6 +427,61 @@ class Model(object):
    self.block_sizes = block_sizes
    self.block_strides = block_strides
    self.final_size = final_size
+    self.dtype = dtype
+
+  def _custom_dtype_getter(self, getter, name, shape=None, dtype=DEFAULT_DTYPE,
+                           *args, **kwargs):
+    """Creates variables in fp32, then casts to fp16 if necessary.
+
+    This function is a custom getter. A custom getter is a function with the
+    same signature as tf.get_variable, except it has an additional getter
+    parameter. Custom getters can be passed as the `custom_getter` parameter of
+    tf.variable_scope. Then, tf.get_variable will call the custom getter,
+    instead of directly getting a variable itself. This can be used to change
+    the types of variables that are retrieved with tf.get_variable.
+    The `getter` parameter is the underlying variable getter, that would have
+    been called if no custom getter was used. Custom getters typically get a
+    variable with `getter`, then modify it in some way.
+
+    This custom getter will create an fp32 variable. If a low precision
+    (e.g. float16) variable was requested it will then cast the variable to the
+    requested dtype. The reason we do not directly create variables in low
+    precision dtypes is that applying small gradients to such variables may
+    cause the variable not to change.
+
+    Args:
+      getter: The underlying variable getter, that has the same signature as
+        tf.get_variable and returns a variable.
+      name: The name of the variable to get.
+      shape: The shape of the variable to get.
+      dtype: The dtype of the variable to get. Note that if this is a low
+        precision dtype, the variable will be created as a tf.float32 variable,
+        then cast to the appropriate dtype
+      *args: Additional arguments to pass unmodified to getter.
+      **kwargs: Additional keyword arguments to pass unmodified to getter.
+
+    Returns:
+      A variable which is cast to fp16 if necessary.
+    """
+
+    if dtype in CASTABLE_TYPES:
+      var = getter(name, shape, tf.float32, *args, **kwargs)
+      return tf.cast(var, dtype=dtype, name=name + '_cast')
+    else:
+      return getter(name, shape, dtype, *args, **kwargs)
+
+  def _model_variable_scope(self):
+    """Returns a variable scope that the model should be created under.
+
+    If self.dtype is a castable type, model variable will be created in fp32
+    then cast to self.dtype before being used.
+
+    Returns:
+      A variable scope for the model.
+    """
+
+    return tf.variable_scope('resnet_model',
+                             custom_getter=self._custom_dtype_getter)

  def __call__(self, inputs, training):
    """Add operations to classify a batch of input images.
@@ -431,46 +495,46 @@ class Model(object):
      A logits Tensor with shape [<batch_size>, self.num_classes].
    """

-    if self.data_format == 'channels_first':
-      # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
-      # This provides a large performance boost on GPU. See
-      # https://www.tensorflow.org/performance/performance_guide#data_formats
-      inputs = tf.transpose(inputs, [0, 3, 1, 2])
-
-    inputs = conv2d_fixed_padding(
-        inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
-        strides=self.conv_stride, data_format=self.data_format)
-    inputs = tf.identity(inputs, 'initial_conv')
-
-    if self.first_pool_size:
-      inputs = tf.layers.max_pooling2d(
-          inputs=inputs, pool_size=self.first_pool_size,
-          strides=self.first_pool_stride, padding='SAME',
-          data_format=self.data_format)
-      inputs = tf.identity(inputs, 'initial_max_pool')
-
-    for i, num_blocks in enumerate(self.block_sizes):
-      num_filters = self.num_filters * (2**i)
-      inputs = block_layer(
-          inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
-          block_fn=self.block_fn, blocks=num_blocks,
-          strides=self.block_strides[i], training=training,
-          name='block_layer{}'.format(i + 1), data_format=self.data_format)
-
-    inputs = batch_norm(inputs, training, self.data_format)
-    inputs = tf.nn.relu(inputs)
-
-    # The current top layer has shape
-    # `batch_size x pool_size x pool_size x final_size`.
-    # ResNet does an Average Pooling layer over pool_size,
-    # but that is the same as doing a reduce_mean. We do a reduce_mean
-    # here because it performs better than AveragePooling2D.
-    axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
-    inputs = tf.reduce_mean(inputs, axes, keepdims=True)
-    inputs = tf.identity(inputs, 'final_reduce_mean')
-
-    inputs = tf.reshape(inputs, [-1, self.final_size])
-    inputs = tf.layers.dense(inputs=inputs, units=self.num_classes)
-    inputs = tf.identity(inputs, 'final_dense')
-
-    return inputs
+    with self._model_variable_scope():
+      if self.data_format == 'channels_first':
+        # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
+        # This provides a large performance boost on GPU. See
+        # https://www.tensorflow.org/performance/performance_guide#data_formats
+        inputs = tf.transpose(inputs, [0, 3, 1, 2])
+
+      inputs = conv2d_fixed_padding(
+          inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
+          strides=self.conv_stride, data_format=self.data_format)
+      inputs = tf.identity(inputs, 'initial_conv')
+
+      if self.first_pool_size:
+        inputs = tf.layers.max_pooling2d(
+            inputs=inputs, pool_size=self.first_pool_size,
+            strides=self.first_pool_stride, padding='SAME',
+            data_format=self.data_format)
+        inputs = tf.identity(inputs, 'initial_max_pool')
+
+      for i, num_blocks in enumerate(self.block_sizes):
+        num_filters = self.num_filters * (2**i)
+        inputs = block_layer(
+            inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
+            block_fn=self.block_fn, blocks=num_blocks,
+            strides=self.block_strides[i], training=training,
+            name='block_layer{}'.format(i + 1), data_format=self.data_format)
+
+      inputs = batch_norm(inputs, training, self.data_format)
+      inputs = tf.nn.relu(inputs)
+
+      # The current top layer has shape
+      # `batch_size x pool_size x pool_size x final_size`.
+      # ResNet does an Average Pooling layer over pool_size,
+      # but that is the same as doing a reduce_mean. We do a reduce_mean
+      # here because it performs better than AveragePooling2D.
+      axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
+      inputs = tf.reduce_mean(inputs, axes, keepdims=True)
+      inputs = tf.identity(inputs, 'final_reduce_mean')
+
+      inputs = tf.reshape(inputs, [-1, self.final_size])
+      inputs = tf.layers.dense(inputs=inputs, units=self.num_classes)
+      inputs = tf.identity(inputs, 'final_dense')
+      return inputs
--- a/official/resnet/resnet_run_loop.py
+++ b/official/resnet/resnet_run_loop.py
@@ -33,14 +33,14 @@ from official.utils.arg_parsers import parsers
 from official.utils.export import export
 from official.utils.logs import hooks_helper
 from official.utils.logs import logger
+from official.utils.misc import model_helpers


 ################################################################################
 # Functions for input processing.
 ################################################################################
 def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer,
-                           parse_record_fn, num_epochs=1, num_parallel_calls=1,
-                           examples_per_epoch=0, multi_gpu=False):
+                           parse_record_fn, num_epochs=1):
  """Given a Dataset with raw records, return an iterator over the records.

  Args:
@@ -53,19 +53,11 @@ def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer,
    parse_record_fn: A function that takes a raw record and returns the
      corresponding (image, label) pair.
    num_epochs: The number of epochs to repeat the dataset.
-    num_parallel_calls: The number of records that are processed in parallel.
-      This can be optimized per data set but for generally homogeneous data
-      sets, should be approximately the number of available CPU cores.
-    examples_per_epoch: The number of examples in the current set that
-      are processed each epoch. Note that this is only used for multi-GPU mode,
-      and only to handle what will eventually be handled inside of Estimator.
-    multi_gpu: Whether this is run multi-GPU. Note that this is only required
-      currently to handle the batch leftovers (see below), and can be removed
-      when that is handled directly by Estimator.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """
+
  # We prefetch a batch at a time, This can help smooth out the time taken to
  # load input files as we go through shuffling and processing.
  dataset = dataset.prefetch(buffer_size=batch_size)
@@ -78,29 +70,22 @@ def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer,
  # dataset for the appropriate number of epochs.
  dataset = dataset.repeat(num_epochs)

-  # Currently, if we are using multiple GPUs, we can't pass in uneven batches.
-  # (For example, if we have 4 GPUs, the number of examples in each batch
-  # must be divisible by 4.) We already ensured this for the batch_size, but
-  # we have to additionally ensure that any "leftover" examples-- the remainder
-  # examples (total examples % batch_size) that get called a batch for the very
-  # last batch of an epoch-- do not raise an error when we try to split them
-  # over the GPUs. This will likely be handled by Estimator during replication
-  # in the future, but for now, we just drop the leftovers here.
-  if multi_gpu:
-    total_examples = num_epochs * examples_per_epoch
-    dataset = dataset.take(batch_size * (total_examples // batch_size))
-
-  # Parse the raw records into images and labels
-  dataset = dataset.map(lambda value: parse_record_fn(value, is_training),
-                        num_parallel_calls=num_parallel_calls)
-
-  dataset = dataset.batch(batch_size)
+  # Parse the raw records into images and labels. Testing has shown that setting
+  # num_parallel_batches > 1 produces no improvement in throughput, since
+  # batch_size is almost always much greater than the number of CPU cores.
+  dataset = dataset.apply(
+      tf.contrib.data.map_and_batch(
+          lambda value: parse_record_fn(value, is_training),
+          batch_size=batch_size,
+          num_parallel_batches=1))

  # Operations between the final prefetch and the get_next call to the iterator
  # will happen synchronously during run time. We prefetch here again to
  # background all of the above processing work and keep it out of the
-  # critical training path.
-  dataset = dataset.prefetch(1)
+  # critical training path. Setting buffer_size to tf.contrib.data.AUTOTUNE
+  # allows DistributionStrategies to adjust how many batches to fetch based
+  # on how many devices are present.
+  dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

  return dataset

@@ -122,7 +107,7 @@ def get_synth_input_fn(height, width, num_channels, num_classes):
    An input_fn that can be used in place of a real one to return a dataset
    that can be used for iteration.
  """
-  def input_fn(is_training, data_dir, batch_size, *args):  # pylint: disable=unused-argument
+  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):  # pylint: disable=unused-argument
    images = tf.zeros((batch_size, height, width, num_channels), tf.float32)
    labels = tf.zeros((batch_size, num_classes), tf.int32)
    return tf.data.Dataset.from_tensors((images, labels)).repeat()
@@ -170,7 +155,8 @@ def learning_rate_with_decay(

 def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
-                    data_format, version, loss_filter_fn=None, multi_gpu=False):
+                    data_format, version, loss_scale, loss_filter_fn=None,
+                    dtype=resnet_model.DEFAULT_DTYPE):
  """Shared functionality for different resnet model_fns.

  Initializes the ResnetModel representing the model layers
@@ -196,12 +182,13 @@ def resnet_model_fn(features, labels, mode, model_class,
      If set to None, the format is dependent on whether a GPU is available.
    version: Integer representing which version of the ResNet network to use.
      See README for details. Valid values: [1, 2]
+    loss_scale: The factor to scale the loss for numerical stability. A detailed
+      summary is present in the arg parser help text.
    loss_filter_fn: function that takes a string variable name and returns
      True if the var should be included in loss calculation, and False
      otherwise. If None, batch_normalization variables will be excluded
      from the loss.
-    multi_gpu: If True, wrap the optimizer in a TowerOptimizer suitable for
-      data-parallel distribution across multiple GPUs.
+    dtype: the TensorFlow dtype to use for calculations.

  Returns:
    EstimatorSpec parameterized according to the input params and the
@@ -211,9 +198,17 @@ def resnet_model_fn(features, labels, mode, model_class,
  # Generate a summary node for the images
  tf.summary.image('images', features, max_outputs=6)

-  model = model_class(resnet_size, data_format, version=version)
+  features = tf.cast(features, dtype)
+
+  model = model_class(resnet_size, data_format, version=version, dtype=dtype)
+
  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)

+  # This acts as a no-op if the logits are already in fp32 (provided logits are
+  # not a SparseTensor). If dtype is is low precision, logits must be cast to
+  # fp32 for numerical stability.
+  logits = tf.cast(logits, tf.float32)
+
  predictions = {
      'classes': tf.argmax(logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
@@ -244,7 +239,8 @@ def resnet_model_fn(features, labels, mode, model_class,

  # Add weight decay to the loss.
  l2_loss = weight_decay * tf.add_n(
-      [tf.nn.l2_loss(v) for v in tf.trainable_variables()
+      # loss is computed using fp32 for numerical stability.
+      [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()
       if loss_filter_fn(v.name)])
  tf.summary.scalar('l2_loss', l2_loss)
  loss = cross_entropy + l2_loss
@@ -260,19 +256,36 @@ def resnet_model_fn(features, labels, mode, model_class,

    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate,
-        momentum=momentum)
+        momentum=momentum
+    )

-    # If we are running multi-GPU, we need to wrap the optimizer.
-    if multi_gpu:
-      optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
+    if loss_scale != 1:
+      # When computing fp16 gradients, often intermediate tensor values are
+      # so small, they underflow to 0. To avoid this, we multiply the loss by
+      # loss_scale to make these tensor values loss_scale times bigger.
+      scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)
+
+      # Once the gradient computation is complete we can scale the gradients
+      # back to the correct scale before passing them to the optimizer.
+      unscaled_grad_vars = [(grad / loss_scale, var)
+                            for grad, var in scaled_grad_vars]
+      minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
+    else:
+      minimize_op = optimizer.minimize(loss, global_step)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    train_op = tf.group(optimizer.minimize(loss, global_step), update_ops)
+    train_op = tf.group(minimize_op, update_ops)
  else:
    train_op = None

-  accuracy = tf.metrics.accuracy(
-      tf.argmax(labels, axis=1), predictions['classes'])
+  if not tf.contrib.distribute.has_distribution_strategy():
+    accuracy = tf.metrics.accuracy(
+        tf.argmax(labels, axis=1), predictions['classes'])
+  else:
+    # Metrics are currently not compatible with distribution strategies during
+    # training. This does not affect the overall performance of the model.
+    accuracy = (tf.no_op(), tf.constant(0))
+
  metrics = {'accuracy': accuracy}

  # Create a tensor named train_accuracy for logging purposes
@@ -287,34 +300,35 @@ def resnet_model_fn(features, labels, mode, model_class,
      eval_metric_ops=metrics)


-def validate_batch_size_for_multi_gpu(batch_size):
+def per_device_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

-  Note that this should eventually be handled by replicate_model_fn
+  Note that this should eventually be handled by DistributionStrategies
  directly. Multi-GPU support is currently experimental, however,
  so doing the work here until that feature is in place.

  Args:
-    batch_size: the number of examples processed in each training batch.
+    batch_size: Global batch size to be divided among devices. This should be
+      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
+    num_gpus: How many GPUs are used with DistributionStrategies.
+
+  Returns:
+    Batch size per device.

  Raises:
-    ValueError: if no GPUs are found, or selected batch_size is invalid.
+    ValueError: if batch_size is not divisible by number of devices
  """
-  from tensorflow.python.client import device_lib  # pylint: disable=g-import-not-at-top
-
-  local_device_protos = device_lib.list_local_devices()
-  num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
-  if not num_gpus:
-    raise ValueError('Multi-GPU mode was specified, but no GPUs '
-                     'were found. To use CPU, run without --multi_gpu.')
+  if num_gpus <= 1:
+    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
-           'must be a multiple of the number of available GPUs. '
-           'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
+           'must be a multiple of the number of available GPUs. Found {} '
+           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
+  return int(batch_size / num_gpus)


 def resnet_main(flags, model_function, input_function, shape=None):
@@ -335,16 +349,6 @@ def resnet_main(flags, model_function, input_function, shape=None):
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

-  if flags.multi_gpu:
-    validate_batch_size_for_multi_gpu(flags.batch_size)
-
-    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
-    # and (2) wrap the optimizer. The first happens here, and (2) happens
-    # in the model_fn itself when the optimizer is defined.
-    model_function = tf.contrib.estimator.replicate_model_fn(
-        model_function,
-        loss_reduction=tf.losses.Reduction.MEAN)
-
  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
@@ -354,22 +358,32 @@ def resnet_main(flags, model_function, input_function, shape=None):
      intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
      allow_soft_placement=True)

-  # Set up a RunConfig to save checkpoint and set session config.
-  run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9,
-                                                session_config=session_config)
+  if flags.num_gpus == 0:
+    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
+  elif flags.num_gpus == 1:
+    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
+  else:
+    distribution = tf.contrib.distribute.MirroredStrategy(
+        num_gpus=flags.num_gpus
+    )
+
+  run_config = tf.estimator.RunConfig(train_distribute=distribution,
+                                      session_config=session_config)
+
  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags.model_dir, config=run_config,
      params={
          'resnet_size': flags.resnet_size,
          'data_format': flags.data_format,
          'batch_size': flags.batch_size,
-          'multi_gpu': flags.multi_gpu,
          'version': flags.version,
+          'loss_scale': flags.loss_scale,
+          'dtype': flags.dtype
      })

  if flags.benchmark_log_dir is not None:
    benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
-    benchmark_logger.log_run_info("resnet")
+    benchmark_logger.log_run_info('resnet')
  else:
    benchmark_logger = None

@@ -382,9 +396,12 @@ def resnet_main(flags, model_function, input_function, shape=None):
    print('Starting a training cycle.')

    def input_fn_train():
-      return input_function(True, flags.data_dir, flags.batch_size,
-                            flags.epochs_between_evals,
-                            flags.num_parallel_calls, flags.multi_gpu)
+      return input_function(
+          is_training=True,
+          data_dir=flags.data_dir,
+          batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus),
+          num_epochs=flags.epochs_between_evals,
+      )

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags.max_train_steps)
@@ -392,8 +409,12 @@ def resnet_main(flags, model_function, input_function, shape=None):
    print('Starting to evaluate.')
    # Evaluate the model and print results
    def input_fn_eval():
-      return input_function(False, flags.data_dir, flags.batch_size,
-                            1, flags.num_parallel_calls, flags.multi_gpu)
+      return input_function(
+          is_training=False,
+          data_dir=flags.data_dir,
+          batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus),
+          num_epochs=1,
+      )

    # flags.max_train_steps is generally associated with testing and profiling.
    # As a result it is frequently called with synthetic data, which will
@@ -408,32 +429,24 @@ def resnet_main(flags, model_function, input_function, shape=None):
    if benchmark_logger:
      benchmark_logger.log_estimator_evaluation_result(eval_results)

-  if flags.export_dir is not None:
-    warn_on_multi_gpu_export(flags.multi_gpu)
+    if model_helpers.past_stop_threshold(
+        flags.stop_threshold, eval_results['accuracy']):
+      break

+  if flags.export_dir is not None:
    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags.batch_size)
    classifier.export_savedmodel(flags.export_dir, input_receiver_fn)


-def warn_on_multi_gpu_export(multi_gpu=False):
-  """For the time being, multi-GPU mode does not play nicely with exporting."""
-  if multi_gpu:
-    tf.logging.warning(
-        'You are exporting a SavedModel while in multi-GPU mode. Note that '
-        'the resulting SavedModel will require the same GPUs be available.'
-        'If you wish to serve the SavedModel from a different device, '
-        'try exporting the SavedModel with multi-GPU mode turned off.')
-
-
 class ResnetArgParser(argparse.ArgumentParser):
  """Arguments for configuring and running a Resnet Model."""

  def __init__(self, resnet_size_choices=None):
    super(ResnetArgParser, self).__init__(parents=[
-        parsers.BaseParser(),
-        parsers.PerformanceParser(),
+        parsers.BaseParser(multi_gpu=False),
+        parsers.PerformanceParser(num_parallel_calls=False),
        parsers.ImageModelParser(),
        parsers.ExportParser(),
        parsers.BenchmarkParser(),
@@ -451,3 +464,12 @@ class ResnetArgParser(argparse.ArgumentParser):
        help='[default: %(default)s] The size of the ResNet model to use.',
        metavar='<RS>' if resnet_size_choices is None else None
    )
+
+  def parse_args(self, args=None, namespace=None):
+    args = super(ResnetArgParser, self).parse_args(
+        args=args, namespace=namespace)
+
+    # handle coupling between dtype and loss_scale
+    parsers.parse_dtype_info(args)
+
+    return args
--- a/official/utils/arg_parsers/parsers.py
+++ b/official/utils/arg_parsers/parsers.py
@@ -58,9 +58,37 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-
 import argparse

+import tensorflow as tf
+
+
+# Map string to (TensorFlow dtype, default loss scale)
+DTYPE_MAP = {
+    "fp16": (tf.float16, 128),
+    "fp32": (tf.float32, 1),
+}
+
+
+def parse_dtype_info(flags):
+  """Convert dtype string to tf dtype, and set loss_scale default as needed.
+
+  Args:
+    flags: namespace object returned by arg parser.
+
+  Raises:
+    ValueError: If an invalid dtype is provided.
+  """
+  if flags.dtype in (i[0] for i in DTYPE_MAP.values()):
+    return  # Make function idempotent
+
+  try:
+    flags.dtype, default_loss_scale = DTYPE_MAP[flags.dtype]
+  except KeyError:
+    raise ValueError("Invalid dtype: {}".format(flags.dtype))
+
+  flags.loss_scale = flags.loss_scale or default_loss_scale
+

 class BaseParser(argparse.ArgumentParser):
  """Parser to contain flags which will be nearly universal across models.
@@ -71,14 +99,18 @@ class BaseParser(argparse.ArgumentParser):
    model_dir: Create a flag for specifying the model file directory.
    train_epochs: Create a flag to specify the number of training epochs.
    epochs_between_evals: Create a flag to specify the frequency of testing.
-    batch_size: Create a flag to specify the batch size.
+    stop_threshold: Create a flag to specify a threshold accuracy or other
+      eval metric which should trigger the end of training.
+    batch_size: Create a flag to specify the global batch size.
    multi_gpu: Create a flag to allow the use of all available GPUs.
+    num_gpu: Create a flag to specify the number of GPUs used.
    hooks: Create a flag to specify hooks for logging.
  """

  def __init__(self, add_help=False, data_dir=True, model_dir=True,
-               train_epochs=True, epochs_between_evals=True, batch_size=True,
-               multi_gpu=True, hooks=True):
+               train_epochs=True, epochs_between_evals=True,
+               stop_threshold=True, batch_size=True,
+               multi_gpu=False, num_gpu=True, hooks=True):
    super(BaseParser, self).__init__(add_help=add_help)

    if data_dir:
@@ -111,19 +143,42 @@ class BaseParser(argparse.ArgumentParser):
          metavar="<EBE>"
      )

+    if stop_threshold:
+      self.add_argument(
+          "--stop_threshold", "-st", type=float, default=None,
+          help="[default: %(default)s] If passed, training will stop at "
+          "the earlier of train_epochs and when the evaluation metric is "
+          "greater than or equal to stop_threshold.",
+          metavar="<ST>"
+      )
+
    if batch_size:
      self.add_argument(
          "--batch_size", "-bs", type=int, default=32,
-          help="[default: %(default)s] Batch size for training and evaluation.",
+          help="[default: %(default)s] Global batch size for training and "
+               "evaluation.",
          metavar="<BS>"
      )

+    assert not (multi_gpu and num_gpu)
+
    if multi_gpu:
      self.add_argument(
          "--multi_gpu", action="store_true",
          help="If set, run across all available GPUs."
      )

+    if num_gpu:
+      self.add_argument(
+          "--num_gpus", "-ng",
+          type=int,
+          default=1 if tf.test.is_built_with_cuda() else 0,
+          help="[default: %(default)s] How many GPUs to use with the "
+               "DistributionStrategies API. The default is 1 if TensorFlow was"
+               "built with CUDA, and 0 otherwise.",
+          metavar="<NG>"
+      )
+
    if hooks:
      self.add_argument(
          "--hooks", "-hk", nargs="+", default=["LoggingTensorHook"],
@@ -148,7 +203,8 @@ class PerformanceParser(argparse.ArgumentParser):
  """

  def __init__(self, add_help=False, num_parallel_calls=True, inter_op=True,
-               intra_op=True, use_synthetic_data=True, max_train_steps=True):
+               intra_op=True, use_synthetic_data=True, max_train_steps=True,
+               dtype=True):
    super(PerformanceParser, self).__init__(add_help=add_help)

    if num_parallel_calls:
@@ -201,6 +257,31 @@ class PerformanceParser(argparse.ArgumentParser):
          metavar="<MTS>"
      )

+    if dtype:
+      self.add_argument(
+          "--dtype", "-dt",
+          default="fp32",
+          choices=list(DTYPE_MAP.keys()),
+          help="[default: %(default)s] {%(choices)s} The TensorFlow datatype "
+               "used for calculations. Variables may be cast to a higher"
+               "precision on a case-by-case basis for numerical stability.",
+          metavar="<DT>"
+      )
+
+      self.add_argument(
+          "--loss_scale", "-ls",
+          type=int,
+          help="[default: %(default)s] The amount to scale the loss by when "
+               "the model is run. Before gradients are computed, the loss is "
+               "multiplied by the loss scale, making all gradients loss_scale "
+               "times larger. To adjust for this, gradients are divided by the "
+               "loss scale before being applied to variables. This is "
+               "mathematically equivalent to training without a loss scale, "
+               "but the loss scale helps avoid some intermediate gradients "
+               "from underflowing to zero. If not provided the default for "
+               "fp16 is 128 and 1 for all other dtypes.",
+      )
+

 class ImageModelParser(argparse.ArgumentParser):
  """Default parser for specification image specific behavior.
@@ -291,3 +372,15 @@ class BenchmarkParser(argparse.ArgumentParser):
               " benchmark metric information will be uploaded.",
          metavar="<BMT>"
      )
+
+
+class EagerParser(BaseParser):
+  """Remove options not relevant for Eager from the BaseParser."""
+
+  def __init__(self, add_help=False, data_dir=True, model_dir=True,
+               train_epochs=True, batch_size=True):
+    super(EagerParser, self).__init__(
+        add_help=add_help, data_dir=data_dir, model_dir=model_dir,
+        train_epochs=train_epochs, epochs_between_evals=False,
+        stop_threshold=False, batch_size=batch_size, multi_gpu=False,
+        hooks=False)
--- a/official/utils/arg_parsers/parsers_test.py
+++ b/official/utils/arg_parsers/parsers_test.py
@@ -16,6 +16,7 @@
 import argparse
 import unittest

+import tensorflow as tf  # pylint: disable=g-bad-import-order

 from official.utils.arg_parsers import parsers

@@ -25,7 +26,7 @@ class TestParser(argparse.ArgumentParser):

  def __init__(self):
    super(TestParser, self).__init__(parents=[
-        parsers.BaseParser(),
+        parsers.BaseParser(multi_gpu=True, num_gpu=False),
        parsers.PerformanceParser(num_parallel_calls=True, inter_op=True,
                                  intra_op=True, use_synthetic_data=True),
        parsers.ImageModelParser(data_format=True),
@@ -83,6 +84,24 @@ class BaseTester(unittest.TestCase):
    assert namespace.multi_gpu
    assert namespace.use_synthetic_data

+  def test_parse_dtype_info(self):
+    parser = TestParser()
+    for dtype_str, tf_dtype, loss_scale in [["fp16", tf.float16, 128],
+                                            ["fp32", tf.float32, 1]]:
+      args = parser.parse_args(["--dtype", dtype_str])
+      parsers.parse_dtype_info(args)
+
+      assert args.dtype == tf_dtype
+      assert args.loss_scale == loss_scale
+
+      args = parser.parse_args(["--dtype", dtype_str, "--loss_scale", "5"])
+      parsers.parse_dtype_info(args)
+
+      assert args.loss_scale == 5
+
+    with self.assertRaises(SystemExit):
+      parser.parse_args(["--dtype", "int8"])
+

 if __name__ == "__main__":
  unittest.main()
--- a/official/utils/misc/__init__.py
+++ b/official/utils/misc/__init__.py
--- a/official/utils/misc/model_helpers.py
+++ b/official/utils/misc/model_helpers.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Miscellaneous functions that can be called by models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numbers
+
+import tensorflow as tf
+
+
+def past_stop_threshold(stop_threshold, eval_metric):
+  """Return a boolean representing whether a model should be stopped.
+
+  Args:
+    stop_threshold: float, the threshold above which a model should stop
+      training.
+    eval_metric: float, the current value of the relevant metric to check.
+
+  Returns:
+    True if training should stop, False otherwise.
+
+  Raises:
+    ValueError: if either stop_threshold or eval_metric is not a number
+  """
+  if stop_threshold is None:
+    return False
+
+  if not isinstance(stop_threshold, numbers.Number):
+    raise ValueError("Threshold for checking stop conditions must be a number.")
+  if not isinstance(eval_metric, numbers.Number):
+    raise ValueError("Eval metric being checked against stop conditions "
+                     "must be a number.")
+
+  if eval_metric >= stop_threshold:
+    tf.logging.info(
+        "Stop threshold of {} was passed with metric value {}.".format(
+            stop_threshold, eval_metric))
+    return True
+
+  return False
--- a/official/utils/misc/model_helpers_test.py
+++ b/official/utils/misc/model_helpers_test.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+""" Tests for Model Helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.utils.misc import model_helpers
+
+
+class PastStopThresholdTest(tf.test.TestCase):
+  """Tests for past_stop_threshold."""
+
+  def test_past_stop_threshold(self):
+    """Tests for normal operating conditions."""
+    self.assertTrue(model_helpers.past_stop_threshold(0.54, 1))
+    self.assertTrue(model_helpers.past_stop_threshold(54, 100))
+    self.assertFalse(model_helpers.past_stop_threshold(0.54, 0.1))
+    self.assertFalse(model_helpers.past_stop_threshold(-0.54, -1.5))
+    self.assertTrue(model_helpers.past_stop_threshold(-0.54, 0))
+    self.assertTrue(model_helpers.past_stop_threshold(0, 0))
+    self.assertTrue(model_helpers.past_stop_threshold(0.54, 0.54))
+
+  def test_past_stop_threshold_none_false(self):
+    """Tests that check None returns false."""
+    self.assertFalse(model_helpers.past_stop_threshold(None, -1.5))
+    self.assertFalse(model_helpers.past_stop_threshold(None, None))
+    self.assertFalse(model_helpers.past_stop_threshold(None, 1.5))
+    # Zero should be okay, though.
+    self.assertTrue(model_helpers.past_stop_threshold(0, 1.5))
+
+  def test_past_stop_threshold_not_number(self):
+    """Tests for error conditions."""
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold("str", 1)
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold("str", tf.constant(5))
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold("str", "another")
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold(0, None)
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold(0.7, "str")
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold(tf.constant(4), None)
+
+
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/utils/testing/reference_data.py
+++ b/official/utils/testing/reference_data.py
@@ -170,7 +170,7 @@ class BaseTest(tf.test.TestCase):
    # Serialize graph for comparison.
    graph_bytes = graph.as_graph_def().SerializeToString()
    expected_file = os.path.join(data_dir, "expected_graph")
-    with open(expected_file, "wb") as f:
+    with tf.gfile.Open(expected_file, "wb") as f:
      f.write(graph_bytes)

    with graph.as_default():
@@ -191,10 +191,10 @@ class BaseTest(tf.test.TestCase):

      if correctness_function is not None:
        results = correctness_function(*eval_results)
-        with open(os.path.join(data_dir, "results.json"), "wt") as f:
+        with tf.gfile.Open(os.path.join(data_dir, "results.json"), "w") as f:
          json.dump(results, f)

-      with open(os.path.join(data_dir, "tf_version.json"), "wt") as f:
+      with tf.gfile.Open(os.path.join(data_dir, "tf_version.json"), "w") as f:
        json.dump([tf.VERSION, tf.GIT_VERSION], f)

  def _evaluate_test_case(self, name, graph, ops_to_eval, correctness_function):
@@ -216,7 +216,7 @@ class BaseTest(tf.test.TestCase):
    # Serialize graph for comparison.
    graph_bytes = graph.as_graph_def().SerializeToString()
    expected_file = os.path.join(data_dir, "expected_graph")
-    with open(expected_file, "rb") as f:
+    with tf.gfile.Open(expected_file, "rb") as f:
      expected_graph_bytes = f.read()
      # The serialization is non-deterministic byte-for-byte. Instead there is
      # a utility which evaluates the semantics of the two graphs to test for
@@ -231,7 +231,7 @@ class BaseTest(tf.test.TestCase):
      init = tf.global_variables_initializer()
      saver = tf.train.Saver()

-    with open(os.path.join(data_dir, "tf_version.json"), "rt") as f:
+    with tf.gfile.Open(os.path.join(data_dir, "tf_version.json"), "r") as f:
      tf_version_reference, tf_git_version_reference = json.load(f)  # pylint: disable=unpacking-non-sequence

    tf_version_comparison = ""
@@ -262,7 +262,7 @@ class BaseTest(tf.test.TestCase):
      eval_results = [op.eval() for op in ops_to_eval]
      if correctness_function is not None:
        results = correctness_function(*eval_results)
-        with open(os.path.join(data_dir, "results.json"), "rt") as f:
+        with tf.gfile.Open(os.path.join(data_dir, "results.json"), "r") as f:
          expected_results = json.load(f)
        self.assertAllClose(results, expected_results)


--- a/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/expected_graph
+++ b/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/expected_graph