"torchvision/vscode:/vscode.git/clone" did not exist on "a0fcd083714f9378602d635afd95810a693ded47"
Unverified commit 822875db, authored by Taylor Robie, committed by GitHub

partition code between resnet_run_loop and resnet_model (#3621)

parent d3067c21
@@ -23,7 +23,8 @@ import sys
import tensorflow as tf
from official.resnet import resnet
from official.resnet import resnet_model
from official.resnet import resnet_run_loop
_HEIGHT = 32
_WIDTH = 32
@@ -126,22 +127,22 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
num_images = is_training and _NUM_IMAGES['train'] or _NUM_IMAGES['validation']
return resnet.process_record_dataset(dataset, is_training, batch_size,
_NUM_IMAGES['train'], parse_record, num_epochs, num_parallel_calls,
examples_per_epoch=num_images, multi_gpu=multi_gpu)
return resnet_run_loop.process_record_dataset(dataset, is_training, batch_size,
_NUM_IMAGES['train'], parse_record, num_epochs, num_parallel_calls,
examples_per_epoch=num_images, multi_gpu=multi_gpu)
def get_synth_input_fn():
return resnet.get_synth_input_fn(_HEIGHT, _WIDTH, _NUM_CHANNELS, _NUM_CLASSES)
return resnet_run_loop.get_synth_input_fn(_HEIGHT, _WIDTH, _NUM_CHANNELS, _NUM_CLASSES)
###############################################################################
# Running the model
###############################################################################
class Cifar10Model(resnet.Model):
class Cifar10Model(resnet_model.Model):
def __init__(self, resnet_size, data_format=None, num_classes=_NUM_CLASSES,
version=resnet.DEFAULT_VERSION):
version=resnet_model.DEFAULT_VERSION):
"""These are the parameters that work for CIFAR-10 data.
Args:
@@ -180,7 +181,7 @@ def cifar10_model_fn(features, labels, mode, params):
"""Model function for CIFAR-10."""
features = tf.reshape(features, [-1, _HEIGHT, _WIDTH, _NUM_CHANNELS])
learning_rate_fn = resnet.learning_rate_with_decay(
learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
batch_size=params['batch_size'], batch_denom=128,
num_images=_NUM_IMAGES['train'], boundary_epochs=[100, 150, 200],
decay_rates=[1, 0.1, 0.01, 0.001])
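As a sanity check on these arguments, here is a hedged sketch (not part of the diff) of the schedule learning_rate_with_decay computes for CIFAR-10, assuming the default batch size of 128 and the 50,000-image training set:
# Illustrative only; assumes batch_size=128 and 50,000 training images.
batch_size, batch_denom, num_images = 128, 128, 50000
initial_lr = 0.1 * batch_size / batch_denom           # 0.1
batches_per_epoch = num_images / batch_size           # 390.625
boundaries = [int(batches_per_epoch * e) for e in [100, 150, 200]]
# boundaries == [39062, 58593, 78125]
vals = [initial_lr * d for d in [1, 0.1, 0.01, 0.001]]
# vals ~= [0.1, 0.01, 0.001, 0.0001]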
@@ -197,26 +198,26 @@ def cifar10_model_fn(features, labels, mode, params):
def loss_filter_fn(name):
return True
return resnet.resnet_model_fn(features, labels, mode, Cifar10Model,
resnet_size=params['resnet_size'],
weight_decay=weight_decay,
learning_rate_fn=learning_rate_fn,
momentum=0.9,
data_format=params['data_format'],
version=params['version'],
loss_filter_fn=loss_filter_fn,
multi_gpu=params['multi_gpu'])
return resnet_run_loop.resnet_model_fn(features, labels, mode, Cifar10Model,
resnet_size=params['resnet_size'],
weight_decay=weight_decay,
learning_rate_fn=learning_rate_fn,
momentum=0.9,
data_format=params['data_format'],
version=params['version'],
loss_filter_fn=loss_filter_fn,
multi_gpu=params['multi_gpu'])
def main(unused_argv):
input_function = FLAGS.use_synthetic_data and get_synth_input_fn() or input_fn
resnet.resnet_main(FLAGS, cifar10_model_fn, input_function)
resnet_run_loop.resnet_main(FLAGS, cifar10_model_fn, input_function)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
parser = resnet.ResnetArgParser()
parser = resnet_run_loop.ResnetArgParser()
# Set defaults that are reasonable for this model.
parser.set_defaults(data_dir='/tmp/cifar10_data',
model_dir='/tmp/cifar10_model',
......
@@ -23,8 +23,9 @@ import sys
import tensorflow as tf
from official.resnet import resnet
from official.resnet import imagenet_preprocessing
from official.resnet import resnet_model
from official.resnet import resnet_run_loop
_DEFAULT_IMAGE_SIZE = 224
_NUM_CHANNELS = 3
@@ -183,24 +184,24 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1,
# Convert to individual records
dataset = dataset.flat_map(tf.data.TFRecordDataset)
return resnet.process_record_dataset(
return resnet_run_loop.process_record_dataset(
dataset, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
num_epochs, num_parallel_calls, examples_per_epoch=num_images,
multi_gpu=multi_gpu)
def get_synth_input_fn():
return resnet.get_synth_input_fn(
return resnet_run_loop.get_synth_input_fn(
_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS, _NUM_CLASSES)
###############################################################################
# Running the model
###############################################################################
class ImagenetModel(resnet.Model):
class ImagenetModel(resnet_model.Model):
def __init__(self, resnet_size, data_format=None, num_classes=_NUM_CLASSES,
version=resnet.DEFAULT_VERSION):
version=resnet_model.DEFAULT_VERSION):
"""These are the parameters that work for Imagenet data.
Args:
@@ -264,31 +265,31 @@ def _get_block_sizes(resnet_size):
def imagenet_model_fn(features, labels, mode, params):
"""Our model_fn for ResNet to be used with our Estimator."""
learning_rate_fn = resnet.learning_rate_with_decay(
learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
batch_size=params['batch_size'], batch_denom=256,
num_images=_NUM_IMAGES['train'], boundary_epochs=[30, 60, 80, 90],
decay_rates=[1, 0.1, 0.01, 0.001, 1e-4])
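For comparison, a hedged sketch of the ImageNet schedule implied by the arguments above, assuming the default batch size of 256 and roughly 1,281,167 training images (the count these scripts use):
# Illustrative only; assumes batch_size=256 and ~1,281,167 training images.
initial_lr = 0.1 * 256 / 256  # 0.1
# batches_per_epoch ~= 5004.6, so with boundary_epochs [30, 60, 80, 90] the
# rate steps down to 0.01, 0.001, 1e-4, and finally 1e-5 near global steps
# 150k, 300k, 400k, and 450k.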
return resnet.resnet_model_fn(features, labels, mode, ImagenetModel,
resnet_size=params['resnet_size'],
weight_decay=1e-4,
learning_rate_fn=learning_rate_fn,
momentum=0.9,
data_format=params['data_format'],
version=params['version'],
loss_filter_fn=None,
multi_gpu=params['multi_gpu'])
return resnet_run_loop.resnet_model_fn(features, labels, mode, ImagenetModel,
resnet_size=params['resnet_size'],
weight_decay=1e-4,
learning_rate_fn=learning_rate_fn,
momentum=0.9,
data_format=params['data_format'],
version=params['version'],
loss_filter_fn=None,
multi_gpu=params['multi_gpu'])
def main(unused_argv):
input_function = FLAGS.use_synthetic_data and get_synth_input_fn() or input_fn
resnet.resnet_main(FLAGS, imagenet_model_fn, input_function)
resnet_run_loop.resnet_main(FLAGS, imagenet_model_fn, input_function)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
parser = resnet.ResnetArgParser(
parser = resnet_run_loop.ResnetArgParser(
resnet_size_choices=[18, 34, 50, 101, 152, 200])
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(argv=[sys.argv[0]] + unparsed)
@@ -31,116 +31,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import tensorflow as tf
from official.utils.arg_parsers import parsers # pylint: disable=g-bad-import-order
from official.utils.logging import hooks_helper
_BATCH_NORM_DECAY = 0.997
_BATCH_NORM_EPSILON = 1e-5
DEFAULT_VERSION = 2
################################################################################
# Functions for input processing.
################################################################################
def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer,
parse_record_fn, num_epochs=1, num_parallel_calls=1,
examples_per_epoch=0, multi_gpu=False):
"""Given a Dataset with raw records, parse each record into images and labels,
and return a Dataset of (image, label) pairs ready for iteration.
Args:
dataset: A Dataset representing raw records
is_training: A boolean denoting whether the input is for training.
batch_size: The number of samples per batch.
shuffle_buffer: The buffer size to use when shuffling records. A larger
value results in better randomness, but smaller values reduce startup
time and use less memory.
parse_record_fn: A function that takes a raw record and returns the
corresponding (image, label) pair.
num_epochs: The number of epochs to repeat the dataset.
num_parallel_calls: The number of records that are processed in parallel.
This can be optimized per data set but for generally homogeneous data
sets, should be approximately the number of available CPU cores.
examples_per_epoch: The number of examples in the current set that
are processed each epoch. Note that this is only used for multi-GPU mode,
and only to handle what will eventually be handled inside of Estimator.
multi_gpu: Whether this is run multi-GPU. Note that this is only required
currently to handle the batch leftovers (see below), and can be removed
when that is handled directly by Estimator.
Returns:
Dataset of (image, label) pairs ready for iteration.
"""
# We prefetch a batch at a time; this can help smooth out the time taken to
# load input files as we go through shuffling and processing.
dataset = dataset.prefetch(buffer_size=batch_size)
if is_training:
# Shuffle the records. Note that we shuffle before repeating to ensure
# that the shuffling respects epoch boundaries.
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
# If we are training over multiple epochs before evaluating, repeat the
# dataset for the appropriate number of epochs.
dataset = dataset.repeat(num_epochs)
# Currently, if we are using multiple GPUs, we can't pass in uneven batches.
# (For example, if we have 4 GPUs, the number of examples in each batch
# must be divisible by 4.) We already ensured this for the batch_size, but
# we must also ensure that any "leftover" examples (the remainder,
# total examples % batch_size, which forms the smaller final batch of an
# epoch) do not raise an error when we try to split them
# over the GPUs. This will likely be handled by Estimator during replication
# in the future, but for now, we just drop the leftovers here.
if multi_gpu:
total_examples = num_epochs * examples_per_epoch
dataset = dataset.take(batch_size * (total_examples // batch_size))
# Parse the raw records into images and labels
dataset = dataset.map(lambda value: parse_record_fn(value, is_training),
num_parallel_calls=num_parallel_calls)
dataset = dataset.batch(batch_size)
# Operations between the final prefetch and the get_next call to the iterator
# will happen synchronously during run time. We prefetch here again to
# background all of the above processing work and keep it out of the
# critical training path.
dataset = dataset.prefetch(1)
return dataset
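To make the multi-GPU leftover handling concrete, here is a hedged, pure-Python sketch of the arithmetic behind the dataset.take() call above (values illustrative, not part of the diff):
# Illustrative only: how many examples the take() call keeps and drops.
num_epochs, examples_per_epoch, batch_size = 2, 50000, 128
total_examples = num_epochs * examples_per_epoch       # 100000
kept = batch_size * (total_examples // batch_size)     # 99968
dropped = total_examples - kept                        # 32
# The 32 leftover examples would form a short final batch whose size need
# not divide evenly across GPUs, so the pipeline drops them up front.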
def get_synth_input_fn(height, width, num_channels, num_classes):
"""Returns an input function that returns a dataset with zeroes.
This is useful in debugging input pipeline performance, as it removes all
elements of file reading and image preprocessing.
Args:
height: Integer height that will be used to create a fake image tensor.
width: Integer width that will be used to create a fake image tensor.
num_channels: Integer depth that will be used to create a fake image tensor.
num_classes: Number of classes that should be represented in the fake labels
tensor.
Returns:
An input_fn that can be used in place of a real one to return a dataset
that can be used for iteration.
"""
def input_fn(is_training, data_dir, batch_size, *args):
images = tf.zeros((batch_size, height, width, num_channels), tf.float32)
labels = tf.zeros((batch_size, num_classes), tf.int32)
return tf.data.Dataset.from_tensors((images, labels)).repeat()
return input_fn
################################################################################
# Convenience functions for building the ResNet model.
################################################################################
@@ -199,7 +98,7 @@ def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
# ResNet block definitions.
################################################################################
def _building_block_v1(inputs, filters, training, projection_shortcut, strides,
data_format):
data_format):
"""
Convolution then batch normalization then ReLU as described by:
Deep Residual Learning for Image Recognition
@@ -245,7 +144,7 @@ def _building_block_v1(inputs, filters, training, projection_shortcut, strides,
def _building_block_v2(inputs, filters, training, projection_shortcut, strides,
data_format):
data_format):
"""
Batch normalization then ReLU then convolution as described by:
Identity Mappings in Deep Residual Networks
@@ -290,7 +189,7 @@ def _building_block_v2(inputs, filters, training, projection_shortcut, strides,
def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
strides, data_format):
strides, data_format):
"""
Similar to _building_block_v1(), except using the "bottleneck" blocks
described in:
@@ -329,7 +228,7 @@ def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
def _bottleneck_block_v2(inputs, filters, training, projection_shortcut,
strides, data_format):
strides, data_format):
"""
Similar to _building_block_v2(), except using the "bottleneck" blocks
described in:
@@ -538,258 +437,3 @@ class Model(object):
return inputs
################################################################################
# Functions for running training/eval/validation loops for the model.
################################################################################
def learning_rate_with_decay(
batch_size, batch_denom, num_images, boundary_epochs, decay_rates):
"""Get a learning rate that decays step-wise as training progresses.
Args:
batch_size: the number of examples processed in each training batch.
batch_denom: this value will be used to scale the base learning rate.
`0.1 * batch size` is divided by this number, such that when
batch_denom == batch_size, the initial learning rate will be 0.1.
num_images: total number of images that will be used for training.
boundary_epochs: list of ints representing the epochs at which we
decay the learning rate.
decay_rates: list of floats representing the decay rates to be used
for scaling the learning rate. Should be the same length as
boundary_epochs.
Returns:
A function that takes a single argument (the number of batches trained so
far, i.e. the global_step) and returns the learning rate to be used for
training the next batch.
"""
initial_learning_rate = 0.1 * batch_size / batch_denom
batches_per_epoch = num_images / batch_size
# Scale the learning rate by the given decay rates at each boundary epoch.
boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
vals = [initial_learning_rate * decay for decay in decay_rates]
def learning_rate_fn(global_step):
global_step = tf.cast(global_step, tf.int32)
return tf.train.piecewise_constant(global_step, boundaries, vals)
return learning_rate_fn
def resnet_model_fn(features, labels, mode, model_class,
resnet_size, weight_decay, learning_rate_fn, momentum,
data_format, version, loss_filter_fn=None, multi_gpu=False):
"""Shared functionality for different resnet model_fns.
Initializes the ResnetModel representing the model layers
and uses that model to build the necessary EstimatorSpecs for
the `mode` in question. For training, this means building losses,
the optimizer, and the train op that get passed into the EstimatorSpec.
For evaluation and prediction, the EstimatorSpec is returned without
a train op, but with the necessary parameters for the given mode.
Args:
features: tensor representing input images
labels: tensor representing class labels for all input images
mode: current estimator mode; should be one of
`tf.estimator.ModeKeys.TRAIN`, `EVAL`, or `PREDICT`
model_class: a class representing a TensorFlow model that has a __call__
function. We assume here that this is a subclass of ResnetModel.
resnet_size: A single integer for the size of the ResNet model.
weight_decay: weight decay loss rate used to regularize learned variables.
learning_rate_fn: function that returns the current learning rate given
the current global_step
momentum: momentum term used for optimization
data_format: Input format ('channels_last', 'channels_first', or None).
If set to None, the format is dependent on whether a GPU is available.
version: Integer representing which version of the ResNet network to use.
See README for details. Valid values: [1, 2]
loss_filter_fn: function that takes a string variable name and returns
True if the var should be included in loss calculation, and False
otherwise. If None, batch_normalization variables will be excluded
from the loss.
multi_gpu: If True, wrap the optimizer in a TowerOptimizer suitable for
data-parallel distribution across multiple GPUs.
Returns:
EstimatorSpec parameterized according to the input params and the
current mode.
"""
# Generate a summary node for the images
tf.summary.image('images', features, max_outputs=6)
model = model_class(resnet_size, data_format, version=version)
logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)
predictions = {
'classes': tf.argmax(logits, axis=1),
'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
# Calculate loss, which includes softmax cross entropy and L2 regularization.
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=labels)
# Create a tensor named cross_entropy for logging purposes.
tf.identity(cross_entropy, name='cross_entropy')
tf.summary.scalar('cross_entropy', cross_entropy)
# If no loss_filter_fn is passed, assume we want the default behavior,
# which is that batch_normalization variables are excluded from loss.
if not loss_filter_fn:
def loss_filter_fn(name):
return 'batch_normalization' not in name
# Add weight decay to the loss.
loss = cross_entropy + weight_decay * tf.add_n(
[tf.nn.l2_loss(v) for v in tf.trainable_variables()
if loss_filter_fn(v.name)])
if mode == tf.estimator.ModeKeys.TRAIN:
global_step = tf.train.get_or_create_global_step()
learning_rate = learning_rate_fn(global_step)
# Create a tensor named learning_rate for logging purposes
tf.identity(learning_rate, name='learning_rate')
tf.summary.scalar('learning_rate', learning_rate)
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate,
momentum=momentum)
# If we are running multi-GPU, we need to wrap the optimizer.
if multi_gpu:
optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
train_op = tf.group(optimizer.minimize(loss, global_step), update_ops)
else:
train_op = None
accuracy = tf.metrics.accuracy(
tf.argmax(labels, axis=1), predictions['classes'])
metrics = {'accuracy': accuracy}
# Create a tensor named train_accuracy for logging purposes
tf.identity(accuracy[1], name='train_accuracy')
tf.summary.scalar('train_accuracy', accuracy[1])
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op,
eval_metric_ops=metrics)
def validate_batch_size_for_multi_gpu(batch_size):
"""For multi-gpu, batch-size must be a multiple of the number of
available GPUs.
Note that this should eventually be handled by replicate_model_fn
directly. Multi-GPU support is currently experimental, however,
so we do the work here until that feature is in place.
"""
from tensorflow.python.client import device_lib
local_device_protos = device_lib.list_local_devices()
num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
if not num_gpus:
raise ValueError('Multi-GPU mode was specified, but no GPUs '
'were found. To use CPU, run without --multi_gpu.')
remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. '
'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
def resnet_main(flags, model_function, input_function):
# Using the Winograd non-fused algorithms provides a small performance boost.
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
if flags.multi_gpu:
validate_batch_size_for_multi_gpu(flags.batch_size)
# There are two steps required if using multi-GPU: (1) wrap the model_fn,
# and (2) wrap the optimizer. The first happens here, and (2) happens
# in the model_fn itself when the optimizer is defined.
model_function = tf.contrib.estimator.replicate_model_fn(
model_function,
loss_reduction=tf.losses.Reduction.MEAN)
# Create session config based on values of inter_op_parallelism_threads and
# intra_op_parallelism_threads. Note that we default to having
# allow_soft_placement = True, which is required for multi-GPU and not
# harmful for other modes.
session_config = tf.ConfigProto(
inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
allow_soft_placement=True)
# Set up a RunConfig to save checkpoint and set session config.
run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9,
session_config=session_config)
classifier = tf.estimator.Estimator(
model_fn=model_function, model_dir=flags.model_dir, config=run_config,
params={
'resnet_size': flags.resnet_size,
'data_format': flags.data_format,
'batch_size': flags.batch_size,
'multi_gpu': flags.multi_gpu,
'version': flags.version,
})
for _ in range(flags.train_epochs // flags.epochs_per_eval):
train_hooks = hooks_helper.get_train_hooks(flags.hooks, batch_size=flags.batch_size)
print('Starting a training cycle.')
def input_fn_train():
return input_function(True, flags.data_dir, flags.batch_size,
flags.epochs_per_eval, flags.num_parallel_calls,
flags.multi_gpu)
classifier.train(input_fn=input_fn_train, hooks=train_hooks)
print('Starting to evaluate.')
# Evaluate the model and print results
def input_fn_eval():
return input_function(False, flags.data_dir, flags.batch_size,
1, flags.num_parallel_calls, flags.multi_gpu)
eval_results = classifier.evaluate(input_fn=input_fn_eval)
print(eval_results)
class ResnetArgParser(argparse.ArgumentParser):
"""Arguments for configuring and running a Resnet Model.
"""
def __init__(self, resnet_size_choices=None):
super(ResnetArgParser, self).__init__(parents=[
parsers.BaseParser(),
parsers.PerformanceParser(),
parsers.ImageModelParser(),
])
self.add_argument(
'--version', '-v', type=int, choices=[1, 2],
default=DEFAULT_VERSION,
help="Version of ResNet. (1 or 2) See README.md for details."
)
self.add_argument(
'--resnet_size', '-rs', type=int, default=50,
choices=resnet_size_choices,
help='[default: %(default)s] The size of the ResNet model to use.',
metavar='<RS>'
)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains utility and supporting functions for ResNet.
This module contains ResNet code which does not directly build layers. This
includes dataset management, hyperparameter and optimizer code, and argument
parsing. Code for defining the ResNet layers can be found in resnet_model.py.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import tensorflow as tf
from official.utils.arg_parsers import parsers # pylint: disable=g-bad-import-order
from official.utils.logging import hooks_helper
from official.resnet import resnet_model
################################################################################
# Functions for input processing.
################################################################################
def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer,
parse_record_fn, num_epochs=1, num_parallel_calls=1,
examples_per_epoch=0, multi_gpu=False):
"""Given a Dataset with raw records, parse each record into images and labels,
and return a Dataset of (image, label) pairs ready for iteration.
Args:
dataset: A Dataset representing raw records
is_training: A boolean denoting whether the input is for training.
batch_size: The number of samples per batch.
shuffle_buffer: The buffer size to use when shuffling records. A larger
value results in better randomness, but smaller values reduce startup
time and use less memory.
parse_record_fn: A function that takes a raw record and returns the
corresponding (image, label) pair.
num_epochs: The number of epochs to repeat the dataset.
num_parallel_calls: The number of records that are processed in parallel.
This can be optimized per data set but for generally homogeneous data
sets, should be approximately the number of available CPU cores.
examples_per_epoch: The number of examples in the current set that
are processed each epoch. Note that this is only used for multi-GPU mode,
and only to handle what will eventually be handled inside of Estimator.
multi_gpu: Whether this is run multi-GPU. Note that this is only required
currently to handle the batch leftovers (see below), and can be removed
when that is handled directly by Estimator.
Returns:
Dataset of (image, label) pairs ready for iteration.
"""
# We prefetch a batch at a time; this can help smooth out the time taken to
# load input files as we go through shuffling and processing.
dataset = dataset.prefetch(buffer_size=batch_size)
if is_training:
# Shuffle the records. Note that we shuffle before repeating to ensure
# that the shuffling respects epoch boundaries.
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
# If we are training over multiple epochs before evaluating, repeat the
# dataset for the appropriate number of epochs.
dataset = dataset.repeat(num_epochs)
# Currently, if we are using multiple GPUs, we can't pass in uneven batches.
# (For example, if we have 4 GPUs, the number of examples in each batch
# must be divisible by 4.) We already ensured this for the batch_size, but
# we must also ensure that any "leftover" examples (the remainder,
# total examples % batch_size, which forms the smaller final batch of an
# epoch) do not raise an error when we try to split them
# over the GPUs. This will likely be handled by Estimator during replication
# in the future, but for now, we just drop the leftovers here.
if multi_gpu:
total_examples = num_epochs * examples_per_epoch
dataset = dataset.take(batch_size * (total_examples // batch_size))
# Parse the raw records into images and labels
dataset = dataset.map(lambda value: parse_record_fn(value, is_training),
num_parallel_calls=num_parallel_calls)
dataset = dataset.batch(batch_size)
# Operations between the final prefetch and the get_next call to the iterator
# will happen synchronously during run time. We prefetch here again to
# background all of the above processing work and keep it out of the
# critical training path.
dataset = dataset.prefetch(1)
return dataset
def get_synth_input_fn(height, width, num_channels, num_classes):
"""Returns an input function that returns a dataset with zeroes.
This is useful in debugging input pipeline performance, as it removes all
elements of file reading and image preprocessing.
Args:
height: Integer height that will be used to create a fake image tensor.
width: Integer width that will be used to create a fake image tensor.
num_channels: Integer depth that will be used to create a fake image tensor.
num_classes: Number of classes that should be represented in the fake labels
tensor.
Returns:
An input_fn that can be used in place of a real one to return a dataset
that can be used for iteration.
"""
def input_fn(is_training, data_dir, batch_size, *args):
images = tf.zeros((batch_size, height, width, num_channels), tf.float32)
labels = tf.zeros((batch_size, num_classes), tf.int32)
return tf.data.Dataset.from_tensors((images, labels)).repeat()
return input_fn
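A hedged usage sketch (TF 1.x style; names illustrative): the synthetic input_fn matches the real input_fn's signature but ignores the data directory, so it can be swapped in to benchmark everything downstream of input processing.
# Illustrative only.
synth_input_fn = get_synth_input_fn(32, 32, 3, 10)      # CIFAR-10-shaped zeros
dataset = synth_input_fn(True, '/unused_data_dir', 128)  # is_training, data_dir, batch_size
images, labels = dataset.make_one_shot_iterator().get_next()
# images: float32 [128, 32, 32, 3]; labels: int32 [128, 10]; both all zeros.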
################################################################################
# Functions for running training/eval/validation loops for the model.
################################################################################
def learning_rate_with_decay(
batch_size, batch_denom, num_images, boundary_epochs, decay_rates):
"""Get a learning rate that decays step-wise as training progresses.
Args:
batch_size: the number of examples processed in each training batch.
batch_denom: this value will be used to scale the base learning rate.
`0.1 * batch size` is divided by this number, such that when
batch_denom == batch_size, the initial learning rate will be 0.1.
num_images: total number of images that will be used for training.
boundary_epochs: list of ints representing the epochs at which we
decay the learning rate.
decay_rates: list of floats representing the decay rates to be used
for scaling the learning rate. Should be the same length as
boundary_epochs.
Returns:
A function that takes a single argument (the number of batches trained so
far, i.e. the global_step) and returns the learning rate to be used for
training the next batch.
"""
initial_learning_rate = 0.1 * batch_size / batch_denom
batches_per_epoch = num_images / batch_size
# Scale the learning rate by the given decay rates at each boundary epoch.
boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
vals = [initial_learning_rate * decay for decay in decay_rates]
def learning_rate_fn(global_step):
global_step = tf.cast(global_step, tf.int32)
return tf.train.piecewise_constant(global_step, boundaries, vals)
return learning_rate_fn
def resnet_model_fn(features, labels, mode, model_class,
resnet_size, weight_decay, learning_rate_fn, momentum,
data_format, version, loss_filter_fn=None, multi_gpu=False):
"""Shared functionality for different resnet model_fns.
Initializes the ResnetModel representing the model layers
and uses that model to build the necessary EstimatorSpecs for
the `mode` in question. For training, this means building losses,
the optimizer, and the train op that get passed into the EstimatorSpec.
For evaluation and prediction, the EstimatorSpec is returned without
a train op, but with the necessary parameters for the given mode.
Args:
features: tensor representing input images
labels: tensor representing class labels for all input images
mode: current estimator mode; should be one of
`tf.estimator.ModeKeys.TRAIN`, `EVAL`, or `PREDICT`
model_class: a class representing a TensorFlow model that has a __call__
function. We assume here that this is a subclass of ResnetModel.
resnet_size: A single integer for the size of the ResNet model.
weight_decay: weight decay loss rate used to regularize learned variables.
learning_rate_fn: function that returns the current learning rate given
the current global_step
momentum: momentum term used for optimization
data_format: Input format ('channels_last', 'channels_first', or None).
If set to None, the format is dependent on whether a GPU is available.
version: Integer representing which version of the ResNet network to use.
See README for details. Valid values: [1, 2]
loss_filter_fn: function that takes a string variable name and returns
True if the var should be included in loss calculation, and False
otherwise. If None, batch_normalization variables will be excluded
from the loss.
multi_gpu: If True, wrap the optimizer in a TowerOptimizer suitable for
data-parallel distribution across multiple GPUs.
Returns:
EstimatorSpec parameterized according to the input params and the
current mode.
"""
# Generate a summary node for the images
tf.summary.image('images', features, max_outputs=6)
model = model_class(resnet_size, data_format, version=version)
logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)
predictions = {
'classes': tf.argmax(logits, axis=1),
'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
# Calculate loss, which includes softmax cross entropy and L2 regularization.
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=labels)
# Create a tensor named cross_entropy for logging purposes.
tf.identity(cross_entropy, name='cross_entropy')
tf.summary.scalar('cross_entropy', cross_entropy)
# If no loss_filter_fn is passed, assume we want the default behavior,
# which is that batch_normalization variables are excluded from loss.
if not loss_filter_fn:
def loss_filter_fn(name):
return 'batch_normalization' not in name
# Add weight decay to the loss.
loss = cross_entropy + weight_decay * tf.add_n(
[tf.nn.l2_loss(v) for v in tf.trainable_variables()
if loss_filter_fn(v.name)])
if mode == tf.estimator.ModeKeys.TRAIN:
global_step = tf.train.get_or_create_global_step()
learning_rate = learning_rate_fn(global_step)
# Create a tensor named learning_rate for logging purposes
tf.identity(learning_rate, name='learning_rate')
tf.summary.scalar('learning_rate', learning_rate)
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate,
momentum=momentum)
# If we are running multi-GPU, we need to wrap the optimizer.
if multi_gpu:
optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
train_op = tf.group(optimizer.minimize(loss, global_step), update_ops)
else:
train_op = None
accuracy = tf.metrics.accuracy(
tf.argmax(labels, axis=1), predictions['classes'])
metrics = {'accuracy': accuracy}
# Create a tensor named train_accuracy for logging purposes
tf.identity(accuracy[1], name='train_accuracy')
tf.summary.scalar('train_accuracy', accuracy[1])
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op,
eval_metric_ops=metrics)
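To illustrate the default loss_filter_fn behavior described in the docstring, a small hedged sketch (variable names are made up):
# Illustrative only: the default filter excludes batch-norm variables from
# the L2 weight-decay term.
def default_loss_filter_fn(name):
    return 'batch_normalization' not in name

assert default_loss_filter_fn('resnet/conv2d/kernel:0')                  # decayed
assert not default_loss_filter_fn('resnet/batch_normalization/gamma:0')  # excluded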
def validate_batch_size_for_multi_gpu(batch_size):
"""For multi-gpu, batch-size must be a multiple of the number of
available GPUs.
Note that this should eventually be handled by replicate_model_fn
directly. Multi-GPU support is currently experimental, however,
so we do the work here until that feature is in place.
"""
from tensorflow.python.client import device_lib
local_device_protos = device_lib.list_local_devices()
num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
if not num_gpus:
raise ValueError('Multi-GPU mode was specified, but no GPUs '
'were found. To use CPU, run without --multi_gpu.')
remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. '
'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
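The suggested value in the error message is simply the batch size rounded down to the nearest multiple of the GPU count; a hedged example:
# Illustrative only.
num_gpus, batch_size = 4, 130
remainder = batch_size % num_gpus    # 2, so validation fails
suggested = batch_size - remainder   # 128, the value the error recommends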
def resnet_main(flags, model_function, input_function):
# Using the Winograd non-fused algorithms provides a small performance boost.
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
if flags.multi_gpu:
validate_batch_size_for_multi_gpu(flags.batch_size)
# There are two steps required if using multi-GPU: (1) wrap the model_fn,
# and (2) wrap the optimizer. The first happens here, and (2) happens
# in the model_fn itself when the optimizer is defined.
model_function = tf.contrib.estimator.replicate_model_fn(
model_function,
loss_reduction=tf.losses.Reduction.MEAN)
# Create session config based on values of inter_op_parallelism_threads and
# intra_op_parallelism_threads. Note that we default to having
# allow_soft_placement = True, which is required for multi-GPU and not
# harmful for other modes.
session_config = tf.ConfigProto(
inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
allow_soft_placement=True)
# Set up a RunConfig to save checkpoint and set session config.
run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9,
session_config=session_config)
classifier = tf.estimator.Estimator(
model_fn=model_function, model_dir=flags.model_dir, config=run_config,
params={
'resnet_size': flags.resnet_size,
'data_format': flags.data_format,
'batch_size': flags.batch_size,
'multi_gpu': flags.multi_gpu,
'version': flags.version,
})
for _ in range(flags.train_epochs // flags.epochs_per_eval):
train_hooks = hooks_helper.get_train_hooks(flags.hooks, batch_size=flags.batch_size)
print('Starting a training cycle.')
def input_fn_train():
return input_function(True, flags.data_dir, flags.batch_size,
flags.epochs_per_eval, flags.num_parallel_calls,
flags.multi_gpu)
classifier.train(input_fn=input_fn_train, hooks=train_hooks)
print('Starting to evaluate.')
# Evaluate the model and print results
def input_fn_eval():
return input_function(False, flags.data_dir, flags.batch_size,
1, flags.num_parallel_calls, flags.multi_gpu)
eval_results = classifier.evaluate(input_fn=input_fn_eval)
print(eval_results)
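The cadence of this loop is set by two flags; a hedged example of how the cycle count works out (flag values illustrative):
# Illustrative only.
train_epochs, epochs_per_eval = 250, 10
num_cycles = train_epochs // epochs_per_eval   # 25 train/evaluate cycles
# Each cycle trains for epochs_per_eval epochs (input_fn_train repeats the
# dataset that many times), then runs one full evaluation pass.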
class ResnetArgParser(argparse.ArgumentParser):
"""Arguments for configuring and running a Resnet Model.
"""
def __init__(self, resnet_size_choices=None):
super(ResnetArgParser, self).__init__(parents=[
parsers.BaseParser(),
parsers.PerformanceParser(),
parsers.ImageModelParser(),
])
self.add_argument(
'--version', '-v', type=int, choices=[1, 2],
default=resnet_model.DEFAULT_VERSION,
help="Version of ResNet. (1 or 2) See README.md for details."
)
self.add_argument(
'--resnet_size', '-rs', type=int, default=50,
choices=resnet_size_choices,
help='[default: %(default)s] The size of the ResNet model to use.',
metavar='<RS>'
)
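A hedged usage sketch of the parser (flag values illustrative; only flags defined in this file are shown):
# Illustrative only.
parser = ResnetArgParser(resnet_size_choices=[18, 34, 50, 101, 152, 200])
flags, unparsed = parser.parse_known_args(['--resnet_size', '50', '-v', '2'])
# flags.resnet_size == 50 and flags.version == 2; any remaining CLI
# arguments pass through in `unparsed`, as in the __main__ blocks above.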
@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from official.resnet import resnet # pylint: disable=g-bad-import-order
from official.resnet import resnet_model # pylint: disable=g-bad-import-order
class BlockTest(tf.test.TestCase):
@@ -63,7 +63,7 @@ class BlockTest(tf.test.TestCase):
A 1 wide CNN projector function.
"""
def projection_shortcut(inputs):
return resnet.conv2d_fixed_padding(
return resnet_model.conv2d_fixed_padding(
inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
data_format=data_format)
return projection_shortcut
@@ -91,13 +91,13 @@ class BlockTest(tf.test.TestCase):
data_format = "channels_last"
if version == 1:
block_fn = resnet._building_block_v1
block_fn = resnet_model._building_block_v1
if bottleneck:
block_fn = resnet._bottleneck_block_v1
block_fn = resnet_model._bottleneck_block_v1
else:
block_fn = resnet._building_block_v2
block_fn = resnet_model._building_block_v2
if bottleneck:
block_fn = resnet._bottleneck_block_v2
block_fn = resnet_model._bottleneck_block_v2
with self.test_session(graph=tf.Graph()) as sess:
tf.set_random_seed(tf_seed)
......