Unverified Commit adfd5a3a authored by Katherine Wu, committed by GitHub


Use util functions hooks_helper and parser in mnist and wide_deep, and rename epochs_between_evals (from epochs_per_eval) (#3650)
parent 875fcb3b
@@ -11,7 +11,10 @@ APIs.
## Setup
To begin, you'll simply need the latest version of TensorFlow installed.
To begin, you'll simply need the latest version of TensorFlow installed,
and make sure you've added the top-level `/models` folder to the
Python path, as described at: https://github.com/tensorflow/models/tree/master/official#running-the-models
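If modifying the environment isn't convenient, a Python-side equivalent is to prepend the clone location to `sys.path` before any `official` imports — a minimal sketch, assuming a hypothetical clone location of `~/models`:
```python
# Hedged sketch: equivalent of exporting PYTHONPATH, assuming the
# repository was cloned to ~/models (adjust to your clone path).
import os
import sys

MODELS_ROOT = os.path.expanduser('~/models')
sys.path.insert(0, MODELS_ROOT)

from official.mnist import dataset  # should now resolve
```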
Then to train the model, run the following:
```
......
@@ -18,12 +18,15 @@ from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import tensorflow as tf
from official.mnist import dataset
from official.utils.arg_parsers import parsers
from official.utils.logging import hooks_helper
LEARNING_RATE = 1e-4
class Model(tf.keras.Model):
"""Model to recognize digits in the MNIST dataset.
@@ -104,7 +107,7 @@ def model_fn(features, labels, mode, params):
'classify': tf.estimator.export.PredictOutput(predictions)
})
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
# If we are running multi-GPU, we need to wrap the optimizer.
if params.get('multi_gpu'):
@@ -114,10 +117,15 @@ def model_fn(features, labels, mode, params):
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
accuracy = tf.metrics.accuracy(
labels=labels, predictions=tf.argmax(logits, axis=1))
# Name the accuracy tensor 'train_accuracy' to demonstrate the
# LoggingTensorHook.
# Name tensors to be logged with LoggingTensorHook.
tf.identity(LEARNING_RATE, 'learning_rate')
tf.identity(loss, 'cross_entropy')
tf.identity(accuracy[1], name='train_accuracy')
# Save accuracy scalar to Tensorboard output.
tf.summary.scalar('train_accuracy', accuracy[1])
return tf.estimator.EstimatorSpec(
mode=tf.estimator.ModeKeys.TRAIN,
loss=loss,
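For readers unfamiliar with `LoggingTensorHook`, a minimal sketch of how the names created by `tf.identity` above get consumed. The label-to-name map mirrors the diff, but building the hook by hand like this is purely illustrative; in this commit `hooks_helper` does it for you:
```python
# Minimal sketch (TF 1.x): a hook that resolves the tensor names created
# via tf.identity(...) above and prints their values every 100 steps.
import tensorflow as tf

tensors_to_log = {
    'learning_rate': 'learning_rate',
    'cross_entropy': 'cross_entropy',
    'train_accuracy': 'train_accuracy',
}
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log, every_n_iter=100)
# classifier.train(input_fn=..., hooks=[logging_hook])
```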
@@ -185,30 +193,32 @@ def main(unused_argv):
'multi_gpu': FLAGS.multi_gpu
})
# Train the model
# Set up training and evaluation input functions.
def train_input_fn():
# When choosing shuffle buffer sizes, larger sizes result in better
# randomness, while smaller sizes use less memory. MNIST is a small
# enough dataset that we can easily shuffle the full epoch.
ds = dataset.train(FLAGS.data_dir)
ds = ds.cache().shuffle(buffer_size=50000).batch(FLAGS.batch_size).repeat(
FLAGS.train_epochs)
return ds
ds = ds.cache().shuffle(buffer_size=50000).batch(FLAGS.batch_size)
# Set up training hook that logs the training accuracy every 100 steps.
tensors_to_log = {'train_accuracy': 'train_accuracy'}
logging_hook = tf.train.LoggingTensorHook(
tensors=tensors_to_log, every_n_iter=100)
mnist_classifier.train(input_fn=train_input_fn, hooks=[logging_hook])
# Iterate through the dataset a set number (`epochs_between_evals`) of times
# during each training session.
ds = ds.repeat(FLAGS.epochs_between_evals)
return ds
# Evaluate the model and print results
def eval_input_fn():
return dataset.test(FLAGS.data_dir).batch(
FLAGS.batch_size).make_one_shot_iterator().get_next()
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
print()
print('Evaluation results:\n\t%s' % eval_results)
# Set up hook that outputs training logs every 100 steps.
train_hooks = hooks_helper.get_train_hooks(
FLAGS.hooks, batch_size=FLAGS.batch_size)
# Train and evaluate model.
for n in range(FLAGS.train_epochs // FLAGS.epochs_between_evals):
mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
print('\nEvaluation results:\n\t%s\n' % eval_results)
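The arithmetic behind this loop is easy to get wrong, so here is a self-contained sketch using the script's defaults (`train_epochs=40` from `set_defaults` below; `epochs_between_evals` defaults to 1 in `BaseParser`):
```python
# Illustrative only: how train_epochs and epochs_between_evals interact.
train_epochs = 40          # mnist default (see set_defaults below)
epochs_between_evals = 1   # BaseParser default
cycles = train_epochs // epochs_between_evals
# Each cycle trains for `epochs_between_evals` epochs (the input_fn
# repeats the dataset that many times), then runs one evaluation.
print('%d train/eval cycles of %d epoch(s) each'
      % (cycles, epochs_between_evals))
```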
# Export the model
if FLAGS.export_dir is not None:
@@ -220,51 +230,28 @@ def main(unused_argv):
class MNISTArgParser(argparse.ArgumentParser):
"""Argument parser for running MNIST model."""
def __init__(self):
super(MNISTArgParser, self).__init__()
super(MNISTArgParser, self).__init__(parents=[
parsers.BaseParser(),
parsers.ImageModelParser()])
self.add_argument(
'--multi_gpu', action='store_true',
help='If set, run across all available GPUs.')
self.add_argument(
'--batch_size',
type=int,
default=100,
help='Number of images to process in a batch')
self.add_argument(
'--data_dir',
type=str,
default='/tmp/mnist_data',
help='Path to directory containing the MNIST dataset')
self.add_argument(
'--model_dir',
type=str,
default='/tmp/mnist_model',
help='The directory where the model will be stored.')
self.add_argument(
'--train_epochs',
type=int,
default=40,
help='Number of epochs to train.')
self.add_argument(
'--data_format',
type=str,
default=None,
choices=['channels_first', 'channels_last'],
help='A flag to override the data format used in the model. '
'channels_first provides a performance boost on GPU but is not always '
'compatible with CPU. If left unspecified, the data format will be '
'chosen automatically based on whether TensorFlow was built for CPU or '
'GPU.')
self.add_argument(
'--export_dir',
type=str,
help='The directory where the exported SavedModel will be stored.')
help='[default: %(default)s] If set, a SavedModel serialization of the '
'model will be exported to this directory at the end of training. '
'See the README for more details and relevant links.')
self.set_defaults(
data_dir='/tmp/mnist_data',
model_dir='/tmp/mnist_model',
batch_size=100,
train_epochs=40)
if __name__ == '__main__':
parser = MNISTArgParser()
tf.logging.set_verbosity(tf.logging.INFO)
parser = MNISTArgParser()
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
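The parser above leans on two standard-library features: `argparse` parent parsers and `set_defaults`. A self-contained sketch of the pattern, with hypothetical flags standing in for `parsers.BaseParser` and `parsers.ImageModelParser`:
```python
# Toy demonstration of the parent-parser + set_defaults pattern.
import argparse

base = argparse.ArgumentParser(add_help=False)  # parents must disable help
base.add_argument('--data_dir', default='/tmp')
base.add_argument('--batch_size', type=int, default=32)

class MyParser(argparse.ArgumentParser):
    def __init__(self):
        super(MyParser, self).__init__(parents=[base])
        self.add_argument('--multi_gpu', action='store_true')
        # set_defaults overrides parents' defaults without redefining flags.
        self.set_defaults(data_dir='/tmp/mnist_data', batch_size=100)

flags, _ = MyParser().parse_known_args([])
print(flags.data_dir, flags.batch_size)  # /tmp/mnist_data 100
```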
@@ -33,8 +33,10 @@ import time
import tensorflow as tf
import tensorflow.contrib.eager as tfe
from official.mnist import mnist
from official.mnist import dataset
from official.utils.arg_parsers import parsers
FLAGS = None
@@ -98,9 +100,13 @@ def test(model, dataset):
def main(_):
tfe.enable_eager_execution()
# Automatically determine device and data_format
(device, data_format) = ('/gpu:0', 'channels_first')
if FLAGS.no_gpu or tfe.num_gpus() <= 0:
(device, data_format) = ('/cpu:0', 'channels_last')
# If data_format is defined in FLAGS, overwrite automatically set value.
if FLAGS.data_format is not None:
data_format = FLAGS.data_format
print('Using device %s, and data format %s.' % (device, data_format))
# Load the datasets
@@ -112,6 +118,7 @@ def main(_):
model = mnist.Model(data_format)
optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)
# Create file writers for writing TensorBoard summaries.
if FLAGS.output_dir:
# Create directories to which summaries will be written
# tensorboard --logdir=<output_dir>
@@ -126,15 +133,18 @@ def main(_):
train_dir, flush_millis=10000)
test_summary_writer = tf.contrib.summary.create_file_writer(
test_dir, flush_millis=10000, name='test')
checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
# Create and restore checkpoint (if one exists on the path)
checkpoint_prefix = os.path.join(FLAGS.model_dir, 'ckpt')
step_counter = tf.train.get_or_create_global_step()
checkpoint = tfe.Checkpoint(
model=model, optimizer=optimizer, step_counter=step_counter)
# Restore variables on creation if a checkpoint exists.
checkpoint.restore(tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
# Train and evaluate for 10 epochs.
checkpoint.restore(tf.train.latest_checkpoint(FLAGS.model_dir))
# Train and evaluate for a set number of epochs.
with tf.device(device):
for _ in range(10):
for _ in range(FLAGS.train_epochs):
start = time.time()
with summary_writer.as_default():
train(model, optimizer, train_ds, step_counter, FLAGS.log_interval)
@@ -148,54 +158,52 @@ def main(_):
checkpoint.save(checkpoint_prefix)
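The checkpoint dance above (create, restore-if-present, save each epoch) is self-contained enough to demo in isolation — a minimal sketch with a single variable standing in for the model and optimizer, and a hypothetical directory:
```python
# Minimal save/restore sketch (TF 1.x eager); `v` stands in for the
# model/optimizer variables tracked by the real tfe.Checkpoint above.
import os
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tfe.enable_eager_execution()
model_dir = '/tmp/eager_ckpt_demo'  # hypothetical directory

v = tfe.Variable(1.0, name='v')
step_counter = tf.train.get_or_create_global_step()
checkpoint = tfe.Checkpoint(v=v, step_counter=step_counter)

# A no-op on the first run; later runs resume from the saved values.
checkpoint.restore(tf.train.latest_checkpoint(model_dir))
v.assign_add(1.0)
checkpoint.save(os.path.join(model_dir, 'ckpt'))
```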
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--data_dir',
type=str,
default='/tmp/tensorflow/mnist/input_data',
help='Directory for storing input data')
parser.add_argument(
'--batch_size',
type=int,
default=100,
metavar='N',
help='input batch size for training (default: 100)')
parser.add_argument(
'--log_interval',
type=int,
default=10,
metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument(
'--output_dir',
type=str,
default=None,
metavar='N',
help='Directory to write TensorBoard summaries')
parser.add_argument(
'--checkpoint_dir',
type=str,
default='/tmp/tensorflow/mnist/checkpoints/',
metavar='N',
help='Directory to save checkpoints in (once per epoch)')
parser.add_argument(
'--lr',
type=float,
default=0.01,
metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument(
'--momentum',
type=float,
default=0.5,
metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument(
'--no_gpu',
action='store_true',
default=False,
help='disables GPU usage even if a GPU is available')
class MNISTEagerArgParser(argparse.ArgumentParser):
"""Argument parser for running MNIST model with eager trainng loop."""
def __init__(self):
super(MNISTEagerArgParser, self).__init__(parents=[
parsers.BaseParser(epochs_between_evals=False, multi_gpu=False,
hooks=False),
parsers.ImageModelParser()])
self.add_argument(
'--log_interval', '-li',
type=int,
default=10,
metavar='N',
help='[default: %(default)s] batches between logging training status')
self.add_argument(
'--output_dir', '-od',
type=str,
default=None,
metavar='<OD>',
help='[default: %(default)s] Directory to write TensorBoard summaries')
self.add_argument(
'--lr', '-lr',
type=float,
default=0.01,
metavar='<LR>',
help='[default: %(default)s] learning rate')
self.add_argument(
'--momentum', '-m',
type=float,
default=0.5,
metavar='<M>',
help='[default: %(default)s] SGD momentum')
self.add_argument(
'--no_gpu', '-nogpu',
action='store_true',
default=False,
help='disables GPU usage even if a GPU is available')
self.set_defaults(
data_dir='/tmp/tensorflow/mnist/input_data',
model_dir='/tmp/tensorflow/mnist/checkpoints/',
batch_size=100,
train_epochs=10,
)
if __name__ == '__main__':
parser = MNISTEagerArgParser()
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
@@ -216,7 +216,7 @@ def main(argv):
model_dir='/tmp/cifar10_model',
resnet_size=32,
train_epochs=250,
epochs_per_eval=10,
epochs_between_evals=10,
batch_size=128)
flags = parser.parse_args(args=argv[1:])
......
@@ -339,15 +339,16 @@ def resnet_main(flags, model_function, input_function):
'version': flags.version,
})
for _ in range(flags.train_epochs // flags.epochs_per_eval):
train_hooks = hooks_helper.get_train_hooks(flags.hooks, batch_size=flags.batch_size)
for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(flags.hooks,
batch_size=flags.batch_size)
print('Starting a training cycle.')
def input_fn_train():
return input_function(True, flags.data_dir, flags.batch_size,
flags.epochs_per_eval, flags.num_parallel_calls,
flags.multi_gpu)
flags.epochs_between_evals,
flags.num_parallel_calls, flags.multi_gpu)
classifier.train(input_fn=input_fn_train, hooks=train_hooks,
max_steps=flags.max_train_steps)
......
@@ -70,14 +70,14 @@ class BaseParser(argparse.ArgumentParser):
data_dir: Create a flag for specifying the input data directory.
model_dir: Create a flag for specifying the model file directory.
train_epochs: Create a flag to specify the number of training epochs.
epochs_per_eval: Create a flag to specify the frequency of testing.
epochs_between_evals: Create a flag to specify the frequency of testing.
batch_size: Create a flag to specify the batch size.
multi_gpu: Create a flag to allow the use of all available GPUs.
hooks: Create a flag to specify hooks for logging.
"""
def __init__(self, add_help=False, data_dir=True, model_dir=True,
train_epochs=True, epochs_per_eval=True, batch_size=True,
train_epochs=True, epochs_between_evals=True, batch_size=True,
multi_gpu=True, hooks=True):
super(BaseParser, self).__init__(add_help=add_help)
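The boolean constructor arguments are what let callers opt out of individual flags, as `MNISTEagerArgParser` does above with `epochs_between_evals=False` and friends. A toy sketch of the pattern, with made-up flags:
```python
# Toy version of BaseParser's toggleable-flag pattern: each boolean
# decides whether the corresponding flag is defined at all.
import argparse

class ToyBaseParser(argparse.ArgumentParser):
    def __init__(self, add_help=False, train_epochs=True,
                 epochs_between_evals=True):
        super(ToyBaseParser, self).__init__(add_help=add_help)
        if train_epochs:
            self.add_argument('--train_epochs', '-te', type=int, default=1)
        if epochs_between_evals:
            self.add_argument('--epochs_between_evals', '-ebe', type=int,
                              default=1)

# Opting out: the flag simply never exists on the resulting namespace.
flags = ToyBaseParser(add_help=True,
                      epochs_between_evals=False).parse_args([])
print(hasattr(flags, 'epochs_between_evals'))  # False
```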
@@ -91,7 +91,8 @@ class BaseParser(argparse.ArgumentParser):
if model_dir:
self.add_argument(
"--model_dir", "-md", default="/tmp",
help="[default: %(default)s] The location of the model files.",
help="[default: %(default)s] The location of the model checkpoint "
"files.",
metavar="<MD>",
)
@@ -102,12 +103,12 @@ class BaseParser(argparse.ArgumentParser):
metavar="<TE>"
)
if epochs_per_eval:
if epochs_between_evals:
self.add_argument(
"--epochs_per_eval", "-epe", type=int, default=1,
"--epochs_between_evals", "-ebe", type=int, default=1,
help="[default: %(default)s] The number of training epochs to run "
"between evaluations.",
metavar="<EPE>"
metavar="<EBE>"
)
if batch_size:
@@ -214,6 +215,8 @@ class ImageModelParser(argparse.ArgumentParser):
if data_format:
self.add_argument(
"--data_format", "-df",
default=None,
choices=['channels_first', 'channels_last'],
help="A flag to override the data format used in the model. "
"channels_first provides a performance boost on GPU but is not "
"always compatible with CPU. If left unspecified, the data "
......
@@ -42,7 +42,7 @@ class BaseTester(unittest.TestCase):
data_dir="dfgasf",
model_dir="dfsdkjgbs",
train_epochs=534,
epochs_per_eval=15,
epochs_between_evals=15,
batch_size=256,
hooks=["LoggingTensorHook"],
num_parallel_calls=18,
......
@@ -63,20 +63,25 @@ def get_train_hooks(name_list, **kwargs):
return train_hooks
def get_logging_tensor_hook(every_n_iter=100, **kwargs): # pylint: disable=unused-argument
def get_logging_tensor_hook(every_n_iter=100, tensors_to_log=None, **kwargs): # pylint: disable=unused-argument
"""Function to get LoggingTensorHook.
Args:
every_n_iter: `int`, print the values of `tensors` once every N local
steps taken on the current worker.
tensors_to_log: List of tensor names or dictionary mapping labels to tensor
names. If not set, log _TENSORS_TO_LOG by default.
kwargs: a dictionary of arguments to LoggingTensorHook.
Returns:
A LoggingTensorHook that prints the requested tensors (or the
standard _TENSORS_TO_LOG set by default) to stdout.
"""
if tensors_to_log is None:
tensors_to_log = _TENSORS_TO_LOG
return tf.train.LoggingTensorHook(
tensors=_TENSORS_TO_LOG,
tensors=tensors_to_log,
every_n_iter=every_n_iter)
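The `None`-sentinel default used here is the standard way to give a function a mutable (dict) default; a stripped-down sketch, with a stand-in for `_TENSORS_TO_LOG`:
```python
# Sketch of the None-sentinel idiom: the module-level default is
# substituted inside the function, avoiding the classic
# mutable-default-argument pitfall while letting callers pass their own.
_TENSORS_TO_LOG = {'learning_rate': 'learning_rate'}  # stand-in default

def get_logging_tensor_hook_sketch(every_n_iter=100, tensors_to_log=None):
    if tensors_to_log is None:
        tensors_to_log = _TENSORS_TO_LOG
    return tensors_to_log, every_n_iter
```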
......
@@ -44,7 +44,7 @@ def run_synthetic(main, tmp_root, extra_flags=None):
model_dir = tempfile.mkdtemp(dir=tmp_root)
args = [sys.argv[0], "--model_dir", model_dir, "--train_epochs", "1",
"--epochs_per_eval", "1", "--use_synthetic_data",
"--epochs_between_evals", "1", "--use_synthetic_data",
"--max_train_steps", "1"] + extra_flags
try:
......
@@ -15,6 +15,8 @@ The input function for the `Estimator` uses `tf.contrib.data.TextLineDataset`, w
The `Estimator` and `Dataset` APIs are both highly encouraged for fast development and efficient training.
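A rough sketch of what such an input function looks like; the column names and label handling below are an illustrative subset of the real wide_deep code, and `tf.data.TextLineDataset` is the non-contrib spelling of the class the README mentions:
```python
# Hedged sketch of an Estimator input_fn over a CSV text file.
import tensorflow as tf

_CSV_COLUMNS = ['age', 'income_bracket']  # illustrative subset
_CSV_COLUMN_DEFAULTS = [[0], ['']]

def input_fn(data_file, num_epochs, shuffle, batch_size):
    def parse_csv(value):
        columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
        features = dict(zip(_CSV_COLUMNS, columns))
        labels = features.pop('income_bracket')
        return features, tf.equal(labels, '>50K')

    dataset = tf.data.TextLineDataset(data_file)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    return dataset.map(parse_csv).repeat(num_epochs).batch(batch_size)
```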
## Running the code
Make sure you've added the `/models` folder to the Python path, as described at: https://github.com/tensorflow/models/tree/master/official#running-the-models
### Setup
The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) that this sample uses for training is hosted by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/). We have provided a script that downloads and cleans the necessary files.
......
@@ -18,11 +18,15 @@ from __future__ import division
from __future__ import print_function
import argparse
import os
import shutil
import sys
import tensorflow as tf
from official.utils.arg_parsers import parsers # pylint: disable=g-bad-import-order
from official.utils.logging import hooks_helper
_CSV_COLUMNS = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
@@ -33,34 +37,6 @@ _CSV_COLUMNS = [
_CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
[0], [0], [0], [''], ['']]
parser = argparse.ArgumentParser()
parser.add_argument(
'--model_dir', type=str, default='/tmp/census_model',
help='Base directory for the model.')
parser.add_argument(
'--model_type', type=str, default='wide_deep',
help="Valid model types: {'wide', 'deep', 'wide_deep'}.")
parser.add_argument(
'--train_epochs', type=int, default=40, help='Number of training epochs.')
parser.add_argument(
'--epochs_per_eval', type=int, default=2,
help='The number of training epochs to run between evaluations.')
parser.add_argument(
'--batch_size', type=int, default=40, help='Number of examples per batch.')
parser.add_argument(
'--train_data', type=str, default='/tmp/census_data/adult.data',
help='Path to the training data.')
parser.add_argument(
'--test_data', type=str, default='/tmp/census_data/adult.test',
help='Path to the test data.')
_NUM_EXAMPLES = {
'train': 32561,
'validation': 16281,
@@ -170,8 +146,8 @@ def build_estimator(model_dir, model_type):
def input_fn(data_file, num_epochs, shuffle, batch_size):
"""Generate an input function for the Estimator."""
assert tf.gfile.Exists(data_file), (
'%s not found. Please make sure you have either run data_download.py or '
'set both arguments --train_data and --test_data.' % data_file)
'%s not found. Please make sure you have run data_download.py and '
'set the --data_dir argument to the correct path.' % data_file)
def parse_csv(value):
print('Parsing', data_file)
@@ -200,23 +176,51 @@ def main(unused_argv):
shutil.rmtree(FLAGS.model_dir, ignore_errors=True)
model = build_estimator(FLAGS.model_dir, FLAGS.model_type)
# Train and evaluate the model every `FLAGS.epochs_per_eval` epochs.
for n in range(FLAGS.train_epochs // FLAGS.epochs_per_eval):
model.train(input_fn=lambda: input_fn(
FLAGS.train_data, FLAGS.epochs_per_eval, True, FLAGS.batch_size))
train_file = os.path.join(FLAGS.data_dir, 'adult.data')
test_file = os.path.join(FLAGS.data_dir, 'adult.test')
train_hooks = hooks_helper.get_train_hooks(
FLAGS.hooks, batch_size=FLAGS.batch_size,
tensors_to_log={'average_loss': 'head/truediv',
'loss': 'head/weighted_loss/Sum'})
# Train and evaluate the model every `FLAGS.epochs_between_evals` epochs.
for n in range(FLAGS.train_epochs // FLAGS.epochs_between_evals):
model.train(
input_fn=lambda: input_fn(train_file, FLAGS.epochs_between_evals, True,
FLAGS.batch_size),
hooks=train_hooks)
results = model.evaluate(input_fn=lambda: input_fn(
FLAGS.test_data, 1, False, FLAGS.batch_size))
test_file, 1, False, FLAGS.batch_size))
# Display evaluation metrics
print('Results at epoch', (n + 1) * FLAGS.epochs_per_eval)
print('Results at epoch', (n + 1) * FLAGS.epochs_between_evals)
print('-' * 60)
for key in sorted(results):
print('%s: %s' % (key, results[key]))
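One subtlety worth flagging: unlike the mnist model, a canned Estimator gives you no place to call `tf.identity`, so the hook must reference internal graph names such as `head/truediv` — implementation details that can shift between TensorFlow versions. Built by hand, the equivalent hook would look like:
```python
# Sketch of the hook the hooks_helper call above constructs; the tensor
# names are internal to the canned Estimator's graph, as noted.
import tensorflow as tf

tensors_to_log = {'average_loss': 'head/truediv',
                  'loss': 'head/weighted_loss/Sum'}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                          every_n_iter=100)
# model.train(input_fn=..., hooks=[logging_hook])
```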
class WideDeepArgParser(argparse.ArgumentParser):
"""Argument parser for running the wide deep model."""
def __init__(self):
super(WideDeepArgParser, self).__init__(parents=[parsers.BaseParser()])
self.add_argument(
'--model_type', '-mt', type=str, default='wide_deep',
choices=['wide', 'deep', 'wide_deep'],
help='[default: %(default)s] Valid model types: wide, deep, wide_deep.',
metavar='<MT>')
self.set_defaults(
data_dir='/tmp/census_data',
model_dir='/tmp/census_model',
train_epochs=40,
epochs_between_evals=2,
batch_size=40)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
parser = WideDeepArgParser()
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)