Commit 30fa4ebb authored by Eli Bixby

Intermediate commit for argparse move

parent 25fe395c
......@@ -34,8 +34,8 @@ data_batch_4 data_batch_5 readme.html test_batch
```shell
# This will generate TFRecord files for the training and test data available at the input_dir.
# You can see more details in generate_cifar10_tfrecords.py
$ python generate_cifar10_tfrecords.py --input_dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--output_dir=/prefix/to/downloaded/data/cifar-10-batches-py
$ python generate_cifar10_tfrecords.py --input-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--output-dir=/prefix/to/downloaded/data/cifar-10-batches-py
```
After running the command above, you should see the following new files in the output_dir.
......@@ -51,30 +51,30 @@ train.tfrecords validation.tfrecords eval.tfrecords
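If you want to sanity-check the conversion, here is a short sketch (assuming you run it from the output_dir and use the TF 1.x `tf.python_io` API) that counts the records in each generated file:

```python
import tensorflow as tf

# Count the serialized examples in each generated TFRecord file.
for name in ['train.tfrecords', 'validation.tfrecords', 'eval.tfrecords']:
    count = sum(1 for _ in tf.python_io.tf_record_iterator(name))
    print('%s: %d records' % (name, count))
```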
```
# Run the model on CPU only. After training, it runs the evaluation.
$ python cifar10_main.py --data_dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--model_dir=/tmp/cifar10 \
--is_cpu_ps=True \
--num_gpus=0 \
--train_steps=1000
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--num-gpus=0 \
--train-steps=1000
# Run the model on 2 GPUs using CPU as parameter server. After training, it runs the evaluation.
$ python cifar10_main.py --data_dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--model_dir=/tmp/cifar10 \
--is_cpu_ps=True \
--force_gpu_compatible=True \
--num_gpus=2 \
--train_steps=1000
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--force-gpu-compatible \
--num-gpus=2 \
--train-steps=1000
# Run the model on 2 GPUs using GPU as parameter server.
# It will run an experiment, which in a local setting basically means it will
# stop training a couple of times to perform evaluation.
$ python cifar10_main.py --data_dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
--model_dir=/tmp/cifar10 \
--is_cpu_ps=False \
--force_gpu_compatible=True \
--num_gpus=2 \
--train_steps=1000
--run_experiment=True
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
--job-dir=/tmp/cifar10 \
--avg-on-gpu \
--force-gpu-compatible \
--num-gpus=2 \
--train-steps=1000
# There are more command line flags to play with; check cifar10_main.py for details.
```
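Note that after the move to argparse, boolean options such as `--sync`, `--avg-on-gpu` and `--force-gpu-compatible` are presence flags: pass the bare flag to enable the behavior and omit it to disable it (the `--flag=True` form used by the old `tf.flags` definitions is rejected). A minimal sketch of the assumed semantics:

```python
import argparse

# store_true flags default to False and flip to True when present.
parser = argparse.ArgumentParser()
parser.add_argument('--sync', action='store_true', default=False)
parser.add_argument('--avg-on-gpu', action='store_true', default=False)

print(parser.parse_args([]))          # Namespace(avg_on_gpu=False, sync=False)
print(parser.parse_args(['--sync']))  # Namespace(avg_on_gpu=False, sync=True)
```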
......@@ -105,13 +105,13 @@ gcloud ml-engine jobs submit training cifarmultigpu \
--region us-central1 \
--module-name cifar10_estimator.cifar10_main \
-- \
--data_dir=$MY_BUCKET/cifar-10-batches-py \
--model_dir=$MY_BUCKET/model_dirs/cifarmultigpu \
--is_cpu_ps=True \
--force_gpu_compatible=True \
--num_gpus=4 \
--train_steps=1000 \
--run_experiment=True
--data-dir=$MY_BUCKET/cifar-10-batches-py \
--job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=1000 \
```
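The distributed runs below assume every host exports a `TF_CONFIG` environment variable describing the cluster and the host's own role. A minimal sketch in Python (the hostnames and ports are hypothetical; adapt the cluster spec to your machines):

```python
import json
import os

# Identify this process as worker 0 in a cluster with one master, one ps
# and two workers.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['master-host:2222'],
        'ps': ['ps-host:2222'],
        'worker': ['worker-host-0:2222', 'worker-host-1:2222'],
    },
    'task': {'type': 'worker', 'index': 0},
    'environment': 'cloud',
})
```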
......@@ -188,15 +188,15 @@ Once you have a `TF_CONFIG` configured properly on each host you're ready to run
# It will run evaluation a couple of times during training.
# The num_workers argument is used only to update the learning rate correctly.
# Make sure the model_dir is the same as defined in the TF_CONFIG.
$ python cifar10_main.py --data_dir=gs://path/cifar-10-batches-py \
--model_dir=gs://path/model_dir/ \
--is_cpu_ps=True \
--force_gpu_compatible=True \
--num_gpus=4 \
--train_steps=40000 \
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=40000 \
--sync=True \
--run_experiment=True \
--num_workers=2
--sync \
--num-workers=2
```
*Output:*
......@@ -331,14 +331,13 @@ INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step =
# Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for 40000 steps.
# It will run evaluation a couple of times during training.
# Make sure the model_dir is the same as defined in the TF_CONFIG.
$ python cifar10_main.py --data_dir=gs://path/cifar-10-batches-py \
--model_dir=gs://path/model_dir/ \
--is_cpu_ps=True \
--force_gpu_compatible=True \
--num_gpus=4 \
--train_steps=40000 \
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=40000 \
--sync
--run_experiment=True
```
*Output:*
......@@ -447,7 +446,7 @@ INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec)
```shell
# Run this on ps:
# The ps will not do training, so most of the arguments won't affect the execution
$ python cifar10_main.py --run_experiment=True --model_dir=gs://path/model_dir/
$ python cifar10_main.py --job-dir=gs://path/model_dir/
# There are more command line flags to play with; check cifar10_main.py for details.
```
......@@ -480,7 +479,7 @@ You'll see something similar to this if you "point" TensorBoard to the `model_di
# Check TensorBoard during training or after it.
# Just point TensorBoard to the model_dir you chose on the previous step
# by default the model_dir is "sentiment_analysis_output"
$ tensorboard --log_dir="sentiment_analysis_output"
$ tensorboard --logdir="sentiment_analysis_output"
```
## Warnings
......
......@@ -29,13 +29,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import operator
import os
from . import cifar10
from . import cifar10_model
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
......@@ -44,84 +42,11 @@ from tensorflow.python.training import basic_session_run_hooks
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
from . import cifar10
from . import cifar10_model
tf.logging.set_verbosity(tf.logging.INFO)
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('data_dir', '',
'The directory where the CIFAR-10 input data is stored.')
tf.flags.DEFINE_string('model_dir', '',
'The directory where the model will be stored.')
tf.flags.DEFINE_boolean('is_cpu_ps', True,
'If using CPU as the parameter server.')
tf.flags.DEFINE_integer('num_gpus', 1,
'The number of gpus used. Uses only CPU if set to 0.')
tf.flags.DEFINE_integer('num_layers', 44, 'The number of layers of the model.')
tf.flags.DEFINE_integer('train_steps', 80000,
'The number of steps to use for training.')
tf.flags.DEFINE_integer('train_batch_size', 128, 'Batch size for training.')
tf.flags.DEFINE_integer('eval_batch_size', 100, 'Batch size for validation.')
tf.flags.DEFINE_float('momentum', 0.9, 'Momentum for MomentumOptimizer.')
tf.flags.DEFINE_float('weight_decay', 2e-4, 'Weight decay for convolutions.')
tf.flags.DEFINE_float('learning_rate', 0.1,
'This is the initial learning rate value.'
' The learning rate will decrease during training.'
' For more details check the model_fn implementation'
' in this file.')
tf.flags.DEFINE_boolean('use_distortion_for_training', True,
'If doing image distortion for training.')
tf.flags.DEFINE_boolean('run_experiment', False,
'If True will run an experiment,'
' otherwise will run training and evaluation'
' using the estimator interface.'
' Experiments perform training on several workers in'
' parallel, in other words experiments know how to'
' invoke train and eval in a sensible fashion for'
' distributed training.')
tf.flags.DEFINE_boolean('sync', False,
'If true when running in a distributed environment'
' will run on sync mode.')
tf.flags.DEFINE_integer('num_workers', 1, 'Number of workers.')
# Perf flags
tf.flags.DEFINE_integer('num_intra_threads', 1,
'Number of threads to use for intra-op parallelism.'
' If set to 0, the system will pick an appropriate number.'
' The default is 1 since in this example CPU only handles'
' the input pipeline and gradient aggregation (when'
' --is_cpu_ps). Ops that could potentially benefit'
' from intra-op parallelism are scheduled to run on GPUs.')
tf.flags.DEFINE_integer('num_inter_threads', 0,
'Number of threads to use for inter-op'
' parallelism. If set to 0, the system will pick'
' an appropriate number.')
tf.flags.DEFINE_boolean('force_gpu_compatible', False,
'Whether to enable force_gpu_compatible in'
' GPU_Options. Check'
' tensorflow/core/protobuf/config.proto#L69'
' for details.')
# Debugging flags
tf.flags.DEFINE_boolean('log_device_placement', False,
'Whether to log device placement.')
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
"""Hook to print out examples per second.
......@@ -221,147 +146,145 @@ class GpuParamServerDeviceSetter(object):
return device_name
def _create_device_setter(is_cpu_ps, worker, num_gpus):
def _create_device_setter(avg_on_gpu, worker, num_gpus):
"""Create device setter object."""
if is_cpu_ps:
if avg_on_gpu:
gpus = ['/gpu:%d' % i for i in range(num_gpus)]
return GpuParamServerDeviceSetter(worker, gpus)
else:
# tf.train.replica_device_setter supports placing variables on the CPU, all
# on one GPU, or on ps_servers defined in a cluster_spec.
return tf.train.replica_device_setter(
worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
else:
gpus = ['/gpu:%d' % i for i in range(num_gpus)]
return GpuParamServerDeviceSetter(worker, gpus)
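# Illustrative usage sketch (hypothetical): with num_gpus=2 and avg_on_gpu
# set, variables created under the returned setter are spread across
# ['/gpu:0', '/gpu:1'] while the ops themselves stay on the worker device:
#
#   device_setter = _create_device_setter(
#       avg_on_gpu=True, worker='/gpu:0', num_gpus=2)
#   with tf.device(device_setter):
#     v = tf.get_variable('w', shape=[10])  # placed on one of the two GPUs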
def _resnet_model_fn(features, labels, mode):
"""Resnet model body.
def get_model_fn(num_gpus, avg_on_gpu, num_workers):
def _resnet_model_fn(features, labels, mode, params):
"""Resnet model body.
Support single host, one or more GPU training. Parameter distribution can be
either one of the following scheme.
1. CPU is the parameter server and manages gradient updates.
2. Parameters are distributed evenly across all GPUs, and the first GPU
manages gradient updates.
Support single host, one or more GPU training. Parameter distribution can be
either one of the following scheme.
1. CPU is the parameter server and manages gradient updates.
2. Parameters are distributed evenly across all GPUs, and the first GPU
manages gradient updates.
Args:
features: a list of tensors, one for each tower
labels: a list of tensors, one for each tower
mode: ModeKeys.TRAIN or EVAL
Returns:
An EstimatorSpec object.
"""
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
is_cpu_ps = FLAGS.is_cpu_ps
num_gpus = FLAGS.num_gpus
weight_decay = FLAGS.weight_decay
momentum = FLAGS.momentum
tower_features = features
tower_labels = labels
tower_losses = []
tower_gradvars = []
tower_preds = []
if num_gpus != 0:
for i in range(num_gpus):
worker = '/gpu:%d' % i
device_setter = _create_device_setter(is_cpu_ps, worker, FLAGS.num_gpus)
with tf.variable_scope('resnet', reuse=bool(i != 0)):
with tf.name_scope('tower_%d' % i) as name_scope:
with tf.device(device_setter):
_tower_fn(is_training, weight_decay, tower_features[i],
tower_labels[i], tower_losses, tower_gradvars,
tower_preds, False)
if i == 0:
# Only trigger batch_norm moving mean and variance update from the
# 1st tower. Ideally, we should grab the updates from all towers
# but these stats accumulate extremely fast so we can ignore the
# other stats from the other towers without significant detriment.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
name_scope)
else:
with tf.variable_scope('resnet'), tf.device('/cpu:0'):
with tf.name_scope('tower_cpu') as name_scope:
_tower_fn(is_training, weight_decay, tower_features[0], tower_labels[0],
tower_losses, tower_gradvars, tower_preds, True)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
# Now compute global loss and gradients.
gradvars = []
# parameter server here isn't necessarily one server storing the model params.
# (For gpu-as-ps case, model params are distributed evenly across all gpus.)
# It's the server that runs the ops to apply global gradient updates.
ps_device = '/cpu:0' if is_cpu_ps else '/gpu:0'
with tf.device(ps_device):
with tf.name_scope('gradient_averaging'):
loss = tf.reduce_mean(tower_losses, name='loss')
for zipped_gradvars in zip(*tower_gradvars):
# Averaging one var's gradients computed from multiple towers
var = zipped_gradvars[0][1]
grads = [gv[0] for gv in zipped_gradvars]
with tf.device(var.device):
if len(grads) == 1:
avg_grad = grads[0]
else:
avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
gradvars.append((avg_grad, var))
# Suggested learning rate scheduling from
# https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
# users could apply other scheduling.
num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
'train') // (FLAGS.train_batch_size * FLAGS.num_workers)
boundaries = [
num_batches_per_epoch * x
for x in np.array([82, 123, 300], dtype=np.int64)
]
staged_lr = [FLAGS.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
boundaries, staged_lr)
# Create a nicely-named tensor for logging
learning_rate = tf.identity(learning_rate, name='learning_rate')
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate, momentum=momentum)
chief_hooks = []
if FLAGS.sync:
optimizer = tf.train.SyncReplicasOptimizer(
optimizer,
replicas_to_aggregate=FLAGS.num_workers)
sync_replicas_hook = optimizer.make_session_run_hook(True)
chief_hooks.append(sync_replicas_hook)
# Create single grouped train op
train_op = [
optimizer.apply_gradients(
gradvars, global_step=tf.train.get_global_step())
]
train_op.extend(update_ops)
train_op = tf.group(*train_op)
predictions = {
'classes':
tf.concat([p['classes'] for p in tower_preds], axis=0),
'probabilities':
tf.concat([p['probabilities'] for p in tower_preds], axis=0)
}
stacked_labels = tf.concat(labels, axis=0)
metrics = {
'accuracy': tf.metrics.accuracy(stacked_labels, predictions['classes'])
}
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op,
training_chief_hooks=chief_hooks,
eval_metric_ops=metrics)
Args:
features: a list of tensors, one for each tower
labels: a list of tensors, one for each tower
mode: ModeKeys.TRAIN or EVAL
params: a dictionary of hyperparameters, suitable for tuning
Returns:
An EstimatorSpec object.
"""
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
weight_decay = params['weight_decay']
momentum = params['momentum']
tower_features = features
tower_labels = labels
tower_losses = []
tower_gradvars = []
tower_preds = []
if num_gpus != 0:
for i in range(num_gpus):
worker = '/gpu:%d' % i
device_setter = _create_device_setter(avg_on_gpu, worker, num_gpus)
with tf.variable_scope('resnet', reuse=bool(i != 0)):
with tf.name_scope('tower_%d' % i) as name_scope:
with tf.device(device_setter):
_tower_fn(is_training, weight_decay, tower_features[i],
tower_labels[i], tower_losses, tower_gradvars,
tower_preds, False, params['num_layers'])
if i == 0:
# Only trigger batch_norm moving mean and variance update from
# the 1st tower. Ideally, we should grab the updates from all
# towers but these stats accumulate extremely fast so we can
# ignore the other stats from the other towers without
# significant detriment.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
name_scope)
else:
with tf.variable_scope('resnet'), tf.device('/cpu:0'):
with tf.name_scope('tower_cpu') as name_scope:
_tower_fn(is_training, weight_decay, tower_features[0], tower_labels[0],
tower_losses, tower_gradvars, tower_preds, True, params['num_layers'])
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
# Now compute global loss and gradients.
gradvars = []
# Server that runs the ops to apply global gradient updates.
avg_device = '/gpu:0' if avg_on_gpu else '/cpu:0'
with tf.device(avg_device):
with tf.name_scope('gradient_averaging'):
loss = tf.reduce_mean(tower_losses, name='loss')
for zipped_gradvars in zip(*tower_gradvars):
# Averaging one var's gradients computed from multiple towers
var = zipped_gradvars[0][1]
grads = [gv[0] for gv in zipped_gradvars]
with tf.device(var.device):
if len(grads) == 1:
avg_grad = grads[0]
else:
avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
gradvars.append((avg_grad, var))
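# E.g. with two towers contributing gradients g0 and g1 for a variable, the
# gradient actually applied is (g0 + g1) / 2, computed on the device that
# owns that variable.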
# Suggested learning rate scheduling from
# https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
# users could apply other scheduling.
num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
'train') // (params['train_batch_size'] * num_workers)
boundaries = [
num_batches_per_epoch * x
for x in np.array([82, 123, 300], dtype=np.int64)
]
staged_lr = [params['learning_rate'] * x for x in [1, 0.1, 0.01, 0.002]]
learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
boundaries, staged_lr)
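# The boundaries above are epochs 82, 123 and 300 expressed in global steps,
# so staged_lr applies 1x, 0.1x, 0.01x and 0.002x of the initial learning
# rate over the resulting intervals.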
# Create a nicely-named tensor for logging
learning_rate = tf.identity(learning_rate, name='learning_rate')
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate, momentum=momentum)
chief_hooks = []
if params['sync']:
optimizer = tf.train.SyncReplicasOptimizer(
optimizer,
replicas_to_aggregate=num_workers)
sync_replicas_hook = optimizer.make_session_run_hook(True)
chief_hooks.append(sync_replicas_hook)
# Create single grouped train op
train_op = [
optimizer.apply_gradients(
gradvars, global_step=tf.train.get_global_step())
]
train_op.extend(update_ops)
train_op = tf.group(*train_op)
predictions = {
'classes':
tf.concat([p['classes'] for p in tower_preds], axis=0),
'probabilities':
tf.concat([p['probabilities'] for p in tower_preds], axis=0)
}
stacked_labels = tf.concat(labels, axis=0)
metrics = {
'accuracy': tf.metrics.accuracy(stacked_labels, predictions['classes'])
}
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op,
training_chief_hooks=chief_hooks,
eval_metric_ops=metrics)
def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
tower_gradvars, tower_preds, is_cpu):
tower_gradvars, tower_preds, is_cpu, num_layers):
"""Build computation tower for each device (CPU or GPU).
Args:
......@@ -376,7 +299,7 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
"""
data_format = 'channels_last' if is_cpu else 'channels_first'
model = cifar10_model.ResNetCifar10(
FLAGS.num_layers, is_training=is_training, data_format=data_format)
num_layers, is_training=is_training, data_format=data_format)
logits = model.forward_pass(feature, input_data_format='channels_last')
tower_pred = {
'classes': tf.argmax(input=logits, axis=1),
......@@ -397,7 +320,8 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
tower_gradvars.append(zip(tower_grad, model_params))
def input_fn(subset, num_shards):
def input_fn(data_dir, subset, num_shards, batch_size,
use_distortion_for_training=True):
"""Create input graph for model.
Args:
......@@ -406,16 +330,9 @@ def input_fn(subset, num_shards):
Returns:
two lists of tensors for features and labels, each of num_shards length.
"""
if subset == 'train':
batch_size = FLAGS.train_batch_size
elif subset == 'validate' or subset == 'eval':
batch_size = FLAGS.eval_batch_size
else:
raise ValueError('Subset must be one of \'train\''
', \'validate\' and \'eval\'')
with tf.device('/cpu:0'):
use_distortion = subset == 'train' and FLAGS.use_distortion_for_training
dataset = cifar10.Cifar10DataSet(FLAGS.data_dir, subset, use_distortion)
use_distortion = subset == 'train' and use_distortion_for_training
dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion)
image_batch, label_batch = dataset.make_batch(batch_size)
if num_shards <= 1:
# No GPU available or only 1 GPU.
......@@ -439,20 +356,73 @@ def input_fn(subset, num_shards):
# create experiment
def get_experiment_fn(train_input_fn, eval_input_fn, train_steps, eval_steps,
train_hooks):
def get_experiment_fn(data_dir, num_gpus, avg_on_gpu,
use_distortion_for_training=True):
"""Returns an Experiment function.
Experiments perform training on several workers in parallel,
in other words experiments know how to invoke train and eval in a sensible
fashion for distributed training.
fashion for distributed training. Arguments passed directly to this
function are not tunable; all other arguments should be passed via the
tf.contrib.training.HParams object given to the enclosed function.
Args:
data_dir: str. Location of the data for input_fns.
num_gpus: int. Number of GPUs on each worker.
avg_on_gpu: bool. If true, average gradients on the GPUs.
use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
Returns:
A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
tf.contrib.learn.Experiment.
Suitable for use by tf.contrib.learn.learn_runner, which will run various
methods on Experiment (train, evaluate) based on information
about the current runner in `run_config`.
"""
def _experiment_fn(run_config, hparams):
"""Returns an Experiment."""
del hparams # Unused arg.
# Create estimator.
classifier = tf.estimator.Estimator(model_fn=_resnet_model_fn,
config=run_config)
train_input_fn = functools.partial(
input_fn,
data_dir,
subset='train',
num_shards=num_gpus,
batch_size=hparams.train_batch_size,
use_distortion_for_training=use_distortion_for_training
)
eval_input_fn = functools.partial(
input_fn,
data_dir,
subset='eval',
batch_size=hparams.eval_batch_size,
num_shards=num_gpus
)
num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
if num_eval_examples % hparams.eval_batch_size != 0:
raise ValueError('validation set size must be a multiple of eval_batch_size')
train_steps = hparams.train_steps
eval_steps = num_eval_examples // hparams.eval_batch_size
examples_sec_hook = ExamplesPerSecondHook(
hparams.train_batch_size, every_n_steps=10)
tensors_to_log = {'learning_rate': 'learning_rate',
'loss': 'gradient_averaging/loss'}
logging_hook = tf.train.LoggingTensorHook(
tensors=tensors_to_log, every_n_iter=100)
hooks = [logging_hook, examples_sec_hook]
classifier = tf.estimator.Estimator(
model_fn=get_model_fn(
num_gpus, avg_on_gpu, run_config.num_worker_replicas),
config=run_config,
params=vars(hparams)
)
# Create experiment.
experiment = tf.contrib.learn.Experiment(
classifier,
......@@ -461,89 +431,189 @@ def get_experiment_fn(train_input_fn, eval_input_fn, train_steps, eval_steps,
train_steps=train_steps,
eval_steps=eval_steps)
# Adding hooks to be used by the estimator on training mode.
experiment.extend_train_hooks(train_hooks)
experiment.extend_train_hooks(hooks)
return experiment
return _experiment_fn
def main(unused_argv):
def main(job_dir,
data_dir,
num_gpus,
avg_on_gpu,
use_distortion_for_training,
log_device_placement,
num_intra_threads,
force_gpu_compatible,
**hparams):
# The env variable is on a deprecation path; the default is set to off.
os.environ['TF_SYNC_ON_FINISH'] = '0'
if FLAGS.num_gpus < 0:
raise ValueError(
'Invalid GPU count: \"num_gpus\" must be 0 or a positive integer.')
if FLAGS.num_gpus == 0 and not FLAGS.is_cpu_ps:
raise ValueError(
'No GPU available for use, must use CPU as parameter server.')
if (FLAGS.num_layers - 2) % 6 != 0:
raise ValueError('Invalid num_layers parameter.')
if FLAGS.num_gpus != 0 and FLAGS.train_batch_size % FLAGS.num_gpus != 0:
raise ValueError('train_batch_size must be multiple of num_gpus.')
if FLAGS.num_gpus != 0 and FLAGS.eval_batch_size % FLAGS.num_gpus != 0:
raise ValueError('eval_batch_size must be multiple of num_gpus.')
num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
if num_eval_examples % FLAGS.eval_batch_size != 0:
raise ValueError('validation set size must be multiple of eval_batch_size')
train_input_fn = functools.partial(input_fn, subset='train',
num_shards=FLAGS.num_gpus)
eval_input_fn = functools.partial(input_fn, subset='eval',
num_shards=FLAGS.num_gpus)
train_steps = FLAGS.train_steps
eval_steps = num_eval_examples // FLAGS.eval_batch_size
# Session configuration.
sess_config = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement,
intra_op_parallelism_threads=FLAGS.num_intra_threads,
inter_op_parallelism_threads=FLAGS.num_inter_threads,
log_device_placement=log_device_placement,
intra_op_parallelism_threads=num_intra_threads,
gpu_options=tf.GPUOptions(
force_gpu_compatible=FLAGS.force_gpu_compatible
force_gpu_compatible=force_gpu_compatible
)
)
# Hooks that add extra logging that is useful to see the loss more often in
# the console as well as examples per second.
tensors_to_log = {'learning_rate': 'learning_rate',
'loss': 'gradient_averaging/loss'}
logging_hook = tf.train.LoggingTensorHook(
tensors=tensors_to_log, every_n_iter=100)
examples_sec_hook = ExamplesPerSecondHook(
FLAGS.train_batch_size, every_n_steps=10)
hooks = [logging_hook, examples_sec_hook]
config = tf.contrib.learn.RunConfig(
session_config=sess_config,
model_dir=job_dir)
tf.contrib.learn.learn_runner.run(
get_experiment_fn(
data_dir,
num_gpus,
avg_on_gpu,
use_distortion_for_training
),
run_config=config,
hparams=tf.contrib.training.HParams(**hparams)
)
if FLAGS.run_experiment:
config = tf.contrib.learn.RunConfig(model_dir=FLAGS.model_dir)
config = config.replace(session_config=sess_config)
tf.contrib.learn.learn_runner.run(
get_experiment_fn(train_input_fn, eval_input_fn,
train_steps, eval_steps,
hooks), run_config=config)
else:
config = tf.estimator.RunConfig()
config = config.replace(session_config=sess_config)
classifier = tf.estimator.Estimator(
model_fn=_resnet_model_fn, model_dir=FLAGS.model_dir, config=config)
print('Starting to train...')
classifier.train(input_fn=train_input_fn,
steps=train_steps,
hooks=hooks)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--data-dir',
type=str,
required=True,
help='The directory where the CIFAR-10 input data is stored.'
)
parser.add_argument(
'--job-dir',
type=str,
required=True,
help='The directory where the model will be stored.'
)
parser.add_argument(
'--avg-on-gpu',
action='store_true',
default=False,
help='If present, use GPU to average gradients.'
)
parser.add_argument(
'--num-gpus',
type=int,
default=1,
help='The number of gpus used. Uses only CPU if set to 0.'
)
parser.add_argument(
'--num-layers',
type=int,
default=44,
help='The number of layers of the model.'
)
parser.add_argument(
'--train-steps',
type=int,
default=80000,
help='The number of steps to use for training.'
)
parser.add_argument(
'--train-batch-size',
type=int,
default=128,
help='Batch size for training.'
)
parser.add_argument(
'--eval-batch-size',
type=int,
default=100,
help='Batch size for validation.'
)
parser.add_argument(
'--momentum',
type=float,
default=0.9,
help='Momentum for MomentumOptimizer.'
)
parser.add_argument(
'--weight-decay',
type=float,
default=2e-4,
help='Weight decay for convolutions.'
)
parser.add_argument(
'--learning-rate',
type=float,
default=0.1,
help="""\
This is the initial learning rate value. The learning rate will decrease
during training. For more details check the model_fn implementation in
this file.\
"""
)
parser.add_argument(
'--use-distortion-for-training',
# argparse's type=bool treats any non-empty string (e.g. 'False') as True,
# so parse the boolean explicitly.
type=lambda v: v.lower() in ('true', '1'),
default=True,
help='If doing image distortion for training.'
)
parser.add_argument(
'--sync',
action='store_true',
default=False,
help="""\
If present, when running in a distributed environment, run in sync mode.\
"""
)
parser.add_argument(
'--num-workers',
type=int,
default=1,
help='Number of workers.'
)
parser.add_argument(
'--num-intra-threads',
type=int,
default=1,
help="""\
Number of threads to use for intra-op parallelism. If set to 0, the
system will pick an appropriate number. The default is 1 since in this
example the CPU only handles the input pipeline and gradient aggregation
(when --avg-on-gpu is not set). Ops that could potentially benefit from
intra-op parallelism are scheduled to run on GPUs.\
"""
)
parser.add_argument(
'--num-inter-threads',
type=int,
default=0,
help="""\
Number of threads to use for inter-op parallelism. If set to 0, the
system will pick an appropriate number.\
"""
)
parser.add_argument(
'--force-gpu-compatible',
action='store_true',
default=False,
help="""\
Whether to enable force_gpu_compatible in GPUOptions. Check
tensorflow/core/protobuf/config.proto#L69 for details.\
"""
)
parser.add_argument(
'--log-device-placement',
action='store_true',
default=False,
help='Whether to log device placement.'
)
args = parser.parse_args()
print('Starting to evaluate...')
eval_results = classifier.evaluate(
input_fn=eval_input_fn,
steps=eval_steps)
print(eval_results)
if args.num_gpus < 0:
raise ValueError(
'Invalid GPU count: \"num_gpus\" must be 0 or a positive integer.')
if args.num_gpus == 0 and not args.avg_on_gpu:
raise ValueError(
'No GPU available for use, must use CPU to average gradients.')
if (args.num_layers - 2) % 6 != 0:
raise ValueError('Invalid num_layers parameter.')
if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
raise ValueError('train_batch_size must be a multiple of num_gpus.')
if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
raise ValueError('eval_batch_size must be a multiple of num_gpus.')
if __name__ == '__main__':
tf.app.run()
main(**vars(args))
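# Note: argparse converts the dashes in option names to underscores, so
# --train-steps is exposed as args.train_steps. main(**vars(args)) therefore
# binds the explicitly named parameters (job_dir, data_dir, num_gpus, ...)
# and gathers the remaining flags (train_steps, momentum, ...) into **hparams.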
......@@ -22,19 +22,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import cPickle
import os
import tensorflow as tf
FLAGS = tf.flags.FLAGS
FLAGS = None
tf.flags.DEFINE_string('input_dir', '',
'Directory where CIFAR10 data is located.')
tf.flags.DEFINE_string('output_dir', '',
'Directory where TFRecords will be saved.'
'The TFRecords will have the same name as'
' the CIFAR10 inputs + .tfrecords.')
def _int64_feature(value):
......@@ -91,4 +87,22 @@ def main(unused_argv):
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--input-dir',
type=str,
default='',
help='Directory where CIFAR10 data is located.'
)
parser.add_argument(
'--output-dir',
type=str,
default='',
help="""\
Directory where TFRecords will be saved. The TFRecords will have the same
name as the CIFAR10 inputs + .tfrecords.\
"""
)
FLAGS = parser.parse_args()
tf.app.run(main)
......@@ -23,12 +23,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
FLAGS = tf.flags.FLAGS
FLAGS = None
tf.flags.DEFINE_float('batch_norm_decay', 0.997, 'Decay for batch norm.')
tf.flags.DEFINE_float('batch_norm_epsilon', 1e-5, 'Epsilon for batch norm.')
class ResNet(object):
......