Commit 8b829873 authored by Eli Bixby

Move to argparse, plus some other modifications

parent d067ce0a
@@ -53,15 +53,13 @@ train.tfrecords validation.tfrecords eval.tfrecords
# Run the model on CPU only. After training, it runs the evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=True \
--num-gpus=0 \
--train-steps=1000
# Run the model on 2 GPUs using CPU as parameter server. After training, it runs the evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=2 \
--train-steps=1000
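Before picking a `--num-gpus` value, it can help to confirm how many GPUs TensorFlow actually sees. A minimal check, assuming a TF 1.x installation (this snippet is illustrative and not part of the commit):

```python
# Illustrative: count the GPUs visible to TensorFlow 1.x so you can
# pass a matching --num-gpus value to cifar10_main.py.
from tensorflow.python.client import device_lib

gpus = [d for d in device_lib.list_local_devices() if d.device_type == 'GPU']
print('Visible GPUs: %d' % len(gpus))
```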
@@ -70,8 +68,8 @@ $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-
# a couple of times to perform evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=False \
--force-gpu-compatible=True \
--avg-on-gpu \
--force-gpu-compatible \
--num-gpus=2 \
@@ -104,8 +102,7 @@ gcloud ml-engine jobs submit training cifarmultigpu \
--module-name cifar10_estimator.cifar10_main \
-- \
--data-dir=$MY_BUCKET/cifar-10-batches-py \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=1000
```
@@ -186,11 +183,10 @@ Once you have a `TF_CONFIG` configured properly on each host you're ready to run
# Make sure the model_dir is the same as defined in the TF_CONFIG.
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=40000 \
--sync=True \
--sync \
--num-workers=2
```
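For reference, a minimal sketch of what the `TF_CONFIG` on one host could look like; the hostnames, ports, and role layout below are placeholders, not values taken from this repo:

```python
# Illustrative TF_CONFIG for one machine in the cluster. Every host
# exports its own copy, differing only in the "task" entry.
import json
import os

tf_config = {
    'cluster': {
        'master': ['master-host:2222'],
        'ps': ['ps-host:2222'],
        'worker': ['worker0-host:2222', 'worker1-host:2222'],
    },
    'task': {'type': 'worker', 'index': 0},  # this process = first worker
}
os.environ['TF_CONFIG'] = json.dumps(tf_config)
```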
@@ -329,11 +325,10 @@ INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step =
# Make sure the model_dir is the same as defined in the TF_CONFIG.
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=40000 \
--sync=True
--sync
```
*Output:*
@@ -480,7 +475,7 @@ $ tensorboard --log-dir="sentiment_analysis_output"
## Warnings
When running `cifar10_main.py` with the `--sync=True` argument you may see an error similar to:
When running `cifar10_main.py` with the `--sync` argument you may see an error similar to:
```python
File "cifar10_main.py", line 538, in <module>
......
@@ -25,7 +25,6 @@ http://www.cs.toronto.edu/~kriz/cifar.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -42,8 +41,8 @@ from tensorflow.python.training import basic_session_run_hooks
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
from . import cifar10
from . import cifar10_model
import cifar10
import cifar10_model
tf.logging.set_verbosity(tf.logging.INFO)
@@ -192,9 +191,18 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
with tf.variable_scope('resnet', reuse=bool(i != 0)):
with tf.name_scope('tower_%d' % i) as name_scope:
with tf.device(device_setter):
_tower_fn(is_training, weight_decay, tower_features[i],
tower_labels[i], tower_losses, tower_gradvars,
tower_preds, False, params['num_layers'])
loss, gradvars, preds = _tower_fn(
is_training,
weight_decay,
tower_features[i],
tower_labels[i],
False,
params['num_layers'],
params['batch_norm_decay'],
params['batch_norm_epsilon'])
tower_losses.append(loss)
tower_gradvars.append(gradvars)
tower_preds.append(preds)
if i == 0:
# Only trigger batch_norm moving mean and variance update from
# the 1st tower. Ideally, we should grab the updates from all
@@ -206,8 +214,19 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
else:
with tf.variable_scope('resnet'), tf.device('/cpu:0'):
with tf.name_scope('tower_cpu') as name_scope:
_tower_fn(is_training, weight_decay, tower_features[0], tower_labels[0],
tower_losses, tower_gradvars, tower_preds, True)
loss, gradvars, preds = _tower_fn(
is_training,
weight_decay,
tower_features[0],
tower_labels[0],
True,
params['num_layers'],
params['batch_norm_decay'],
params['batch_norm_epsilon'])
tower_losses.append(loss)
tower_gradvars.append(gradvars)
tower_preds.append(preds)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
# Now compute global loss and gradients.
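With `_tower_fn` now returning its results instead of appending to shared lists, the caller aggregates them itself. A rough sketch of the averaging step over the collected `tower_gradvars` (an illustrative helper, not the repo's exact code):

```python
# Illustrative: average per-variable gradients across towers.
# Each tower contributes a list of (gradient, variable) pairs in the
# same order, so zip(*...) groups the pairs variable by variable.
import numpy as np

def average_gradvars(tower_gradvars):
    averaged = []
    for gradvar_group in zip(*tower_gradvars):
        grads = [g for g, _ in gradvar_group]
        _, var = gradvar_group[0]  # the variable is shared across towers
        averaged.append((np.mean(grads, axis=0), var))
    return averaged

# Two towers, two shared "variables" named w0 and w1.
tower_a = [(np.array([1.0, 2.0]), 'w0'), (np.array([0.5]), 'w1')]
tower_b = [(np.array([3.0, 4.0]), 'w0'), (np.array([1.5]), 'w1')]
print(average_gradvars([tower_a, tower_b]))
# -> [(array([2., 3.]), 'w0'), (array([1.]), 'w1')]
```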
@@ -281,10 +300,17 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
train_op=train_op,
training_chief_hooks=chief_hooks,
eval_metric_ops=metrics)
return _resnet_model_fn
def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
tower_gradvars, tower_preds, is_cpu, num_layers):
def _tower_fn(is_training,
weight_decay,
feature,
label,
is_cpu,
num_layers,
batch_norm_decay,
batch_norm_epsilon):
"""Build computation tower for each device (CPU or GPU).
Args:
@@ -299,13 +325,15 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
"""
data_format = 'channels_last' if is_cpu else 'channels_first'
model = cifar10_model.ResNetCifar10(
num_layers, is_training=is_training, data_format=data_format)
num_layers,
batch_norm_decay=batch_norm_decay,
batch_norm_epsilon=batch_norm_epsilon,
is_training=is_training, data_format=data_format)
logits = model.forward_pass(feature, input_data_format='channels_last')
tower_pred = {
'classes': tf.argmax(input=logits, axis=1),
'probabilities': tf.nn.softmax(logits)
}
tower_preds.append(tower_pred)
tower_loss = tf.losses.sparse_softmax_cross_entropy(
logits=logits, labels=label)
@@ -314,10 +342,10 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
model_params = tf.trainable_variables()
tower_loss += weight_decay * tf.add_n(
[tf.nn.l2_loss(v) for v in model_params])
tower_losses.append(tower_loss)
tower_grad = tf.gradients(tower_loss, model_params)
tower_gradvars.append(zip(tower_grad, model_params))
return tower_loss, tower_grad, tower_pred
def input_fn(data_dir, subset, num_shards, batch_size,
@@ -535,6 +563,7 @@ if __name__ == '__main__':
default=2e-4,
help='Weight decay for convolutions.'
)
parser.add_argument(
'--learning-rate',
type=float,
@@ -595,12 +624,24 @@ if __name__ == '__main__':
default=False,
help='Whether to log device placement.'
)
parser.add_argument(
'--batch_norm_decay',
type=float,
default=0.997,
help='Decay for batch norm.'
)
parser.add_argument(
'--batch_norm_epsilon',
type=float,
default=1e-5,
help='Epsilon for batch norm.'
)
args = parser.parse_args()
if args.num_gpus < 0:
raise ValueError(
'Invalid GPU count: \"num_gpus\" must be 0 or a positive integer.')
if args.num_gpus == 0 and not args.avg_on_gpu:
if args.num_gpus == 0 and args.avg_on_gpu:
raise ValueError(
'No GPU available for use, must use CPU to average gradients.')
if (args.num_layers - 2) % 6 != 0:
......
@@ -13,20 +13,29 @@
# limitations under the License.
# ==============================================================================
"""Model class for Cifar10 Dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from . import model_base
import model_base
class ResNetCifar10(model_base.ResNet):
"""Cifar10 model with ResNetV1 and basic residual block."""
def __init__(self, num_layers, is_training, data_format='channels_first'):
super(ResNetCifar10, self).__init__(is_training, data_format)
def __init__(self,
num_layers,
is_training,
batch_norm_decay,
batch_norm_epsilon,
data_format='channels_first'):
super(ResNetCifar10, self).__init__(
is_training,
data_format,
batch_norm_decay,
batch_norm_epsilon
)
self.n = (num_layers - 2) // 6
# Add one in case label starts with 1. No impact if label starts with 0.
self.num_classes = 10 + 1
......
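A sketch of the updated constructor in use; the values mirror the new argparse defaults, and `num_layers` must leave `(num_layers - 2)` divisible by 6:

```python
# Illustrative only; assumes cifar10_model.py is on the import path.
import cifar10_model

model = cifar10_model.ResNetCifar10(
    num_layers=44,                 # (44 - 2) % 6 == 0, so a valid depth
    is_training=True,
    batch_norm_decay=0.997,        # new argparse default
    batch_norm_epsilon=1e-5,       # new argparse default
    data_format='channels_first')
```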
@@ -23,18 +23,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
FLAGS = None
class ResNet(object):
"""ResNet model."""
def __init__(self, is_training, data_format):
def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon):
"""ResNet constructor.
Args:
@@ -42,6 +37,8 @@ class ResNet(object):
data_format: the data_format used during computation.
one of 'channels_first' or 'channels_last'.
"""
self._batch_norm_decay = batch_norm_decay
self._batch_norm_epsilon = batch_norm_epsilon
self._is_training = is_training
assert data_format in ('channels_first', 'channels_last')
self._data_format = data_format
@@ -185,10 +182,10 @@ class ResNet(object):
data_format = 'NHWC'
return tf.contrib.layers.batch_norm(
x,
decay=FLAGS.batch_norm_decay,
decay=self._batch_norm_decay,
center=True,
scale=True,
epsilon=FLAGS.batch_norm_epsilon,
epsilon=self._batch_norm_epsilon,
is_training=self._is_training,
fused=True,
data_format=data_format)
......