Commit b8e7ff1c authored by Toby Boyd

Merge branch 'master' of github.com:tensorflow/models

parents 17ef7c7e aae631cc
@@ -29,7 +29,6 @@ from __future__ import division
 from __future__ import print_function
 import argparse
-import collections
 import functools
 import itertools
 import os
@@ -47,7 +46,7 @@ import cifar10_utils
 tf.logging.set_verbosity(tf.logging.INFO)
-def get_model_fn(num_gpus, variable_strategy, num_workers):
+def get_model_fn(num_gpus, variable_strategy, num_workers, sync):
   def _resnet_model_fn(features, labels, mode, params):
     """Resnet model body.
@@ -61,13 +60,13 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
       features: a list of tensors, one for each tower
       labels: a list of tensors, one for each tower
       mode: ModeKeys.TRAIN or EVAL
-      params: Dictionary of Hyperparameters suitable for tuning
+      params: Hyperparameters suitable for tuning
     Returns:
       A EstimatorSpec object.
     """
     is_training = (mode == tf.estimator.ModeKeys.TRAIN)
-    weight_decay = params['weight_decay']
-    momentum = params['momentum']
+    weight_decay = params.weight_decay
+    momentum = params.momentum
     tower_features = features
     tower_labels = labels
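Note (not part of the diff): the move from params['weight_decay'] to params.weight_decay works because the Estimator is now handed the tf.contrib.training.HParams object itself rather than vars(hparams) (see the params=hparams change further down). A minimal sketch of the two access styles, with illustrative values only:

    hparams = tf.contrib.training.HParams(weight_decay=2e-4, momentum=0.9)
    hparams.weight_decay          # attribute access, as used in the new code
    hparams.values()['momentum']  # dict-style access is still available via values()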
@@ -105,9 +104,9 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
             tower_features[i],
             tower_labels[i],
             (device_type == 'cpu'),
-            params['num_layers'],
-            params['batch_norm_decay'],
-            params['batch_norm_epsilon'])
+            params.num_layers,
+            params.batch_norm_decay,
+            params.batch_norm_epsilon)
         tower_losses.append(loss)
         tower_gradvars.append(gradvars)
         tower_preds.append(preds)
@@ -143,14 +142,13 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
     with tf.device(consolidation_device):
       # Suggested learning rate scheduling from
       # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
-      # users could apply other scheduling.
       num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
-          'train') // (params['train_batch_size'] * num_workers)
+          'train') // (params.train_batch_size * num_workers)
       boundaries = [
           num_batches_per_epoch * x
           for x in np.array([82, 123, 300], dtype=np.int64)
       ]
-      staged_lr = [params['learning_rate'] * x for x in [1, 0.1, 0.01, 0.002]]
+      staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
       learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
                                                   boundaries, staged_lr)
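For context, tf.train.piecewise_constant takes the global step, N boundaries, and N+1 values, so the schedule above drops the learning rate after epochs 82, 123, and 300. A small standalone sketch with made-up step counts:

    global_step = tf.train.get_or_create_global_step()
    # 0.1 until step 1000, then 0.01 until step 2000, then 0.001 afterwards.
    lr = tf.train.piecewise_constant(global_step,
                                     boundaries=[1000, 2000],
                                     values=[0.1, 0.01, 0.001])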
@@ -161,7 +159,7 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
           learning_rate=learning_rate, momentum=momentum)
       chief_hooks = []
-      if params['sync']:
+      if sync:
         optimizer = tf.train.SyncReplicasOptimizer(
             optimizer,
             replicas_to_aggregate=num_workers)
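tf.train.SyncReplicasOptimizer wraps the base optimizer so gradients from all workers are aggregated before a single update is applied; the chief also has to run the hook it creates. A hedged sketch of the usual wiring (variable names here are illustrative, not the script's):

    opt = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
    opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=num_workers)
    # The chief worker must register this hook so queued gradients are processed.
    sync_hook = opt.make_session_run_hook(is_chief=True)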
@@ -280,7 +278,8 @@ def input_fn(data_dir, subset, num_shards, batch_size,
 # create experiment
 def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
-                      use_distortion_for_training=True):
+                      use_distortion_for_training=True,
+                      sync=True):
   """Returns an Experiment function.
   Experiments perform training on several workers in parallel,
@@ -294,6 +293,7 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
     num_gpus: int. Number of GPUs on each worker.
     is_gpu_ps: bool. If true, average gradients on GPUs.
     use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
+    sync: bool. If true synchronizes variable updates across workers.
   Returns:
     A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
       tf.contrib.learn.Experiment.
@@ -341,9 +341,9 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
     classifier = tf.estimator.Estimator(
         model_fn=get_model_fn(
-            num_gpus, is_gpu_ps, run_config.num_worker_replicas or 1),
+            num_gpus, is_gpu_ps, run_config.num_worker_replicas or 1, sync),
         config=run_config,
-        params=vars(hparams)
+        params=hparams
     )
     # Create experiment.
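Passing params=hparams (instead of params=vars(hparams)) matters because the Estimator forwards the params argument to the model_fn unchanged, which is what makes the attribute access in _resnet_model_fn possible. Roughly, with a hypothetical model_fn:

    def my_model_fn(features, labels, mode, params):
        # params is exactly the object given to the Estimator below,
        # so HParams attribute access works here.
        learning_rate = params.learning_rate
        # ... build the model and return a tf.estimator.EstimatorSpec ...

    estimator = tf.estimator.Estimator(
        model_fn=my_model_fn,
        params=tf.contrib.training.HParams(learning_rate=0.1))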
@@ -366,6 +366,7 @@ def main(job_dir,
          use_distortion_for_training,
          log_device_placement,
          num_intra_threads,
+         sync,
          **hparams):
   # The env variable is on deprecation path, default is set to off.
   os.environ['TF_SYNC_ON_FINISH'] = '0'
@@ -388,7 +389,8 @@ def main(job_dir,
           data_dir,
           num_gpus,
           variable_strategy,
-          use_distortion_for_training
+          use_distortion_for_training,
+          sync
       ),
       run_config=config,
       hparams=tf.contrib.training.HParams(**hparams)
@@ -485,13 +487,11 @@ if __name__ == '__main__':
   parser.add_argument(
       '--num-intra-threads',
       type=int,
-      default=1,
+      default=0,
       help="""\
-      Number of threads to use for intra-op parallelism. If set to 0, the
-      system will pick an appropriate number. The default is 1 since in this
-      example CPU only handles the input pipeline and gradient aggregation
-      (when --is-cpu-ps). Ops that could potentially benefit from intra-op
-      parallelism are scheduled to run on GPUs.\
+      Number of threads to use for intra-op parallelism. When training on CPU
+      set to 0 to have the system pick the appropriate number or alternatively
+      set it to the number of physical CPU cores.\
       """
   )
   parser.add_argument(
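The new default of 0 lets the TensorFlow runtime size the intra-op thread pool itself (typically to the number of cores). The flag ultimately feeds the session configuration, roughly as below; this is a sketch of the standard tf.ConfigProto fields, not this script's exact plumbing:

    sess_config = tf.ConfigProto(
        intra_op_parallelism_threads=0,   # 0 means let the runtime choose
        inter_op_parallelism_threads=0,
        allow_soft_placement=True)
    run_config = tf.contrib.learn.RunConfig(session_config=sess_config)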
@@ -525,15 +525,16 @@ if __name__ == '__main__':
   if args.num_gpus < 0:
     raise ValueError(
-        'Invalid GPU count: \"num_gpus\" must be 0 or a positive integer.')
+        'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.')
   if args.num_gpus == 0 and args.variable_strategy == 'GPU':
     raise ValueError(
-        'No GPU available for use, must use CPU to average gradients.')
+        'num-gpus=0, CPU must be used as parameter server. Set'
+        '--variable-strategy=CPU.')
   if (args.num_layers - 2) % 6 != 0:
-    raise ValueError('Invalid num_layers parameter.')
+    raise ValueError('Invalid --num-layers parameter.')
   if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
-    raise ValueError('train_batch_size must be multiple of num_gpus.')
+    raise ValueError('--train-batch-size must be multiple of --num-gpus.')
   if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
-    raise ValueError('eval_batch_size must be multiple of num_gpus.')
+    raise ValueError('--eval-batch-size must be multiple of --num-gpus.')
   main(**vars(args))
@@ -2,6 +2,8 @@ import collections
 import six
 import tensorflow as tf
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import device as pydev
...