ModelZoo / ResNet50_tensorflow · Commits

Commit e11010fc
Authored Aug 23, 2017 by Eli Bixby
Parent: 5cb2dbde

    Use HParams rather than dict. Don't tune sync

Showing 1 changed file with 19 additions and 16 deletions.

tutorials/image/cifar10_estimator/cifar10_main.py  (+19 -16)
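The change this commit makes is easiest to see side by side. Below is a minimal sketch of the two hyperparameter access patterns, assuming the TF 1.x tf.contrib.training.HParams class used elsewhere in this file; the values are illustrative, not taken from the diff:

import tensorflow as tf

# Before this commit: hyperparameters in a plain dict, read by string key.
params = {'weight_decay': 2e-4, 'momentum': 0.9}
weight_decay = params['weight_decay']

# After: the same values held in an HParams object and read as attributes.
# HParams also supports type-checked updates (set_hparam) and string
# overrides, e.g. hparams.parse('weight_decay=1e-4').
hparams = tf.contrib.training.HParams(weight_decay=2e-4, momentum=0.9)
weight_decay = hparams.weight_decay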
@@ -29,7 +29,6 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import collections
 import functools
 import itertools
 import os
@@ -47,7 +46,7 @@ import cifar10_utils
 tf.logging.set_verbosity(tf.logging.INFO)
 
 
-def get_model_fn(num_gpus, variable_strategy, num_workers):
+def get_model_fn(num_gpus, variable_strategy, num_workers, sync):
   def _resnet_model_fn(features, labels, mode, params):
     """Resnet model body.
@@ -61,13 +60,13 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
       features: a list of tensors, one for each tower
       labels: a list of tensors, one for each tower
       mode: ModeKeys.TRAIN or EVAL
-      params: Dictionary of Hyperparameters suitable for tuning
+      params: Hyperparameters suitable for tuning
     Returns:
       A EstimatorSpec object.
     """
     is_training = (mode == tf.estimator.ModeKeys.TRAIN)
-    weight_decay = params['weight_decay']
-    momentum = params['momentum']
+    weight_decay = params.weight_decay
+    momentum = params.momentum
 
     tower_features = features
     tower_labels = labels
@@ -105,9 +104,9 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
             tower_features[i],
             tower_labels[i],
             (device_type == 'cpu'),
-            params['num_layers'],
-            params['batch_norm_decay'],
-            params['batch_norm_epsilon'])
+            params.num_layers,
+            params.batch_norm_decay,
+            params.batch_norm_epsilon)
         tower_losses.append(loss)
         tower_gradvars.append(gradvars)
         tower_preds.append(preds)
@@ -144,12 +143,12 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
       # Suggested learning rate scheduling from
       # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
       num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
-          'train') // (params['train_batch_size'] * num_workers)
+          'train') // (params.train_batch_size * num_workers)
       boundaries = [
           num_batches_per_epoch * x
           for x in np.array([82, 123, 300], dtype=np.int64)
       ]
-      staged_lr = [params['learning_rate'] * x for x in [1, 0.1, 0.01, 0.002]]
+      staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
 
       learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
                                                   boundaries, staged_lr)
@@ -160,7 +159,7 @@ def get_model_fn(num_gpus, variable_strategy, num_workers):
           learning_rate=learning_rate, momentum=momentum)
 
       chief_hooks = []
-      if params['sync']:
+      if sync:
         optimizer = tf.train.SyncReplicasOptimizer(
             optimizer, replicas_to_aggregate=num_workers)
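For context, this is roughly what the sync branch above sets up. A sketch under TF 1.x assumptions: num_workers is illustrative, and the chief-hook wiring is inferred from the chief_hooks list visible in the hunk, not shown in the diff itself:

import tensorflow as tf

num_workers = 2  # illustrative; the real value comes from the cluster config
optimizer = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)

# SyncReplicasOptimizer wraps the base optimizer so each global step applies
# one update built from gradients aggregated across `replicas_to_aggregate`
# workers, instead of letting every worker apply updates asynchronously.
optimizer = tf.train.SyncReplicasOptimizer(
    optimizer, replicas_to_aggregate=num_workers)

# The wrapper needs a session hook on the chief to initialize its aggregation
# queues; presumably this is what gets appended to chief_hooks above.
chief_hooks = [optimizer.make_session_run_hook(is_chief=True)]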
@@ -279,7 +278,8 @@ def input_fn(data_dir, subset, num_shards, batch_size,
 # create experiment
 def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
-                      use_distortion_for_training=True):
+                      use_distortion_for_training=True,
+                      sync=True):
   """Returns an Experiment function.
 
   Experiments perform training on several workers in parallel,
@@ -293,6 +293,7 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
     num_gpus: int. Number of GPUs on each worker.
     is_gpu_ps: bool. If true, average gradients on GPUs.
     use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
+    sync: bool. If true synchronizes variable updates across workers.
 
   Returns:
     A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
       tf.contrib.learn.Experiment.
@@ -340,9 +341,9 @@ def get_experiment_fn(data_dir, num_gpus, is_gpu_ps,
   classifier = tf.estimator.Estimator(
       model_fn=get_model_fn(
-          num_gpus, is_gpu_ps, run_config.num_worker_replicas or 1),
+          num_gpus, is_gpu_ps, run_config.num_worker_replicas or 1, sync),
       config=run_config,
-      params=vars(hparams))
+      params=hparams)
 
   # Create experiment.
@@ -365,6 +366,7 @@ def main(job_dir,
          use_distortion_for_training,
          log_device_placement,
          num_intra_threads,
+         sync,
          **hparams):
 
   # The env variable is on deprecation path, default is set to off.
   os.environ['TF_SYNC_ON_FINISH'] = '0'
@@ -387,7 +389,8 @@ def main(job_dir,
       get_experiment_fn(data_dir,
                         num_gpus,
                         variable_strategy,
-                        use_distortion_for_training),
+                        use_distortion_for_training,
+                        sync),
       run_config=config,
       hparams=tf.contrib.training.HParams(**hparams))
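The plumbing implied by main(..., sync, **hparams) together with hparams=tf.contrib.training.HParams(**hparams) is that flags consumed by name (like --sync) stay ordinary arguments, while everything left in **hparams becomes a tunable hyperparameter. A sketch, with flag names taken from the diff but the --sync action and defaults illustrative:

import argparse
import tensorflow as tf

parser = argparse.ArgumentParser()
parser.add_argument('--sync', action='store_true')               # consumed by main()
parser.add_argument('--weight-decay', type=float, default=2e-4)  # hyperparameter
parser.add_argument('--learning-rate', type=float, default=0.1)  # hyperparameter
args = vars(parser.parse_args())

sync = args.pop('sync')  # no longer part of the tunable hyperparameters
hparams = tf.contrib.training.HParams(**args)  # weight_decay, learning_rate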
@@ -456,7 +459,7 @@ if __name__ == '__main__':
       type=float,
       default=2e-4,
-      help='Weight decay for convolutions.'
-  )
+      help='Weight decay for convolutions.')
   parser.add_argument(
       '--learning-rate',
       type=float,