Commit 7c460c90 authored by Toby Boyd

Merge branch 'cmlesupport' of https://github.com/elibixby/models

parents b14765c3 2164c8db
......@@ -34,8 +34,8 @@ data_batch_4 data_batch_5 readme.html test_batch
```shell
# This will generate TFRecord files for the training, validation, and eval splits from the data in the input_dir.
# You can see more details in generate_cifar10_tfrecords.py
$ python generate_cifar10_tfrecords.py --input_dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--output_dir=/prefix/to/downloaded/data/cifar-10-batches-py
$ python generate_cifar10_tfrecords.py --input-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--output-dir=/prefix/to/downloaded/data/cifar-10-batches-py
```
After running the command above, you should see the following new files in the output_dir.
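If you want to sanity-check the conversion, a short script along these lines counts the records in each file (a sketch assuming TensorFlow 1.x and the output paths used above):

```python
# Sketch: count the records in each generated TFRecord file (TF 1.x API).
import os
import tensorflow as tf

output_dir = '/prefix/to/downloaded/data/cifar-10-batches-py'  # same as --output-dir
for split in ('train', 'validation', 'eval'):
    path = os.path.join(output_dir, split + '.tfrecords')
    num_records = sum(1 for _ in tf.python_io.tf_record_iterator(path))
    print('%s: %d records' % (path, num_records))
```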
......@@ -51,36 +51,60 @@ train.tfrecords validation.tfrecords eval.tfrecords
```
# Run the model on CPU only. After training, it runs the evaluation.
$ python cifar10_main.py --data_dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--model_dir=/tmp/cifar10 \
--is_cpu_ps=True \
--num_gpus=0 \
--train_steps=1000
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--num-gpus=0 \
--train-steps=1000
# Run the model on 2 GPUs using CPU as parameter server. After training, it runs the evaluation.
$ python cifar10_main.py --data_dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--model_dir=/tmp/cifar10 \
--is_cpu_ps=True \
--force_gpu_compatible=True \
--num_gpus=2 \
--train_steps=1000
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--num-gpus=2 \
--train-steps=1000
# Run the model on 2 GPUs using GPU as parameter server.
# It will run an experiment, which in a local setting basically means it will stop training
# a couple of times to perform evaluation.
$ python cifar10_main.py --data_dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
--model_dir=/tmp/cifar10 \
--is_cpu_ps=False \
--force_gpu_compatible=True \
--num_gpus=2 \
--train_steps=1000 \
--run_experiment=True
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
--job-dir=/tmp/cifar10 \
--variable-strategy GPU \
--num-gpus=2
# There are more command line flags to play with; check cifar10_main.py for details.
```
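For context on the `--variable-strategy GPU` option above: it roughly means the model variables are placed on the GPUs instead of the CPU. Below is a minimal sketch of the idea using the `local_device_setter` helper added to `cifar10_utils.py` in this commit; the surrounding code is illustrative, not the tutorial's actual tower setup.

```python
# Sketch: variables spread round-robin across 2 GPUs, other ops on this tower's GPU.
import tensorflow as tf
import cifar10_utils

device_setter = cifar10_utils.local_device_setter(
    num_devices=2,           # number of devices holding variables
    ps_device_type='gpu',    # corresponds to --variable-strategy GPU
    worker_device='/gpu:0')  # compute device for this tower

with tf.device(device_setter):
    # Variable ops are routed to /gpu:0 or /gpu:1; everything else stays on /gpu:0.
    filters = tf.get_variable('conv1/weights', shape=[3, 3, 3, 16])
```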
## How to run in distributed mode
### (Optional) Running on Google Cloud Machine Learning Engine
This example can be run on Google Cloud Machine Learning Engine (ML Engine), which will configure the environment and take care of running workers, parameter servers, and masters in a fault-tolerant way.
To install the command-line tool and set up a project and billing, see the quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line).
You'll also need a Google Cloud Storage bucket for the data. If you followed the instructions above, you can just run:
```
MY_BUCKET=gs://<my-bucket-name>
gsutil cp -r cifar-10-batches-py $MY_BUCKET/
```
Then run the following command from the `tutorials/image` directory of this repository (the parent directory of this README):
```
gcloud ml-engine jobs submit training cifarmultigpu \
--runtime-version 1.2 \
--job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
--config cifar10_estimator/cmle_config.yaml \
--package-path cifar10_estimator/ \
--module-name cifar10_estimator.cifar10_main \
-- \
--data-dir=$MY_BUCKET/cifar-10-batches-py \
--num-gpus=4 \
--train-steps=1000
```
### Set TF_CONFIG
Considering that you already have multiple hosts configured, all you need is a `TF_CONFIG` environment variable on each host.
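`TF_CONFIG` is a JSON-encoded environment variable describing the cluster and the role of the current host. A minimal sketch of assembling it in Python is shown below; the hostnames, ports, and task index are placeholders and come from your own cluster setup.

```python
# Sketch: build TF_CONFIG for one host; hostnames, ports, and index are placeholders.
import json
import os

cluster = {
    'master': ['master-host:2222'],
    'ps': ['ps-host:2222'],
    'worker': ['worker-host-0:2222', 'worker-host-1:2222'],
}

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster,
    'task': {'type': 'worker', 'index': 0},  # this host's role in the cluster
    'environment': 'cloud',
})
```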
......@@ -154,15 +178,12 @@ Once you have a `TF_CONFIG` configured properly on each host you're ready to run
# It will run evaluation a couple of times during training.
# The num-workers argument is used only to update the learning rate correctly.
# Make sure the model_dir is the same as the one defined in the TF_CONFIG.
$ python cifar10_main.py --data_dir=gs://path/cifar-10-batches-py \
--model_dir=gs://path/model_dir/ \
--is_cpu_ps=True \
--force_gpu_compatible=True \
--num_gpus=4 \
--train_steps=40000 \
--sync=True \
--run_experiment=True \
--num_workers=2
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--num-gpus=4 \
--train-steps=40000 \
--sync \
--num-workers=2
```
*Output:*
......@@ -297,14 +318,11 @@ INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step =
# Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for 40000 steps.
# It will run evaluation a couple of times during training.
# Make sure the model_dir is the same as the one defined in the TF_CONFIG.
$ python cifar10_main.py --data_dir=gs://path/cifar-10-batches-py \
--model_dir=gs://path/model_dir/ \
--is_cpu_ps=True \
--force_gpu_compatible=True \
--num_gpus=4 \
--train_steps=40000 \
--sync=True \
--run_experiment=True
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--num-gpus=4 \
--train-steps=40000 \
--sync
```
*Output:*
......@@ -413,7 +431,7 @@ INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec)
```shell
# Run this on ps:
# The ps will not do any training, so most of the arguments won't affect the execution
$ python cifar10_main.py --run_experiment=True --model_dir=gs://path/model_dir/
$ python cifar10_main.py --job-dir=gs://path/model_dir/
# There are more command line flags to play with; check cifar10_main.py for details.
```
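For reference, the synchronous mode enabled by `--sync` above follows the usual TensorFlow pattern of aggregating gradients across workers before applying them. A rough sketch of that pattern with `tf.train.SyncReplicasOptimizer` is shown below; it is illustrative only, not the exact code in `cifar10_main.py`, and the learning-rate scaling is just one way to honor the `num-workers` comment above.

```python
# Rough sketch of synchronous replica training (illustrative, TF 1.x).
import tensorflow as tf

num_workers = 2
# The tutorial notes that num-workers is used to adjust the learning rate;
# linear scaling is shown here purely as an illustration.
base_opt = tf.train.MomentumOptimizer(learning_rate=0.1 * num_workers, momentum=0.9)
opt = tf.train.SyncReplicasOptimizer(
    base_opt,
    replicas_to_aggregate=num_workers,  # wait for gradients from every worker
    total_num_replicas=num_workers)
# train_op = opt.minimize(loss, global_step=tf.train.get_global_step())
```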
......@@ -446,12 +464,12 @@ You'll see something similar to this if you "point" TensorBoard to the `model_di
# Check TensorBoard during training or after it.
# Just point TensorBoard to the model_dir you chose in the previous step,
# for example /tmp/cifar10 or gs://path/model_dir/.
$ tensorboard --logdir=/tmp/cifar10
```
## Warnings
When running `cifar10_main.py` with the `--sync=True` argument you may see an error similar to:
When running `cifar10_main.py` with the `--sync` argument you may see an error similar to:
```python
File "cifar10_main.py", line 538, in <module>
......
......@@ -13,7 +13,6 @@
# limitations under the License.
# ==============================================================================
"""Model class for Cifar10 Dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
......@@ -25,8 +24,18 @@ import model_base
class ResNetCifar10(model_base.ResNet):
"""Cifar10 model with ResNetV1 and basic residual block."""
def __init__(self, num_layers, is_training, data_format='channels_first'):
super(ResNetCifar10, self).__init__(is_training, data_format)
def __init__(self,
num_layers,
is_training,
batch_norm_decay,
batch_norm_epsilon,
data_format='channels_first'):
super(ResNetCifar10, self).__init__(
is_training,
data_format,
batch_norm_decay,
batch_norm_epsilon
)
self.n = (num_layers - 2) // 6
# Add one in case label starts with 1. No impact if label starts with 0.
self.num_classes = 10 + 1
......
import six
from tensorflow.python.platform import tf_logging as logging
from tensorflow.core.framework import node_def_pb2
from tensorflow.python.framework import device as pydev
from tensorflow.python.training import basic_session_run_hooks
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
from tensorflow.python.training import device_setter
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
  """Hook to print out examples per second.

  Total time is tracked and then divided by the total number of steps
  to get the average step time and then batch_size is used to determine
  the running average of examples per second. The examples per second for the
  most recent interval is also logged.
  """

  def __init__(
      self,
      batch_size,
      every_n_steps=100,
      every_n_secs=None):
    """Initializer for ExamplesPerSecondHook.

    Args:
      batch_size: Total batch size used to calculate examples/second from
        global time.
      every_n_steps: Log stats every n steps.
      every_n_secs: Log stats every n seconds.
    """
    if (every_n_steps is None) == (every_n_secs is None):
      raise ValueError('exactly one of every_n_steps'
                       ' and every_n_secs should be provided.')
    self._timer = basic_session_run_hooks.SecondOrStepTimer(
        every_steps=every_n_steps, every_secs=every_n_secs)

    self._step_train_time = 0
    self._total_steps = 0
    self._batch_size = batch_size

  def begin(self):
    self._global_step_tensor = training_util.get_global_step()
    if self._global_step_tensor is None:
      raise RuntimeError(
          'Global step should be created to use StepCounterHook.')

  def before_run(self, run_context):  # pylint: disable=unused-argument
    return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)

  def after_run(self, run_context, run_values):
    _ = run_context

    global_step = run_values.results
    if self._timer.should_trigger_for_step(global_step):
      elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
          global_step)
      if elapsed_time is not None:
        steps_per_sec = elapsed_steps / elapsed_time
        self._step_train_time += elapsed_time
        self._total_steps += elapsed_steps

        average_examples_per_sec = self._batch_size * (
            self._total_steps / self._step_train_time)
        current_examples_per_sec = steps_per_sec * self._batch_size
        # Average examples/sec followed by current examples/sec
        logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
                     average_examples_per_sec, current_examples_per_sec,
                     self._total_steps)
def local_device_setter(num_devices=1,
                        ps_device_type='cpu',
                        worker_device='/cpu:0',
                        ps_ops=None,
                        ps_strategy=None):
  if ps_ops is None:
    ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

  if ps_strategy is None:
    ps_strategy = device_setter._RoundRobinStrategy(num_devices)
  if not six.callable(ps_strategy):
    raise TypeError("ps_strategy must be callable")

  def _local_device_chooser(op):
    current_device = pydev.DeviceSpec.from_string(op.device or "")

    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
    if node_def.op in ps_ops:
      ps_device_spec = pydev.DeviceSpec.from_string(
          '/{}:{}'.format(ps_device_type, ps_strategy(op)))

      ps_device_spec.merge_from(current_device)
      return ps_device_spec.to_string()
    else:
      worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
      worker_device_spec.merge_from(current_device)
      return worker_device_spec.to_string()
  return _local_device_chooser
trainingInput:
scaleTier: CUSTOM
masterType: complex_model_m_gpu
workerType: complex_model_m_gpu
parameterServerType: complex_model_m
workerCount: 1
......@@ -22,19 +22,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import cPickle
import os
import tensorflow as tf
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('input_dir', '',
'Directory where CIFAR10 data is located.')
tf.flags.DEFINE_string('output_dir', '',
'Directory where TFRecords will be saved.'
'The TFRecords will have the same name as'
' the CIFAR10 inputs + .tfrecords.')
import tensorflow as tf
def _int64_feature(value):
......@@ -55,7 +47,7 @@ def _get_file_names():
def read_pickle_from_file(filename):
with open(filename, 'r') as f:
with tf.gfile.Open(filename, 'r') as f:
data_dict = cPickle.load(f)
return data_dict
......@@ -63,34 +55,49 @@ def read_pickle_from_file(filename):
def convert_to_tfrecord(input_files, output_file):
"""Converts a file to tfrecords."""
print('Generating %s' % output_file)
record_writer = tf.python_io.TFRecordWriter(output_file)
for input_file in input_files:
data_dict = read_pickle_from_file(input_file)
data = data_dict['data']
labels = data_dict['labels']
num_entries_in_batch = len(labels)
for i in range(num_entries_in_batch):
example = tf.train.Example(
features=tf.train.Features(feature={
'image': _bytes_feature(data[i].tobytes()),
'label': _int64_feature(labels[i])
}))
record_writer.write(example.SerializeToString())
record_writer.close()
def main(unused_argv):
with tf.python_io.TFRecordWriter(output_file) as record_writer:
for input_file in input_files:
data_dict = read_pickle_from_file(input_file)
data = data_dict['data']
labels = data_dict['labels']
num_entries_in_batch = len(labels)
for i in range(num_entries_in_batch):
example = tf.train.Example(
features=tf.train.Features(feature={
'image': _bytes_feature(data[i].tobytes()),
'label': _int64_feature(labels[i])
}))
record_writer.write(example.SerializeToString())
def main(input_dir, output_dir):
file_names = _get_file_names()
for mode, files in file_names.items():
input_files = [
os.path.join(FLAGS.input_dir, f) for f in files]
output_file = os.path.join(FLAGS.output_dir, mode + '.tfrecords')
os.path.join(input_dir, f) for f in files]
output_file = os.path.join(output_dir, mode + '.tfrecords')
# Convert to Examples and write the result to TFRecords.
convert_to_tfrecord(input_files, output_file)
print('Done!')
if __name__ == '__main__':
tf.app.run(main)
parser = argparse.ArgumentParser()
parser.add_argument(
'--input-dir',
type=str,
default='',
help='Directory where CIFAR10 data is located.'
)
parser.add_argument(
'--output-dir',
type=str,
default='',
help="""\
Directory where TFRecords will be saved. The TFRecords will have the same
name as the CIFAR10 inputs + .tfrecords.\
"""
)
args = parser.parse_args()
main(args.input_dir, args.output_dir)
......@@ -25,16 +25,11 @@ from __future__ import print_function
import tensorflow as tf
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_float('batch_norm_decay', 0.997, 'Decay for batch norm.')
tf.flags.DEFINE_float('batch_norm_epsilon', 1e-5, 'Epsilon for batch norm.')
class ResNet(object):
"""ResNet model."""
def __init__(self, is_training, data_format):
def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon):
"""ResNet constructor.
Args:
......@@ -42,6 +37,8 @@ class ResNet(object):
data_format: the data_format used during computation.
one of 'channels_first' or 'channels_last'.
"""
self._batch_norm_decay = batch_norm_decay
self._batch_norm_epsilon = batch_norm_epsilon
self._is_training = is_training
assert data_format in ('channels_first', 'channels_last')
self._data_format = data_format
......@@ -185,10 +182,10 @@ class ResNet(object):
data_format = 'NHWC'
return tf.contrib.layers.batch_norm(
x,
decay=FLAGS.batch_norm_decay,
decay=self._batch_norm_decay,
center=True,
scale=True,
epsilon=FLAGS.batch_norm_epsilon,
epsilon=self._batch_norm_epsilon,
is_training=self._is_training,
fused=True,
data_format=data_format)
......