"docs/git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "420e84ee0d689b788b25e6e95168da50c70ec90a"
Commit 8b829873 authored by Eli Bixby

Move to argparse; some other modifications

parent d067ce0a
@@ -53,15 +53,13 @@ train.tfrecords validation.tfrecords eval.tfrecords
 # Run the model on CPU only. After training, it runs the evaluation.
 $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
   --job-dir=/tmp/cifar10 \
-  --is-cpu-ps=True \
   --num-gpus=0 \
   --train-steps=1000

 # Run the model on 2 GPUs using CPU as parameter server. After training, it runs the evaluation.
 $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
   --job-dir=/tmp/cifar10 \
-  --is-cpu-ps=True \
-  --force-gpu-compatible=True \
+  --force-gpu-compatible \
   --num-gpus=2 \
   --train-steps=1000
@@ -70,8 +68,8 @@ $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-
 # a couple of times to perform evaluation.
 $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
   --job-dir=/tmp/cifar10 \
-  --is-cpu-ps=False \
-  --force-gpu-compatible=True \
+  --avg-on-gpu \
+  --force-gpu-compatible \
   --num-gpus=2 \
@@ -104,8 +102,7 @@ gcloud ml-engine jobs submit training cifarmultigpu \
   --module-name cifar10_estimator.cifar10_main \
   -- \
   --data-dir=$MY_BUCKET/cifar-10-batches-py \
-  --is-cpu-ps=True \
-  --force-gpu-compatible=True \
+  --force-gpu-compatible \
   --num-gpus=4 \
   --train-steps=1000
 ```
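Across these run examples the boolean `--flag=True` spelling disappears: the argparse-based CLI treats options like `--force-gpu-compatible`, `--avg-on-gpu`, and `--sync` as bare switches. A minimal sketch of how such switches are typically declared with `action='store_true'` (flag names come from the diff; treating them as store_true switches is an assumption, and the help strings are illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
# A bare switch: present means True, absent means False,
# so the old "--force-gpu-compatible=True" spelling is no longer used.
parser.add_argument(
    '--force-gpu-compatible',
    action='store_true',
    default=False,
    help='Assumed: enable force_gpu_compatible in the session config.')
parser.add_argument(
    '--avg-on-gpu',
    action='store_true',
    default=False,
    help='Assumed: average gradients on the GPU instead of the CPU.')

args = parser.parse_args(['--force-gpu-compatible'])
print(args.force_gpu_compatible)  # True  (dashes become underscores)
print(args.avg_on_gpu)            # False
```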
@@ -186,11 +183,10 @@ Once you have a `TF_CONFIG` configured properly on each host you're ready to run
 # Make sure the model_dir is the same as defined on the TF_CONFIG.
 $ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
   --job-dir=gs://path/model_dir/ \
-  --is-cpu-ps=True \
-  --force-gpu-compatible=True \
+  --force-gpu-compatible \
   --num-gpus=4 \
   --train-steps=40000 \
-  --sync=True \
+  --sync \
   \
   --num-workers=2
 ```
@@ -329,11 +325,10 @@ INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step =
 # Make sure the model_dir is the same as defined on the TF_CONFIG.
 $ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
   --job-dir=gs://path/model_dir/ \
-  --is-cpu-ps=True \
-  --force-gpu-compatible=True \
+  --force-gpu-compatible \
   --num-gpus=4 \
   --train-steps=40000 \
-  --sync=True
+  --sync
 ```
 *Output:*
@@ -480,7 +475,7 @@ $ tensorboard --log-dir="sentiment_analysis_output"
 ## Warnings

-When runninng `cifar10_main.py` with `--sync=True` argument you may see an error similar to:
+When running `cifar10_main.py` with the `--sync` argument you may see an error similar to:

 ```python
 File "cifar10_main.py", line 538, in <module>
...
@@ -25,7 +25,6 @@ http://www.cs.toronto.edu/~kriz/cifar.html
 """

-from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -42,8 +41,8 @@ from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util

-from . import cifar10
-from . import cifar10_model
+import cifar10
+import cifar10_model

 tf.logging.set_verbosity(tf.logging.INFO)
@@ -192,9 +191,18 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
       with tf.variable_scope('resnet', reuse=bool(i != 0)):
         with tf.name_scope('tower_%d' % i) as name_scope:
           with tf.device(device_setter):
-            _tower_fn(is_training, weight_decay, tower_features[i],
-                      tower_labels[i], tower_losses, tower_gradvars,
-                      tower_preds, False, params['num_layers'])
+            loss, gradvars, preds = _tower_fn(
+                is_training,
+                weight_decay,
+                tower_features[i],
+                tower_labels[i],
+                False,
+                params['num_layers'],
+                params['batch_norm_decay'],
+                params['batch_norm_epsilon'])
+            tower_losses.append(loss)
+            tower_gradvars.append(gradvars)
+            tower_preds.append(preds)
             if i == 0:
               # Only trigger batch_norm moving mean and variance update from
               # the 1st tower. Ideally, we should grab the updates from all
@@ -206,8 +214,19 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
   else:
     with tf.variable_scope('resnet'), tf.device('/cpu:0'):
       with tf.name_scope('tower_cpu') as name_scope:
-        _tower_fn(is_training, weight_decay, tower_features[0], tower_labels[0],
-                  tower_losses, tower_gradvars, tower_preds, True)
+        loss, gradvars, preds = _tower_fn(
+            is_training,
+            weight_decay,
+            tower_features[0],
+            tower_labels[0],
+            True,
+            params['num_layers'],
+            params['batch_norm_decay'],
+            params['batch_norm_epsilon'])
+        tower_losses.append(loss)
+        tower_gradvars.append(gradvars)
+        tower_preds.append(preds)
         update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)

   # Now compute global loss and gradients.
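The `# Now compute global loss and gradients.` step consumes the per-tower `tower_gradvars` collected above. A minimal sketch of the usual cross-tower gradient-averaging pattern (an illustrative helper under that assumption, not this file's exact code):

```python
import tensorflow as tf

def average_gradvars(tower_gradvars):
  """Average the gradient for each variable across all towers.

  tower_gradvars: a list (one entry per tower) of lists of (grad, var)
  pairs, with the same variable ordering in every tower.
  Illustrative only; the real file consolidates gradients similarly.
  """
  averaged = []
  for gradvars in zip(*tower_gradvars):      # i-th (grad, var) from each tower
    grads = [g for g, _ in gradvars if g is not None]
    var = gradvars[0][1]                      # same variable in every tower
    averaged.append((tf.multiply(tf.add_n(grads), 1. / len(grads)), var))
  return averaged
```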
@@ -281,10 +300,17 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
         train_op=train_op,
         training_chief_hooks=chief_hooks,
         eval_metric_ops=metrics)

   return _resnet_model_fn


-def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
-              tower_gradvars, tower_preds, is_cpu, num_layers):
+def _tower_fn(is_training,
+              weight_decay,
+              feature,
+              label,
+              is_cpu,
+              num_layers,
+              batch_norm_decay,
+              batch_norm_epsilon):
   """Build computation tower for each device (CPU or GPU).

   Args:
@@ -299,13 +325,15 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
   """
   data_format = 'channels_last' if is_cpu else 'channels_first'
   model = cifar10_model.ResNetCifar10(
-      num_layers, is_training=is_training, data_format=data_format)
+      num_layers,
+      batch_norm_decay=batch_norm_decay,
+      batch_norm_epsilon=batch_norm_epsilon,
+      is_training=is_training, data_format=data_format)
   logits = model.forward_pass(feature, input_data_format='channels_last')
   tower_pred = {
       'classes': tf.argmax(input=logits, axis=1),
       'probabilities': tf.nn.softmax(logits)
   }
-  tower_preds.append(tower_pred)

   tower_loss = tf.losses.sparse_softmax_cross_entropy(
       logits=logits, labels=label)
@@ -314,10 +342,10 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
   model_params = tf.trainable_variables()
   tower_loss += weight_decay * tf.add_n(
       [tf.nn.l2_loss(v) for v in model_params])
-  tower_losses.append(tower_loss)

   tower_grad = tf.gradients(tower_loss, model_params)
-  tower_gradvars.append(zip(tower_grad, model_params))
+
+  return tower_loss, tower_grad, tower_pred


 def input_fn(data_dir, subset, num_shards, batch_size,
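This hunk completes the refactor of `_tower_fn` from a function that filled caller-owned lists into one that returns its per-tower results, with the caller doing the appending (as the `get_model_fn` hunks above show). A minimal sketch of the pattern, using hypothetical names:

```python
# Before: results escape through mutable arguments, which is hard to test.
def tower_fn_old(feature, label, tower_losses, tower_preds):
    loss, preds = build_tower(feature, label)  # hypothetical helper
    tower_losses.append(loss)
    tower_preds.append(preds)

# After: the tower function just returns; the caller aggregates.
def tower_fn_new(feature, label):
    loss, preds = build_tower(feature, label)  # hypothetical helper
    return loss, preds

tower_losses, tower_preds = [], []
for feature, label in tower_inputs:  # hypothetical per-device batches
    loss, preds = tower_fn_new(feature, label)
    tower_losses.append(loss)
    tower_preds.append(preds)
```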
@@ -535,6 +563,7 @@ if __name__ == '__main__':
       default=2e-4,
       help='Weight decay for convolutions.'
   )
+
   parser.add_argument(
       '--learning-rate',
       type=float,
@@ -595,12 +624,24 @@ if __name__ == '__main__':
       default=False,
       help='Whether to log device placement.'
   )
+  parser.add_argument(
+      '--batch_norm_decay',
+      type=float,
+      default=0.997,
+      help='Decay for batch norm.'
+  )
+  parser.add_argument(
+      '--batch_norm_epsilon',
+      type=float,
+      default=1e-5,
+      help='Epsilon for batch norm.'
+  )
   args = parser.parse_args()

   if args.num_gpus < 0:
     raise ValueError(
         'Invalid GPU count: \"num_gpus\" must be 0 or a positive integer.')
-  if args.num_gpus == 0 and not args.avg_on_gpu:
+  if args.num_gpus == 0 and args.avg_on_gpu:
     raise ValueError(
         'No GPU available for use, must use CPU to average gradients.')
   if (args.num_layers - 2) % 6 != 0:
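With `--batch_norm_decay` and `--batch_norm_epsilon` registered, batch-norm hyperparameters come from the command line rather than a `FLAGS` global. A hypothetical invocation overriding the new defaults, in the style of the README examples (paths are placeholders):

```
$ python cifar10_main.py --data-dir=/path/to/cifar-10-batches-py \
  --job-dir=/tmp/cifar10 \
  --num-gpus=2 \
  --batch_norm_decay=0.9 \
  --batch_norm_epsilon=1e-3 \
  --train-steps=1000
```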
...
@@ -13,20 +13,29 @@
 # limitations under the License.
 # ==============================================================================
 """Model class for Cifar10 Dataset."""
-from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import tensorflow as tf

-from . import model_base
+import model_base


 class ResNetCifar10(model_base.ResNet):
   """Cifar10 model with ResNetV1 and basic residual block."""

-  def __init__(self, num_layers, is_training, data_format='channels_first'):
-    super(ResNetCifar10, self).__init__(is_training, data_format)
+  def __init__(self,
+               num_layers,
+               is_training,
+               batch_norm_decay,
+               batch_norm_epsilon,
+               data_format='channels_first'):
+    super(ResNetCifar10, self).__init__(
+        is_training,
+        data_format,
+        batch_norm_decay,
+        batch_norm_epsilon
+    )
     self.n = (num_layers - 2) // 6
     # Add one in case label starts with 1. No impact if label starts with 0.
     self.num_classes = 10 + 1
...
@@ -23,18 +23,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import argparse
-
 import tensorflow as tf

-FLAGS = None
-
-
 class ResNet(object):
   """ResNet model."""

-  def __init__(self, is_training, data_format):
+  def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon):
     """ResNet constructor.

     Args:
@@ -42,6 +37,8 @@ class ResNet(object):
       data_format: the data_format used during computation.
         one of 'channels_first' or 'channels_last'.
     """
+    self._batch_norm_decay = batch_norm_decay
+    self._batch_norm_epsilon = batch_norm_epsilon
     self._is_training = is_training
     assert data_format in ('channels_first', 'channels_last')
     self._data_format = data_format
@@ -185,10 +182,10 @@ class ResNet(object):
       data_format = 'NHWC'
     return tf.contrib.layers.batch_norm(
         x,
-        decay=FLAGS.batch_norm_decay,
+        decay=self._batch_norm_decay,
         center=True,
         scale=True,
-        epsilon=FLAGS.batch_norm_epsilon,
+        epsilon=self._batch_norm_epsilon,
         is_training=self._is_training,
         fused=True,
         data_format=data_format)
...
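The net effect of the model_base changes is that batch-norm settings thread from argparse through `ResNetCifar10` into `model_base.ResNet`, instead of being read from a module-level `FLAGS` inside the batch-norm helper. A minimal construction sketch under the signatures shown in this diff (the values are the new argparse defaults):

```python
import cifar10_model

# num_layers must satisfy (num_layers - 2) % 6 == 0,
# per the check in cifar10_main.py; 44 is one valid choice.
model = cifar10_model.ResNetCifar10(
    44,
    is_training=True,
    batch_norm_decay=0.997,
    batch_norm_epsilon=1e-5,
    data_format='channels_first')
```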