Commit 8b829873 authored by Eli Bixby

Move to argparse, plus some other modifications

parent d067ce0a
@@ -53,15 +53,13 @@ train.tfrecords validation.tfrecords eval.tfrecords
# Run the model on CPU only. After training, it runs the evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=True \
--num-gpus=0 \
--train-steps=1000
# Run the model on 2 GPUs using CPU as parameter server. After training, it runs the evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-py \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=2 \
--train-steps=1000
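Before picking a `--num-gpus` value, it can help to confirm how many GPUs TensorFlow actually sees. A minimal check, assuming a TF 1.x installation (this snippet is illustrative and not part of the commit):

```python
# Illustrative: count the GPUs visible to TensorFlow 1.x so you can
# pass a matching --num-gpus value to cifar10_main.py.
from tensorflow.python.client import device_lib

gpus = [d for d in device_lib.list_local_devices() if d.device_type == 'GPU']
print('Visible GPUs: %d' % len(gpus))
```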
@@ -70,8 +68,8 @@ $ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-
# a couple of times to perform evaluation.
$ python cifar10_main.py --data-dir=/prefix/to/downloaded/data/cifar-10-batches-bin \
--job-dir=/tmp/cifar10 \
--is-cpu-ps=False \
--force-gpu-compatible=True \
--avg-on-gpu \
--force-gpu-compatible \
--num-gpus=2 \
@@ -104,8 +102,7 @@ gcloud ml-engine jobs submit training cifarmultigpu \
--module-name cifar10_estimator.cifar10_main \
-- \
--data-dir=$MY_BUCKET/cifar-10-batches-py \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=1000
```
@@ -186,11 +183,10 @@ Once you have a `TF_CONFIG` configured properly on each host you're ready to run
# Make sure the model_dir is the same as defined in the TF_CONFIG.
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=40000 \
--sync=True \
--sync \
--num-workers=2
```
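For reference, a minimal sketch of what the `TF_CONFIG` on one host could look like; the hostnames, ports, and role layout below are placeholders, not values taken from this repo:

```python
# Illustrative TF_CONFIG for one machine in the cluster. Every host
# exports its own copy, differing only in the "task" entry.
import json
import os

tf_config = {
    'cluster': {
        'master': ['master-host:2222'],
        'ps': ['ps-host:2222'],
        'worker': ['worker0-host:2222', 'worker1-host:2222'],
    },
    'task': {'type': 'worker', 'index': 0},  # this process = first worker
}
os.environ['TF_CONFIG'] = json.dumps(tf_config)
```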
@@ -329,11 +325,10 @@ INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step =
# Make sure the model_dir is the same as defined in the TF_CONFIG.
$ python cifar10_main.py --data-dir=gs://path/cifar-10-batches-py \
--job-dir=gs://path/model_dir/ \
--is-cpu-ps=True \
--force-gpu-compatible=True \
--force-gpu-compatible \
--num-gpus=4 \
--train-steps=40000 \
--sync=True
--sync
```
*Output:*
@@ -480,7 +475,7 @@ $ tensorboard --log-dir="sentiment_analysis_output"
## Warnings
When running `cifar10_main.py` with the `--sync=True` argument you may see an error similar to:
When running `cifar10_main.py` with the `--sync` argument you may see an error similar to:
```python
File "cifar10_main.py", line 538, in <module>
......
@@ -25,7 +25,6 @@ http://www.cs.toronto.edu/~kriz/cifar.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -42,8 +41,8 @@ from tensorflow.python.training import basic_session_run_hooks
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
from . import cifar10
from . import cifar10_model
import cifar10
import cifar10_model
tf.logging.set_verbosity(tf.logging.INFO)
@@ -192,9 +191,18 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
with tf.variable_scope('resnet', reuse=bool(i != 0)):
with tf.name_scope('tower_%d' % i) as name_scope:
with tf.device(device_setter):
_tower_fn(is_training, weight_decay, tower_features[i],
tower_labels[i], tower_losses, tower_gradvars,
tower_preds, False, params['num_layers'])
loss, gradvars, preds = _tower_fn(
is_training,
weight_decay,
tower_features[i],
tower_labels[i],
False,
params['num_layers'],
params['batch_norm_decay'],
params['batch_norm_epsilon'])
tower_losses.append(loss)
tower_gradvars.append(gradvars)
tower_preds.append(preds)
if i == 0:
# Only trigger batch_norm moving mean and variance update from
# the 1st tower. Ideally, we should grab the updates from all
@@ -206,8 +214,19 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
else:
with tf.variable_scope('resnet'), tf.device('/cpu:0'):
with tf.name_scope('tower_cpu') as name_scope:
_tower_fn(is_training, weight_decay, tower_features[0], tower_labels[0],
tower_losses, tower_gradvars, tower_preds, True)
loss, gradvars, preds = _tower_fn(
is_training,
weight_decay,
tower_features[0],
tower_labels[0],
True,
params['num_layers'],
params['batch_norm_decay'],
params['batch_norm_epsilon'])
tower_losses.append(loss)
tower_gradvars.append(gradvars)
tower_preds.append(preds)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
# Now compute global loss and gradients.
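With `_tower_fn` now returning its results instead of appending to shared lists, the caller aggregates them itself. A rough sketch of the averaging step over the collected `tower_gradvars` (an illustrative helper, not the repo's exact code):

```python
# Illustrative: average per-variable gradients across towers.
# Each tower contributes a list of (gradient, variable) pairs in the
# same order, so zip(*...) groups the pairs variable by variable.
import numpy as np

def average_gradvars(tower_gradvars):
    averaged = []
    for gradvar_group in zip(*tower_gradvars):
        grads = [g for g, _ in gradvar_group]
        _, var = gradvar_group[0]  # the variable is shared across towers
        averaged.append((np.mean(grads, axis=0), var))
    return averaged

# Two towers, two shared "variables" named w0 and w1.
tower_a = [(np.array([1.0, 2.0]), 'w0'), (np.array([0.5]), 'w1')]
tower_b = [(np.array([3.0, 4.0]), 'w0'), (np.array([1.5]), 'w1')]
print(average_gradvars([tower_a, tower_b]))
# -> [(array([2., 3.]), 'w0'), (array([1.]), 'w1')]
```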
@@ -281,10 +300,17 @@ def get_model_fn(num_gpus, avg_on_gpu, num_workers):
train_op=train_op,
training_chief_hooks=chief_hooks,
eval_metric_ops=metrics)
return _resnet_model_fn
def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
tower_gradvars, tower_preds, is_cpu, num_layers):
def _tower_fn(is_training,
weight_decay,
feature,
label,
is_cpu,
num_layers,
batch_norm_decay,
batch_norm_epsilon):
"""Build computation tower for each device (CPU or GPU).
Args:
@@ -299,13 +325,15 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
"""
data_format = 'channels_last' if is_cpu else 'channels_first'
model = cifar10_model.ResNetCifar10(
num_layers, is_training=is_training, data_format=data_format)
num_layers,
batch_norm_decay=batch_norm_decay,
batch_norm_epsilon=batch_norm_epsilon,
is_training=is_training, data_format=data_format)
logits = model.forward_pass(feature, input_data_format='channels_last')
tower_pred = {
'classes': tf.argmax(input=logits, axis=1),
'probabilities': tf.nn.softmax(logits)
}
tower_preds.append(tower_pred)
tower_loss = tf.losses.sparse_softmax_cross_entropy(
logits=logits, labels=label)
@@ -314,10 +342,10 @@ def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
model_params = tf.trainable_variables()
tower_loss += weight_decay * tf.add_n(
[tf.nn.l2_loss(v) for v in model_params])
tower_losses.append(tower_loss)
tower_grad = tf.gradients(tower_loss, model_params)
tower_gradvars.append(zip(tower_grad, model_params))
return tower_loss, tower_grad, tower_pred
def input_fn(data_dir, subset, num_shards, batch_size,
@@ -535,6 +563,7 @@ if __name__ == '__main__':
default=2e-4,
help='Weight decay for convolutions.'
)
parser.add_argument(
'--learning-rate',
type=float,
@@ -595,12 +624,24 @@ if __name__ == '__main__':
default=False,
help='Whether to log device placement.'
)
parser.add_argument(
'--batch_norm_decay',
type=float,
default=0.997,
help='Decay for batch norm.'
)
parser.add_argument(
'--batch_norm_epsilon',
type=float,
default=1e-5,
help='Epsilon for batch norm.'
)
args = parser.parse_args()
if args.num_gpus < 0:
raise ValueError(
'Invalid GPU count: \"num_gpus\" must be 0 or a positive integer.')
if args.num_gpus == 0 and not args.avg_on_gpu:
if args.num_gpus == 0 and args.avg_on_gpu:
raise ValueError(
'No GPU available for use, must use CPU to average gradients.')
if (args.num_layers - 2) % 6 != 0:
......
@@ -13,20 +13,29 @@
# limitations under the License.
# ==============================================================================
"""Model class for Cifar10 Dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from . import model_base
import model_base
class ResNetCifar10(model_base.ResNet):
"""Cifar10 model with ResNetV1 and basic residual block."""
def __init__(self, num_layers, is_training, data_format='channels_first'):
super(ResNetCifar10, self).__init__(is_training, data_format)
def __init__(self,
num_layers,
is_training,
batch_norm_decay,
batch_norm_epsilon,
data_format='channels_first'):
super(ResNetCifar10, self).__init__(
is_training,
data_format,
batch_norm_decay,
batch_norm_epsilon
)
self.n = (num_layers - 2) // 6
# Add one in case label starts with 1. No impact if label starts with 0.
self.num_classes = 10 + 1
......
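A sketch of the updated constructor in use; the values mirror the new argparse defaults, and `num_layers` must leave `(num_layers - 2)` divisible by 6:

```python
# Illustrative only; assumes cifar10_model.py is on the import path.
import cifar10_model

model = cifar10_model.ResNetCifar10(
    num_layers=44,                 # (44 - 2) % 6 == 0, so a valid depth
    is_training=True,
    batch_norm_decay=0.997,        # new argparse default
    batch_norm_epsilon=1e-5,       # new argparse default
    data_format='channels_first')
```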
@@ -23,18 +23,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
FLAGS = None
class ResNet(object):
"""ResNet model."""
def __init__(self, is_training, data_format):
def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon):
"""ResNet constructor.
Args:
@@ -42,6 +37,8 @@ class ResNet(object):
data_format: the data_format used during computation.
one of 'channels_first' or 'channels_last'.
"""
self._batch_norm_decay = batch_norm_decay
self._batch_norm_epsilon = batch_norm_epsilon
self._is_training = is_training
assert data_format in ('channels_first', 'channels_last')
self._data_format = data_format
@@ -185,10 +182,10 @@ class ResNet(object):
data_format = 'NHWC'
return tf.contrib.layers.batch_norm(
x,
decay=FLAGS.batch_norm_decay,
decay=self._batch_norm_decay,
center=True,
scale=True,
epsilon=FLAGS.batch_norm_epsilon,
epsilon=self._batch_norm_epsilon,
is_training=self._is_training,
fused=True,
data_format=data_format)
......