Commit 5bb9e6f3 authored by Mark Sandler, committed by Sergio Guadarrama

1. Splits train_image_classifier into library and binary rules to simplify reuse. (#4552)

2. Adds a flag that prevents imagenet.py from downloading label_to_names from GitHub and/or dumping it into the training directory (which might be read-only).
3. Adds comments about how decay steps are computed, since they are computed differently with clones vs. sync replicas.
4. Updates mobilenet.md to describe the training process using train_image_classifier.
5. Adds a citation for the TF-Slim model library.

PiperOrigin-RevId: 191955231

PiperOrigin-RevId: 193254125

PiperOrigin-RevId: 193371562

PiperOrigin-RevId: 194085628

PiperOrigin-RevId: 194857067

PiperOrigin-RevId: 196125653

PiperOrigin-RevId: 196589070

PiperOrigin-RevId: 199522873

PiperOrigin-RevId: 200351305
parent f023a76d
@@ -694,10 +694,9 @@ py_test(
],
)
py_binary(
name = "train_image_classifier",
py_library(
name = "train_image_classifier_lib",
srcs = ["train_image_classifier.py"],
paropts = ["--compress"],
deps = [
":dataset_factory",
":model_deploy",
@@ -707,6 +706,15 @@ py_binary(
],
)
py_binary(
name = "train_image_classifier",
srcs = ["train_image_classifier.py"],
paropts = ["--compress"],
deps = [
":train_image_classifier_lib",
],
)
py_binary(
name = "eval_image_classifier",
srcs = ["eval_image_classifier.py"],
......
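The new `train_image_classifier_lib` rule is what makes reuse possible: other binaries can depend on it and drive the same training loop. Below is a minimal sketch of such a wrapper; the target name, file name, and flag values are hypothetical and not part of this commit.

```python
# custom_train.py -- hypothetical wrapper binary whose py_binary rule would
# list ":train_image_classifier_lib" in deps.
import tensorflow as tf

import train_image_classifier  # training loop provided by the library rule

FLAGS = tf.app.flags.FLAGS


def main(_):
  # Override a few of the library's flags before handing control to it.
  FLAGS.model_name = 'mobilenet_v2'
  FLAGS.train_dir = '/tmp/custom_train'
  train_image_classifier.main(None)


if __name__ == '__main__':
  tf.app.run()
```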
@@ -25,6 +25,11 @@ Maintainers of TF-slim:
github: [nathansilberman](https://github.com/nathansilberman)
* Sergio Guadarrama, github: [sguada](https://github.com/sguada)
## Citation
"TensorFlow-Slim image classification model library"
N. Silberman and S. Guadarrama, 2016.
https://github.com/tensorflow/models/tree/master/research/slim
## Table of contents
<a href="#Install">Installation and setup</a><br>
......
@@ -58,6 +58,10 @@ _ITEMS_TO_DESCRIPTIONS = {
_NUM_CLASSES = 1001
# If set to False, the dataset will not try to set labels_to_names
# by reading them from labels.txt or GitHub.
LOAD_READABLE_NAMES = True
def create_readable_names_for_imagenet_labels():
"""Create a dict mapping label id to human readable string.
@@ -177,6 +181,7 @@ def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
keys_to_features, items_to_handlers)
labels_to_names = None
if LOAD_READABLE_NAMES:
if dataset_utils.has_labels(dataset_dir):
labels_to_names = dataset_utils.read_label_file(dataset_dir)
else:
......
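For completeness, a minimal usage sketch of the new switch (assuming the standard slim `datasets` package layout; the dataset path is made up):

```python
# Sketch only: turn off the human-readable label lookup before building the
# dataset, so get_split() neither reads labels.txt nor tries to fetch the
# id-to-name mapping from GitHub (useful when dataset_dir is read-only).
from datasets import imagenet

imagenet.LOAD_READABLE_NAMES = False
dataset = imagenet.get_split('train', '/readonly/imagenet-tfrecords')
# With the flag off, labels_to_names is simply left as None.
```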
@@ -47,7 +47,27 @@ are no published size numbers. We estimate it to be comparable to MobileNetV2 nu
| [mobilenet_v2_0.35_128](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.35_128.tgz) | 20 | 1.66 | 50.8 | 75.0 | 6.9
| [mobilenet_v2_0.35_96](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.35_96.tgz) | 11 | 1.66 | 45.5 | 70.4 | 4.5
# Training
The numbers above can be reproduced using slim's `train_image_classifier`.
Below is the set of parameters that achieves 72.0% accuracy for the full-size MobileNetV2 after about 700K steps when trained on 8 GPUs. If trained on a single GPU, full convergence is reached after about 5.5M steps. Also note that both the learning rate and
num_epochs_per_decay need to be adjusted depending on how many GPUs are being
used, due to slim's internal averaging (a small sketch expanding these values for a given GPU count follows the flag list below).
```bash
--model_name="mobilenet_v2"
--learning_rate=0.045 * NUM_GPUS  # slim internally averages clones so we compensate
--preprocessing_name="inception_v2"
--label_smoothing=0.1
--moving_average_decay=0.9999
--batch_size=96
--num_clones=NUM_GPUS  # you can use any number between 1 and 8 depending on your hardware setup
--learning_rate_decay_factor=0.98
--num_epochs_per_decay=2.5 / NUM_GPUS  # train_image_classifier does per-clone epochs
```
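To make the per-GPU adjustment concrete, here is a small illustrative helper (not part of the repo) that expands the recipe above into literal flag values for a given number of clones:

```python
def mobilenet_v2_hparams(num_gpus):
  """Scales the recipe above for `num_gpus` clones (illustrative only)."""
  return {
      'model_name': 'mobilenet_v2',
      'preprocessing_name': 'inception_v2',
      'label_smoothing': 0.1,
      'moving_average_decay': 0.9999,
      'batch_size': 96,
      'num_clones': num_gpus,
      # slim averages the clone losses, so the learning rate is scaled up.
      'learning_rate': 0.045 * num_gpus,
      'learning_rate_decay_factor': 0.98,
      # epochs are counted per clone, so the decay interval is scaled down.
      'num_epochs_per_decay': 2.5 / num_gpus,
  }

# 8 GPUs -> learning_rate=0.36, num_epochs_per_decay=0.3125;
# 1 GPU  -> the unscaled values 0.045 and 2.5.
print(mobilenet_v2_hparams(8))
```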
# Example
See this [ipython notebook](mobilenet_example.ipynb) or open and run the network directly in [Colaboratory](https://colab.research.google.com/github/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_example.ipynb).
@@ -133,8 +133,10 @@ def drop_path(net, keep_prob, is_training=True):
noise_shape = [batch_size, 1, 1, 1]
random_tensor = keep_prob
random_tensor += tf.random_uniform(noise_shape, dtype=tf.float32)
binary_tensor = tf.floor(random_tensor)
net = tf.div(net, keep_prob) * binary_tensor
binary_tensor = tf.cast(tf.floor(random_tensor), net.dtype)
keep_prob_inv = tf.cast(1.0 / keep_prob, net.dtype)
net = net * keep_prob_inv * binary_tensor
return net
......
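The `drop_path` change replaces `tf.div` with explicit casts, presumably so the mask and the 1/keep_prob rescaling stay in the feature map's own dtype (e.g. float16). An illustrative sketch of the resulting computation (TF 1.x API, made-up shapes):

```python
import numpy as np
import tensorflow as tf  # TF 1.x API, as in the code above

# Made-up float16 feature map; the mask built from tf.random_uniform is
# float32, so it is cast to net.dtype before being multiplied in, as the
# patch does.
net = tf.constant(np.ones([2, 4, 4, 8], dtype=np.float16))
keep_prob = 0.9

random_tensor = keep_prob + tf.random_uniform([2, 1, 1, 1], dtype=tf.float32)
binary_tensor = tf.cast(tf.floor(random_tensor), net.dtype)   # float16 mask
keep_prob_inv = tf.cast(1.0 / keep_prob, net.dtype)           # float16 scalar
dropped = net * keep_prob_inv * binary_tensor                 # stays float16

with tf.Session() as sess:
  print(sess.run(dropped).dtype)  # float16
```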
@@ -35,7 +35,10 @@ tf.app.flags.DEFINE_string(
'Directory where checkpoints and event logs are written to.')
tf.app.flags.DEFINE_integer('num_clones', 1,
'Number of model clones to deploy.')
'Number of model clones to deploy. Note: for historical '
'reasons, the loss from all clones is averaged out and the '
'learning rate decay is counted in per-clone epochs.')
tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
'Use CPUs to deploy clones.')
@@ -145,7 +148,10 @@ tf.app.flags.DEFINE_float(
tf.app.flags.DEFINE_float(
'num_epochs_per_decay', 2.0,
'Number of epochs after which learning rate decays.')
'Number of epochs after which learning rate decays. Note: this flag counts '
'epochs per clone but aggregates across sync replicas. So 1.0 means that '
'each clone will go over a full epoch individually, but all sync replicas '
'together will cover one epoch.')
tf.app.flags.DEFINE_bool(
'sync_replicas', False,
@@ -233,8 +239,12 @@ def _configure_learning_rate(num_samples_per_epoch, global_step):
Raises:
ValueError: if `learning_rate_decay_type` is not recognized.
"""
decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
FLAGS.num_epochs_per_decay)
# Note: when num_clones is > 1, this will actually have each clone go over
# the full dataset FLAGS.num_epochs_per_decay times between decays. This is
# different behavior from sync replicas and is expected to produce different
# results.
decay_steps = int(num_samples_per_epoch * FLAGS.num_epochs_per_decay /
FLAGS.batch_size)
if FLAGS.sync_replicas:
decay_steps /= FLAGS.replicas_to_aggregate
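A worked example of the comment above, with made-up but representative numbers (ImageNet-sized train split, the MobileNetV2 recipe's batch size):

```python
# Illustrative arithmetic only; none of these constants come from the commit.
num_samples_per_epoch = 1281167      # ILSVRC-2012 train split
batch_size = 96
num_epochs_per_decay = 2.5

decay_steps = int(num_samples_per_epoch * num_epochs_per_decay / batch_size)
print(decay_steps)                   # 33363 global steps between decays

# With num_clones=8 each clone still takes decay_steps steps per decay, so
# each clone individually sees 2.5 epochs of data while the clones together
# see 8 * 2.5 = 20 epochs -- hence the 2.5 / NUM_GPUS adjustment in the
# MobileNet recipe.

# With sync_replicas=True and replicas_to_aggregate=8, the interval is
# instead divided by the number of replicas:
print(decay_steps // 8)              # 4170 global steps between decays
```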
@@ -256,7 +266,7 @@ def _configure_learning_rate(num_samples_per_epoch, global_step):
cycle=False,
name='polynomial_decay_learning_rate')
else:
raise ValueError('learning_rate_decay_type [%s] was not recognized',
raise ValueError('learning_rate_decay_type [%s] was not recognized' %
FLAGS.learning_rate_decay_type)
@@ -308,7 +318,7 @@ def _configure_optimizer(learning_rate):
elif FLAGS.optimizer == 'sgd':
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
else:
raise ValueError('Optimizer [%s] was not recognized', FLAGS.optimizer)
raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
return optimizer
......