Commit 59c218f5 authored by Yukun Zhu, committed by aquariusjay

Adding quantization support for deeplab (#6681)

* deeplab quantize

* Fix bug in train.py

* Create quantize.md
parent a182abc1
@@ -38,8 +38,8 @@ flags.DEFINE_string('checkpoint_dir', None, 'Directory of model checkpoints.')
flags.DEFINE_integer('eval_batch_size', 1,
'The number of images in each batch during evaluation.')
flags.DEFINE_multi_integer('eval_crop_size', [513, 513],
'Image crop size [height, width] for evaluation.')
flags.DEFINE_list('eval_crop_size', '513,513',
'Image crop size [height, width] for evaluation.')
flags.DEFINE_integer('eval_interval_secs', 60 * 5,
'How often (in seconds) to run evaluation.')
@@ -61,6 +61,10 @@ flags.DEFINE_multi_float('eval_scales', [1.0],
flags.DEFINE_bool('add_flipped_images', False,
'Add flipped images for evaluation or not.')
flags.DEFINE_integer(
'quantize_delay_step', -1,
'Steps to start quantized training. If < 0, will not quantize model.')
# Dataset settings.
flags.DEFINE_string('dataset', 'pascal_voc_seg',
@@ -84,7 +88,7 @@ def main(unused_argv):
split_name=FLAGS.eval_split,
dataset_dir=FLAGS.dataset_dir,
batch_size=FLAGS.eval_batch_size,
crop_size=FLAGS.eval_crop_size,
crop_size=map(int, FLAGS.eval_crop_size),
min_resize_value=FLAGS.min_resize_value,
max_resize_value=FLAGS.max_resize_value,
resize_factor=FLAGS.resize_factor,
@@ -102,15 +106,15 @@ def main(unused_argv):
model_options = common.ModelOptions(
outputs_to_num_classes={common.OUTPUT_TYPE: dataset.num_of_classes},
crop_size=FLAGS.eval_crop_size,
crop_size=map(int, FLAGS.eval_crop_size),
atrous_rates=FLAGS.atrous_rates,
output_stride=FLAGS.output_stride)
# Set shape in order for tf.contrib.tfprof.model_analyzer to work properly.
samples[common.IMAGE].set_shape(
[FLAGS.eval_batch_size,
FLAGS.eval_crop_size[0],
FLAGS.eval_crop_size[1],
int(FLAGS.eval_crop_size[0]),
int(FLAGS.eval_crop_size[1]),
3])
if tuple(FLAGS.eval_scales) == (1.0,):
tf.logging.info('Performing single-scale test.')
@@ -118,6 +122,10 @@ def main(unused_argv):
image_pyramid=FLAGS.image_pyramid)
else:
tf.logging.info('Performing multi-scale test.')
if FLAGS.quantize_delay_step >= 0:
raise ValueError(
'Quantize mode is not supported with multi-scale test.')
predictions = model.predict_labels_multi_scale(
samples[common.IMAGE],
model_options=model_options,
@@ -154,6 +162,9 @@ def main(unused_argv):
if FLAGS.max_number_of_evaluations > 0:
num_eval_iters = FLAGS.max_number_of_evaluations
if FLAGS.quantize_delay_step >= 0:
tf.contrib.quantize.create_eval_graph()
tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
tfprof_options=tf.contrib.tfprof.model_analyzer.
......
@@ -53,6 +53,10 @@ flags.DEFINE_multi_float('inference_scales', [1.0],
flags.DEFINE_bool('add_flipped_images', False,
'Add flipped images during inference or not.')
flags.DEFINE_integer(
'quantize_delay_step', -1,
'Steps to start quantized training. If < 0, will not quantize model.')
flags.DEFINE_bool('save_inference_graph', False,
'Save inference graph in text proto.')
@@ -124,6 +128,9 @@ def main(unused_argv):
image_pyramid=FLAGS.image_pyramid)
else:
tf.logging.info('Exported model performs multi-scale inference.')
if FLAGS.quantize_delay_step >= 0:
raise ValueError(
'Quantize mode is not supported with multi-scale test.')
predictions = model.predict_labels_multi_scale(
image,
model_options=model_options,
@@ -150,7 +157,10 @@ def main(unused_argv):
semantic_predictions = _resize_label(semantic_predictions, image_size)
semantic_predictions = tf.identity(semantic_predictions, name=_OUTPUT_NAME)
saver = tf.train.Saver(tf.model_variables())
if FLAGS.quantize_delay_step >= 0:
tf.contrib.quantize.create_eval_graph()
saver = tf.train.Saver(tf.all_variables())
dirname = os.path.dirname(FLAGS.export_path)
tf.gfile.MakeDirs(dirname)
......
@@ -57,8 +57,7 @@ python deeplab/train.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_crop_size="513,513" \
--train_batch_size=4 \
--min_resize_value=513 \
--max_resize_value=513 \
......
@@ -50,8 +50,7 @@ python deeplab/train.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=769 \
--train_crop_size=769 \
--train_crop_size="769,769" \
--train_batch_size=1 \
--dataset="cityscapes" \
--tf_initial_checkpoint=${PATH_TO_INITIAL_CHECKPOINT} \
@@ -103,8 +102,7 @@ python deeplab/eval.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=1025 \
--eval_crop_size=2049 \
--eval_crop_size="1025,2049" \
--dataset="cityscapes" \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--eval_logdir=${PATH_TO_EVAL_DIR} \
@@ -130,8 +128,7 @@ python deeplab/vis.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=1025 \
--vis_crop_size=2049 \
--vis_crop_size="1025,2049" \
--dataset="cityscapes" \
--colormap_type="cityscapes" \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
......
@@ -52,8 +52,7 @@ python deeplab/train.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_crop_size="513,513" \
--train_batch_size=1 \
--dataset="pascal_voc_seg" \
--tf_initial_checkpoint=${PATH_TO_INITIAL_CHECKPOINT} \
@@ -96,8 +95,7 @@ python deeplab/eval.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=513 \
--eval_crop_size=513 \
--eval_crop_size="513,513" \
--dataset="pascal_voc_seg" \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--eval_logdir=${PATH_TO_EVAL_DIR} \
@@ -123,8 +121,7 @@ python deeplab/vis.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=513 \
--vis_crop_size=513 \
--vis_crop_size="513,513" \
--dataset="pascal_voc_seg" \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--vis_logdir=${PATH_TO_VIS_DIR} \
......
# Quantize DeepLab model for faster on-device inference
This page describes the steps required to quantize a DeepLab model and convert
it to TFLite for on-device inference. The main steps include:
1. Quantization-aware training
1. Exporting model
1. Converting to TFLite FlatBuffer
We provide details for each step below.
## Quantization-aware training
DeepLab supports two approaches to quantizing your model:
1. **[Recommended]** Train a non-quantized model until convergence, then
fine-tune the trained float model with quantization using a small learning
rate (on PASCAL we use 3e-5). This fine-tuning step usually takes 2k to 5k
steps to converge.
1. Train a DeepLab float model with delayed quantization. Usually we delay
quantization until the last few thousand steps of training (see the sketch
after this list).
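For concreteness, here is a minimal sketch of how delayed quantization hooks
into a TF 1.x training graph: `tf.contrib.quantize.create_training_graph`
rewrites the graph with fake-quantization ops, and its `quant_delay` argument
(wired to the `quantize_delay_step` flag added in this commit) controls when
quantized training kicks in. The helper below is illustrative, not DeepLab's
actual code:
```
import tensorflow as tf

def build_train_op(loss, learning_rate, quantize_delay_step):
  """Illustrative helper: quantization-aware training with a delay."""
  if quantize_delay_step >= 0:
    # Rewrites the default graph in place, inserting fake-quant ops on
    # weights and activations; quantization becomes active only after
    # `quant_delay` global steps.
    tf.contrib.quantize.create_training_graph(
        quant_delay=quantize_delay_step)
  optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
  return optimizer.minimize(
      loss, global_step=tf.train.get_or_create_global_step())
```
At evaluation and export time, the matching call is
`tf.contrib.quantize.create_eval_graph()`, which is what `eval.py` and
`export_model.py` do in this commit when `quantize_delay_step >= 0`.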
In the current implementation, quantization is supported only with 1)
`num_clones=1` for training and 2) single-scale inference for evaluation,
visualization, and model export. To get the best performance from the quantized
model, we strongly recommend training the float model with a larger
`num_clones` and then fine-tuning it with a single clone.
The command line below quantizes a DeepLab model trained on the PASCAL VOC
dataset using fine-tuning:
```
# From tensorflow/models/research/
python deeplab/train.py \
--logtostderr \
--training_number_of_steps=3000 \
--train_split="train" \
--model_variant="mobilenet_v2" \
--output_stride=16 \
--train_crop_size="513,513" \
--train_batch_size=8 \
--base_learning_rate=3e-5 \
--dataset="pascal_voc_seg" \
--initialize_last_layer \
--quantize_delay_step=0 \
--tf_initial_checkpoint=${PATH_TO_TRAINED_FLOAT_MODEL} \
--train_logdir=${PATH_TO_TRAIN_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
## Converting to TFLite FlatBuffer
First, use the following command line to export your trained model.
```
# From tensorflow/models/research/
python deeplab/export_model.py \
--checkpoint_path=${CHECKPOINT_PATH} \
--quantize_delay_step=0 \
--export_path=${OUTPUT_DIR}/frozen_inference_graph.pb
```
The command line below shows how to convert the exported GraphDef to a TFLite model.
```
tflite_convert \
--graph_def_file=${OUTPUT_DIR}/frozen_inference_graph.pb \
--output_file=${OUTPUT_DIR}/frozen_inference_graph.tflite \
--output_format=TFLITE \
--input_shape=1,513,513,3 \
--input_arrays="MobilenetV2/MobilenetV2/input" \
--inference_type=QUANTIZED_UINT8 \
--inference_input_type=QUANTIZED_UINT8 \
--std_dev_values=128 \
--mean_values=128 \
--change_concat_input_ranges=true \
--output_arrays="ArgMax"
```
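For reference on the `--mean_values`/`--std_dev_values` pair above: under
TFLite's uint8 quantization convention, a quantized input value `q` represents
the float `(q - mean) / std`, so `mean=128, std=128` maps the uint8 range
[0, 255] to roughly [-1, 1), matching MobileNet-style input scaling. A quick
check:
```
# TFLite uint8 convention: real_value = (q - mean) / std.
mean, std = 128.0, 128.0
for q in (0, 128, 255):
  print('%3d -> %+.3f' % (q, (q - mean) / std))
# prints:   0 -> -1.000, 128 -> +0.000, 255 -> +0.992
```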
**[Important]** Note that the converted model expects a 513x513 RGB input and
doesn't include preprocessing (resizing and padding the input image) or
post-processing (cropping the padded region and resizing to the original input
size). These steps can be implemented outside of the TFLite model, as sketched
below.
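A minimal sketch of those outside-the-model steps, assuming TensorFlow 1.13+
(for `tf.lite.Interpreter`), NumPy, and Pillow; the function name and the pad
value of 128 (which dequantizes to 0.0 under the scaling above) are
illustrative choices, not part of the released model:
```
import numpy as np
import tensorflow as tf
from PIL import Image

_INPUT_SIZE = 513

def run_quantized_deeplab(tflite_path, image_path):
  """Resize/pad the input, run the TFLite model, crop/resize the labels back."""
  image = Image.open(image_path).convert('RGB')
  width, height = image.size
  scale = float(_INPUT_SIZE) / max(width, height)
  new_w, new_h = int(width * scale), int(height * scale)
  resized = image.resize((new_w, new_h), Image.BILINEAR)
  # Pad to 513x513 with 128, which maps to 0.0 after dequantization.
  padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 128, dtype=np.uint8)
  padded[:new_h, :new_w] = np.asarray(resized)

  interpreter = tf.lite.Interpreter(model_path=tflite_path)
  interpreter.allocate_tensors()
  input_detail = interpreter.get_input_details()[0]
  output_detail = interpreter.get_output_details()[0]
  interpreter.set_tensor(input_detail['index'], padded[np.newaxis, ...])
  interpreter.invoke()
  labels = interpreter.get_tensor(output_detail['index'])[0]  # ArgMax output.

  # Crop the padded region and resize back to the original image size.
  labels = Image.fromarray(labels[:new_h, :new_w].astype(np.uint8))
  return np.asarray(labels.resize((width, height), Image.NEAREST))
```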
## Quantized model on PASCAL VOC
We provide float and quantized checkpoints that have been pretrained on the VOC
2012 train_aug set, using a MobileNet-v2 backbone with different depth
multipliers. Quantized models usually see about a 1% drop in mIoU.
For each quantized (8-bit) model, the un-tar'ed directory includes:
* a frozen inference graph (frozen_inference_graph.pb)
* a checkpoint (model.ckpt.data*, model.ckpt.index)
* a converted TFLite FlatBuffer file (frozen_inference_graph.tflite)
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Quantize | PASCAL mIoU | File Size
-------------------------------------------------------------------------------------------------------------------------------------------- | :-----: | :---------: | :-------------: | :-----------: | :------: | :----------: | :-------:
[mobilenetv2_dm05_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_mnv2_dm05_pascal_trainaug_2018_10_01.tar.gz) | 16 | [1.0] | No | 0.88B | No | 70.19% (val) | 7.6MB
[mobilenetv2_dm05_coco_voc_trainaug_8bit](http://download.tensorflow.org/models/deeplabv3_mnv2_dm05_pascal_train_aug_8bit_2019_04_26.tar.gz) | 16 | [1.0] | No | 0.88B | Yes | 69.65% (val) | 8.2MB
[mobilenetv2_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz) | 16 | [1.0] | No | 2.75B | No | 75.32% (val) | 23MB
[mobilenetv2_coco_voc_trainaug_8bit](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_train_aug_8bit_2019_04_26.tar.gz) | 16 | [1.0] | No | 2.75B | Yes | 74.26% (val) | 24MB
Note that you might need the nightly build of TensorFlow (see
[here](https://www.tensorflow.org/install) for installation instructions) to
convert the above quantized models to TFLite.
@@ -82,8 +82,7 @@ python "${WORK_DIR}"/train.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_crop_size="513,513" \
--train_batch_size=4 \
--training_number_of_steps="${NUM_ITERATIONS}" \
--fine_tune_batch_norm=true \
@@ -103,8 +102,7 @@ python "${WORK_DIR}"/eval.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=513 \
--eval_crop_size=513 \
--eval_crop_size="513,513" \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--eval_logdir="${EVAL_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
@@ -120,8 +118,7 @@ python "${WORK_DIR}"/vis.py \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=513 \
--vis_crop_size=513 \
--vis_crop_size="513,513" \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--vis_logdir="${VIS_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
......
@@ -79,8 +79,7 @@ python "${WORK_DIR}"/train.py \
--train_split="trainval" \
--model_variant="mobilenet_v2" \
--output_stride=16 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_crop_size="513,513" \
--train_batch_size=4 \
--training_number_of_steps="${NUM_ITERATIONS}" \
--fine_tune_batch_norm=true \
@@ -95,8 +94,7 @@ python "${WORK_DIR}"/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="mobilenet_v2" \
--eval_crop_size=513 \
--eval_crop_size=513 \
--eval_crop_size="513,513" \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--eval_logdir="${EVAL_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
@@ -107,8 +105,7 @@ python "${WORK_DIR}"/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="mobilenet_v2" \
--vis_crop_size=513 \
--vis_crop_size=513 \
--vis_crop_size="513,513" \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--vis_logdir="${VIS_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
......
@@ -107,8 +107,8 @@ flags.DEFINE_integer('train_batch_size', 8,
flags.DEFINE_float('weight_decay', 0.00004,
'The value of the weight decay for training.')
flags.DEFINE_multi_integer('train_crop_size', [513, 513],
'Image crop size [height, width] during training.')
flags.DEFINE_list('train_crop_size', '513,513',
'Image crop size [height, width] during training.')
flags.DEFINE_float(
'last_layer_gradient_multiplier', 1.0,
@@ -166,7 +166,6 @@ flags.DEFINE_integer('output_stride', 16,
'The ratio of input to output spatial resolution.')
# Hard example mining related flags.
flags.DEFINE_integer(
'hard_example_mining_step', 0,
'The training step in which exact hard example mining kicks off. Note we '
@@ -181,6 +180,11 @@ flags.DEFINE_float(
'The top k percent pixels (in terms of the loss values) used to compute '
'loss during training. This is useful for hard pixel mining.')
# Quantization setting.
flags.DEFINE_integer(
'quantize_delay_step', -1,
'Steps to start quantized training. If < 0, will not quantize model.')
# Dataset settings.
flags.DEFINE_string('dataset', 'pascal_voc_seg',
'Name of the segmentation dataset.')
@@ -209,7 +213,7 @@ def _build_deeplab(iterator, outputs_to_num_classes, ignore_label):
model_options = common.ModelOptions(
outputs_to_num_classes=outputs_to_num_classes,
crop_size=FLAGS.train_crop_size,
crop_size=map(int, FLAGS.train_crop_size),
atrous_rates=FLAGS.atrous_rates,
output_stride=FLAGS.output_stride)
@@ -344,39 +348,46 @@ def _train_deeplab_model(iterator, num_of_classes, ignore_label):
summary_op: An operation to log the summaries.
"""
global_step = tf.train.get_or_create_global_step()
summaries = []
learning_rate = train_utils.get_model_learning_rate(
FLAGS.learning_policy, FLAGS.base_learning_rate,
FLAGS.learning_rate_decay_step, FLAGS.learning_rate_decay_factor,
FLAGS.training_number_of_steps, FLAGS.learning_power,
FLAGS.slow_start_step, FLAGS.slow_start_learning_rate)
summaries.append(tf.summary.scalar('learning_rate', learning_rate))
tf.summary.scalar('learning_rate', learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, FLAGS.momentum)
tower_losses = []
tower_grads = []
tower_summaries = None
for i in range(FLAGS.num_clones):
with tf.device('/gpu:%d' % i):
with tf.name_scope('clone_%d' % i) as scope:
# First tower has default name scope.
name_scope = ('clone_%d' % i) if i else ''
with tf.name_scope(name_scope) as scope:
loss = _tower_loss(
iterator=iterator,
num_of_classes=num_of_classes,
ignore_label=ignore_label,
scope=scope,
reuse_variable=(i != 0))
grads = optimizer.compute_gradients(loss)
tower_grads.append(grads)
tower_losses.append(loss)
if FLAGS.quantize_delay_step >= 0:
if FLAGS.num_clones > 1:
raise ValueError('Quantization doesn\'t support multi-clone yet.')
tf.contrib.quantize.create_training_graph(
quant_delay=FLAGS.quantize_delay_step)
# Retain the summaries from the first tower.
if not i:
tower_summaries = tf.summary.merge_all(scope=scope)
for i in range(FLAGS.num_clones):
with tf.device('/gpu:%d' % i):
name_scope = ('clone_%d' % i) if i else ''
with tf.name_scope(name_scope) as scope:
grads = optimizer.compute_gradients(tower_losses[i])
tower_grads.append(grads)
with tf.device('/cpu:0'):
grads_and_vars = _average_gradients(tower_grads)
if tower_summaries is not None:
summaries.append(tower_summaries)
# Modify the gradients for biases and last layer variables.
last_layers = model.get_extra_layer_scopes(
@@ -407,11 +418,12 @@ def _train_deeplab_model(iterator, num_of_classes, ignore_label):
lambda: tf.Print(total_loss, [total_loss], 'Total loss is :'),
lambda: total_loss)
summaries.append(tf.summary.scalar('total_loss', total_loss))
tf.summary.scalar('total_loss', total_loss)
with tf.control_dependencies([update_op]):
train_tensor = tf.identity(total_loss, name='train_op')
summary_op = tf.summary.merge(summaries)
# Excludes summaries from towers other than the first one.
summary_op = tf.summary.merge_all(scope='(?!clone_)')
return train_tensor, summary_op
@@ -434,7 +446,7 @@ def main(unused_argv):
split_name=FLAGS.train_split,
dataset_dir=FLAGS.dataset_dir,
batch_size=clone_batch_size,
crop_size=FLAGS.train_crop_size,
crop_size=map(int, FLAGS.train_crop_size),
min_resize_value=FLAGS.min_resize_value,
max_resize_value=FLAGS.max_resize_value,
resize_factor=FLAGS.resize_factor,
@@ -471,7 +483,8 @@ def main(unused_argv):
summary_op=summary_op,
)
stop_hook = tf.train.StopAtStepHook(FLAGS.training_number_of_steps)
stop_hook = tf.train.StopAtStepHook(
last_step=FLAGS.training_number_of_steps)
profile_dir = FLAGS.profile_logdir
if profile_dir is not None:
......
@@ -43,8 +43,8 @@ flags.DEFINE_string('checkpoint_dir', None, 'Directory of model checkpoints.')
flags.DEFINE_integer('vis_batch_size', 1,
'The number of images in each batch during evaluation.')
flags.DEFINE_multi_integer('vis_crop_size', [513, 513],
'Crop size [height, width] for visualization.')
flags.DEFINE_list('vis_crop_size', '513,513',
'Crop size [height, width] for visualization.')
flags.DEFINE_integer('eval_interval_secs', 60 * 5,
'How often (in seconds) to run evaluation.')
@@ -66,6 +66,10 @@ flags.DEFINE_multi_float('eval_scales', [1.0],
flags.DEFINE_bool('add_flipped_images', False,
'Add flipped images for evaluation or not.')
flags.DEFINE_integer(
'quantize_delay_step', -1,
'Steps to start quantized training. If < 0, will not quantize model.')
# Dataset settings.
flags.DEFINE_string('dataset', 'pascal_voc_seg',
@@ -189,7 +193,7 @@ def main(unused_argv):
split_name=FLAGS.vis_split,
dataset_dir=FLAGS.dataset_dir,
batch_size=FLAGS.vis_batch_size,
crop_size=FLAGS.vis_crop_size,
crop_size=map(int, FLAGS.vis_crop_size),
min_resize_value=FLAGS.min_resize_value,
max_resize_value=FLAGS.max_resize_value,
resize_factor=FLAGS.resize_factor,
@@ -218,7 +222,7 @@ def main(unused_argv):
model_options = common.ModelOptions(
outputs_to_num_classes={common.OUTPUT_TYPE: dataset.num_of_classes},
crop_size=FLAGS.vis_crop_size,
crop_size=map(int, FLAGS.vis_crop_size),
atrous_rates=FLAGS.atrous_rates,
output_stride=FLAGS.output_stride)
@@ -230,6 +234,9 @@ def main(unused_argv):
image_pyramid=FLAGS.image_pyramid)
else:
tf.logging.info('Performing multi-scale test.')
if FLAGS.quantize_delay_step >= 0:
raise ValueError(
'Quantize mode is not supported with multi-scale test.')
predictions = model.predict_labels_multi_scale(
samples[common.IMAGE],
model_options=model_options,
@@ -259,6 +266,10 @@ def main(unused_argv):
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
align_corners=True), 3)
tf.train.get_or_create_global_step()
if FLAGS.quantize_delay_step >= 0:
tf.contrib.quantize.create_eval_graph()
num_iteration = 0
max_num_iteration = FLAGS.max_number_of_iterations
@@ -274,8 +285,6 @@ def main(unused_argv):
time.gmtime()))
tf.logging.info('Visualizing with model %s', checkpoint_path)
tf.train.get_or_create_global_step()
scaffold = tf.train.Scaffold(init_op=tf.global_variables_initializer())
session_creator = tf.train.ChiefSessionCreator(
scaffold=scaffold,
......